/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *					clamper.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov		:	Removed TOS from hash calculations
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/lwtunnel.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
#include <net/secure_seq.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>

#include "fib_lookup.h"

#define RT_FL_TOS(oldflp4) \
	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly = 9;
static int ip_rt_redirect_load __read_mostly = HZ / 50;
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly = HZ;
static int ip_rt_error_burst __read_mostly = 5 * HZ;
static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
static u32 ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly = 256;

static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;

/*
 *	Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void ipv4_link_failure(struct sk_buff *skb);
static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			      struct sk_buff *skb, u32 mtu);
static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
			   struct sk_buff *skb);
static void ipv4_dst_destroy(struct dst_entry *dst);

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	WARN_ON(1);
	return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr);
static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);

static struct dst_ops ipv4_dst_ops = {
	.family = AF_INET,
	.check = ipv4_dst_check,
	.default_advmss = ipv4_default_advmss,
	.mtu = ipv4_mtu,
	.cow_metrics = ipv4_cow_metrics,
	.destroy = ipv4_dst_destroy,
	.negative_advice = ipv4_negative_advice,
	.link_failure = ipv4_link_failure,
	.update_pmtu = ip_rt_update_pmtu,
	.redirect = ip_do_redirect,
	.local_out = __ip_local_out,
	.neigh_lookup = ipv4_neigh_lookup,
	.confirm_neigh = ipv4_confirm_neigh,
};

#define ECN_OR_COST(class) TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	if (*pos)
		return NULL;
	return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
	.start = rt_cache_seq_start,
	.next = rt_cache_seq_next,
	.stop = rt_cache_seq_stop,
	.show = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cache_seq_ops);
}

static const struct file_operations rt_cache_seq_fops = {
	.open = rt_cache_seq_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = seq_release,
};

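/*
 * /proc/net/stat/rt_cache: one line of rt_cache_stat counters per possible
 * CPU, preceded by a single header line.
 */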
static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   0, /* st->in_hit */
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   0, /* st->out_hit */
		   st->out_slow_tot,
		   st->out_slow_mc,

		   0, /* st->gc_total */
		   0, /* st->gc_ignored */
		   0, /* st->gc_goal_miss */
		   0, /* st->gc_dst_overflow */
		   0, /* st->in_hlist_search */
		   0  /* st->out_hlist_search */
		);
	return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
	.start = rt_cpu_seq_start,
	.next = rt_cpu_seq_next,
	.stop = rt_cpu_seq_stop,
	.show = rt_cpu_seq_show,
};

static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.open = rt_cpu_seq_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_create("rt_cache", 0444, net->proc_net,
			  &rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", 0444,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create_single("rt_acct", 0, net->proc_net,
				 rt_acct_proc_show);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata = {
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */

static inline bool rt_is_expired(const struct rtable *rth)
{
	return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
	rt_genid_bump_ipv4(net);
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr)
{
	const struct rtable *rt = container_of(dst, struct rtable, dst);
	struct net_device *dev = dst->dev;
	struct neighbour *n;

	rcu_read_lock_bh();

	if (likely(rt->rt_gw_family == AF_INET)) {
		n = ip_neigh_gw4(dev, rt->rt_gw4);
	} else if (rt->rt_gw_family == AF_INET6) {
		n = ip_neigh_gw6(dev, &rt->rt_gw6);
	} else {
		__be32 pkey;

		pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
		n = ip_neigh_gw4(dev, pkey);
	}

	if (n && !refcount_inc_not_zero(&n->refcnt))
		n = NULL;

	rcu_read_unlock_bh();

	return n;
}

static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	const struct rtable *rt = container_of(dst, struct rtable, dst);
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;

	if (rt->rt_gw_family == AF_INET) {
		pkey = (const __be32 *)&rt->rt_gw4;
	} else if (rt->rt_gw_family == AF_INET6) {
		return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
	} else if (!daddr ||
		   (rt->rt_flags &
		    (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
		return;
	}
	__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
}

#define IP_IDENTS_SZ 2048u

static atomic_t *ip_idents __read_mostly;
static u32 *ip_tstamps __read_mostly;

/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes it hard for an attacker
 * to infer how many packets were sent between two points in time.
 */
u32 ip_idents_reserve(u32 hash, int segs)
{
	u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
	atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
	u32 old = READ_ONCE(*p_tstamp);
	u32 now = (u32)jiffies;
	u32 new, delta = 0;

	if (old != now && cmpxchg(p_tstamp, old, now) == old)
		delta = prandom_u32_max(now - old);

	/* Do not use atomic_add_return() as it makes UBSAN unhappy */
	do {
		old = (u32)atomic_read(p_id);
		new = old + delta + segs;
	} while (atomic_cmpxchg(p_id, old, new) != old);

	return new - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);

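/* Pick an IP ID for a datagram (or a train of 'segs' segments): hash the
 * flow (daddr, saddr, protocol) with the per-netns SipHash key and reserve
 * IDs from the matching generator via ip_idents_reserve().
 */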
void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
{
	u32 hash, id;

	/* Note the following code is not safe, but this is okay. */
	if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
		get_random_bytes(&net->ipv4.ip_id_key,
				 sizeof(net->ipv4.ip_id_key));

	hash = siphash_3u32((__force u32)iph->daddr,
			    (__force u32)iph->saddr,
			    iph->protocol,
			    &net->ipv4.ip_id_key);
	id = ip_idents_reserve(hash, segs);
	iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);

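/* Build a flowi4 key from the packet headers; when a socket is supplied,
 * its bound device, mark, TOS and protocol override the per-packet values.
 */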
static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
			     const struct sock *sk,
			     const struct iphdr *iph,
			     int oif, u8 tos,
			     u8 prot, u32 mark, int flow_flags)
{
	if (sk) {
		const struct inet_sock *inet = inet_sk(sk);

		oif = sk->sk_bound_dev_if;
		mark = sk->sk_mark;
		tos = RT_CONN_FLAGS(sk);
		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
	}
	flowi4_init_output(fl4, oif, mark, tos,
			   RT_SCOPE_UNIVERSE, prot,
			   flow_flags,
			   iph->daddr, iph->saddr, 0, 0,
			   sock_net_uid(net, sk));
}

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
			       const struct sock *sk)
{
	const struct net *net = dev_net(skb->dev);
	const struct iphdr *iph = ip_hdr(skb);
	int oif = skb->dev->ifindex;
	u8 tos = RT_TOS(iph->tos);
	u8 prot = iph->protocol;
	u32 mark = skb->mark;

	__build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);
	const struct ip_options_rcu *inet_opt;
	__be32 daddr = inet->inet_daddr;

	rcu_read_lock();
	inet_opt = rcu_dereference(inet->inet_opt);
	if (inet_opt && inet_opt->opt.srr)
		daddr = inet_opt->opt.faddr;
	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
			   inet_sk_flowi_flags(sk),
			   daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
	rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
				 const struct sk_buff *skb)
{
	if (skb)
		build_skb_flow_key(fl4, skb, sk);
	else
		build_sk_flow_key(fl4, sk);
}

static DEFINE_SPINLOCK(fnhe_lock);

static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
	struct rtable *rt;

	rt = rcu_dereference(fnhe->fnhe_rth_input);
	if (rt) {
		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
		dst_dev_put(&rt->dst);
		dst_release(&rt->dst);
	}
	rt = rcu_dereference(fnhe->fnhe_rth_output);
	if (rt) {
		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
		dst_dev_put(&rt->dst);
		dst_release(&rt->dst);
	}
}

static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
	struct fib_nh_exception *fnhe, *oldest;

	oldest = rcu_dereference(hash->chain);
	for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
			oldest = fnhe;
	}
	fnhe_flush_routes(oldest);
	return oldest;
}

static inline u32 fnhe_hashfun(__be32 daddr)
{
	static u32 fnhe_hashrnd __read_mostly;
	u32 hval;

	net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
	hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
	return hash_32(hval, FNHE_HASH_SHIFT);
}

static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
	rt->rt_pmtu = fnhe->fnhe_pmtu;
	rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
	rt->dst.expires = fnhe->fnhe_expires;

	if (fnhe->fnhe_gw) {
		rt->rt_flags |= RTCF_REDIRECTED;
		rt->rt_gw_family = AF_INET;
		rt->rt_gw4 = fnhe->fnhe_gw;
	}
}

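/* Record or refresh a nexthop exception for daddr: a redirect gateway
 * and/or a learned PMTU, with an expiry time. Cached routes that use this
 * nexthop are updated or marked obsolete so they get re-validated.
 */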
static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
				  u32 pmtu, bool lock, unsigned long expires)
{
	struct fnhe_hash_bucket *hash;
	struct fib_nh_exception *fnhe;
	struct rtable *rt;
	u32 genid, hval;
	unsigned int i;
	int depth;

	genid = fnhe_genid(dev_net(nh->fib_nh_dev));
	hval = fnhe_hashfun(daddr);

	spin_lock_bh(&fnhe_lock);

	hash = rcu_dereference(nh->nh_exceptions);
	if (!hash) {
		hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
		if (!hash)
			goto out_unlock;
		rcu_assign_pointer(nh->nh_exceptions, hash);
	}

	hash += hval;

	depth = 0;
	for (fnhe = rcu_dereference(hash->chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr)
			break;
		depth++;
	}

	if (fnhe) {
		if (fnhe->fnhe_genid != genid)
			fnhe->fnhe_genid = genid;
		if (gw)
			fnhe->fnhe_gw = gw;
		if (pmtu) {
			fnhe->fnhe_pmtu = pmtu;
			fnhe->fnhe_mtu_locked = lock;
		}
		fnhe->fnhe_expires = max(1UL, expires);
		/* Update all cached dsts too */
		rt = rcu_dereference(fnhe->fnhe_rth_input);
		if (rt)
			fill_route_from_fnhe(rt, fnhe);
		rt = rcu_dereference(fnhe->fnhe_rth_output);
		if (rt)
			fill_route_from_fnhe(rt, fnhe);
	} else {
		if (depth > FNHE_RECLAIM_DEPTH)
			fnhe = fnhe_oldest(hash);
		else {
			fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
			if (!fnhe)
				goto out_unlock;

			fnhe->fnhe_next = hash->chain;
			rcu_assign_pointer(hash->chain, fnhe);
		}
		fnhe->fnhe_genid = genid;
		fnhe->fnhe_daddr = daddr;
		fnhe->fnhe_gw = gw;
		fnhe->fnhe_pmtu = pmtu;
		fnhe->fnhe_mtu_locked = lock;
		fnhe->fnhe_expires = max(1UL, expires);

		/* Exception created; mark the cached routes for the nexthop
		 * stale, so anyone caching it rechecks if this exception
		 * applies to them.
		 */
		rt = rcu_dereference(nh->nh_rth_input);
		if (rt)
			rt->dst.obsolete = DST_OBSOLETE_KILL;

		for_each_possible_cpu(i) {
			struct rtable __rcu **prt;
			prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
			rt = rcu_dereference(*prt);
			if (rt)
				rt->dst.obsolete = DST_OBSOLETE_KILL;
		}
	}

	fnhe->fnhe_stamp = jiffies;

out_unlock:
	spin_unlock_bh(&fnhe_lock);
}

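/* Process an ICMP redirect for this route: sanity-check the advertised
 * gateway, make sure its neighbour entry is valid, and if so record the
 * new gateway as a nexthop exception (optionally killing the current dst).
 */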
static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
			     bool kill_route)
{
	__be32 new_gw = icmp_hdr(skb)->un.gateway;
	__be32 old_gw = ip_hdr(skb)->saddr;
	struct net_device *dev = skb->dev;
	struct in_device *in_dev;
	struct fib_result res;
	struct neighbour *n;
	struct net *net;

	switch (icmp_hdr(skb)->code & 7) {
	case ICMP_REDIR_NET:
	case ICMP_REDIR_NETTOS:
	case ICMP_REDIR_HOST:
	case ICMP_REDIR_HOSTTOS:
		break;

	default:
		return;
	}

	if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
		return;

	in_dev = __in_dev_get_rcu(dev);
	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
	if (!n)
		n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
	if (!IS_ERR(n)) {
		if (!(n->nud_state & NUD_VALID)) {
			neigh_event_send(n, NULL);
		} else {
			if (fib_lookup(net, fl4, &res, 0) == 0) {
				struct fib_nh_common *nhc = FIB_RES_NHC(res);
				struct fib_nh *nh;

				nh = container_of(nhc, struct fib_nh, nh_common);
				update_or_create_fnhe(nh, fl4->daddr, new_gw,
						      0, false,
						      jiffies + ip_rt_gc_timeout);
			}
			if (kill_route)
				rt->dst.obsolete = DST_OBSOLETE_KILL;
			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
		}
		neigh_release(n);
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev)) {
		const struct iphdr *iph = (const struct iphdr *) skb->data;
		__be32 daddr = iph->daddr;
		__be32 saddr = iph->saddr;

		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
				     "  Advised path = %pI4 -> %pI4\n",
				     &old_gw, dev->name, &new_gw,
				     &saddr, &daddr);
	}
#endif
	;
}

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct rtable *rt;
	struct flowi4 fl4;
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct net *net = dev_net(skb->dev);
	int oif = skb->dev->ifindex;
	u8 tos = RT_TOS(iph->tos);
	u8 prot = iph->protocol;
	u32 mark = skb->mark;

	rt = (struct rtable *) dst;

	__build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
	__ip_do_redirect(rt, skb, &fl4, true);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete > 0) {
			ip_rt_put(rt);
			ret = NULL;
		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
			   rt->dst.expires) {
			ip_rt_put(rt);
			ret = NULL;
		}
	}
	return ret;
}

/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */

void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	struct net *net;
	int log_martians;
	int vif;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
	rcu_read_unlock();

	net = dev_net(rt->dst.dev);
	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
	if (!peer) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
			  rt_nexthop(rt, ip_hdr(skb)->daddr));
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
		peer->rate_tokens = 0;
		peer->n_redirects = 0;
	}

	/* Too many ignored redirects; do not send anything.
	 * Set rate_last to the time of the last seen redirected packet.
	 */
	if (peer->n_redirects >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		goto out_put_peer;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (peer->rate_tokens == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->rate_tokens)))) {
		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
		peer->rate_last = jiffies;
		++peer->rate_tokens;
		++peer->n_redirects;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    peer->rate_tokens == ip_rt_redirect_number)
			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
					     &ip_hdr(skb)->saddr, inet_iif(skb),
					     &ip_hdr(skb)->daddr, &gw);
#endif
	}
out_put_peer:
	inet_putpeer(peer);
}

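/* dst.input handler for packets that cannot be delivered or forwarded:
 * account the error and reply with a rate-limited ICMP destination
 * unreachable of the appropriate code.
 */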
static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct net_device *dev = skb->dev;
	struct in_device *in_dev;
	struct inet_peer *peer;
	unsigned long now;
	struct net *net;
	bool send;
	int code;

	if (netif_is_l3_master(skb->dev)) {
		dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
		if (!dev)
			goto out;
	}

	in_dev = __in_dev_get_rcu(dev);

	/* IP on this device is disabled. */
	if (!in_dev)
		goto out;

	net = dev_net(rt->dst.dev);
	if (!IN_DEV_FORWARD(in_dev)) {
		switch (rt->dst.error) {
		case EHOSTUNREACH:
			__IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
			break;

		case ENETUNREACH:
			__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
			break;
		}
		goto out;
	}

	switch (rt->dst.error) {
	case EINVAL:
	default:
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
			       l3mdev_master_ifindex(skb->dev), 1);

	send = true;
	if (peer) {
		now = jiffies;
		peer->rate_tokens += now - peer->rate_last;
		if (peer->rate_tokens > ip_rt_error_burst)
			peer->rate_tokens = ip_rt_error_burst;
		peer->rate_last = now;
		if (peer->rate_tokens >= ip_rt_error_cost)
			peer->rate_tokens -= ip_rt_error_cost;
		else
			send = false;
		inet_putpeer(peer);
	}
	if (send)
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:	kfree_skb(skb);
	return 0;
}

static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
	struct dst_entry *dst = &rt->dst;
	u32 old_mtu = ipv4_mtu(dst);
	struct fib_result res;
	bool lock = false;

	if (ip_mtu_locked(dst))
		return;

	if (old_mtu < mtu)
		return;

	if (mtu < ip_rt_min_pmtu) {
		lock = true;
		mtu = min(old_mtu, ip_rt_min_pmtu);
	}

	if (rt->rt_pmtu == mtu && !lock &&
	    time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
		return;

	rcu_read_lock();
	if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
		struct fib_nh_common *nhc = FIB_RES_NHC(res);
		struct fib_nh *nh;

		nh = container_of(nhc, struct fib_nh, nh_common);
		update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock,
				      jiffies + ip_rt_mtu_expires);
	}
	rcu_read_unlock();
}

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			      struct sk_buff *skb, u32 mtu)
{
	struct rtable *rt = (struct rtable *) dst;
	struct flowi4 fl4;

	ip_rt_build_flow_key(&fl4, sk, skb);
	__ip_rt_update_pmtu(rt, &fl4, mtu);
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
		      int oif, u8 protocol)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;
	u32 mark = IP4_REPLY_MARK(net, skb->mark);

	__build_flow_key(net, &fl4, NULL, iph, oif,
			 RT_TOS(iph->tos), protocol, mark, 0);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_rt_update_pmtu(rt, &fl4, mtu);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);

static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);

	if (!fl4.flowi4_mark)
		fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);

	rt = __ip_route_output_key(sock_net(sk), &fl4);
	if (!IS_ERR(rt)) {
		__ip_rt_update_pmtu(rt, &fl4, mtu);
		ip_rt_put(rt);
	}
}

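/* Socket variant of the PMTU update: if the socket already caches a route,
 * update it under the socket lock, replacing it with a fresh lookup when it
 * has been invalidated; otherwise fall back to __ipv4_sk_update_pmtu().
 */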
void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;
	struct dst_entry *odst = NULL;
	bool new = false;
	struct net *net = sock_net(sk);

	bh_lock_sock(sk);

	if (!ip_sk_accept_pmtu(sk))
		goto out;

	odst = sk_dst_get(sk);

	if (sock_owned_by_user(sk) || !odst) {
		__ipv4_sk_update_pmtu(skb, sk, mtu);
		goto out;
	}

	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);

	rt = (struct rtable *)odst;
	if (odst->obsolete && !odst->ops->check(odst, 0)) {
		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
		if (IS_ERR(rt))
			goto out;

		new = true;
	}

	__ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);

	if (!dst_check(&rt->dst, 0)) {
		if (new)
			dst_release(&rt->dst);

		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
		if (IS_ERR(rt))
			goto out;

		new = true;
	}

	if (new)
		sk_dst_set(sk, &rt->dst);

out:
	bh_unlock_sock(sk);
	dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

void ipv4_redirect(struct sk_buff *skb, struct net *net,
		   int oif, u8 protocol)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(net, &fl4, NULL, iph, oif,
			 RT_TOS(iph->tos), protocol, 0, 0);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_do_redirect(rt, skb, &fl4, false);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;
	struct net *net = sock_net(sk);

	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_do_redirect(rt, skb, &fl4, false);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rtable *rt = (struct rtable *) dst;

	/* All IPV4 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 *
	 * When a PMTU/redirect information update invalidates a route,
	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
	 * DST_OBSOLETE_DEAD.
	 */
	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
		return NULL;
	return dst;
}

static void ipv4_link_failure(struct sk_buff *skb)
{
	struct ip_options opt;
	struct rtable *rt;
	int res;

	/* Recompile ip options since IPCB may not be valid anymore. */
	memset(&opt, 0, sizeof(opt));
	opt.optlen = ip_hdr(skb)->ihl*4 - sizeof(struct iphdr);

	rcu_read_lock();
	res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
	rcu_read_unlock();

	if (res)
		return;

	__icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);

	rt = skb_rtable(skb);
	if (rt)
		dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	pr_debug("%s: %pI4 -> %pI4, %s\n",
		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
		 skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	WARN_ON(1);
	return 0;
}

/*
 * We do not cache the source address of the outgoing interface,
 * because it is used only by IP RR, TS and SRR options,
 * so it is out of the fast path.
 *
 * BTW remember: "addr" is allowed to be not aligned
 * in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
	__be32 src;

	if (rt_is_output_route(rt))
		src = ip_hdr(skb)->saddr;
	else {
		struct fib_result res;
		struct iphdr *iph = ip_hdr(skb);
		struct flowi4 fl4 = {
			.daddr = iph->daddr,
			.saddr = iph->saddr,
			.flowi4_tos = RT_TOS(iph->tos),
			.flowi4_oif = rt->dst.dev->ifindex,
			.flowi4_iif = skb->dev->ifindex,
			.flowi4_mark = skb->mark,
		};

		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
			src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
		else
			src = inet_select_addr(rt->dst.dev,
					       rt_nexthop(rt, iph->daddr),
					       RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->dst.tclassid & 0xFFFF))
		rt->dst.tclassid |= tag & 0xFFFF;
	if (!(rt->dst.tclassid & 0xFFFF0000))
		rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
	unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
	unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
				    ip_rt_min_advmss);

	return min(advmss, IPV4_MAX_PMTU - header_size);
}

static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
	const struct rtable *rt = (const struct rtable *) dst;
	unsigned int mtu = rt->rt_pmtu;

	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
		mtu = dst_metric_raw(dst, RTAX_MTU);

	if (mtu)
		return mtu;

	mtu = READ_ONCE(dst->dev->mtu);

	if (unlikely(ip_mtu_locked(dst))) {
		if (rt->rt_gw_family && mtu > 576)
			mtu = 576;
	}

	mtu = min_t(unsigned int, mtu, IP_MAX_MTU);

	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}

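/* Remove a single nexthop exception: unlink it from its hash chain, drop
 * any routes cached on it, and free it after an RCU grace period.
 */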
static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
{
	struct fnhe_hash_bucket *hash;
	struct fib_nh_exception *fnhe, __rcu **fnhe_p;
	u32 hval = fnhe_hashfun(daddr);

	spin_lock_bh(&fnhe_lock);

	hash = rcu_dereference_protected(nh->nh_exceptions,
					 lockdep_is_held(&fnhe_lock));
	hash += hval;

	fnhe_p = &hash->chain;
	fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
	while (fnhe) {
		if (fnhe->fnhe_daddr == daddr) {
			rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
				fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
			/* set fnhe_daddr to 0 to ensure it won't bind with
			 * new dsts in rt_bind_exception().
			 */
			fnhe->fnhe_daddr = 0;
			fnhe_flush_routes(fnhe);
			kfree_rcu(fnhe, rcu);
			break;
		}
		fnhe_p = &fnhe->fnhe_next;
		fnhe = rcu_dereference_protected(fnhe->fnhe_next,
						 lockdep_is_held(&fnhe_lock));
	}

	spin_unlock_bh(&fnhe_lock);
}

static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
{
	struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
	struct fib_nh_exception *fnhe;
	u32 hval;

	if (!hash)
		return NULL;

	hval = fnhe_hashfun(daddr);

	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr) {
			if (fnhe->fnhe_expires &&
			    time_after(jiffies, fnhe->fnhe_expires)) {
				ip_del_fnhe(nh, daddr);
				break;
			}
			return fnhe;
		}
	}
	return NULL;
}

/* MTU selection:
 * 1. mtu on route is locked - use it
 * 2. mtu from nexthop exception
 * 3. mtu from egress device
 */

u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
{
	struct fib_nh_common *nhc = res->nhc;
	struct net_device *dev = nhc->nhc_dev;
	struct fib_info *fi = res->fi;
	u32 mtu = 0;

	if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
	    fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
		mtu = fi->fib_mtu;

	if (likely(!mtu)) {
		struct fib_nh *nh = container_of(nhc, struct fib_nh, nh_common);
		struct fib_nh_exception *fnhe;

		fnhe = find_exception(nh, daddr);
		if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
			mtu = fnhe->fnhe_pmtu;
	}

	if (likely(!mtu))
		mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);

	return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
}

static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
			      __be32 daddr, const bool do_cache)
{
	bool ret = false;

	spin_lock_bh(&fnhe_lock);

	if (daddr == fnhe->fnhe_daddr) {
		struct rtable __rcu **porig;
		struct rtable *orig;
		int genid = fnhe_genid(dev_net(rt->dst.dev));

		if (rt_is_input_route(rt))
			porig = &fnhe->fnhe_rth_input;
		else
			porig = &fnhe->fnhe_rth_output;
		orig = rcu_dereference(*porig);

		if (fnhe->fnhe_genid != genid) {
			fnhe->fnhe_genid = genid;
			fnhe->fnhe_gw = 0;
			fnhe->fnhe_pmtu = 0;
			fnhe->fnhe_expires = 0;
			fnhe->fnhe_mtu_locked = false;
			fnhe_flush_routes(fnhe);
			orig = NULL;
		}
		fill_route_from_fnhe(rt, fnhe);
		if (!rt->rt_gw4) {
			rt->rt_gw4 = daddr;
			rt->rt_gw_family = AF_INET;
		}

		if (do_cache) {
			dst_hold(&rt->dst);
			rcu_assign_pointer(*porig, rt);
			if (orig) {
				dst_dev_put(&orig->dst);
				dst_release(&orig->dst);
			}
			ret = true;
		}

		fnhe->fnhe_stamp = jiffies;
	}
	spin_unlock_bh(&fnhe_lock);

	return ret;
}

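/* Try to cache rt in the nexthop: the input slot for input routes, the
 * per-CPU output slot otherwise. Returns false when another CPU won the
 * cmpxchg() race and our route was not stored.
 */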
static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
{
	struct rtable *orig, *prev, **p;
	bool ret = true;

	if (rt_is_input_route(rt)) {
		p = (struct rtable **)&nh->nh_rth_input;
	} else {
		p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
	}
	orig = *p;

	/* hold dst before doing cmpxchg() to avoid race condition
	 * on this dst
	 */
	dst_hold(&rt->dst);
	prev = cmpxchg(p, orig, rt);
	if (prev == orig) {
		if (orig) {
			dst_dev_put(&orig->dst);
			dst_release(&orig->dst);
		}
	} else {
		dst_release(&rt->dst);
		ret = false;
	}

	return ret;
}

struct uncached_list {
	spinlock_t lock;
	struct list_head head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);

void rt_add_uncached_list(struct rtable *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);

	rt->rt_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}

void rt_del_uncached_list(struct rtable *rt)
{
	if (!list_empty(&rt->rt_uncached)) {
		struct uncached_list *ul = rt->rt_uncached_list;

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt_uncached);
		spin_unlock_bh(&ul->lock);
	}
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;

	ip_dst_metrics_put(dst);
	rt_del_uncached_list(rt);
}

void rt_flush_dev(struct net_device *dev)
{
	struct net *net = dev_net(dev);
	struct rtable *rt;
	int cpu;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt_uncached) {
			if (rt->dst.dev != dev)
				continue;
			rt->dst.dev = net->loopback_dev;
			dev_hold(rt->dst.dev);
			dev_put(dev);
		}
		spin_unlock_bh(&ul->lock);
	}
}

static bool rt_cache_valid(const struct rtable *rt)
{
	return rt &&
		rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
		!rt_is_expired(rt);
}

static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
			   const struct fib_result *res,
			   struct fib_nh_exception *fnhe,
			   struct fib_info *fi, u16 type, u32 itag,
			   const bool do_cache)
{
	bool cached = false;

	if (fi) {
		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
		struct fib_nh *nh;

		if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
			rt->rt_gw_family = nhc->nhc_gw_family;
			/* only INET and INET6 are supported */
			if (likely(nhc->nhc_gw_family == AF_INET))
				rt->rt_gw4 = nhc->nhc_gw.ipv4;
			else
				rt->rt_gw6 = nhc->nhc_gw.ipv6;
		}

		ip_dst_init_metrics(&rt->dst, fi->fib_metrics);

		nh = container_of(nhc, struct fib_nh, nh_common);
#ifdef CONFIG_IP_ROUTE_CLASSID
		rt->dst.tclassid = nh->nh_tclassid;
#endif
		rt->dst.lwtstate = lwtstate_get(nh->fib_nh_lws);
		if (unlikely(fnhe))
			cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
		else if (do_cache)
			cached = rt_cache_route(nh, rt);
		if (unlikely(!cached)) {
			/* Routes we intend to cache in nexthop exception or
			 * FIB nexthop have the DST_NOCACHE bit clear.
			 * However, if we are unsuccessful at storing this
			 * route into the cache we really need to set it.
			 */
			if (!rt->rt_gw4) {
				rt->rt_gw_family = AF_INET;
				rt->rt_gw4 = daddr;
			}
			rt_add_uncached_list(rt);
		}
	} else
		rt_add_uncached_list(rt);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, res->tclassid);
#endif
	set_class_tag(rt, itag);
#endif
}

struct rtable *rt_dst_alloc(struct net_device *dev,
			    unsigned int flags, u16 type,
			    bool nopolicy, bool noxfrm, bool will_cache)
{
	struct rtable *rt;

	rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
		       (will_cache ? 0 : DST_HOST) |
		       (nopolicy ? DST_NOPOLICY : 0) |
		       (noxfrm ? DST_NOXFRM : 0));

	if (rt) {
		rt->rt_genid = rt_genid_ipv4(dev_net(dev));
		rt->rt_flags = flags;
		rt->rt_type = type;
		rt->rt_is_input = 0;
		rt->rt_iif = 0;
		rt->rt_pmtu = 0;
		rt->rt_mtu_locked = 0;
		rt->rt_gw_family = 0;
		rt->rt_gw4 = 0;
		INIT_LIST_HEAD(&rt->rt_uncached);

		rt->dst.output = ip_output;
		if (flags & RTCF_LOCAL)
			rt->dst.input = ip_local_deliver;
	}

	return rt;
}
EXPORT_SYMBOL(rt_dst_alloc);

/* called in rcu_read_lock() section */
int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			  u8 tos, struct net_device *dev,
			  struct in_device *in_dev, u32 *itag)
{
	int err;

	/* Primary sanity checks. */
	if (!in_dev)
		return -EINVAL;

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    skb->protocol != htons(ETH_P_IP))
		return -EINVAL;

	if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
		return -EINVAL;

	if (ipv4_is_zeronet(saddr)) {
		if (!ipv4_is_local_multicast(daddr) &&
		    ip_hdr(skb)->protocol != IPPROTO_IGMP)
			return -EINVAL;
	} else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, itag);
		if (err < 0)
			return err;
	}
	return 0;
}

/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			     u8 tos, struct net_device *dev, int our)
{
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	unsigned int flags = RTCF_MULTICAST;
	struct rtable *rth;
	u32 itag = 0;
	int err;

	err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
	if (err)
		return err;

	if (our)
		flags |= RTCF_LOCAL;

	rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
	if (!rth)
		return -ENOBUFS;

#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->dst.output = ip_rt_bug;
	rth->rt_is_input = 1;

#ifdef CONFIG_IP_MROUTE
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	skb_dst_set(skb, &rth->dst);
	return 0;
}

static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 *	RFC1812 recommendation, if source is martian,
		 *	the only hint is MAC header.
		 */
		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
			&daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			print_hex_dump(KERN_WARNING, "ll header: ",
				       DUMP_PREFIX_OFFSET, 16, 1,
				       skb_mac_header(skb),
				       dev->hard_header_len, false);
		}
	}
#endif
}

/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
			   const struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos)
{
	struct fib_nh_common *nhc = FIB_RES_NHC(*res);
	struct net_device *dev = nhc->nhc_dev;
	struct fib_nh_exception *fnhe;
	struct rtable *rth;
	struct fib_nh *nh;
	int err;
	struct in_device *out_dev;
	bool do_cache;
	u32 itag = 0;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(dev);
	if (!out_dev) {
		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
		return -EINVAL;
	}

	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, in_dev, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		goto cleanup;
	}

	do_cache = res->fi && !itag;
	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
	    skb->protocol == htons(ETH_P_IP)) {
		__be32 gw;

		gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
		if (IN_DEV_SHARED_MEDIA(out_dev) ||
		    inet_addr_onlink(out_dev, saddr, gw))
			IPCB(skb)->flags |= IPSKB_DOREDIRECT;
	}

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create route, if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * The proxy arp feature has been extended to allow ARP
		 * replies back to the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	nh = container_of(nhc, struct fib_nh, nh_common);
	fnhe = find_exception(nh, daddr);
	if (do_cache) {
		if (fnhe)
			rth = rcu_dereference(fnhe->fnhe_rth_input);
		else
			rth = rcu_dereference(nh->nh_rth_input);
		if (rt_cache_valid(rth)) {
			skb_dst_set_noref(skb, &rth->dst);
			goto out;
		}
	}

	rth = rt_dst_alloc(out_dev->dev, 0, res->type,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	rth->rt_is_input = 1;
	RT_CACHE_STAT_INC(in_slow_tot);

	rth->dst.input = ip_forward;

	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
		       do_cache);
	lwtunnel_set_redirect(&rth->dst);
	skb_dst_set(skb, &rth->dst);
out:
	err = 0;
cleanup:
	return err;
}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
/* To make ICMP packets follow the right flow, the multipath hash is
 * calculated from the inner IP addresses.
 */
static void ip_multipath_l3_keys(const struct sk_buff *skb,
				 struct flow_keys *hash_keys)
{
	const struct iphdr *outer_iph = ip_hdr(skb);
	const struct iphdr *key_iph = outer_iph;
	const struct iphdr *inner_iph;
	const struct icmphdr *icmph;
	struct iphdr _inner_iph;
	struct icmphdr _icmph;

	if (likely(outer_iph->protocol != IPPROTO_ICMP))
		goto out;

	if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
		goto out;

	icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
				   &_icmph);
	if (!icmph)
		goto out;

	if (icmph->type != ICMP_DEST_UNREACH &&
	    icmph->type != ICMP_REDIRECT &&
	    icmph->type != ICMP_TIME_EXCEEDED &&
	    icmph->type != ICMP_PARAMETERPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       outer_iph->ihl * 4 + sizeof(_icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
out:
	hash_keys->addrs.v4addrs.src = key_iph->saddr;
	hash_keys->addrs.v4addrs.dst = key_iph->daddr;
}

/* if skb is set it will be used and fl4 can be NULL */
int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
	struct flow_keys hash_keys;
	u32 mhash;

	switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
	case 0:
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
		if (skb) {
			ip_multipath_l3_keys(skb, &hash_keys);
		} else {
			hash_keys.addrs.v4addrs.src = fl4->saddr;
			hash_keys.addrs.v4addrs.dst = fl4->daddr;
		}
		break;
	case 1:
		/* skb is currently provided only when forwarding */
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}

			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
			hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
			hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
			hash_keys.addrs.v4addrs.src = fl4->saddr;
			hash_keys.addrs.v4addrs.dst = fl4->daddr;
			hash_keys.ports.src = fl4->fl4_sport;
			hash_keys.ports.dst = fl4->fl4_dport;
			hash_keys.basic.ip_proto = fl4->flowi4_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	if (multipath_hash)
		mhash = jhash_2words(mhash, multipath_hash, 0);

	return mhash >> 1;
}
#endif /* CONFIG_IP_ROUTE_MULTIPATH */

static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos,
			    struct flow_keys *hkeys)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && res->fi->fib_nhs > 1) {
		int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);

		fib_select_multipath(res, h);
	}
#endif

	/* create a routing cache entry */
	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
}

/*
 *	NOTE. We drop all the packets that have local source
 *	addresses, because every properly looped back packet
 *	must have the correct destination already attached by the output routine.
 *
 *	Such an approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with 100% guarantee.
 *	called with rcu_read_lock()
 */

static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev,
			       struct fib_result *res)
{
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct flow_keys *flkeys = NULL, _flkeys;
	struct net *net = dev_net(dev);
	struct ip_tunnel_info *tun_info;
	int err = -EINVAL;
	unsigned int flags = 0;
	u32 itag = 0;
	struct rtable *rth;
	struct flowi4 fl4;
	bool do_cache;

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which can be not detected
	 * by fib_lookup.
	 */

	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
	else
		fl4.flowi4_tun_key.tun_id = 0;
	skb_dst_drop(skb);

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
		goto martian_source;

	res->fi = NULL;
	res->table = NULL;
	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I do not even know whether to fix it or not. Waiting for complaints :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_zeronet(daddr))
		goto martian_destination;

	/* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
	 * and calls it at most once if daddr and/or saddr are loopback addresses.
	 */
	if (ipv4_is_loopback(daddr)) {
		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
			goto martian_destination;
	} else if (ipv4_is_loopback(saddr)) {
		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
			goto martian_source;
	}

	/*
	 *	Now we are ready to route packet.
	 */
	fl4.flowi4_oif = 0;
	fl4.flowi4_iif = dev->ifindex;
	fl4.flowi4_mark = skb->mark;
	fl4.flowi4_tos = tos;
	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
	fl4.flowi4_flags = 0;
	fl4.daddr = daddr;
	fl4.saddr = saddr;
	fl4.flowi4_uid = sock_net_uid(net, NULL);

	if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
		flkeys = &_flkeys;
	} else {
		fl4.flowi4_proto = 0;
		fl4.fl4_sport = 0;
		fl4.fl4_dport = 0;
	}

	err = fib_lookup(net, &fl4, res, 0);
	if (err != 0) {
		if (!IN_DEV_FORWARD(in_dev))
			err = -EHOSTUNREACH;
		goto no_route;
	}

	if (res->type == RTN_BROADCAST) {
		if (IN_DEV_BFORWARD(in_dev))
			goto make_route;
		goto brd_input;
	}

	if (res->type == RTN_LOCAL) {
		err = fib_validate_source(skb, saddr, daddr, tos,
					  0, dev, in_dev, &itag);
		if (err < 0)
			goto martian_source;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev)) {
		err = -EHOSTUNREACH;
		goto no_route;
	}
	if (res->type != RTN_UNICAST)
		goto martian_destination;

make_route:
	err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (!ipv4_is_zeronet(saddr)) {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, &itag);
		if (err < 0)
			goto martian_source;
	}
	flags |= RTCF_BROADCAST;
	res->type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	do_cache = false;
	if (res->fi) {
		if (!itag) {
			struct fib_nh_common *nhc = FIB_RES_NHC(*res);
			struct fib_nh *nh;

			nh = container_of(nhc, struct fib_nh, nh_common);
			rth = rcu_dereference(nh->nh_rth_input);
			if (rt_cache_valid(rth)) {
				skb_dst_set_noref(skb, &rth->dst);
				err = 0;
				goto out;
			}
			do_cache = true;
		}
	}

	rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
			   flags | RTCF_LOCAL, res->type,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
	if (!rth)
		goto e_nobufs;

	rth->dst.output = ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->rt_is_input = 1;

	RT_CACHE_STAT_INC(in_slow_tot);
	if (res->type == RTN_UNREACHABLE) {
		rth->dst.input = ip_error;
		rth->dst.error = -err;
		rth->rt_flags &= ~RTCF_LOCAL;
	}

	if (do_cache) {
		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
		struct fib_nh *nh;

		rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
		if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
			WARN_ON(rth->dst.input == lwtunnel_input);
			rth->dst.lwtstate->orig_input = rth->dst.input;
			rth->dst.input = lwtunnel_input;
		}

		nh = container_of(nhc, struct fib_nh, nh_common);
		if (unlikely(!rt_cache_route(nh, rth)))
			rt_add_uncached_list(rth);
	}
	skb_dst_set(skb, &rth->dst);
	err = 0;
	goto out;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	res->type = RTN_UNREACHABLE;
	res->fi = NULL;
	res->table = NULL;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev))
		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
				     &daddr, &saddr, dev->name);
#endif

e_inval:
	err = -EINVAL;
	goto out;

e_nobufs:
	err = -ENOBUFS;
	goto out;

martian_source:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto out;
}
: net->loopback_dev, 2108 flags | RTCF_LOCAL, res->type, 2109 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache); 2110 if (!rth) 2111 goto e_nobufs; 2112 2113 rth->dst.output= ip_rt_bug; 2114 #ifdef CONFIG_IP_ROUTE_CLASSID 2115 rth->dst.tclassid = itag; 2116 #endif 2117 rth->rt_is_input = 1; 2118 2119 RT_CACHE_STAT_INC(in_slow_tot); 2120 if (res->type == RTN_UNREACHABLE) { 2121 rth->dst.input= ip_error; 2122 rth->dst.error= -err; 2123 rth->rt_flags &= ~RTCF_LOCAL; 2124 } 2125 2126 if (do_cache) { 2127 struct fib_nh_common *nhc = FIB_RES_NHC(*res); 2128 struct fib_nh *nh; 2129 2130 rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate); 2131 if (lwtunnel_input_redirect(rth->dst.lwtstate)) { 2132 WARN_ON(rth->dst.input == lwtunnel_input); 2133 rth->dst.lwtstate->orig_input = rth->dst.input; 2134 rth->dst.input = lwtunnel_input; 2135 } 2136 2137 nh = container_of(nhc, struct fib_nh, nh_common); 2138 if (unlikely(!rt_cache_route(nh, rth))) 2139 rt_add_uncached_list(rth); 2140 } 2141 skb_dst_set(skb, &rth->dst); 2142 err = 0; 2143 goto out; 2144 2145 no_route: 2146 RT_CACHE_STAT_INC(in_no_route); 2147 res->type = RTN_UNREACHABLE; 2148 res->fi = NULL; 2149 res->table = NULL; 2150 goto local_input; 2151 2152 /* 2153 * Do not cache martian addresses: they should be logged (RFC1812) 2154 */ 2155 martian_destination: 2156 RT_CACHE_STAT_INC(in_martian_dst); 2157 #ifdef CONFIG_IP_ROUTE_VERBOSE 2158 if (IN_DEV_LOG_MARTIANS(in_dev)) 2159 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n", 2160 &daddr, &saddr, dev->name); 2161 #endif 2162 2163 e_inval: 2164 err = -EINVAL; 2165 goto out; 2166 2167 e_nobufs: 2168 err = -ENOBUFS; 2169 goto out; 2170 2171 martian_source: 2172 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr); 2173 goto out; 2174 } 2175 2176 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, 2177 u8 tos, struct net_device *dev) 2178 { 2179 struct fib_result res; 2180 int err; 2181 2182 tos &= IPTOS_RT_MASK; 2183 rcu_read_lock(); 2184 err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res); 2185 rcu_read_unlock(); 2186 2187 return err; 2188 } 2189 EXPORT_SYMBOL(ip_route_input_noref); 2190 2191 /* called with rcu_read_lock held */ 2192 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr, 2193 u8 tos, struct net_device *dev, struct fib_result *res) 2194 { 2195 /* Multicast recognition logic is moved from route cache to here. 2196 The problem was that too many Ethernet cards have broken/missing 2197 hardware multicast filters :-( As result the host on multicasting 2198 network acquires a lot of useless route cache entries, sort of 2199 SDR messages from all the world. Now we try to get rid of them. 2200 Really, provided software IP multicast filter is organized 2201 reasonably (at least, hashed), it does not result in a slowdown 2202 comparing with route cache reject entries. 2203 Note, that multicast routers are not affected, because 2204 route cache entry is created eventually. 
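
	   The code below therefore checks group membership with
	   ip_check_mc_rcu() on the receiving in_device (and, if the
	   device is an l3mdev slave, on the L3 master's in_device as
	   well), and hands the packet to ip_route_input_mc() only when
	   we are a member of the group or when multicast forwarding is
	   enabled for a non-link-local group; everything else falls
	   through to ip_route_input_slow().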
2205 */ 2206 if (ipv4_is_multicast(daddr)) { 2207 struct in_device *in_dev = __in_dev_get_rcu(dev); 2208 int our = 0; 2209 int err = -EINVAL; 2210 2211 if (!in_dev) 2212 return err; 2213 our = ip_check_mc_rcu(in_dev, daddr, saddr, 2214 ip_hdr(skb)->protocol); 2215 2216 /* check l3 master if no match yet */ 2217 if (!our && netif_is_l3_slave(dev)) { 2218 struct in_device *l3_in_dev; 2219 2220 l3_in_dev = __in_dev_get_rcu(skb->dev); 2221 if (l3_in_dev) 2222 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr, 2223 ip_hdr(skb)->protocol); 2224 } 2225 2226 if (our 2227 #ifdef CONFIG_IP_MROUTE 2228 || 2229 (!ipv4_is_local_multicast(daddr) && 2230 IN_DEV_MFORWARD(in_dev)) 2231 #endif 2232 ) { 2233 err = ip_route_input_mc(skb, daddr, saddr, 2234 tos, dev, our); 2235 } 2236 return err; 2237 } 2238 2239 return ip_route_input_slow(skb, daddr, saddr, tos, dev, res); 2240 } 2241 2242 /* called with rcu_read_lock() */ 2243 static struct rtable *__mkroute_output(const struct fib_result *res, 2244 const struct flowi4 *fl4, int orig_oif, 2245 struct net_device *dev_out, 2246 unsigned int flags) 2247 { 2248 struct fib_info *fi = res->fi; 2249 struct fib_nh_exception *fnhe; 2250 struct in_device *in_dev; 2251 u16 type = res->type; 2252 struct rtable *rth; 2253 bool do_cache; 2254 2255 in_dev = __in_dev_get_rcu(dev_out); 2256 if (!in_dev) 2257 return ERR_PTR(-EINVAL); 2258 2259 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) 2260 if (ipv4_is_loopback(fl4->saddr) && 2261 !(dev_out->flags & IFF_LOOPBACK) && 2262 !netif_is_l3_master(dev_out)) 2263 return ERR_PTR(-EINVAL); 2264 2265 if (ipv4_is_lbcast(fl4->daddr)) 2266 type = RTN_BROADCAST; 2267 else if (ipv4_is_multicast(fl4->daddr)) 2268 type = RTN_MULTICAST; 2269 else if (ipv4_is_zeronet(fl4->daddr)) 2270 return ERR_PTR(-EINVAL); 2271 2272 if (dev_out->flags & IFF_LOOPBACK) 2273 flags |= RTCF_LOCAL; 2274 2275 do_cache = true; 2276 if (type == RTN_BROADCAST) { 2277 flags |= RTCF_BROADCAST | RTCF_LOCAL; 2278 fi = NULL; 2279 } else if (type == RTN_MULTICAST) { 2280 flags |= RTCF_MULTICAST | RTCF_LOCAL; 2281 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr, 2282 fl4->flowi4_proto)) 2283 flags &= ~RTCF_LOCAL; 2284 else 2285 do_cache = false; 2286 /* If multicast route do not exist use 2287 * default one, but do not gateway in this case. 2288 * Yes, it is hack. 2289 */ 2290 if (fi && res->prefixlen < 4) 2291 fi = NULL; 2292 } else if ((type == RTN_LOCAL) && (orig_oif != 0) && 2293 (orig_oif != dev_out->ifindex)) { 2294 /* For local routes that require a particular output interface 2295 * we do not want to cache the result. Caching the result 2296 * causes incorrect behaviour when there are multiple source 2297 * addresses on the interface, the end result being that if the 2298 * intended recipient is waiting on that interface for the 2299 * packet he won't receive it because it will be delivered on 2300 * the loopback interface and the IP_PKTINFO ipi_ifindex will 2301 * be set to the loopback interface as well. 
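 *
 * Clearing do_cache below therefore makes this lookup bypass the
 * per-nexthop output route cache entirely, so every such request is
 * answered with a freshly allocated rtable that ends up on the
 * uncached list instead.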
2302 */ 2303 do_cache = false; 2304 } 2305 2306 fnhe = NULL; 2307 do_cache &= fi != NULL; 2308 if (fi) { 2309 struct fib_nh_common *nhc = FIB_RES_NHC(*res); 2310 struct fib_nh *nh = container_of(nhc, struct fib_nh, nh_common); 2311 struct rtable __rcu **prth; 2312 2313 fnhe = find_exception(nh, fl4->daddr); 2314 if (!do_cache) 2315 goto add; 2316 if (fnhe) { 2317 prth = &fnhe->fnhe_rth_output; 2318 } else { 2319 if (unlikely(fl4->flowi4_flags & 2320 FLOWI_FLAG_KNOWN_NH && 2321 !(nhc->nhc_gw_family && 2322 nhc->nhc_scope == RT_SCOPE_LINK))) { 2323 do_cache = false; 2324 goto add; 2325 } 2326 prth = raw_cpu_ptr(nh->nh_pcpu_rth_output); 2327 } 2328 rth = rcu_dereference(*prth); 2329 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst)) 2330 return rth; 2331 } 2332 2333 add: 2334 rth = rt_dst_alloc(dev_out, flags, type, 2335 IN_DEV_CONF_GET(in_dev, NOPOLICY), 2336 IN_DEV_CONF_GET(in_dev, NOXFRM), 2337 do_cache); 2338 if (!rth) 2339 return ERR_PTR(-ENOBUFS); 2340 2341 rth->rt_iif = orig_oif; 2342 2343 RT_CACHE_STAT_INC(out_slow_tot); 2344 2345 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { 2346 if (flags & RTCF_LOCAL && 2347 !(dev_out->flags & IFF_LOOPBACK)) { 2348 rth->dst.output = ip_mc_output; 2349 RT_CACHE_STAT_INC(out_slow_mc); 2350 } 2351 #ifdef CONFIG_IP_MROUTE 2352 if (type == RTN_MULTICAST) { 2353 if (IN_DEV_MFORWARD(in_dev) && 2354 !ipv4_is_local_multicast(fl4->daddr)) { 2355 rth->dst.input = ip_mr_input; 2356 rth->dst.output = ip_mc_output; 2357 } 2358 } 2359 #endif 2360 } 2361 2362 rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache); 2363 lwtunnel_set_redirect(&rth->dst); 2364 2365 return rth; 2366 } 2367 2368 /* 2369 * Major route resolver routine. 2370 */ 2371 2372 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, 2373 const struct sk_buff *skb) 2374 { 2375 __u8 tos = RT_FL_TOS(fl4); 2376 struct fib_result res = { 2377 .type = RTN_UNSPEC, 2378 .fi = NULL, 2379 .table = NULL, 2380 .tclassid = 0, 2381 }; 2382 struct rtable *rth; 2383 2384 fl4->flowi4_iif = LOOPBACK_IFINDEX; 2385 fl4->flowi4_tos = tos & IPTOS_RT_MASK; 2386 fl4->flowi4_scope = ((tos & RTO_ONLINK) ? 2387 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE); 2388 2389 rcu_read_lock(); 2390 rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb); 2391 rcu_read_unlock(); 2392 2393 return rth; 2394 } 2395 EXPORT_SYMBOL_GPL(ip_route_output_key_hash); 2396 2397 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4, 2398 struct fib_result *res, 2399 const struct sk_buff *skb) 2400 { 2401 struct net_device *dev_out = NULL; 2402 int orig_oif = fl4->flowi4_oif; 2403 unsigned int flags = 0; 2404 struct rtable *rth; 2405 int err = -ENETUNREACH; 2406 2407 if (fl4->saddr) { 2408 rth = ERR_PTR(-EINVAL); 2409 if (ipv4_is_multicast(fl4->saddr) || 2410 ipv4_is_lbcast(fl4->saddr) || 2411 ipv4_is_zeronet(fl4->saddr)) 2412 goto out; 2413 2414 /* I removed check for oif == dev_out->oif here. 2415 It was wrong for two reasons: 2416 1. ip_dev_find(net, saddr) can return wrong iface, if saddr 2417 is assigned to multiple interfaces. 2418 2. Moreover, we are allowed to send packets with saddr 2419 of another iface. 
--ANK 2420 */ 2421 2422 if (fl4->flowi4_oif == 0 && 2423 (ipv4_is_multicast(fl4->daddr) || 2424 ipv4_is_lbcast(fl4->daddr))) { 2425 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ 2426 dev_out = __ip_dev_find(net, fl4->saddr, false); 2427 if (!dev_out) 2428 goto out; 2429 2430 /* Special hack: user can direct multicasts 2431 and limited broadcast via necessary interface 2432 without fiddling with IP_MULTICAST_IF or IP_PKTINFO. 2433 This hack is not just for fun, it allows 2434 vic,vat and friends to work. 2435 They bind socket to loopback, set ttl to zero 2436 and expect that it will work. 2437 From the viewpoint of routing cache they are broken, 2438 because we are not allowed to build multicast path 2439 with loopback source addr (look, routing cache 2440 cannot know, that ttl is zero, so that packet 2441 will not leave this host and route is valid). 2442 Luckily, this hack is good workaround. 2443 */ 2444 2445 fl4->flowi4_oif = dev_out->ifindex; 2446 goto make_route; 2447 } 2448 2449 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) { 2450 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ 2451 if (!__ip_dev_find(net, fl4->saddr, false)) 2452 goto out; 2453 } 2454 } 2455 2456 2457 if (fl4->flowi4_oif) { 2458 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif); 2459 rth = ERR_PTR(-ENODEV); 2460 if (!dev_out) 2461 goto out; 2462 2463 /* RACE: Check return value of inet_select_addr instead. */ 2464 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) { 2465 rth = ERR_PTR(-ENETUNREACH); 2466 goto out; 2467 } 2468 if (ipv4_is_local_multicast(fl4->daddr) || 2469 ipv4_is_lbcast(fl4->daddr) || 2470 fl4->flowi4_proto == IPPROTO_IGMP) { 2471 if (!fl4->saddr) 2472 fl4->saddr = inet_select_addr(dev_out, 0, 2473 RT_SCOPE_LINK); 2474 goto make_route; 2475 } 2476 if (!fl4->saddr) { 2477 if (ipv4_is_multicast(fl4->daddr)) 2478 fl4->saddr = inet_select_addr(dev_out, 0, 2479 fl4->flowi4_scope); 2480 else if (!fl4->daddr) 2481 fl4->saddr = inet_select_addr(dev_out, 0, 2482 RT_SCOPE_HOST); 2483 } 2484 } 2485 2486 if (!fl4->daddr) { 2487 fl4->daddr = fl4->saddr; 2488 if (!fl4->daddr) 2489 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK); 2490 dev_out = net->loopback_dev; 2491 fl4->flowi4_oif = LOOPBACK_IFINDEX; 2492 res->type = RTN_LOCAL; 2493 flags |= RTCF_LOCAL; 2494 goto make_route; 2495 } 2496 2497 err = fib_lookup(net, fl4, res, 0); 2498 if (err) { 2499 res->fi = NULL; 2500 res->table = NULL; 2501 if (fl4->flowi4_oif && 2502 (ipv4_is_multicast(fl4->daddr) || 2503 !netif_index_is_l3_master(net, fl4->flowi4_oif))) { 2504 /* Apparently, routing tables are wrong. Assume, 2505 that the destination is on link. 2506 2507 WHY? DW. 2508 Because we are allowed to send to iface 2509 even if it has NO routes and NO assigned 2510 addresses. When oif is specified, routing 2511 tables are looked up with only one purpose: 2512 to catch if destination is gatewayed, rather than 2513 direct. Moreover, if MSG_DONTROUTE is set, 2514 we send packet, ignoring both routing tables 2515 and ifaddr state. --ANK 2516 2517 2518 We could make it even if oif is unknown, 2519 likely IPv6, but we do not. 
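
	   So, when the lookup fails but an output interface was specified
	   (and that interface is not an l3mdev master, unless the
	   destination is multicast), the code below assumes the
	   destination is directly connected: it picks a link-scope source
	   address from dev_out if none was given and goes on to build an
	   RTN_UNICAST route.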
2520 */ 2521 2522 if (fl4->saddr == 0) 2523 fl4->saddr = inet_select_addr(dev_out, 0, 2524 RT_SCOPE_LINK); 2525 res->type = RTN_UNICAST; 2526 goto make_route; 2527 } 2528 rth = ERR_PTR(err); 2529 goto out; 2530 } 2531 2532 if (res->type == RTN_LOCAL) { 2533 if (!fl4->saddr) { 2534 if (res->fi->fib_prefsrc) 2535 fl4->saddr = res->fi->fib_prefsrc; 2536 else 2537 fl4->saddr = fl4->daddr; 2538 } 2539 2540 /* L3 master device is the loopback for that domain */ 2541 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? : 2542 net->loopback_dev; 2543 2544 /* make sure orig_oif points to fib result device even 2545 * though packet rx/tx happens over loopback or l3mdev 2546 */ 2547 orig_oif = FIB_RES_OIF(*res); 2548 2549 fl4->flowi4_oif = dev_out->ifindex; 2550 flags |= RTCF_LOCAL; 2551 goto make_route; 2552 } 2553 2554 fib_select_path(net, res, fl4, skb); 2555 2556 dev_out = FIB_RES_DEV(*res); 2557 fl4->flowi4_oif = dev_out->ifindex; 2558 2559 2560 make_route: 2561 rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags); 2562 2563 out: 2564 return rth; 2565 } 2566 2567 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie) 2568 { 2569 return NULL; 2570 } 2571 2572 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst) 2573 { 2574 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); 2575 2576 return mtu ? : dst->dev->mtu; 2577 } 2578 2579 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk, 2580 struct sk_buff *skb, u32 mtu) 2581 { 2582 } 2583 2584 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk, 2585 struct sk_buff *skb) 2586 { 2587 } 2588 2589 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst, 2590 unsigned long old) 2591 { 2592 return NULL; 2593 } 2594 2595 static struct dst_ops ipv4_dst_blackhole_ops = { 2596 .family = AF_INET, 2597 .check = ipv4_blackhole_dst_check, 2598 .mtu = ipv4_blackhole_mtu, 2599 .default_advmss = ipv4_default_advmss, 2600 .update_pmtu = ipv4_rt_blackhole_update_pmtu, 2601 .redirect = ipv4_rt_blackhole_redirect, 2602 .cow_metrics = ipv4_rt_blackhole_cow_metrics, 2603 .neigh_lookup = ipv4_neigh_lookup, 2604 }; 2605 2606 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig) 2607 { 2608 struct rtable *ort = (struct rtable *) dst_orig; 2609 struct rtable *rt; 2610 2611 rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0); 2612 if (rt) { 2613 struct dst_entry *new = &rt->dst; 2614 2615 new->__use = 1; 2616 new->input = dst_discard; 2617 new->output = dst_discard_out; 2618 2619 new->dev = net->loopback_dev; 2620 if (new->dev) 2621 dev_hold(new->dev); 2622 2623 rt->rt_is_input = ort->rt_is_input; 2624 rt->rt_iif = ort->rt_iif; 2625 rt->rt_pmtu = ort->rt_pmtu; 2626 rt->rt_mtu_locked = ort->rt_mtu_locked; 2627 2628 rt->rt_genid = rt_genid_ipv4(net); 2629 rt->rt_flags = ort->rt_flags; 2630 rt->rt_type = ort->rt_type; 2631 rt->rt_gw_family = ort->rt_gw_family; 2632 if (rt->rt_gw_family == AF_INET) 2633 rt->rt_gw4 = ort->rt_gw4; 2634 else if (rt->rt_gw_family == AF_INET6) 2635 rt->rt_gw6 = ort->rt_gw6; 2636 2637 INIT_LIST_HEAD(&rt->rt_uncached); 2638 } 2639 2640 dst_release(dst_orig); 2641 2642 return rt ? 
&rt->dst : ERR_PTR(-ENOMEM); 2643 } 2644 2645 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, 2646 const struct sock *sk) 2647 { 2648 struct rtable *rt = __ip_route_output_key(net, flp4); 2649 2650 if (IS_ERR(rt)) 2651 return rt; 2652 2653 if (flp4->flowi4_proto) 2654 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst, 2655 flowi4_to_flowi(flp4), 2656 sk, 0); 2657 2658 return rt; 2659 } 2660 EXPORT_SYMBOL_GPL(ip_route_output_flow); 2661 2662 /* called with rcu_read_lock held */ 2663 static int rt_fill_info(struct net *net, __be32 dst, __be32 src, 2664 struct rtable *rt, u32 table_id, struct flowi4 *fl4, 2665 struct sk_buff *skb, u32 portid, u32 seq) 2666 { 2667 struct rtmsg *r; 2668 struct nlmsghdr *nlh; 2669 unsigned long expires = 0; 2670 u32 error; 2671 u32 metrics[RTAX_MAX]; 2672 2673 nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0); 2674 if (!nlh) 2675 return -EMSGSIZE; 2676 2677 r = nlmsg_data(nlh); 2678 r->rtm_family = AF_INET; 2679 r->rtm_dst_len = 32; 2680 r->rtm_src_len = 0; 2681 r->rtm_tos = fl4->flowi4_tos; 2682 r->rtm_table = table_id < 256 ? table_id : RT_TABLE_COMPAT; 2683 if (nla_put_u32(skb, RTA_TABLE, table_id)) 2684 goto nla_put_failure; 2685 r->rtm_type = rt->rt_type; 2686 r->rtm_scope = RT_SCOPE_UNIVERSE; 2687 r->rtm_protocol = RTPROT_UNSPEC; 2688 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED; 2689 if (rt->rt_flags & RTCF_NOTIFY) 2690 r->rtm_flags |= RTM_F_NOTIFY; 2691 if (IPCB(skb)->flags & IPSKB_DOREDIRECT) 2692 r->rtm_flags |= RTCF_DOREDIRECT; 2693 2694 if (nla_put_in_addr(skb, RTA_DST, dst)) 2695 goto nla_put_failure; 2696 if (src) { 2697 r->rtm_src_len = 32; 2698 if (nla_put_in_addr(skb, RTA_SRC, src)) 2699 goto nla_put_failure; 2700 } 2701 if (rt->dst.dev && 2702 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex)) 2703 goto nla_put_failure; 2704 #ifdef CONFIG_IP_ROUTE_CLASSID 2705 if (rt->dst.tclassid && 2706 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid)) 2707 goto nla_put_failure; 2708 #endif 2709 if (!rt_is_input_route(rt) && 2710 fl4->saddr != src) { 2711 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr)) 2712 goto nla_put_failure; 2713 } 2714 if (rt->rt_gw_family == AF_INET && 2715 nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) { 2716 goto nla_put_failure; 2717 } else if (rt->rt_gw_family == AF_INET6) { 2718 int alen = sizeof(struct in6_addr); 2719 struct nlattr *nla; 2720 struct rtvia *via; 2721 2722 nla = nla_reserve(skb, RTA_VIA, alen + 2); 2723 if (!nla) 2724 goto nla_put_failure; 2725 2726 via = nla_data(nla); 2727 via->rtvia_family = AF_INET6; 2728 memcpy(via->rtvia_addr, &rt->rt_gw6, alen); 2729 } 2730 2731 expires = rt->dst.expires; 2732 if (expires) { 2733 unsigned long now = jiffies; 2734 2735 if (time_before(now, expires)) 2736 expires -= now; 2737 else 2738 expires = 0; 2739 } 2740 2741 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics)); 2742 if (rt->rt_pmtu && expires) 2743 metrics[RTAX_MTU - 1] = rt->rt_pmtu; 2744 if (rt->rt_mtu_locked && expires) 2745 metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU); 2746 if (rtnetlink_put_metrics(skb, metrics) < 0) 2747 goto nla_put_failure; 2748 2749 if (fl4->flowi4_mark && 2750 nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark)) 2751 goto nla_put_failure; 2752 2753 if (!uid_eq(fl4->flowi4_uid, INVALID_UID) && 2754 nla_put_u32(skb, RTA_UID, 2755 from_kuid_munged(current_user_ns(), fl4->flowi4_uid))) 2756 goto nla_put_failure; 2757 2758 error = rt->dst.error; 2759 2760 if (rt_is_input_route(rt)) { 2761 #ifdef CONFIG_IP_MROUTE 2762 if (ipv4_is_multicast(dst) && 
!ipv4_is_local_multicast(dst) && 2763 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) { 2764 int err = ipmr_get_route(net, skb, 2765 fl4->saddr, fl4->daddr, 2766 r, portid); 2767 2768 if (err <= 0) { 2769 if (err == 0) 2770 return 0; 2771 goto nla_put_failure; 2772 } 2773 } else 2774 #endif 2775 if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif)) 2776 goto nla_put_failure; 2777 } 2778 2779 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0) 2780 goto nla_put_failure; 2781 2782 nlmsg_end(skb, nlh); 2783 return 0; 2784 2785 nla_put_failure: 2786 nlmsg_cancel(skb, nlh); 2787 return -EMSGSIZE; 2788 } 2789 2790 static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst, 2791 u8 ip_proto, __be16 sport, 2792 __be16 dport) 2793 { 2794 struct sk_buff *skb; 2795 struct iphdr *iph; 2796 2797 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 2798 if (!skb) 2799 return NULL; 2800 2801 /* Reserve room for dummy headers, this skb can pass 2802 * through good chunk of routing engine. 2803 */ 2804 skb_reset_mac_header(skb); 2805 skb_reset_network_header(skb); 2806 skb->protocol = htons(ETH_P_IP); 2807 iph = skb_put(skb, sizeof(struct iphdr)); 2808 iph->protocol = ip_proto; 2809 iph->saddr = src; 2810 iph->daddr = dst; 2811 iph->version = 0x4; 2812 iph->frag_off = 0; 2813 iph->ihl = 0x5; 2814 skb_set_transport_header(skb, skb->len); 2815 2816 switch (iph->protocol) { 2817 case IPPROTO_UDP: { 2818 struct udphdr *udph; 2819 2820 udph = skb_put_zero(skb, sizeof(struct udphdr)); 2821 udph->source = sport; 2822 udph->dest = dport; 2823 udph->len = sizeof(struct udphdr); 2824 udph->check = 0; 2825 break; 2826 } 2827 case IPPROTO_TCP: { 2828 struct tcphdr *tcph; 2829 2830 tcph = skb_put_zero(skb, sizeof(struct tcphdr)); 2831 tcph->source = sport; 2832 tcph->dest = dport; 2833 tcph->doff = sizeof(struct tcphdr) / 4; 2834 tcph->rst = 1; 2835 tcph->check = ~tcp_v4_check(sizeof(struct tcphdr), 2836 src, dst, 0); 2837 break; 2838 } 2839 case IPPROTO_ICMP: { 2840 struct icmphdr *icmph; 2841 2842 icmph = skb_put_zero(skb, sizeof(struct icmphdr)); 2843 icmph->type = ICMP_ECHO; 2844 icmph->code = 0; 2845 } 2846 } 2847 2848 return skb; 2849 } 2850 2851 static int inet_rtm_valid_getroute_req(struct sk_buff *skb, 2852 const struct nlmsghdr *nlh, 2853 struct nlattr **tb, 2854 struct netlink_ext_ack *extack) 2855 { 2856 struct rtmsg *rtm; 2857 int i, err; 2858 2859 if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { 2860 NL_SET_ERR_MSG(extack, 2861 "ipv4: Invalid header for route get request"); 2862 return -EINVAL; 2863 } 2864 2865 if (!netlink_strict_get_check(skb)) 2866 return nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, 2867 rtm_ipv4_policy, extack); 2868 2869 rtm = nlmsg_data(nlh); 2870 if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) || 2871 (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) || 2872 rtm->rtm_table || rtm->rtm_protocol || 2873 rtm->rtm_scope || rtm->rtm_type) { 2874 NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request"); 2875 return -EINVAL; 2876 } 2877 2878 if (rtm->rtm_flags & ~(RTM_F_NOTIFY | 2879 RTM_F_LOOKUP_TABLE | 2880 RTM_F_FIB_MATCH)) { 2881 NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request"); 2882 return -EINVAL; 2883 } 2884 2885 err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX, 2886 rtm_ipv4_policy, extack); 2887 if (err) 2888 return err; 2889 2890 if ((tb[RTA_SRC] && !rtm->rtm_src_len) || 2891 (tb[RTA_DST] && !rtm->rtm_dst_len)) { 2892 NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4"); 2893 return -EINVAL; 2894 
} 2895 2896 for (i = 0; i <= RTA_MAX; i++) { 2897 if (!tb[i]) 2898 continue; 2899 2900 switch (i) { 2901 case RTA_IIF: 2902 case RTA_OIF: 2903 case RTA_SRC: 2904 case RTA_DST: 2905 case RTA_IP_PROTO: 2906 case RTA_SPORT: 2907 case RTA_DPORT: 2908 case RTA_MARK: 2909 case RTA_UID: 2910 break; 2911 default: 2912 NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request"); 2913 return -EINVAL; 2914 } 2915 } 2916 2917 return 0; 2918 } 2919 2920 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, 2921 struct netlink_ext_ack *extack) 2922 { 2923 struct net *net = sock_net(in_skb->sk); 2924 struct nlattr *tb[RTA_MAX+1]; 2925 u32 table_id = RT_TABLE_MAIN; 2926 __be16 sport = 0, dport = 0; 2927 struct fib_result res = {}; 2928 u8 ip_proto = IPPROTO_UDP; 2929 struct rtable *rt = NULL; 2930 struct sk_buff *skb; 2931 struct rtmsg *rtm; 2932 struct flowi4 fl4 = {}; 2933 __be32 dst = 0; 2934 __be32 src = 0; 2935 kuid_t uid; 2936 u32 iif; 2937 int err; 2938 int mark; 2939 2940 err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack); 2941 if (err < 0) 2942 return err; 2943 2944 rtm = nlmsg_data(nlh); 2945 src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0; 2946 dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0; 2947 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0; 2948 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0; 2949 if (tb[RTA_UID]) 2950 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID])); 2951 else 2952 uid = (iif ? INVALID_UID : current_uid()); 2953 2954 if (tb[RTA_IP_PROTO]) { 2955 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO], 2956 &ip_proto, AF_INET, extack); 2957 if (err) 2958 return err; 2959 } 2960 2961 if (tb[RTA_SPORT]) 2962 sport = nla_get_be16(tb[RTA_SPORT]); 2963 2964 if (tb[RTA_DPORT]) 2965 dport = nla_get_be16(tb[RTA_DPORT]); 2966 2967 skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport); 2968 if (!skb) 2969 return -ENOBUFS; 2970 2971 fl4.daddr = dst; 2972 fl4.saddr = src; 2973 fl4.flowi4_tos = rtm->rtm_tos; 2974 fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0; 2975 fl4.flowi4_mark = mark; 2976 fl4.flowi4_uid = uid; 2977 if (sport) 2978 fl4.fl4_sport = sport; 2979 if (dport) 2980 fl4.fl4_dport = dport; 2981 fl4.flowi4_proto = ip_proto; 2982 2983 rcu_read_lock(); 2984 2985 if (iif) { 2986 struct net_device *dev; 2987 2988 dev = dev_get_by_index_rcu(net, iif); 2989 if (!dev) { 2990 err = -ENODEV; 2991 goto errout_rcu; 2992 } 2993 2994 fl4.flowi4_iif = iif; /* for rt_fill_info */ 2995 skb->dev = dev; 2996 skb->mark = mark; 2997 err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos, 2998 dev, &res); 2999 3000 rt = skb_rtable(skb); 3001 if (err == 0 && rt->dst.error) 3002 err = -rt->dst.error; 3003 } else { 3004 fl4.flowi4_iif = LOOPBACK_IFINDEX; 3005 skb->dev = net->loopback_dev; 3006 rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb); 3007 err = 0; 3008 if (IS_ERR(rt)) 3009 err = PTR_ERR(rt); 3010 else 3011 skb_dst_set(skb, &rt->dst); 3012 } 3013 3014 if (err) 3015 goto errout_rcu; 3016 3017 if (rtm->rtm_flags & RTM_F_NOTIFY) 3018 rt->rt_flags |= RTCF_NOTIFY; 3019 3020 if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE) 3021 table_id = res.table ? 
res.table->tb_id : 0; 3022 3023 /* reset skb for netlink reply msg */ 3024 skb_trim(skb, 0); 3025 skb_reset_network_header(skb); 3026 skb_reset_transport_header(skb); 3027 skb_reset_mac_header(skb); 3028 3029 if (rtm->rtm_flags & RTM_F_FIB_MATCH) { 3030 if (!res.fi) { 3031 err = fib_props[res.type].error; 3032 if (!err) 3033 err = -EHOSTUNREACH; 3034 goto errout_rcu; 3035 } 3036 err = fib_dump_info(skb, NETLINK_CB(in_skb).portid, 3037 nlh->nlmsg_seq, RTM_NEWROUTE, table_id, 3038 rt->rt_type, res.prefix, res.prefixlen, 3039 fl4.flowi4_tos, res.fi, 0); 3040 } else { 3041 err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb, 3042 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq); 3043 } 3044 if (err < 0) 3045 goto errout_rcu; 3046 3047 rcu_read_unlock(); 3048 3049 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); 3050 3051 errout_free: 3052 return err; 3053 errout_rcu: 3054 rcu_read_unlock(); 3055 kfree_skb(skb); 3056 goto errout_free; 3057 } 3058 3059 void ip_rt_multicast_event(struct in_device *in_dev) 3060 { 3061 rt_cache_flush(dev_net(in_dev->dev)); 3062 } 3063 3064 #ifdef CONFIG_SYSCTL 3065 static int ip_rt_gc_interval __read_mostly = 60 * HZ; 3066 static int ip_rt_gc_min_interval __read_mostly = HZ / 2; 3067 static int ip_rt_gc_elasticity __read_mostly = 8; 3068 static int ip_min_valid_pmtu __read_mostly = IPV4_MIN_MTU; 3069 3070 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write, 3071 void __user *buffer, 3072 size_t *lenp, loff_t *ppos) 3073 { 3074 struct net *net = (struct net *)__ctl->extra1; 3075 3076 if (write) { 3077 rt_cache_flush(net); 3078 fnhe_genid_bump(net); 3079 return 0; 3080 } 3081 3082 return -EINVAL; 3083 } 3084 3085 static struct ctl_table ipv4_route_table[] = { 3086 { 3087 .procname = "gc_thresh", 3088 .data = &ipv4_dst_ops.gc_thresh, 3089 .maxlen = sizeof(int), 3090 .mode = 0644, 3091 .proc_handler = proc_dointvec, 3092 }, 3093 { 3094 .procname = "max_size", 3095 .data = &ip_rt_max_size, 3096 .maxlen = sizeof(int), 3097 .mode = 0644, 3098 .proc_handler = proc_dointvec, 3099 }, 3100 { 3101 /* Deprecated. 
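The value is exposed to userspace in whole seconds only.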
Use gc_min_interval_ms */ 3102 3103 .procname = "gc_min_interval", 3104 .data = &ip_rt_gc_min_interval, 3105 .maxlen = sizeof(int), 3106 .mode = 0644, 3107 .proc_handler = proc_dointvec_jiffies, 3108 }, 3109 { 3110 .procname = "gc_min_interval_ms", 3111 .data = &ip_rt_gc_min_interval, 3112 .maxlen = sizeof(int), 3113 .mode = 0644, 3114 .proc_handler = proc_dointvec_ms_jiffies, 3115 }, 3116 { 3117 .procname = "gc_timeout", 3118 .data = &ip_rt_gc_timeout, 3119 .maxlen = sizeof(int), 3120 .mode = 0644, 3121 .proc_handler = proc_dointvec_jiffies, 3122 }, 3123 { 3124 .procname = "gc_interval", 3125 .data = &ip_rt_gc_interval, 3126 .maxlen = sizeof(int), 3127 .mode = 0644, 3128 .proc_handler = proc_dointvec_jiffies, 3129 }, 3130 { 3131 .procname = "redirect_load", 3132 .data = &ip_rt_redirect_load, 3133 .maxlen = sizeof(int), 3134 .mode = 0644, 3135 .proc_handler = proc_dointvec, 3136 }, 3137 { 3138 .procname = "redirect_number", 3139 .data = &ip_rt_redirect_number, 3140 .maxlen = sizeof(int), 3141 .mode = 0644, 3142 .proc_handler = proc_dointvec, 3143 }, 3144 { 3145 .procname = "redirect_silence", 3146 .data = &ip_rt_redirect_silence, 3147 .maxlen = sizeof(int), 3148 .mode = 0644, 3149 .proc_handler = proc_dointvec, 3150 }, 3151 { 3152 .procname = "error_cost", 3153 .data = &ip_rt_error_cost, 3154 .maxlen = sizeof(int), 3155 .mode = 0644, 3156 .proc_handler = proc_dointvec, 3157 }, 3158 { 3159 .procname = "error_burst", 3160 .data = &ip_rt_error_burst, 3161 .maxlen = sizeof(int), 3162 .mode = 0644, 3163 .proc_handler = proc_dointvec, 3164 }, 3165 { 3166 .procname = "gc_elasticity", 3167 .data = &ip_rt_gc_elasticity, 3168 .maxlen = sizeof(int), 3169 .mode = 0644, 3170 .proc_handler = proc_dointvec, 3171 }, 3172 { 3173 .procname = "mtu_expires", 3174 .data = &ip_rt_mtu_expires, 3175 .maxlen = sizeof(int), 3176 .mode = 0644, 3177 .proc_handler = proc_dointvec_jiffies, 3178 }, 3179 { 3180 .procname = "min_pmtu", 3181 .data = &ip_rt_min_pmtu, 3182 .maxlen = sizeof(int), 3183 .mode = 0644, 3184 .proc_handler = proc_dointvec_minmax, 3185 .extra1 = &ip_min_valid_pmtu, 3186 }, 3187 { 3188 .procname = "min_adv_mss", 3189 .data = &ip_rt_min_advmss, 3190 .maxlen = sizeof(int), 3191 .mode = 0644, 3192 .proc_handler = proc_dointvec, 3193 }, 3194 { } 3195 }; 3196 3197 static struct ctl_table ipv4_route_flush_table[] = { 3198 { 3199 .procname = "flush", 3200 .maxlen = sizeof(int), 3201 .mode = 0200, 3202 .proc_handler = ipv4_sysctl_rtcache_flush, 3203 }, 3204 { }, 3205 }; 3206 3207 static __net_init int sysctl_route_net_init(struct net *net) 3208 { 3209 struct ctl_table *tbl; 3210 3211 tbl = ipv4_route_flush_table; 3212 if (!net_eq(net, &init_net)) { 3213 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL); 3214 if (!tbl) 3215 goto err_dup; 3216 3217 /* Don't export sysctls to unprivileged users */ 3218 if (net->user_ns != &init_user_ns) 3219 tbl[0].procname = NULL; 3220 } 3221 tbl[0].extra1 = net; 3222 3223 net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl); 3224 if (!net->ipv4.route_hdr) 3225 goto err_reg; 3226 return 0; 3227 3228 err_reg: 3229 if (tbl != ipv4_route_flush_table) 3230 kfree(tbl); 3231 err_dup: 3232 return -ENOMEM; 3233 } 3234 3235 static __net_exit void sysctl_route_net_exit(struct net *net) 3236 { 3237 struct ctl_table *tbl; 3238 3239 tbl = net->ipv4.route_hdr->ctl_table_arg; 3240 unregister_net_sysctl_table(net->ipv4.route_hdr); 3241 BUG_ON(tbl == ipv4_route_flush_table); 3242 kfree(tbl); 3243 } 3244 3245 static __net_initdata struct pernet_operations 
sysctl_route_ops = { 3246 .init = sysctl_route_net_init, 3247 .exit = sysctl_route_net_exit, 3248 }; 3249 #endif 3250 3251 static __net_init int rt_genid_init(struct net *net) 3252 { 3253 atomic_set(&net->ipv4.rt_genid, 0); 3254 atomic_set(&net->fnhe_genid, 0); 3255 atomic_set(&net->ipv4.dev_addr_genid, get_random_int()); 3256 return 0; 3257 } 3258 3259 static __net_initdata struct pernet_operations rt_genid_ops = { 3260 .init = rt_genid_init, 3261 }; 3262 3263 static int __net_init ipv4_inetpeer_init(struct net *net) 3264 { 3265 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL); 3266 3267 if (!bp) 3268 return -ENOMEM; 3269 inet_peer_base_init(bp); 3270 net->ipv4.peers = bp; 3271 return 0; 3272 } 3273 3274 static void __net_exit ipv4_inetpeer_exit(struct net *net) 3275 { 3276 struct inet_peer_base *bp = net->ipv4.peers; 3277 3278 net->ipv4.peers = NULL; 3279 inetpeer_invalidate_tree(bp); 3280 kfree(bp); 3281 } 3282 3283 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = { 3284 .init = ipv4_inetpeer_init, 3285 .exit = ipv4_inetpeer_exit, 3286 }; 3287 3288 #ifdef CONFIG_IP_ROUTE_CLASSID 3289 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; 3290 #endif /* CONFIG_IP_ROUTE_CLASSID */ 3291 3292 int __init ip_rt_init(void) 3293 { 3294 int cpu; 3295 3296 ip_idents = kmalloc_array(IP_IDENTS_SZ, sizeof(*ip_idents), 3297 GFP_KERNEL); 3298 if (!ip_idents) 3299 panic("IP: failed to allocate ip_idents\n"); 3300 3301 prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents)); 3302 3303 ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL); 3304 if (!ip_tstamps) 3305 panic("IP: failed to allocate ip_tstamps\n"); 3306 3307 for_each_possible_cpu(cpu) { 3308 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu); 3309 3310 INIT_LIST_HEAD(&ul->head); 3311 spin_lock_init(&ul->lock); 3312 } 3313 #ifdef CONFIG_IP_ROUTE_CLASSID 3314 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct)); 3315 if (!ip_rt_acct) 3316 panic("IP: failed to allocate ip_rt_acct\n"); 3317 #endif 3318 3319 ipv4_dst_ops.kmem_cachep = 3320 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0, 3321 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 3322 3323 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep; 3324 3325 if (dst_entries_init(&ipv4_dst_ops) < 0) 3326 panic("IP: failed to allocate ipv4_dst_ops counter\n"); 3327 3328 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0) 3329 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n"); 3330 3331 ipv4_dst_ops.gc_thresh = ~0; 3332 ip_rt_max_size = INT_MAX; 3333 3334 devinet_init(); 3335 ip_fib_init(); 3336 3337 if (ip_rt_proc_init()) 3338 pr_err("Unable to create route proc files\n"); 3339 #ifdef CONFIG_XFRM 3340 xfrm_init(); 3341 xfrm4_init(); 3342 #endif 3343 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, 3344 RTNL_FLAG_DOIT_UNLOCKED); 3345 3346 #ifdef CONFIG_SYSCTL 3347 register_pernet_subsys(&sysctl_route_ops); 3348 #endif 3349 register_pernet_subsys(&rt_genid_ops); 3350 register_pernet_subsys(&ipv4_inetpeer_ops); 3351 return 0; 3352 } 3353 3354 #ifdef CONFIG_SYSCTL 3355 /* 3356 * We really need to sanitize the damn ipv4 init order, then all 3357 * this nonsense will go away. 3358 */ 3359 void __init ip_static_sysctl_init(void) 3360 { 3361 register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table); 3362 } 3363 #endif 3364
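
/*
 * The sysctl tables registered above surface these knobs under
 * /proc/sys/net/ipv4/route/.  A minimal userspace sketch, shown here
 * for illustration only and not compiled as part of this file or of
 * the kernel, that flushes the per-namespace routing cache through the
 * write-only "flush" file registered by sysctl_route_net_init():
 *
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		int fd = open("/proc/sys/net/ipv4/route/flush", O_WRONLY);
 *
 *		if (fd < 0) {
 *			perror("open");
 *			return 1;
 *		}
 *		if (write(fd, "1\n", 2) != 2) {
 *			perror("write");
 *			close(fd);
 *			return 1;
 *		}
 *		close(fd);
 *		return 0;
 *	}
 *
 * Writing any value invokes ipv4_sysctl_rtcache_flush(), which calls
 * rt_cache_flush() and bumps the namespace's fnhe generation; the same
 * effect is available from the shell via "sysctl -w net.ipv4.route.flush=1".
 */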