/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *					clamper.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov		:	Removed TOS from hash calculations
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/workqueue.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
#include <net/atmclip.h>

#define RT_FL_TOS(oldflp4) \
	((u32)(oldflp4->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))

#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;
static int rt_chain_length_max __read_mostly	= 20;

/*
 *	Interface to generic destination cache.
 */
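/*
 * The dst_ops table below is the hook through which the generic
 * destination-cache layer calls back into IPv4 routing.  For instance, when
 * a route carries no explicit RTAX_MTU metric, the generic helpers fall back
 * to ->default_mtu(), which for IPv4 is ipv4_default_mtu() later in this
 * file.  Roughly (simplified, illustrative sketch only, not the exact
 * generic code):
 */
#if 0
static inline unsigned int example_route_mtu(const struct dst_entry *dst)
{
	u32 mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? mtu : dst->ops->default_mtu(dst);
}
#endif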
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int	 ipv4_default_mtu(const struct dst_entry *dst);
static void		 ipv4_dst_destroy(struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(struct dst_ops *ops);

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
}

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer;
	u32 *p = NULL;

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);

	peer = rt->peer;
	if (peer) {
		u32 *old_p = __DST_METRICS_PTR(old);
		unsigned long prev, new;

		p = peer->metrics;
		if (inet_metrics_new(peer))
			memcpy(p, old_p, sizeof(u32) * RTAX_MAX);

		new = (unsigned long) p;
		prev = cmpxchg(&dst->_metrics, old, new);

		if (prev != old) {
			p = __DST_METRICS_PTR(prev);
			if (prev & DST_METRICS_READ_ONLY)
				p = NULL;
		} else {
			if (rt->fi) {
				fib_info_put(rt->fi);
				rt->fi = NULL;
			}
		}
	}
	return p;
}

static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		cpu_to_be16(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.default_mtu =		ipv4_default_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.local_out =		__ip_local_out,
};

#define ECN_OR_COST(class)	TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};


/*
 * Route cache.
 */

/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
 */

struct rt_hash_bucket {
	struct rtable __rcu	*chain;
};

#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of
 * spinlocks.  The size of this table is a power of two and depends on the
 * number of CPUs.
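 *
 * For illustration (simplified sketch): a writer touching bucket "hash"
 * takes the corresponding lock from this table via rt_hash_lock_addr(),
 * defined below,
 *
 *	spin_lock_bh(rt_hash_lock_addr(hash));
 *	... unlink or insert entries with rcu_assign_pointer() ...
 *	spin_unlock_bh(rt_hash_lock_addr(hash));
 *
 * while readers only take rcu_read_lock_bh() and never touch these locks,
 * as described in the locking scheme above.
 *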
249 * (on lockdep we have a quite big spinlock_t, so keep the size down there) 250 */ 251 #ifdef CONFIG_LOCKDEP 252 # define RT_HASH_LOCK_SZ 256 253 #else 254 # if NR_CPUS >= 32 255 # define RT_HASH_LOCK_SZ 4096 256 # elif NR_CPUS >= 16 257 # define RT_HASH_LOCK_SZ 2048 258 # elif NR_CPUS >= 8 259 # define RT_HASH_LOCK_SZ 1024 260 # elif NR_CPUS >= 4 261 # define RT_HASH_LOCK_SZ 512 262 # else 263 # define RT_HASH_LOCK_SZ 256 264 # endif 265 #endif 266 267 static spinlock_t *rt_hash_locks; 268 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)] 269 270 static __init void rt_hash_lock_init(void) 271 { 272 int i; 273 274 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, 275 GFP_KERNEL); 276 if (!rt_hash_locks) 277 panic("IP: failed to allocate rt_hash_locks\n"); 278 279 for (i = 0; i < RT_HASH_LOCK_SZ; i++) 280 spin_lock_init(&rt_hash_locks[i]); 281 } 282 #else 283 # define rt_hash_lock_addr(slot) NULL 284 285 static inline void rt_hash_lock_init(void) 286 { 287 } 288 #endif 289 290 static struct rt_hash_bucket *rt_hash_table __read_mostly; 291 static unsigned rt_hash_mask __read_mostly; 292 static unsigned int rt_hash_log __read_mostly; 293 294 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); 295 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field) 296 297 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx, 298 int genid) 299 { 300 return jhash_3words((__force u32)daddr, (__force u32)saddr, 301 idx, genid) 302 & rt_hash_mask; 303 } 304 305 static inline int rt_genid(struct net *net) 306 { 307 return atomic_read(&net->ipv4.rt_genid); 308 } 309 310 #ifdef CONFIG_PROC_FS 311 struct rt_cache_iter_state { 312 struct seq_net_private p; 313 int bucket; 314 int genid; 315 }; 316 317 static struct rtable *rt_cache_get_first(struct seq_file *seq) 318 { 319 struct rt_cache_iter_state *st = seq->private; 320 struct rtable *r = NULL; 321 322 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) { 323 if (!rcu_dereference_raw(rt_hash_table[st->bucket].chain)) 324 continue; 325 rcu_read_lock_bh(); 326 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain); 327 while (r) { 328 if (dev_net(r->dst.dev) == seq_file_net(seq) && 329 r->rt_genid == st->genid) 330 return r; 331 r = rcu_dereference_bh(r->dst.rt_next); 332 } 333 rcu_read_unlock_bh(); 334 } 335 return r; 336 } 337 338 static struct rtable *__rt_cache_get_next(struct seq_file *seq, 339 struct rtable *r) 340 { 341 struct rt_cache_iter_state *st = seq->private; 342 343 r = rcu_dereference_bh(r->dst.rt_next); 344 while (!r) { 345 rcu_read_unlock_bh(); 346 do { 347 if (--st->bucket < 0) 348 return NULL; 349 } while (!rcu_dereference_raw(rt_hash_table[st->bucket].chain)); 350 rcu_read_lock_bh(); 351 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain); 352 } 353 return r; 354 } 355 356 static struct rtable *rt_cache_get_next(struct seq_file *seq, 357 struct rtable *r) 358 { 359 struct rt_cache_iter_state *st = seq->private; 360 while ((r = __rt_cache_get_next(seq, r)) != NULL) { 361 if (dev_net(r->dst.dev) != seq_file_net(seq)) 362 continue; 363 if (r->rt_genid == st->genid) 364 break; 365 } 366 return r; 367 } 368 369 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos) 370 { 371 struct rtable *r = rt_cache_get_first(seq); 372 373 if (r) 374 while (pos && (r = rt_cache_get_next(seq, r))) 375 --pos; 376 return pos ? 
NULL : r; 377 } 378 379 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos) 380 { 381 struct rt_cache_iter_state *st = seq->private; 382 if (*pos) 383 return rt_cache_get_idx(seq, *pos - 1); 384 st->genid = rt_genid(seq_file_net(seq)); 385 return SEQ_START_TOKEN; 386 } 387 388 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos) 389 { 390 struct rtable *r; 391 392 if (v == SEQ_START_TOKEN) 393 r = rt_cache_get_first(seq); 394 else 395 r = rt_cache_get_next(seq, v); 396 ++*pos; 397 return r; 398 } 399 400 static void rt_cache_seq_stop(struct seq_file *seq, void *v) 401 { 402 if (v && v != SEQ_START_TOKEN) 403 rcu_read_unlock_bh(); 404 } 405 406 static int rt_cache_seq_show(struct seq_file *seq, void *v) 407 { 408 if (v == SEQ_START_TOKEN) 409 seq_printf(seq, "%-127s\n", 410 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t" 411 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t" 412 "HHUptod\tSpecDst"); 413 else { 414 struct rtable *r = v; 415 int len; 416 417 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t" 418 "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n", 419 r->dst.dev ? r->dst.dev->name : "*", 420 (__force u32)r->rt_dst, 421 (__force u32)r->rt_gateway, 422 r->rt_flags, atomic_read(&r->dst.__refcnt), 423 r->dst.__use, 0, (__force u32)r->rt_src, 424 dst_metric_advmss(&r->dst) + 40, 425 dst_metric(&r->dst, RTAX_WINDOW), 426 (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) + 427 dst_metric(&r->dst, RTAX_RTTVAR)), 428 r->rt_key_tos, 429 r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1, 430 r->dst.hh ? (r->dst.hh->hh_output == 431 dev_queue_xmit) : 0, 432 r->rt_spec_dst, &len); 433 434 seq_printf(seq, "%*s\n", 127 - len, ""); 435 } 436 return 0; 437 } 438 439 static const struct seq_operations rt_cache_seq_ops = { 440 .start = rt_cache_seq_start, 441 .next = rt_cache_seq_next, 442 .stop = rt_cache_seq_stop, 443 .show = rt_cache_seq_show, 444 }; 445 446 static int rt_cache_seq_open(struct inode *inode, struct file *file) 447 { 448 return seq_open_net(inode, file, &rt_cache_seq_ops, 449 sizeof(struct rt_cache_iter_state)); 450 } 451 452 static const struct file_operations rt_cache_seq_fops = { 453 .owner = THIS_MODULE, 454 .open = rt_cache_seq_open, 455 .read = seq_read, 456 .llseek = seq_lseek, 457 .release = seq_release_net, 458 }; 459 460 461 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos) 462 { 463 int cpu; 464 465 if (*pos == 0) 466 return SEQ_START_TOKEN; 467 468 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) { 469 if (!cpu_possible(cpu)) 470 continue; 471 *pos = cpu+1; 472 return &per_cpu(rt_cache_stat, cpu); 473 } 474 return NULL; 475 } 476 477 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) 478 { 479 int cpu; 480 481 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) { 482 if (!cpu_possible(cpu)) 483 continue; 484 *pos = cpu+1; 485 return &per_cpu(rt_cache_stat, cpu); 486 } 487 return NULL; 488 489 } 490 491 static void rt_cpu_seq_stop(struct seq_file *seq, void *v) 492 { 493 494 } 495 496 static int rt_cpu_seq_show(struct seq_file *seq, void *v) 497 { 498 struct rt_cache_stat *st = v; 499 500 if (v == SEQ_START_TOKEN) { 501 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n"); 502 return 0; 503 } 504 505 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x " 506 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n", 507 
dst_entries_get_slow(&ipv4_dst_ops), 508 st->in_hit, 509 st->in_slow_tot, 510 st->in_slow_mc, 511 st->in_no_route, 512 st->in_brd, 513 st->in_martian_dst, 514 st->in_martian_src, 515 516 st->out_hit, 517 st->out_slow_tot, 518 st->out_slow_mc, 519 520 st->gc_total, 521 st->gc_ignored, 522 st->gc_goal_miss, 523 st->gc_dst_overflow, 524 st->in_hlist_search, 525 st->out_hlist_search 526 ); 527 return 0; 528 } 529 530 static const struct seq_operations rt_cpu_seq_ops = { 531 .start = rt_cpu_seq_start, 532 .next = rt_cpu_seq_next, 533 .stop = rt_cpu_seq_stop, 534 .show = rt_cpu_seq_show, 535 }; 536 537 538 static int rt_cpu_seq_open(struct inode *inode, struct file *file) 539 { 540 return seq_open(file, &rt_cpu_seq_ops); 541 } 542 543 static const struct file_operations rt_cpu_seq_fops = { 544 .owner = THIS_MODULE, 545 .open = rt_cpu_seq_open, 546 .read = seq_read, 547 .llseek = seq_lseek, 548 .release = seq_release, 549 }; 550 551 #ifdef CONFIG_IP_ROUTE_CLASSID 552 static int rt_acct_proc_show(struct seq_file *m, void *v) 553 { 554 struct ip_rt_acct *dst, *src; 555 unsigned int i, j; 556 557 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL); 558 if (!dst) 559 return -ENOMEM; 560 561 for_each_possible_cpu(i) { 562 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i); 563 for (j = 0; j < 256; j++) { 564 dst[j].o_bytes += src[j].o_bytes; 565 dst[j].o_packets += src[j].o_packets; 566 dst[j].i_bytes += src[j].i_bytes; 567 dst[j].i_packets += src[j].i_packets; 568 } 569 } 570 571 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct)); 572 kfree(dst); 573 return 0; 574 } 575 576 static int rt_acct_proc_open(struct inode *inode, struct file *file) 577 { 578 return single_open(file, rt_acct_proc_show, NULL); 579 } 580 581 static const struct file_operations rt_acct_proc_fops = { 582 .owner = THIS_MODULE, 583 .open = rt_acct_proc_open, 584 .read = seq_read, 585 .llseek = seq_lseek, 586 .release = single_release, 587 }; 588 #endif 589 590 static int __net_init ip_rt_do_proc_init(struct net *net) 591 { 592 struct proc_dir_entry *pde; 593 594 pde = proc_net_fops_create(net, "rt_cache", S_IRUGO, 595 &rt_cache_seq_fops); 596 if (!pde) 597 goto err1; 598 599 pde = proc_create("rt_cache", S_IRUGO, 600 net->proc_net_stat, &rt_cpu_seq_fops); 601 if (!pde) 602 goto err2; 603 604 #ifdef CONFIG_IP_ROUTE_CLASSID 605 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops); 606 if (!pde) 607 goto err3; 608 #endif 609 return 0; 610 611 #ifdef CONFIG_IP_ROUTE_CLASSID 612 err3: 613 remove_proc_entry("rt_cache", net->proc_net_stat); 614 #endif 615 err2: 616 remove_proc_entry("rt_cache", net->proc_net); 617 err1: 618 return -ENOMEM; 619 } 620 621 static void __net_exit ip_rt_do_proc_exit(struct net *net) 622 { 623 remove_proc_entry("rt_cache", net->proc_net_stat); 624 remove_proc_entry("rt_cache", net->proc_net); 625 #ifdef CONFIG_IP_ROUTE_CLASSID 626 remove_proc_entry("rt_acct", net->proc_net); 627 #endif 628 } 629 630 static struct pernet_operations ip_rt_proc_ops __net_initdata = { 631 .init = ip_rt_do_proc_init, 632 .exit = ip_rt_do_proc_exit, 633 }; 634 635 static int __init ip_rt_proc_init(void) 636 { 637 return register_pernet_subsys(&ip_rt_proc_ops); 638 } 639 640 #else 641 static inline int ip_rt_proc_init(void) 642 { 643 return 0; 644 } 645 #endif /* CONFIG_PROC_FS */ 646 647 static inline void rt_free(struct rtable *rt) 648 { 649 call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free); 650 } 651 652 static inline void rt_drop(struct rtable *rt) 653 { 654 ip_rt_put(rt); 655 call_rcu_bh(&rt->dst.rcu_head, 
			dst_rcu_free);
}

static inline int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rt_is_input_route(rth) && rth->dst.rt_next;
}

static inline int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		(rth->peer && rth->peer->pmtu_expires);
}

static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
	unsigned long age;
	int ret = 0;

	if (atomic_read(&rth->dst.__refcnt))
		goto out;

	age = jiffies - rth->dst.lastuse;
	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
	    (age <= tmo2 && rt_valuable(rth)))
		goto out;
	ret = 1;
out:	return ret;
}

/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
	u32 score = jiffies - rt->dst.lastuse;

	score = ~score & ~(3<<30);

	if (rt_valuable(rt))
		score |= (1<<31);

	if (rt_is_output_route(rt) ||
	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
		score |= (1<<30);

	return score;
}

static inline bool rt_caching(const struct net *net)
{
	return net->ipv4.current_rt_cache_rebuild_count <=
		net->ipv4.sysctl_rt_cache_rebuild_count;
}

static inline bool compare_hash_inputs(const struct rtable *rt1,
				       const struct rtable *rt2)
{
	return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
		(rt1->rt_iif ^ rt2->rt_iif)) == 0);
}

static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
{
	return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
		(rt1->rt_mark ^ rt2->rt_mark) |
		(rt1->rt_key_tos ^ rt2->rt_key_tos) |
		(rt1->rt_oif ^ rt2->rt_oif) |
		(rt1->rt_iif ^ rt2->rt_iif)) == 0;
}

static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
{
	return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
}

static inline int rt_is_expired(struct rtable *rth)
{
	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
}
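/*
 * The compare_keys()/compare_hash_inputs() helpers above fold all field
 * comparisons into a single branchless test: XOR each pair of fields and OR
 * the results together, which is zero only if every field matches.  A
 * stand-alone sketch of the idiom (illustrative only; example_key is a
 * made-up structure, not part of this file):
 */
#if 0
struct example_key {
	u32 dst, src, mark;
};

static inline int example_keys_equal(const struct example_key *a,
				     const struct example_key *b)
{
	/* Any differing bit survives the OR and makes the result non-zero. */
	return ((a->dst ^ b->dst) |
		(a->src ^ b->src) |
		(a->mark ^ b->mark)) == 0;
}
#endif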
/*
 * Perform a full scan of hash table and free all entries.
 * Can be called by a softirq or a process.
 * In the latter case, we want to reschedule if necessary.
 */
static void rt_do_flush(struct net *net, int process_context)
{
	unsigned int i;
	struct rtable *rth, *next;

	for (i = 0; i <= rt_hash_mask; i++) {
		struct rtable __rcu **pprev;
		struct rtable *list;

		if (process_context && need_resched())
			cond_resched();
		rth = rcu_dereference_raw(rt_hash_table[i].chain);
		if (!rth)
			continue;

		spin_lock_bh(rt_hash_lock_addr(i));

		list = NULL;
		pprev = &rt_hash_table[i].chain;
		rth = rcu_dereference_protected(*pprev,
			lockdep_is_held(rt_hash_lock_addr(i)));

		while (rth) {
			next = rcu_dereference_protected(rth->dst.rt_next,
				lockdep_is_held(rt_hash_lock_addr(i)));

			if (!net ||
			    net_eq(dev_net(rth->dst.dev), net)) {
				rcu_assign_pointer(*pprev, next);
				rcu_assign_pointer(rth->dst.rt_next, list);
				list = rth;
			} else {
				pprev = &rth->dst.rt_next;
			}
			rth = next;
		}

		spin_unlock_bh(rt_hash_lock_addr(i));

		for (; list; list = next) {
			next = rcu_dereference_protected(list->dst.rt_next, 1);
			rt_free(list);
		}
	}
}

/*
 * While freeing expired entries, we compute average chain length
 * and standard deviation, using fixed-point arithmetic.
 * This is to get an estimation of rt_chain_length_max:
 *   rt_chain_length_max = max(elasticity, AVG + 4*SD)
 * We use 3 bits for the fractional part, and 29 (or 61) for magnitude.
 */

#define FRACT_BITS 3
#define ONE (1UL << FRACT_BITS)

/*
 * Given a hash chain and an item in this hash chain,
 * find if a previous entry has the same hash_inputs
 * (but differs on tos, mark or oif).
 * Returns 0 if an alias is found.
 * Returns ONE if rth has no alias before itself.
 */
static int has_noalias(const struct rtable *head, const struct rtable *rth)
{
	const struct rtable *aux = head;

	while (aux != rth) {
		if (compare_hash_inputs(aux, rth))
			return 0;
		aux = rcu_dereference_protected(aux->dst.rt_next, 1);
	}
	return ONE;
}

/*
 * Perturbation of rt_genid by a small quantity [1..256].
 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
 * many times (2^24) without giving a recent rt_genid.
 * The Jenkins hash is strong enough that little changes of rt_genid are OK.
 */
static void rt_cache_invalidate(struct net *net)
{
	unsigned char shuffle;

	get_random_bytes(&shuffle, sizeof(shuffle));
	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
}

/*
 * delay < 0  : invalidate cache (fast : entries will be deleted later)
 * delay >= 0 : invalidate & flush cache (can be long)
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);
	if (delay >= 0)
		rt_do_flush(net, !in_softirq());
}

/* Flush previously cache-invalidated entries from the cache */
void rt_cache_flush_batch(struct net *net)
{
	rt_do_flush(net, !in_softirq());
}

static void rt_emergency_hash_rebuild(struct net *net)
{
	if (net_ratelimit())
		printk(KERN_WARNING "Route hash chain too long!\n");
	rt_cache_invalidate(net);
}
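/*
 * Invalidation is cheap because it is lazy: rt_cache_invalidate() above only
 * bumps the per-namespace generation id, so entries stamped with an older
 * rt_genid fail the rt_is_expired() check on lookup and are reaped later by
 * rt_do_flush() or the garbage collector.  A minimal illustration of that
 * staleness test (not compiled, sketch only):
 */
#if 0
static bool example_rt_entry_is_stale(const struct rtable *rth, struct net *net)
{
	/* Same comparison rt_is_expired() performs. */
	return rth->rt_genid != rt_genid(net);
}
#endif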
/*
 * Short description of GC goals.
 *
 * We want to build an algorithm which keeps the routing cache at some
 * equilibrium point, where the number of aged-off entries is kept
 * approximately equal to the number of newly generated ones.
 *
 * The current expiration strength is the variable "expire".
 * We try to adjust it dynamically, so that when the network is idle
 * expire is large enough to keep enough warm entries, and when load
 * increases it is reduced to limit the cache size.
 */

static int rt_garbage_collect(struct dst_ops *ops)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long now = jiffies;
	int goal;
	int entries = dst_entries_get_fast(&ipv4_dst_ops);

	/*
	 * Garbage collection is pretty expensive,
	 * do not make it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < ip_rt_gc_min_interval &&
	    entries < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	entries = dst_entries_get_slow(&ipv4_dst_ops);
	/* Calculate number of entries, which we want to expire now. */
	goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = entries - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
			goal = entries - equilibrium;
		}
	} else {
		/* We are in dangerous area. Try to reduce cache really
		 * aggressively.
		 */
		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
		equilibrium = entries - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
				if (!rt_is_expired(rth) &&
					!rt_may_expire(rth, tmo, expire)) {
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					continue;
				}
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				goal--;
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved. We stop the process if:
		 *
		 * - expire has been reduced to zero; otherwise, expire is halved.
		 * - the table is not full.
		 * - we are called from interrupt.
		 * - the jiffies check is just a fallback/debug loop breaker.
		 *   We will not spin here for a long time in any case.
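		 *
		 *   (Since expire is halved after every unsuccessful pass and
		 *    never exceeds ip_rt_gc_timeout, only about
		 *    log2(ip_rt_gc_timeout) extra passes are possible before it
		 *    reaches zero and the loop gives up.)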
964 */ 965 966 RT_CACHE_STAT_INC(gc_goal_miss); 967 968 if (expire == 0) 969 break; 970 971 expire >>= 1; 972 973 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size) 974 goto out; 975 } while (!in_softirq() && time_before_eq(jiffies, now)); 976 977 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size) 978 goto out; 979 if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size) 980 goto out; 981 if (net_ratelimit()) 982 printk(KERN_WARNING "dst cache overflow\n"); 983 RT_CACHE_STAT_INC(gc_dst_overflow); 984 return 1; 985 986 work_done: 987 expire += ip_rt_gc_min_interval; 988 if (expire > ip_rt_gc_timeout || 989 dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh || 990 dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh) 991 expire = ip_rt_gc_timeout; 992 out: return 0; 993 } 994 995 /* 996 * Returns number of entries in a hash chain that have different hash_inputs 997 */ 998 static int slow_chain_length(const struct rtable *head) 999 { 1000 int length = 0; 1001 const struct rtable *rth = head; 1002 1003 while (rth) { 1004 length += has_noalias(head, rth); 1005 rth = rcu_dereference_protected(rth->dst.rt_next, 1); 1006 } 1007 return length >> FRACT_BITS; 1008 } 1009 1010 static int rt_bind_neighbour(struct rtable *rt) 1011 { 1012 static const __be32 inaddr_any = 0; 1013 struct net_device *dev = rt->dst.dev; 1014 struct neigh_table *tbl = &arp_tbl; 1015 const __be32 *nexthop; 1016 struct neighbour *n; 1017 1018 #if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE) 1019 if (dev->type == ARPHRD_ATM) 1020 tbl = clip_tbl_hook; 1021 #endif 1022 nexthop = &rt->rt_gateway; 1023 if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) 1024 nexthop = &inaddr_any; 1025 n = ipv4_neigh_lookup(tbl, dev, nexthop); 1026 if (IS_ERR(n)) 1027 return PTR_ERR(n); 1028 rt->dst.neighbour = n; 1029 1030 return 0; 1031 } 1032 1033 static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt, 1034 struct sk_buff *skb, int ifindex) 1035 { 1036 struct rtable *rth, *cand; 1037 struct rtable __rcu **rthp, **candp; 1038 unsigned long now; 1039 u32 min_score; 1040 int chain_length; 1041 int attempts = !in_softirq(); 1042 1043 restart: 1044 chain_length = 0; 1045 min_score = ~(u32)0; 1046 cand = NULL; 1047 candp = NULL; 1048 now = jiffies; 1049 1050 if (!rt_caching(dev_net(rt->dst.dev))) { 1051 /* 1052 * If we're not caching, just tell the caller we 1053 * were successful and don't touch the route. The 1054 * caller hold the sole reference to the cache entry, and 1055 * it will be released when the caller is done with it. 1056 * If we drop it here, the callers have no way to resolve routes 1057 * when we're not caching. Instead, just point *rp at rt, so 1058 * the caller gets a single use out of the route 1059 * Note that we do rt_free on this new route entry, so that 1060 * once its refcount hits zero, we are still able to reap it 1061 * (Thanks Alexey) 1062 * Note: To avoid expensive rcu stuff for this uncached dst, 1063 * we set DST_NOCACHE so that dst_release() can free dst without 1064 * waiting a grace period. 
1065 */ 1066 1067 rt->dst.flags |= DST_NOCACHE; 1068 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) { 1069 int err = rt_bind_neighbour(rt); 1070 if (err) { 1071 if (net_ratelimit()) 1072 printk(KERN_WARNING 1073 "Neighbour table failure & not caching routes.\n"); 1074 ip_rt_put(rt); 1075 return ERR_PTR(err); 1076 } 1077 } 1078 1079 goto skip_hashing; 1080 } 1081 1082 rthp = &rt_hash_table[hash].chain; 1083 1084 spin_lock_bh(rt_hash_lock_addr(hash)); 1085 while ((rth = rcu_dereference_protected(*rthp, 1086 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) { 1087 if (rt_is_expired(rth)) { 1088 *rthp = rth->dst.rt_next; 1089 rt_free(rth); 1090 continue; 1091 } 1092 if (compare_keys(rth, rt) && compare_netns(rth, rt)) { 1093 /* Put it first */ 1094 *rthp = rth->dst.rt_next; 1095 /* 1096 * Since lookup is lockfree, the deletion 1097 * must be visible to another weakly ordered CPU before 1098 * the insertion at the start of the hash chain. 1099 */ 1100 rcu_assign_pointer(rth->dst.rt_next, 1101 rt_hash_table[hash].chain); 1102 /* 1103 * Since lookup is lockfree, the update writes 1104 * must be ordered for consistency on SMP. 1105 */ 1106 rcu_assign_pointer(rt_hash_table[hash].chain, rth); 1107 1108 dst_use(&rth->dst, now); 1109 spin_unlock_bh(rt_hash_lock_addr(hash)); 1110 1111 rt_drop(rt); 1112 if (skb) 1113 skb_dst_set(skb, &rth->dst); 1114 return rth; 1115 } 1116 1117 if (!atomic_read(&rth->dst.__refcnt)) { 1118 u32 score = rt_score(rth); 1119 1120 if (score <= min_score) { 1121 cand = rth; 1122 candp = rthp; 1123 min_score = score; 1124 } 1125 } 1126 1127 chain_length++; 1128 1129 rthp = &rth->dst.rt_next; 1130 } 1131 1132 if (cand) { 1133 /* ip_rt_gc_elasticity used to be average length of chain 1134 * length, when exceeded gc becomes really aggressive. 1135 * 1136 * The second limit is less certain. At the moment it allows 1137 * only 2 entries per bucket. We will see. 1138 */ 1139 if (chain_length > ip_rt_gc_elasticity) { 1140 *candp = cand->dst.rt_next; 1141 rt_free(cand); 1142 } 1143 } else { 1144 if (chain_length > rt_chain_length_max && 1145 slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) { 1146 struct net *net = dev_net(rt->dst.dev); 1147 int num = ++net->ipv4.current_rt_cache_rebuild_count; 1148 if (!rt_caching(net)) { 1149 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n", 1150 rt->dst.dev->name, num); 1151 } 1152 rt_emergency_hash_rebuild(net); 1153 spin_unlock_bh(rt_hash_lock_addr(hash)); 1154 1155 hash = rt_hash(rt->rt_key_dst, rt->rt_key_src, 1156 ifindex, rt_genid(net)); 1157 goto restart; 1158 } 1159 } 1160 1161 /* Try to bind route to arp only if it is output 1162 route or unicast forwarding path. 1163 */ 1164 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) { 1165 int err = rt_bind_neighbour(rt); 1166 if (err) { 1167 spin_unlock_bh(rt_hash_lock_addr(hash)); 1168 1169 if (err != -ENOBUFS) { 1170 rt_drop(rt); 1171 return ERR_PTR(err); 1172 } 1173 1174 /* Neighbour tables are full and nothing 1175 can be released. Try to shrink route cache, 1176 it is most likely it holds some neighbour records. 
1177 */ 1178 if (attempts-- > 0) { 1179 int saved_elasticity = ip_rt_gc_elasticity; 1180 int saved_int = ip_rt_gc_min_interval; 1181 ip_rt_gc_elasticity = 1; 1182 ip_rt_gc_min_interval = 0; 1183 rt_garbage_collect(&ipv4_dst_ops); 1184 ip_rt_gc_min_interval = saved_int; 1185 ip_rt_gc_elasticity = saved_elasticity; 1186 goto restart; 1187 } 1188 1189 if (net_ratelimit()) 1190 printk(KERN_WARNING "ipv4: Neighbour table overflow.\n"); 1191 rt_drop(rt); 1192 return ERR_PTR(-ENOBUFS); 1193 } 1194 } 1195 1196 rt->dst.rt_next = rt_hash_table[hash].chain; 1197 1198 /* 1199 * Since lookup is lockfree, we must make sure 1200 * previous writes to rt are committed to memory 1201 * before making rt visible to other CPUS. 1202 */ 1203 rcu_assign_pointer(rt_hash_table[hash].chain, rt); 1204 1205 spin_unlock_bh(rt_hash_lock_addr(hash)); 1206 1207 skip_hashing: 1208 if (skb) 1209 skb_dst_set(skb, &rt->dst); 1210 return rt; 1211 } 1212 1213 static atomic_t __rt_peer_genid = ATOMIC_INIT(0); 1214 1215 static u32 rt_peer_genid(void) 1216 { 1217 return atomic_read(&__rt_peer_genid); 1218 } 1219 1220 void rt_bind_peer(struct rtable *rt, __be32 daddr, int create) 1221 { 1222 struct inet_peer *peer; 1223 1224 peer = inet_getpeer_v4(daddr, create); 1225 1226 if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL) 1227 inet_putpeer(peer); 1228 else 1229 rt->rt_peer_genid = rt_peer_genid(); 1230 } 1231 1232 /* 1233 * Peer allocation may fail only in serious out-of-memory conditions. However 1234 * we still can generate some output. 1235 * Random ID selection looks a bit dangerous because we have no chances to 1236 * select ID being unique in a reasonable period of time. 1237 * But broken packet identifier may be better than no packet at all. 1238 */ 1239 static void ip_select_fb_ident(struct iphdr *iph) 1240 { 1241 static DEFINE_SPINLOCK(ip_fb_id_lock); 1242 static u32 ip_fallback_id; 1243 u32 salt; 1244 1245 spin_lock_bh(&ip_fb_id_lock); 1246 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr); 1247 iph->id = htons(salt & 0xFFFF); 1248 ip_fallback_id = salt; 1249 spin_unlock_bh(&ip_fb_id_lock); 1250 } 1251 1252 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more) 1253 { 1254 struct rtable *rt = (struct rtable *) dst; 1255 1256 if (rt) { 1257 if (rt->peer == NULL) 1258 rt_bind_peer(rt, rt->rt_dst, 1); 1259 1260 /* If peer is attached to destination, it is never detached, 1261 so that we need not to grab a lock to dereference it. 
1262 */ 1263 if (rt->peer) { 1264 iph->id = htons(inet_getid(rt->peer, more)); 1265 return; 1266 } 1267 } else 1268 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n", 1269 __builtin_return_address(0)); 1270 1271 ip_select_fb_ident(iph); 1272 } 1273 EXPORT_SYMBOL(__ip_select_ident); 1274 1275 static void rt_del(unsigned hash, struct rtable *rt) 1276 { 1277 struct rtable __rcu **rthp; 1278 struct rtable *aux; 1279 1280 rthp = &rt_hash_table[hash].chain; 1281 spin_lock_bh(rt_hash_lock_addr(hash)); 1282 ip_rt_put(rt); 1283 while ((aux = rcu_dereference_protected(*rthp, 1284 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) { 1285 if (aux == rt || rt_is_expired(aux)) { 1286 *rthp = aux->dst.rt_next; 1287 rt_free(aux); 1288 continue; 1289 } 1290 rthp = &aux->dst.rt_next; 1291 } 1292 spin_unlock_bh(rt_hash_lock_addr(hash)); 1293 } 1294 1295 /* called in rcu_read_lock() section */ 1296 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, 1297 __be32 saddr, struct net_device *dev) 1298 { 1299 struct in_device *in_dev = __in_dev_get_rcu(dev); 1300 struct inet_peer *peer; 1301 struct net *net; 1302 1303 if (!in_dev) 1304 return; 1305 1306 net = dev_net(dev); 1307 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) || 1308 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) || 1309 ipv4_is_zeronet(new_gw)) 1310 goto reject_redirect; 1311 1312 if (!IN_DEV_SHARED_MEDIA(in_dev)) { 1313 if (!inet_addr_onlink(in_dev, new_gw, old_gw)) 1314 goto reject_redirect; 1315 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev)) 1316 goto reject_redirect; 1317 } else { 1318 if (inet_addr_type(net, new_gw) != RTN_UNICAST) 1319 goto reject_redirect; 1320 } 1321 1322 peer = inet_getpeer_v4(daddr, 1); 1323 if (peer) { 1324 peer->redirect_learned.a4 = new_gw; 1325 1326 inet_putpeer(peer); 1327 1328 atomic_inc(&__rt_peer_genid); 1329 } 1330 return; 1331 1332 reject_redirect: 1333 #ifdef CONFIG_IP_ROUTE_VERBOSE 1334 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) 1335 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n" 1336 " Advised path = %pI4 -> %pI4\n", 1337 &old_gw, dev->name, &new_gw, 1338 &saddr, &daddr); 1339 #endif 1340 ; 1341 } 1342 1343 static bool peer_pmtu_expired(struct inet_peer *peer) 1344 { 1345 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires); 1346 1347 return orig && 1348 time_after_eq(jiffies, orig) && 1349 cmpxchg(&peer->pmtu_expires, orig, 0) == orig; 1350 } 1351 1352 static bool peer_pmtu_cleaned(struct inet_peer *peer) 1353 { 1354 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires); 1355 1356 return orig && 1357 cmpxchg(&peer->pmtu_expires, orig, 0) == orig; 1358 } 1359 1360 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) 1361 { 1362 struct rtable *rt = (struct rtable *)dst; 1363 struct dst_entry *ret = dst; 1364 1365 if (rt) { 1366 if (dst->obsolete > 0) { 1367 ip_rt_put(rt); 1368 ret = NULL; 1369 } else if (rt->rt_flags & RTCF_REDIRECTED) { 1370 unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src, 1371 rt->rt_oif, 1372 rt_genid(dev_net(dst->dev))); 1373 rt_del(hash, rt); 1374 ret = NULL; 1375 } else if (rt->peer && peer_pmtu_expired(rt->peer)) { 1376 dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig); 1377 } 1378 } 1379 return ret; 1380 } 1381 1382 /* 1383 * Algorithm: 1384 * 1. The first ip_rt_redirect_number redirects are sent 1385 * with exponential backoff, then we stop sending them at all, 1386 * assuming that the host ignores our redirects. 1387 * 2. 
If we did not see packets requiring redirects 1388 * during ip_rt_redirect_silence, we assume that the host 1389 * forgot redirected route and start to send redirects again. 1390 * 1391 * This algorithm is much cheaper and more intelligent than dumb load limiting 1392 * in icmp.c. 1393 * 1394 * NOTE. Do not forget to inhibit load limiting for redirects (redundant) 1395 * and "frag. need" (breaks PMTU discovery) in icmp.c. 1396 */ 1397 1398 void ip_rt_send_redirect(struct sk_buff *skb) 1399 { 1400 struct rtable *rt = skb_rtable(skb); 1401 struct in_device *in_dev; 1402 struct inet_peer *peer; 1403 int log_martians; 1404 1405 rcu_read_lock(); 1406 in_dev = __in_dev_get_rcu(rt->dst.dev); 1407 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) { 1408 rcu_read_unlock(); 1409 return; 1410 } 1411 log_martians = IN_DEV_LOG_MARTIANS(in_dev); 1412 rcu_read_unlock(); 1413 1414 if (!rt->peer) 1415 rt_bind_peer(rt, rt->rt_dst, 1); 1416 peer = rt->peer; 1417 if (!peer) { 1418 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway); 1419 return; 1420 } 1421 1422 /* No redirected packets during ip_rt_redirect_silence; 1423 * reset the algorithm. 1424 */ 1425 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) 1426 peer->rate_tokens = 0; 1427 1428 /* Too many ignored redirects; do not send anything 1429 * set dst.rate_last to the last seen redirected packet. 1430 */ 1431 if (peer->rate_tokens >= ip_rt_redirect_number) { 1432 peer->rate_last = jiffies; 1433 return; 1434 } 1435 1436 /* Check for load limit; set rate_last to the latest sent 1437 * redirect. 1438 */ 1439 if (peer->rate_tokens == 0 || 1440 time_after(jiffies, 1441 (peer->rate_last + 1442 (ip_rt_redirect_load << peer->rate_tokens)))) { 1443 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway); 1444 peer->rate_last = jiffies; 1445 ++peer->rate_tokens; 1446 #ifdef CONFIG_IP_ROUTE_VERBOSE 1447 if (log_martians && 1448 peer->rate_tokens == ip_rt_redirect_number && 1449 net_ratelimit()) 1450 printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n", 1451 &ip_hdr(skb)->saddr, rt->rt_iif, 1452 &rt->rt_dst, &rt->rt_gateway); 1453 #endif 1454 } 1455 } 1456 1457 static int ip_error(struct sk_buff *skb) 1458 { 1459 struct rtable *rt = skb_rtable(skb); 1460 struct inet_peer *peer; 1461 unsigned long now; 1462 bool send; 1463 int code; 1464 1465 switch (rt->dst.error) { 1466 case EINVAL: 1467 default: 1468 goto out; 1469 case EHOSTUNREACH: 1470 code = ICMP_HOST_UNREACH; 1471 break; 1472 case ENETUNREACH: 1473 code = ICMP_NET_UNREACH; 1474 IP_INC_STATS_BH(dev_net(rt->dst.dev), 1475 IPSTATS_MIB_INNOROUTES); 1476 break; 1477 case EACCES: 1478 code = ICMP_PKT_FILTERED; 1479 break; 1480 } 1481 1482 if (!rt->peer) 1483 rt_bind_peer(rt, rt->rt_dst, 1); 1484 peer = rt->peer; 1485 1486 send = true; 1487 if (peer) { 1488 now = jiffies; 1489 peer->rate_tokens += now - peer->rate_last; 1490 if (peer->rate_tokens > ip_rt_error_burst) 1491 peer->rate_tokens = ip_rt_error_burst; 1492 peer->rate_last = now; 1493 if (peer->rate_tokens >= ip_rt_error_cost) 1494 peer->rate_tokens -= ip_rt_error_cost; 1495 else 1496 send = false; 1497 } 1498 if (send) 1499 icmp_send(skb, ICMP_DEST_UNREACH, code, 0); 1500 1501 out: kfree_skb(skb); 1502 return 0; 1503 } 1504 1505 /* 1506 * The last two values are not from the RFC but 1507 * are needed for AMPRnet AX.25 paths. 
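 *
 * guess_mtu() below walks this table to pick the next-lower plateau when the
 * ICMP Fragmentation Needed message carries no usable MTU (the RFC 1191
 * style search).  For example, an old_mtu of 1500 yields 1492, 576 yields
 * 296, and anything at or below 128 falls back to the minimum of 68.
 *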
1508 */ 1509 1510 static const unsigned short mtu_plateau[] = 1511 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 }; 1512 1513 static inline unsigned short guess_mtu(unsigned short old_mtu) 1514 { 1515 int i; 1516 1517 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++) 1518 if (old_mtu > mtu_plateau[i]) 1519 return mtu_plateau[i]; 1520 return 68; 1521 } 1522 1523 unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph, 1524 unsigned short new_mtu, 1525 struct net_device *dev) 1526 { 1527 unsigned short old_mtu = ntohs(iph->tot_len); 1528 unsigned short est_mtu = 0; 1529 struct inet_peer *peer; 1530 1531 peer = inet_getpeer_v4(iph->daddr, 1); 1532 if (peer) { 1533 unsigned short mtu = new_mtu; 1534 1535 if (new_mtu < 68 || new_mtu >= old_mtu) { 1536 /* BSD 4.2 derived systems incorrectly adjust 1537 * tot_len by the IP header length, and report 1538 * a zero MTU in the ICMP message. 1539 */ 1540 if (mtu == 0 && 1541 old_mtu >= 68 + (iph->ihl << 2)) 1542 old_mtu -= iph->ihl << 2; 1543 mtu = guess_mtu(old_mtu); 1544 } 1545 1546 if (mtu < ip_rt_min_pmtu) 1547 mtu = ip_rt_min_pmtu; 1548 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) { 1549 unsigned long pmtu_expires; 1550 1551 pmtu_expires = jiffies + ip_rt_mtu_expires; 1552 if (!pmtu_expires) 1553 pmtu_expires = 1UL; 1554 1555 est_mtu = mtu; 1556 peer->pmtu_learned = mtu; 1557 peer->pmtu_expires = pmtu_expires; 1558 } 1559 1560 inet_putpeer(peer); 1561 1562 atomic_inc(&__rt_peer_genid); 1563 } 1564 return est_mtu ? : new_mtu; 1565 } 1566 1567 static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer) 1568 { 1569 unsigned long expires = ACCESS_ONCE(peer->pmtu_expires); 1570 1571 if (!expires) 1572 return; 1573 if (time_before(jiffies, expires)) { 1574 u32 orig_dst_mtu = dst_mtu(dst); 1575 if (peer->pmtu_learned < orig_dst_mtu) { 1576 if (!peer->pmtu_orig) 1577 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU); 1578 dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned); 1579 } 1580 } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires) 1581 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig); 1582 } 1583 1584 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu) 1585 { 1586 struct rtable *rt = (struct rtable *) dst; 1587 struct inet_peer *peer; 1588 1589 dst_confirm(dst); 1590 1591 if (!rt->peer) 1592 rt_bind_peer(rt, rt->rt_dst, 1); 1593 peer = rt->peer; 1594 if (peer) { 1595 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires); 1596 1597 if (mtu < ip_rt_min_pmtu) 1598 mtu = ip_rt_min_pmtu; 1599 if (!pmtu_expires || mtu < peer->pmtu_learned) { 1600 1601 pmtu_expires = jiffies + ip_rt_mtu_expires; 1602 if (!pmtu_expires) 1603 pmtu_expires = 1UL; 1604 1605 peer->pmtu_learned = mtu; 1606 peer->pmtu_expires = pmtu_expires; 1607 1608 atomic_inc(&__rt_peer_genid); 1609 rt->rt_peer_genid = rt_peer_genid(); 1610 } 1611 check_peer_pmtu(dst, peer); 1612 } 1613 } 1614 1615 static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer) 1616 { 1617 struct rtable *rt = (struct rtable *) dst; 1618 __be32 orig_gw = rt->rt_gateway; 1619 1620 dst_confirm(&rt->dst); 1621 1622 neigh_release(rt->dst.neighbour); 1623 rt->dst.neighbour = NULL; 1624 1625 rt->rt_gateway = peer->redirect_learned.a4; 1626 if (rt_bind_neighbour(rt) || 1627 !(rt->dst.neighbour->nud_state & NUD_VALID)) { 1628 if (rt->dst.neighbour) 1629 neigh_event_send(rt->dst.neighbour, NULL); 1630 rt->rt_gateway = orig_gw; 1631 return -EAGAIN; 1632 } else { 1633 rt->rt_flags |= RTCF_REDIRECTED; 1634 
call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, 1635 rt->dst.neighbour); 1636 } 1637 return 0; 1638 } 1639 1640 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) 1641 { 1642 struct rtable *rt = (struct rtable *) dst; 1643 1644 if (rt_is_expired(rt)) 1645 return NULL; 1646 if (rt->rt_peer_genid != rt_peer_genid()) { 1647 struct inet_peer *peer; 1648 1649 if (!rt->peer) 1650 rt_bind_peer(rt, rt->rt_dst, 0); 1651 1652 peer = rt->peer; 1653 if (peer) { 1654 check_peer_pmtu(dst, peer); 1655 1656 if (peer->redirect_learned.a4 && 1657 peer->redirect_learned.a4 != rt->rt_gateway) { 1658 if (check_peer_redir(dst, peer)) 1659 return NULL; 1660 } 1661 } 1662 1663 rt->rt_peer_genid = rt_peer_genid(); 1664 } 1665 return dst; 1666 } 1667 1668 static void ipv4_dst_destroy(struct dst_entry *dst) 1669 { 1670 struct rtable *rt = (struct rtable *) dst; 1671 struct inet_peer *peer = rt->peer; 1672 1673 if (rt->fi) { 1674 fib_info_put(rt->fi); 1675 rt->fi = NULL; 1676 } 1677 if (peer) { 1678 rt->peer = NULL; 1679 inet_putpeer(peer); 1680 } 1681 } 1682 1683 1684 static void ipv4_link_failure(struct sk_buff *skb) 1685 { 1686 struct rtable *rt; 1687 1688 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); 1689 1690 rt = skb_rtable(skb); 1691 if (rt && rt->peer && peer_pmtu_cleaned(rt->peer)) 1692 dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig); 1693 } 1694 1695 static int ip_rt_bug(struct sk_buff *skb) 1696 { 1697 printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n", 1698 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, 1699 skb->dev ? skb->dev->name : "?"); 1700 kfree_skb(skb); 1701 WARN_ON(1); 1702 return 0; 1703 } 1704 1705 /* 1706 We do not cache source address of outgoing interface, 1707 because it is used only by IP RR, TS and SRR options, 1708 so that it out of fast path. 1709 1710 BTW remember: "addr" is allowed to be not aligned 1711 in IP options! 
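 *
 * (That is why ip_rt_get_source() below copies the address with
 *  memcpy(addr, &src, 4) instead of a direct 32-bit store, which could
 *  fault on architectures that require aligned accesses.)
 *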
1712 */ 1713 1714 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt) 1715 { 1716 __be32 src; 1717 1718 if (rt_is_output_route(rt)) 1719 src = ip_hdr(skb)->saddr; 1720 else { 1721 struct fib_result res; 1722 struct flowi4 fl4; 1723 struct iphdr *iph; 1724 1725 iph = ip_hdr(skb); 1726 1727 memset(&fl4, 0, sizeof(fl4)); 1728 fl4.daddr = iph->daddr; 1729 fl4.saddr = iph->saddr; 1730 fl4.flowi4_tos = iph->tos; 1731 fl4.flowi4_oif = rt->dst.dev->ifindex; 1732 fl4.flowi4_iif = skb->dev->ifindex; 1733 fl4.flowi4_mark = skb->mark; 1734 1735 rcu_read_lock(); 1736 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0) 1737 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res); 1738 else 1739 src = inet_select_addr(rt->dst.dev, rt->rt_gateway, 1740 RT_SCOPE_UNIVERSE); 1741 rcu_read_unlock(); 1742 } 1743 memcpy(addr, &src, 4); 1744 } 1745 1746 #ifdef CONFIG_IP_ROUTE_CLASSID 1747 static void set_class_tag(struct rtable *rt, u32 tag) 1748 { 1749 if (!(rt->dst.tclassid & 0xFFFF)) 1750 rt->dst.tclassid |= tag & 0xFFFF; 1751 if (!(rt->dst.tclassid & 0xFFFF0000)) 1752 rt->dst.tclassid |= tag & 0xFFFF0000; 1753 } 1754 #endif 1755 1756 static unsigned int ipv4_default_advmss(const struct dst_entry *dst) 1757 { 1758 unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS); 1759 1760 if (advmss == 0) { 1761 advmss = max_t(unsigned int, dst->dev->mtu - 40, 1762 ip_rt_min_advmss); 1763 if (advmss > 65535 - 40) 1764 advmss = 65535 - 40; 1765 } 1766 return advmss; 1767 } 1768 1769 static unsigned int ipv4_default_mtu(const struct dst_entry *dst) 1770 { 1771 unsigned int mtu = dst->dev->mtu; 1772 1773 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) { 1774 const struct rtable *rt = (const struct rtable *) dst; 1775 1776 if (rt->rt_gateway != rt->rt_dst && mtu > 576) 1777 mtu = 576; 1778 } 1779 1780 if (mtu > IP_MAX_MTU) 1781 mtu = IP_MAX_MTU; 1782 1783 return mtu; 1784 } 1785 1786 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4, 1787 struct fib_info *fi) 1788 { 1789 struct inet_peer *peer; 1790 int create = 0; 1791 1792 /* If a peer entry exists for this destination, we must hook 1793 * it up in order to get at cached metrics. 
1794 */ 1795 if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS)) 1796 create = 1; 1797 1798 rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create); 1799 if (peer) { 1800 rt->rt_peer_genid = rt_peer_genid(); 1801 if (inet_metrics_new(peer)) 1802 memcpy(peer->metrics, fi->fib_metrics, 1803 sizeof(u32) * RTAX_MAX); 1804 dst_init_metrics(&rt->dst, peer->metrics, false); 1805 1806 check_peer_pmtu(&rt->dst, peer); 1807 if (peer->redirect_learned.a4 && 1808 peer->redirect_learned.a4 != rt->rt_gateway) { 1809 rt->rt_gateway = peer->redirect_learned.a4; 1810 rt->rt_flags |= RTCF_REDIRECTED; 1811 } 1812 } else { 1813 if (fi->fib_metrics != (u32 *) dst_default_metrics) { 1814 rt->fi = fi; 1815 atomic_inc(&fi->fib_clntref); 1816 } 1817 dst_init_metrics(&rt->dst, fi->fib_metrics, true); 1818 } 1819 } 1820 1821 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4, 1822 const struct fib_result *res, 1823 struct fib_info *fi, u16 type, u32 itag) 1824 { 1825 struct dst_entry *dst = &rt->dst; 1826 1827 if (fi) { 1828 if (FIB_RES_GW(*res) && 1829 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) 1830 rt->rt_gateway = FIB_RES_GW(*res); 1831 rt_init_metrics(rt, fl4, fi); 1832 #ifdef CONFIG_IP_ROUTE_CLASSID 1833 dst->tclassid = FIB_RES_NH(*res).nh_tclassid; 1834 #endif 1835 } 1836 1837 if (dst_mtu(dst) > IP_MAX_MTU) 1838 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU); 1839 if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40) 1840 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40); 1841 1842 #ifdef CONFIG_IP_ROUTE_CLASSID 1843 #ifdef CONFIG_IP_MULTIPLE_TABLES 1844 set_class_tag(rt, fib_rules_tclass(res)); 1845 #endif 1846 set_class_tag(rt, itag); 1847 #endif 1848 } 1849 1850 static struct rtable *rt_dst_alloc(struct net_device *dev, 1851 bool nopolicy, bool noxfrm) 1852 { 1853 return dst_alloc(&ipv4_dst_ops, dev, 1, -1, 1854 DST_HOST | 1855 (nopolicy ? DST_NOPOLICY : 0) | 1856 (noxfrm ? DST_NOXFRM : 0)); 1857 } 1858 1859 /* called in rcu_read_lock() section */ 1860 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, 1861 u8 tos, struct net_device *dev, int our) 1862 { 1863 unsigned int hash; 1864 struct rtable *rth; 1865 __be32 spec_dst; 1866 struct in_device *in_dev = __in_dev_get_rcu(dev); 1867 u32 itag = 0; 1868 int err; 1869 1870 /* Primary sanity checks. 
*/ 1871 1872 if (in_dev == NULL) 1873 return -EINVAL; 1874 1875 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || 1876 ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP)) 1877 goto e_inval; 1878 1879 if (ipv4_is_zeronet(saddr)) { 1880 if (!ipv4_is_local_multicast(daddr)) 1881 goto e_inval; 1882 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); 1883 } else { 1884 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst, 1885 &itag); 1886 if (err < 0) 1887 goto e_err; 1888 } 1889 rth = rt_dst_alloc(init_net.loopback_dev, 1890 IN_DEV_CONF_GET(in_dev, NOPOLICY), false); 1891 if (!rth) 1892 goto e_nobufs; 1893 1894 #ifdef CONFIG_IP_ROUTE_CLASSID 1895 rth->dst.tclassid = itag; 1896 #endif 1897 rth->dst.output = ip_rt_bug; 1898 1899 rth->rt_key_dst = daddr; 1900 rth->rt_key_src = saddr; 1901 rth->rt_genid = rt_genid(dev_net(dev)); 1902 rth->rt_flags = RTCF_MULTICAST; 1903 rth->rt_type = RTN_MULTICAST; 1904 rth->rt_key_tos = tos; 1905 rth->rt_dst = daddr; 1906 rth->rt_src = saddr; 1907 rth->rt_route_iif = dev->ifindex; 1908 rth->rt_iif = dev->ifindex; 1909 rth->rt_oif = 0; 1910 rth->rt_mark = skb->mark; 1911 rth->rt_gateway = daddr; 1912 rth->rt_spec_dst= spec_dst; 1913 rth->rt_peer_genid = 0; 1914 rth->peer = NULL; 1915 rth->fi = NULL; 1916 if (our) { 1917 rth->dst.input= ip_local_deliver; 1918 rth->rt_flags |= RTCF_LOCAL; 1919 } 1920 1921 #ifdef CONFIG_IP_MROUTE 1922 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev)) 1923 rth->dst.input = ip_mr_input; 1924 #endif 1925 RT_CACHE_STAT_INC(in_slow_mc); 1926 1927 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev))); 1928 rth = rt_intern_hash(hash, rth, skb, dev->ifindex); 1929 return IS_ERR(rth) ? PTR_ERR(rth) : 0; 1930 1931 e_nobufs: 1932 return -ENOBUFS; 1933 e_inval: 1934 return -EINVAL; 1935 e_err: 1936 return err; 1937 } 1938 1939 1940 static void ip_handle_martian_source(struct net_device *dev, 1941 struct in_device *in_dev, 1942 struct sk_buff *skb, 1943 __be32 daddr, 1944 __be32 saddr) 1945 { 1946 RT_CACHE_STAT_INC(in_martian_src); 1947 #ifdef CONFIG_IP_ROUTE_VERBOSE 1948 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) { 1949 /* 1950 * RFC1812 recommendation, if source is martian, 1951 * the only hint is MAC header. 1952 */ 1953 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n", 1954 &daddr, &saddr, dev->name); 1955 if (dev->hard_header_len && skb_mac_header_was_set(skb)) { 1956 int i; 1957 const unsigned char *p = skb_mac_header(skb); 1958 printk(KERN_WARNING "ll header: "); 1959 for (i = 0; i < dev->hard_header_len; i++, p++) { 1960 printk("%02x", *p); 1961 if (i < (dev->hard_header_len - 1)) 1962 printk(":"); 1963 } 1964 printk("\n"); 1965 } 1966 } 1967 #endif 1968 } 1969 1970 /* called in rcu_read_lock() section */ 1971 static int __mkroute_input(struct sk_buff *skb, 1972 const struct fib_result *res, 1973 struct in_device *in_dev, 1974 __be32 daddr, __be32 saddr, u32 tos, 1975 struct rtable **result) 1976 { 1977 struct rtable *rth; 1978 int err; 1979 struct in_device *out_dev; 1980 unsigned int flags = 0; 1981 __be32 spec_dst; 1982 u32 itag; 1983 1984 /* get a working reference to the output device */ 1985 out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res)); 1986 if (out_dev == NULL) { 1987 if (net_ratelimit()) 1988 printk(KERN_CRIT "Bug in ip_route_input" \ 1989 "_slow(). 
Please, report\n"); 1990 return -EINVAL; 1991 } 1992 1993 1994 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res), 1995 in_dev->dev, &spec_dst, &itag); 1996 if (err < 0) { 1997 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, 1998 saddr); 1999 2000 goto cleanup; 2001 } 2002 2003 if (err) 2004 flags |= RTCF_DIRECTSRC; 2005 2006 if (out_dev == in_dev && err && 2007 (IN_DEV_SHARED_MEDIA(out_dev) || 2008 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) 2009 flags |= RTCF_DOREDIRECT; 2010 2011 if (skb->protocol != htons(ETH_P_IP)) { 2012 /* Not IP (i.e. ARP). Do not create route, if it is 2013 * invalid for proxy arp. DNAT routes are always valid. 2014 * 2015 * Proxy arp feature have been extended to allow, ARP 2016 * replies back to the same interface, to support 2017 * Private VLAN switch technologies. See arp.c. 2018 */ 2019 if (out_dev == in_dev && 2020 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) { 2021 err = -EINVAL; 2022 goto cleanup; 2023 } 2024 } 2025 2026 rth = rt_dst_alloc(out_dev->dev, 2027 IN_DEV_CONF_GET(in_dev, NOPOLICY), 2028 IN_DEV_CONF_GET(out_dev, NOXFRM)); 2029 if (!rth) { 2030 err = -ENOBUFS; 2031 goto cleanup; 2032 } 2033 2034 rth->rt_key_dst = daddr; 2035 rth->rt_key_src = saddr; 2036 rth->rt_genid = rt_genid(dev_net(rth->dst.dev)); 2037 rth->rt_flags = flags; 2038 rth->rt_type = res->type; 2039 rth->rt_key_tos = tos; 2040 rth->rt_dst = daddr; 2041 rth->rt_src = saddr; 2042 rth->rt_route_iif = in_dev->dev->ifindex; 2043 rth->rt_iif = in_dev->dev->ifindex; 2044 rth->rt_oif = 0; 2045 rth->rt_mark = skb->mark; 2046 rth->rt_gateway = daddr; 2047 rth->rt_spec_dst= spec_dst; 2048 rth->rt_peer_genid = 0; 2049 rth->peer = NULL; 2050 rth->fi = NULL; 2051 2052 rth->dst.input = ip_forward; 2053 rth->dst.output = ip_output; 2054 2055 rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag); 2056 2057 *result = rth; 2058 err = 0; 2059 cleanup: 2060 return err; 2061 } 2062 2063 static int ip_mkroute_input(struct sk_buff *skb, 2064 struct fib_result *res, 2065 const struct flowi4 *fl4, 2066 struct in_device *in_dev, 2067 __be32 daddr, __be32 saddr, u32 tos) 2068 { 2069 struct rtable* rth = NULL; 2070 int err; 2071 unsigned hash; 2072 2073 #ifdef CONFIG_IP_ROUTE_MULTIPATH 2074 if (res->fi && res->fi->fib_nhs > 1) 2075 fib_select_multipath(res); 2076 #endif 2077 2078 /* create a routing cache entry */ 2079 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth); 2080 if (err) 2081 return err; 2082 2083 /* put it into the cache */ 2084 hash = rt_hash(daddr, saddr, fl4->flowi4_iif, 2085 rt_genid(dev_net(rth->dst.dev))); 2086 rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif); 2087 if (IS_ERR(rth)) 2088 return PTR_ERR(rth); 2089 return 0; 2090 } 2091 2092 /* 2093 * NOTE. We drop all the packets that has local source 2094 * addresses, because every properly looped back packet 2095 * must have correct destination already attached by output routine. 2096 * 2097 * Such approach solves two big problems: 2098 * 1. Not simplex devices are handled properly. 2099 * 2. IP spoofing attempts are filtered with 100% of guarantee. 
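 *
 * (The spoofing filter is the reverse-path check: fib_validate_source()
 *  is used below to verify that the source address is reachable through
 *  the interface the packet arrived on before a route is accepted.)
 *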
2100 * called with rcu_read_lock() 2101 */ 2102 2103 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, 2104 u8 tos, struct net_device *dev) 2105 { 2106 struct fib_result res; 2107 struct in_device *in_dev = __in_dev_get_rcu(dev); 2108 struct flowi4 fl4; 2109 unsigned flags = 0; 2110 u32 itag = 0; 2111 struct rtable * rth; 2112 unsigned hash; 2113 __be32 spec_dst; 2114 int err = -EINVAL; 2115 struct net * net = dev_net(dev); 2116 2117 /* IP on this device is disabled. */ 2118 2119 if (!in_dev) 2120 goto out; 2121 2122 /* Check for the most weird martians, which can be not detected 2123 by fib_lookup. 2124 */ 2125 2126 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || 2127 ipv4_is_loopback(saddr)) 2128 goto martian_source; 2129 2130 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0)) 2131 goto brd_input; 2132 2133 /* Accept zero addresses only to limited broadcast; 2134 * I even do not know to fix it or not. Waiting for complains :-) 2135 */ 2136 if (ipv4_is_zeronet(saddr)) 2137 goto martian_source; 2138 2139 if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr)) 2140 goto martian_destination; 2141 2142 /* 2143 * Now we are ready to route packet. 2144 */ 2145 fl4.flowi4_oif = 0; 2146 fl4.flowi4_iif = dev->ifindex; 2147 fl4.flowi4_mark = skb->mark; 2148 fl4.flowi4_tos = tos; 2149 fl4.flowi4_scope = RT_SCOPE_UNIVERSE; 2150 fl4.daddr = daddr; 2151 fl4.saddr = saddr; 2152 err = fib_lookup(net, &fl4, &res); 2153 if (err != 0) { 2154 if (!IN_DEV_FORWARD(in_dev)) 2155 goto e_hostunreach; 2156 goto no_route; 2157 } 2158 2159 RT_CACHE_STAT_INC(in_slow_tot); 2160 2161 if (res.type == RTN_BROADCAST) 2162 goto brd_input; 2163 2164 if (res.type == RTN_LOCAL) { 2165 err = fib_validate_source(skb, saddr, daddr, tos, 2166 net->loopback_dev->ifindex, 2167 dev, &spec_dst, &itag); 2168 if (err < 0) 2169 goto martian_source_keep_err; 2170 if (err) 2171 flags |= RTCF_DIRECTSRC; 2172 spec_dst = daddr; 2173 goto local_input; 2174 } 2175 2176 if (!IN_DEV_FORWARD(in_dev)) 2177 goto e_hostunreach; 2178 if (res.type != RTN_UNICAST) 2179 goto martian_destination; 2180 2181 err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos); 2182 out: return err; 2183 2184 brd_input: 2185 if (skb->protocol != htons(ETH_P_IP)) 2186 goto e_inval; 2187 2188 if (ipv4_is_zeronet(saddr)) 2189 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); 2190 else { 2191 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst, 2192 &itag); 2193 if (err < 0) 2194 goto martian_source_keep_err; 2195 if (err) 2196 flags |= RTCF_DIRECTSRC; 2197 } 2198 flags |= RTCF_BROADCAST; 2199 res.type = RTN_BROADCAST; 2200 RT_CACHE_STAT_INC(in_brd); 2201 2202 local_input: 2203 rth = rt_dst_alloc(net->loopback_dev, 2204 IN_DEV_CONF_GET(in_dev, NOPOLICY), false); 2205 if (!rth) 2206 goto e_nobufs; 2207 2208 rth->dst.input= ip_local_deliver; 2209 rth->dst.output= ip_rt_bug; 2210 #ifdef CONFIG_IP_ROUTE_CLASSID 2211 rth->dst.tclassid = itag; 2212 #endif 2213 2214 rth->rt_key_dst = daddr; 2215 rth->rt_key_src = saddr; 2216 rth->rt_genid = rt_genid(net); 2217 rth->rt_flags = flags|RTCF_LOCAL; 2218 rth->rt_type = res.type; 2219 rth->rt_key_tos = tos; 2220 rth->rt_dst = daddr; 2221 rth->rt_src = saddr; 2222 #ifdef CONFIG_IP_ROUTE_CLASSID 2223 rth->dst.tclassid = itag; 2224 #endif 2225 rth->rt_route_iif = dev->ifindex; 2226 rth->rt_iif = dev->ifindex; 2227 rth->rt_oif = 0; 2228 rth->rt_mark = skb->mark; 2229 rth->rt_gateway = daddr; 2230 rth->rt_spec_dst= spec_dst; 2231 rth->rt_peer_genid = 0; 2232 rth->peer = 
NULL;
	rth->fi = NULL;
	if (res.type == RTN_UNREACHABLE) {
		rth->dst.input = ip_error;
		rth->dst.error = -err;
		rth->rt_flags	&= ~RTCF_LOCAL;
	}
	hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
	rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
	err = 0;
	if (IS_ERR(rth))
		err = PTR_ERR(rth);
	goto out;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	res.type = RTN_UNREACHABLE;
	if (err == -ESRCH)
		err = -ENETUNREACH;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
			&daddr, &saddr, dev->name);
#endif

e_hostunreach:
	err = -EHOSTUNREACH;
	goto out;

e_inval:
	err = -EINVAL;
	goto out;

e_nobufs:
	err = -ENOBUFS;
	goto out;

martian_source:
	err = -EINVAL;
martian_source_keep_err:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto out;
}

int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			  u8 tos, struct net_device *dev, bool noref)
{
	struct rtable *rth;
	unsigned hash;
	int iif = dev->ifindex;
	struct net *net;
	int res;

	net = dev_net(dev);

	rcu_read_lock();

	if (!rt_caching(net))
		goto skip_cache;

	tos &= IPTOS_RT_MASK;
	hash = rt_hash(daddr, saddr, iif, rt_genid(net));

	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->dst.rt_next)) {
		if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
		     ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
		     (rth->rt_iif ^ iif) |
		     rth->rt_oif |
		     (rth->rt_key_tos ^ tos)) == 0 &&
		    rth->rt_mark == skb->mark &&
		    net_eq(dev_net(rth->dst.dev), net) &&
		    !rt_is_expired(rth)) {
			if (noref) {
				dst_use_noref(&rth->dst, jiffies);
				skb_dst_set_noref(skb, &rth->dst);
			} else {
				dst_use(&rth->dst, jiffies);
				skb_dst_set(skb, &rth->dst);
			}
			RT_CACHE_STAT_INC(in_hit);
			rcu_read_unlock();
			return 0;
		}
		RT_CACHE_STAT_INC(in_hlist_search);
	}

skip_cache:
	/* Multicast recognition logic was moved from the route cache to
	   here.  The problem was that too many Ethernet cards have broken
	   or missing hardware multicast filters :-( As a result, a host on
	   a multicast-capable network acquired a lot of useless route cache
	   entries, e.g. from SDR announcements from all over the world.
	   Now we try to get rid of them.  Provided the software IP multicast
	   filter is organized reasonably (at least hashed), this does not
	   cause a slowdown compared with route cache reject entries.
	   Note that multicast routers are not affected, because a route
	   cache entry is eventually created for them.
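	   For illustration only, an editorial sketch of the decision made
	   just below (the IN_DEV_MFORWARD() leg only exists when
	   CONFIG_IP_MROUTE is enabled):

		if (ipv4_is_multicast(daddr)) {
			int our = ip_check_mc_rcu(in_dev, daddr, saddr,
						  ip_hdr(skb)->protocol);
			if (our || (!ipv4_is_local_multicast(daddr) &&
				    IN_DEV_MFORWARD(in_dev)))
				return ip_route_input_mc(skb, daddr, saddr,
							 tos, dev, our);
			return -EINVAL;		// neither ours nor forwarded
		}
		return ip_route_input_slow(skb, daddr, saddr, tos, dev);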
2338 */ 2339 if (ipv4_is_multicast(daddr)) { 2340 struct in_device *in_dev = __in_dev_get_rcu(dev); 2341 2342 if (in_dev) { 2343 int our = ip_check_mc_rcu(in_dev, daddr, saddr, 2344 ip_hdr(skb)->protocol); 2345 if (our 2346 #ifdef CONFIG_IP_MROUTE 2347 || 2348 (!ipv4_is_local_multicast(daddr) && 2349 IN_DEV_MFORWARD(in_dev)) 2350 #endif 2351 ) { 2352 int res = ip_route_input_mc(skb, daddr, saddr, 2353 tos, dev, our); 2354 rcu_read_unlock(); 2355 return res; 2356 } 2357 } 2358 rcu_read_unlock(); 2359 return -EINVAL; 2360 } 2361 res = ip_route_input_slow(skb, daddr, saddr, tos, dev); 2362 rcu_read_unlock(); 2363 return res; 2364 } 2365 EXPORT_SYMBOL(ip_route_input_common); 2366 2367 /* called with rcu_read_lock() */ 2368 static struct rtable *__mkroute_output(const struct fib_result *res, 2369 const struct flowi4 *fl4, 2370 __be32 orig_daddr, __be32 orig_saddr, 2371 int orig_oif, struct net_device *dev_out, 2372 unsigned int flags) 2373 { 2374 struct fib_info *fi = res->fi; 2375 u32 tos = RT_FL_TOS(fl4); 2376 struct in_device *in_dev; 2377 u16 type = res->type; 2378 struct rtable *rth; 2379 2380 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK)) 2381 return ERR_PTR(-EINVAL); 2382 2383 if (ipv4_is_lbcast(fl4->daddr)) 2384 type = RTN_BROADCAST; 2385 else if (ipv4_is_multicast(fl4->daddr)) 2386 type = RTN_MULTICAST; 2387 else if (ipv4_is_zeronet(fl4->daddr)) 2388 return ERR_PTR(-EINVAL); 2389 2390 if (dev_out->flags & IFF_LOOPBACK) 2391 flags |= RTCF_LOCAL; 2392 2393 in_dev = __in_dev_get_rcu(dev_out); 2394 if (!in_dev) 2395 return ERR_PTR(-EINVAL); 2396 2397 if (type == RTN_BROADCAST) { 2398 flags |= RTCF_BROADCAST | RTCF_LOCAL; 2399 fi = NULL; 2400 } else if (type == RTN_MULTICAST) { 2401 flags |= RTCF_MULTICAST | RTCF_LOCAL; 2402 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr, 2403 fl4->flowi4_proto)) 2404 flags &= ~RTCF_LOCAL; 2405 /* If multicast route do not exist use 2406 * default one, but do not gateway in this case. 2407 * Yes, it is hack. 2408 */ 2409 if (fi && res->prefixlen < 4) 2410 fi = NULL; 2411 } 2412 2413 rth = rt_dst_alloc(dev_out, 2414 IN_DEV_CONF_GET(in_dev, NOPOLICY), 2415 IN_DEV_CONF_GET(in_dev, NOXFRM)); 2416 if (!rth) 2417 return ERR_PTR(-ENOBUFS); 2418 2419 rth->dst.output = ip_output; 2420 2421 rth->rt_key_dst = orig_daddr; 2422 rth->rt_key_src = orig_saddr; 2423 rth->rt_genid = rt_genid(dev_net(dev_out)); 2424 rth->rt_flags = flags; 2425 rth->rt_type = type; 2426 rth->rt_key_tos = tos; 2427 rth->rt_dst = fl4->daddr; 2428 rth->rt_src = fl4->saddr; 2429 rth->rt_route_iif = 0; 2430 rth->rt_iif = orig_oif ? 
: dev_out->ifindex; 2431 rth->rt_oif = orig_oif; 2432 rth->rt_mark = fl4->flowi4_mark; 2433 rth->rt_gateway = fl4->daddr; 2434 rth->rt_spec_dst= fl4->saddr; 2435 rth->rt_peer_genid = 0; 2436 rth->peer = NULL; 2437 rth->fi = NULL; 2438 2439 RT_CACHE_STAT_INC(out_slow_tot); 2440 2441 if (flags & RTCF_LOCAL) { 2442 rth->dst.input = ip_local_deliver; 2443 rth->rt_spec_dst = fl4->daddr; 2444 } 2445 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { 2446 rth->rt_spec_dst = fl4->saddr; 2447 if (flags & RTCF_LOCAL && 2448 !(dev_out->flags & IFF_LOOPBACK)) { 2449 rth->dst.output = ip_mc_output; 2450 RT_CACHE_STAT_INC(out_slow_mc); 2451 } 2452 #ifdef CONFIG_IP_MROUTE 2453 if (type == RTN_MULTICAST) { 2454 if (IN_DEV_MFORWARD(in_dev) && 2455 !ipv4_is_local_multicast(fl4->daddr)) { 2456 rth->dst.input = ip_mr_input; 2457 rth->dst.output = ip_mc_output; 2458 } 2459 } 2460 #endif 2461 } 2462 2463 rt_set_nexthop(rth, fl4, res, fi, type, 0); 2464 2465 return rth; 2466 } 2467 2468 /* 2469 * Major route resolver routine. 2470 * called with rcu_read_lock(); 2471 */ 2472 2473 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4) 2474 { 2475 struct net_device *dev_out = NULL; 2476 u32 tos = RT_FL_TOS(fl4); 2477 unsigned int flags = 0; 2478 struct fib_result res; 2479 struct rtable *rth; 2480 __be32 orig_daddr; 2481 __be32 orig_saddr; 2482 int orig_oif; 2483 2484 res.fi = NULL; 2485 #ifdef CONFIG_IP_MULTIPLE_TABLES 2486 res.r = NULL; 2487 #endif 2488 2489 orig_daddr = fl4->daddr; 2490 orig_saddr = fl4->saddr; 2491 orig_oif = fl4->flowi4_oif; 2492 2493 fl4->flowi4_iif = net->loopback_dev->ifindex; 2494 fl4->flowi4_tos = tos & IPTOS_RT_MASK; 2495 fl4->flowi4_scope = ((tos & RTO_ONLINK) ? 2496 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE); 2497 2498 rcu_read_lock(); 2499 if (fl4->saddr) { 2500 rth = ERR_PTR(-EINVAL); 2501 if (ipv4_is_multicast(fl4->saddr) || 2502 ipv4_is_lbcast(fl4->saddr) || 2503 ipv4_is_zeronet(fl4->saddr)) 2504 goto out; 2505 2506 /* I removed check for oif == dev_out->oif here. 2507 It was wrong for two reasons: 2508 1. ip_dev_find(net, saddr) can return wrong iface, if saddr 2509 is assigned to multiple interfaces. 2510 2. Moreover, we are allowed to send packets with saddr 2511 of another iface. --ANK 2512 */ 2513 2514 if (fl4->flowi4_oif == 0 && 2515 (ipv4_is_multicast(fl4->daddr) || 2516 ipv4_is_lbcast(fl4->daddr))) { 2517 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ 2518 dev_out = __ip_dev_find(net, fl4->saddr, false); 2519 if (dev_out == NULL) 2520 goto out; 2521 2522 /* Special hack: user can direct multicasts 2523 and limited broadcast via necessary interface 2524 without fiddling with IP_MULTICAST_IF or IP_PKTINFO. 2525 This hack is not just for fun, it allows 2526 vic,vat and friends to work. 2527 They bind socket to loopback, set ttl to zero 2528 and expect that it will work. 2529 From the viewpoint of routing cache they are broken, 2530 because we are not allowed to build multicast path 2531 with loopback source addr (look, routing cache 2532 cannot know, that ttl is zero, so that packet 2533 will not leave this host and route is valid). 2534 Luckily, this hack is good workaround. 
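			   For illustration only, a hypothetical userspace
			   sketch of what such an application does (addresses
			   and buffer are made-up examples):

				char buf[64] = "sap announcement";
				struct sockaddr_in src = {
					.sin_family = AF_INET,
					.sin_addr.s_addr = inet_addr("192.0.2.1"),
				};
				struct sockaddr_in grp = {
					.sin_family = AF_INET,
					.sin_port = htons(9875),
					.sin_addr.s_addr = inet_addr("224.2.127.254"),
				};
				int fd = socket(AF_INET, SOCK_DGRAM, 0);

				bind(fd, (struct sockaddr *)&src, sizeof(src));
				// no IP_MULTICAST_IF: the branch below derives
				// the output device from the bound source address
				sendto(fd, buf, sizeof(buf), 0,
				       (struct sockaddr *)&grp, sizeof(grp));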
			 */

			fl4->flowi4_oif = dev_out->ifindex;
			goto make_route;
		}

		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			if (!__ip_dev_find(net, fl4->saddr, false))
				goto out;
		}
	}


	if (fl4->flowi4_oif) {
		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
		rth = ERR_PTR(-ENODEV);
		if (dev_out == NULL)
			goto out;

		/* RACE: Check return value of inet_select_addr instead. */
		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
			rth = ERR_PTR(-ENETUNREACH);
			goto out;
		}
		if (ipv4_is_local_multicast(fl4->daddr) ||
		    ipv4_is_lbcast(fl4->daddr)) {
			if (!fl4->saddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (!fl4->saddr) {
			if (ipv4_is_multicast(fl4->daddr))
				fl4->saddr = inet_select_addr(dev_out, 0,
							      fl4->flowi4_scope);
			else if (!fl4->daddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	if (!fl4->daddr) {
		fl4->daddr = fl4->saddr;
		if (!fl4->daddr)
			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = net->loopback_dev->ifindex;
		res.type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	if (fib_lookup(net, fl4, &res)) {
		res.fi = NULL;
		if (fl4->flowi4_oif) {
			/* Apparently, the routing tables are wrong.  Assume
			   that the destination is on-link.

			   WHY? DW.
			   Because we are allowed to send to an interface
			   even if it has NO routes and NO assigned
			   addresses.  When oif is specified, the routing
			   tables are consulted for only one purpose:
			   to catch whether the destination is gatewayed
			   rather than direct.  Moreover, if MSG_DONTROUTE
			   is set, we send the packet ignoring both the
			   routing tables and the ifaddr state. --ANK

			   We could do this even when oif is unknown,
			   as IPv6 likely does, but we do not.
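			   For illustration only, a hypothetical userspace
			   sketch of the case this branch serves: a socket
			   tied to an interface, sending to an on-link host
			   for which no route exists (names and addresses
			   are made-up examples):

				struct sockaddr_in dst = {
					.sin_family = AF_INET,
					.sin_port = htons(7),
					.sin_addr.s_addr = inet_addr("192.0.2.99"),
				};
				char msg[] = "ping";
				int fd = socket(AF_INET, SOCK_DGRAM, 0);

				setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE,
					   "eth0", sizeof("eth0"));
				sendto(fd, msg, sizeof(msg), 0,
				       (struct sockaddr *)&dst, sizeof(dst));

			   SO_BINDTODEVICE supplies flowi4_oif, so even with
			   empty routing tables the lookup falls through to
			   here and the destination is treated as on-link.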
2607 */ 2608 2609 if (fl4->saddr == 0) 2610 fl4->saddr = inet_select_addr(dev_out, 0, 2611 RT_SCOPE_LINK); 2612 res.type = RTN_UNICAST; 2613 goto make_route; 2614 } 2615 rth = ERR_PTR(-ENETUNREACH); 2616 goto out; 2617 } 2618 2619 if (res.type == RTN_LOCAL) { 2620 if (!fl4->saddr) { 2621 if (res.fi->fib_prefsrc) 2622 fl4->saddr = res.fi->fib_prefsrc; 2623 else 2624 fl4->saddr = fl4->daddr; 2625 } 2626 dev_out = net->loopback_dev; 2627 fl4->flowi4_oif = dev_out->ifindex; 2628 res.fi = NULL; 2629 flags |= RTCF_LOCAL; 2630 goto make_route; 2631 } 2632 2633 #ifdef CONFIG_IP_ROUTE_MULTIPATH 2634 if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0) 2635 fib_select_multipath(&res); 2636 else 2637 #endif 2638 if (!res.prefixlen && 2639 res.table->tb_num_default > 1 && 2640 res.type == RTN_UNICAST && !fl4->flowi4_oif) 2641 fib_select_default(&res); 2642 2643 if (!fl4->saddr) 2644 fl4->saddr = FIB_RES_PREFSRC(net, res); 2645 2646 dev_out = FIB_RES_DEV(res); 2647 fl4->flowi4_oif = dev_out->ifindex; 2648 2649 2650 make_route: 2651 rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif, 2652 dev_out, flags); 2653 if (!IS_ERR(rth)) { 2654 unsigned int hash; 2655 2656 hash = rt_hash(orig_daddr, orig_saddr, orig_oif, 2657 rt_genid(dev_net(dev_out))); 2658 rth = rt_intern_hash(hash, rth, NULL, orig_oif); 2659 } 2660 2661 out: 2662 rcu_read_unlock(); 2663 return rth; 2664 } 2665 2666 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4) 2667 { 2668 struct rtable *rth; 2669 unsigned int hash; 2670 2671 if (!rt_caching(net)) 2672 goto slow_output; 2673 2674 hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net)); 2675 2676 rcu_read_lock_bh(); 2677 for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth; 2678 rth = rcu_dereference_bh(rth->dst.rt_next)) { 2679 if (rth->rt_key_dst == flp4->daddr && 2680 rth->rt_key_src == flp4->saddr && 2681 rt_is_output_route(rth) && 2682 rth->rt_oif == flp4->flowi4_oif && 2683 rth->rt_mark == flp4->flowi4_mark && 2684 !((rth->rt_key_tos ^ flp4->flowi4_tos) & 2685 (IPTOS_RT_MASK | RTO_ONLINK)) && 2686 net_eq(dev_net(rth->dst.dev), net) && 2687 !rt_is_expired(rth)) { 2688 dst_use(&rth->dst, jiffies); 2689 RT_CACHE_STAT_INC(out_hit); 2690 rcu_read_unlock_bh(); 2691 if (!flp4->saddr) 2692 flp4->saddr = rth->rt_src; 2693 if (!flp4->daddr) 2694 flp4->daddr = rth->rt_dst; 2695 return rth; 2696 } 2697 RT_CACHE_STAT_INC(out_hlist_search); 2698 } 2699 rcu_read_unlock_bh(); 2700 2701 slow_output: 2702 return ip_route_output_slow(net, flp4); 2703 } 2704 EXPORT_SYMBOL_GPL(__ip_route_output_key); 2705 2706 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie) 2707 { 2708 return NULL; 2709 } 2710 2711 static unsigned int ipv4_blackhole_default_mtu(const struct dst_entry *dst) 2712 { 2713 return 0; 2714 } 2715 2716 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu) 2717 { 2718 } 2719 2720 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst, 2721 unsigned long old) 2722 { 2723 return NULL; 2724 } 2725 2726 static struct dst_ops ipv4_dst_blackhole_ops = { 2727 .family = AF_INET, 2728 .protocol = cpu_to_be16(ETH_P_IP), 2729 .destroy = ipv4_dst_destroy, 2730 .check = ipv4_blackhole_dst_check, 2731 .default_mtu = ipv4_blackhole_default_mtu, 2732 .default_advmss = ipv4_default_advmss, 2733 .update_pmtu = ipv4_rt_blackhole_update_pmtu, 2734 .cow_metrics = ipv4_rt_blackhole_cow_metrics, 2735 }; 2736 2737 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry 
*dst_orig) 2738 { 2739 struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0); 2740 struct rtable *ort = (struct rtable *) dst_orig; 2741 2742 if (rt) { 2743 struct dst_entry *new = &rt->dst; 2744 2745 new->__use = 1; 2746 new->input = dst_discard; 2747 new->output = dst_discard; 2748 dst_copy_metrics(new, &ort->dst); 2749 2750 new->dev = ort->dst.dev; 2751 if (new->dev) 2752 dev_hold(new->dev); 2753 2754 rt->rt_key_dst = ort->rt_key_dst; 2755 rt->rt_key_src = ort->rt_key_src; 2756 rt->rt_key_tos = ort->rt_key_tos; 2757 rt->rt_route_iif = ort->rt_route_iif; 2758 rt->rt_iif = ort->rt_iif; 2759 rt->rt_oif = ort->rt_oif; 2760 rt->rt_mark = ort->rt_mark; 2761 2762 rt->rt_genid = rt_genid(net); 2763 rt->rt_flags = ort->rt_flags; 2764 rt->rt_type = ort->rt_type; 2765 rt->rt_dst = ort->rt_dst; 2766 rt->rt_src = ort->rt_src; 2767 rt->rt_gateway = ort->rt_gateway; 2768 rt->rt_spec_dst = ort->rt_spec_dst; 2769 rt->peer = ort->peer; 2770 if (rt->peer) 2771 atomic_inc(&rt->peer->refcnt); 2772 rt->fi = ort->fi; 2773 if (rt->fi) 2774 atomic_inc(&rt->fi->fib_clntref); 2775 2776 dst_free(new); 2777 } 2778 2779 dst_release(dst_orig); 2780 2781 return rt ? &rt->dst : ERR_PTR(-ENOMEM); 2782 } 2783 2784 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, 2785 struct sock *sk) 2786 { 2787 struct rtable *rt = __ip_route_output_key(net, flp4); 2788 2789 if (IS_ERR(rt)) 2790 return rt; 2791 2792 if (flp4->flowi4_proto) 2793 rt = (struct rtable *) xfrm_lookup(net, &rt->dst, 2794 flowi4_to_flowi(flp4), 2795 sk, 0); 2796 2797 return rt; 2798 } 2799 EXPORT_SYMBOL_GPL(ip_route_output_flow); 2800 2801 static int rt_fill_info(struct net *net, 2802 struct sk_buff *skb, u32 pid, u32 seq, int event, 2803 int nowait, unsigned int flags) 2804 { 2805 struct rtable *rt = skb_rtable(skb); 2806 struct rtmsg *r; 2807 struct nlmsghdr *nlh; 2808 long expires = 0; 2809 const struct inet_peer *peer = rt->peer; 2810 u32 id = 0, ts = 0, tsage = 0, error; 2811 2812 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags); 2813 if (nlh == NULL) 2814 return -EMSGSIZE; 2815 2816 r = nlmsg_data(nlh); 2817 r->rtm_family = AF_INET; 2818 r->rtm_dst_len = 32; 2819 r->rtm_src_len = 0; 2820 r->rtm_tos = rt->rt_key_tos; 2821 r->rtm_table = RT_TABLE_MAIN; 2822 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN); 2823 r->rtm_type = rt->rt_type; 2824 r->rtm_scope = RT_SCOPE_UNIVERSE; 2825 r->rtm_protocol = RTPROT_UNSPEC; 2826 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED; 2827 if (rt->rt_flags & RTCF_NOTIFY) 2828 r->rtm_flags |= RTM_F_NOTIFY; 2829 2830 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst); 2831 2832 if (rt->rt_key_src) { 2833 r->rtm_src_len = 32; 2834 NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src); 2835 } 2836 if (rt->dst.dev) 2837 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex); 2838 #ifdef CONFIG_IP_ROUTE_CLASSID 2839 if (rt->dst.tclassid) 2840 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid); 2841 #endif 2842 if (rt_is_input_route(rt)) 2843 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst); 2844 else if (rt->rt_src != rt->rt_key_src) 2845 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src); 2846 2847 if (rt->rt_dst != rt->rt_gateway) 2848 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway); 2849 2850 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0) 2851 goto nla_put_failure; 2852 2853 if (rt->rt_mark) 2854 NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark); 2855 2856 error = rt->dst.error; 2857 if (peer) { 2858 inet_peer_refcheck(rt->peer); 2859 id = atomic_read(&peer->ip_id_count) & 0xffff; 2860 if (peer->tcp_ts_stamp) { 2861 
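			/* Export the inetpeer's cached TCP timestamp and its
			 * age in seconds; both are reported to userspace in
			 * the RTA_CACHEINFO attribute via rtnl_put_cacheinfo()
			 * below.
			 */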
ts = peer->tcp_ts; 2862 tsage = get_seconds() - peer->tcp_ts_stamp; 2863 } 2864 expires = ACCESS_ONCE(peer->pmtu_expires); 2865 if (expires) 2866 expires -= jiffies; 2867 } 2868 2869 if (rt_is_input_route(rt)) { 2870 #ifdef CONFIG_IP_MROUTE 2871 __be32 dst = rt->rt_dst; 2872 2873 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) && 2874 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) { 2875 int err = ipmr_get_route(net, skb, 2876 rt->rt_src, rt->rt_dst, 2877 r, nowait); 2878 if (err <= 0) { 2879 if (!nowait) { 2880 if (err == 0) 2881 return 0; 2882 goto nla_put_failure; 2883 } else { 2884 if (err == -EMSGSIZE) 2885 goto nla_put_failure; 2886 error = err; 2887 } 2888 } 2889 } else 2890 #endif 2891 NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif); 2892 } 2893 2894 if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage, 2895 expires, error) < 0) 2896 goto nla_put_failure; 2897 2898 return nlmsg_end(skb, nlh); 2899 2900 nla_put_failure: 2901 nlmsg_cancel(skb, nlh); 2902 return -EMSGSIZE; 2903 } 2904 2905 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg) 2906 { 2907 struct net *net = sock_net(in_skb->sk); 2908 struct rtmsg *rtm; 2909 struct nlattr *tb[RTA_MAX+1]; 2910 struct rtable *rt = NULL; 2911 __be32 dst = 0; 2912 __be32 src = 0; 2913 u32 iif; 2914 int err; 2915 int mark; 2916 struct sk_buff *skb; 2917 2918 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy); 2919 if (err < 0) 2920 goto errout; 2921 2922 rtm = nlmsg_data(nlh); 2923 2924 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 2925 if (skb == NULL) { 2926 err = -ENOBUFS; 2927 goto errout; 2928 } 2929 2930 /* Reserve room for dummy headers, this skb can pass 2931 through good chunk of routing engine. 2932 */ 2933 skb_reset_mac_header(skb); 2934 skb_reset_network_header(skb); 2935 2936 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */ 2937 ip_hdr(skb)->protocol = IPPROTO_ICMP; 2938 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr)); 2939 2940 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0; 2941 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0; 2942 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0; 2943 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0; 2944 2945 if (iif) { 2946 struct net_device *dev; 2947 2948 dev = __dev_get_by_index(net, iif); 2949 if (dev == NULL) { 2950 err = -ENODEV; 2951 goto errout_free; 2952 } 2953 2954 skb->protocol = htons(ETH_P_IP); 2955 skb->dev = dev; 2956 skb->mark = mark; 2957 local_bh_disable(); 2958 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev); 2959 local_bh_enable(); 2960 2961 rt = skb_rtable(skb); 2962 if (err == 0 && rt->dst.error) 2963 err = -rt->dst.error; 2964 } else { 2965 struct flowi4 fl4 = { 2966 .daddr = dst, 2967 .saddr = src, 2968 .flowi4_tos = rtm->rtm_tos, 2969 .flowi4_oif = tb[RTA_OIF] ? 
nla_get_u32(tb[RTA_OIF]) : 0, 2970 .flowi4_mark = mark, 2971 }; 2972 rt = ip_route_output_key(net, &fl4); 2973 2974 err = 0; 2975 if (IS_ERR(rt)) 2976 err = PTR_ERR(rt); 2977 } 2978 2979 if (err) 2980 goto errout_free; 2981 2982 skb_dst_set(skb, &rt->dst); 2983 if (rtm->rtm_flags & RTM_F_NOTIFY) 2984 rt->rt_flags |= RTCF_NOTIFY; 2985 2986 err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, 2987 RTM_NEWROUTE, 0, 0); 2988 if (err <= 0) 2989 goto errout_free; 2990 2991 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid); 2992 errout: 2993 return err; 2994 2995 errout_free: 2996 kfree_skb(skb); 2997 goto errout; 2998 } 2999 3000 int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb) 3001 { 3002 struct rtable *rt; 3003 int h, s_h; 3004 int idx, s_idx; 3005 struct net *net; 3006 3007 net = sock_net(skb->sk); 3008 3009 s_h = cb->args[0]; 3010 if (s_h < 0) 3011 s_h = 0; 3012 s_idx = idx = cb->args[1]; 3013 for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) { 3014 if (!rt_hash_table[h].chain) 3015 continue; 3016 rcu_read_lock_bh(); 3017 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt; 3018 rt = rcu_dereference_bh(rt->dst.rt_next), idx++) { 3019 if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx) 3020 continue; 3021 if (rt_is_expired(rt)) 3022 continue; 3023 skb_dst_set_noref(skb, &rt->dst); 3024 if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid, 3025 cb->nlh->nlmsg_seq, RTM_NEWROUTE, 3026 1, NLM_F_MULTI) <= 0) { 3027 skb_dst_drop(skb); 3028 rcu_read_unlock_bh(); 3029 goto done; 3030 } 3031 skb_dst_drop(skb); 3032 } 3033 rcu_read_unlock_bh(); 3034 } 3035 3036 done: 3037 cb->args[0] = h; 3038 cb->args[1] = idx; 3039 return skb->len; 3040 } 3041 3042 void ip_rt_multicast_event(struct in_device *in_dev) 3043 { 3044 rt_cache_flush(dev_net(in_dev->dev), 0); 3045 } 3046 3047 #ifdef CONFIG_SYSCTL 3048 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write, 3049 void __user *buffer, 3050 size_t *lenp, loff_t *ppos) 3051 { 3052 if (write) { 3053 int flush_delay; 3054 ctl_table ctl; 3055 struct net *net; 3056 3057 memcpy(&ctl, __ctl, sizeof(ctl)); 3058 ctl.data = &flush_delay; 3059 proc_dointvec(&ctl, write, buffer, lenp, ppos); 3060 3061 net = (struct net *)__ctl->extra1; 3062 rt_cache_flush(net, flush_delay); 3063 return 0; 3064 } 3065 3066 return -EINVAL; 3067 } 3068 3069 static ctl_table ipv4_route_table[] = { 3070 { 3071 .procname = "gc_thresh", 3072 .data = &ipv4_dst_ops.gc_thresh, 3073 .maxlen = sizeof(int), 3074 .mode = 0644, 3075 .proc_handler = proc_dointvec, 3076 }, 3077 { 3078 .procname = "max_size", 3079 .data = &ip_rt_max_size, 3080 .maxlen = sizeof(int), 3081 .mode = 0644, 3082 .proc_handler = proc_dointvec, 3083 }, 3084 { 3085 /* Deprecated. 
Use gc_min_interval_ms */ 3086 3087 .procname = "gc_min_interval", 3088 .data = &ip_rt_gc_min_interval, 3089 .maxlen = sizeof(int), 3090 .mode = 0644, 3091 .proc_handler = proc_dointvec_jiffies, 3092 }, 3093 { 3094 .procname = "gc_min_interval_ms", 3095 .data = &ip_rt_gc_min_interval, 3096 .maxlen = sizeof(int), 3097 .mode = 0644, 3098 .proc_handler = proc_dointvec_ms_jiffies, 3099 }, 3100 { 3101 .procname = "gc_timeout", 3102 .data = &ip_rt_gc_timeout, 3103 .maxlen = sizeof(int), 3104 .mode = 0644, 3105 .proc_handler = proc_dointvec_jiffies, 3106 }, 3107 { 3108 .procname = "gc_interval", 3109 .data = &ip_rt_gc_interval, 3110 .maxlen = sizeof(int), 3111 .mode = 0644, 3112 .proc_handler = proc_dointvec_jiffies, 3113 }, 3114 { 3115 .procname = "redirect_load", 3116 .data = &ip_rt_redirect_load, 3117 .maxlen = sizeof(int), 3118 .mode = 0644, 3119 .proc_handler = proc_dointvec, 3120 }, 3121 { 3122 .procname = "redirect_number", 3123 .data = &ip_rt_redirect_number, 3124 .maxlen = sizeof(int), 3125 .mode = 0644, 3126 .proc_handler = proc_dointvec, 3127 }, 3128 { 3129 .procname = "redirect_silence", 3130 .data = &ip_rt_redirect_silence, 3131 .maxlen = sizeof(int), 3132 .mode = 0644, 3133 .proc_handler = proc_dointvec, 3134 }, 3135 { 3136 .procname = "error_cost", 3137 .data = &ip_rt_error_cost, 3138 .maxlen = sizeof(int), 3139 .mode = 0644, 3140 .proc_handler = proc_dointvec, 3141 }, 3142 { 3143 .procname = "error_burst", 3144 .data = &ip_rt_error_burst, 3145 .maxlen = sizeof(int), 3146 .mode = 0644, 3147 .proc_handler = proc_dointvec, 3148 }, 3149 { 3150 .procname = "gc_elasticity", 3151 .data = &ip_rt_gc_elasticity, 3152 .maxlen = sizeof(int), 3153 .mode = 0644, 3154 .proc_handler = proc_dointvec, 3155 }, 3156 { 3157 .procname = "mtu_expires", 3158 .data = &ip_rt_mtu_expires, 3159 .maxlen = sizeof(int), 3160 .mode = 0644, 3161 .proc_handler = proc_dointvec_jiffies, 3162 }, 3163 { 3164 .procname = "min_pmtu", 3165 .data = &ip_rt_min_pmtu, 3166 .maxlen = sizeof(int), 3167 .mode = 0644, 3168 .proc_handler = proc_dointvec, 3169 }, 3170 { 3171 .procname = "min_adv_mss", 3172 .data = &ip_rt_min_advmss, 3173 .maxlen = sizeof(int), 3174 .mode = 0644, 3175 .proc_handler = proc_dointvec, 3176 }, 3177 { } 3178 }; 3179 3180 static struct ctl_table empty[1]; 3181 3182 static struct ctl_table ipv4_skeleton[] = 3183 { 3184 { .procname = "route", 3185 .mode = 0555, .child = ipv4_route_table}, 3186 { .procname = "neigh", 3187 .mode = 0555, .child = empty}, 3188 { } 3189 }; 3190 3191 static __net_initdata struct ctl_path ipv4_path[] = { 3192 { .procname = "net", }, 3193 { .procname = "ipv4", }, 3194 { }, 3195 }; 3196 3197 static struct ctl_table ipv4_route_flush_table[] = { 3198 { 3199 .procname = "flush", 3200 .maxlen = sizeof(int), 3201 .mode = 0200, 3202 .proc_handler = ipv4_sysctl_rtcache_flush, 3203 }, 3204 { }, 3205 }; 3206 3207 static __net_initdata struct ctl_path ipv4_route_path[] = { 3208 { .procname = "net", }, 3209 { .procname = "ipv4", }, 3210 { .procname = "route", }, 3211 { }, 3212 }; 3213 3214 static __net_init int sysctl_route_net_init(struct net *net) 3215 { 3216 struct ctl_table *tbl; 3217 3218 tbl = ipv4_route_flush_table; 3219 if (!net_eq(net, &init_net)) { 3220 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL); 3221 if (tbl == NULL) 3222 goto err_dup; 3223 } 3224 tbl[0].extra1 = net; 3225 3226 net->ipv4.route_hdr = 3227 register_net_sysctl_table(net, ipv4_route_path, tbl); 3228 if (net->ipv4.route_hdr == NULL) 3229 goto err_reg; 3230 return 0; 3231 3232 err_reg: 3233 if (tbl != 
ipv4_route_flush_table) 3234 kfree(tbl); 3235 err_dup: 3236 return -ENOMEM; 3237 } 3238 3239 static __net_exit void sysctl_route_net_exit(struct net *net) 3240 { 3241 struct ctl_table *tbl; 3242 3243 tbl = net->ipv4.route_hdr->ctl_table_arg; 3244 unregister_net_sysctl_table(net->ipv4.route_hdr); 3245 BUG_ON(tbl == ipv4_route_flush_table); 3246 kfree(tbl); 3247 } 3248 3249 static __net_initdata struct pernet_operations sysctl_route_ops = { 3250 .init = sysctl_route_net_init, 3251 .exit = sysctl_route_net_exit, 3252 }; 3253 #endif 3254 3255 static __net_init int rt_genid_init(struct net *net) 3256 { 3257 get_random_bytes(&net->ipv4.rt_genid, 3258 sizeof(net->ipv4.rt_genid)); 3259 get_random_bytes(&net->ipv4.dev_addr_genid, 3260 sizeof(net->ipv4.dev_addr_genid)); 3261 return 0; 3262 } 3263 3264 static __net_initdata struct pernet_operations rt_genid_ops = { 3265 .init = rt_genid_init, 3266 }; 3267 3268 3269 #ifdef CONFIG_IP_ROUTE_CLASSID 3270 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; 3271 #endif /* CONFIG_IP_ROUTE_CLASSID */ 3272 3273 static __initdata unsigned long rhash_entries; 3274 static int __init set_rhash_entries(char *str) 3275 { 3276 if (!str) 3277 return 0; 3278 rhash_entries = simple_strtoul(str, &str, 0); 3279 return 1; 3280 } 3281 __setup("rhash_entries=", set_rhash_entries); 3282 3283 int __init ip_rt_init(void) 3284 { 3285 int rc = 0; 3286 3287 #ifdef CONFIG_IP_ROUTE_CLASSID 3288 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct)); 3289 if (!ip_rt_acct) 3290 panic("IP: failed to allocate ip_rt_acct\n"); 3291 #endif 3292 3293 ipv4_dst_ops.kmem_cachep = 3294 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0, 3295 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 3296 3297 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep; 3298 3299 if (dst_entries_init(&ipv4_dst_ops) < 0) 3300 panic("IP: failed to allocate ipv4_dst_ops counter\n"); 3301 3302 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0) 3303 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n"); 3304 3305 rt_hash_table = (struct rt_hash_bucket *) 3306 alloc_large_system_hash("IP route cache", 3307 sizeof(struct rt_hash_bucket), 3308 rhash_entries, 3309 (totalram_pages >= 128 * 1024) ? 3310 15 : 17, 3311 0, 3312 &rt_hash_log, 3313 &rt_hash_mask, 3314 rhash_entries ? 0 : 512 * 1024); 3315 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket)); 3316 rt_hash_lock_init(); 3317 3318 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1); 3319 ip_rt_max_size = (rt_hash_mask + 1) * 16; 3320 3321 devinet_init(); 3322 ip_fib_init(); 3323 3324 if (ip_rt_proc_init()) 3325 printk(KERN_ERR "Unable to create route proc files\n"); 3326 #ifdef CONFIG_XFRM 3327 xfrm_init(); 3328 xfrm4_init(ip_rt_max_size); 3329 #endif 3330 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL); 3331 3332 #ifdef CONFIG_SYSCTL 3333 register_pernet_subsys(&sysctl_route_ops); 3334 #endif 3335 register_pernet_subsys(&rt_genid_ops); 3336 return rc; 3337 } 3338 3339 #ifdef CONFIG_SYSCTL 3340 /* 3341 * We really need to sanitize the damn ipv4 init order, then all 3342 * this nonsense will go away. 3343 */ 3344 void __init ip_static_sysctl_init(void) 3345 { 3346 register_sysctl_paths(ipv4_path, ipv4_skeleton); 3347 } 3348 #endif 3349
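/*
 * For illustration only: an editorial sketch of the userspace side of the
 * RTM_GETROUTE handler registered above (inet_rtm_getroute).  This is not
 * part of the kernel; error handling is omitted and the destination address
 * is a documentation example.
 *
 *	#include <arpa/inet.h>
 *	#include <linux/netlink.h>
 *	#include <linux/rtnetlink.h>
 *	#include <string.h>
 *	#include <sys/socket.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		struct {
 *			struct nlmsghdr	nlh;
 *			struct rtmsg	rtm;
 *			struct rtattr	rta;
 *			unsigned char	dst[4];
 *		} req;
 *		struct sockaddr_nl kernel = { .nl_family = AF_NETLINK };
 *		char reply[4096];
 *		int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
 *
 *		memset(&req, 0, sizeof(req));
 *		req.nlh.nlmsg_len   = sizeof(req);
 *		req.nlh.nlmsg_type  = RTM_GETROUTE;
 *		req.nlh.nlmsg_flags = NLM_F_REQUEST;
 *		req.rtm.rtm_family  = AF_INET;
 *		req.rtm.rtm_dst_len = 32;
 *		req.rta.rta_type    = RTA_DST;
 *		req.rta.rta_len     = RTA_LENGTH(sizeof(req.dst));
 *		inet_pton(AF_INET, "192.0.2.1", req.dst);
 *
 *		sendto(fd, &req, sizeof(req), 0,
 *		       (struct sockaddr *)&kernel, sizeof(kernel));
 *		recv(fd, reply, sizeof(reply), 0);
 *		close(fd);
 *		return 0;
 *	}
 *
 * The reply is a single RTM_NEWROUTE message built by rt_fill_info(),
 * equivalent to what "ip route get 192.0.2.1" displays.
 */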