/*
 *	Linux INET6 implementation
 *	Forwarding Information Database
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	$Id: ip6_fib.c,v 1.25 2001/10/31 21:55:55 davem Exp $
 *
 *	This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 */

/*
 * 	Changes:
 * 	Yuji SEKIYA @USAGI:	Support default route on router node;
 * 				remove ip6_null_entry from the top of
 * 				routing table.
 * 	Ville Nuorvala:		Fixed routing subtrees.
 */
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/init.h>
#include <linux/list.h>

#ifdef 	CONFIG_PROC_FS
#include <linux/proc_fs.h>
#endif

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/addrconf.h>

#include <net/ip6_fib.h>
#include <net/ip6_route.h>

/*
 * Debug level for this file.  RT6_TRACE() only expands to a printk()
 * at level 3 or above; several sanity checks below are compiled in
 * at level 2 or above.
 */
#define RT6_DEBUG 2

#if RT6_DEBUG >= 3
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#else
#define RT6_TRACE(x...) do { ; } while (0)
#endif

struct rt6_statistics	rt6_stats;

/* Slab cache backing node_alloc()/node_free(). */
static struct kmem_cache * fib6_node_kmem __read_mostly;

/*
 * States of the tree walker (see fib6_walk_continue()).  The order
 * matters: the walker compares states with < and >=.
 */
enum fib_walk_state_t
{
#ifdef CONFIG_IPV6_SUBTREES
	FWS_S,		/* descend into the node's subtree, if any */
#endif
	FWS_L,		/* descend into the left child */
	FWS_R,		/* descend into the right child */
	FWS_C,		/* run the callback on the node's routes */
	FWS_U		/* go back up to the parent */
};

/*
 * Walker used by fib6_clean_tree(): the embedded walker "w" drives the
 * traversal, "func"/"arg" are the per-route callback and its argument.
 * fib6_clean_node() recovers this structure from "w" via container_of().
 */
struct fib6_cleaner_t
{
	struct fib6_walker_t w;
	int (*func)(struct rt6_info *, void *arg);
	void *arg;
};

/* Protects the list of active walkers (fib6_walker_list). */
static DEFINE_RWLOCK(fib6_walker_lock);

/* Initial walker state for a freshly entered node. */
#ifdef CONFIG_IPV6_SUBTREES
#define FWS_INIT FWS_S
#else
#define FWS_INIT FWS_L
#endif

static void fib6_prune_clones(struct fib6_node *fn, struct rt6_info *rt);
static struct rt6_info * fib6_find_prefix(struct fib6_node *fn);
static struct fib6_node * fib6_repair_tree(struct fib6_node *fn);
static int fib6_walk(struct fib6_walker_t *w);
static int fib6_walk_continue(struct fib6_walker_t *w);

/*
 *	A routing update causes an increase of the serial number on the
 *	affected subtree. This allows for cached routes to be asynchronously
 *	tested when modifications are made to the destination cache as a
 *	result of redirects, path MTU changes, etc.
92 */ 93 94 static __u32 rt_sernum; 95 96 static void fib6_gc_timer_cb(unsigned long arg); 97 98 static DEFINE_TIMER(ip6_fib_timer, fib6_gc_timer_cb, 0, 99 (unsigned long)&init_net); 100 101 static struct fib6_walker_t fib6_walker_list = { 102 .prev = &fib6_walker_list, 103 .next = &fib6_walker_list, 104 }; 105 106 #define FOR_WALKERS(w) for ((w)=fib6_walker_list.next; (w) != &fib6_walker_list; (w)=(w)->next) 107 108 static inline void fib6_walker_link(struct fib6_walker_t *w) 109 { 110 write_lock_bh(&fib6_walker_lock); 111 w->next = fib6_walker_list.next; 112 w->prev = &fib6_walker_list; 113 w->next->prev = w; 114 w->prev->next = w; 115 write_unlock_bh(&fib6_walker_lock); 116 } 117 118 static inline void fib6_walker_unlink(struct fib6_walker_t *w) 119 { 120 write_lock_bh(&fib6_walker_lock); 121 w->next->prev = w->prev; 122 w->prev->next = w->next; 123 w->prev = w->next = w; 124 write_unlock_bh(&fib6_walker_lock); 125 } 126 static __inline__ u32 fib6_new_sernum(void) 127 { 128 u32 n = ++rt_sernum; 129 if ((__s32)n <= 0) 130 rt_sernum = n = 1; 131 return n; 132 } 133 134 /* 135 * Auxiliary address test functions for the radix tree. 
136 * 137 * These assume a 32bit processor (although it will work on 138 * 64bit processors) 139 */ 140 141 /* 142 * test bit 143 */ 144 145 static __inline__ __be32 addr_bit_set(void *token, int fn_bit) 146 { 147 __be32 *addr = token; 148 149 return htonl(1 << ((~fn_bit)&0x1F)) & addr[fn_bit>>5]; 150 } 151 152 static __inline__ struct fib6_node * node_alloc(void) 153 { 154 struct fib6_node *fn; 155 156 fn = kmem_cache_zalloc(fib6_node_kmem, GFP_ATOMIC); 157 158 return fn; 159 } 160 161 static __inline__ void node_free(struct fib6_node * fn) 162 { 163 kmem_cache_free(fib6_node_kmem, fn); 164 } 165 166 static __inline__ void rt6_release(struct rt6_info *rt) 167 { 168 if (atomic_dec_and_test(&rt->rt6i_ref)) 169 dst_free(&rt->u.dst); 170 } 171 172 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 173 #define FIB_TABLE_HASHSZ 256 174 #else 175 #define FIB_TABLE_HASHSZ 1 176 #endif 177 178 static void fib6_link_table(struct net *net, struct fib6_table *tb) 179 { 180 unsigned int h; 181 182 /* 183 * Initialize table lock at a single place to give lockdep a key, 184 * tables aren't visible prior to being linked to the list. 185 */ 186 rwlock_init(&tb->tb6_lock); 187 188 h = tb->tb6_id & (FIB_TABLE_HASHSZ - 1); 189 190 /* 191 * No protection necessary, this is the only list mutatation 192 * operation, tables never disappear once they exist. 
193 */ 194 hlist_add_head_rcu(&tb->tb6_hlist, &net->ipv6.fib_table_hash[h]); 195 } 196 197 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 198 199 static struct fib6_table *fib6_alloc_table(u32 id) 200 { 201 struct fib6_table *table; 202 203 table = kzalloc(sizeof(*table), GFP_ATOMIC); 204 if (table != NULL) { 205 table->tb6_id = id; 206 table->tb6_root.leaf = &ip6_null_entry; 207 table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; 208 } 209 210 return table; 211 } 212 213 struct fib6_table *fib6_new_table(struct net *net, u32 id) 214 { 215 struct fib6_table *tb; 216 217 if (id == 0) 218 id = RT6_TABLE_MAIN; 219 tb = fib6_get_table(net, id); 220 if (tb) 221 return tb; 222 223 tb = fib6_alloc_table(id); 224 if (tb != NULL) 225 fib6_link_table(net, tb); 226 227 return tb; 228 } 229 230 struct fib6_table *fib6_get_table(struct net *net, u32 id) 231 { 232 struct fib6_table *tb; 233 struct hlist_head *head; 234 struct hlist_node *node; 235 unsigned int h; 236 237 if (id == 0) 238 id = RT6_TABLE_MAIN; 239 h = id & (FIB_TABLE_HASHSZ - 1); 240 rcu_read_lock(); 241 head = &net->ipv6.fib_table_hash[h]; 242 hlist_for_each_entry_rcu(tb, node, head, tb6_hlist) { 243 if (tb->tb6_id == id) { 244 rcu_read_unlock(); 245 return tb; 246 } 247 } 248 rcu_read_unlock(); 249 250 return NULL; 251 } 252 253 static void fib6_tables_init(struct net *net) 254 { 255 fib6_link_table(net, net->ipv6.fib6_main_tbl); 256 fib6_link_table(net, net->ipv6.fib6_local_tbl); 257 } 258 #else 259 260 struct fib6_table *fib6_new_table(struct net *net, u32 id) 261 { 262 return fib6_get_table(net, id); 263 } 264 265 struct fib6_table *fib6_get_table(struct net *net, u32 id) 266 { 267 return net->ipv6.fib6_main_tbl; 268 } 269 270 struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi *fl, 271 int flags, pol_lookup_t lookup) 272 { 273 return (struct dst_entry *) lookup(net->ipv6.fib6_main_tbl, fl, flags); 274 } 275 276 static void fib6_tables_init(struct net *net) 277 { 278 fib6_link_table(net, 
net->ipv6.fib6_main_tbl); 279 } 280 281 #endif 282 283 static int fib6_dump_node(struct fib6_walker_t *w) 284 { 285 int res; 286 struct rt6_info *rt; 287 288 for (rt = w->leaf; rt; rt = rt->u.dst.rt6_next) { 289 res = rt6_dump_route(rt, w->args); 290 if (res < 0) { 291 /* Frame is full, suspend walking */ 292 w->leaf = rt; 293 return 1; 294 } 295 BUG_TRAP(res!=0); 296 } 297 w->leaf = NULL; 298 return 0; 299 } 300 301 static void fib6_dump_end(struct netlink_callback *cb) 302 { 303 struct fib6_walker_t *w = (void*)cb->args[2]; 304 305 if (w) { 306 cb->args[2] = 0; 307 kfree(w); 308 } 309 cb->done = (void*)cb->args[3]; 310 cb->args[1] = 3; 311 } 312 313 static int fib6_dump_done(struct netlink_callback *cb) 314 { 315 fib6_dump_end(cb); 316 return cb->done ? cb->done(cb) : 0; 317 } 318 319 static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, 320 struct netlink_callback *cb) 321 { 322 struct fib6_walker_t *w; 323 int res; 324 325 w = (void *)cb->args[2]; 326 w->root = &table->tb6_root; 327 328 if (cb->args[4] == 0) { 329 read_lock_bh(&table->tb6_lock); 330 res = fib6_walk(w); 331 read_unlock_bh(&table->tb6_lock); 332 if (res > 0) 333 cb->args[4] = 1; 334 } else { 335 read_lock_bh(&table->tb6_lock); 336 res = fib6_walk_continue(w); 337 read_unlock_bh(&table->tb6_lock); 338 if (res != 0) { 339 if (res < 0) 340 fib6_walker_unlink(w); 341 goto end; 342 } 343 fib6_walker_unlink(w); 344 cb->args[4] = 0; 345 } 346 end: 347 return res; 348 } 349 350 static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) 351 { 352 struct net *net = skb->sk->sk_net; 353 unsigned int h, s_h; 354 unsigned int e = 0, s_e; 355 struct rt6_rtnl_dump_arg arg; 356 struct fib6_walker_t *w; 357 struct fib6_table *tb; 358 struct hlist_node *node; 359 struct hlist_head *head; 360 int res = 0; 361 362 s_h = cb->args[0]; 363 s_e = cb->args[1]; 364 365 w = (void *)cb->args[2]; 366 if (w == NULL) { 367 /* New dump: 368 * 369 * 1. hook callback destructor. 
370 */ 371 cb->args[3] = (long)cb->done; 372 cb->done = fib6_dump_done; 373 374 /* 375 * 2. allocate and initialize walker. 376 */ 377 w = kzalloc(sizeof(*w), GFP_ATOMIC); 378 if (w == NULL) 379 return -ENOMEM; 380 w->func = fib6_dump_node; 381 cb->args[2] = (long)w; 382 } 383 384 arg.skb = skb; 385 arg.cb = cb; 386 w->args = &arg; 387 388 for (h = s_h; h < FIB_TABLE_HASHSZ; h++, s_e = 0) { 389 e = 0; 390 head = &net->ipv6.fib_table_hash[h]; 391 hlist_for_each_entry(tb, node, head, tb6_hlist) { 392 if (e < s_e) 393 goto next; 394 res = fib6_dump_table(tb, skb, cb); 395 if (res != 0) 396 goto out; 397 next: 398 e++; 399 } 400 } 401 out: 402 cb->args[1] = e; 403 cb->args[0] = h; 404 405 res = res < 0 ? res : skb->len; 406 if (res <= 0) 407 fib6_dump_end(cb); 408 return res; 409 } 410 411 /* 412 * Routing Table 413 * 414 * return the appropriate node for a routing tree "add" operation 415 * by either creating and inserting or by returning an existing 416 * node. 417 */ 418 419 static struct fib6_node * fib6_add_1(struct fib6_node *root, void *addr, 420 int addrlen, int plen, 421 int offset) 422 { 423 struct fib6_node *fn, *in, *ln; 424 struct fib6_node *pn = NULL; 425 struct rt6key *key; 426 int bit; 427 __be32 dir = 0; 428 __u32 sernum = fib6_new_sernum(); 429 430 RT6_TRACE("fib6_add_1\n"); 431 432 /* insert node in tree */ 433 434 fn = root; 435 436 do { 437 key = (struct rt6key *)((u8 *)fn->leaf + offset); 438 439 /* 440 * Prefix match 441 */ 442 if (plen < fn->fn_bit || 443 !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) 444 goto insert_above; 445 446 /* 447 * Exact match ? 448 */ 449 450 if (plen == fn->fn_bit) { 451 /* clean up an intermediate node */ 452 if ((fn->fn_flags & RTN_RTINFO) == 0) { 453 rt6_release(fn->leaf); 454 fn->leaf = NULL; 455 } 456 457 fn->fn_sernum = sernum; 458 459 return fn; 460 } 461 462 /* 463 * We have more bits to go 464 */ 465 466 /* Try to walk down on tree. 
*/ 467 fn->fn_sernum = sernum; 468 dir = addr_bit_set(addr, fn->fn_bit); 469 pn = fn; 470 fn = dir ? fn->right: fn->left; 471 } while (fn); 472 473 /* 474 * We walked to the bottom of tree. 475 * Create new leaf node without children. 476 */ 477 478 ln = node_alloc(); 479 480 if (ln == NULL) 481 return NULL; 482 ln->fn_bit = plen; 483 484 ln->parent = pn; 485 ln->fn_sernum = sernum; 486 487 if (dir) 488 pn->right = ln; 489 else 490 pn->left = ln; 491 492 return ln; 493 494 495 insert_above: 496 /* 497 * split since we don't have a common prefix anymore or 498 * we have a less significant route. 499 * we've to insert an intermediate node on the list 500 * this new node will point to the one we need to create 501 * and the current 502 */ 503 504 pn = fn->parent; 505 506 /* find 1st bit in difference between the 2 addrs. 507 508 See comment in __ipv6_addr_diff: bit may be an invalid value, 509 but if it is >= plen, the value is ignored in any case. 510 */ 511 512 bit = __ipv6_addr_diff(addr, &key->addr, addrlen); 513 514 /* 515 * (intermediate)[in] 516 * / \ 517 * (new leaf node)[ln] (old node)[fn] 518 */ 519 if (plen > bit) { 520 in = node_alloc(); 521 ln = node_alloc(); 522 523 if (in == NULL || ln == NULL) { 524 if (in) 525 node_free(in); 526 if (ln) 527 node_free(ln); 528 return NULL; 529 } 530 531 /* 532 * new intermediate node. 
533 * RTN_RTINFO will 534 * be off since that an address that chooses one of 535 * the branches would not match less specific routes 536 * in the other branch 537 */ 538 539 in->fn_bit = bit; 540 541 in->parent = pn; 542 in->leaf = fn->leaf; 543 atomic_inc(&in->leaf->rt6i_ref); 544 545 in->fn_sernum = sernum; 546 547 /* update parent pointer */ 548 if (dir) 549 pn->right = in; 550 else 551 pn->left = in; 552 553 ln->fn_bit = plen; 554 555 ln->parent = in; 556 fn->parent = in; 557 558 ln->fn_sernum = sernum; 559 560 if (addr_bit_set(addr, bit)) { 561 in->right = ln; 562 in->left = fn; 563 } else { 564 in->left = ln; 565 in->right = fn; 566 } 567 } else { /* plen <= bit */ 568 569 /* 570 * (new leaf node)[ln] 571 * / \ 572 * (old node)[fn] NULL 573 */ 574 575 ln = node_alloc(); 576 577 if (ln == NULL) 578 return NULL; 579 580 ln->fn_bit = plen; 581 582 ln->parent = pn; 583 584 ln->fn_sernum = sernum; 585 586 if (dir) 587 pn->right = ln; 588 else 589 pn->left = ln; 590 591 if (addr_bit_set(&key->addr, plen)) 592 ln->right = fn; 593 else 594 ln->left = fn; 595 596 fn->parent = ln; 597 } 598 return ln; 599 } 600 601 /* 602 * Insert routing information in a node. 
603 */ 604 605 static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, 606 struct nl_info *info) 607 { 608 struct rt6_info *iter = NULL; 609 struct rt6_info **ins; 610 611 ins = &fn->leaf; 612 613 for (iter = fn->leaf; iter; iter=iter->u.dst.rt6_next) { 614 /* 615 * Search for duplicates 616 */ 617 618 if (iter->rt6i_metric == rt->rt6i_metric) { 619 /* 620 * Same priority level 621 */ 622 623 if (iter->rt6i_dev == rt->rt6i_dev && 624 iter->rt6i_idev == rt->rt6i_idev && 625 ipv6_addr_equal(&iter->rt6i_gateway, 626 &rt->rt6i_gateway)) { 627 if (!(iter->rt6i_flags&RTF_EXPIRES)) 628 return -EEXIST; 629 iter->rt6i_expires = rt->rt6i_expires; 630 if (!(rt->rt6i_flags&RTF_EXPIRES)) { 631 iter->rt6i_flags &= ~RTF_EXPIRES; 632 iter->rt6i_expires = 0; 633 } 634 return -EEXIST; 635 } 636 } 637 638 if (iter->rt6i_metric > rt->rt6i_metric) 639 break; 640 641 ins = &iter->u.dst.rt6_next; 642 } 643 644 /* Reset round-robin state, if necessary */ 645 if (ins == &fn->leaf) 646 fn->rr_ptr = NULL; 647 648 /* 649 * insert node 650 */ 651 652 rt->u.dst.rt6_next = iter; 653 *ins = rt; 654 rt->rt6i_node = fn; 655 atomic_inc(&rt->rt6i_ref); 656 inet6_rt_notify(RTM_NEWROUTE, rt, info); 657 rt6_stats.fib_rt_entries++; 658 659 if ((fn->fn_flags & RTN_RTINFO) == 0) { 660 rt6_stats.fib_route_nodes++; 661 fn->fn_flags |= RTN_RTINFO; 662 } 663 664 return 0; 665 } 666 667 static __inline__ void fib6_start_gc(struct rt6_info *rt) 668 { 669 if (ip6_fib_timer.expires == 0 && 670 (rt->rt6i_flags & (RTF_EXPIRES|RTF_CACHE))) 671 mod_timer(&ip6_fib_timer, jiffies + 672 init_net.ipv6.sysctl.ip6_rt_gc_interval); 673 } 674 675 void fib6_force_start_gc(void) 676 { 677 if (ip6_fib_timer.expires == 0) 678 mod_timer(&ip6_fib_timer, jiffies + 679 init_net.ipv6.sysctl.ip6_rt_gc_interval); 680 } 681 682 /* 683 * Add routing information to the routing tree. 
684 * <destination addr>/<source addr> 685 * with source addr info in sub-trees 686 */ 687 688 int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info) 689 { 690 struct fib6_node *fn, *pn = NULL; 691 int err = -ENOMEM; 692 693 fn = fib6_add_1(root, &rt->rt6i_dst.addr, sizeof(struct in6_addr), 694 rt->rt6i_dst.plen, offsetof(struct rt6_info, rt6i_dst)); 695 696 if (fn == NULL) 697 goto out; 698 699 pn = fn; 700 701 #ifdef CONFIG_IPV6_SUBTREES 702 if (rt->rt6i_src.plen) { 703 struct fib6_node *sn; 704 705 if (fn->subtree == NULL) { 706 struct fib6_node *sfn; 707 708 /* 709 * Create subtree. 710 * 711 * fn[main tree] 712 * | 713 * sfn[subtree root] 714 * \ 715 * sn[new leaf node] 716 */ 717 718 /* Create subtree root node */ 719 sfn = node_alloc(); 720 if (sfn == NULL) 721 goto st_failure; 722 723 sfn->leaf = &ip6_null_entry; 724 atomic_inc(&ip6_null_entry.rt6i_ref); 725 sfn->fn_flags = RTN_ROOT; 726 sfn->fn_sernum = fib6_new_sernum(); 727 728 /* Now add the first leaf node to new subtree */ 729 730 sn = fib6_add_1(sfn, &rt->rt6i_src.addr, 731 sizeof(struct in6_addr), rt->rt6i_src.plen, 732 offsetof(struct rt6_info, rt6i_src)); 733 734 if (sn == NULL) { 735 /* If it is failed, discard just allocated 736 root, and then (in st_failure) stale node 737 in main tree. 
738 */ 739 node_free(sfn); 740 goto st_failure; 741 } 742 743 /* Now link new subtree to main tree */ 744 sfn->parent = fn; 745 fn->subtree = sfn; 746 } else { 747 sn = fib6_add_1(fn->subtree, &rt->rt6i_src.addr, 748 sizeof(struct in6_addr), rt->rt6i_src.plen, 749 offsetof(struct rt6_info, rt6i_src)); 750 751 if (sn == NULL) 752 goto st_failure; 753 } 754 755 if (fn->leaf == NULL) { 756 fn->leaf = rt; 757 atomic_inc(&rt->rt6i_ref); 758 } 759 fn = sn; 760 } 761 #endif 762 763 err = fib6_add_rt2node(fn, rt, info); 764 765 if (err == 0) { 766 fib6_start_gc(rt); 767 if (!(rt->rt6i_flags&RTF_CACHE)) 768 fib6_prune_clones(pn, rt); 769 } 770 771 out: 772 if (err) { 773 #ifdef CONFIG_IPV6_SUBTREES 774 /* 775 * If fib6_add_1 has cleared the old leaf pointer in the 776 * super-tree leaf node we have to find a new one for it. 777 */ 778 if (pn != fn && !pn->leaf && !(pn->fn_flags & RTN_RTINFO)) { 779 pn->leaf = fib6_find_prefix(pn); 780 #if RT6_DEBUG >= 2 781 if (!pn->leaf) { 782 BUG_TRAP(pn->leaf != NULL); 783 pn->leaf = &ip6_null_entry; 784 } 785 #endif 786 atomic_inc(&pn->leaf->rt6i_ref); 787 } 788 #endif 789 dst_free(&rt->u.dst); 790 } 791 return err; 792 793 #ifdef CONFIG_IPV6_SUBTREES 794 /* Subtree creation failed, probably main tree node 795 is orphan. If it is, shoot it. 
796 */ 797 st_failure: 798 if (fn && !(fn->fn_flags & (RTN_RTINFO|RTN_ROOT))) 799 fib6_repair_tree(fn); 800 dst_free(&rt->u.dst); 801 return err; 802 #endif 803 } 804 805 /* 806 * Routing tree lookup 807 * 808 */ 809 810 struct lookup_args { 811 int offset; /* key offset on rt6_info */ 812 struct in6_addr *addr; /* search key */ 813 }; 814 815 static struct fib6_node * fib6_lookup_1(struct fib6_node *root, 816 struct lookup_args *args) 817 { 818 struct fib6_node *fn; 819 __be32 dir; 820 821 if (unlikely(args->offset == 0)) 822 return NULL; 823 824 /* 825 * Descend on a tree 826 */ 827 828 fn = root; 829 830 for (;;) { 831 struct fib6_node *next; 832 833 dir = addr_bit_set(args->addr, fn->fn_bit); 834 835 next = dir ? fn->right : fn->left; 836 837 if (next) { 838 fn = next; 839 continue; 840 } 841 842 break; 843 } 844 845 while(fn) { 846 if (FIB6_SUBTREE(fn) || fn->fn_flags & RTN_RTINFO) { 847 struct rt6key *key; 848 849 key = (struct rt6key *) ((u8 *) fn->leaf + 850 args->offset); 851 852 if (ipv6_prefix_equal(&key->addr, args->addr, key->plen)) { 853 #ifdef CONFIG_IPV6_SUBTREES 854 if (fn->subtree) 855 fn = fib6_lookup_1(fn->subtree, args + 1); 856 #endif 857 if (!fn || fn->fn_flags & RTN_RTINFO) 858 return fn; 859 } 860 } 861 862 if (fn->fn_flags & RTN_ROOT) 863 break; 864 865 fn = fn->parent; 866 } 867 868 return NULL; 869 } 870 871 struct fib6_node * fib6_lookup(struct fib6_node *root, struct in6_addr *daddr, 872 struct in6_addr *saddr) 873 { 874 struct fib6_node *fn; 875 struct lookup_args args[] = { 876 { 877 .offset = offsetof(struct rt6_info, rt6i_dst), 878 .addr = daddr, 879 }, 880 #ifdef CONFIG_IPV6_SUBTREES 881 { 882 .offset = offsetof(struct rt6_info, rt6i_src), 883 .addr = saddr, 884 }, 885 #endif 886 { 887 .offset = 0, /* sentinel */ 888 } 889 }; 890 891 fn = fib6_lookup_1(root, daddr ? 
args : args + 1); 892 893 if (fn == NULL || fn->fn_flags & RTN_TL_ROOT) 894 fn = root; 895 896 return fn; 897 } 898 899 /* 900 * Get node with specified destination prefix (and source prefix, 901 * if subtrees are used) 902 */ 903 904 905 static struct fib6_node * fib6_locate_1(struct fib6_node *root, 906 struct in6_addr *addr, 907 int plen, int offset) 908 { 909 struct fib6_node *fn; 910 911 for (fn = root; fn ; ) { 912 struct rt6key *key = (struct rt6key *)((u8 *)fn->leaf + offset); 913 914 /* 915 * Prefix match 916 */ 917 if (plen < fn->fn_bit || 918 !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) 919 return NULL; 920 921 if (plen == fn->fn_bit) 922 return fn; 923 924 /* 925 * We have more bits to go 926 */ 927 if (addr_bit_set(addr, fn->fn_bit)) 928 fn = fn->right; 929 else 930 fn = fn->left; 931 } 932 return NULL; 933 } 934 935 struct fib6_node * fib6_locate(struct fib6_node *root, 936 struct in6_addr *daddr, int dst_len, 937 struct in6_addr *saddr, int src_len) 938 { 939 struct fib6_node *fn; 940 941 fn = fib6_locate_1(root, daddr, dst_len, 942 offsetof(struct rt6_info, rt6i_dst)); 943 944 #ifdef CONFIG_IPV6_SUBTREES 945 if (src_len) { 946 BUG_TRAP(saddr!=NULL); 947 if (fn && fn->subtree) 948 fn = fib6_locate_1(fn->subtree, saddr, src_len, 949 offsetof(struct rt6_info, rt6i_src)); 950 } 951 #endif 952 953 if (fn && fn->fn_flags&RTN_RTINFO) 954 return fn; 955 956 return NULL; 957 } 958 959 960 /* 961 * Deletion 962 * 963 */ 964 965 static struct rt6_info * fib6_find_prefix(struct fib6_node *fn) 966 { 967 if (fn->fn_flags&RTN_ROOT) 968 return &ip6_null_entry; 969 970 while(fn) { 971 if(fn->left) 972 return fn->left->leaf; 973 974 if(fn->right) 975 return fn->right->leaf; 976 977 fn = FIB6_SUBTREE(fn); 978 } 979 return NULL; 980 } 981 982 /* 983 * Called to trim the tree of intermediate nodes when possible. "fn" 984 * is the node we want to try and remove. 
985 */ 986 987 static struct fib6_node * fib6_repair_tree(struct fib6_node *fn) 988 { 989 int children; 990 int nstate; 991 struct fib6_node *child, *pn; 992 struct fib6_walker_t *w; 993 int iter = 0; 994 995 for (;;) { 996 RT6_TRACE("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter); 997 iter++; 998 999 BUG_TRAP(!(fn->fn_flags&RTN_RTINFO)); 1000 BUG_TRAP(!(fn->fn_flags&RTN_TL_ROOT)); 1001 BUG_TRAP(fn->leaf==NULL); 1002 1003 children = 0; 1004 child = NULL; 1005 if (fn->right) child = fn->right, children |= 1; 1006 if (fn->left) child = fn->left, children |= 2; 1007 1008 if (children == 3 || FIB6_SUBTREE(fn) 1009 #ifdef CONFIG_IPV6_SUBTREES 1010 /* Subtree root (i.e. fn) may have one child */ 1011 || (children && fn->fn_flags&RTN_ROOT) 1012 #endif 1013 ) { 1014 fn->leaf = fib6_find_prefix(fn); 1015 #if RT6_DEBUG >= 2 1016 if (fn->leaf==NULL) { 1017 BUG_TRAP(fn->leaf); 1018 fn->leaf = &ip6_null_entry; 1019 } 1020 #endif 1021 atomic_inc(&fn->leaf->rt6i_ref); 1022 return fn->parent; 1023 } 1024 1025 pn = fn->parent; 1026 #ifdef CONFIG_IPV6_SUBTREES 1027 if (FIB6_SUBTREE(pn) == fn) { 1028 BUG_TRAP(fn->fn_flags&RTN_ROOT); 1029 FIB6_SUBTREE(pn) = NULL; 1030 nstate = FWS_L; 1031 } else { 1032 BUG_TRAP(!(fn->fn_flags&RTN_ROOT)); 1033 #endif 1034 if (pn->right == fn) pn->right = child; 1035 else if (pn->left == fn) pn->left = child; 1036 #if RT6_DEBUG >= 2 1037 else BUG_TRAP(0); 1038 #endif 1039 if (child) 1040 child->parent = pn; 1041 nstate = FWS_R; 1042 #ifdef CONFIG_IPV6_SUBTREES 1043 } 1044 #endif 1045 1046 read_lock(&fib6_walker_lock); 1047 FOR_WALKERS(w) { 1048 if (child == NULL) { 1049 if (w->root == fn) { 1050 w->root = w->node = NULL; 1051 RT6_TRACE("W %p adjusted by delroot 1\n", w); 1052 } else if (w->node == fn) { 1053 RT6_TRACE("W %p adjusted by delnode 1, s=%d/%d\n", w, w->state, nstate); 1054 w->node = pn; 1055 w->state = nstate; 1056 } 1057 } else { 1058 if (w->root == fn) { 1059 w->root = child; 1060 RT6_TRACE("W %p adjusted by delroot 2\n", w); 1061 } 
1062 if (w->node == fn) { 1063 w->node = child; 1064 if (children&2) { 1065 RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state); 1066 w->state = w->state>=FWS_R ? FWS_U : FWS_INIT; 1067 } else { 1068 RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state); 1069 w->state = w->state>=FWS_C ? FWS_U : FWS_INIT; 1070 } 1071 } 1072 } 1073 } 1074 read_unlock(&fib6_walker_lock); 1075 1076 node_free(fn); 1077 if (pn->fn_flags&RTN_RTINFO || FIB6_SUBTREE(pn)) 1078 return pn; 1079 1080 rt6_release(pn->leaf); 1081 pn->leaf = NULL; 1082 fn = pn; 1083 } 1084 } 1085 1086 static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, 1087 struct nl_info *info) 1088 { 1089 struct fib6_walker_t *w; 1090 struct rt6_info *rt = *rtp; 1091 1092 RT6_TRACE("fib6_del_route\n"); 1093 1094 /* Unlink it */ 1095 *rtp = rt->u.dst.rt6_next; 1096 rt->rt6i_node = NULL; 1097 rt6_stats.fib_rt_entries--; 1098 rt6_stats.fib_discarded_routes++; 1099 1100 /* Reset round-robin state, if necessary */ 1101 if (fn->rr_ptr == rt) 1102 fn->rr_ptr = NULL; 1103 1104 /* Adjust walkers */ 1105 read_lock(&fib6_walker_lock); 1106 FOR_WALKERS(w) { 1107 if (w->state == FWS_C && w->leaf == rt) { 1108 RT6_TRACE("walker %p adjusted by delroute\n", w); 1109 w->leaf = rt->u.dst.rt6_next; 1110 if (w->leaf == NULL) 1111 w->state = FWS_U; 1112 } 1113 } 1114 read_unlock(&fib6_walker_lock); 1115 1116 rt->u.dst.rt6_next = NULL; 1117 1118 /* If it was last route, expunge its radix tree node */ 1119 if (fn->leaf == NULL) { 1120 fn->fn_flags &= ~RTN_RTINFO; 1121 rt6_stats.fib_route_nodes--; 1122 fn = fib6_repair_tree(fn); 1123 } 1124 1125 if (atomic_read(&rt->rt6i_ref) != 1) { 1126 /* This route is used as dummy address holder in some split 1127 * nodes. It is not leaked, but it still holds other resources, 1128 * which must be released in time. So, scan ascendant nodes 1129 * and replace dummy references to this route with references 1130 * to still alive ones. 
1131 */ 1132 while (fn) { 1133 if (!(fn->fn_flags&RTN_RTINFO) && fn->leaf == rt) { 1134 fn->leaf = fib6_find_prefix(fn); 1135 atomic_inc(&fn->leaf->rt6i_ref); 1136 rt6_release(rt); 1137 } 1138 fn = fn->parent; 1139 } 1140 /* No more references are possible at this point. */ 1141 BUG_ON(atomic_read(&rt->rt6i_ref) != 1); 1142 } 1143 1144 inet6_rt_notify(RTM_DELROUTE, rt, info); 1145 rt6_release(rt); 1146 } 1147 1148 int fib6_del(struct rt6_info *rt, struct nl_info *info) 1149 { 1150 struct fib6_node *fn = rt->rt6i_node; 1151 struct rt6_info **rtp; 1152 1153 #if RT6_DEBUG >= 2 1154 if (rt->u.dst.obsolete>0) { 1155 BUG_TRAP(fn==NULL); 1156 return -ENOENT; 1157 } 1158 #endif 1159 if (fn == NULL || rt == &ip6_null_entry) 1160 return -ENOENT; 1161 1162 BUG_TRAP(fn->fn_flags&RTN_RTINFO); 1163 1164 if (!(rt->rt6i_flags&RTF_CACHE)) { 1165 struct fib6_node *pn = fn; 1166 #ifdef CONFIG_IPV6_SUBTREES 1167 /* clones of this route might be in another subtree */ 1168 if (rt->rt6i_src.plen) { 1169 while (!(pn->fn_flags&RTN_ROOT)) 1170 pn = pn->parent; 1171 pn = pn->parent; 1172 } 1173 #endif 1174 fib6_prune_clones(pn, rt); 1175 } 1176 1177 /* 1178 * Walk the leaf entries looking for ourself 1179 */ 1180 1181 for (rtp = &fn->leaf; *rtp; rtp = &(*rtp)->u.dst.rt6_next) { 1182 if (*rtp == rt) { 1183 fib6_del_route(fn, rtp, info); 1184 return 0; 1185 } 1186 } 1187 return -ENOENT; 1188 } 1189 1190 /* 1191 * Tree traversal function. 1192 * 1193 * Certainly, it is not interrupt safe. 1194 * However, it is internally reenterable wrt itself and fib6_add/fib6_del. 1195 * It means, that we can modify tree during walking 1196 * and use this function for garbage collection, clone pruning, 1197 * cleaning tree when a device goes down etc. etc. 1198 * 1199 * It guarantees that every node will be traversed, 1200 * and that it will be traversed only once. 1201 * 1202 * Callback function w->func may return: 1203 * 0 -> continue walking. 
 *	positive value -> walking is suspended (used by tree dumps,
 *	and probably by gc, if it will be split to several slices)
 *	negative value -> terminate walking.
 *
 *	The function itself returns:
 *	0   -> walk is complete.
 *	>0  -> walk is incomplete (i.e. suspended)
 *	<0  -> walk is terminated by an error.
 */

static int fib6_walk_continue(struct fib6_walker_t *w)
{
	struct fib6_node *fn, *pn;

	for (;;) {
		fn = w->node;
		if (fn == NULL)
			return 0;

		/*
		 * Pruned walk: a node below the root that carries routes
		 * is not descended into, go straight to its callback.
		 */
		if (w->prune && fn != w->root &&
		    fn->fn_flags&RTN_RTINFO && w->state < FWS_C) {
			w->state = FWS_C;
			w->leaf = fn->leaf;
		}
		/*
		 * The cases below deliberately fall through: when a state
		 * has nothing to do, the next state is tried on the same
		 * node right away.
		 */
		switch (w->state) {
#ifdef CONFIG_IPV6_SUBTREES
		case FWS_S:
			/* enter the subtree; state stays FWS_S for its root */
			if (FIB6_SUBTREE(fn)) {
				w->node = FIB6_SUBTREE(fn);
				continue;
			}
			w->state = FWS_L;
			/* fall through */
#endif
		case FWS_L:
			if (fn->left) {
				w->node = fn->left;
				w->state = FWS_INIT;
				continue;
			}
			w->state = FWS_R;
			/* fall through */
		case FWS_R:
			if (fn->right) {
				w->node = fn->right;
				w->state = FWS_INIT;
				continue;
			}
			w->state = FWS_C;
			w->leaf = fn->leaf;
			/* fall through */
		case FWS_C:
			/*
			 * Call back as long as routes are left on the node;
			 * the callback is expected to advance w->leaf.
			 */
			if (w->leaf && fn->fn_flags&RTN_RTINFO) {
				int err = w->func(w);
				if (err)
					return err;
				continue;
			}
			w->state = FWS_U;
			/* fall through */
		case FWS_U:
			if (fn == w->root)
				return 0;
			pn = fn->parent;
			w->node = pn;
			/* next state depends on which link we came up through */
#ifdef CONFIG_IPV6_SUBTREES
			if (FIB6_SUBTREE(pn) == fn) {
				BUG_TRAP(fn->fn_flags&RTN_ROOT);
				w->state = FWS_L;
				continue;
			}
#endif
			if (pn->left == fn) {
				w->state = FWS_R;
				continue;
			}
			if (pn->right == fn) {
				w->state = FWS_C;
				w->leaf = w->node->leaf;
				continue;
			}
#if RT6_DEBUG >= 2
			BUG_TRAP(0);
#endif
		}
	}
}

/*
 *	Start a new walk at w->root.  The walker is put on the walker
 *	list for the duration of the walk and taken off again unless the
 *	walk was suspended (result > 0); a suspended walker stays linked
 *	so that it can be resumed with fib6_walk_continue().
 */
static int fib6_walk(struct fib6_walker_t *w)
{
	int res;

	w->state = FWS_INIT;
	w->node = w->root;

	fib6_walker_link(w);
	res = fib6_walk_continue(w);
	if (res <= 0)
		fib6_walker_unlink(w);
	return res;
}

/*
 *	Walker callback used by fib6_clean_tree(): run the cleaner's
 *	function on every route of the current node, starting at w->leaf.
 *	A negative result from the function requests deletion of that
 *	route.  Always returns 0, so the walk is never suspended or aborted.
 */
static int fib6_clean_node(struct fib6_walker_t *w)
{
	struct nl_info info = {
		.nl_net = &init_net,
	};
	int res;
	struct rt6_info *rt;
	struct fib6_cleaner_t *c = container_of(w, struct fib6_cleaner_t, w);

	for (rt = w->leaf; rt; rt = rt->u.dst.rt6_next) {
		res = c->func(rt, c->arg);
		if (res < 0) {
			/*
			 * Park the walker on rt before deleting it:
			 * fib6_del_route() moves walkers that point at the
			 * deleted route on to its successor.
			 */
			w->leaf = rt;
			res = fib6_del(rt, &info);
			if (res) {
#if RT6_DEBUG >= 2
				printk(KERN_DEBUG "fib6_clean_node: del failed: rt=%p@%p err=%d\n", rt, rt->rt6i_node, res);
#endif
				continue;
			}
			/* the walker calls us again for the remaining routes */
			return 0;
		}
		BUG_TRAP(res==0);
	}
	w->leaf = rt;
	return 0;
}

/*
 *	Convenient frontend to tree walker.
 *
 *	func is called on each route.
 *		It may return -1 -> delete this route.
 *		              0  -> continue walking
 *
 *	prune==1 -> only immediate children of node (certainly,
 *	ignoring pure split nodes) will be scanned.
1339 */ 1340 1341 static void fib6_clean_tree(struct fib6_node *root, 1342 int (*func)(struct rt6_info *, void *arg), 1343 int prune, void *arg) 1344 { 1345 struct fib6_cleaner_t c; 1346 1347 c.w.root = root; 1348 c.w.func = fib6_clean_node; 1349 c.w.prune = prune; 1350 c.func = func; 1351 c.arg = arg; 1352 1353 fib6_walk(&c.w); 1354 } 1355 1356 void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *arg), 1357 int prune, void *arg) 1358 { 1359 struct fib6_table *table; 1360 struct hlist_node *node; 1361 struct hlist_head *head; 1362 unsigned int h; 1363 1364 rcu_read_lock(); 1365 for (h = 0; h < FIB_TABLE_HASHSZ; h++) { 1366 head = &net->ipv6.fib_table_hash[h]; 1367 hlist_for_each_entry_rcu(table, node, head, tb6_hlist) { 1368 write_lock_bh(&table->tb6_lock); 1369 fib6_clean_tree(&table->tb6_root, func, prune, arg); 1370 write_unlock_bh(&table->tb6_lock); 1371 } 1372 } 1373 rcu_read_unlock(); 1374 } 1375 1376 static int fib6_prune_clone(struct rt6_info *rt, void *arg) 1377 { 1378 if (rt->rt6i_flags & RTF_CACHE) { 1379 RT6_TRACE("pruning clone %p\n", rt); 1380 return -1; 1381 } 1382 1383 return 0; 1384 } 1385 1386 static void fib6_prune_clones(struct fib6_node *fn, struct rt6_info *rt) 1387 { 1388 fib6_clean_tree(fn, fib6_prune_clone, 1, rt); 1389 } 1390 1391 /* 1392 * Garbage collection 1393 */ 1394 1395 static struct fib6_gc_args 1396 { 1397 int timeout; 1398 int more; 1399 } gc_args; 1400 1401 static int fib6_age(struct rt6_info *rt, void *arg) 1402 { 1403 unsigned long now = jiffies; 1404 1405 /* 1406 * check addrconf expiration here. 1407 * Routes are expired even if they are in use. 1408 * 1409 * Also age clones. Note, that clones are aged out 1410 * only if they are not in use now. 
1411 */ 1412 1413 if (rt->rt6i_flags&RTF_EXPIRES && rt->rt6i_expires) { 1414 if (time_after(now, rt->rt6i_expires)) { 1415 RT6_TRACE("expiring %p\n", rt); 1416 return -1; 1417 } 1418 gc_args.more++; 1419 } else if (rt->rt6i_flags & RTF_CACHE) { 1420 if (atomic_read(&rt->u.dst.__refcnt) == 0 && 1421 time_after_eq(now, rt->u.dst.lastuse + gc_args.timeout)) { 1422 RT6_TRACE("aging clone %p\n", rt); 1423 return -1; 1424 } else if ((rt->rt6i_flags & RTF_GATEWAY) && 1425 (!(rt->rt6i_nexthop->flags & NTF_ROUTER))) { 1426 RT6_TRACE("purging route %p via non-router but gateway\n", 1427 rt); 1428 return -1; 1429 } 1430 gc_args.more++; 1431 } 1432 1433 return 0; 1434 } 1435 1436 static DEFINE_SPINLOCK(fib6_gc_lock); 1437 1438 void fib6_run_gc(unsigned long expires, struct net *net) 1439 { 1440 if (expires != ~0UL) { 1441 spin_lock_bh(&fib6_gc_lock); 1442 gc_args.timeout = expires ? (int)expires : 1443 net->ipv6.sysctl.ip6_rt_gc_interval; 1444 } else { 1445 local_bh_disable(); 1446 if (!spin_trylock(&fib6_gc_lock)) { 1447 mod_timer(&ip6_fib_timer, jiffies + HZ); 1448 local_bh_enable(); 1449 return; 1450 } 1451 gc_args.timeout = net->ipv6.sysctl.ip6_rt_gc_interval; 1452 } 1453 gc_args.more = 0; 1454 1455 icmp6_dst_gc(&gc_args.more); 1456 1457 fib6_clean_all(net, fib6_age, 0, NULL); 1458 1459 if (gc_args.more) 1460 mod_timer(&ip6_fib_timer, jiffies + 1461 net->ipv6.sysctl.ip6_rt_gc_interval); 1462 else { 1463 del_timer(&ip6_fib_timer); 1464 ip6_fib_timer.expires = 0; 1465 } 1466 spin_unlock_bh(&fib6_gc_lock); 1467 } 1468 1469 static void fib6_gc_timer_cb(unsigned long arg) 1470 { 1471 fib6_run_gc(0, (struct net *)arg); 1472 } 1473 1474 static int fib6_net_init(struct net *net) 1475 { 1476 int ret; 1477 1478 ret = -ENOMEM; 1479 net->ipv6.fib_table_hash = 1480 kzalloc(sizeof(*net->ipv6.fib_table_hash)*FIB_TABLE_HASHSZ, 1481 GFP_KERNEL); 1482 if (!net->ipv6.fib_table_hash) 1483 goto out; 1484 1485 net->ipv6.fib6_main_tbl = kzalloc(sizeof(*net->ipv6.fib6_main_tbl), 1486 GFP_KERNEL); 
1487 if (!net->ipv6.fib6_main_tbl) 1488 goto out_fib_table_hash; 1489 1490 net->ipv6.fib6_main_tbl->tb6_id = RT6_TABLE_MAIN; 1491 net->ipv6.fib6_main_tbl->tb6_root.leaf = &ip6_null_entry; 1492 net->ipv6.fib6_main_tbl->tb6_root.fn_flags = 1493 RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; 1494 1495 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 1496 net->ipv6.fib6_local_tbl = kzalloc(sizeof(*net->ipv6.fib6_local_tbl), 1497 GFP_KERNEL); 1498 if (!net->ipv6.fib6_local_tbl) 1499 goto out_fib6_main_tbl; 1500 net->ipv6.fib6_local_tbl->tb6_id = RT6_TABLE_LOCAL; 1501 net->ipv6.fib6_local_tbl->tb6_root.leaf = &ip6_null_entry; 1502 net->ipv6.fib6_local_tbl->tb6_root.fn_flags = 1503 RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; 1504 #endif 1505 fib6_tables_init(net); 1506 1507 ret = 0; 1508 out: 1509 return ret; 1510 1511 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 1512 out_fib6_main_tbl: 1513 kfree(net->ipv6.fib6_main_tbl); 1514 #endif 1515 out_fib_table_hash: 1516 kfree(net->ipv6.fib_table_hash); 1517 goto out; 1518 } 1519 1520 static void fib6_net_exit(struct net *net) 1521 { 1522 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 1523 kfree(net->ipv6.fib6_local_tbl); 1524 #endif 1525 kfree(net->ipv6.fib6_main_tbl); 1526 kfree(net->ipv6.fib_table_hash); 1527 } 1528 1529 static struct pernet_operations fib6_net_ops = { 1530 .init = fib6_net_init, 1531 .exit = fib6_net_exit, 1532 }; 1533 1534 int __init fib6_init(void) 1535 { 1536 int ret = -ENOMEM; 1537 fib6_node_kmem = kmem_cache_create("fib6_nodes", 1538 sizeof(struct fib6_node), 1539 0, SLAB_HWCACHE_ALIGN, 1540 NULL); 1541 if (!fib6_node_kmem) 1542 goto out; 1543 1544 ret = register_pernet_subsys(&fib6_net_ops); 1545 if (ret) 1546 goto out_kmem_cache_create; 1547 1548 ret = __rtnl_register(PF_INET6, RTM_GETROUTE, NULL, inet6_dump_fib); 1549 if (ret) 1550 goto out_unregister_subsys; 1551 out: 1552 return ret; 1553 1554 out_unregister_subsys: 1555 unregister_pernet_subsys(&fib6_net_ops); 1556 out_kmem_cache_create: 1557 kmem_cache_destroy(fib6_node_kmem); 1558 goto out; 1559 } 
1560 1561 void fib6_gc_cleanup(void) 1562 { 1563 del_timer(&ip6_fib_timer); 1564 unregister_pernet_subsys(&fib6_net_ops); 1565 kmem_cache_destroy(fib6_node_kmem); 1566 } 1567