/*
 *   This program is free software; you can redistribute it and/or
 *   modify it under the terms of the GNU General Public License
 *   as published by the Free Software Foundation; either version
 *   2 of the License, or (at your option) any later version.
 *
 *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
 *     & Swedish University of Agricultural Sciences.
 *
 *   Jens Laas <jens.laas@data.slu.se> Swedish University of
 *     Agricultural Sciences.
 *
 *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
 *
 * This work is based on the LPC-trie which is originally described in:
 *
 * An experimental study of compression methods for dynamic tries
 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
 * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
 *
 *
 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
 *
 * Version:	$Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $
 *
 *
 * Code from fib_hash has been reused which includes the following header:
 *
 *
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IPv4 FIB: lookup engine and maintenance routines.
 *
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Substantial contributions to this work come from:
 *
 *		David S. Miller, <davem@davemloft.net>
 *		Stephen Hemminger <shemminger@osdl.org>
 *		Paul E. McKenney <paulmck@us.ibm.com>
 *		Patrick McHardy <kaber@trash.net>
 */
#define VERSION "0.404"

#include <linux/config.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/inetdevice.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/skbuff.h>
#include <linux/netlink.h>
#include <linux/init.h>
#include <linux/list.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include "fib_lookup.h"

#undef CONFIG_IP_FIB_TRIE_STATS
#define MAX_CHILDS 16384

#define KEYLENGTH (8*sizeof(t_key))
#define MASK_PFX(k, l) (((l)==0)?0:(k >> (KEYLENGTH-l)) << (KEYLENGTH-l))
#define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH - bits) >> offset))

typedef unsigned int t_key;

#define T_TNODE 0
#define T_LEAF  1
#define NODE_TYPE_MASK	0x1UL
#define NODE_PARENT(node) \
	((struct tnode *)rcu_dereference(((node)->parent & ~NODE_TYPE_MASK)))

#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)

#define NODE_SET_PARENT(node, ptr)		\
	rcu_assign_pointer((node)->parent,	\
			   ((unsigned long)(ptr)) | NODE_TYPE(node))

#define IS_TNODE(n) (!(n->parent & T_LEAF))
#define IS_LEAF(n) (n->parent & T_LEAF)

struct node {
	t_key key;
	unsigned long parent;
};

struct leaf {
	t_key key;
	unsigned long parent;
	struct hlist_head list;
	struct rcu_head rcu;
};

struct leaf_info {
	struct hlist_node hlist;
	struct rcu_head rcu;
	int plen;
	struct list_head falh;
};

struct tnode {
	t_key key;
	unsigned long parent;
	unsigned short pos:5;		/* 2log(KEYLENGTH) bits needed */
	unsigned short bits:5;		/* 2log(KEYLENGTH) bits needed */
	unsigned short full_children;	/* KEYLENGTH bits needed */
	unsigned short empty_children;	/* KEYLENGTH bits needed */
	struct rcu_head rcu;
	struct node *child[0];
};

#ifdef CONFIG_IP_FIB_TRIE_STATS
struct trie_use_stats {
	unsigned int gets;
	unsigned int backtrack;
	unsigned int semantic_match_passed;
	unsigned int semantic_match_miss;
	unsigned int null_node_hit;
	unsigned int resize_node_skipped;
};
#endif

struct trie_stat {
	unsigned int totdepth;
	unsigned int maxdepth;
	unsigned int tnodes;
	unsigned int leaves;
	unsigned int nullpointers;
	unsigned int nodesizes[MAX_CHILDS];
};

struct trie {
	struct node *trie;
#ifdef CONFIG_IP_FIB_TRIE_STATS
	struct trie_use_stats stats;
#endif
	int size;
	unsigned int revision;
};

static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
static struct node *resize(struct trie *t, struct tnode *tn);
static struct tnode *inflate(struct trie *t, struct tnode *tn);
static struct tnode *halve(struct trie *t, struct tnode *tn);
static void tnode_free(struct tnode *tn);
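/*
 * Illustrative, standalone user-space sketch (not part of the kernel build,
 * hence kept under #if 0): how the low bit of ->parent doubles as the node
 * type tag that IS_LEAF()/IS_TNODE()/NODE_PARENT() above rely on.  The real
 * macros additionally go through rcu_dereference()/rcu_assign_pointer();
 * names prefixed ex_ are hypothetical.
 */
#if 0
#include <assert.h>
#include <stdio.h>

struct ex_node {
	unsigned int key;
	unsigned long parent;	/* pointer to parent tnode | type bit */
};

#define EX_T_LEAF	1UL
#define EX_TYPE_MASK	1UL
#define ex_is_leaf(n)	((n)->parent & EX_T_LEAF)
#define ex_parent(n)	((struct ex_node *)((n)->parent & ~EX_TYPE_MASK))

int main(void)
{
	struct ex_node tnode = { .key = 0, .parent = 0 };		/* internal node */
	struct ex_node leaf = { .key = 42, .parent = EX_T_LEAF };	/* leaf */

	/* Re-parent the leaf but keep its type bit, as NODE_SET_PARENT() does */
	leaf.parent = (unsigned long)&tnode | (leaf.parent & EX_TYPE_MASK);

	assert(ex_is_leaf(&leaf) && !ex_is_leaf(&tnode));
	assert(ex_parent(&leaf) == &tnode);
	printf("leaf: is_leaf=%lu parent=%p\n",
	       ex_is_leaf(&leaf), (void *)ex_parent(&leaf));
	return 0;
}
#endif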
static kmem_cache_t *fn_alias_kmem __read_mostly;
static struct trie *trie_local = NULL, *trie_main = NULL;


/* rcu_read_lock needs to be held by the caller on the read side */

static inline struct node *tnode_get_child(struct tnode *tn, int i)
{
	BUG_ON(i >= 1 << tn->bits);

	return rcu_dereference(tn->child[i]);
}

static inline int tnode_child_length(const struct tnode *tn)
{
	return 1 << tn->bits;
}

static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
{
	if (offset < KEYLENGTH)
		return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
	else
		return 0;
}

static inline int tkey_equals(t_key a, t_key b)
{
	return a == b;
}

static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
{
	if (bits == 0 || offset >= KEYLENGTH)
		return 1;
	bits = bits > KEYLENGTH ? KEYLENGTH : bits;
	return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
}

static inline int tkey_mismatch(t_key a, int offset, t_key b)
{
	t_key diff = a ^ b;
	int i = offset;

	if (!diff)
		return 0;
	while ((diff << i) >> (KEYLENGTH-1) == 0)
		i++;
	return i;
}

/*
  To understand this stuff, an understanding of keys and all their bits is
  necessary. Every node in the trie has a key associated with it, but not
  all of the bits in that key are significant.

  Consider a node 'n' and its parent 'tp'.

  If n is a leaf, every bit in its key is significant. Its presence is
  necessitated by path compression, since during a tree traversal (when
  searching for a leaf - unless we are doing an insertion) we will completely
  ignore all skipped bits we encounter. Thus we need to verify, at the end of
  a potentially successful search, that we have indeed been walking the
  correct key path.

  Note that we can never "miss" the correct key in the tree if present by
  following the wrong path. Path compression ensures that segments of the key
  that are the same for all keys with a given prefix are skipped, but the
  skipped part *is* identical for each node in the subtrie below the skipped
  bit! fib_insert_node() in this implementation takes care of that - note the
  call to tkey_sub_equals() in fib_insert_node().

  If n is an internal node - a 'tnode' here - the various parts of its key
  have many different meanings.

  Example:
  _________________________________________________________________
  | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
  -----------------------------------------------------------------
    0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15

  _________________________________________________________________
  | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
  -----------------------------------------------------------------
   16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31

  tp->pos = 7
  tp->bits = 3
  n->pos = 15
  n->bits = 4

  First, let's just ignore the bits that come before the parent tp, that is
  the bits from 0 to (tp->pos-1). They are *known* but at this point we do
  not use them for anything.

  The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
  index into the parent's child array. That is, they will be used to find
  'n' among tp's children.

  The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
  for the node n.
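  As a concrete (made-up) example with the layout above: to locate n among
  tp's children, the code computes tkey_extract_bits(key, tp->pos, tp->bits),
  i.e. bits 7..9 of the key; a key whose bits 7..9 are 101 is looked up in
  tp->child[5]. The skipped bits 10..14 ("S") are not examined while
  descending - they are only verified afterwards, e.g. by the
  tkey_sub_equals() check mentioned above, or by comparing the full key once
  a leaf has been reached.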

  All the bits we have seen so far are significant to the node n. The rest
  of the bits are really not needed or indeed known in n->key.

  The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
  n's child array, and will of course be different for each child.


  The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
  at this point.

*/

static inline void check_tnode(const struct tnode *tn)
{
	WARN_ON(tn && tn->pos+tn->bits > 32);
}

static int halve_threshold = 25;
static int inflate_threshold = 50;
static int halve_threshold_root = 15;
static int inflate_threshold_root = 25;


static void __alias_free_mem(struct rcu_head *head)
{
	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
	kmem_cache_free(fn_alias_kmem, fa);
}

static inline void alias_free_mem_rcu(struct fib_alias *fa)
{
	call_rcu(&fa->rcu, __alias_free_mem);
}

static void __leaf_free_rcu(struct rcu_head *head)
{
	kfree(container_of(head, struct leaf, rcu));
}

static inline void free_leaf(struct leaf *leaf)
{
	call_rcu(&leaf->rcu, __leaf_free_rcu);
}

static void __leaf_info_free_rcu(struct rcu_head *head)
{
	kfree(container_of(head, struct leaf_info, rcu));
}

static inline void free_leaf_info(struct leaf_info *leaf)
{
	call_rcu(&leaf->rcu, __leaf_info_free_rcu);
}

static struct tnode *tnode_alloc(unsigned int size)
{
	struct page *pages;

	if (size <= PAGE_SIZE)
		return kcalloc(size, 1, GFP_KERNEL);

	pages = alloc_pages(GFP_KERNEL|__GFP_ZERO, get_order(size));
	if (!pages)
		return NULL;

	return page_address(pages);
}

static void __tnode_free_rcu(struct rcu_head *head)
{
	struct tnode *tn = container_of(head, struct tnode, rcu);
	unsigned int size = sizeof(struct tnode) +
		(1 << tn->bits) * sizeof(struct node *);

	if (size <= PAGE_SIZE)
		kfree(tn);
	else
		free_pages((unsigned long)tn, get_order(size));
}

static inline void tnode_free(struct tnode *tn)
{
	call_rcu(&tn->rcu, __tnode_free_rcu);
}

static struct leaf *leaf_new(void)
{
	struct leaf *l = kmalloc(sizeof(struct leaf), GFP_KERNEL);
	if (l) {
		l->parent = T_LEAF;
		INIT_HLIST_HEAD(&l->list);
	}
	return l;
}

static struct leaf_info *leaf_info_new(int plen)
{
	struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
	if (li) {
		li->plen = plen;
		INIT_LIST_HEAD(&li->falh);
	}
	return li;
}

static struct tnode* tnode_new(t_key key, int pos, int bits)
{
	int nchildren = 1<<bits;
	int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
	struct tnode *tn = tnode_alloc(sz);

	if (tn) {
		memset(tn, 0, sz);
		tn->parent = T_TNODE;
		tn->pos = pos;
		tn->bits = bits;
		tn->key = key;
		tn->full_children = 0;
		tn->empty_children = 1<<bits;
	}

	pr_debug("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
		 (unsigned int) (sizeof(struct node) * 1<<bits));
	return tn;
}
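/*
 * Illustrative, standalone user-space sketch (not built, kept under #if 0):
 * a tnode with 'bits' index bits carries 1 << bits child slots, so the
 * allocation made by tnode_new()/tnode_alloc() above grows quickly and has
 * to fall back from kcalloc() to whole pages once it no longer fits in one
 * page.  The 4096-byte page size and the 28-byte header below are example
 * values, not kernel constants.
 */
#if 0
#include <stdio.h>

int main(void)
{
	const unsigned long page_size = 4096;	/* assumed PAGE_SIZE */
	const unsigned long header = 28;	/* stand-in for sizeof(struct tnode) */
	int bits;

	for (bits = 1; bits <= 10; bits++) {
		unsigned long sz = header + (1UL << bits) * sizeof(void *);

		printf("bits=%2d children=%4lu size=%5lu -> %s\n",
		       bits, 1UL << bits, sz,
		       sz <= page_size ? "kcalloc" : "alloc_pages");
	}
	return 0;
}
#endif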
/*
 * Check whether a tnode 'n' is "full", i.e. it is an internal node
 * and no bits are skipped. See discussion in dyntree paper p. 6
 */

static inline int tnode_full(const struct tnode *tn, const struct node *n)
{
	if (n == NULL || IS_LEAF(n))
		return 0;

	return ((struct tnode *) n)->pos == tn->pos + tn->bits;
}

static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n)
{
	tnode_put_child_reorg(tn, i, n, -1);
}

/*
 * Add a child at position i overwriting the old value.
 * Update the value of full_children and empty_children.
 */

static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull)
{
	struct node *chi = tn->child[i];
	int isfull;

	BUG_ON(i >= 1<<tn->bits);


	/* update emptyChildren */
	if (n == NULL && chi != NULL)
		tn->empty_children++;
	else if (n != NULL && chi == NULL)
		tn->empty_children--;

	/* update fullChildren */
	if (wasfull == -1)
		wasfull = tnode_full(tn, chi);

	isfull = tnode_full(tn, n);
	if (wasfull && !isfull)
		tn->full_children--;
	else if (!wasfull && isfull)
		tn->full_children++;

	if (n)
		NODE_SET_PARENT(n, tn);

	rcu_assign_pointer(tn->child[i], n);
}

static struct node *resize(struct trie *t, struct tnode *tn)
{
	int i;
	int err = 0;
	struct tnode *old_tn;
	int inflate_threshold_use;
	int halve_threshold_use;

	if (!tn)
		return NULL;

	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
		 tn, inflate_threshold, halve_threshold);

	/* No children */
	if (tn->empty_children == tnode_child_length(tn)) {
		tnode_free(tn);
		return NULL;
	}
	/* One child */
	if (tn->empty_children == tnode_child_length(tn) - 1)
		for (i = 0; i < tnode_child_length(tn); i++) {
			struct node *n;

			n = tn->child[i];
			if (!n)
				continue;

			/* compress one level */
			NODE_SET_PARENT(n, NULL);
			tnode_free(tn);
			return n;
		}
	/*
	 * Double as long as the resulting node has a number of
	 * nonempty nodes that are above the threshold.
	 */

	/*
	 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
	 * the Helsinki University of Technology and Matti Tikkanen of Nokia
	 * Telecommunications, page 6:
	 * "A node is doubled if the ratio of non-empty children to all
	 * children in the *doubled* node is at least 'high'."
	 *
	 * 'high' in this instance is the variable 'inflate_threshold'. It
	 * is expressed as a percentage, so we multiply it with
	 * tnode_child_length() and instead of multiplying by 2 (since the
	 * child array will be doubled by inflate()) and multiplying
	 * the left-hand side by 100 (to handle the percentage thing) we
	 * multiply the left-hand side by 50.
	 *
	 * The left-hand side may look a bit weird: tnode_child_length(tn)
	 * - tn->empty_children is of course the number of non-null children
	 * in the current node. tn->full_children is the number of "full"
	 * children, that is non-null tnodes with a skip value of 0.
	 * All of those will be doubled in the resulting inflated tnode, so
	 * we just count them one extra time here.
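	 *
	 * For instance (made-up numbers): a tnode with bits == 4 has
	 * tnode_child_length() == 16 slots. Say 6 of them are empty and 3
	 * of the children are "full": the doubled node would have 32 slots,
	 * of which (16 - 6 - 3) + 2*3 = 13 are non-empty, a fill factor of
	 * 100*13/32, about 40%, which is below the default
	 * inflate_threshold of 50, so the node is left alone. With only 2
	 * empty slots it is (16 - 2 - 3) + 2*3 = 17 out of 32, about 53%,
	 * and the node gets doubled.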
	 *
	 * A clearer way to write this would be:
	 *
	 * to_be_doubled = tn->full_children;
	 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
	 *     tn->full_children;
	 *
	 * new_child_length = tnode_child_length(tn) * 2;
	 *
	 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
	 *      new_child_length;
	 * if (new_fill_factor >= inflate_threshold)
	 *
	 * ...and so on, though it would mess up the while () loop.
	 *
	 * anyway,
	 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
	 *      inflate_threshold
	 *
	 * avoid a division:
	 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
	 *      inflate_threshold * new_child_length
	 *
	 * expand not_to_be_doubled and to_be_doubled, and shorten:
	 * 100 * (tnode_child_length(tn) - tn->empty_children +
	 *    tn->full_children) >= inflate_threshold * new_child_length
	 *
	 * expand new_child_length:
	 * 100 * (tnode_child_length(tn) - tn->empty_children +
	 *    tn->full_children) >=
	 *      inflate_threshold * tnode_child_length(tn) * 2
	 *
	 * shorten again:
	 * 50 * (tn->full_children + tnode_child_length(tn) -
	 *    tn->empty_children) >= inflate_threshold *
	 *      tnode_child_length(tn)
	 *
	 */

	check_tnode(tn);

	/* Keep root node larger */

	if (!tn->parent)
		inflate_threshold_use = inflate_threshold_root;
	else
		inflate_threshold_use = inflate_threshold;

	err = 0;
	while ((tn->full_children > 0 &&
	       50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
				inflate_threshold_use * tnode_child_length(tn))) {

		old_tn = tn;
		tn = inflate(t, tn);
		if (IS_ERR(tn)) {
			tn = old_tn;
#ifdef CONFIG_IP_FIB_TRIE_STATS
			t->stats.resize_node_skipped++;
#endif
			break;
		}
	}

	check_tnode(tn);

	/*
	 * Halve as long as the number of empty children in this
	 * node is above threshold.
	 */


	/* Keep root node larger */

	if (!tn->parent)
		halve_threshold_use = halve_threshold_root;
	else
		halve_threshold_use = halve_threshold;

	err = 0;
	while (tn->bits > 1 &&
	       100 * (tnode_child_length(tn) - tn->empty_children) <
	       halve_threshold_use * tnode_child_length(tn)) {

		old_tn = tn;
		tn = halve(t, tn);
		if (IS_ERR(tn)) {
			tn = old_tn;
#ifdef CONFIG_IP_FIB_TRIE_STATS
			t->stats.resize_node_skipped++;
#endif
			break;
		}
	}


	/* Only one child remains */
	if (tn->empty_children == tnode_child_length(tn) - 1)
		for (i = 0; i < tnode_child_length(tn); i++) {
			struct node *n;

			n = tn->child[i];
			if (!n)
				continue;

			/* compress one level */

			NODE_SET_PARENT(n, NULL);
			tnode_free(tn);
			return n;
		}

	return (struct node *) tn;
}

static struct tnode *inflate(struct trie *t, struct tnode *tn)
{
	struct tnode *inode;
	struct tnode *oldtnode = tn;
	int olen = tnode_child_length(tn);
	int i;

	pr_debug("In inflate\n");

	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);

	if (!tn)
		return ERR_PTR(-ENOMEM);

	/*
	 * Preallocate and store tnodes before the actual work so we
	 * don't get into an inconsistent state if memory allocation
	 * fails. In case of failure we return the old node and the
	 * inflate of the tnode is ignored.
	 */

	for (i = 0; i < olen; i++) {
		struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i);

		if (inode &&
		    IS_TNODE(inode) &&
		    inode->pos == oldtnode->pos + oldtnode->bits &&
		    inode->bits > 1) {
			struct tnode *left, *right;
			t_key m = TKEY_GET_MASK(inode->pos, 1);

			left = tnode_new(inode->key&(~m), inode->pos + 1,
					 inode->bits - 1);
			if (!left)
				goto nomem;

			right = tnode_new(inode->key|m, inode->pos + 1,
					  inode->bits - 1);

			if (!right) {
				tnode_free(left);
				goto nomem;
			}

			put_child(t, tn, 2*i, (struct node *) left);
			put_child(t, tn, 2*i+1, (struct node *) right);
		}
	}

	for (i = 0; i < olen; i++) {
		struct node *node = tnode_get_child(oldtnode, i);
		struct tnode *left, *right;
		int size, j;

		/* An empty child */
		if (node == NULL)
			continue;

		/* A leaf or an internal node with skipped bits */

		if (IS_LEAF(node) || ((struct tnode *) node)->pos >
		   tn->pos + tn->bits - 1) {
			if (tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits,
					      1) == 0)
				put_child(t, tn, 2*i, node);
			else
				put_child(t, tn, 2*i+1, node);
			continue;
		}

		/* An internal node with two children */
		inode = (struct tnode *) node;

		if (inode->bits == 1) {
			put_child(t, tn, 2*i, inode->child[0]);
			put_child(t, tn, 2*i+1, inode->child[1]);

			tnode_free(inode);
			continue;
		}

		/* An internal node with more than two children */

		/* We will replace this node 'inode' with two new
		 * ones, 'left' and 'right', each with half of the
		 * original children. The two new nodes will have
		 * a position one bit further down the key and this
		 * means that the "significant" part of their keys
		 * (see the discussion near the top of this file)
		 * will differ by one bit, which will be "0" in
		 * left's key and "1" in right's key. Since we are
		 * moving the key position by one step, the bit that
		 * we are moving away from - the bit at position
		 * (inode->pos) - is the one that will differ between
		 * left and right. So... we synthesize that bit in the
		 * two new keys.
		 * The mask 'm' below will be a single "one" bit at
		 * the position (inode->pos)
		 */

		/* Use the old key, but set the new significant
		 * bit to zero.
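		 *
		 * As a concrete (made-up) example: if inode->pos is 12, the
		 * mask m computed in the preallocation loop above has only
		 * bit 12 set, counting the most significant bit as bit 0.
		 * left was created with key inode->key & ~m (bit 12 forced
		 * to 0) and right with inode->key | m (bit 12 forced to 1),
		 * so tkey_extract_bits(key, 12, 1) alone decides which of
		 * the two replacement nodes a lookup continues into.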
732 */ 733 734 left = (struct tnode *) tnode_get_child(tn, 2*i); 735 put_child(t, tn, 2*i, NULL); 736 737 BUG_ON(!left); 738 739 right = (struct tnode *) tnode_get_child(tn, 2*i+1); 740 put_child(t, tn, 2*i+1, NULL); 741 742 BUG_ON(!right); 743 744 size = tnode_child_length(left); 745 for (j = 0; j < size; j++) { 746 put_child(t, left, j, inode->child[j]); 747 put_child(t, right, j, inode->child[j + size]); 748 } 749 put_child(t, tn, 2*i, resize(t, left)); 750 put_child(t, tn, 2*i+1, resize(t, right)); 751 752 tnode_free(inode); 753 } 754 tnode_free(oldtnode); 755 return tn; 756 nomem: 757 { 758 int size = tnode_child_length(tn); 759 int j; 760 761 for (j = 0; j < size; j++) 762 if (tn->child[j]) 763 tnode_free((struct tnode *)tn->child[j]); 764 765 tnode_free(tn); 766 767 return ERR_PTR(-ENOMEM); 768 } 769 } 770 771 static struct tnode *halve(struct trie *t, struct tnode *tn) 772 { 773 struct tnode *oldtnode = tn; 774 struct node *left, *right; 775 int i; 776 int olen = tnode_child_length(tn); 777 778 pr_debug("In halve\n"); 779 780 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1); 781 782 if (!tn) 783 return ERR_PTR(-ENOMEM); 784 785 /* 786 * Preallocate and store tnodes before the actual work so we 787 * don't get into an inconsistent state if memory allocation 788 * fails. In case of failure we return the oldnode and halve 789 * of tnode is ignored. 790 */ 791 792 for (i = 0; i < olen; i += 2) { 793 left = tnode_get_child(oldtnode, i); 794 right = tnode_get_child(oldtnode, i+1); 795 796 /* Two nonempty children */ 797 if (left && right) { 798 struct tnode *newn; 799 800 newn = tnode_new(left->key, tn->pos + tn->bits, 1); 801 802 if (!newn) 803 goto nomem; 804 805 put_child(t, tn, i/2, (struct node *)newn); 806 } 807 808 } 809 810 for (i = 0; i < olen; i += 2) { 811 struct tnode *newBinNode; 812 813 left = tnode_get_child(oldtnode, i); 814 right = tnode_get_child(oldtnode, i+1); 815 816 /* At least one of the children is empty */ 817 if (left == NULL) { 818 if (right == NULL) /* Both are empty */ 819 continue; 820 put_child(t, tn, i/2, right); 821 continue; 822 } 823 824 if (right == NULL) { 825 put_child(t, tn, i/2, left); 826 continue; 827 } 828 829 /* Two nonempty children */ 830 newBinNode = (struct tnode *) tnode_get_child(tn, i/2); 831 put_child(t, tn, i/2, NULL); 832 put_child(t, newBinNode, 0, left); 833 put_child(t, newBinNode, 1, right); 834 put_child(t, tn, i/2, resize(t, newBinNode)); 835 } 836 tnode_free(oldtnode); 837 return tn; 838 nomem: 839 { 840 int size = tnode_child_length(tn); 841 int j; 842 843 for (j = 0; j < size; j++) 844 if (tn->child[j]) 845 tnode_free((struct tnode *)tn->child[j]); 846 847 tnode_free(tn); 848 849 return ERR_PTR(-ENOMEM); 850 } 851 } 852 853 static void trie_init(struct trie *t) 854 { 855 if (!t) 856 return; 857 858 t->size = 0; 859 rcu_assign_pointer(t->trie, NULL); 860 t->revision = 0; 861 #ifdef CONFIG_IP_FIB_TRIE_STATS 862 memset(&t->stats, 0, sizeof(struct trie_use_stats)); 863 #endif 864 } 865 866 /* readside must use rcu_read_lock currently dump routines 867 via get_fa_head and dump */ 868 869 static struct leaf_info *find_leaf_info(struct leaf *l, int plen) 870 { 871 struct hlist_head *head = &l->list; 872 struct hlist_node *node; 873 struct leaf_info *li; 874 875 hlist_for_each_entry_rcu(li, node, head, hlist) 876 if (li->plen == plen) 877 return li; 878 879 return NULL; 880 } 881 882 static inline struct list_head * get_fa_head(struct leaf *l, int plen) 883 { 884 struct leaf_info *li = find_leaf_info(l, plen); 885 886 
if (!li) 887 return NULL; 888 889 return &li->falh; 890 } 891 892 static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new) 893 { 894 struct leaf_info *li = NULL, *last = NULL; 895 struct hlist_node *node; 896 897 if (hlist_empty(head)) { 898 hlist_add_head_rcu(&new->hlist, head); 899 } else { 900 hlist_for_each_entry(li, node, head, hlist) { 901 if (new->plen > li->plen) 902 break; 903 904 last = li; 905 } 906 if (last) 907 hlist_add_after_rcu(&last->hlist, &new->hlist); 908 else 909 hlist_add_before_rcu(&new->hlist, &li->hlist); 910 } 911 } 912 913 /* rcu_read_lock needs to be hold by caller from readside */ 914 915 static struct leaf * 916 fib_find_node(struct trie *t, u32 key) 917 { 918 int pos; 919 struct tnode *tn; 920 struct node *n; 921 922 pos = 0; 923 n = rcu_dereference(t->trie); 924 925 while (n != NULL && NODE_TYPE(n) == T_TNODE) { 926 tn = (struct tnode *) n; 927 928 check_tnode(tn); 929 930 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) { 931 pos = tn->pos + tn->bits; 932 n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits)); 933 } else 934 break; 935 } 936 /* Case we have found a leaf. Compare prefixes */ 937 938 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) 939 return (struct leaf *)n; 940 941 return NULL; 942 } 943 944 static struct node *trie_rebalance(struct trie *t, struct tnode *tn) 945 { 946 int wasfull; 947 t_key cindex, key; 948 struct tnode *tp = NULL; 949 950 key = tn->key; 951 952 while (tn != NULL && NODE_PARENT(tn) != NULL) { 953 954 tp = NODE_PARENT(tn); 955 cindex = tkey_extract_bits(key, tp->pos, tp->bits); 956 wasfull = tnode_full(tp, tnode_get_child(tp, cindex)); 957 tn = (struct tnode *) resize (t, (struct tnode *)tn); 958 tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull); 959 960 if (!NODE_PARENT(tn)) 961 break; 962 963 tn = NODE_PARENT(tn); 964 } 965 /* Handle last (top) tnode */ 966 if (IS_TNODE(tn)) 967 tn = (struct tnode*) resize(t, (struct tnode *)tn); 968 969 return (struct node*) tn; 970 } 971 972 /* only used from updater-side */ 973 974 static struct list_head * 975 fib_insert_node(struct trie *t, int *err, u32 key, int plen) 976 { 977 int pos, newpos; 978 struct tnode *tp = NULL, *tn = NULL; 979 struct node *n; 980 struct leaf *l; 981 int missbit; 982 struct list_head *fa_head = NULL; 983 struct leaf_info *li; 984 t_key cindex; 985 986 pos = 0; 987 n = t->trie; 988 989 /* If we point to NULL, stop. Either the tree is empty and we should 990 * just put a new leaf in if, or we have reached an empty child slot, 991 * and we should just put our new leaf in that. 992 * If we point to a T_TNODE, check if it matches our key. Note that 993 * a T_TNODE might be skipping any number of bits - its 'pos' need 994 * not be the parent's 'pos'+'bits'! 995 * 996 * If it does match the current key, get pos/bits from it, extract 997 * the index from our key, push the T_TNODE and walk the tree. 998 * 999 * If it doesn't, we have to replace it with a new T_TNODE. 1000 * 1001 * If we point to a T_LEAF, it might or might not have the same key 1002 * as we do. If it does, just change the value, update the T_LEAF's 1003 * value, and return it. 1004 * If it doesn't, we need to replace it with a T_TNODE. 
1005 */ 1006 1007 while (n != NULL && NODE_TYPE(n) == T_TNODE) { 1008 tn = (struct tnode *) n; 1009 1010 check_tnode(tn); 1011 1012 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) { 1013 tp = tn; 1014 pos = tn->pos + tn->bits; 1015 n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits)); 1016 1017 BUG_ON(n && NODE_PARENT(n) != tn); 1018 } else 1019 break; 1020 } 1021 1022 /* 1023 * n ----> NULL, LEAF or TNODE 1024 * 1025 * tp is n's (parent) ----> NULL or TNODE 1026 */ 1027 1028 BUG_ON(tp && IS_LEAF(tp)); 1029 1030 /* Case 1: n is a leaf. Compare prefixes */ 1031 1032 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) { 1033 struct leaf *l = (struct leaf *) n; 1034 1035 li = leaf_info_new(plen); 1036 1037 if (!li) { 1038 *err = -ENOMEM; 1039 goto err; 1040 } 1041 1042 fa_head = &li->falh; 1043 insert_leaf_info(&l->list, li); 1044 goto done; 1045 } 1046 t->size++; 1047 l = leaf_new(); 1048 1049 if (!l) { 1050 *err = -ENOMEM; 1051 goto err; 1052 } 1053 1054 l->key = key; 1055 li = leaf_info_new(plen); 1056 1057 if (!li) { 1058 tnode_free((struct tnode *) l); 1059 *err = -ENOMEM; 1060 goto err; 1061 } 1062 1063 fa_head = &li->falh; 1064 insert_leaf_info(&l->list, li); 1065 1066 if (t->trie && n == NULL) { 1067 /* Case 2: n is NULL, and will just insert a new leaf */ 1068 1069 NODE_SET_PARENT(l, tp); 1070 1071 cindex = tkey_extract_bits(key, tp->pos, tp->bits); 1072 put_child(t, (struct tnode *)tp, cindex, (struct node *)l); 1073 } else { 1074 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */ 1075 /* 1076 * Add a new tnode here 1077 * first tnode need some special handling 1078 */ 1079 1080 if (tp) 1081 pos = tp->pos+tp->bits; 1082 else 1083 pos = 0; 1084 1085 if (n) { 1086 newpos = tkey_mismatch(key, pos, n->key); 1087 tn = tnode_new(n->key, newpos, 1); 1088 } else { 1089 newpos = 0; 1090 tn = tnode_new(key, newpos, 1); /* First tnode */ 1091 } 1092 1093 if (!tn) { 1094 free_leaf_info(li); 1095 tnode_free((struct tnode *) l); 1096 *err = -ENOMEM; 1097 goto err; 1098 } 1099 1100 NODE_SET_PARENT(tn, tp); 1101 1102 missbit = tkey_extract_bits(key, newpos, 1); 1103 put_child(t, tn, missbit, (struct node *)l); 1104 put_child(t, tn, 1-missbit, n); 1105 1106 if (tp) { 1107 cindex = tkey_extract_bits(key, tp->pos, tp->bits); 1108 put_child(t, (struct tnode *)tp, cindex, (struct node *)tn); 1109 } else { 1110 rcu_assign_pointer(t->trie, (struct node *)tn); /* First tnode */ 1111 tp = tn; 1112 } 1113 } 1114 1115 if (tp && tp->pos + tp->bits > 32) 1116 printk(KERN_WARNING "fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n", 1117 tp, tp->pos, tp->bits, key, plen); 1118 1119 /* Rebalance the trie */ 1120 1121 rcu_assign_pointer(t->trie, trie_rebalance(t, tp)); 1122 done: 1123 t->revision++; 1124 err: 1125 return fa_head; 1126 } 1127 1128 static int 1129 fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, 1130 struct nlmsghdr *nlhdr, struct netlink_skb_parms *req) 1131 { 1132 struct trie *t = (struct trie *) tb->tb_data; 1133 struct fib_alias *fa, *new_fa; 1134 struct list_head *fa_head = NULL; 1135 struct fib_info *fi; 1136 int plen = r->rtm_dst_len; 1137 int type = r->rtm_type; 1138 u8 tos = r->rtm_tos; 1139 u32 key, mask; 1140 int err; 1141 struct leaf *l; 1142 1143 if (plen > 32) 1144 return -EINVAL; 1145 1146 key = 0; 1147 if (rta->rta_dst) 1148 memcpy(&key, rta->rta_dst, 4); 1149 1150 key = ntohl(key); 1151 1152 pr_debug("Insert table=%d %08x/%d\n", tb->tb_id, key, plen); 1153 1154 mask = ntohl(inet_make_mask(plen)); 1155 1156 if (key & 
~mask) 1157 return -EINVAL; 1158 1159 key = key & mask; 1160 1161 fi = fib_create_info(r, rta, nlhdr, &err); 1162 1163 if (!fi) 1164 goto err; 1165 1166 l = fib_find_node(t, key); 1167 fa = NULL; 1168 1169 if (l) { 1170 fa_head = get_fa_head(l, plen); 1171 fa = fib_find_alias(fa_head, tos, fi->fib_priority); 1172 } 1173 1174 /* Now fa, if non-NULL, points to the first fib alias 1175 * with the same keys [prefix,tos,priority], if such key already 1176 * exists or to the node before which we will insert new one. 1177 * 1178 * If fa is NULL, we will need to allocate a new one and 1179 * insert to the head of f. 1180 * 1181 * If f is NULL, no fib node matched the destination key 1182 * and we need to allocate a new one of those as well. 1183 */ 1184 1185 if (fa && fa->fa_info->fib_priority == fi->fib_priority) { 1186 struct fib_alias *fa_orig; 1187 1188 err = -EEXIST; 1189 if (nlhdr->nlmsg_flags & NLM_F_EXCL) 1190 goto out; 1191 1192 if (nlhdr->nlmsg_flags & NLM_F_REPLACE) { 1193 struct fib_info *fi_drop; 1194 u8 state; 1195 1196 err = -ENOBUFS; 1197 new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL); 1198 if (new_fa == NULL) 1199 goto out; 1200 1201 fi_drop = fa->fa_info; 1202 new_fa->fa_tos = fa->fa_tos; 1203 new_fa->fa_info = fi; 1204 new_fa->fa_type = type; 1205 new_fa->fa_scope = r->rtm_scope; 1206 state = fa->fa_state; 1207 new_fa->fa_state &= ~FA_S_ACCESSED; 1208 1209 list_replace_rcu(&fa->fa_list, &new_fa->fa_list); 1210 alias_free_mem_rcu(fa); 1211 1212 fib_release_info(fi_drop); 1213 if (state & FA_S_ACCESSED) 1214 rt_cache_flush(-1); 1215 1216 goto succeeded; 1217 } 1218 /* Error if we find a perfect match which 1219 * uses the same scope, type, and nexthop 1220 * information. 1221 */ 1222 fa_orig = fa; 1223 list_for_each_entry(fa, fa_orig->fa_list.prev, fa_list) { 1224 if (fa->fa_tos != tos) 1225 break; 1226 if (fa->fa_info->fib_priority != fi->fib_priority) 1227 break; 1228 if (fa->fa_type == type && 1229 fa->fa_scope == r->rtm_scope && 1230 fa->fa_info == fi) { 1231 goto out; 1232 } 1233 } 1234 if (!(nlhdr->nlmsg_flags & NLM_F_APPEND)) 1235 fa = fa_orig; 1236 } 1237 err = -ENOENT; 1238 if (!(nlhdr->nlmsg_flags & NLM_F_CREATE)) 1239 goto out; 1240 1241 err = -ENOBUFS; 1242 new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL); 1243 if (new_fa == NULL) 1244 goto out; 1245 1246 new_fa->fa_info = fi; 1247 new_fa->fa_tos = tos; 1248 new_fa->fa_type = type; 1249 new_fa->fa_scope = r->rtm_scope; 1250 new_fa->fa_state = 0; 1251 /* 1252 * Insert new entry to the list. 1253 */ 1254 1255 if (!fa_head) { 1256 fa_head = fib_insert_node(t, &err, key, plen); 1257 err = 0; 1258 if (err) 1259 goto out_free_new_fa; 1260 } 1261 1262 list_add_tail_rcu(&new_fa->fa_list, 1263 (fa ? 
&fa->fa_list : fa_head)); 1264 1265 rt_cache_flush(-1); 1266 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, req); 1267 succeeded: 1268 return 0; 1269 1270 out_free_new_fa: 1271 kmem_cache_free(fn_alias_kmem, new_fa); 1272 out: 1273 fib_release_info(fi); 1274 err: 1275 return err; 1276 } 1277 1278 1279 /* should be called with rcu_read_lock */ 1280 static inline int check_leaf(struct trie *t, struct leaf *l, 1281 t_key key, int *plen, const struct flowi *flp, 1282 struct fib_result *res) 1283 { 1284 int err, i; 1285 t_key mask; 1286 struct leaf_info *li; 1287 struct hlist_head *hhead = &l->list; 1288 struct hlist_node *node; 1289 1290 hlist_for_each_entry_rcu(li, node, hhead, hlist) { 1291 i = li->plen; 1292 mask = ntohl(inet_make_mask(i)); 1293 if (l->key != (key & mask)) 1294 continue; 1295 1296 if ((err = fib_semantic_match(&li->falh, flp, res, l->key, mask, i)) <= 0) { 1297 *plen = i; 1298 #ifdef CONFIG_IP_FIB_TRIE_STATS 1299 t->stats.semantic_match_passed++; 1300 #endif 1301 return err; 1302 } 1303 #ifdef CONFIG_IP_FIB_TRIE_STATS 1304 t->stats.semantic_match_miss++; 1305 #endif 1306 } 1307 return 1; 1308 } 1309 1310 static int 1311 fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res) 1312 { 1313 struct trie *t = (struct trie *) tb->tb_data; 1314 int plen, ret = 0; 1315 struct node *n; 1316 struct tnode *pn; 1317 int pos, bits; 1318 t_key key = ntohl(flp->fl4_dst); 1319 int chopped_off; 1320 t_key cindex = 0; 1321 int current_prefix_length = KEYLENGTH; 1322 struct tnode *cn; 1323 t_key node_prefix, key_prefix, pref_mismatch; 1324 int mp; 1325 1326 rcu_read_lock(); 1327 1328 n = rcu_dereference(t->trie); 1329 if (!n) 1330 goto failed; 1331 1332 #ifdef CONFIG_IP_FIB_TRIE_STATS 1333 t->stats.gets++; 1334 #endif 1335 1336 /* Just a leaf? */ 1337 if (IS_LEAF(n)) { 1338 if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0) 1339 goto found; 1340 goto failed; 1341 } 1342 pn = (struct tnode *) n; 1343 chopped_off = 0; 1344 1345 while (pn) { 1346 pos = pn->pos; 1347 bits = pn->bits; 1348 1349 if (!chopped_off) 1350 cindex = tkey_extract_bits(MASK_PFX(key, current_prefix_length), pos, bits); 1351 1352 n = tnode_get_child(pn, cindex); 1353 1354 if (n == NULL) { 1355 #ifdef CONFIG_IP_FIB_TRIE_STATS 1356 t->stats.null_node_hit++; 1357 #endif 1358 goto backtrace; 1359 } 1360 1361 if (IS_LEAF(n)) { 1362 if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0) 1363 goto found; 1364 else 1365 goto backtrace; 1366 } 1367 1368 #define HL_OPTIMIZE 1369 #ifdef HL_OPTIMIZE 1370 cn = (struct tnode *)n; 1371 1372 /* 1373 * It's a tnode, and we can do some extra checks here if we 1374 * like, to avoid descending into a dead-end branch. 1375 * This tnode is in the parent's child array at index 1376 * key[p_pos..p_pos+p_bits] but potentially with some bits 1377 * chopped off, so in reality the index may be just a 1378 * subprefix, padded with zero at the end. 1379 * We can also take a look at any skipped bits in this 1380 * tnode - everything up to p_pos is supposed to be ok, 1381 * and the non-chopped bits of the index (se previous 1382 * paragraph) are also guaranteed ok, but the rest is 1383 * considered unknown. 1384 * 1385 * The skipped bits are key[pos+bits..cn->pos]. 
1386 */ 1387 1388 /* If current_prefix_length < pos+bits, we are already doing 1389 * actual prefix matching, which means everything from 1390 * pos+(bits-chopped_off) onward must be zero along some 1391 * branch of this subtree - otherwise there is *no* valid 1392 * prefix present. Here we can only check the skipped 1393 * bits. Remember, since we have already indexed into the 1394 * parent's child array, we know that the bits we chopped of 1395 * *are* zero. 1396 */ 1397 1398 /* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */ 1399 1400 if (current_prefix_length < pos+bits) { 1401 if (tkey_extract_bits(cn->key, current_prefix_length, 1402 cn->pos - current_prefix_length) != 0 || 1403 !(cn->child[0])) 1404 goto backtrace; 1405 } 1406 1407 /* 1408 * If chopped_off=0, the index is fully validated and we 1409 * only need to look at the skipped bits for this, the new, 1410 * tnode. What we actually want to do is to find out if 1411 * these skipped bits match our key perfectly, or if we will 1412 * have to count on finding a matching prefix further down, 1413 * because if we do, we would like to have some way of 1414 * verifying the existence of such a prefix at this point. 1415 */ 1416 1417 /* The only thing we can do at this point is to verify that 1418 * any such matching prefix can indeed be a prefix to our 1419 * key, and if the bits in the node we are inspecting that 1420 * do not match our key are not ZERO, this cannot be true. 1421 * Thus, find out where there is a mismatch (before cn->pos) 1422 * and verify that all the mismatching bits are zero in the 1423 * new tnode's key. 1424 */ 1425 1426 /* Note: We aren't very concerned about the piece of the key 1427 * that precede pn->pos+pn->bits, since these have already been 1428 * checked. The bits after cn->pos aren't checked since these are 1429 * by definition "unknown" at this point. Thus, what we want to 1430 * see is if we are about to enter the "prefix matching" state, 1431 * and in that case verify that the skipped bits that will prevail 1432 * throughout this subtree are zero, as they have to be if we are 1433 * to find a matching prefix. 1434 */ 1435 1436 node_prefix = MASK_PFX(cn->key, cn->pos); 1437 key_prefix = MASK_PFX(key, cn->pos); 1438 pref_mismatch = key_prefix^node_prefix; 1439 mp = 0; 1440 1441 /* In short: If skipped bits in this node do not match the search 1442 * key, enter the "prefix matching" state.directly. 1443 */ 1444 if (pref_mismatch) { 1445 while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) { 1446 mp++; 1447 pref_mismatch = pref_mismatch <<1; 1448 } 1449 key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp); 1450 1451 if (key_prefix != 0) 1452 goto backtrace; 1453 1454 if (current_prefix_length >= cn->pos) 1455 current_prefix_length = mp; 1456 } 1457 #endif 1458 pn = (struct tnode *)n; /* Descend */ 1459 chopped_off = 0; 1460 continue; 1461 1462 backtrace: 1463 chopped_off++; 1464 1465 /* As zero don't change the child key (cindex) */ 1466 while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1)))) 1467 chopped_off++; 1468 1469 /* Decrease current_... with bits chopped off */ 1470 if (current_prefix_length > pn->pos + pn->bits - chopped_off) 1471 current_prefix_length = pn->pos + pn->bits - chopped_off; 1472 1473 /* 1474 * Either we do the actual chop off according or if we have 1475 * chopped off all bits in this tnode walk up to our parent. 
1476 */ 1477 1478 if (chopped_off <= pn->bits) { 1479 cindex &= ~(1 << (chopped_off-1)); 1480 } else { 1481 if (NODE_PARENT(pn) == NULL) 1482 goto failed; 1483 1484 /* Get Child's index */ 1485 cindex = tkey_extract_bits(pn->key, NODE_PARENT(pn)->pos, NODE_PARENT(pn)->bits); 1486 pn = NODE_PARENT(pn); 1487 chopped_off = 0; 1488 1489 #ifdef CONFIG_IP_FIB_TRIE_STATS 1490 t->stats.backtrack++; 1491 #endif 1492 goto backtrace; 1493 } 1494 } 1495 failed: 1496 ret = 1; 1497 found: 1498 rcu_read_unlock(); 1499 return ret; 1500 } 1501 1502 /* only called from updater side */ 1503 static int trie_leaf_remove(struct trie *t, t_key key) 1504 { 1505 t_key cindex; 1506 struct tnode *tp = NULL; 1507 struct node *n = t->trie; 1508 struct leaf *l; 1509 1510 pr_debug("entering trie_leaf_remove(%p)\n", n); 1511 1512 /* Note that in the case skipped bits, those bits are *not* checked! 1513 * When we finish this, we will have NULL or a T_LEAF, and the 1514 * T_LEAF may or may not match our key. 1515 */ 1516 1517 while (n != NULL && IS_TNODE(n)) { 1518 struct tnode *tn = (struct tnode *) n; 1519 check_tnode(tn); 1520 n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits)); 1521 1522 BUG_ON(n && NODE_PARENT(n) != tn); 1523 } 1524 l = (struct leaf *) n; 1525 1526 if (!n || !tkey_equals(l->key, key)) 1527 return 0; 1528 1529 /* 1530 * Key found. 1531 * Remove the leaf and rebalance the tree 1532 */ 1533 1534 t->revision++; 1535 t->size--; 1536 1537 preempt_disable(); 1538 tp = NODE_PARENT(n); 1539 tnode_free((struct tnode *) n); 1540 1541 if (tp) { 1542 cindex = tkey_extract_bits(key, tp->pos, tp->bits); 1543 put_child(t, (struct tnode *)tp, cindex, NULL); 1544 rcu_assign_pointer(t->trie, trie_rebalance(t, tp)); 1545 } else 1546 rcu_assign_pointer(t->trie, NULL); 1547 preempt_enable(); 1548 1549 return 1; 1550 } 1551 1552 static int 1553 fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, 1554 struct nlmsghdr *nlhdr, struct netlink_skb_parms *req) 1555 { 1556 struct trie *t = (struct trie *) tb->tb_data; 1557 u32 key, mask; 1558 int plen = r->rtm_dst_len; 1559 u8 tos = r->rtm_tos; 1560 struct fib_alias *fa, *fa_to_delete; 1561 struct list_head *fa_head; 1562 struct leaf *l; 1563 struct leaf_info *li; 1564 1565 1566 if (plen > 32) 1567 return -EINVAL; 1568 1569 key = 0; 1570 if (rta->rta_dst) 1571 memcpy(&key, rta->rta_dst, 4); 1572 1573 key = ntohl(key); 1574 mask = ntohl(inet_make_mask(plen)); 1575 1576 if (key & ~mask) 1577 return -EINVAL; 1578 1579 key = key & mask; 1580 l = fib_find_node(t, key); 1581 1582 if (!l) 1583 return -ESRCH; 1584 1585 fa_head = get_fa_head(l, plen); 1586 fa = fib_find_alias(fa_head, tos, 0); 1587 1588 if (!fa) 1589 return -ESRCH; 1590 1591 pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t); 1592 1593 fa_to_delete = NULL; 1594 fa_head = fa->fa_list.prev; 1595 1596 list_for_each_entry(fa, fa_head, fa_list) { 1597 struct fib_info *fi = fa->fa_info; 1598 1599 if (fa->fa_tos != tos) 1600 break; 1601 1602 if ((!r->rtm_type || 1603 fa->fa_type == r->rtm_type) && 1604 (r->rtm_scope == RT_SCOPE_NOWHERE || 1605 fa->fa_scope == r->rtm_scope) && 1606 (!r->rtm_protocol || 1607 fi->fib_protocol == r->rtm_protocol) && 1608 fib_nh_match(r, nlhdr, rta, fi) == 0) { 1609 fa_to_delete = fa; 1610 break; 1611 } 1612 } 1613 1614 if (!fa_to_delete) 1615 return -ESRCH; 1616 1617 fa = fa_to_delete; 1618 rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, nlhdr, req); 1619 1620 l = fib_find_node(t, key); 1621 li = find_leaf_info(l, plen); 1622 1623 
list_del_rcu(&fa->fa_list); 1624 1625 if (list_empty(fa_head)) { 1626 hlist_del_rcu(&li->hlist); 1627 free_leaf_info(li); 1628 } 1629 1630 if (hlist_empty(&l->list)) 1631 trie_leaf_remove(t, key); 1632 1633 if (fa->fa_state & FA_S_ACCESSED) 1634 rt_cache_flush(-1); 1635 1636 fib_release_info(fa->fa_info); 1637 alias_free_mem_rcu(fa); 1638 return 0; 1639 } 1640 1641 static int trie_flush_list(struct trie *t, struct list_head *head) 1642 { 1643 struct fib_alias *fa, *fa_node; 1644 int found = 0; 1645 1646 list_for_each_entry_safe(fa, fa_node, head, fa_list) { 1647 struct fib_info *fi = fa->fa_info; 1648 1649 if (fi && (fi->fib_flags & RTNH_F_DEAD)) { 1650 list_del_rcu(&fa->fa_list); 1651 fib_release_info(fa->fa_info); 1652 alias_free_mem_rcu(fa); 1653 found++; 1654 } 1655 } 1656 return found; 1657 } 1658 1659 static int trie_flush_leaf(struct trie *t, struct leaf *l) 1660 { 1661 int found = 0; 1662 struct hlist_head *lih = &l->list; 1663 struct hlist_node *node, *tmp; 1664 struct leaf_info *li = NULL; 1665 1666 hlist_for_each_entry_safe(li, node, tmp, lih, hlist) { 1667 found += trie_flush_list(t, &li->falh); 1668 1669 if (list_empty(&li->falh)) { 1670 hlist_del_rcu(&li->hlist); 1671 free_leaf_info(li); 1672 } 1673 } 1674 return found; 1675 } 1676 1677 /* rcu_read_lock needs to be hold by caller from readside */ 1678 1679 static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf) 1680 { 1681 struct node *c = (struct node *) thisleaf; 1682 struct tnode *p; 1683 int idx; 1684 struct node *trie = rcu_dereference(t->trie); 1685 1686 if (c == NULL) { 1687 if (trie == NULL) 1688 return NULL; 1689 1690 if (IS_LEAF(trie)) /* trie w. just a leaf */ 1691 return (struct leaf *) trie; 1692 1693 p = (struct tnode*) trie; /* Start */ 1694 } else 1695 p = (struct tnode *) NODE_PARENT(c); 1696 1697 while (p) { 1698 int pos, last; 1699 1700 /* Find the next child of the parent */ 1701 if (c) 1702 pos = 1 + tkey_extract_bits(c->key, p->pos, p->bits); 1703 else 1704 pos = 0; 1705 1706 last = 1 << p->bits; 1707 for (idx = pos; idx < last ; idx++) { 1708 c = rcu_dereference(p->child[idx]); 1709 1710 if (!c) 1711 continue; 1712 1713 /* Decend if tnode */ 1714 while (IS_TNODE(c)) { 1715 p = (struct tnode *) c; 1716 idx = 0; 1717 1718 /* Rightmost non-NULL branch */ 1719 if (p && IS_TNODE(p)) 1720 while (!(c = rcu_dereference(p->child[idx])) 1721 && idx < (1<<p->bits)) idx++; 1722 1723 /* Done with this tnode? */ 1724 if (idx >= (1 << p->bits) || !c) 1725 goto up; 1726 } 1727 return (struct leaf *) c; 1728 } 1729 up: 1730 /* No more children go up one step */ 1731 c = (struct node *) p; 1732 p = (struct tnode *) NODE_PARENT(p); 1733 } 1734 return NULL; /* Ready. 
Root of trie */ 1735 } 1736 1737 static int fn_trie_flush(struct fib_table *tb) 1738 { 1739 struct trie *t = (struct trie *) tb->tb_data; 1740 struct leaf *ll = NULL, *l = NULL; 1741 int found = 0, h; 1742 1743 t->revision++; 1744 1745 for (h = 0; (l = nextleaf(t, l)) != NULL; h++) { 1746 found += trie_flush_leaf(t, l); 1747 1748 if (ll && hlist_empty(&ll->list)) 1749 trie_leaf_remove(t, ll->key); 1750 ll = l; 1751 } 1752 1753 if (ll && hlist_empty(&ll->list)) 1754 trie_leaf_remove(t, ll->key); 1755 1756 pr_debug("trie_flush found=%d\n", found); 1757 return found; 1758 } 1759 1760 static int trie_last_dflt = -1; 1761 1762 static void 1763 fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res) 1764 { 1765 struct trie *t = (struct trie *) tb->tb_data; 1766 int order, last_idx; 1767 struct fib_info *fi = NULL; 1768 struct fib_info *last_resort; 1769 struct fib_alias *fa = NULL; 1770 struct list_head *fa_head; 1771 struct leaf *l; 1772 1773 last_idx = -1; 1774 last_resort = NULL; 1775 order = -1; 1776 1777 rcu_read_lock(); 1778 1779 l = fib_find_node(t, 0); 1780 if (!l) 1781 goto out; 1782 1783 fa_head = get_fa_head(l, 0); 1784 if (!fa_head) 1785 goto out; 1786 1787 if (list_empty(fa_head)) 1788 goto out; 1789 1790 list_for_each_entry_rcu(fa, fa_head, fa_list) { 1791 struct fib_info *next_fi = fa->fa_info; 1792 1793 if (fa->fa_scope != res->scope || 1794 fa->fa_type != RTN_UNICAST) 1795 continue; 1796 1797 if (next_fi->fib_priority > res->fi->fib_priority) 1798 break; 1799 if (!next_fi->fib_nh[0].nh_gw || 1800 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK) 1801 continue; 1802 fa->fa_state |= FA_S_ACCESSED; 1803 1804 if (fi == NULL) { 1805 if (next_fi != res->fi) 1806 break; 1807 } else if (!fib_detect_death(fi, order, &last_resort, 1808 &last_idx, &trie_last_dflt)) { 1809 if (res->fi) 1810 fib_info_put(res->fi); 1811 res->fi = fi; 1812 atomic_inc(&fi->fib_clntref); 1813 trie_last_dflt = order; 1814 goto out; 1815 } 1816 fi = next_fi; 1817 order++; 1818 } 1819 if (order <= 0 || fi == NULL) { 1820 trie_last_dflt = -1; 1821 goto out; 1822 } 1823 1824 if (!fib_detect_death(fi, order, &last_resort, &last_idx, &trie_last_dflt)) { 1825 if (res->fi) 1826 fib_info_put(res->fi); 1827 res->fi = fi; 1828 atomic_inc(&fi->fib_clntref); 1829 trie_last_dflt = order; 1830 goto out; 1831 } 1832 if (last_idx >= 0) { 1833 if (res->fi) 1834 fib_info_put(res->fi); 1835 res->fi = last_resort; 1836 if (last_resort) 1837 atomic_inc(&last_resort->fib_clntref); 1838 } 1839 trie_last_dflt = last_idx; 1840 out:; 1841 rcu_read_unlock(); 1842 } 1843 1844 static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb, 1845 struct sk_buff *skb, struct netlink_callback *cb) 1846 { 1847 int i, s_i; 1848 struct fib_alias *fa; 1849 1850 u32 xkey = htonl(key); 1851 1852 s_i = cb->args[3]; 1853 i = 0; 1854 1855 /* rcu_read_lock is hold by caller */ 1856 1857 list_for_each_entry_rcu(fa, fah, fa_list) { 1858 if (i < s_i) { 1859 i++; 1860 continue; 1861 } 1862 BUG_ON(!fa->fa_info); 1863 1864 if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid, 1865 cb->nlh->nlmsg_seq, 1866 RTM_NEWROUTE, 1867 tb->tb_id, 1868 fa->fa_type, 1869 fa->fa_scope, 1870 &xkey, 1871 plen, 1872 fa->fa_tos, 1873 fa->fa_info, 0) < 0) { 1874 cb->args[3] = i; 1875 return -1; 1876 } 1877 i++; 1878 } 1879 cb->args[3] = i; 1880 return skb->len; 1881 } 1882 1883 static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb, 1884 struct netlink_callback *cb) 1885 { 1886 int h, 
s_h; 1887 struct list_head *fa_head; 1888 struct leaf *l = NULL; 1889 1890 s_h = cb->args[2]; 1891 1892 for (h = 0; (l = nextleaf(t, l)) != NULL; h++) { 1893 if (h < s_h) 1894 continue; 1895 if (h > s_h) 1896 memset(&cb->args[3], 0, 1897 sizeof(cb->args) - 3*sizeof(cb->args[0])); 1898 1899 fa_head = get_fa_head(l, plen); 1900 1901 if (!fa_head) 1902 continue; 1903 1904 if (list_empty(fa_head)) 1905 continue; 1906 1907 if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) { 1908 cb->args[2] = h; 1909 return -1; 1910 } 1911 } 1912 cb->args[2] = h; 1913 return skb->len; 1914 } 1915 1916 static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb) 1917 { 1918 int m, s_m; 1919 struct trie *t = (struct trie *) tb->tb_data; 1920 1921 s_m = cb->args[1]; 1922 1923 rcu_read_lock(); 1924 for (m = 0; m <= 32; m++) { 1925 if (m < s_m) 1926 continue; 1927 if (m > s_m) 1928 memset(&cb->args[2], 0, 1929 sizeof(cb->args) - 2*sizeof(cb->args[0])); 1930 1931 if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) { 1932 cb->args[1] = m; 1933 goto out; 1934 } 1935 } 1936 rcu_read_unlock(); 1937 cb->args[1] = m; 1938 return skb->len; 1939 out: 1940 rcu_read_unlock(); 1941 return -1; 1942 } 1943 1944 /* Fix more generic FIB names for init later */ 1945 1946 #ifdef CONFIG_IP_MULTIPLE_TABLES 1947 struct fib_table * fib_hash_init(int id) 1948 #else 1949 struct fib_table * __init fib_hash_init(int id) 1950 #endif 1951 { 1952 struct fib_table *tb; 1953 struct trie *t; 1954 1955 if (fn_alias_kmem == NULL) 1956 fn_alias_kmem = kmem_cache_create("ip_fib_alias", 1957 sizeof(struct fib_alias), 1958 0, SLAB_HWCACHE_ALIGN, 1959 NULL, NULL); 1960 1961 tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie), 1962 GFP_KERNEL); 1963 if (tb == NULL) 1964 return NULL; 1965 1966 tb->tb_id = id; 1967 tb->tb_lookup = fn_trie_lookup; 1968 tb->tb_insert = fn_trie_insert; 1969 tb->tb_delete = fn_trie_delete; 1970 tb->tb_flush = fn_trie_flush; 1971 tb->tb_select_default = fn_trie_select_default; 1972 tb->tb_dump = fn_trie_dump; 1973 memset(tb->tb_data, 0, sizeof(struct trie)); 1974 1975 t = (struct trie *) tb->tb_data; 1976 1977 trie_init(t); 1978 1979 if (id == RT_TABLE_LOCAL) 1980 trie_local = t; 1981 else if (id == RT_TABLE_MAIN) 1982 trie_main = t; 1983 1984 if (id == RT_TABLE_LOCAL) 1985 printk(KERN_INFO "IPv4 FIB: Using LC-trie version %s\n", VERSION); 1986 1987 return tb; 1988 } 1989 1990 #ifdef CONFIG_PROC_FS 1991 /* Depth first Trie walk iterator */ 1992 struct fib_trie_iter { 1993 struct tnode *tnode; 1994 struct trie *trie; 1995 unsigned index; 1996 unsigned depth; 1997 }; 1998 1999 static struct node *fib_trie_get_next(struct fib_trie_iter *iter) 2000 { 2001 struct tnode *tn = iter->tnode; 2002 unsigned cindex = iter->index; 2003 struct tnode *p; 2004 2005 pr_debug("get_next iter={node=%p index=%d depth=%d}\n", 2006 iter->tnode, iter->index, iter->depth); 2007 rescan: 2008 while (cindex < (1<<tn->bits)) { 2009 struct node *n = tnode_get_child(tn, cindex); 2010 2011 if (n) { 2012 if (IS_LEAF(n)) { 2013 iter->tnode = tn; 2014 iter->index = cindex + 1; 2015 } else { 2016 /* push down one level */ 2017 iter->tnode = (struct tnode *) n; 2018 iter->index = 0; 2019 ++iter->depth; 2020 } 2021 return n; 2022 } 2023 2024 ++cindex; 2025 } 2026 2027 /* Current node exhausted, pop back up */ 2028 p = NODE_PARENT(tn); 2029 if (p) { 2030 cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1; 2031 tn = p; 2032 --iter->depth; 2033 goto rescan; 2034 } 2035 2036 /* got root? 
*/ 2037 return NULL; 2038 } 2039 2040 static struct node *fib_trie_get_first(struct fib_trie_iter *iter, 2041 struct trie *t) 2042 { 2043 struct node *n = rcu_dereference(t->trie); 2044 2045 if (n && IS_TNODE(n)) { 2046 iter->tnode = (struct tnode *) n; 2047 iter->trie = t; 2048 iter->index = 0; 2049 iter->depth = 1; 2050 return n; 2051 } 2052 return NULL; 2053 } 2054 2055 static void trie_collect_stats(struct trie *t, struct trie_stat *s) 2056 { 2057 struct node *n; 2058 struct fib_trie_iter iter; 2059 2060 memset(s, 0, sizeof(*s)); 2061 2062 rcu_read_lock(); 2063 for (n = fib_trie_get_first(&iter, t); n; 2064 n = fib_trie_get_next(&iter)) { 2065 if (IS_LEAF(n)) { 2066 s->leaves++; 2067 s->totdepth += iter.depth; 2068 if (iter.depth > s->maxdepth) 2069 s->maxdepth = iter.depth; 2070 } else { 2071 const struct tnode *tn = (const struct tnode *) n; 2072 int i; 2073 2074 s->tnodes++; 2075 s->nodesizes[tn->bits]++; 2076 for (i = 0; i < (1<<tn->bits); i++) 2077 if (!tn->child[i]) 2078 s->nullpointers++; 2079 } 2080 } 2081 rcu_read_unlock(); 2082 } 2083 2084 /* 2085 * This outputs /proc/net/fib_triestats 2086 */ 2087 static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat) 2088 { 2089 unsigned i, max, pointers, bytes, avdepth; 2090 2091 if (stat->leaves) 2092 avdepth = stat->totdepth*100 / stat->leaves; 2093 else 2094 avdepth = 0; 2095 2096 seq_printf(seq, "\tAver depth: %d.%02d\n", avdepth / 100, avdepth % 100 ); 2097 seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth); 2098 2099 seq_printf(seq, "\tLeaves: %u\n", stat->leaves); 2100 2101 bytes = sizeof(struct leaf) * stat->leaves; 2102 seq_printf(seq, "\tInternal nodes: %d\n\t", stat->tnodes); 2103 bytes += sizeof(struct tnode) * stat->tnodes; 2104 2105 max = MAX_CHILDS-1; 2106 while (max >= 0 && stat->nodesizes[max] == 0) 2107 max--; 2108 2109 pointers = 0; 2110 for (i = 1; i <= max; i++) 2111 if (stat->nodesizes[i] != 0) { 2112 seq_printf(seq, " %d: %d", i, stat->nodesizes[i]); 2113 pointers += (1<<i) * stat->nodesizes[i]; 2114 } 2115 seq_putc(seq, '\n'); 2116 seq_printf(seq, "\tPointers: %d\n", pointers); 2117 2118 bytes += sizeof(struct node *) * pointers; 2119 seq_printf(seq, "Null ptrs: %d\n", stat->nullpointers); 2120 seq_printf(seq, "Total size: %d kB\n", (bytes + 1023) / 1024); 2121 2122 #ifdef CONFIG_IP_FIB_TRIE_STATS 2123 seq_printf(seq, "Counters:\n---------\n"); 2124 seq_printf(seq,"gets = %d\n", t->stats.gets); 2125 seq_printf(seq,"backtracks = %d\n", t->stats.backtrack); 2126 seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed); 2127 seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss); 2128 seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit); 2129 seq_printf(seq,"skipped node resize = %d\n", t->stats.resize_node_skipped); 2130 #ifdef CLEAR_STATS 2131 memset(&(t->stats), 0, sizeof(t->stats)); 2132 #endif 2133 #endif /* CONFIG_IP_FIB_TRIE_STATS */ 2134 } 2135 2136 static int fib_triestat_seq_show(struct seq_file *seq, void *v) 2137 { 2138 struct trie_stat *stat; 2139 2140 stat = kmalloc(sizeof(*stat), GFP_KERNEL); 2141 if (!stat) 2142 return -ENOMEM; 2143 2144 seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n", 2145 sizeof(struct leaf), sizeof(struct tnode)); 2146 2147 if (trie_local) { 2148 seq_printf(seq, "Local:\n"); 2149 trie_collect_stats(trie_local, stat); 2150 trie_show_stats(seq, stat); 2151 } 2152 2153 if (trie_main) { 2154 seq_printf(seq, "Main:\n"); 2155 trie_collect_stats(trie_main, stat); 2156 
static int fib_triestat_seq_open(struct inode *inode, struct file *file)
{
	return single_open(file, fib_triestat_seq_show, NULL);
}

static struct file_operations fib_triestat_fops = {
	.owner   = THIS_MODULE,
	.open    = fib_triestat_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = single_release,
};

static struct node *fib_trie_get_idx(struct fib_trie_iter *iter,
				     loff_t pos)
{
	loff_t idx = 0;
	struct node *n;

	for (n = fib_trie_get_first(iter, trie_local);
	     n; ++idx, n = fib_trie_get_next(iter)) {
		if (pos == idx)
			return n;
	}

	for (n = fib_trie_get_first(iter, trie_main);
	     n; ++idx, n = fib_trie_get_next(iter)) {
		if (pos == idx)
			return n;
	}
	return NULL;
}

static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
{
	rcu_read_lock();
	if (*pos == 0)
		return SEQ_START_TOKEN;
	return fib_trie_get_idx(seq->private, *pos - 1);
}

static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct fib_trie_iter *iter = seq->private;
	void *l = v;

	++*pos;
	if (v == SEQ_START_TOKEN)
		return fib_trie_get_idx(iter, 0);

	v = fib_trie_get_next(iter);
	BUG_ON(v == l);
	if (v)
		return v;

	/* continue scan in next trie */
	if (iter->trie == trie_local)
		return fib_trie_get_first(iter, trie_main);

	return NULL;
}

static void fib_trie_seq_stop(struct seq_file *seq, void *v)
{
	rcu_read_unlock();
}

static void seq_indent(struct seq_file *seq, int n)
{
	while (n-- > 0)
		seq_puts(seq, " ");
}

static inline const char *rtn_scope(enum rt_scope_t s)
{
	static char buf[32];

	switch (s) {
	case RT_SCOPE_UNIVERSE: return "universe";
	case RT_SCOPE_SITE:     return "site";
	case RT_SCOPE_LINK:     return "link";
	case RT_SCOPE_HOST:     return "host";
	case RT_SCOPE_NOWHERE:  return "nowhere";
	default:
		snprintf(buf, sizeof(buf), "scope=%d", s);
		return buf;
	}
}

static const char *rtn_type_names[__RTN_MAX] = {
	[RTN_UNSPEC]      = "UNSPEC",
	[RTN_UNICAST]     = "UNICAST",
	[RTN_LOCAL]       = "LOCAL",
	[RTN_BROADCAST]   = "BROADCAST",
	[RTN_ANYCAST]     = "ANYCAST",
	[RTN_MULTICAST]   = "MULTICAST",
	[RTN_BLACKHOLE]   = "BLACKHOLE",
	[RTN_UNREACHABLE] = "UNREACHABLE",
	[RTN_PROHIBIT]    = "PROHIBIT",
	[RTN_THROW]       = "THROW",
	[RTN_NAT]         = "NAT",
	[RTN_XRESOLVE]    = "XRESOLVE",
};

static inline const char *rtn_type(unsigned t)
{
	static char buf[32];

	if (t < __RTN_MAX && rtn_type_names[t])
		return rtn_type_names[t];
	snprintf(buf, sizeof(buf), "type %d", t);
	return buf;
}

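/*
 * seq_file flow for /proc/net/fib_trie: _start() takes the RCU read lock
 * and hands back SEQ_START_TOKEN for position 0; on a restart it replays
 * the walk with fib_trie_get_idx(), counting nodes until the saved
 * position is reached.  _next() advances the iterator and, once the local
 * trie is exhausted, chains straight into the main trie, so one pass over
 * the file shows both tables.  _stop() drops the RCU read lock.
 */
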
/* Pretty print the trie */
static int fib_trie_seq_show(struct seq_file *seq, void *v)
{
	const struct fib_trie_iter *iter = seq->private;
	struct node *n = v;

	if (v == SEQ_START_TOKEN)
		return 0;

	if (IS_TNODE(n)) {
		struct tnode *tn = (struct tnode *) n;
		t_key prf = ntohl(MASK_PFX(tn->key, tn->pos));

		if (!NODE_PARENT(n)) {
			if (iter->trie == trie_local)
				seq_puts(seq, "<local>:\n");
			else
				seq_puts(seq, "<main>:\n");
		}
		seq_indent(seq, iter->depth-1);
		seq_printf(seq, " +-- %d.%d.%d.%d/%d %d %d %d\n",
			   NIPQUAD(prf), tn->pos, tn->bits, tn->full_children,
			   tn->empty_children);

	} else {
		struct leaf *l = (struct leaf *) n;
		int i;
		u32 val = ntohl(l->key);

		seq_indent(seq, iter->depth);
		seq_printf(seq, " |-- %d.%d.%d.%d\n", NIPQUAD(val));
		for (i = 32; i >= 0; i--) {
			struct leaf_info *li = find_leaf_info(l, i);
			if (li) {
				struct fib_alias *fa;
				list_for_each_entry_rcu(fa, &li->falh, fa_list) {
					seq_indent(seq, iter->depth+1);
					seq_printf(seq, " /%d %s %s", i,
						   rtn_scope(fa->fa_scope),
						   rtn_type(fa->fa_type));
					if (fa->fa_tos)
						seq_printf(seq, " tos=%d",
							   fa->fa_tos);
					seq_putc(seq, '\n');
				}
			}
		}
	}

	return 0;
}

static struct seq_operations fib_trie_seq_ops = {
	.start = fib_trie_seq_start,
	.next  = fib_trie_seq_next,
	.stop  = fib_trie_seq_stop,
	.show  = fib_trie_seq_show,
};

static int fib_trie_seq_open(struct inode *inode, struct file *file)
{
	struct seq_file *seq;
	int rc = -ENOMEM;
	struct fib_trie_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);

	if (!s)
		goto out;

	rc = seq_open(file, &fib_trie_seq_ops);
	if (rc)
		goto out_kfree;

	seq = file->private_data;
	seq->private = s;
	memset(s, 0, sizeof(*s));
out:
	return rc;
out_kfree:
	kfree(s);
	goto out;
}

static struct file_operations fib_trie_fops = {
	.owner   = THIS_MODULE,
	.open    = fib_trie_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_private,
};

static unsigned fib_flag_trans(int type, u32 mask, const struct fib_info *fi)
{
	static unsigned type2flags[RTN_MAX + 1] = {
		[RTN_UNREACHABLE] = RTF_REJECT,
		[RTN_PROHIBIT]    = RTF_REJECT,
	};
	unsigned flags = type2flags[type];

	if (fi && fi->fib_nh->nh_gw)
		flags |= RTF_GATEWAY;
	if (mask == 0xFFFFFFFF)
		flags |= RTF_HOST;
	flags |= RTF_UP;
	return flags;
}

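/*
 * fib_flag_trans() reconstructs the classic route flags for the legacy
 * /proc/net/route format below: RTN_UNREACHABLE and RTN_PROHIBIT map to
 * RTF_REJECT, a nexthop gateway adds RTF_GATEWAY, a /32 mask adds RTF_HOST,
 * and RTF_UP is always set.  For example, a default route via a gateway
 * comes out as RTF_UP|RTF_GATEWAY, i.e. 0003 in the Flags column.
 */
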
/*
 * This outputs /proc/net/route.
 * The format of the file is not supposed to be changed
 * and needs to be the same as the fib_hash output to avoid breaking
 * legacy utilities.
 */
static int fib_route_seq_show(struct seq_file *seq, void *v)
{
	const struct fib_trie_iter *iter = seq->private;
	struct leaf *l = v;
	int i;
	char bf[128];

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
			   "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
			   "\tWindow\tIRTT");
		return 0;
	}

	if (iter->trie == trie_local)
		return 0;
	if (IS_TNODE(l))
		return 0;

	for (i = 32; i >= 0; i--) {
		struct leaf_info *li = find_leaf_info(l, i);
		struct fib_alias *fa;
		u32 mask, prefix;

		if (!li)
			continue;

		mask = inet_make_mask(li->plen);
		prefix = htonl(l->key);

		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
			const struct fib_info *fi = fa->fa_info;
			unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);

			if (fa->fa_type == RTN_BROADCAST
			    || fa->fa_type == RTN_MULTICAST)
				continue;

			if (fi)
				snprintf(bf, sizeof(bf),
					 "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
					 fi->fib_dev ? fi->fib_dev->name : "*",
					 prefix,
					 fi->fib_nh->nh_gw, flags, 0, 0,
					 fi->fib_priority,
					 mask,
					 (fi->fib_advmss ? fi->fib_advmss + 40 : 0),
					 fi->fib_window,
					 fi->fib_rtt >> 3);
			else
				snprintf(bf, sizeof(bf),
					 "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
					 prefix, 0, flags, 0, 0, 0,
					 mask, 0, 0, 0);

			seq_printf(seq, "%-127s\n", bf);
		}
	}

	return 0;
}

static struct seq_operations fib_route_seq_ops = {
	.start = fib_trie_seq_start,
	.next  = fib_trie_seq_next,
	.stop  = fib_trie_seq_stop,
	.show  = fib_route_seq_show,
};

static int fib_route_seq_open(struct inode *inode, struct file *file)
{
	struct seq_file *seq;
	int rc = -ENOMEM;
	struct fib_trie_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);

	if (!s)
		goto out;

	rc = seq_open(file, &fib_route_seq_ops);
	if (rc)
		goto out_kfree;

	seq = file->private_data;
	seq->private = s;
	memset(s, 0, sizeof(*s));
out:
	return rc;
out_kfree:
	kfree(s);
	goto out;
}

static struct file_operations fib_route_fops = {
	.owner   = THIS_MODULE,
	.open    = fib_route_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_private,
};

int __init fib_proc_init(void)
{
	if (!proc_net_fops_create("fib_trie", S_IRUGO, &fib_trie_fops))
		goto out1;

	if (!proc_net_fops_create("fib_triestat", S_IRUGO, &fib_triestat_fops))
		goto out2;

	if (!proc_net_fops_create("route", S_IRUGO, &fib_route_fops))
		goto out3;

	return 0;

out3:
	proc_net_remove("fib_triestat");
out2:
	proc_net_remove("fib_trie");
out1:
	return -ENOMEM;
}

void __init fib_proc_exit(void)
{
	proc_net_remove("fib_trie");
	proc_net_remove("fib_triestat");
	proc_net_remove("route");
}

#endif /* CONFIG_PROC_FS */
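
/*
 * The /proc/net entries registered by fib_proc_init() above:
 *
 *   fib_trie      - indented structural dump of the local and main tries
 *                   (fib_trie_seq_show)
 *   fib_triestat  - per-trie node counts, depth and memory statistics
 *                   (fib_triestat_seq_show)
 *   route         - legacy, fib_hash-compatible route listing
 *                   (fib_route_seq_show)
 *
 * fib_proc_init() unwinds already-registered entries on failure;
 * fib_proc_exit() removes all three.  Both are expected to be called from
 * the generic IPv4 FIB initialisation code outside this file.
 */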