/*
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
 *   & Swedish University of Agricultural Sciences.
 *
 * Jens Laas <jens.laas@data.slu.se> Swedish University of
 *   Agricultural Sciences.
 *
 * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
 *
 * This work is based on the LPC-trie which is originally described in:
 *
 * An experimental study of compression methods for dynamic tries
 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
 * http://www.csc.kth.se/~snilsson/software/dyntrie2/
 *
 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
 *
 * Code from fib_hash has been reused which includes the following header:
 *
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IPv4 FIB: lookup engine and maintenance routines.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Substantial contributions to this work come from:
 *
 *		David S. Miller, <davem@davemloft.net>
 *		Stephen Hemminger <shemminger@osdl.org>
 *		Paul E. McKenney <paulmck@us.ibm.com>
 *		Patrick McHardy <kaber@trash.net>
 */
#define VERSION "0.409"

#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/inetdevice.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/skbuff.h>
#include <linux/netlink.h>
#include <linux/init.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/export.h>
#include <linux/vmalloc.h>
#include <linux/notifier.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <trace/events/fib.h>
#include "fib_lookup.h"

static unsigned int fib_seq_sum(void)
{
	unsigned int fib_seq = 0;
	struct net *net;

	rtnl_lock();
	for_each_net(net)
		fib_seq += net->ipv4.fib_seq;
	rtnl_unlock();

	return fib_seq;
}

static ATOMIC_NOTIFIER_HEAD(fib_chain);

static int call_fib_notifier(struct notifier_block *nb, struct net *net,
			     enum fib_event_type event_type,
			     struct fib_notifier_info *info)
{
	info->net = net;
	return nb->notifier_call(nb, event_type, info);
}

static void fib_rules_notify(struct net *net, struct notifier_block *nb,
			     enum fib_event_type event_type)
{
#ifdef CONFIG_IP_MULTIPLE_TABLES
	struct fib_notifier_info info;

	if (net->ipv4.fib_has_custom_rules)
		call_fib_notifier(nb, net, event_type, &info);
#endif
}

static void fib_notify(struct net *net, struct notifier_block *nb,
		       enum fib_event_type event_type);

static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net,
				   enum fib_event_type event_type, u32 dst,
				   int dst_len, struct fib_info *fi,
				   u8 tos, u8 type, u32 tb_id, u32 nlflags)
{
	struct fib_entry_notifier_info info = {
		.dst = dst,
		.dst_len = dst_len,
		.fi = fi,
		.tos = tos,
		.type = type,
		.tb_id = tb_id,
		.nlflags = nlflags,
	};
	return call_fib_notifier(nb, net, event_type, &info.info);
}

static bool fib_dump_is_consistent(struct notifier_block *nb,
				   void (*cb)(struct notifier_block *nb),
				   unsigned int fib_seq)
{
	atomic_notifier_chain_register(&fib_chain, nb);
	if (fib_seq == fib_seq_sum())
		return true;
	atomic_notifier_chain_unregister(&fib_chain, nb);
	if (cb)
		cb(nb);
	return false;
}

#define FIB_DUMP_MAX_RETRIES 5
int register_fib_notifier(struct notifier_block *nb,
			  void (*cb)(struct notifier_block *nb))
{
	int retries = 0;

	do {
		unsigned int fib_seq = fib_seq_sum();
		struct net *net;

		/* Mutex semantics guarantee that every change done to
		 * FIB tries before we read the change sequence counter
		 * is now visible to us.
		 */
		rcu_read_lock();
		for_each_net_rcu(net) {
			fib_rules_notify(net, nb, FIB_EVENT_RULE_ADD);
			fib_notify(net, nb, FIB_EVENT_ENTRY_ADD);
		}
		rcu_read_unlock();

		if (fib_dump_is_consistent(nb, cb, fib_seq))
			return 0;
	} while (++retries < FIB_DUMP_MAX_RETRIES);

	return -EBUSY;
}
EXPORT_SYMBOL(register_fib_notifier);
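
/* Usage sketch (illustrative only; the my_* and offload() names below are
 * placeholders, not kernel APIs): a consumer such as a switchdev driver
 * registers a notifier block and has the current FIB state replayed to it:
 *
 *	static int my_fib_event(struct notifier_block *nb,
 *				unsigned long event, void *ptr)
 *	{
 *		struct fib_entry_notifier_info *info = ptr;
 *
 *		if (event == FIB_EVENT_ENTRY_ADD)
 *			offload(info->dst, info->dst_len, info->fi);
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block my_nb = {
 *		.notifier_call = my_fib_event,
 *	};
 *
 *	err = register_fib_notifier(&my_nb, my_dump_aborted_cb);
 *
 * If the FIB changes while the initial dump is being replayed, the dump is
 * retried up to FIB_DUMP_MAX_RETRIES times before giving up with -EBUSY.
 */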

int unregister_fib_notifier(struct notifier_block *nb)
{
	return atomic_notifier_chain_unregister(&fib_chain, nb);
}
EXPORT_SYMBOL(unregister_fib_notifier);

int call_fib_notifiers(struct net *net, enum fib_event_type event_type,
		       struct fib_notifier_info *info)
{
	net->ipv4.fib_seq++;
	info->net = net;
	return atomic_notifier_call_chain(&fib_chain, event_type, info);
}

static int call_fib_entry_notifiers(struct net *net,
				    enum fib_event_type event_type, u32 dst,
				    int dst_len, struct fib_info *fi,
				    u8 tos, u8 type, u32 tb_id, u32 nlflags)
{
	struct fib_entry_notifier_info info = {
		.dst = dst,
		.dst_len = dst_len,
		.fi = fi,
		.tos = tos,
		.type = type,
		.tb_id = tb_id,
		.nlflags = nlflags,
	};
	return call_fib_notifiers(net, event_type, &info.info);
}

#define MAX_STAT_DEPTH 32

#define KEYLENGTH	(8*sizeof(t_key))
#define KEY_MAX		((t_key)~0)

typedef unsigned int t_key;

#define IS_TRIE(n)	((n)->pos >= KEYLENGTH)
#define IS_TNODE(n)	((n)->bits)
#define IS_LEAF(n)	(!(n)->bits)

struct key_vector {
	t_key key;
	unsigned char pos;		/* 2log(KEYLENGTH) bits needed */
	unsigned char bits;		/* 2log(KEYLENGTH) bits needed */
	unsigned char slen;
	union {
		/* This list pointer is valid if (pos | bits) == 0 (LEAF) */
		struct hlist_head leaf;
		/* This array is valid if (pos | bits) > 0 (TNODE) */
		struct key_vector __rcu *tnode[0];
	};
};

struct tnode {
	struct rcu_head rcu;
	t_key empty_children;		/* KEYLENGTH bits needed */
	t_key full_children;		/* KEYLENGTH bits needed */
	struct key_vector __rcu *parent;
	struct key_vector kv[1];
#define tn_bits kv[0].bits
};

#define TNODE_SIZE(n)	offsetof(struct tnode, kv[0].tnode[n])
#define LEAF_SIZE	TNODE_SIZE(1)

#ifdef CONFIG_IP_FIB_TRIE_STATS
struct trie_use_stats {
	unsigned int gets;
	unsigned int backtrack;
	unsigned int semantic_match_passed;
	unsigned int semantic_match_miss;
	unsigned int null_node_hit;
	unsigned int resize_node_skipped;
};
#endif

struct trie_stat {
	unsigned int totdepth;
	unsigned int maxdepth;
	unsigned int tnodes;
	unsigned int leaves;
	unsigned int nullpointers;
	unsigned int prefixes;
	unsigned int nodesizes[MAX_STAT_DEPTH];
};

struct trie {
	struct key_vector kv[1];
#ifdef CONFIG_IP_FIB_TRIE_STATS
	struct trie_use_stats __percpu *stats;
#endif
};

static struct key_vector *resize(struct trie *t, struct key_vector *tn);
static size_t tnode_free_size;

/*
 * synchronize_rcu after call_rcu for that many pages; it should be especially
 * useful before resizing the root node with PREEMPT_NONE configs; the value
 * was obtained experimentally, aiming to avoid visible slowdown.
 */
static const int sync_pages = 128;

static struct kmem_cache *fn_alias_kmem __read_mostly;
static struct kmem_cache *trie_leaf_kmem __read_mostly;

static inline struct tnode *tn_info(struct key_vector *kv)
{
	return container_of(kv, struct tnode, kv[0]);
}

/* caller must hold RTNL */
#define node_parent(tn) rtnl_dereference(tn_info(tn)->parent)
#define get_child(tn, i) rtnl_dereference((tn)->tnode[i])

/* caller must hold RCU read lock or RTNL */
#define node_parent_rcu(tn) rcu_dereference_rtnl(tn_info(tn)->parent)
#define get_child_rcu(tn, i) rcu_dereference_rtnl((tn)->tnode[i])

/* wrapper for rcu_assign_pointer */
static inline void node_set_parent(struct key_vector *n, struct key_vector *tp)
{
	if (n)
		rcu_assign_pointer(tn_info(n)->parent, tp);
}

#define NODE_INIT_PARENT(n, p) RCU_INIT_POINTER(tn_info(n)->parent, p)

/* This provides us with the number of children in this node; in the case of
 * a leaf this will return 0 meaning none of the children are accessible.
 */
static inline unsigned long child_length(const struct key_vector *tn)
{
	return (1ul << tn->bits) & ~(1ul);
}

#define get_cindex(key, kv) (((key) ^ (kv)->key) >> (kv)->pos)

static inline unsigned long get_index(t_key key, struct key_vector *kv)
{
	unsigned long index = key ^ kv->key;

	if ((BITS_PER_LONG <= KEYLENGTH) && (KEYLENGTH == kv->pos))
		return 0;

	return index >> kv->pos;
}
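
/* Worked example (illustrative): if kv->key = 0x80000000 and kv->pos = 24,
 * then for key = 0x85000000 we get
 * index = (0x80000000 ^ 0x85000000) >> 24 = 0x05.
 * Whether 5 is a valid child index for kv is then checked against
 * (1ul << kv->bits) by the callers below.
 */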

/* To understand this stuff, an understanding of keys and all their bits is
 * necessary. Every node in the trie has a key associated with it, but not
 * all of the bits in that key are significant.
 *
 * Consider a node 'n' and its parent 'tp'.
 *
 * If n is a leaf, every bit in its key is significant. Its presence is
 * necessitated by path compression, since during a tree traversal (when
 * searching for a leaf - unless we are doing an insertion) we will completely
 * ignore all skipped bits we encounter. Thus we need to verify, at the end of
 * a potentially successful search, that we have indeed been walking the
 * correct key path.
 *
 * Note that we can never "miss" the correct key in the tree if present by
 * following the wrong path. Path compression ensures that segments of the key
 * that are the same for all keys with a given prefix are skipped, but the
 * skipped part *is* identical for each node in the subtrie below the skipped
 * bit! trie_insert() in this implementation takes care of that.
 *
 * If n is an internal node - a 'tnode' here, the various parts of its key
 * have many different meanings.
 *
 * Example:
 * _________________________________________________________________
 * | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
 * -----------------------------------------------------------------
 *  31  30  29  28  27  26  25  24  23  22  21  20  19  18  17  16
 *
 * _________________________________________________________________
 * | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
 * -----------------------------------------------------------------
 *  15  14  13  12  11  10   9   8   7   6   5   4   3   2   1   0
 *
 * tp->pos = 22
 * tp->bits = 3
 * n->pos = 13
 * n->bits = 4
 *
 * First, let's just ignore the bits that come before the parent tp, that is
 * the bits from (tp->pos + tp->bits) to 31. They are *known* but at this
 * point we do not use them for anything.
 *
 * The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
 * index into the parent's child array. That is, they will be used to find
 * 'n' among tp's children.
 *
 * The bits from (n->pos + n->bits) to (tp->pos - 1) - "S" - are skipped bits
 * for the node n.
 *
 * All the bits we have seen so far are significant to the node n. The rest
 * of the bits are really not needed or indeed known in n->key.
 *
 * The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
 * n's child array, and will of course be different for each child.
 *
 * The rest of the bits, from 0 to (n->pos - 1) - "u" - are completely unknown
 * at this point.
 */
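
/* Continuing the example above (illustrative): with tp->pos = 22 and
 * tp->bits = 3, bits 24..22 ("N") select one of tp's 8 children. With
 * n->pos = 13 and n->bits = 4, bits 21..17 ("S") were skipped on the way
 * down and must match n->key exactly, while bits 16..13 ("C") select one
 * of n's 16 children.
 */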

static const int halve_threshold = 25;
static const int inflate_threshold = 50;
static const int halve_threshold_root = 15;
static const int inflate_threshold_root = 30;

static void __alias_free_mem(struct rcu_head *head)
{
	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
	kmem_cache_free(fn_alias_kmem, fa);
}

static inline void alias_free_mem_rcu(struct fib_alias *fa)
{
	call_rcu(&fa->rcu, __alias_free_mem);
}

#define TNODE_KMALLOC_MAX \
	ilog2((PAGE_SIZE - TNODE_SIZE(0)) / sizeof(struct key_vector *))
#define TNODE_VMALLOC_MAX \
	ilog2((SIZE_MAX - TNODE_SIZE(0)) / sizeof(struct key_vector *))

static void __node_free_rcu(struct rcu_head *head)
{
	struct tnode *n = container_of(head, struct tnode, rcu);

	if (!n->tn_bits)
		kmem_cache_free(trie_leaf_kmem, n);
	else
		kvfree(n);
}

#define node_free(n) call_rcu(&tn_info(n)->rcu, __node_free_rcu)

static struct tnode *tnode_alloc(int bits)
{
	size_t size;

	/* verify bits is within bounds */
	if (bits > TNODE_VMALLOC_MAX)
		return NULL;

	/* determine size and verify it is non-zero and didn't overflow */
	size = TNODE_SIZE(1ul << bits);

	if (size <= PAGE_SIZE)
		return kzalloc(size, GFP_KERNEL);
	else
		return vzalloc(size);
}

static inline void empty_child_inc(struct key_vector *n)
{
	++tn_info(n)->empty_children ? : ++tn_info(n)->full_children;
}

static inline void empty_child_dec(struct key_vector *n)
{
	tn_info(n)->empty_children-- ? : tn_info(n)->full_children--;
}
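
/* Note (illustrative): empty_children is a full t_key wide because a node
 * with bits == KEYLENGTH has more children than fit in 32 bits. The "?:"
 * forms above let the counter overflow into full_children: an increment
 * that wraps empty_children from KEY_MAX to 0 bumps full_children instead,
 * and a decrement of 0 (wrapping back to KEY_MAX) takes one from
 * full_children again.
 */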

static struct key_vector *leaf_new(t_key key, struct fib_alias *fa)
{
	struct key_vector *l;
	struct tnode *kv;

	kv = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
	if (!kv)
		return NULL;

	/* initialize key vector */
	l = kv->kv;
	l->key = key;
	l->pos = 0;
	l->bits = 0;
	l->slen = fa->fa_slen;

	/* link leaf to fib alias */
	INIT_HLIST_HEAD(&l->leaf);
	hlist_add_head(&fa->fa_list, &l->leaf);

	return l;
}

static struct key_vector *tnode_new(t_key key, int pos, int bits)
{
	unsigned int shift = pos + bits;
	struct key_vector *tn;
	struct tnode *tnode;

	/* verify bits and pos have their msb bits clear and are valid */
	BUG_ON(!bits || (shift > KEYLENGTH));

	tnode = tnode_alloc(bits);
	if (!tnode)
		return NULL;

	pr_debug("AT %p s=%zu %zu\n", tnode, TNODE_SIZE(0),
		 sizeof(struct key_vector *) << bits);

	if (bits == KEYLENGTH)
		tnode->full_children = 1;
	else
		tnode->empty_children = 1ul << bits;

	tn = tnode->kv;
	tn->key = (shift < KEYLENGTH) ? (key >> shift) << shift : 0;
	tn->pos = pos;
	tn->bits = bits;
	tn->slen = pos;

	return tn;
}

/* Check whether a tnode 'n' is "full", i.e. it is an internal node
 * and no bits are skipped. See discussion in dyntree paper p. 6
 */
static inline int tnode_full(struct key_vector *tn, struct key_vector *n)
{
	return n && ((n->pos + n->bits) == tn->pos) && IS_TNODE(n);
}

/* Add a child at position i overwriting the old value.
 * Update the value of full_children and empty_children.
 */
static void put_child(struct key_vector *tn, unsigned long i,
		      struct key_vector *n)
{
	struct key_vector *chi = get_child(tn, i);
	int isfull, wasfull;

	BUG_ON(i >= child_length(tn));

	/* update emptyChildren, overflow into fullChildren */
	if (!n && chi)
		empty_child_inc(tn);
	if (n && !chi)
		empty_child_dec(tn);

	/* update fullChildren */
	wasfull = tnode_full(tn, chi);
	isfull = tnode_full(tn, n);

	if (wasfull && !isfull)
		tn_info(tn)->full_children--;
	else if (!wasfull && isfull)
		tn_info(tn)->full_children++;

	if (n && (tn->slen < n->slen))
		tn->slen = n->slen;

	rcu_assign_pointer(tn->tnode[i], n);
}

static void update_children(struct key_vector *tn)
{
	unsigned long i;

	/* update all of the child parent pointers */
	for (i = child_length(tn); i;) {
		struct key_vector *inode = get_child(tn, --i);

		if (!inode)
			continue;

		/* Either update the children of a tnode that
		 * already belongs to us or update the child
		 * to point to ourselves.
		 */
		if (node_parent(inode) == tn)
			update_children(inode);
		else
			node_set_parent(inode, tn);
	}
}

static inline void put_child_root(struct key_vector *tp, t_key key,
				  struct key_vector *n)
{
	if (IS_TRIE(tp))
		rcu_assign_pointer(tp->tnode[0], n);
	else
		put_child(tp, get_index(key, tp), n);
}

static inline void tnode_free_init(struct key_vector *tn)
{
	tn_info(tn)->rcu.next = NULL;
}

static inline void tnode_free_append(struct key_vector *tn,
				     struct key_vector *n)
{
	tn_info(n)->rcu.next = tn_info(tn)->rcu.next;
	tn_info(tn)->rcu.next = &tn_info(n)->rcu;
}

static void tnode_free(struct key_vector *tn)
{
	struct callback_head *head = &tn_info(tn)->rcu;

	while (head) {
		head = head->next;
		tnode_free_size += TNODE_SIZE(1ul << tn->bits);
		node_free(tn);

		tn = container_of(head, struct tnode, rcu)->kv;
	}

	if (tnode_free_size >= PAGE_SIZE * sync_pages) {
		tnode_free_size = 0;
		synchronize_rcu();
	}
}

static struct key_vector *replace(struct trie *t,
				  struct key_vector *oldtnode,
				  struct key_vector *tn)
{
	struct key_vector *tp = node_parent(oldtnode);
	unsigned long i;

	/* setup the parent pointer out of and back into this node */
	NODE_INIT_PARENT(tn, tp);
	put_child_root(tp, tn->key, tn);

	/* update all of the child parent pointers */
	update_children(tn);

	/* all pointers should be clean so we are done */
	tnode_free(oldtnode);

	/* resize children now that oldtnode is freed */
	for (i = child_length(tn); i;) {
		struct key_vector *inode = get_child(tn, --i);

		/* resize child node */
		if (tnode_full(tn, inode))
			tn = resize(t, inode);
	}

	return tp;
}

static struct key_vector *inflate(struct trie *t,
				  struct key_vector *oldtnode)
{
	struct key_vector *tn;
	unsigned long i;
	t_key m;

	pr_debug("In inflate\n");

	tn = tnode_new(oldtnode->key, oldtnode->pos - 1, oldtnode->bits + 1);
	if (!tn)
		goto notnode;

	/* prepare oldtnode to be freed */
	tnode_free_init(oldtnode);

	/* Assemble all of the pointers in our cluster, in this case that
	 * represents all of the pointers out of our allocated nodes that
	 * point to existing tnodes and the links between our allocated
	 * nodes.
	 */
	for (i = child_length(oldtnode), m = 1u << tn->pos; i;) {
		struct key_vector *inode = get_child(oldtnode, --i);
		struct key_vector *node0, *node1;
		unsigned long j, k;

		/* An empty child */
		if (!inode)
			continue;

		/* A leaf or an internal node with skipped bits */
		if (!tnode_full(oldtnode, inode)) {
			put_child(tn, get_index(inode->key, tn), inode);
			continue;
		}

		/* drop the node in the old tnode free list */
		tnode_free_append(oldtnode, inode);

		/* An internal node with two children */
		if (inode->bits == 1) {
			put_child(tn, 2 * i + 1, get_child(inode, 1));
			put_child(tn, 2 * i, get_child(inode, 0));
			continue;
		}

		/* We will replace this node 'inode' with two new
		 * ones, 'node0' and 'node1', each with half of the
		 * original children. The two new nodes will have
		 * a position one bit further down the key and this
		 * means that the "significant" part of their keys
		 * (see the discussion near the top of this file)
		 * will differ by one bit, which will be "0" in
		 * node0's key and "1" in node1's key. Since we are
		 * moving the key position by one step, the bit that
		 * we are moving away from - the bit at position
		 * (tn->pos) - is the one that will differ between
		 * node0 and node1. So... we synthesize that bit in the
		 * two new keys.
		 */
		node1 = tnode_new(inode->key | m, inode->pos, inode->bits - 1);
		if (!node1)
			goto nomem;
		node0 = tnode_new(inode->key, inode->pos, inode->bits - 1);

		tnode_free_append(tn, node1);
		if (!node0)
			goto nomem;
		tnode_free_append(tn, node0);

		/* populate child pointers in new nodes */
		for (k = child_length(inode), j = k / 2; j;) {
			put_child(node1, --j, get_child(inode, --k));
			put_child(node0, j, get_child(inode, j));
			put_child(node1, --j, get_child(inode, --k));
			put_child(node0, j, get_child(inode, j));
		}

		/* link new nodes to parent */
		NODE_INIT_PARENT(node1, tn);
		NODE_INIT_PARENT(node0, tn);

		/* link parent to nodes */
		put_child(tn, 2 * i + 1, node1);
		put_child(tn, 2 * i, node0);
	}

	/* setup the parent pointers into and out of this node */
	return replace(t, oldtnode, tn);
nomem:
	/* all pointers should be clean so we are done */
	tnode_free(tn);
notnode:
	return NULL;
}
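
/* Note (illustrative): inflate() turns a node covering (pos, bits) into one
 * covering (pos - 1, bits + 1), so each old child index i maps to the pair
 * of new indices 2*i and 2*i + 1, with the newly exposed key bit at
 * position tn->pos (the old pos - 1) choosing between them.
 */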

static struct key_vector *halve(struct trie *t,
				struct key_vector *oldtnode)
{
	struct key_vector *tn;
	unsigned long i;

	pr_debug("In halve\n");

	tn = tnode_new(oldtnode->key, oldtnode->pos + 1, oldtnode->bits - 1);
	if (!tn)
		goto notnode;

	/* prepare oldtnode to be freed */
	tnode_free_init(oldtnode);

	/* Assemble all of the pointers in our cluster, in this case that
	 * represents all of the pointers out of our allocated nodes that
	 * point to existing tnodes and the links between our allocated
	 * nodes.
	 */
	for (i = child_length(oldtnode); i;) {
		struct key_vector *node1 = get_child(oldtnode, --i);
		struct key_vector *node0 = get_child(oldtnode, --i);
		struct key_vector *inode;

		/* At least one of the children is empty */
		if (!node1 || !node0) {
			put_child(tn, i / 2, node1 ? : node0);
			continue;
		}

		/* Two nonempty children */
		inode = tnode_new(node0->key, oldtnode->pos, 1);
		if (!inode)
			goto nomem;
		tnode_free_append(tn, inode);

		/* initialize pointers out of node */
		put_child(inode, 1, node1);
		put_child(inode, 0, node0);
		NODE_INIT_PARENT(inode, tn);

		/* link parent to node */
		put_child(tn, i / 2, inode);
	}

	/* setup the parent pointers into and out of this node */
	return replace(t, oldtnode, tn);
nomem:
	/* all pointers should be clean so we are done */
	tnode_free(tn);
notnode:
	return NULL;
}

static struct key_vector *collapse(struct trie *t,
				   struct key_vector *oldtnode)
{
	struct key_vector *n, *tp;
	unsigned long i;

	/* scan the tnode looking for that one child that might still exist */
	for (n = NULL, i = child_length(oldtnode); !n && i;)
		n = get_child(oldtnode, --i);

	/* compress one level */
	tp = node_parent(oldtnode);
	put_child_root(tp, oldtnode->key, n);
	node_set_parent(n, tp);

	/* drop dead node */
	node_free(oldtnode);

	return tp;
}

static unsigned char update_suffix(struct key_vector *tn)
{
	unsigned char slen = tn->pos;
	unsigned long stride, i;
	unsigned char slen_max;

	/* only vector 0 can have a suffix length greater than or equal to
	 * tn->pos + tn->bits, the second highest node will have a suffix
	 * length at most of tn->pos + tn->bits - 1
	 */
	slen_max = min_t(unsigned char, tn->pos + tn->bits - 1, tn->slen);

	/* search through the list of children looking for nodes that might
	 * have a suffix greater than the one we currently have.  This is
	 * why we start with a stride of 2 since a stride of 1 would
	 * represent the nodes with suffix length equal to tn->pos
	 */
	for (i = 0, stride = 0x2ul ; i < child_length(tn); i += stride) {
		struct key_vector *n = get_child(tn, i);

		if (!n || (n->slen <= slen))
			continue;

		/* update stride and slen based on new value */
		stride <<= (n->slen - slen);
		slen = n->slen;
		i &= ~(stride - 1);

		/* stop searching if we have hit the maximum possible value */
		if (slen >= slen_max)
			break;
	}

	tn->slen = slen;

	return slen;
}
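
/* Worked example (illustrative): update_suffix() exploits the fact that
 * route keys are aligned, so a child holding a suffix of length s can only
 * sit at an index whose low (s - tn->pos) bits are clear. The scan starts
 * at the even indices (stride 2); once a child with a larger slen is found
 * the stride doubles accordingly, e.g. only indices that are multiples of
 * 4 can beat a suffix one bit longer still.
 */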

/* From "Implementing a dynamic compressed trie" by Stefan Nilsson of
 * the Helsinki University of Technology and Matti Tikkanen of Nokia
 * Telecommunications, page 6:
 * "A node is doubled if the ratio of non-empty children to all
 * children in the *doubled* node is at least 'high'."
 *
 * 'high' in this instance is the variable 'inflate_threshold'. It
 * is expressed as a percentage, so we multiply it with
 * child_length() and instead of multiplying by 2 (since the
 * child array will be doubled by inflate()) and multiplying
 * the left-hand side by 100 (to handle the percentage thing) we
 * multiply the left-hand side by 50.
 *
 * The left-hand side may look a bit weird: child_length(tn)
 * - tn->empty_children is of course the number of non-null children
 * in the current node. tn->full_children is the number of "full"
 * children, that is non-null tnodes with a skip value of 0.
 * All of those will be doubled in the resulting inflated tnode, so
 * we just count them one extra time here.
 *
 * A clearer way to write this would be:
 *
 * to_be_doubled = tn->full_children;
 * not_to_be_doubled = child_length(tn) - tn->empty_children -
 *     tn->full_children;
 *
 * new_child_length = child_length(tn) * 2;
 *
 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
 *      new_child_length;
 * if (new_fill_factor >= inflate_threshold)
 *
 * ...and so on, though it would mess up the while () loop.
 *
 * anyway,
 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
 *      inflate_threshold
 *
 * avoid a division:
 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
 *      inflate_threshold * new_child_length
 *
 * expand not_to_be_doubled and to_be_doubled, and shorten:
 * 100 * (child_length(tn) - tn->empty_children +
 *    tn->full_children) >= inflate_threshold * new_child_length
 *
 * expand new_child_length:
 * 100 * (child_length(tn) - tn->empty_children +
 *    tn->full_children) >=
 *      inflate_threshold * child_length(tn) * 2
 *
 * shorten again:
 * 50 * (tn->full_children + child_length(tn) -
 *    tn->empty_children) >= inflate_threshold *
 *    child_length(tn)
 *
 */
static inline bool should_inflate(struct key_vector *tp, struct key_vector *tn)
{
	unsigned long used = child_length(tn);
	unsigned long threshold = used;

	/* Keep root node larger */
	threshold *= IS_TRIE(tp) ? inflate_threshold_root : inflate_threshold;
	used -= tn_info(tn)->empty_children;
	used += tn_info(tn)->full_children;

	/* if bits == KEYLENGTH then pos = 0, and will fail below */

	return (used > 1) && tn->pos && ((50 * used) >= threshold);
}
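
/* Worked example (illustrative): a tnode with bits = 4 has
 * child_length() = 16, so threshold = 16 * inflate_threshold = 800.
 * With 6 empty and 4 full children, used = 16 - 6 + 4 = 14 and
 * 50 * 14 = 700 < 800, so the node stays as is; with only 4 empty
 * children, used = 16 and 50 * 16 = 800 >= 800, so it is inflated
 * (assuming a non-root node and tn->pos != 0).
 */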

static inline bool should_halve(struct key_vector *tp, struct key_vector *tn)
{
	unsigned long used = child_length(tn);
	unsigned long threshold = used;

	/* Keep root node larger */
	threshold *= IS_TRIE(tp) ? halve_threshold_root : halve_threshold;
	used -= tn_info(tn)->empty_children;

	/* if bits == KEYLENGTH then used = 100% on wrap, and will fail below */

	return (used > 1) && (tn->bits > 1) && ((100 * used) < threshold);
}

static inline bool should_collapse(struct key_vector *tn)
{
	unsigned long used = child_length(tn);

	used -= tn_info(tn)->empty_children;

	/* account for bits == KEYLENGTH case */
	if ((tn->bits == KEYLENGTH) && tn_info(tn)->full_children)
		used -= KEY_MAX;

	/* One child or none, time to drop us from the trie */
	return used < 2;
}

#define MAX_WORK 10
static struct key_vector *resize(struct trie *t, struct key_vector *tn)
{
#ifdef CONFIG_IP_FIB_TRIE_STATS
	struct trie_use_stats __percpu *stats = t->stats;
#endif
	struct key_vector *tp = node_parent(tn);
	unsigned long cindex = get_index(tn->key, tp);
	int max_work = MAX_WORK;

	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
		 tn, inflate_threshold, halve_threshold);

	/* track the tnode via the pointer from the parent instead of
	 * doing it ourselves.  This way we can let RCU fully do its
	 * thing without us interfering
	 */
	BUG_ON(tn != get_child(tp, cindex));

	/* Double as long as the resulting node has a number of
	 * nonempty nodes that are above the threshold.
	 */
	while (should_inflate(tp, tn) && max_work) {
		tp = inflate(t, tn);
		if (!tp) {
#ifdef CONFIG_IP_FIB_TRIE_STATS
			this_cpu_inc(stats->resize_node_skipped);
#endif
			break;
		}

		max_work--;
		tn = get_child(tp, cindex);
	}

	/* update parent in case inflate failed */
	tp = node_parent(tn);

	/* Return if at least one inflate is run */
	if (max_work != MAX_WORK)
		return tp;

	/* Halve as long as the number of empty children in this
	 * node is above threshold.
	 */
	while (should_halve(tp, tn) && max_work) {
		tp = halve(t, tn);
		if (!tp) {
#ifdef CONFIG_IP_FIB_TRIE_STATS
			this_cpu_inc(stats->resize_node_skipped);
#endif
			break;
		}

		max_work--;
		tn = get_child(tp, cindex);
	}

	/* Only one child remains */
	if (should_collapse(tn))
		return collapse(t, tn);

	/* update parent in case halve failed */
	return node_parent(tn);
}

static void node_pull_suffix(struct key_vector *tn, unsigned char slen)
{
	unsigned char node_slen = tn->slen;

	while ((node_slen > tn->pos) && (node_slen > slen)) {
		slen = update_suffix(tn);
		if (node_slen == slen)
			break;

		tn = node_parent(tn);
		node_slen = tn->slen;
	}
}

static void node_push_suffix(struct key_vector *tn, unsigned char slen)
{
	while (tn->slen < slen) {
		tn->slen = slen;
		tn = node_parent(tn);
	}
}

/* rcu_read_lock needs to be held by caller from the read side */
static struct key_vector *fib_find_node(struct trie *t,
					struct key_vector **tp, u32 key)
{
	struct key_vector *pn, *n = t->kv;
	unsigned long index = 0;

	do {
		pn = n;
		n = get_child_rcu(n, index);

		if (!n)
			break;

		index = get_cindex(key, n);

		/* This bit of code is a bit tricky but it combines multiple
		 * checks into a single check.  The prefix consists of the
		 * prefix plus zeros for the bits in the cindex.  The index
		 * is the difference between the key and this value.  From
		 * this we can actually derive several pieces of data.
		 *   if (index >= (1ul << bits))
		 *     we have a mismatch in skip bits and failed
		 *   else
		 *     we know the value is cindex
		 *
		 * This check is safe even if bits == KEYLENGTH due to the
		 * fact that we can only allocate a node with 32 bits if a
		 * long is greater than 32 bits.
		 */
		if (index >= (1ul << n->bits)) {
			n = NULL;
			break;
		}

		/* keep searching until we find a perfect match leaf or NULL */
	} while (IS_TNODE(n));

	*tp = pn;

	return n;
}
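
/* Worked example (illustrative): consider a tnode with key = 0x85000000,
 * pos = 24 and bits = 2. For key = 0x86000000,
 * index = (0x85000000 ^ 0x86000000) >> 24 = 0x03 < (1ul << 2), so child 3
 * is followed. For key = 0x95000000, index = 0x10 >= (1ul << 2): the keys
 * already disagree in a skipped bit and the search terminates.
 */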

/* Return the first fib alias matching TOS with
 * priority less than or equal to PRIO.
 */
static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen,
					u8 tos, u32 prio, u32 tb_id)
{
	struct fib_alias *fa;

	if (!fah)
		return NULL;

	hlist_for_each_entry(fa, fah, fa_list) {
		if (fa->fa_slen < slen)
			continue;
		if (fa->fa_slen != slen)
			break;
		if (fa->tb_id > tb_id)
			continue;
		if (fa->tb_id != tb_id)
			break;
		if (fa->fa_tos > tos)
			continue;
		if (fa->fa_info->fib_priority >= prio || fa->fa_tos < tos)
			return fa;
	}

	return NULL;
}

static void trie_rebalance(struct trie *t, struct key_vector *tn)
{
	while (!IS_TRIE(tn))
		tn = resize(t, tn);
}

static int fib_insert_node(struct trie *t, struct key_vector *tp,
			   struct fib_alias *new, t_key key)
{
	struct key_vector *n, *l;

	l = leaf_new(key, new);
	if (!l)
		goto noleaf;

	/* retrieve child from parent node */
	n = get_child(tp, get_index(key, tp));

	/* Case 2: n is a LEAF or a TNODE and the key doesn't match.
	 *
	 *  Add a new tnode here; the first tnode needs some special
	 *  handling and leaves us in position for handling as case 3
	 */
	if (n) {
		struct key_vector *tn;

		tn = tnode_new(key, __fls(key ^ n->key), 1);
		if (!tn)
			goto notnode;

		/* initialize routes out of node */
		NODE_INIT_PARENT(tn, tp);
		put_child(tn, get_index(key, tn) ^ 1, n);

		/* start adding routes into the node */
		put_child_root(tp, key, tn);
		node_set_parent(n, tn);

		/* parent now has a NULL spot where the leaf can go */
		tp = tn;
	}

	/* Case 3: n is NULL, and will just insert a new leaf */
	node_push_suffix(tp, new->fa_slen);
	NODE_INIT_PARENT(l, tp);
	put_child_root(tp, key, l);
	trie_rebalance(t, tp);

	return 0;
notnode:
	node_free(l);
noleaf:
	return -ENOMEM;
}

static int fib_insert_alias(struct trie *t, struct key_vector *tp,
			    struct key_vector *l, struct fib_alias *new,
			    struct fib_alias *fa, t_key key)
{
	if (!l)
		return fib_insert_node(t, tp, new, key);

	if (fa) {
		hlist_add_before_rcu(&new->fa_list, &fa->fa_list);
	} else {
		struct fib_alias *last;

		hlist_for_each_entry(last, &l->leaf, fa_list) {
			if (new->fa_slen < last->fa_slen)
				break;
			if ((new->fa_slen == last->fa_slen) &&
			    (new->tb_id > last->tb_id))
				break;
			fa = last;
		}

		if (fa)
			hlist_add_behind_rcu(&new->fa_list, &fa->fa_list);
		else
			hlist_add_head_rcu(&new->fa_list, &l->leaf);
	}

	/* if we added to the tail node then we need to update slen */
	if (l->slen < new->fa_slen) {
		l->slen = new->fa_slen;
		node_push_suffix(tp, new->fa_slen);
	}

	return 0;
}

/* Caller must hold RTNL. */
int fib_table_insert(struct net *net, struct fib_table *tb,
		     struct fib_config *cfg)
{
	struct trie *t = (struct trie *)tb->tb_data;
	struct fib_alias *fa, *new_fa;
	struct key_vector *l, *tp;
	u16 nlflags = NLM_F_EXCL;
	struct fib_info *fi;
	u8 plen = cfg->fc_dst_len;
	u8 slen = KEYLENGTH - plen;
	u8 tos = cfg->fc_tos;
	u32 key;
	int err;

	if (plen > KEYLENGTH)
		return -EINVAL;

	key = ntohl(cfg->fc_dst);

	pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);

	if ((plen < KEYLENGTH) && (key << plen))
		return -EINVAL;

	fi = fib_create_info(cfg);
	if (IS_ERR(fi)) {
		err = PTR_ERR(fi);
		goto err;
	}

	l = fib_find_node(t, &tp, key);
	fa = l ? fib_find_alias(&l->leaf, slen, tos, fi->fib_priority,
				tb->tb_id) : NULL;

	/* Now fa, if non-NULL, points to the first fib alias
	 * with the same keys [prefix,tos,priority], if such key already
	 * exists, or to the node before which we will insert the new one.
	 *
	 * If fa is NULL, we will need to allocate a new one and
	 * insert to the tail of the section matching the suffix length
	 * of the new alias.
	 */

	if (fa && fa->fa_tos == tos &&
	    fa->fa_info->fib_priority == fi->fib_priority) {
		struct fib_alias *fa_first, *fa_match;

		err = -EEXIST;
		if (cfg->fc_nlflags & NLM_F_EXCL)
			goto out;

		nlflags &= ~NLM_F_EXCL;

		/* We have 2 goals:
		 * 1. Find exact match for type, scope, fib_info to avoid
		 * duplicate routes
		 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
		 */
		fa_match = NULL;
		fa_first = fa;
		hlist_for_each_entry_from(fa, fa_list) {
			if ((fa->fa_slen != slen) ||
			    (fa->tb_id != tb->tb_id) ||
			    (fa->fa_tos != tos))
				break;
			if (fa->fa_info->fib_priority != fi->fib_priority)
				break;
			if (fa->fa_type == cfg->fc_type &&
			    fa->fa_info == fi) {
				fa_match = fa;
				break;
			}
		}

		if (cfg->fc_nlflags & NLM_F_REPLACE) {
			struct fib_info *fi_drop;
			u8 state;

			nlflags |= NLM_F_REPLACE;
			fa = fa_first;
			if (fa_match) {
				if (fa == fa_match)
					err = 0;
				goto out;
			}
			err = -ENOBUFS;
			new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
			if (!new_fa)
				goto out;

			fi_drop = fa->fa_info;
			new_fa->fa_tos = fa->fa_tos;
			new_fa->fa_info = fi;
			new_fa->fa_type = cfg->fc_type;
			state = fa->fa_state;
			new_fa->fa_state = state & ~FA_S_ACCESSED;
			new_fa->fa_slen = fa->fa_slen;
			new_fa->tb_id = tb->tb_id;
			new_fa->fa_default = -1;

			hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list);

			alias_free_mem_rcu(fa);

			fib_release_info(fi_drop);
			if (state & FA_S_ACCESSED)
				rt_cache_flush(cfg->fc_nlinfo.nl_net);

			call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_ADD,
						 key, plen, fi,
						 new_fa->fa_tos, cfg->fc_type,
						 tb->tb_id, cfg->fc_nlflags);
			rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
				  tb->tb_id, &cfg->fc_nlinfo, nlflags);

			goto succeeded;
		}
		/* Error if we find a perfect match which
		 * uses the same scope, type, and nexthop
		 * information.
		 */
		if (fa_match)
			goto out;

		if (cfg->fc_nlflags & NLM_F_APPEND)
			nlflags |= NLM_F_APPEND;
		else
			fa = fa_first;
	}
	err = -ENOENT;
	if (!(cfg->fc_nlflags & NLM_F_CREATE))
		goto out;

	nlflags |= NLM_F_CREATE;
	err = -ENOBUFS;
	new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
	if (!new_fa)
		goto out;

	new_fa->fa_info = fi;
	new_fa->fa_tos = tos;
	new_fa->fa_type = cfg->fc_type;
	new_fa->fa_state = 0;
	new_fa->fa_slen = slen;
	new_fa->tb_id = tb->tb_id;
	new_fa->fa_default = -1;

	/* Insert new entry to the list. */
	err = fib_insert_alias(t, tp, l, new_fa, fa, key);
	if (err)
		goto out_free_new_fa;

	if (!plen)
		tb->tb_num_default++;

	rt_cache_flush(cfg->fc_nlinfo.nl_net);
	call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_ADD, key, plen, fi, tos,
				 cfg->fc_type, tb->tb_id, cfg->fc_nlflags);
	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id,
		  &cfg->fc_nlinfo, nlflags);
succeeded:
	return 0;

out_free_new_fa:
	kmem_cache_free(fn_alias_kmem, new_fa);
out:
	fib_release_info(fi);
err:
	return err;
}

static inline t_key prefix_mismatch(t_key key, struct key_vector *n)
{
	t_key prefix = n->key;

	return (key ^ prefix) & (prefix | -prefix);
}
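
/* Worked example (illustrative): for a node with key 0xc0a80100
 * (192.168.1.0), the lowest set bit is 1 << 8, so
 * (prefix | -prefix) = 0xffffff00, and prefix_mismatch() flags any lookup
 * key that differs from the node key in bit 8 or above.
 */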

/* should be called with rcu_read_lock */
int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
		     struct fib_result *res, int fib_flags)
{
	struct trie *t = (struct trie *) tb->tb_data;
#ifdef CONFIG_IP_FIB_TRIE_STATS
	struct trie_use_stats __percpu *stats = t->stats;
#endif
	const t_key key = ntohl(flp->daddr);
	struct key_vector *n, *pn;
	struct fib_alias *fa;
	unsigned long index;
	t_key cindex;

	trace_fib_table_lookup(tb->tb_id, flp);

	pn = t->kv;
	cindex = 0;

	n = get_child_rcu(pn, cindex);
	if (!n)
		return -EAGAIN;

#ifdef CONFIG_IP_FIB_TRIE_STATS
	this_cpu_inc(stats->gets);
#endif

	/* Step 1: Travel to the longest prefix match in the trie */
	for (;;) {
		index = get_cindex(key, n);

		/* This bit of code is a bit tricky but it combines multiple
		 * checks into a single check.  The prefix consists of the
		 * prefix plus zeros for the "bits" in the prefix. The index
		 * is the difference between the key and this value.  From
		 * this we can actually derive several pieces of data.
		 *   if (index >= (1ul << bits))
		 *     we have a mismatch in skip bits and failed
		 *   else
		 *     we know the value is cindex
		 *
		 * This check is safe even if bits == KEYLENGTH due to the
		 * fact that we can only allocate a node with 32 bits if a
		 * long is greater than 32 bits.
		 */
		if (index >= (1ul << n->bits))
			break;

		/* we have found a leaf. Prefixes have already been compared */
		if (IS_LEAF(n))
			goto found;

		/* only record pn and cindex if we are going to be chopping
		 * bits later.  Otherwise we are just wasting cycles.
		 */
		if (n->slen > n->pos) {
			pn = n;
			cindex = index;
		}

		n = get_child_rcu(n, index);
		if (unlikely(!n))
			goto backtrace;
	}

	/* Step 2: Sort out leaves and begin backtracing for longest prefix */
	for (;;) {
		/* record the pointer where our next node pointer is stored */
		struct key_vector __rcu **cptr = n->tnode;

		/* This test verifies that none of the bits that differ
		 * between the key and the prefix exist in the region of
		 * the lsb and higher in the prefix.
		 */
		if (unlikely(prefix_mismatch(key, n)) || (n->slen == n->pos))
			goto backtrace;

		/* exit out and process leaf */
		if (unlikely(IS_LEAF(n)))
			break;

		/* Don't bother recording parent info.  Since we are in
		 * prefix match mode we will have to come back to wherever
		 * we started this traversal anyway
		 */

		while ((n = rcu_dereference(*cptr)) == NULL) {
backtrace:
#ifdef CONFIG_IP_FIB_TRIE_STATS
			if (!n)
				this_cpu_inc(stats->null_node_hit);
#endif
			/* If we are at cindex 0 there are no more bits for
			 * us to strip at this level so we must ascend back
			 * up one level to see if there are any more bits to
			 * be stripped there.
			 */
			while (!cindex) {
				t_key pkey = pn->key;

				/* If we don't have a parent then there is
				 * nothing for us to do as we do not have any
				 * further nodes to parse.
				 */
				if (IS_TRIE(pn))
					return -EAGAIN;
#ifdef CONFIG_IP_FIB_TRIE_STATS
				this_cpu_inc(stats->backtrack);
#endif
				/* Get Child's index */
				pn = node_parent_rcu(pn);
				cindex = get_index(pkey, pn);
			}

			/* strip the least significant bit from the cindex */
			cindex &= cindex - 1;

			/* grab pointer for next child node */
			cptr = &pn->tnode[cindex];
		}
	}

found:
	/* this line carries forward the xor from earlier in the function */
	index = key ^ n->key;

	/* Step 3: Process the leaf, if that fails fall back to backtracing */
	hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) {
		struct fib_info *fi = fa->fa_info;
		int nhsel, err;

		if ((BITS_PER_LONG > KEYLENGTH) || (fa->fa_slen < KEYLENGTH)) {
			if (index >= (1ul << fa->fa_slen))
				continue;
		}
		if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
			continue;
		if (fi->fib_dead)
			continue;
		if (fa->fa_info->fib_scope < flp->flowi4_scope)
			continue;
		fib_alias_accessed(fa);
		err = fib_props[fa->fa_type].error;
		if (unlikely(err < 0)) {
#ifdef CONFIG_IP_FIB_TRIE_STATS
			this_cpu_inc(stats->semantic_match_passed);
#endif
			return err;
		}
		if (fi->fib_flags & RTNH_F_DEAD)
			continue;
		for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
			const struct fib_nh *nh = &fi->fib_nh[nhsel];
			struct in_device *in_dev = __in_dev_get_rcu(nh->nh_dev);

			if (nh->nh_flags & RTNH_F_DEAD)
				continue;
			if (in_dev &&
			    IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
			    nh->nh_flags & RTNH_F_LINKDOWN &&
			    !(fib_flags & FIB_LOOKUP_IGNORE_LINKSTATE))
				continue;
			if (!(flp->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF)) {
				if (flp->flowi4_oif &&
				    flp->flowi4_oif != nh->nh_oif)
					continue;
			}

			if (!(fib_flags & FIB_LOOKUP_NOREF))
				atomic_inc(&fi->fib_clntref);

			res->prefixlen = KEYLENGTH - fa->fa_slen;
			res->nh_sel = nhsel;
			res->type = fa->fa_type;
			res->scope = fi->fib_scope;
			res->fi = fi;
			res->table = tb;
			res->fa_head = &n->leaf;
#ifdef CONFIG_IP_FIB_TRIE_STATS
			this_cpu_inc(stats->semantic_match_passed);
#endif
			trace_fib_table_lookup_nh(nh);

			return err;
		}
	}
#ifdef CONFIG_IP_FIB_TRIE_STATS
	this_cpu_inc(stats->semantic_match_miss);
#endif
	goto backtrace;
}
EXPORT_SYMBOL_GPL(fib_table_lookup);

static void fib_remove_alias(struct trie *t, struct key_vector *tp,
			     struct key_vector *l, struct fib_alias *old)
{
	/* record the location of the previous list_info entry */
	struct hlist_node **pprev = old->fa_list.pprev;
	struct fib_alias *fa = hlist_entry(pprev, typeof(*fa), fa_list.next);

	/* remove the fib_alias from the list */
	hlist_del_rcu(&old->fa_list);

	/* if we emptied the list this leaf will be freed and we can sort
	 * out parent suffix lengths as a part of trie_rebalance
	 */
	if (hlist_empty(&l->leaf)) {
		if (tp->slen == l->slen)
			node_pull_suffix(tp, tp->pos);
		put_child_root(tp, l->key, NULL);
		node_free(l);
		trie_rebalance(t, tp);
		return;
	}

	/* only access fa if it is pointing at the last valid hlist_node */
	if (*pprev)
		return;

	/* update the trie with the latest suffix length */
	l->slen = fa->fa_slen;
	node_pull_suffix(tp, fa->fa_slen);
}

/* Caller must hold RTNL. */
int fib_table_delete(struct net *net, struct fib_table *tb,
		     struct fib_config *cfg)
{
	struct trie *t = (struct trie *) tb->tb_data;
	struct fib_alias *fa, *fa_to_delete;
	struct key_vector *l, *tp;
	u8 plen = cfg->fc_dst_len;
	u8 slen = KEYLENGTH - plen;
	u8 tos = cfg->fc_tos;
	u32 key;

	if (plen > KEYLENGTH)
		return -EINVAL;

	key = ntohl(cfg->fc_dst);

	if ((plen < KEYLENGTH) && (key << plen))
		return -EINVAL;

	l = fib_find_node(t, &tp, key);
	if (!l)
		return -ESRCH;

	fa = fib_find_alias(&l->leaf, slen, tos, 0, tb->tb_id);
	if (!fa)
		return -ESRCH;

	pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);

	fa_to_delete = NULL;
	hlist_for_each_entry_from(fa, fa_list) {
		struct fib_info *fi = fa->fa_info;

		if ((fa->fa_slen != slen) ||
		    (fa->tb_id != tb->tb_id) ||
		    (fa->fa_tos != tos))
			break;

		if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
		    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
		     fa->fa_info->fib_scope == cfg->fc_scope) &&
		    (!cfg->fc_prefsrc ||
		     fi->fib_prefsrc == cfg->fc_prefsrc) &&
		    (!cfg->fc_protocol ||
		     fi->fib_protocol == cfg->fc_protocol) &&
		    fib_nh_match(cfg, fi) == 0) {
			fa_to_delete = fa;
			break;
		}
	}

	if (!fa_to_delete)
		return -ESRCH;

	call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, key, plen,
				 fa_to_delete->fa_info, tos, cfg->fc_type,
				 tb->tb_id, 0);
	rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id,
		  &cfg->fc_nlinfo, 0);

	if (!plen)
		tb->tb_num_default--;

	fib_remove_alias(t, tp, l, fa_to_delete);

	if (fa_to_delete->fa_state & FA_S_ACCESSED)
		rt_cache_flush(cfg->fc_nlinfo.nl_net);

	fib_release_info(fa_to_delete->fa_info);
	alias_free_mem_rcu(fa_to_delete);
	return 0;
}

/* Scan for the next leaf starting at the provided key value */
static struct key_vector *leaf_walk_rcu(struct key_vector **tn, t_key key)
{
	struct key_vector *pn, *n = *tn;
	unsigned long cindex;

	/* this loop is meant to try and find the key in the trie */
	do {
		/* record parent and next child index */
		pn = n;
		cindex = (key > pn->key) ? get_index(key, pn) : 0;

		if (cindex >> pn->bits)
			break;

		/* descend into the next child */
		n = get_child_rcu(pn, cindex++);
		if (!n)
			break;

		/* guarantee forward progress on the keys */
		if (IS_LEAF(n) && (n->key >= key))
			goto found;
	} while (IS_TNODE(n));

	/* this loop will search for the next leaf with a greater key */
	while (!IS_TRIE(pn)) {
		/* if we exhausted the parent node we will need to climb */
		if (cindex >= (1ul << pn->bits)) {
			t_key pkey = pn->key;

			pn = node_parent_rcu(pn);
			cindex = get_index(pkey, pn) + 1;
			continue;
		}

		/* grab the next available node */
		n = get_child_rcu(pn, cindex++);
		if (!n)
			continue;

		/* no need to compare keys since we bumped the index */
		if (IS_LEAF(n))
			goto found;

		/* restart scanning in the new node */
		pn = n;
		cindex = 0;
	}

	*tn = pn;
	return NULL; /* Root of trie */
found:
	/* if we are at the limit for keys just return NULL for the tnode */
	*tn = pn;
	return n;
}

static void fib_trie_free(struct fib_table *tb)
{
	struct trie *t = (struct trie *)tb->tb_data;
	struct key_vector *pn = t->kv;
	unsigned long cindex = 1;
	struct hlist_node *tmp;
	struct fib_alias *fa;

	/* walk trie in reverse order and free everything */
	for (;;) {
		struct key_vector *n;

		if (!(cindex--)) {
			t_key pkey = pn->key;

			if (IS_TRIE(pn))
				break;

			n = pn;
			pn = node_parent(pn);

			/* drop emptied tnode */
			put_child_root(pn, n->key, NULL);
			node_free(n);

			cindex = get_index(pkey, pn);

			continue;
		}

		/* grab the next available node */
		n = get_child(pn, cindex);
		if (!n)
			continue;

		if (IS_TNODE(n)) {
			/* record pn and cindex for leaf walking */
			pn = n;
			cindex = 1ul << n->bits;

			continue;
		}

		hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
			hlist_del_rcu(&fa->fa_list);
			alias_free_mem_rcu(fa);
		}

		put_child_root(pn, n->key, NULL);
		node_free(n);
	}

#ifdef CONFIG_IP_FIB_TRIE_STATS
	free_percpu(t->stats);
#endif
	kfree(tb);
}

struct fib_table *fib_trie_unmerge(struct fib_table *oldtb)
{
	struct trie *ot = (struct trie *)oldtb->tb_data;
	struct key_vector *l, *tp = ot->kv;
	struct fib_table *local_tb;
	struct fib_alias *fa;
	struct trie *lt;
	t_key key = 0;

	if (oldtb->tb_data == oldtb->__data)
		return oldtb;

	local_tb = fib_trie_table(RT_TABLE_LOCAL, NULL);
	if (!local_tb)
		return NULL;

	lt = (struct trie *)local_tb->tb_data;

	while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
		struct key_vector *local_l = NULL, *local_tp;

		hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
			struct fib_alias *new_fa;

			if (local_tb->tb_id != fa->tb_id)
				continue;

			/* clone fa for new local table */
			new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
			if (!new_fa)
				goto out;

			memcpy(new_fa, fa, sizeof(*fa));

			/* insert clone into table */
			if (!local_l)
				local_l = fib_find_node(lt, &local_tp, l->key);

			if (fib_insert_alias(lt, local_tp, local_l, new_fa,
					     NULL, l->key)) {
				kmem_cache_free(fn_alias_kmem, new_fa);
				goto out;
			}
		}

		/* stop loop if key wrapped back to 0 */
		key = l->key + 1;
		if (key < l->key)
			break;
	}

	return local_tb;
out:
	fib_trie_free(local_tb);

	return NULL;
}

/* Caller must hold RTNL */
void fib_table_flush_external(struct fib_table *tb)
{
	struct trie *t = (struct trie *)tb->tb_data;
	struct key_vector *pn = t->kv;
	unsigned long cindex = 1;
	struct hlist_node *tmp;
	struct fib_alias *fa;

	/* walk trie in reverse order */
	for (;;) {
		unsigned char slen = 0;
		struct key_vector *n;

		if (!(cindex--)) {
			t_key pkey = pn->key;

			/* cannot resize the trie vector */
			if (IS_TRIE(pn))
				break;

			/* update the suffix to address pulled leaves */
			if (pn->slen > pn->pos)
				update_suffix(pn);

			/* resize completed node */
			pn = resize(t, pn);
			cindex = get_index(pkey, pn);

			continue;
		}

		/* grab the next available node */
		n = get_child(pn, cindex);
		if (!n)
			continue;

		if (IS_TNODE(n)) {
			/* record pn and cindex for leaf walking */
			pn = n;
			cindex = 1ul << n->bits;

			continue;
		}

		hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
			/* if alias was cloned to local then we just
			 * need to remove the local copy from main
			 */
			if (tb->tb_id != fa->tb_id) {
				hlist_del_rcu(&fa->fa_list);
				alias_free_mem_rcu(fa);
				continue;
			}

			/* record local slen */
			slen = fa->fa_slen;
		}

		/* update leaf slen */
		n->slen = slen;

		if (hlist_empty(&n->leaf)) {
			put_child_root(pn, n->key, NULL);
			node_free(n);
		}
	}
}

/* Caller must hold RTNL. */
int fib_table_flush(struct net *net, struct fib_table *tb)
{
	struct trie *t = (struct trie *)tb->tb_data;
	struct key_vector *pn = t->kv;
	unsigned long cindex = 1;
	struct hlist_node *tmp;
	struct fib_alias *fa;
	int found = 0;

	/* walk trie in reverse order */
	for (;;) {
		unsigned char slen = 0;
		struct key_vector *n;

		if (!(cindex--)) {
			t_key pkey = pn->key;

			/* cannot resize the trie vector */
			if (IS_TRIE(pn))
				break;

			/* update the suffix to address pulled leaves */
			if (pn->slen > pn->pos)
				update_suffix(pn);

			/* resize completed node */
			pn = resize(t, pn);
			cindex = get_index(pkey, pn);

			continue;
		}

		/* grab the next available node */
		n = get_child(pn, cindex);
		if (!n)
			continue;

		if (IS_TNODE(n)) {
			/* record pn and cindex for leaf walking */
			pn = n;
			cindex = 1ul << n->bits;

			continue;
		}

		hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
			struct fib_info *fi = fa->fa_info;

			if (!fi || !(fi->fib_flags & RTNH_F_DEAD)) {
				slen = fa->fa_slen;
				continue;
			}

			call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL,
						 n->key,
						 KEYLENGTH - fa->fa_slen,
						 fi, fa->fa_tos, fa->fa_type,
						 tb->tb_id, 0);
			hlist_del_rcu(&fa->fa_list);
			fib_release_info(fa->fa_info);
			alias_free_mem_rcu(fa);
			found++;
		}

		/* update leaf slen */
		n->slen = slen;

		if (hlist_empty(&n->leaf)) {
			put_child_root(pn, n->key, NULL);
			node_free(n);
		}
	}

	pr_debug("trie_flush found=%d\n", found);
	return found;
}

static void fib_leaf_notify(struct net *net, struct key_vector *l,
			    struct fib_table *tb, struct notifier_block *nb,
			    enum fib_event_type event_type)
{
	struct fib_alias *fa;

	hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
		struct fib_info *fi = fa->fa_info;

		if (!fi)
			continue;

		/* local and main table can share the same trie,
		 * so don't notify twice for the same entry.
		 */
		if (tb->tb_id != fa->tb_id)
			continue;

		call_fib_entry_notifier(nb, net, event_type, l->key,
					KEYLENGTH - fa->fa_slen, fi, fa->fa_tos,
					fa->fa_type, fa->tb_id, 0);
	}
}

static void fib_table_notify(struct net *net, struct fib_table *tb,
			     struct notifier_block *nb,
			     enum fib_event_type event_type)
{
	struct trie *t = (struct trie *)tb->tb_data;
	struct key_vector *l, *tp = t->kv;
	t_key key = 0;

	while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
		fib_leaf_notify(net, l, tb, nb, event_type);

		key = l->key + 1;
		/* stop in case of wrap around */
		if (key < l->key)
			break;
	}
}

static void fib_notify(struct net *net, struct notifier_block *nb,
		       enum fib_event_type event_type)
{
	unsigned int h;

	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
		struct fib_table *tb;

		hlist_for_each_entry_rcu(tb, head, tb_hlist)
			fib_table_notify(net, tb, nb, event_type);
	}
}

static void __trie_free_rcu(struct rcu_head *head)
{
	struct fib_table *tb = container_of(head, struct fib_table, rcu);
#ifdef CONFIG_IP_FIB_TRIE_STATS
	struct trie *t = (struct trie *)tb->tb_data;

	if (tb->tb_data == tb->__data)
		free_percpu(t->stats);
#endif /* CONFIG_IP_FIB_TRIE_STATS */
	kfree(tb);
}

void fib_free_table(struct fib_table *tb)
{
	call_rcu(&tb->rcu, __trie_free_rcu);
}

static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
			     struct sk_buff *skb, struct netlink_callback *cb)
{
	__be32 xkey = htonl(l->key);
	struct fib_alias *fa;
	int i, s_i;

	s_i = cb->args[4];
	i = 0;

	/* rcu_read_lock is held by caller */
	hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
		if (i < s_i) {
			i++;
			continue;
		}

		if (tb->tb_id != fa->tb_id) {
			i++;
			continue;
		}

		if (fib_dump_info(skb, NETLINK_CB(cb->skb).portid,
				  cb->nlh->nlmsg_seq,
				  RTM_NEWROUTE,
				  tb->tb_id,
				  fa->fa_type,
				  xkey,
				  KEYLENGTH - fa->fa_slen,
				  fa->fa_tos,
				  fa->fa_info, NLM_F_MULTI) < 0) {
			cb->args[4] = i;
			return -1;
		}
		i++;
	}

	cb->args[4] = i;
	return skb->len;
}
/* rcu_read_lock must be held by the caller on the read side */
int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
		   struct netlink_callback *cb)
{
	struct trie *t = (struct trie *)tb->tb_data;
	struct key_vector *l, *tp = t->kv;
	/* Dump starting at last key.
	 * Note: 0.0.0.0/0 (i.e. the default route) is the first key.
	 */
	int count = cb->args[2];
	t_key key = cb->args[3];

	while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
		if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
			cb->args[3] = key;
			cb->args[2] = count;
			return -1;
		}

		++count;
		key = l->key + 1;

		memset(&cb->args[4], 0,
		       sizeof(cb->args) - 4*sizeof(cb->args[0]));

		/* stop loop if key wrapped back to 0 */
		if (key < l->key)
			break;
	}

	cb->args[3] = key;
	cb->args[2] = count;

	return skb->len;
}

void __init fib_trie_init(void)
{
	fn_alias_kmem = kmem_cache_create("ip_fib_alias",
					  sizeof(struct fib_alias),
					  0, SLAB_PANIC, NULL);

	trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
					   LEAF_SIZE,
					   0, SLAB_PANIC, NULL);
}

struct fib_table *fib_trie_table(u32 id, struct fib_table *alias)
{
	struct fib_table *tb;
	struct trie *t;
	size_t sz = sizeof(*tb);

	if (!alias)
		sz += sizeof(struct trie);

	tb = kzalloc(sz, GFP_KERNEL);
	if (!tb)
		return NULL;

	tb->tb_id = id;
	tb->tb_num_default = 0;
	tb->tb_data = (alias ? alias->__data : tb->__data);

	if (alias)
		return tb;

	t = (struct trie *) tb->tb_data;
	t->kv[0].pos = KEYLENGTH;
	t->kv[0].slen = KEYLENGTH;
#ifdef CONFIG_IP_FIB_TRIE_STATS
	t->stats = alloc_percpu(struct trie_use_stats);
	if (!t->stats) {
		kfree(tb);
		tb = NULL;
	}
#endif

	return tb;
}

#ifdef CONFIG_PROC_FS
/* Depth first Trie walk iterator */
struct fib_trie_iter {
	struct seq_net_private p;
	struct fib_table *tb;
	struct key_vector *tnode;
	unsigned int index;
	unsigned int depth;
};

static struct key_vector *fib_trie_get_next(struct fib_trie_iter *iter)
{
	unsigned long cindex = iter->index;
	struct key_vector *pn = iter->tnode;
	t_key pkey;

	pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
		 iter->tnode, iter->index, iter->depth);

	while (!IS_TRIE(pn)) {
		while (cindex < child_length(pn)) {
			struct key_vector *n = get_child_rcu(pn, cindex++);

			if (!n)
				continue;

			if (IS_LEAF(n)) {
				iter->tnode = pn;
				iter->index = cindex;
			} else {
				/* push down one level */
				iter->tnode = n;
				iter->index = 0;
				++iter->depth;
			}

			return n;
		}

		/* Current node exhausted, pop back up */
		pkey = pn->key;
		pn = node_parent_rcu(pn);
		cindex = get_index(pkey, pn) + 1;
		--iter->depth;
	}

	/* record root node so further searches know we are done */
	iter->tnode = pn;
	iter->index = 0;

	return NULL;
}

static struct key_vector *fib_trie_get_first(struct fib_trie_iter *iter,
					     struct trie *t)
{
	struct key_vector *n, *pn;

	if (!t)
		return NULL;

	pn = t->kv;
	n = rcu_dereference(pn->tnode[0]);
	if (!n)
		return NULL;

	if (IS_TNODE(n)) {
		iter->tnode = n;
		iter->index = 0;
		iter->depth = 1;
	} else {
		iter->tnode = pn;
		iter->index = 0;
		iter->depth = 0;
	}

	return n;
}
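/* Note (illustrative): fib_trie_get_first()/fib_trie_get_next()
 * implement a pre-order depth-first walk with constant-size iterator
 * state.  No per-level stack is kept; when a tnode is exhausted, the
 * child's slot in its parent is recomputed from its key via
 * get_index(), which is enough to resume the scan one level up.
 */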
static void trie_collect_stats(struct trie *t, struct trie_stat *s)
{
	struct key_vector *n;
	struct fib_trie_iter iter;

	memset(s, 0, sizeof(*s));

	rcu_read_lock();
	for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
		if (IS_LEAF(n)) {
			struct fib_alias *fa;

			s->leaves++;
			s->totdepth += iter.depth;
			if (iter.depth > s->maxdepth)
				s->maxdepth = iter.depth;

			hlist_for_each_entry_rcu(fa, &n->leaf, fa_list)
				++s->prefixes;
		} else {
			s->tnodes++;
			if (n->bits < MAX_STAT_DEPTH)
				s->nodesizes[n->bits]++;
			s->nullpointers += tn_info(n)->empty_children;
		}
	}
	rcu_read_unlock();
}

/*
 *	This outputs /proc/net/fib_triestats
 */
static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
{
	unsigned int i, max, pointers, bytes, avdepth;

	if (stat->leaves)
		avdepth = stat->totdepth*100 / stat->leaves;
	else
		avdepth = 0;

	seq_printf(seq, "\tAver depth: %u.%02d\n",
		   avdepth / 100, avdepth % 100);
	seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth);

	seq_printf(seq, "\tLeaves: %u\n", stat->leaves);
	bytes = LEAF_SIZE * stat->leaves;

	seq_printf(seq, "\tPrefixes: %u\n", stat->prefixes);
	bytes += sizeof(struct fib_alias) * stat->prefixes;

	seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
	bytes += TNODE_SIZE(0) * stat->tnodes;

	max = MAX_STAT_DEPTH;
	while (max > 0 && stat->nodesizes[max-1] == 0)
		max--;

	pointers = 0;
	for (i = 1; i < max; i++)
		if (stat->nodesizes[i] != 0) {
			seq_printf(seq, " %u: %u", i, stat->nodesizes[i]);
			pointers += (1<<i) * stat->nodesizes[i];
		}
	seq_putc(seq, '\n');
	seq_printf(seq, "\tPointers: %u\n", pointers);

	bytes += sizeof(struct key_vector *) * pointers;
	seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
	seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024);
}

#ifdef CONFIG_IP_FIB_TRIE_STATS
static void trie_show_usage(struct seq_file *seq,
			    const struct trie_use_stats __percpu *stats)
{
	struct trie_use_stats s = { 0 };
	int cpu;

	/* loop through all of the CPUs and gather up the stats */
	for_each_possible_cpu(cpu) {
		const struct trie_use_stats *pcpu = per_cpu_ptr(stats, cpu);

		s.gets += pcpu->gets;
		s.backtrack += pcpu->backtrack;
		s.semantic_match_passed += pcpu->semantic_match_passed;
		s.semantic_match_miss += pcpu->semantic_match_miss;
		s.null_node_hit += pcpu->null_node_hit;
		s.resize_node_skipped += pcpu->resize_node_skipped;
	}

	seq_printf(seq, "\nCounters:\n---------\n");
	seq_printf(seq, "gets = %u\n", s.gets);
	seq_printf(seq, "backtracks = %u\n", s.backtrack);
	seq_printf(seq, "semantic match passed = %u\n",
		   s.semantic_match_passed);
	seq_printf(seq, "semantic match miss = %u\n", s.semantic_match_miss);
	seq_printf(seq, "null node hit= %u\n", s.null_node_hit);
	seq_printf(seq, "skipped node resize = %u\n\n", s.resize_node_skipped);
}
#endif /* CONFIG_IP_FIB_TRIE_STATS */

static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
{
	if (tb->tb_id == RT_TABLE_LOCAL)
		seq_puts(seq, "Local:\n");
	else if (tb->tb_id == RT_TABLE_MAIN)
		seq_puts(seq, "Main:\n");
	else
		seq_printf(seq, "Id %d:\n", tb->tb_id);
}
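/* Sample /proc/net/fib_triestats output as produced by the functions
 * above (the numbers are made up for illustration):
 *
 *	Main:
 *		Aver depth: 2.31
 *		Max depth: 5
 *		Leaves: 612
 *		Prefixes: 620
 *		Internal nodes: 103
 *		 1: 4 2: 16 3: 83
 *		Pointers: 736
 *	Null ptrs: 22
 *	Total size: 58 kB
 */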
static int fib_triestat_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = (struct net *)seq->private;
	unsigned int h;

	seq_printf(seq,
		   "Basic info: size of leaf: %zd bytes, size of tnode: %zd bytes.\n",
		   LEAF_SIZE, TNODE_SIZE(0));

	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
		struct fib_table *tb;

		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
			struct trie *t = (struct trie *) tb->tb_data;
			struct trie_stat stat;

			if (!t)
				continue;

			fib_table_print(seq, tb);

			trie_collect_stats(t, &stat);
			trie_show_stats(seq, &stat);
#ifdef CONFIG_IP_FIB_TRIE_STATS
			trie_show_usage(seq, t->stats);
#endif
		}
	}

	return 0;
}

static int fib_triestat_seq_open(struct inode *inode, struct file *file)
{
	return single_open_net(inode, file, fib_triestat_seq_show);
}

static const struct file_operations fib_triestat_fops = {
	.owner = THIS_MODULE,
	.open = fib_triestat_seq_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release_net,
};

static struct key_vector *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
{
	struct fib_trie_iter *iter = seq->private;
	struct net *net = seq_file_net(seq);
	loff_t idx = 0;
	unsigned int h;

	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
		struct fib_table *tb;

		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
			struct key_vector *n;

			for (n = fib_trie_get_first(iter,
						    (struct trie *) tb->tb_data);
			     n; n = fib_trie_get_next(iter))
				if (pos == idx++) {
					iter->tb = tb;
					return n;
				}
		}
	}

	return NULL;
}

static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	rcu_read_lock();
	return fib_trie_get_idx(seq, *pos);
}

static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct fib_trie_iter *iter = seq->private;
	struct net *net = seq_file_net(seq);
	struct fib_table *tb = iter->tb;
	struct hlist_node *tb_node;
	unsigned int h;
	struct key_vector *n;

	++*pos;
	/* next node in same table */
	n = fib_trie_get_next(iter);
	if (n)
		return n;

	/* walk rest of this hash chain */
	h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
	while ((tb_node = rcu_dereference(hlist_next_rcu(&tb->tb_hlist)))) {
		tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
		n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
		if (n)
			goto found;
	}

	/* new hash chain */
	while (++h < FIB_TABLE_HASHSZ) {
		struct hlist_head *head = &net->ipv4.fib_table_hash[h];

		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
			n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
			if (n)
				goto found;
		}
	}
	return NULL;

found:
	iter->tb = tb;
	return n;
}

static void fib_trie_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}

static void seq_indent(struct seq_file *seq, int n)
{
	while (n-- > 0)
		seq_puts(seq, " ");
}

static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
{
	switch (s) {
	case RT_SCOPE_UNIVERSE: return "universe";
	case RT_SCOPE_SITE: return "site";
	case RT_SCOPE_LINK: return "link";
	case RT_SCOPE_HOST: return "host";
	case RT_SCOPE_NOWHERE: return "nowhere";
	default:
		snprintf(buf, len, "scope=%d", s);
		return buf;
	}
}

static const char *const rtn_type_names[__RTN_MAX] = {
	[RTN_UNSPEC] = "UNSPEC",
	[RTN_UNICAST] = "UNICAST",
	[RTN_LOCAL] = "LOCAL",
	[RTN_BROADCAST] = "BROADCAST",
	[RTN_ANYCAST] = "ANYCAST",
	[RTN_MULTICAST] = "MULTICAST",
	[RTN_BLACKHOLE] = "BLACKHOLE",
	[RTN_UNREACHABLE] = "UNREACHABLE",
	[RTN_PROHIBIT] = "PROHIBIT",
	[RTN_THROW] = "THROW",
	[RTN_NAT] = "NAT",
	[RTN_XRESOLVE] = "XRESOLVE",
};

static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
{
	if (t < __RTN_MAX && rtn_type_names[t])
		return rtn_type_names[t];
	snprintf(buf, len, "type %u", t);
	return buf;
}

/* Pretty print the trie */
static int fib_trie_seq_show(struct seq_file *seq, void *v)
{
	const struct fib_trie_iter *iter = seq->private;
	struct key_vector *n = v;

	if (IS_TRIE(node_parent_rcu(n)))
		fib_table_print(seq, iter->tb);

	if (IS_TNODE(n)) {
		__be32 prf = htonl(n->key);

		seq_indent(seq, iter->depth-1);
		seq_printf(seq, " +-- %pI4/%zu %u %u %u\n",
			   &prf, KEYLENGTH - n->pos - n->bits, n->bits,
			   tn_info(n)->full_children,
			   tn_info(n)->empty_children);
	} else {
		__be32 val = htonl(n->key);
		struct fib_alias *fa;

		seq_indent(seq, iter->depth);
		seq_printf(seq, " |-- %pI4\n", &val);

		hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) {
			char buf1[32], buf2[32];

			seq_indent(seq, iter->depth + 1);
			seq_printf(seq, " /%zu %s %s",
				   KEYLENGTH - fa->fa_slen,
				   rtn_scope(buf1, sizeof(buf1),
					     fa->fa_info->fib_scope),
				   rtn_type(buf2, sizeof(buf2),
					    fa->fa_type));
			if (fa->fa_tos)
				seq_printf(seq, " tos=%d", fa->fa_tos);
			seq_putc(seq, '\n');
		}
	}

	return 0;
}

static const struct seq_operations fib_trie_seq_ops = {
	.start = fib_trie_seq_start,
	.next = fib_trie_seq_next,
	.stop = fib_trie_seq_stop,
	.show = fib_trie_seq_show,
};

static int fib_trie_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &fib_trie_seq_ops,
			    sizeof(struct fib_trie_iter));
}

static const struct file_operations fib_trie_fops = {
	.owner = THIS_MODULE,
	.open = fib_trie_seq_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = seq_release_net,
};

struct fib_route_iter {
	struct seq_net_private p;
	struct fib_table *main_tb;
	struct key_vector *tnode;
	loff_t pos;
	t_key key;
};

static struct key_vector *fib_route_get_idx(struct fib_route_iter *iter,
					    loff_t pos)
{
	struct key_vector *l, **tp = &iter->tnode;
	t_key key;

	/* use cached location of previously found key */
	if (iter->pos > 0 && pos >= iter->pos) {
		key = iter->key;
	} else {
		iter->pos = 1;
		key = 0;
	}

	pos -= iter->pos;

	while ((l = leaf_walk_rcu(tp, key)) && (pos-- > 0)) {
		key = l->key + 1;
		iter->pos++;
		l = NULL;

		/* handle unlikely case of a key wrap */
		if (!key)
			break;
	}

	if (l)
		iter->key = l->key;	/* remember it */
	else
		iter->pos = 0;		/* forget it */

	return l;
}
static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	struct fib_route_iter *iter = seq->private;
	struct fib_table *tb;
	struct trie *t;

	rcu_read_lock();

	tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
	if (!tb)
		return NULL;

	iter->main_tb = tb;
	t = (struct trie *)tb->tb_data;
	iter->tnode = t->kv;

	if (*pos != 0)
		return fib_route_get_idx(iter, *pos);

	iter->pos = 0;
	iter->key = KEY_MAX;

	return SEQ_START_TOKEN;
}

static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct fib_route_iter *iter = seq->private;
	struct key_vector *l = NULL;
	t_key key = iter->key + 1;

	++*pos;

	/* only allow key of 0 for start of sequence */
	if ((v == SEQ_START_TOKEN) || key)
		l = leaf_walk_rcu(&iter->tnode, key);

	if (l) {
		iter->key = l->key;
		iter->pos++;
	} else {
		iter->pos = 0;
	}

	return l;
}

static void fib_route_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}

static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
{
	unsigned int flags = 0;

	if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
		flags = RTF_REJECT;
	if (fi && fi->fib_nh->nh_gw)
		flags |= RTF_GATEWAY;
	if (mask == htonl(0xFFFFFFFF))
		flags |= RTF_HOST;
	flags |= RTF_UP;
	return flags;
}

/*
 *	This outputs /proc/net/route.
 *	The format of the file is not supposed to be changed
 *	and needs to be the same as the fib_hash output
 *	to avoid breaking legacy utilities.
 */
static int fib_route_seq_show(struct seq_file *seq, void *v)
{
	struct fib_route_iter *iter = seq->private;
	struct fib_table *tb = iter->main_tb;
	struct fib_alias *fa;
	struct key_vector *l = v;
	__be32 prefix;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
			   "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
			   "\tWindow\tIRTT");
		return 0;
	}

	prefix = htonl(l->key);

	hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
		const struct fib_info *fi = fa->fa_info;
		__be32 mask = inet_make_mask(KEYLENGTH - fa->fa_slen);
		unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);

		if ((fa->fa_type == RTN_BROADCAST) ||
		    (fa->fa_type == RTN_MULTICAST))
			continue;

		if (fa->tb_id != tb->tb_id)
			continue;

		seq_setwidth(seq, 127);

		if (fi)
			seq_printf(seq,
				   "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
				   "%d\t%08X\t%d\t%u\t%u",
				   fi->fib_dev ? fi->fib_dev->name : "*",
				   prefix,
				   fi->fib_nh->nh_gw, flags, 0, 0,
				   fi->fib_priority,
				   mask,
				   (fi->fib_advmss ?
				    fi->fib_advmss + 40 : 0),
				   fi->fib_window,
				   fi->fib_rtt >> 3);
		else
			seq_printf(seq,
				   "*\t%08X\t%08X\t%04X\t%d\t%u\t"
				   "%d\t%08X\t%d\t%u\t%u",
				   prefix, 0, flags, 0, 0, 0,
				   mask, 0, 0, 0);

		seq_pad(seq, '\n');
	}

	return 0;
}
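/* Sample /proc/net/route line as produced above, for a default route
 * via gateway 192.168.1.1 on a little-endian machine (values are
 * illustrative):
 *
 *	Iface	Destination	Gateway	Flags	...	Mask
 *	eth0	00000000	0101A8C0	0003	...	00000000
 *
 * With the uapi route flags RTF_UP (0x0001), RTF_GATEWAY (0x0002) and
 * RTF_HOST (0x0004), fib_flag_trans() reports such a route as 0x0003
 * (RTF_UP | RTF_GATEWAY), while a /32 with no gateway would show
 * 0x0005 (RTF_UP | RTF_HOST).
 */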
static const struct seq_operations fib_route_seq_ops = {
	.start = fib_route_seq_start,
	.next = fib_route_seq_next,
	.stop = fib_route_seq_stop,
	.show = fib_route_seq_show,
};

static int fib_route_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &fib_route_seq_ops,
			    sizeof(struct fib_route_iter));
}

static const struct file_operations fib_route_fops = {
	.owner = THIS_MODULE,
	.open = fib_route_seq_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = seq_release_net,
};

int __net_init fib_proc_init(struct net *net)
{
	if (!proc_create("fib_trie", S_IRUGO, net->proc_net, &fib_trie_fops))
		goto out1;

	if (!proc_create("fib_triestat", S_IRUGO, net->proc_net,
			 &fib_triestat_fops))
		goto out2;

	if (!proc_create("route", S_IRUGO, net->proc_net, &fib_route_fops))
		goto out3;

	return 0;

out3:
	remove_proc_entry("fib_triestat", net->proc_net);
out2:
	remove_proc_entry("fib_trie", net->proc_net);
out1:
	return -ENOMEM;
}

void __net_exit fib_proc_exit(struct net *net)
{
	remove_proc_entry("fib_trie", net->proc_net);
	remove_proc_entry("fib_triestat", net->proc_net);
	remove_proc_entry("route", net->proc_net);
}

#endif /* CONFIG_PROC_FS */