xref: /openbmc/linux/net/ipv4/fib_trie.c (revision b627b4ed)
1 /*
2  *   This program is free software; you can redistribute it and/or
3  *   modify it under the terms of the GNU General Public License
4  *   as published by the Free Software Foundation; either version
5  *   2 of the License, or (at your option) any later version.
6  *
7  *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8  *     & Swedish University of Agricultural Sciences.
9  *
10  *   Jens Laas <jens.laas@data.slu.se> Swedish University of
11  *     Agricultural Sciences.
12  *
13  *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
14  *
15  * This work is based on the LPC-trie which is originally descibed in:
16  *
17  * An experimental study of compression methods for dynamic tries
18  * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19  * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
20  *
21  *
22  * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23  * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24  *
25  *
26  * Code from fib_hash has been reused which includes the following header:
27  *
28  *
29  * INET		An implementation of the TCP/IP protocol suite for the LINUX
30  *		operating system.  INET is implemented using the  BSD Socket
31  *		interface as the means of communication with the user level.
32  *
33  *		IPv4 FIB: lookup engine and maintenance routines.
34  *
35  *
36  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
37  *
38  *		This program is free software; you can redistribute it and/or
39  *		modify it under the terms of the GNU General Public License
40  *		as published by the Free Software Foundation; either version
41  *		2 of the License, or (at your option) any later version.
42  *
43  * Substantial contributions to this work comes from:
44  *
45  *		David S. Miller, <davem@davemloft.net>
46  *		Stephen Hemminger <shemminger@osdl.org>
47  *		Paul E. McKenney <paulmck@us.ibm.com>
48  *		Patrick McHardy <kaber@trash.net>
49  */
50 
51 #define VERSION "0.408"
52 
53 #include <asm/uaccess.h>
54 #include <asm/system.h>
55 #include <linux/bitops.h>
56 #include <linux/types.h>
57 #include <linux/kernel.h>
58 #include <linux/mm.h>
59 #include <linux/string.h>
60 #include <linux/socket.h>
61 #include <linux/sockios.h>
62 #include <linux/errno.h>
63 #include <linux/in.h>
64 #include <linux/inet.h>
65 #include <linux/inetdevice.h>
66 #include <linux/netdevice.h>
67 #include <linux/if_arp.h>
68 #include <linux/proc_fs.h>
69 #include <linux/rcupdate.h>
70 #include <linux/skbuff.h>
71 #include <linux/netlink.h>
72 #include <linux/init.h>
73 #include <linux/list.h>
74 #include <net/net_namespace.h>
75 #include <net/ip.h>
76 #include <net/protocol.h>
77 #include <net/route.h>
78 #include <net/tcp.h>
79 #include <net/sock.h>
80 #include <net/ip_fib.h>
81 #include "fib_lookup.h"
82 
83 #define MAX_STAT_DEPTH 32
84 
85 #define KEYLENGTH (8*sizeof(t_key))
86 
87 typedef unsigned int t_key;
88 
89 #define T_TNODE 0
90 #define T_LEAF  1
91 #define NODE_TYPE_MASK	0x1UL
92 #define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
93 
94 #define IS_TNODE(n) (!(n->parent & T_LEAF))
95 #define IS_LEAF(n) (n->parent & T_LEAF)
96 
97 struct node {
98 	unsigned long parent;
99 	t_key key;
100 };
101 
102 struct leaf {
103 	unsigned long parent;
104 	t_key key;
105 	struct hlist_head list;
106 	struct rcu_head rcu;
107 };
108 
109 struct leaf_info {
110 	struct hlist_node hlist;
111 	struct rcu_head rcu;
112 	int plen;
113 	struct list_head falh;
114 };
115 
116 struct tnode {
117 	unsigned long parent;
118 	t_key key;
119 	unsigned char pos;		/* 2log(KEYLENGTH) bits needed */
120 	unsigned char bits;		/* 2log(KEYLENGTH) bits needed */
121 	unsigned int full_children;	/* KEYLENGTH bits needed */
122 	unsigned int empty_children;	/* KEYLENGTH bits needed */
123 	union {
124 		struct rcu_head rcu;
125 		struct work_struct work;
126 	};
127 	struct node *child[0];
128 };
129 
130 #ifdef CONFIG_IP_FIB_TRIE_STATS
131 struct trie_use_stats {
132 	unsigned int gets;
133 	unsigned int backtrack;
134 	unsigned int semantic_match_passed;
135 	unsigned int semantic_match_miss;
136 	unsigned int null_node_hit;
137 	unsigned int resize_node_skipped;
138 };
139 #endif
140 
141 struct trie_stat {
142 	unsigned int totdepth;
143 	unsigned int maxdepth;
144 	unsigned int tnodes;
145 	unsigned int leaves;
146 	unsigned int nullpointers;
147 	unsigned int prefixes;
148 	unsigned int nodesizes[MAX_STAT_DEPTH];
149 };
150 
151 struct trie {
152 	struct node *trie;
153 #ifdef CONFIG_IP_FIB_TRIE_STATS
154 	struct trie_use_stats stats;
155 #endif
156 };
157 
158 static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
159 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
160 				  int wasfull);
161 static struct node *resize(struct trie *t, struct tnode *tn);
162 static struct tnode *inflate(struct trie *t, struct tnode *tn);
163 static struct tnode *halve(struct trie *t, struct tnode *tn);
164 
165 static struct kmem_cache *fn_alias_kmem __read_mostly;
166 static struct kmem_cache *trie_leaf_kmem __read_mostly;
167 
168 static inline struct tnode *node_parent(struct node *node)
169 {
170 	return (struct tnode *)(node->parent & ~NODE_TYPE_MASK);
171 }
172 
173 static inline struct tnode *node_parent_rcu(struct node *node)
174 {
175 	struct tnode *ret = node_parent(node);
176 
177 	return rcu_dereference(ret);
178 }
179 
180 /* Same as rcu_assign_pointer
181  * but that macro() assumes that value is a pointer.
182  */
183 static inline void node_set_parent(struct node *node, struct tnode *ptr)
184 {
185 	smp_wmb();
186 	node->parent = (unsigned long)ptr | NODE_TYPE(node);
187 }
188 
189 static inline struct node *tnode_get_child(struct tnode *tn, unsigned int i)
190 {
191 	BUG_ON(i >= 1U << tn->bits);
192 
193 	return tn->child[i];
194 }
195 
196 static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i)
197 {
198 	struct node *ret = tnode_get_child(tn, i);
199 
200 	return rcu_dereference(ret);
201 }
202 
203 static inline int tnode_child_length(const struct tnode *tn)
204 {
205 	return 1 << tn->bits;
206 }
207 
208 static inline t_key mask_pfx(t_key k, unsigned short l)
209 {
210 	return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
211 }
212 
213 static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
214 {
215 	if (offset < KEYLENGTH)
216 		return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
217 	else
218 		return 0;
219 }
220 
221 static inline int tkey_equals(t_key a, t_key b)
222 {
223 	return a == b;
224 }
225 
226 static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
227 {
228 	if (bits == 0 || offset >= KEYLENGTH)
229 		return 1;
230 	bits = bits > KEYLENGTH ? KEYLENGTH : bits;
231 	return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
232 }
233 
234 static inline int tkey_mismatch(t_key a, int offset, t_key b)
235 {
236 	t_key diff = a ^ b;
237 	int i = offset;
238 
239 	if (!diff)
240 		return 0;
241 	while ((diff << i) >> (KEYLENGTH-1) == 0)
242 		i++;
243 	return i;
244 }
245 
246 /*
247   To understand this stuff, an understanding of keys and all their bits is
248   necessary. Every node in the trie has a key associated with it, but not
249   all of the bits in that key are significant.
250 
251   Consider a node 'n' and its parent 'tp'.
252 
253   If n is a leaf, every bit in its key is significant. Its presence is
254   necessitated by path compression, since during a tree traversal (when
255   searching for a leaf - unless we are doing an insertion) we will completely
256   ignore all skipped bits we encounter. Thus we need to verify, at the end of
257   a potentially successful search, that we have indeed been walking the
258   correct key path.
259 
260   Note that we can never "miss" the correct key in the tree if present by
261   following the wrong path. Path compression ensures that segments of the key
262   that are the same for all keys with a given prefix are skipped, but the
263   skipped part *is* identical for each node in the subtrie below the skipped
264   bit! trie_insert() in this implementation takes care of that - note the
265   call to tkey_sub_equals() in trie_insert().
266 
267   if n is an internal node - a 'tnode' here, the various parts of its key
268   have many different meanings.
269 
270   Example:
271   _________________________________________________________________
272   | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
273   -----------------------------------------------------------------
274     0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
275 
276   _________________________________________________________________
277   | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
278   -----------------------------------------------------------------
279    16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
280 
281   tp->pos = 7
282   tp->bits = 3
283   n->pos = 15
284   n->bits = 4
285 
286   First, let's just ignore the bits that come before the parent tp, that is
287   the bits from 0 to (tp->pos-1). They are *known* but at this point we do
288   not use them for anything.
289 
290   The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
291   index into the parent's child array. That is, they will be used to find
292   'n' among tp's children.
293 
294   The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
295   for the node n.
296 
297   All the bits we have seen so far are significant to the node n. The rest
298   of the bits are really not needed or indeed known in n->key.
299 
300   The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
301   n's child array, and will of course be different for each child.
302 
303 
304   The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
305   at this point.
306 
307 */
308 
309 static inline void check_tnode(const struct tnode *tn)
310 {
311 	WARN_ON(tn && tn->pos+tn->bits > 32);
312 }
313 
314 static const int halve_threshold = 25;
315 static const int inflate_threshold = 50;
316 static const int halve_threshold_root = 8;
317 static const int inflate_threshold_root = 15;
318 
319 
320 static void __alias_free_mem(struct rcu_head *head)
321 {
322 	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
323 	kmem_cache_free(fn_alias_kmem, fa);
324 }
325 
326 static inline void alias_free_mem_rcu(struct fib_alias *fa)
327 {
328 	call_rcu(&fa->rcu, __alias_free_mem);
329 }
330 
331 static void __leaf_free_rcu(struct rcu_head *head)
332 {
333 	struct leaf *l = container_of(head, struct leaf, rcu);
334 	kmem_cache_free(trie_leaf_kmem, l);
335 }
336 
337 static inline void free_leaf(struct leaf *l)
338 {
339 	call_rcu_bh(&l->rcu, __leaf_free_rcu);
340 }
341 
342 static void __leaf_info_free_rcu(struct rcu_head *head)
343 {
344 	kfree(container_of(head, struct leaf_info, rcu));
345 }
346 
347 static inline void free_leaf_info(struct leaf_info *leaf)
348 {
349 	call_rcu(&leaf->rcu, __leaf_info_free_rcu);
350 }
351 
352 static struct tnode *tnode_alloc(size_t size)
353 {
354 	if (size <= PAGE_SIZE)
355 		return kzalloc(size, GFP_KERNEL);
356 	else
357 		return __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL);
358 }
359 
360 static void __tnode_vfree(struct work_struct *arg)
361 {
362 	struct tnode *tn = container_of(arg, struct tnode, work);
363 	vfree(tn);
364 }
365 
366 static void __tnode_free_rcu(struct rcu_head *head)
367 {
368 	struct tnode *tn = container_of(head, struct tnode, rcu);
369 	size_t size = sizeof(struct tnode) +
370 		      (sizeof(struct node *) << tn->bits);
371 
372 	if (size <= PAGE_SIZE)
373 		kfree(tn);
374 	else {
375 		INIT_WORK(&tn->work, __tnode_vfree);
376 		schedule_work(&tn->work);
377 	}
378 }
379 
380 static inline void tnode_free(struct tnode *tn)
381 {
382 	if (IS_LEAF(tn))
383 		free_leaf((struct leaf *) tn);
384 	else
385 		call_rcu(&tn->rcu, __tnode_free_rcu);
386 }
387 
388 static struct leaf *leaf_new(void)
389 {
390 	struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
391 	if (l) {
392 		l->parent = T_LEAF;
393 		INIT_HLIST_HEAD(&l->list);
394 	}
395 	return l;
396 }
397 
398 static struct leaf_info *leaf_info_new(int plen)
399 {
400 	struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
401 	if (li) {
402 		li->plen = plen;
403 		INIT_LIST_HEAD(&li->falh);
404 	}
405 	return li;
406 }
407 
408 static struct tnode *tnode_new(t_key key, int pos, int bits)
409 {
410 	size_t sz = sizeof(struct tnode) + (sizeof(struct node *) << bits);
411 	struct tnode *tn = tnode_alloc(sz);
412 
413 	if (tn) {
414 		tn->parent = T_TNODE;
415 		tn->pos = pos;
416 		tn->bits = bits;
417 		tn->key = key;
418 		tn->full_children = 0;
419 		tn->empty_children = 1<<bits;
420 	}
421 
422 	pr_debug("AT %p s=%u %lu\n", tn, (unsigned int) sizeof(struct tnode),
423 		 (unsigned long) (sizeof(struct node) << bits));
424 	return tn;
425 }
426 
427 /*
428  * Check whether a tnode 'n' is "full", i.e. it is an internal node
429  * and no bits are skipped. See discussion in dyntree paper p. 6
430  */
431 
432 static inline int tnode_full(const struct tnode *tn, const struct node *n)
433 {
434 	if (n == NULL || IS_LEAF(n))
435 		return 0;
436 
437 	return ((struct tnode *) n)->pos == tn->pos + tn->bits;
438 }
439 
440 static inline void put_child(struct trie *t, struct tnode *tn, int i,
441 			     struct node *n)
442 {
443 	tnode_put_child_reorg(tn, i, n, -1);
444 }
445 
446  /*
447   * Add a child at position i overwriting the old value.
448   * Update the value of full_children and empty_children.
449   */
450 
451 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
452 				  int wasfull)
453 {
454 	struct node *chi = tn->child[i];
455 	int isfull;
456 
457 	BUG_ON(i >= 1<<tn->bits);
458 
459 	/* update emptyChildren */
460 	if (n == NULL && chi != NULL)
461 		tn->empty_children++;
462 	else if (n != NULL && chi == NULL)
463 		tn->empty_children--;
464 
465 	/* update fullChildren */
466 	if (wasfull == -1)
467 		wasfull = tnode_full(tn, chi);
468 
469 	isfull = tnode_full(tn, n);
470 	if (wasfull && !isfull)
471 		tn->full_children--;
472 	else if (!wasfull && isfull)
473 		tn->full_children++;
474 
475 	if (n)
476 		node_set_parent(n, tn);
477 
478 	rcu_assign_pointer(tn->child[i], n);
479 }
480 
481 static struct node *resize(struct trie *t, struct tnode *tn)
482 {
483 	int i;
484 	int err = 0;
485 	struct tnode *old_tn;
486 	int inflate_threshold_use;
487 	int halve_threshold_use;
488 	int max_resize;
489 
490 	if (!tn)
491 		return NULL;
492 
493 	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
494 		 tn, inflate_threshold, halve_threshold);
495 
496 	/* No children */
497 	if (tn->empty_children == tnode_child_length(tn)) {
498 		tnode_free(tn);
499 		return NULL;
500 	}
501 	/* One child */
502 	if (tn->empty_children == tnode_child_length(tn) - 1)
503 		for (i = 0; i < tnode_child_length(tn); i++) {
504 			struct node *n;
505 
506 			n = tn->child[i];
507 			if (!n)
508 				continue;
509 
510 			/* compress one level */
511 			node_set_parent(n, NULL);
512 			tnode_free(tn);
513 			return n;
514 		}
515 	/*
516 	 * Double as long as the resulting node has a number of
517 	 * nonempty nodes that are above the threshold.
518 	 */
519 
520 	/*
521 	 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
522 	 * the Helsinki University of Technology and Matti Tikkanen of Nokia
523 	 * Telecommunications, page 6:
524 	 * "A node is doubled if the ratio of non-empty children to all
525 	 * children in the *doubled* node is at least 'high'."
526 	 *
527 	 * 'high' in this instance is the variable 'inflate_threshold'. It
528 	 * is expressed as a percentage, so we multiply it with
529 	 * tnode_child_length() and instead of multiplying by 2 (since the
530 	 * child array will be doubled by inflate()) and multiplying
531 	 * the left-hand side by 100 (to handle the percentage thing) we
532 	 * multiply the left-hand side by 50.
533 	 *
534 	 * The left-hand side may look a bit weird: tnode_child_length(tn)
535 	 * - tn->empty_children is of course the number of non-null children
536 	 * in the current node. tn->full_children is the number of "full"
537 	 * children, that is non-null tnodes with a skip value of 0.
538 	 * All of those will be doubled in the resulting inflated tnode, so
539 	 * we just count them one extra time here.
540 	 *
541 	 * A clearer way to write this would be:
542 	 *
543 	 * to_be_doubled = tn->full_children;
544 	 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
545 	 *     tn->full_children;
546 	 *
547 	 * new_child_length = tnode_child_length(tn) * 2;
548 	 *
549 	 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
550 	 *      new_child_length;
551 	 * if (new_fill_factor >= inflate_threshold)
552 	 *
553 	 * ...and so on, tho it would mess up the while () loop.
554 	 *
555 	 * anyway,
556 	 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
557 	 *      inflate_threshold
558 	 *
559 	 * avoid a division:
560 	 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
561 	 *      inflate_threshold * new_child_length
562 	 *
563 	 * expand not_to_be_doubled and to_be_doubled, and shorten:
564 	 * 100 * (tnode_child_length(tn) - tn->empty_children +
565 	 *    tn->full_children) >= inflate_threshold * new_child_length
566 	 *
567 	 * expand new_child_length:
568 	 * 100 * (tnode_child_length(tn) - tn->empty_children +
569 	 *    tn->full_children) >=
570 	 *      inflate_threshold * tnode_child_length(tn) * 2
571 	 *
572 	 * shorten again:
573 	 * 50 * (tn->full_children + tnode_child_length(tn) -
574 	 *    tn->empty_children) >= inflate_threshold *
575 	 *    tnode_child_length(tn)
576 	 *
577 	 */
578 
579 	check_tnode(tn);
580 
581 	/* Keep root node larger  */
582 
583 	if (!tn->parent)
584 		inflate_threshold_use = inflate_threshold_root;
585 	else
586 		inflate_threshold_use = inflate_threshold;
587 
588 	err = 0;
589 	max_resize = 10;
590 	while ((tn->full_children > 0 &&  max_resize-- &&
591 		50 * (tn->full_children + tnode_child_length(tn)
592 		      - tn->empty_children)
593 		>= inflate_threshold_use * tnode_child_length(tn))) {
594 
595 		old_tn = tn;
596 		tn = inflate(t, tn);
597 
598 		if (IS_ERR(tn)) {
599 			tn = old_tn;
600 #ifdef CONFIG_IP_FIB_TRIE_STATS
601 			t->stats.resize_node_skipped++;
602 #endif
603 			break;
604 		}
605 	}
606 
607 	if (max_resize < 0) {
608 		if (!tn->parent)
609 			pr_warning("Fix inflate_threshold_root."
610 				   " Now=%d size=%d bits\n",
611 				   inflate_threshold_root, tn->bits);
612 		else
613 			pr_warning("Fix inflate_threshold."
614 				   " Now=%d size=%d bits\n",
615 				   inflate_threshold, tn->bits);
616 	}
617 
618 	check_tnode(tn);
619 
620 	/*
621 	 * Halve as long as the number of empty children in this
622 	 * node is above threshold.
623 	 */
624 
625 
626 	/* Keep root node larger  */
627 
628 	if (!tn->parent)
629 		halve_threshold_use = halve_threshold_root;
630 	else
631 		halve_threshold_use = halve_threshold;
632 
633 	err = 0;
634 	max_resize = 10;
635 	while (tn->bits > 1 &&  max_resize-- &&
636 	       100 * (tnode_child_length(tn) - tn->empty_children) <
637 	       halve_threshold_use * tnode_child_length(tn)) {
638 
639 		old_tn = tn;
640 		tn = halve(t, tn);
641 		if (IS_ERR(tn)) {
642 			tn = old_tn;
643 #ifdef CONFIG_IP_FIB_TRIE_STATS
644 			t->stats.resize_node_skipped++;
645 #endif
646 			break;
647 		}
648 	}
649 
650 	if (max_resize < 0) {
651 		if (!tn->parent)
652 			pr_warning("Fix halve_threshold_root."
653 				   " Now=%d size=%d bits\n",
654 				   halve_threshold_root, tn->bits);
655 		else
656 			pr_warning("Fix halve_threshold."
657 				   " Now=%d size=%d bits\n",
658 				   halve_threshold, tn->bits);
659 	}
660 
661 	/* Only one child remains */
662 	if (tn->empty_children == tnode_child_length(tn) - 1)
663 		for (i = 0; i < tnode_child_length(tn); i++) {
664 			struct node *n;
665 
666 			n = tn->child[i];
667 			if (!n)
668 				continue;
669 
670 			/* compress one level */
671 
672 			node_set_parent(n, NULL);
673 			tnode_free(tn);
674 			return n;
675 		}
676 
677 	return (struct node *) tn;
678 }
679 
680 static struct tnode *inflate(struct trie *t, struct tnode *tn)
681 {
682 	struct tnode *oldtnode = tn;
683 	int olen = tnode_child_length(tn);
684 	int i;
685 
686 	pr_debug("In inflate\n");
687 
688 	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
689 
690 	if (!tn)
691 		return ERR_PTR(-ENOMEM);
692 
693 	/*
694 	 * Preallocate and store tnodes before the actual work so we
695 	 * don't get into an inconsistent state if memory allocation
696 	 * fails. In case of failure we return the oldnode and  inflate
697 	 * of tnode is ignored.
698 	 */
699 
700 	for (i = 0; i < olen; i++) {
701 		struct tnode *inode;
702 
703 		inode = (struct tnode *) tnode_get_child(oldtnode, i);
704 		if (inode &&
705 		    IS_TNODE(inode) &&
706 		    inode->pos == oldtnode->pos + oldtnode->bits &&
707 		    inode->bits > 1) {
708 			struct tnode *left, *right;
709 			t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
710 
711 			left = tnode_new(inode->key&(~m), inode->pos + 1,
712 					 inode->bits - 1);
713 			if (!left)
714 				goto nomem;
715 
716 			right = tnode_new(inode->key|m, inode->pos + 1,
717 					  inode->bits - 1);
718 
719 			if (!right) {
720 				tnode_free(left);
721 				goto nomem;
722 			}
723 
724 			put_child(t, tn, 2*i, (struct node *) left);
725 			put_child(t, tn, 2*i+1, (struct node *) right);
726 		}
727 	}
728 
729 	for (i = 0; i < olen; i++) {
730 		struct tnode *inode;
731 		struct node *node = tnode_get_child(oldtnode, i);
732 		struct tnode *left, *right;
733 		int size, j;
734 
735 		/* An empty child */
736 		if (node == NULL)
737 			continue;
738 
739 		/* A leaf or an internal node with skipped bits */
740 
741 		if (IS_LEAF(node) || ((struct tnode *) node)->pos >
742 		   tn->pos + tn->bits - 1) {
743 			if (tkey_extract_bits(node->key,
744 					      oldtnode->pos + oldtnode->bits,
745 					      1) == 0)
746 				put_child(t, tn, 2*i, node);
747 			else
748 				put_child(t, tn, 2*i+1, node);
749 			continue;
750 		}
751 
752 		/* An internal node with two children */
753 		inode = (struct tnode *) node;
754 
755 		if (inode->bits == 1) {
756 			put_child(t, tn, 2*i, inode->child[0]);
757 			put_child(t, tn, 2*i+1, inode->child[1]);
758 
759 			tnode_free(inode);
760 			continue;
761 		}
762 
763 		/* An internal node with more than two children */
764 
765 		/* We will replace this node 'inode' with two new
766 		 * ones, 'left' and 'right', each with half of the
767 		 * original children. The two new nodes will have
768 		 * a position one bit further down the key and this
769 		 * means that the "significant" part of their keys
770 		 * (see the discussion near the top of this file)
771 		 * will differ by one bit, which will be "0" in
772 		 * left's key and "1" in right's key. Since we are
773 		 * moving the key position by one step, the bit that
774 		 * we are moving away from - the bit at position
775 		 * (inode->pos) - is the one that will differ between
776 		 * left and right. So... we synthesize that bit in the
777 		 * two  new keys.
778 		 * The mask 'm' below will be a single "one" bit at
779 		 * the position (inode->pos)
780 		 */
781 
782 		/* Use the old key, but set the new significant
783 		 *   bit to zero.
784 		 */
785 
786 		left = (struct tnode *) tnode_get_child(tn, 2*i);
787 		put_child(t, tn, 2*i, NULL);
788 
789 		BUG_ON(!left);
790 
791 		right = (struct tnode *) tnode_get_child(tn, 2*i+1);
792 		put_child(t, tn, 2*i+1, NULL);
793 
794 		BUG_ON(!right);
795 
796 		size = tnode_child_length(left);
797 		for (j = 0; j < size; j++) {
798 			put_child(t, left, j, inode->child[j]);
799 			put_child(t, right, j, inode->child[j + size]);
800 		}
801 		put_child(t, tn, 2*i, resize(t, left));
802 		put_child(t, tn, 2*i+1, resize(t, right));
803 
804 		tnode_free(inode);
805 	}
806 	tnode_free(oldtnode);
807 	return tn;
808 nomem:
809 	{
810 		int size = tnode_child_length(tn);
811 		int j;
812 
813 		for (j = 0; j < size; j++)
814 			if (tn->child[j])
815 				tnode_free((struct tnode *)tn->child[j]);
816 
817 		tnode_free(tn);
818 
819 		return ERR_PTR(-ENOMEM);
820 	}
821 }
822 
823 static struct tnode *halve(struct trie *t, struct tnode *tn)
824 {
825 	struct tnode *oldtnode = tn;
826 	struct node *left, *right;
827 	int i;
828 	int olen = tnode_child_length(tn);
829 
830 	pr_debug("In halve\n");
831 
832 	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
833 
834 	if (!tn)
835 		return ERR_PTR(-ENOMEM);
836 
837 	/*
838 	 * Preallocate and store tnodes before the actual work so we
839 	 * don't get into an inconsistent state if memory allocation
840 	 * fails. In case of failure we return the oldnode and halve
841 	 * of tnode is ignored.
842 	 */
843 
844 	for (i = 0; i < olen; i += 2) {
845 		left = tnode_get_child(oldtnode, i);
846 		right = tnode_get_child(oldtnode, i+1);
847 
848 		/* Two nonempty children */
849 		if (left && right) {
850 			struct tnode *newn;
851 
852 			newn = tnode_new(left->key, tn->pos + tn->bits, 1);
853 
854 			if (!newn)
855 				goto nomem;
856 
857 			put_child(t, tn, i/2, (struct node *)newn);
858 		}
859 
860 	}
861 
862 	for (i = 0; i < olen; i += 2) {
863 		struct tnode *newBinNode;
864 
865 		left = tnode_get_child(oldtnode, i);
866 		right = tnode_get_child(oldtnode, i+1);
867 
868 		/* At least one of the children is empty */
869 		if (left == NULL) {
870 			if (right == NULL)    /* Both are empty */
871 				continue;
872 			put_child(t, tn, i/2, right);
873 			continue;
874 		}
875 
876 		if (right == NULL) {
877 			put_child(t, tn, i/2, left);
878 			continue;
879 		}
880 
881 		/* Two nonempty children */
882 		newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
883 		put_child(t, tn, i/2, NULL);
884 		put_child(t, newBinNode, 0, left);
885 		put_child(t, newBinNode, 1, right);
886 		put_child(t, tn, i/2, resize(t, newBinNode));
887 	}
888 	tnode_free(oldtnode);
889 	return tn;
890 nomem:
891 	{
892 		int size = tnode_child_length(tn);
893 		int j;
894 
895 		for (j = 0; j < size; j++)
896 			if (tn->child[j])
897 				tnode_free((struct tnode *)tn->child[j]);
898 
899 		tnode_free(tn);
900 
901 		return ERR_PTR(-ENOMEM);
902 	}
903 }
904 
905 /* readside must use rcu_read_lock currently dump routines
906  via get_fa_head and dump */
907 
908 static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
909 {
910 	struct hlist_head *head = &l->list;
911 	struct hlist_node *node;
912 	struct leaf_info *li;
913 
914 	hlist_for_each_entry_rcu(li, node, head, hlist)
915 		if (li->plen == plen)
916 			return li;
917 
918 	return NULL;
919 }
920 
921 static inline struct list_head *get_fa_head(struct leaf *l, int plen)
922 {
923 	struct leaf_info *li = find_leaf_info(l, plen);
924 
925 	if (!li)
926 		return NULL;
927 
928 	return &li->falh;
929 }
930 
931 static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
932 {
933 	struct leaf_info *li = NULL, *last = NULL;
934 	struct hlist_node *node;
935 
936 	if (hlist_empty(head)) {
937 		hlist_add_head_rcu(&new->hlist, head);
938 	} else {
939 		hlist_for_each_entry(li, node, head, hlist) {
940 			if (new->plen > li->plen)
941 				break;
942 
943 			last = li;
944 		}
945 		if (last)
946 			hlist_add_after_rcu(&last->hlist, &new->hlist);
947 		else
948 			hlist_add_before_rcu(&new->hlist, &li->hlist);
949 	}
950 }
951 
952 /* rcu_read_lock needs to be hold by caller from readside */
953 
954 static struct leaf *
955 fib_find_node(struct trie *t, u32 key)
956 {
957 	int pos;
958 	struct tnode *tn;
959 	struct node *n;
960 
961 	pos = 0;
962 	n = rcu_dereference(t->trie);
963 
964 	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
965 		tn = (struct tnode *) n;
966 
967 		check_tnode(tn);
968 
969 		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
970 			pos = tn->pos + tn->bits;
971 			n = tnode_get_child_rcu(tn,
972 						tkey_extract_bits(key,
973 								  tn->pos,
974 								  tn->bits));
975 		} else
976 			break;
977 	}
978 	/* Case we have found a leaf. Compare prefixes */
979 
980 	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
981 		return (struct leaf *)n;
982 
983 	return NULL;
984 }
985 
986 static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
987 {
988 	int wasfull;
989 	t_key cindex, key = tn->key;
990 	struct tnode *tp;
991 
992 	while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) {
993 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
994 		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
995 		tn = (struct tnode *) resize(t, (struct tnode *)tn);
996 
997 		tnode_put_child_reorg((struct tnode *)tp, cindex,
998 				      (struct node *)tn, wasfull);
999 
1000 		tp = node_parent((struct node *) tn);
1001 		if (!tp)
1002 			break;
1003 		tn = tp;
1004 	}
1005 
1006 	/* Handle last (top) tnode */
1007 	if (IS_TNODE(tn))
1008 		tn = (struct tnode *)resize(t, (struct tnode *)tn);
1009 
1010 	return (struct node *)tn;
1011 }
1012 
1013 /* only used from updater-side */
1014 
1015 static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1016 {
1017 	int pos, newpos;
1018 	struct tnode *tp = NULL, *tn = NULL;
1019 	struct node *n;
1020 	struct leaf *l;
1021 	int missbit;
1022 	struct list_head *fa_head = NULL;
1023 	struct leaf_info *li;
1024 	t_key cindex;
1025 
1026 	pos = 0;
1027 	n = t->trie;
1028 
1029 	/* If we point to NULL, stop. Either the tree is empty and we should
1030 	 * just put a new leaf in if, or we have reached an empty child slot,
1031 	 * and we should just put our new leaf in that.
1032 	 * If we point to a T_TNODE, check if it matches our key. Note that
1033 	 * a T_TNODE might be skipping any number of bits - its 'pos' need
1034 	 * not be the parent's 'pos'+'bits'!
1035 	 *
1036 	 * If it does match the current key, get pos/bits from it, extract
1037 	 * the index from our key, push the T_TNODE and walk the tree.
1038 	 *
1039 	 * If it doesn't, we have to replace it with a new T_TNODE.
1040 	 *
1041 	 * If we point to a T_LEAF, it might or might not have the same key
1042 	 * as we do. If it does, just change the value, update the T_LEAF's
1043 	 * value, and return it.
1044 	 * If it doesn't, we need to replace it with a T_TNODE.
1045 	 */
1046 
1047 	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
1048 		tn = (struct tnode *) n;
1049 
1050 		check_tnode(tn);
1051 
1052 		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
1053 			tp = tn;
1054 			pos = tn->pos + tn->bits;
1055 			n = tnode_get_child(tn,
1056 					    tkey_extract_bits(key,
1057 							      tn->pos,
1058 							      tn->bits));
1059 
1060 			BUG_ON(n && node_parent(n) != tn);
1061 		} else
1062 			break;
1063 	}
1064 
1065 	/*
1066 	 * n  ----> NULL, LEAF or TNODE
1067 	 *
1068 	 * tp is n's (parent) ----> NULL or TNODE
1069 	 */
1070 
1071 	BUG_ON(tp && IS_LEAF(tp));
1072 
1073 	/* Case 1: n is a leaf. Compare prefixes */
1074 
1075 	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1076 		l = (struct leaf *) n;
1077 		li = leaf_info_new(plen);
1078 
1079 		if (!li)
1080 			return NULL;
1081 
1082 		fa_head = &li->falh;
1083 		insert_leaf_info(&l->list, li);
1084 		goto done;
1085 	}
1086 	l = leaf_new();
1087 
1088 	if (!l)
1089 		return NULL;
1090 
1091 	l->key = key;
1092 	li = leaf_info_new(plen);
1093 
1094 	if (!li) {
1095 		free_leaf(l);
1096 		return NULL;
1097 	}
1098 
1099 	fa_head = &li->falh;
1100 	insert_leaf_info(&l->list, li);
1101 
1102 	if (t->trie && n == NULL) {
1103 		/* Case 2: n is NULL, and will just insert a new leaf */
1104 
1105 		node_set_parent((struct node *)l, tp);
1106 
1107 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1108 		put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
1109 	} else {
1110 		/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1111 		/*
1112 		 *  Add a new tnode here
1113 		 *  first tnode need some special handling
1114 		 */
1115 
1116 		if (tp)
1117 			pos = tp->pos+tp->bits;
1118 		else
1119 			pos = 0;
1120 
1121 		if (n) {
1122 			newpos = tkey_mismatch(key, pos, n->key);
1123 			tn = tnode_new(n->key, newpos, 1);
1124 		} else {
1125 			newpos = 0;
1126 			tn = tnode_new(key, newpos, 1); /* First tnode */
1127 		}
1128 
1129 		if (!tn) {
1130 			free_leaf_info(li);
1131 			free_leaf(l);
1132 			return NULL;
1133 		}
1134 
1135 		node_set_parent((struct node *)tn, tp);
1136 
1137 		missbit = tkey_extract_bits(key, newpos, 1);
1138 		put_child(t, tn, missbit, (struct node *)l);
1139 		put_child(t, tn, 1-missbit, n);
1140 
1141 		if (tp) {
1142 			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1143 			put_child(t, (struct tnode *)tp, cindex,
1144 				  (struct node *)tn);
1145 		} else {
1146 			rcu_assign_pointer(t->trie, (struct node *)tn);
1147 			tp = tn;
1148 		}
1149 	}
1150 
1151 	if (tp && tp->pos + tp->bits > 32)
1152 		pr_warning("fib_trie"
1153 			   " tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1154 			   tp, tp->pos, tp->bits, key, plen);
1155 
1156 	/* Rebalance the trie */
1157 
1158 	rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
1159 done:
1160 	return fa_head;
1161 }
1162 
1163 /*
1164  * Caller must hold RTNL.
1165  */
1166 static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg)
1167 {
1168 	struct trie *t = (struct trie *) tb->tb_data;
1169 	struct fib_alias *fa, *new_fa;
1170 	struct list_head *fa_head = NULL;
1171 	struct fib_info *fi;
1172 	int plen = cfg->fc_dst_len;
1173 	u8 tos = cfg->fc_tos;
1174 	u32 key, mask;
1175 	int err;
1176 	struct leaf *l;
1177 
1178 	if (plen > 32)
1179 		return -EINVAL;
1180 
1181 	key = ntohl(cfg->fc_dst);
1182 
1183 	pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
1184 
1185 	mask = ntohl(inet_make_mask(plen));
1186 
1187 	if (key & ~mask)
1188 		return -EINVAL;
1189 
1190 	key = key & mask;
1191 
1192 	fi = fib_create_info(cfg);
1193 	if (IS_ERR(fi)) {
1194 		err = PTR_ERR(fi);
1195 		goto err;
1196 	}
1197 
1198 	l = fib_find_node(t, key);
1199 	fa = NULL;
1200 
1201 	if (l) {
1202 		fa_head = get_fa_head(l, plen);
1203 		fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1204 	}
1205 
1206 	/* Now fa, if non-NULL, points to the first fib alias
1207 	 * with the same keys [prefix,tos,priority], if such key already
1208 	 * exists or to the node before which we will insert new one.
1209 	 *
1210 	 * If fa is NULL, we will need to allocate a new one and
1211 	 * insert to the head of f.
1212 	 *
1213 	 * If f is NULL, no fib node matched the destination key
1214 	 * and we need to allocate a new one of those as well.
1215 	 */
1216 
1217 	if (fa && fa->fa_tos == tos &&
1218 	    fa->fa_info->fib_priority == fi->fib_priority) {
1219 		struct fib_alias *fa_first, *fa_match;
1220 
1221 		err = -EEXIST;
1222 		if (cfg->fc_nlflags & NLM_F_EXCL)
1223 			goto out;
1224 
1225 		/* We have 2 goals:
1226 		 * 1. Find exact match for type, scope, fib_info to avoid
1227 		 * duplicate routes
1228 		 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
1229 		 */
1230 		fa_match = NULL;
1231 		fa_first = fa;
1232 		fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1233 		list_for_each_entry_continue(fa, fa_head, fa_list) {
1234 			if (fa->fa_tos != tos)
1235 				break;
1236 			if (fa->fa_info->fib_priority != fi->fib_priority)
1237 				break;
1238 			if (fa->fa_type == cfg->fc_type &&
1239 			    fa->fa_scope == cfg->fc_scope &&
1240 			    fa->fa_info == fi) {
1241 				fa_match = fa;
1242 				break;
1243 			}
1244 		}
1245 
1246 		if (cfg->fc_nlflags & NLM_F_REPLACE) {
1247 			struct fib_info *fi_drop;
1248 			u8 state;
1249 
1250 			fa = fa_first;
1251 			if (fa_match) {
1252 				if (fa == fa_match)
1253 					err = 0;
1254 				goto out;
1255 			}
1256 			err = -ENOBUFS;
1257 			new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1258 			if (new_fa == NULL)
1259 				goto out;
1260 
1261 			fi_drop = fa->fa_info;
1262 			new_fa->fa_tos = fa->fa_tos;
1263 			new_fa->fa_info = fi;
1264 			new_fa->fa_type = cfg->fc_type;
1265 			new_fa->fa_scope = cfg->fc_scope;
1266 			state = fa->fa_state;
1267 			new_fa->fa_state = state & ~FA_S_ACCESSED;
1268 
1269 			list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1270 			alias_free_mem_rcu(fa);
1271 
1272 			fib_release_info(fi_drop);
1273 			if (state & FA_S_ACCESSED)
1274 				rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1275 			rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1276 				tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
1277 
1278 			goto succeeded;
1279 		}
1280 		/* Error if we find a perfect match which
1281 		 * uses the same scope, type, and nexthop
1282 		 * information.
1283 		 */
1284 		if (fa_match)
1285 			goto out;
1286 
1287 		if (!(cfg->fc_nlflags & NLM_F_APPEND))
1288 			fa = fa_first;
1289 	}
1290 	err = -ENOENT;
1291 	if (!(cfg->fc_nlflags & NLM_F_CREATE))
1292 		goto out;
1293 
1294 	err = -ENOBUFS;
1295 	new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1296 	if (new_fa == NULL)
1297 		goto out;
1298 
1299 	new_fa->fa_info = fi;
1300 	new_fa->fa_tos = tos;
1301 	new_fa->fa_type = cfg->fc_type;
1302 	new_fa->fa_scope = cfg->fc_scope;
1303 	new_fa->fa_state = 0;
1304 	/*
1305 	 * Insert new entry to the list.
1306 	 */
1307 
1308 	if (!fa_head) {
1309 		fa_head = fib_insert_node(t, key, plen);
1310 		if (unlikely(!fa_head)) {
1311 			err = -ENOMEM;
1312 			goto out_free_new_fa;
1313 		}
1314 	}
1315 
1316 	list_add_tail_rcu(&new_fa->fa_list,
1317 			  (fa ? &fa->fa_list : fa_head));
1318 
1319 	rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1320 	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
1321 		  &cfg->fc_nlinfo, 0);
1322 succeeded:
1323 	return 0;
1324 
1325 out_free_new_fa:
1326 	kmem_cache_free(fn_alias_kmem, new_fa);
1327 out:
1328 	fib_release_info(fi);
1329 err:
1330 	return err;
1331 }
1332 
1333 /* should be called with rcu_read_lock */
1334 static int check_leaf(struct trie *t, struct leaf *l,
1335 		      t_key key,  const struct flowi *flp,
1336 		      struct fib_result *res)
1337 {
1338 	struct leaf_info *li;
1339 	struct hlist_head *hhead = &l->list;
1340 	struct hlist_node *node;
1341 
1342 	hlist_for_each_entry_rcu(li, node, hhead, hlist) {
1343 		int err;
1344 		int plen = li->plen;
1345 		__be32 mask = inet_make_mask(plen);
1346 
1347 		if (l->key != (key & ntohl(mask)))
1348 			continue;
1349 
1350 		err = fib_semantic_match(&li->falh, flp, res,
1351 					 htonl(l->key), mask, plen);
1352 
1353 #ifdef CONFIG_IP_FIB_TRIE_STATS
1354 		if (err <= 0)
1355 			t->stats.semantic_match_passed++;
1356 		else
1357 			t->stats.semantic_match_miss++;
1358 #endif
1359 		if (err <= 0)
1360 			return err;
1361 	}
1362 
1363 	return 1;
1364 }
1365 
1366 static int fn_trie_lookup(struct fib_table *tb, const struct flowi *flp,
1367 			  struct fib_result *res)
1368 {
1369 	struct trie *t = (struct trie *) tb->tb_data;
1370 	int ret;
1371 	struct node *n;
1372 	struct tnode *pn;
1373 	int pos, bits;
1374 	t_key key = ntohl(flp->fl4_dst);
1375 	int chopped_off;
1376 	t_key cindex = 0;
1377 	int current_prefix_length = KEYLENGTH;
1378 	struct tnode *cn;
1379 	t_key node_prefix, key_prefix, pref_mismatch;
1380 	int mp;
1381 
1382 	rcu_read_lock();
1383 
1384 	n = rcu_dereference(t->trie);
1385 	if (!n)
1386 		goto failed;
1387 
1388 #ifdef CONFIG_IP_FIB_TRIE_STATS
1389 	t->stats.gets++;
1390 #endif
1391 
1392 	/* Just a leaf? */
1393 	if (IS_LEAF(n)) {
1394 		ret = check_leaf(t, (struct leaf *)n, key, flp, res);
1395 		goto found;
1396 	}
1397 
1398 	pn = (struct tnode *) n;
1399 	chopped_off = 0;
1400 
1401 	while (pn) {
1402 		pos = pn->pos;
1403 		bits = pn->bits;
1404 
1405 		if (!chopped_off)
1406 			cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
1407 						   pos, bits);
1408 
1409 		n = tnode_get_child(pn, cindex);
1410 
1411 		if (n == NULL) {
1412 #ifdef CONFIG_IP_FIB_TRIE_STATS
1413 			t->stats.null_node_hit++;
1414 #endif
1415 			goto backtrace;
1416 		}
1417 
1418 		if (IS_LEAF(n)) {
1419 			ret = check_leaf(t, (struct leaf *)n, key, flp, res);
1420 			if (ret > 0)
1421 				goto backtrace;
1422 			goto found;
1423 		}
1424 
1425 		cn = (struct tnode *)n;
1426 
1427 		/*
1428 		 * It's a tnode, and we can do some extra checks here if we
1429 		 * like, to avoid descending into a dead-end branch.
1430 		 * This tnode is in the parent's child array at index
1431 		 * key[p_pos..p_pos+p_bits] but potentially with some bits
1432 		 * chopped off, so in reality the index may be just a
1433 		 * subprefix, padded with zero at the end.
1434 		 * We can also take a look at any skipped bits in this
1435 		 * tnode - everything up to p_pos is supposed to be ok,
1436 		 * and the non-chopped bits of the index (se previous
1437 		 * paragraph) are also guaranteed ok, but the rest is
1438 		 * considered unknown.
1439 		 *
1440 		 * The skipped bits are key[pos+bits..cn->pos].
1441 		 */
1442 
1443 		/* If current_prefix_length < pos+bits, we are already doing
1444 		 * actual prefix  matching, which means everything from
1445 		 * pos+(bits-chopped_off) onward must be zero along some
1446 		 * branch of this subtree - otherwise there is *no* valid
1447 		 * prefix present. Here we can only check the skipped
1448 		 * bits. Remember, since we have already indexed into the
1449 		 * parent's child array, we know that the bits we chopped of
1450 		 * *are* zero.
1451 		 */
1452 
1453 		/* NOTA BENE: Checking only skipped bits
1454 		   for the new node here */
1455 
1456 		if (current_prefix_length < pos+bits) {
1457 			if (tkey_extract_bits(cn->key, current_prefix_length,
1458 						cn->pos - current_prefix_length)
1459 			    || !(cn->child[0]))
1460 				goto backtrace;
1461 		}
1462 
1463 		/*
1464 		 * If chopped_off=0, the index is fully validated and we
1465 		 * only need to look at the skipped bits for this, the new,
1466 		 * tnode. What we actually want to do is to find out if
1467 		 * these skipped bits match our key perfectly, or if we will
1468 		 * have to count on finding a matching prefix further down,
1469 		 * because if we do, we would like to have some way of
1470 		 * verifying the existence of such a prefix at this point.
1471 		 */
1472 
1473 		/* The only thing we can do at this point is to verify that
1474 		 * any such matching prefix can indeed be a prefix to our
1475 		 * key, and if the bits in the node we are inspecting that
1476 		 * do not match our key are not ZERO, this cannot be true.
1477 		 * Thus, find out where there is a mismatch (before cn->pos)
1478 		 * and verify that all the mismatching bits are zero in the
1479 		 * new tnode's key.
1480 		 */
1481 
1482 		/*
1483 		 * Note: We aren't very concerned about the piece of
1484 		 * the key that precede pn->pos+pn->bits, since these
1485 		 * have already been checked. The bits after cn->pos
1486 		 * aren't checked since these are by definition
1487 		 * "unknown" at this point. Thus, what we want to see
1488 		 * is if we are about to enter the "prefix matching"
1489 		 * state, and in that case verify that the skipped
1490 		 * bits that will prevail throughout this subtree are
1491 		 * zero, as they have to be if we are to find a
1492 		 * matching prefix.
1493 		 */
1494 
1495 		node_prefix = mask_pfx(cn->key, cn->pos);
1496 		key_prefix = mask_pfx(key, cn->pos);
1497 		pref_mismatch = key_prefix^node_prefix;
1498 		mp = 0;
1499 
1500 		/*
1501 		 * In short: If skipped bits in this node do not match
1502 		 * the search key, enter the "prefix matching"
1503 		 * state.directly.
1504 		 */
1505 		if (pref_mismatch) {
1506 			while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1507 				mp++;
1508 				pref_mismatch = pref_mismatch << 1;
1509 			}
1510 			key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1511 
1512 			if (key_prefix != 0)
1513 				goto backtrace;
1514 
1515 			if (current_prefix_length >= cn->pos)
1516 				current_prefix_length = mp;
1517 		}
1518 
1519 		pn = (struct tnode *)n; /* Descend */
1520 		chopped_off = 0;
1521 		continue;
1522 
1523 backtrace:
1524 		chopped_off++;
1525 
1526 		/* As zero don't change the child key (cindex) */
1527 		while ((chopped_off <= pn->bits)
1528 		       && !(cindex & (1<<(chopped_off-1))))
1529 			chopped_off++;
1530 
1531 		/* Decrease current_... with bits chopped off */
1532 		if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1533 			current_prefix_length = pn->pos + pn->bits
1534 				- chopped_off;
1535 
1536 		/*
1537 		 * Either we do the actual chop off according or if we have
1538 		 * chopped off all bits in this tnode walk up to our parent.
1539 		 */
1540 
1541 		if (chopped_off <= pn->bits) {
1542 			cindex &= ~(1 << (chopped_off-1));
1543 		} else {
1544 			struct tnode *parent = node_parent((struct node *) pn);
1545 			if (!parent)
1546 				goto failed;
1547 
1548 			/* Get Child's index */
1549 			cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
1550 			pn = parent;
1551 			chopped_off = 0;
1552 
1553 #ifdef CONFIG_IP_FIB_TRIE_STATS
1554 			t->stats.backtrack++;
1555 #endif
1556 			goto backtrace;
1557 		}
1558 	}
1559 failed:
1560 	ret = 1;
1561 found:
1562 	rcu_read_unlock();
1563 	return ret;
1564 }
1565 
1566 /*
1567  * Remove the leaf and return parent.
1568  */
1569 static void trie_leaf_remove(struct trie *t, struct leaf *l)
1570 {
1571 	struct tnode *tp = node_parent((struct node *) l);
1572 
1573 	pr_debug("entering trie_leaf_remove(%p)\n", l);
1574 
1575 	if (tp) {
1576 		t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
1577 		put_child(t, (struct tnode *)tp, cindex, NULL);
1578 		rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
1579 	} else
1580 		rcu_assign_pointer(t->trie, NULL);
1581 
1582 	free_leaf(l);
1583 }
1584 
1585 /*
1586  * Caller must hold RTNL.
1587  */
1588 static int fn_trie_delete(struct fib_table *tb, struct fib_config *cfg)
1589 {
1590 	struct trie *t = (struct trie *) tb->tb_data;
1591 	u32 key, mask;
1592 	int plen = cfg->fc_dst_len;
1593 	u8 tos = cfg->fc_tos;
1594 	struct fib_alias *fa, *fa_to_delete;
1595 	struct list_head *fa_head;
1596 	struct leaf *l;
1597 	struct leaf_info *li;
1598 
1599 	if (plen > 32)
1600 		return -EINVAL;
1601 
1602 	key = ntohl(cfg->fc_dst);
1603 	mask = ntohl(inet_make_mask(plen));
1604 
1605 	if (key & ~mask)
1606 		return -EINVAL;
1607 
1608 	key = key & mask;
1609 	l = fib_find_node(t, key);
1610 
1611 	if (!l)
1612 		return -ESRCH;
1613 
1614 	fa_head = get_fa_head(l, plen);
1615 	fa = fib_find_alias(fa_head, tos, 0);
1616 
1617 	if (!fa)
1618 		return -ESRCH;
1619 
1620 	pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1621 
1622 	fa_to_delete = NULL;
1623 	fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1624 	list_for_each_entry_continue(fa, fa_head, fa_list) {
1625 		struct fib_info *fi = fa->fa_info;
1626 
1627 		if (fa->fa_tos != tos)
1628 			break;
1629 
1630 		if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1631 		    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1632 		     fa->fa_scope == cfg->fc_scope) &&
1633 		    (!cfg->fc_protocol ||
1634 		     fi->fib_protocol == cfg->fc_protocol) &&
1635 		    fib_nh_match(cfg, fi) == 0) {
1636 			fa_to_delete = fa;
1637 			break;
1638 		}
1639 	}
1640 
1641 	if (!fa_to_delete)
1642 		return -ESRCH;
1643 
1644 	fa = fa_to_delete;
1645 	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
1646 		  &cfg->fc_nlinfo, 0);
1647 
1648 	l = fib_find_node(t, key);
1649 	li = find_leaf_info(l, plen);
1650 
1651 	list_del_rcu(&fa->fa_list);
1652 
1653 	if (list_empty(fa_head)) {
1654 		hlist_del_rcu(&li->hlist);
1655 		free_leaf_info(li);
1656 	}
1657 
1658 	if (hlist_empty(&l->list))
1659 		trie_leaf_remove(t, l);
1660 
1661 	if (fa->fa_state & FA_S_ACCESSED)
1662 		rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1663 
1664 	fib_release_info(fa->fa_info);
1665 	alias_free_mem_rcu(fa);
1666 	return 0;
1667 }
1668 
1669 static int trie_flush_list(struct list_head *head)
1670 {
1671 	struct fib_alias *fa, *fa_node;
1672 	int found = 0;
1673 
1674 	list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1675 		struct fib_info *fi = fa->fa_info;
1676 
1677 		if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1678 			list_del_rcu(&fa->fa_list);
1679 			fib_release_info(fa->fa_info);
1680 			alias_free_mem_rcu(fa);
1681 			found++;
1682 		}
1683 	}
1684 	return found;
1685 }
1686 
1687 static int trie_flush_leaf(struct leaf *l)
1688 {
1689 	int found = 0;
1690 	struct hlist_head *lih = &l->list;
1691 	struct hlist_node *node, *tmp;
1692 	struct leaf_info *li = NULL;
1693 
1694 	hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1695 		found += trie_flush_list(&li->falh);
1696 
1697 		if (list_empty(&li->falh)) {
1698 			hlist_del_rcu(&li->hlist);
1699 			free_leaf_info(li);
1700 		}
1701 	}
1702 	return found;
1703 }
1704 
1705 /*
1706  * Scan for the next right leaf starting at node p->child[idx]
1707  * Since we have back pointer, no recursion necessary.
1708  */
1709 static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c)
1710 {
1711 	do {
1712 		t_key idx;
1713 
1714 		if (c)
1715 			idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
1716 		else
1717 			idx = 0;
1718 
1719 		while (idx < 1u << p->bits) {
1720 			c = tnode_get_child_rcu(p, idx++);
1721 			if (!c)
1722 				continue;
1723 
1724 			if (IS_LEAF(c)) {
1725 				prefetch(p->child[idx]);
1726 				return (struct leaf *) c;
1727 			}
1728 
1729 			/* Rescan start scanning in new node */
1730 			p = (struct tnode *) c;
1731 			idx = 0;
1732 		}
1733 
1734 		/* Node empty, walk back up to parent */
1735 		c = (struct node *) p;
1736 	} while ( (p = node_parent_rcu(c)) != NULL);
1737 
1738 	return NULL; /* Root of trie */
1739 }
1740 
1741 static struct leaf *trie_firstleaf(struct trie *t)
1742 {
1743 	struct tnode *n = (struct tnode *) rcu_dereference(t->trie);
1744 
1745 	if (!n)
1746 		return NULL;
1747 
1748 	if (IS_LEAF(n))          /* trie is just a leaf */
1749 		return (struct leaf *) n;
1750 
1751 	return leaf_walk_rcu(n, NULL);
1752 }
1753 
1754 static struct leaf *trie_nextleaf(struct leaf *l)
1755 {
1756 	struct node *c = (struct node *) l;
1757 	struct tnode *p = node_parent(c);
1758 
1759 	if (!p)
1760 		return NULL;	/* trie with just one leaf */
1761 
1762 	return leaf_walk_rcu(p, c);
1763 }
1764 
1765 static struct leaf *trie_leafindex(struct trie *t, int index)
1766 {
1767 	struct leaf *l = trie_firstleaf(t);
1768 
1769 	while (l && index-- > 0)
1770 		l = trie_nextleaf(l);
1771 
1772 	return l;
1773 }
1774 
1775 
1776 /*
1777  * Caller must hold RTNL.
1778  */
1779 static int fn_trie_flush(struct fib_table *tb)
1780 {
1781 	struct trie *t = (struct trie *) tb->tb_data;
1782 	struct leaf *l, *ll = NULL;
1783 	int found = 0;
1784 
1785 	for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
1786 		found += trie_flush_leaf(l);
1787 
1788 		if (ll && hlist_empty(&ll->list))
1789 			trie_leaf_remove(t, ll);
1790 		ll = l;
1791 	}
1792 
1793 	if (ll && hlist_empty(&ll->list))
1794 		trie_leaf_remove(t, ll);
1795 
1796 	pr_debug("trie_flush found=%d\n", found);
1797 	return found;
1798 }
1799 
1800 static void fn_trie_select_default(struct fib_table *tb,
1801 				   const struct flowi *flp,
1802 				   struct fib_result *res)
1803 {
1804 	struct trie *t = (struct trie *) tb->tb_data;
1805 	int order, last_idx;
1806 	struct fib_info *fi = NULL;
1807 	struct fib_info *last_resort;
1808 	struct fib_alias *fa = NULL;
1809 	struct list_head *fa_head;
1810 	struct leaf *l;
1811 
1812 	last_idx = -1;
1813 	last_resort = NULL;
1814 	order = -1;
1815 
1816 	rcu_read_lock();
1817 
1818 	l = fib_find_node(t, 0);
1819 	if (!l)
1820 		goto out;
1821 
1822 	fa_head = get_fa_head(l, 0);
1823 	if (!fa_head)
1824 		goto out;
1825 
1826 	if (list_empty(fa_head))
1827 		goto out;
1828 
1829 	list_for_each_entry_rcu(fa, fa_head, fa_list) {
1830 		struct fib_info *next_fi = fa->fa_info;
1831 
1832 		if (fa->fa_scope != res->scope ||
1833 		    fa->fa_type != RTN_UNICAST)
1834 			continue;
1835 
1836 		if (next_fi->fib_priority > res->fi->fib_priority)
1837 			break;
1838 		if (!next_fi->fib_nh[0].nh_gw ||
1839 		    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1840 			continue;
1841 		fa->fa_state |= FA_S_ACCESSED;
1842 
1843 		if (fi == NULL) {
1844 			if (next_fi != res->fi)
1845 				break;
1846 		} else if (!fib_detect_death(fi, order, &last_resort,
1847 					     &last_idx, tb->tb_default)) {
1848 			fib_result_assign(res, fi);
1849 			tb->tb_default = order;
1850 			goto out;
1851 		}
1852 		fi = next_fi;
1853 		order++;
1854 	}
1855 	if (order <= 0 || fi == NULL) {
1856 		tb->tb_default = -1;
1857 		goto out;
1858 	}
1859 
1860 	if (!fib_detect_death(fi, order, &last_resort, &last_idx,
1861 				tb->tb_default)) {
1862 		fib_result_assign(res, fi);
1863 		tb->tb_default = order;
1864 		goto out;
1865 	}
1866 	if (last_idx >= 0)
1867 		fib_result_assign(res, last_resort);
1868 	tb->tb_default = last_idx;
1869 out:
1870 	rcu_read_unlock();
1871 }
1872 
1873 static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
1874 			   struct fib_table *tb,
1875 			   struct sk_buff *skb, struct netlink_callback *cb)
1876 {
1877 	int i, s_i;
1878 	struct fib_alias *fa;
1879 	__be32 xkey = htonl(key);
1880 
1881 	s_i = cb->args[5];
1882 	i = 0;
1883 
1884 	/* rcu_read_lock is hold by caller */
1885 
1886 	list_for_each_entry_rcu(fa, fah, fa_list) {
1887 		if (i < s_i) {
1888 			i++;
1889 			continue;
1890 		}
1891 
1892 		if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1893 				  cb->nlh->nlmsg_seq,
1894 				  RTM_NEWROUTE,
1895 				  tb->tb_id,
1896 				  fa->fa_type,
1897 				  fa->fa_scope,
1898 				  xkey,
1899 				  plen,
1900 				  fa->fa_tos,
1901 				  fa->fa_info, NLM_F_MULTI) < 0) {
1902 			cb->args[5] = i;
1903 			return -1;
1904 		}
1905 		i++;
1906 	}
1907 	cb->args[5] = i;
1908 	return skb->len;
1909 }
1910 
1911 static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
1912 			struct sk_buff *skb, struct netlink_callback *cb)
1913 {
1914 	struct leaf_info *li;
1915 	struct hlist_node *node;
1916 	int i, s_i;
1917 
1918 	s_i = cb->args[4];
1919 	i = 0;
1920 
1921 	/* rcu_read_lock is hold by caller */
1922 	hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
1923 		if (i < s_i) {
1924 			i++;
1925 			continue;
1926 		}
1927 
1928 		if (i > s_i)
1929 			cb->args[5] = 0;
1930 
1931 		if (list_empty(&li->falh))
1932 			continue;
1933 
1934 		if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
1935 			cb->args[4] = i;
1936 			return -1;
1937 		}
1938 		i++;
1939 	}
1940 
1941 	cb->args[4] = i;
1942 	return skb->len;
1943 }
1944 
1945 static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb,
1946 			struct netlink_callback *cb)
1947 {
1948 	struct leaf *l;
1949 	struct trie *t = (struct trie *) tb->tb_data;
1950 	t_key key = cb->args[2];
1951 	int count = cb->args[3];
1952 
1953 	rcu_read_lock();
1954 	/* Dump starting at last key.
1955 	 * Note: 0.0.0.0/0 (ie default) is first key.
1956 	 */
1957 	if (count == 0)
1958 		l = trie_firstleaf(t);
1959 	else {
1960 		/* Normally, continue from last key, but if that is missing
1961 		 * fallback to using slow rescan
1962 		 */
1963 		l = fib_find_node(t, key);
1964 		if (!l)
1965 			l = trie_leafindex(t, count);
1966 	}
1967 
1968 	while (l) {
1969 		cb->args[2] = l->key;
1970 		if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
1971 			cb->args[3] = count;
1972 			rcu_read_unlock();
1973 			return -1;
1974 		}
1975 
1976 		++count;
1977 		l = trie_nextleaf(l);
1978 		memset(&cb->args[4], 0,
1979 		       sizeof(cb->args) - 4*sizeof(cb->args[0]));
1980 	}
1981 	cb->args[3] = count;
1982 	rcu_read_unlock();
1983 
1984 	return skb->len;
1985 }
1986 
1987 void __init fib_hash_init(void)
1988 {
1989 	fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1990 					  sizeof(struct fib_alias),
1991 					  0, SLAB_PANIC, NULL);
1992 
1993 	trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
1994 					   max(sizeof(struct leaf),
1995 					       sizeof(struct leaf_info)),
1996 					   0, SLAB_PANIC, NULL);
1997 }
1998 
1999 
2000 /* Fix more generic FIB names for init later */
2001 struct fib_table *fib_hash_table(u32 id)
2002 {
2003 	struct fib_table *tb;
2004 	struct trie *t;
2005 
2006 	tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
2007 		     GFP_KERNEL);
2008 	if (tb == NULL)
2009 		return NULL;
2010 
2011 	tb->tb_id = id;
2012 	tb->tb_default = -1;
2013 	tb->tb_lookup = fn_trie_lookup;
2014 	tb->tb_insert = fn_trie_insert;
2015 	tb->tb_delete = fn_trie_delete;
2016 	tb->tb_flush = fn_trie_flush;
2017 	tb->tb_select_default = fn_trie_select_default;
2018 	tb->tb_dump = fn_trie_dump;
2019 
2020 	t = (struct trie *) tb->tb_data;
2021 	memset(t, 0, sizeof(*t));
2022 
2023 	if (id == RT_TABLE_LOCAL)
2024 		pr_info("IPv4 FIB: Using LC-trie version %s\n", VERSION);
2025 
2026 	return tb;
2027 }
2028 
2029 #ifdef CONFIG_PROC_FS
2030 /* Depth first Trie walk iterator */
2031 struct fib_trie_iter {
2032 	struct seq_net_private p;
2033 	struct fib_table *tb;
2034 	struct tnode *tnode;
2035 	unsigned index;
2036 	unsigned depth;
2037 };
2038 
2039 static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
2040 {
2041 	struct tnode *tn = iter->tnode;
2042 	unsigned cindex = iter->index;
2043 	struct tnode *p;
2044 
2045 	/* A single entry routing table */
2046 	if (!tn)
2047 		return NULL;
2048 
2049 	pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
2050 		 iter->tnode, iter->index, iter->depth);
2051 rescan:
2052 	while (cindex < (1<<tn->bits)) {
2053 		struct node *n = tnode_get_child_rcu(tn, cindex);
2054 
2055 		if (n) {
2056 			if (IS_LEAF(n)) {
2057 				iter->tnode = tn;
2058 				iter->index = cindex + 1;
2059 			} else {
2060 				/* push down one level */
2061 				iter->tnode = (struct tnode *) n;
2062 				iter->index = 0;
2063 				++iter->depth;
2064 			}
2065 			return n;
2066 		}
2067 
2068 		++cindex;
2069 	}
2070 
2071 	/* Current node exhausted, pop back up */
2072 	p = node_parent_rcu((struct node *)tn);
2073 	if (p) {
2074 		cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
2075 		tn = p;
2076 		--iter->depth;
2077 		goto rescan;
2078 	}
2079 
2080 	/* got root? */
2081 	return NULL;
2082 }
2083 
2084 static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
2085 				       struct trie *t)
2086 {
2087 	struct node *n;
2088 
2089 	if (!t)
2090 		return NULL;
2091 
2092 	n = rcu_dereference(t->trie);
2093 	if (!n)
2094 		return NULL;
2095 
2096 	if (IS_TNODE(n)) {
2097 		iter->tnode = (struct tnode *) n;
2098 		iter->index = 0;
2099 		iter->depth = 1;
2100 	} else {
2101 		iter->tnode = NULL;
2102 		iter->index = 0;
2103 		iter->depth = 0;
2104 	}
2105 
2106 	return n;
2107 }
2108 
2109 static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2110 {
2111 	struct node *n;
2112 	struct fib_trie_iter iter;
2113 
2114 	memset(s, 0, sizeof(*s));
2115 
2116 	rcu_read_lock();
2117 	for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
2118 		if (IS_LEAF(n)) {
2119 			struct leaf *l = (struct leaf *)n;
2120 			struct leaf_info *li;
2121 			struct hlist_node *tmp;
2122 
2123 			s->leaves++;
2124 			s->totdepth += iter.depth;
2125 			if (iter.depth > s->maxdepth)
2126 				s->maxdepth = iter.depth;
2127 
2128 			hlist_for_each_entry_rcu(li, tmp, &l->list, hlist)
2129 				++s->prefixes;
2130 		} else {
2131 			const struct tnode *tn = (const struct tnode *) n;
2132 			int i;
2133 
2134 			s->tnodes++;
2135 			if (tn->bits < MAX_STAT_DEPTH)
2136 				s->nodesizes[tn->bits]++;
2137 
2138 			for (i = 0; i < (1<<tn->bits); i++)
2139 				if (!tn->child[i])
2140 					s->nullpointers++;
2141 		}
2142 	}
2143 	rcu_read_unlock();
2144 }
2145 
2146 /*
2147  *	This outputs /proc/net/fib_triestats
2148  */
2149 static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2150 {
2151 	unsigned i, max, pointers, bytes, avdepth;
2152 
2153 	if (stat->leaves)
2154 		avdepth = stat->totdepth*100 / stat->leaves;
2155 	else
2156 		avdepth = 0;
2157 
2158 	seq_printf(seq, "\tAver depth:     %u.%02d\n",
2159 		   avdepth / 100, avdepth % 100);
2160 	seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);
2161 
2162 	seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
2163 	bytes = sizeof(struct leaf) * stat->leaves;
2164 
2165 	seq_printf(seq, "\tPrefixes:       %u\n", stat->prefixes);
2166 	bytes += sizeof(struct leaf_info) * stat->prefixes;
2167 
2168 	seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
2169 	bytes += sizeof(struct tnode) * stat->tnodes;
2170 
2171 	max = MAX_STAT_DEPTH;
2172 	while (max > 0 && stat->nodesizes[max-1] == 0)
2173 		max--;
2174 
2175 	pointers = 0;
2176 	for (i = 1; i <= max; i++)
2177 		if (stat->nodesizes[i] != 0) {
2178 			seq_printf(seq, "  %u: %u",  i, stat->nodesizes[i]);
2179 			pointers += (1<<i) * stat->nodesizes[i];
2180 		}
2181 	seq_putc(seq, '\n');
2182 	seq_printf(seq, "\tPointers: %u\n", pointers);
2183 
2184 	bytes += sizeof(struct node *) * pointers;
2185 	seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2186 	seq_printf(seq, "Total size: %u  kB\n", (bytes + 1023) / 1024);
2187 }
2188 
2189 #ifdef CONFIG_IP_FIB_TRIE_STATS
2190 static void trie_show_usage(struct seq_file *seq,
2191 			    const struct trie_use_stats *stats)
2192 {
2193 	seq_printf(seq, "\nCounters:\n---------\n");
2194 	seq_printf(seq, "gets = %u\n", stats->gets);
2195 	seq_printf(seq, "backtracks = %u\n", stats->backtrack);
2196 	seq_printf(seq, "semantic match passed = %u\n",
2197 		   stats->semantic_match_passed);
2198 	seq_printf(seq, "semantic match miss = %u\n",
2199 		   stats->semantic_match_miss);
2200 	seq_printf(seq, "null node hit= %u\n", stats->null_node_hit);
2201 	seq_printf(seq, "skipped node resize = %u\n\n",
2202 		   stats->resize_node_skipped);
2203 }
2204 #endif /*  CONFIG_IP_FIB_TRIE_STATS */
2205 
2206 static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
2207 {
2208 	if (tb->tb_id == RT_TABLE_LOCAL)
2209 		seq_puts(seq, "Local:\n");
2210 	else if (tb->tb_id == RT_TABLE_MAIN)
2211 		seq_puts(seq, "Main:\n");
2212 	else
2213 		seq_printf(seq, "Id %d:\n", tb->tb_id);
2214 }
2215 
2216 
2217 static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2218 {
2219 	struct net *net = (struct net *)seq->private;
2220 	unsigned int h;
2221 
2222 	seq_printf(seq,
2223 		   "Basic info: size of leaf:"
2224 		   " %Zd bytes, size of tnode: %Zd bytes.\n",
2225 		   sizeof(struct leaf), sizeof(struct tnode));
2226 
2227 	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2228 		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2229 		struct hlist_node *node;
2230 		struct fib_table *tb;
2231 
2232 		hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
2233 			struct trie *t = (struct trie *) tb->tb_data;
2234 			struct trie_stat stat;
2235 
2236 			if (!t)
2237 				continue;
2238 
2239 			fib_table_print(seq, tb);
2240 
2241 			trie_collect_stats(t, &stat);
2242 			trie_show_stats(seq, &stat);
2243 #ifdef CONFIG_IP_FIB_TRIE_STATS
2244 			trie_show_usage(seq, &t->stats);
2245 #endif
2246 		}
2247 	}
2248 
2249 	return 0;
2250 }
2251 
2252 static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2253 {
2254 	return single_open_net(inode, file, fib_triestat_seq_show);
2255 }
2256 
2257 static const struct file_operations fib_triestat_fops = {
2258 	.owner	= THIS_MODULE,
2259 	.open	= fib_triestat_seq_open,
2260 	.read	= seq_read,
2261 	.llseek	= seq_lseek,
2262 	.release = single_release_net,
2263 };
2264 
2265 static struct node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
2266 {
2267 	struct fib_trie_iter *iter = seq->private;
2268 	struct net *net = seq_file_net(seq);
2269 	loff_t idx = 0;
2270 	unsigned int h;
2271 
2272 	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2273 		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2274 		struct hlist_node *node;
2275 		struct fib_table *tb;
2276 
2277 		hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
2278 			struct node *n;
2279 
2280 			for (n = fib_trie_get_first(iter,
2281 						    (struct trie *) tb->tb_data);
2282 			     n; n = fib_trie_get_next(iter))
2283 				if (pos == idx++) {
2284 					iter->tb = tb;
2285 					return n;
2286 				}
2287 		}
2288 	}
2289 
2290 	return NULL;
2291 }
2292 
2293 static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2294 	__acquires(RCU)
2295 {
2296 	rcu_read_lock();
2297 	return fib_trie_get_idx(seq, *pos);
2298 }
2299 
2300 static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2301 {
2302 	struct fib_trie_iter *iter = seq->private;
2303 	struct net *net = seq_file_net(seq);
2304 	struct fib_table *tb = iter->tb;
2305 	struct hlist_node *tb_node;
2306 	unsigned int h;
2307 	struct node *n;
2308 
2309 	++*pos;
2310 	/* next node in same table */
2311 	n = fib_trie_get_next(iter);
2312 	if (n)
2313 		return n;
2314 
2315 	/* walk rest of this hash chain */
2316 	h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
2317 	while ( (tb_node = rcu_dereference(tb->tb_hlist.next)) ) {
2318 		tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
2319 		n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2320 		if (n)
2321 			goto found;
2322 	}
2323 
2324 	/* new hash chain */
2325 	while (++h < FIB_TABLE_HASHSZ) {
2326 		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2327 		hlist_for_each_entry_rcu(tb, tb_node, head, tb_hlist) {
2328 			n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2329 			if (n)
2330 				goto found;
2331 		}
2332 	}
2333 	return NULL;
2334 
2335 found:
2336 	iter->tb = tb;
2337 	return n;
2338 }
2339 
2340 static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2341 	__releases(RCU)
2342 {
2343 	rcu_read_unlock();
2344 }
2345 
2346 static void seq_indent(struct seq_file *seq, int n)
2347 {
2348 	while (n-- > 0) seq_puts(seq, "   ");
2349 }
2350 
2351 static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
2352 {
2353 	switch (s) {
2354 	case RT_SCOPE_UNIVERSE: return "universe";
2355 	case RT_SCOPE_SITE:	return "site";
2356 	case RT_SCOPE_LINK:	return "link";
2357 	case RT_SCOPE_HOST:	return "host";
2358 	case RT_SCOPE_NOWHERE:	return "nowhere";
2359 	default:
2360 		snprintf(buf, len, "scope=%d", s);
2361 		return buf;
2362 	}
2363 }
2364 
2365 static const char *rtn_type_names[__RTN_MAX] = {
2366 	[RTN_UNSPEC] = "UNSPEC",
2367 	[RTN_UNICAST] = "UNICAST",
2368 	[RTN_LOCAL] = "LOCAL",
2369 	[RTN_BROADCAST] = "BROADCAST",
2370 	[RTN_ANYCAST] = "ANYCAST",
2371 	[RTN_MULTICAST] = "MULTICAST",
2372 	[RTN_BLACKHOLE] = "BLACKHOLE",
2373 	[RTN_UNREACHABLE] = "UNREACHABLE",
2374 	[RTN_PROHIBIT] = "PROHIBIT",
2375 	[RTN_THROW] = "THROW",
2376 	[RTN_NAT] = "NAT",
2377 	[RTN_XRESOLVE] = "XRESOLVE",
2378 };
2379 
2380 static inline const char *rtn_type(char *buf, size_t len, unsigned t)
2381 {
2382 	if (t < __RTN_MAX && rtn_type_names[t])
2383 		return rtn_type_names[t];
2384 	snprintf(buf, len, "type %u", t);
2385 	return buf;
2386 }
2387 
2388 /* Pretty print the trie */
2389 static int fib_trie_seq_show(struct seq_file *seq, void *v)
2390 {
2391 	const struct fib_trie_iter *iter = seq->private;
2392 	struct node *n = v;
2393 
2394 	if (!node_parent_rcu(n))
2395 		fib_table_print(seq, iter->tb);
2396 
2397 	if (IS_TNODE(n)) {
2398 		struct tnode *tn = (struct tnode *) n;
2399 		__be32 prf = htonl(mask_pfx(tn->key, tn->pos));
2400 
2401 		seq_indent(seq, iter->depth-1);
2402 		seq_printf(seq, "  +-- %pI4/%d %d %d %d\n",
2403 			   &prf, tn->pos, tn->bits, tn->full_children,
2404 			   tn->empty_children);
2405 
2406 	} else {
2407 		struct leaf *l = (struct leaf *) n;
2408 		struct leaf_info *li;
2409 		struct hlist_node *node;
2410 		__be32 val = htonl(l->key);
2411 
2412 		seq_indent(seq, iter->depth);
2413 		seq_printf(seq, "  |-- %pI4\n", &val);
2414 
2415 		hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
2416 			struct fib_alias *fa;
2417 
2418 			list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2419 				char buf1[32], buf2[32];
2420 
2421 				seq_indent(seq, iter->depth+1);
2422 				seq_printf(seq, "  /%d %s %s", li->plen,
2423 					   rtn_scope(buf1, sizeof(buf1),
2424 						     fa->fa_scope),
2425 					   rtn_type(buf2, sizeof(buf2),
2426 						    fa->fa_type));
2427 				if (fa->fa_tos)
2428 					seq_printf(seq, " tos=%d", fa->fa_tos);
2429 				seq_putc(seq, '\n');
2430 			}
2431 		}
2432 	}
2433 
2434 	return 0;
2435 }
2436 
2437 static const struct seq_operations fib_trie_seq_ops = {
2438 	.start  = fib_trie_seq_start,
2439 	.next   = fib_trie_seq_next,
2440 	.stop   = fib_trie_seq_stop,
2441 	.show   = fib_trie_seq_show,
2442 };
2443 
2444 static int fib_trie_seq_open(struct inode *inode, struct file *file)
2445 {
2446 	return seq_open_net(inode, file, &fib_trie_seq_ops,
2447 			    sizeof(struct fib_trie_iter));
2448 }
2449 
2450 static const struct file_operations fib_trie_fops = {
2451 	.owner  = THIS_MODULE,
2452 	.open   = fib_trie_seq_open,
2453 	.read   = seq_read,
2454 	.llseek = seq_lseek,
2455 	.release = seq_release_net,
2456 };
2457 
2458 struct fib_route_iter {
2459 	struct seq_net_private p;
2460 	struct trie *main_trie;
2461 	loff_t	pos;
2462 	t_key	key;
2463 };
2464 
2465 static struct leaf *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
2466 {
2467 	struct leaf *l = NULL;
2468 	struct trie *t = iter->main_trie;
2469 
2470 	/* use cache location of last found key */
2471 	if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
2472 		pos -= iter->pos;
2473 	else {
2474 		iter->pos = 0;
2475 		l = trie_firstleaf(t);
2476 	}
2477 
2478 	while (l && pos-- > 0) {
2479 		iter->pos++;
2480 		l = trie_nextleaf(l);
2481 	}
2482 
2483 	if (l)
2484 		iter->key = pos;	/* remember it */
2485 	else
2486 		iter->pos = 0;		/* forget it */
2487 
2488 	return l;
2489 }
2490 
2491 static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
2492 	__acquires(RCU)
2493 {
2494 	struct fib_route_iter *iter = seq->private;
2495 	struct fib_table *tb;
2496 
2497 	rcu_read_lock();
2498 	tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
2499 	if (!tb)
2500 		return NULL;
2501 
2502 	iter->main_trie = (struct trie *) tb->tb_data;
2503 	if (*pos == 0)
2504 		return SEQ_START_TOKEN;
2505 	else
2506 		return fib_route_get_idx(iter, *pos - 1);
2507 }
2508 
2509 static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2510 {
2511 	struct fib_route_iter *iter = seq->private;
2512 	struct leaf *l = v;
2513 
2514 	++*pos;
2515 	if (v == SEQ_START_TOKEN) {
2516 		iter->pos = 0;
2517 		l = trie_firstleaf(iter->main_trie);
2518 	} else {
2519 		iter->pos++;
2520 		l = trie_nextleaf(l);
2521 	}
2522 
2523 	if (l)
2524 		iter->key = l->key;
2525 	else
2526 		iter->pos = 0;
2527 	return l;
2528 }
2529 
2530 static void fib_route_seq_stop(struct seq_file *seq, void *v)
2531 	__releases(RCU)
2532 {
2533 	rcu_read_unlock();
2534 }
2535 
2536 static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2537 {
2538 	static unsigned type2flags[RTN_MAX + 1] = {
2539 		[7] = RTF_REJECT, [8] = RTF_REJECT,
2540 	};
2541 	unsigned flags = type2flags[type];
2542 
2543 	if (fi && fi->fib_nh->nh_gw)
2544 		flags |= RTF_GATEWAY;
2545 	if (mask == htonl(0xFFFFFFFF))
2546 		flags |= RTF_HOST;
2547 	flags |= RTF_UP;
2548 	return flags;
2549 }
2550 
2551 /*
2552  *	This outputs /proc/net/route.
2553  *	The format of the file is not supposed to be changed
2554  * 	and needs to be same as fib_hash output to avoid breaking
2555  *	legacy utilities
2556  */
2557 static int fib_route_seq_show(struct seq_file *seq, void *v)
2558 {
2559 	struct leaf *l = v;
2560 	struct leaf_info *li;
2561 	struct hlist_node *node;
2562 
2563 	if (v == SEQ_START_TOKEN) {
2564 		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2565 			   "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2566 			   "\tWindow\tIRTT");
2567 		return 0;
2568 	}
2569 
2570 	hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
2571 		struct fib_alias *fa;
2572 		__be32 mask, prefix;
2573 
2574 		mask = inet_make_mask(li->plen);
2575 		prefix = htonl(l->key);
2576 
2577 		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2578 			const struct fib_info *fi = fa->fa_info;
2579 			unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
2580 			int len;
2581 
2582 			if (fa->fa_type == RTN_BROADCAST
2583 			    || fa->fa_type == RTN_MULTICAST)
2584 				continue;
2585 
2586 			if (fi)
2587 				seq_printf(seq,
2588 					 "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
2589 					 "%d\t%08X\t%d\t%u\t%u%n",
2590 					 fi->fib_dev ? fi->fib_dev->name : "*",
2591 					 prefix,
2592 					 fi->fib_nh->nh_gw, flags, 0, 0,
2593 					 fi->fib_priority,
2594 					 mask,
2595 					 (fi->fib_advmss ?
2596 					  fi->fib_advmss + 40 : 0),
2597 					 fi->fib_window,
2598 					 fi->fib_rtt >> 3, &len);
2599 			else
2600 				seq_printf(seq,
2601 					 "*\t%08X\t%08X\t%04X\t%d\t%u\t"
2602 					 "%d\t%08X\t%d\t%u\t%u%n",
2603 					 prefix, 0, flags, 0, 0, 0,
2604 					 mask, 0, 0, 0, &len);
2605 
2606 			seq_printf(seq, "%*s\n", 127 - len, "");
2607 		}
2608 	}
2609 
2610 	return 0;
2611 }
2612 
2613 static const struct seq_operations fib_route_seq_ops = {
2614 	.start  = fib_route_seq_start,
2615 	.next   = fib_route_seq_next,
2616 	.stop   = fib_route_seq_stop,
2617 	.show   = fib_route_seq_show,
2618 };
2619 
2620 static int fib_route_seq_open(struct inode *inode, struct file *file)
2621 {
2622 	return seq_open_net(inode, file, &fib_route_seq_ops,
2623 			    sizeof(struct fib_route_iter));
2624 }
2625 
2626 static const struct file_operations fib_route_fops = {
2627 	.owner  = THIS_MODULE,
2628 	.open   = fib_route_seq_open,
2629 	.read   = seq_read,
2630 	.llseek = seq_lseek,
2631 	.release = seq_release_net,
2632 };
2633 
2634 int __net_init fib_proc_init(struct net *net)
2635 {
2636 	if (!proc_net_fops_create(net, "fib_trie", S_IRUGO, &fib_trie_fops))
2637 		goto out1;
2638 
2639 	if (!proc_net_fops_create(net, "fib_triestat", S_IRUGO,
2640 				  &fib_triestat_fops))
2641 		goto out2;
2642 
2643 	if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_route_fops))
2644 		goto out3;
2645 
2646 	return 0;
2647 
2648 out3:
2649 	proc_net_remove(net, "fib_triestat");
2650 out2:
2651 	proc_net_remove(net, "fib_trie");
2652 out1:
2653 	return -ENOMEM;
2654 }
2655 
2656 void __net_exit fib_proc_exit(struct net *net)
2657 {
2658 	proc_net_remove(net, "fib_trie");
2659 	proc_net_remove(net, "fib_triestat");
2660 	proc_net_remove(net, "route");
2661 }
2662 
2663 #endif /* CONFIG_PROC_FS */
2664