xref: /openbmc/linux/net/ipv4/fib_trie.c (revision f42b3800)
1 /*
2  *   This program is free software; you can redistribute it and/or
3  *   modify it under the terms of the GNU General Public License
4  *   as published by the Free Software Foundation; either version
5  *   2 of the License, or (at your option) any later version.
6  *
7  *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8  *     & Swedish University of Agricultural Sciences.
9  *
10  *   Jens Laas <jens.laas@data.slu.se> Swedish University of
11  *     Agricultural Sciences.
12  *
13  *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
14  *
15  * This work is based on the LPC-trie which is originally descibed in:
16  *
17  * An experimental study of compression methods for dynamic tries
18  * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19  * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
20  *
21  *
22  * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23  * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24  *
25  * Version:	$Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $
26  *
27  *
28  * Code from fib_hash has been reused which includes the following header:
29  *
30  *
31  * INET		An implementation of the TCP/IP protocol suite for the LINUX
32  *		operating system.  INET is implemented using the  BSD Socket
33  *		interface as the means of communication with the user level.
34  *
35  *		IPv4 FIB: lookup engine and maintenance routines.
36  *
37  *
38  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
39  *
40  *		This program is free software; you can redistribute it and/or
41  *		modify it under the terms of the GNU General Public License
42  *		as published by the Free Software Foundation; either version
43  *		2 of the License, or (at your option) any later version.
44  *
45  * Substantial contributions to this work comes from:
46  *
47  *		David S. Miller, <davem@davemloft.net>
48  *		Stephen Hemminger <shemminger@osdl.org>
49  *		Paul E. McKenney <paulmck@us.ibm.com>
50  *		Patrick McHardy <kaber@trash.net>
51  */
52 
53 #define VERSION "0.408"
54 
55 #include <asm/uaccess.h>
56 #include <asm/system.h>
57 #include <linux/bitops.h>
58 #include <linux/types.h>
59 #include <linux/kernel.h>
60 #include <linux/mm.h>
61 #include <linux/string.h>
62 #include <linux/socket.h>
63 #include <linux/sockios.h>
64 #include <linux/errno.h>
65 #include <linux/in.h>
66 #include <linux/inet.h>
67 #include <linux/inetdevice.h>
68 #include <linux/netdevice.h>
69 #include <linux/if_arp.h>
70 #include <linux/proc_fs.h>
71 #include <linux/rcupdate.h>
72 #include <linux/skbuff.h>
73 #include <linux/netlink.h>
74 #include <linux/init.h>
75 #include <linux/list.h>
76 #include <net/net_namespace.h>
77 #include <net/ip.h>
78 #include <net/protocol.h>
79 #include <net/route.h>
80 #include <net/tcp.h>
81 #include <net/sock.h>
82 #include <net/ip_fib.h>
83 #include "fib_lookup.h"
84 
85 #define MAX_STAT_DEPTH 32
86 
87 #define KEYLENGTH (8*sizeof(t_key))
88 
89 typedef unsigned int t_key;
90 
91 #define T_TNODE 0
92 #define T_LEAF  1
93 #define NODE_TYPE_MASK	0x1UL
94 #define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
95 
96 #define IS_TNODE(n) (!(n->parent & T_LEAF))
97 #define IS_LEAF(n) (n->parent & T_LEAF)
98 
99 struct node {
100 	unsigned long parent;
101 	t_key key;
102 };
103 
104 struct leaf {
105 	unsigned long parent;
106 	t_key key;
107 	struct hlist_head list;
108 	struct rcu_head rcu;
109 };
110 
111 struct leaf_info {
112 	struct hlist_node hlist;
113 	struct rcu_head rcu;
114 	int plen;
115 	struct list_head falh;
116 };
117 
118 struct tnode {
119 	unsigned long parent;
120 	t_key key;
121 	unsigned char pos;		/* 2log(KEYLENGTH) bits needed */
122 	unsigned char bits;		/* 2log(KEYLENGTH) bits needed */
123 	unsigned int full_children;	/* KEYLENGTH bits needed */
124 	unsigned int empty_children;	/* KEYLENGTH bits needed */
125 	union {
126 		struct rcu_head rcu;
127 		struct work_struct work;
128 	};
129 	struct node *child[0];
130 };
131 
132 #ifdef CONFIG_IP_FIB_TRIE_STATS
133 struct trie_use_stats {
134 	unsigned int gets;
135 	unsigned int backtrack;
136 	unsigned int semantic_match_passed;
137 	unsigned int semantic_match_miss;
138 	unsigned int null_node_hit;
139 	unsigned int resize_node_skipped;
140 };
141 #endif
142 
143 struct trie_stat {
144 	unsigned int totdepth;
145 	unsigned int maxdepth;
146 	unsigned int tnodes;
147 	unsigned int leaves;
148 	unsigned int nullpointers;
149 	unsigned int prefixes;
150 	unsigned int nodesizes[MAX_STAT_DEPTH];
151 };
152 
153 struct trie {
154 	struct node *trie;
155 #ifdef CONFIG_IP_FIB_TRIE_STATS
156 	struct trie_use_stats stats;
157 #endif
158 };
159 
160 static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
161 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
162 				  int wasfull);
163 static struct node *resize(struct trie *t, struct tnode *tn);
164 static struct tnode *inflate(struct trie *t, struct tnode *tn);
165 static struct tnode *halve(struct trie *t, struct tnode *tn);
166 
167 static struct kmem_cache *fn_alias_kmem __read_mostly;
168 static struct kmem_cache *trie_leaf_kmem __read_mostly;
169 
170 static inline struct tnode *node_parent(struct node *node)
171 {
172 	return (struct tnode *)(node->parent & ~NODE_TYPE_MASK);
173 }
174 
175 static inline struct tnode *node_parent_rcu(struct node *node)
176 {
177 	struct tnode *ret = node_parent(node);
178 
179 	return rcu_dereference(ret);
180 }
181 
182 /* Same as rcu_assign_pointer
183  * but that macro() assumes that value is a pointer.
184  */
185 static inline void node_set_parent(struct node *node, struct tnode *ptr)
186 {
187 	smp_wmb();
188 	node->parent = (unsigned long)ptr | NODE_TYPE(node);
189 }
190 
191 static inline struct node *tnode_get_child(struct tnode *tn, unsigned int i)
192 {
193 	BUG_ON(i >= 1U << tn->bits);
194 
195 	return tn->child[i];
196 }
197 
198 static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i)
199 {
200 	struct node *ret = tnode_get_child(tn, i);
201 
202 	return rcu_dereference(ret);
203 }
204 
205 static inline int tnode_child_length(const struct tnode *tn)
206 {
207 	return 1 << tn->bits;
208 }
209 
210 static inline t_key mask_pfx(t_key k, unsigned short l)
211 {
212 	return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
213 }
214 
215 static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
216 {
217 	if (offset < KEYLENGTH)
218 		return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
219 	else
220 		return 0;
221 }
222 
223 static inline int tkey_equals(t_key a, t_key b)
224 {
225 	return a == b;
226 }
227 
228 static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
229 {
230 	if (bits == 0 || offset >= KEYLENGTH)
231 		return 1;
232 	bits = bits > KEYLENGTH ? KEYLENGTH : bits;
233 	return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
234 }
235 
236 static inline int tkey_mismatch(t_key a, int offset, t_key b)
237 {
238 	t_key diff = a ^ b;
239 	int i = offset;
240 
241 	if (!diff)
242 		return 0;
243 	while ((diff << i) >> (KEYLENGTH-1) == 0)
244 		i++;
245 	return i;
246 }
247 
248 /*
249   To understand this stuff, an understanding of keys and all their bits is
250   necessary. Every node in the trie has a key associated with it, but not
251   all of the bits in that key are significant.
252 
253   Consider a node 'n' and its parent 'tp'.
254 
255   If n is a leaf, every bit in its key is significant. Its presence is
256   necessitated by path compression, since during a tree traversal (when
257   searching for a leaf - unless we are doing an insertion) we will completely
258   ignore all skipped bits we encounter. Thus we need to verify, at the end of
259   a potentially successful search, that we have indeed been walking the
260   correct key path.
261 
262   Note that we can never "miss" the correct key in the tree if present by
263   following the wrong path. Path compression ensures that segments of the key
264   that are the same for all keys with a given prefix are skipped, but the
265   skipped part *is* identical for each node in the subtrie below the skipped
266   bit! trie_insert() in this implementation takes care of that - note the
267   call to tkey_sub_equals() in trie_insert().
268 
269   if n is an internal node - a 'tnode' here, the various parts of its key
270   have many different meanings.
271 
272   Example:
273   _________________________________________________________________
274   | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
275   -----------------------------------------------------------------
276     0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
277 
278   _________________________________________________________________
279   | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
280   -----------------------------------------------------------------
281    16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
282 
283   tp->pos = 7
284   tp->bits = 3
285   n->pos = 15
286   n->bits = 4
287 
288   First, let's just ignore the bits that come before the parent tp, that is
289   the bits from 0 to (tp->pos-1). They are *known* but at this point we do
290   not use them for anything.
291 
292   The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
293   index into the parent's child array. That is, they will be used to find
294   'n' among tp's children.
295 
296   The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
297   for the node n.
298 
299   All the bits we have seen so far are significant to the node n. The rest
300   of the bits are really not needed or indeed known in n->key.
301 
302   The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
303   n's child array, and will of course be different for each child.
304 
305 
306   The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
307   at this point.
308 
309 */
310 
311 static inline void check_tnode(const struct tnode *tn)
312 {
313 	WARN_ON(tn && tn->pos+tn->bits > 32);
314 }
315 
316 static const int halve_threshold = 25;
317 static const int inflate_threshold = 50;
318 static const int halve_threshold_root = 8;
319 static const int inflate_threshold_root = 15;
320 
321 
322 static void __alias_free_mem(struct rcu_head *head)
323 {
324 	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
325 	kmem_cache_free(fn_alias_kmem, fa);
326 }
327 
328 static inline void alias_free_mem_rcu(struct fib_alias *fa)
329 {
330 	call_rcu(&fa->rcu, __alias_free_mem);
331 }
332 
333 static void __leaf_free_rcu(struct rcu_head *head)
334 {
335 	struct leaf *l = container_of(head, struct leaf, rcu);
336 	kmem_cache_free(trie_leaf_kmem, l);
337 }
338 
339 static inline void free_leaf(struct leaf *l)
340 {
341 	call_rcu_bh(&l->rcu, __leaf_free_rcu);
342 }
343 
344 static void __leaf_info_free_rcu(struct rcu_head *head)
345 {
346 	kfree(container_of(head, struct leaf_info, rcu));
347 }
348 
349 static inline void free_leaf_info(struct leaf_info *leaf)
350 {
351 	call_rcu(&leaf->rcu, __leaf_info_free_rcu);
352 }
353 
354 static struct tnode *tnode_alloc(size_t size)
355 {
356 	if (size <= PAGE_SIZE)
357 		return kzalloc(size, GFP_KERNEL);
358 	else
359 		return __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL);
360 }
361 
362 static void __tnode_vfree(struct work_struct *arg)
363 {
364 	struct tnode *tn = container_of(arg, struct tnode, work);
365 	vfree(tn);
366 }
367 
368 static void __tnode_free_rcu(struct rcu_head *head)
369 {
370 	struct tnode *tn = container_of(head, struct tnode, rcu);
371 	size_t size = sizeof(struct tnode) +
372 		      (sizeof(struct node *) << tn->bits);
373 
374 	if (size <= PAGE_SIZE)
375 		kfree(tn);
376 	else {
377 		INIT_WORK(&tn->work, __tnode_vfree);
378 		schedule_work(&tn->work);
379 	}
380 }
381 
382 static inline void tnode_free(struct tnode *tn)
383 {
384 	if (IS_LEAF(tn))
385 		free_leaf((struct leaf *) tn);
386 	else
387 		call_rcu(&tn->rcu, __tnode_free_rcu);
388 }
389 
390 static struct leaf *leaf_new(void)
391 {
392 	struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
393 	if (l) {
394 		l->parent = T_LEAF;
395 		INIT_HLIST_HEAD(&l->list);
396 	}
397 	return l;
398 }
399 
400 static struct leaf_info *leaf_info_new(int plen)
401 {
402 	struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
403 	if (li) {
404 		li->plen = plen;
405 		INIT_LIST_HEAD(&li->falh);
406 	}
407 	return li;
408 }
409 
410 static struct tnode *tnode_new(t_key key, int pos, int bits)
411 {
412 	size_t sz = sizeof(struct tnode) + (sizeof(struct node *) << bits);
413 	struct tnode *tn = tnode_alloc(sz);
414 
415 	if (tn) {
416 		tn->parent = T_TNODE;
417 		tn->pos = pos;
418 		tn->bits = bits;
419 		tn->key = key;
420 		tn->full_children = 0;
421 		tn->empty_children = 1<<bits;
422 	}
423 
424 	pr_debug("AT %p s=%u %lu\n", tn, (unsigned int) sizeof(struct tnode),
425 		 (unsigned long) (sizeof(struct node) << bits));
426 	return tn;
427 }
428 
429 /*
430  * Check whether a tnode 'n' is "full", i.e. it is an internal node
431  * and no bits are skipped. See discussion in dyntree paper p. 6
432  */
433 
434 static inline int tnode_full(const struct tnode *tn, const struct node *n)
435 {
436 	if (n == NULL || IS_LEAF(n))
437 		return 0;
438 
439 	return ((struct tnode *) n)->pos == tn->pos + tn->bits;
440 }
441 
442 static inline void put_child(struct trie *t, struct tnode *tn, int i,
443 			     struct node *n)
444 {
445 	tnode_put_child_reorg(tn, i, n, -1);
446 }
447 
448  /*
449   * Add a child at position i overwriting the old value.
450   * Update the value of full_children and empty_children.
451   */
452 
453 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
454 				  int wasfull)
455 {
456 	struct node *chi = tn->child[i];
457 	int isfull;
458 
459 	BUG_ON(i >= 1<<tn->bits);
460 
461 	/* update emptyChildren */
462 	if (n == NULL && chi != NULL)
463 		tn->empty_children++;
464 	else if (n != NULL && chi == NULL)
465 		tn->empty_children--;
466 
467 	/* update fullChildren */
468 	if (wasfull == -1)
469 		wasfull = tnode_full(tn, chi);
470 
471 	isfull = tnode_full(tn, n);
472 	if (wasfull && !isfull)
473 		tn->full_children--;
474 	else if (!wasfull && isfull)
475 		tn->full_children++;
476 
477 	if (n)
478 		node_set_parent(n, tn);
479 
480 	rcu_assign_pointer(tn->child[i], n);
481 }
482 
483 static struct node *resize(struct trie *t, struct tnode *tn)
484 {
485 	int i;
486 	int err = 0;
487 	struct tnode *old_tn;
488 	int inflate_threshold_use;
489 	int halve_threshold_use;
490 	int max_resize;
491 
492 	if (!tn)
493 		return NULL;
494 
495 	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
496 		 tn, inflate_threshold, halve_threshold);
497 
498 	/* No children */
499 	if (tn->empty_children == tnode_child_length(tn)) {
500 		tnode_free(tn);
501 		return NULL;
502 	}
503 	/* One child */
504 	if (tn->empty_children == tnode_child_length(tn) - 1)
505 		for (i = 0; i < tnode_child_length(tn); i++) {
506 			struct node *n;
507 
508 			n = tn->child[i];
509 			if (!n)
510 				continue;
511 
512 			/* compress one level */
513 			node_set_parent(n, NULL);
514 			tnode_free(tn);
515 			return n;
516 		}
517 	/*
518 	 * Double as long as the resulting node has a number of
519 	 * nonempty nodes that are above the threshold.
520 	 */
521 
522 	/*
523 	 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
524 	 * the Helsinki University of Technology and Matti Tikkanen of Nokia
525 	 * Telecommunications, page 6:
526 	 * "A node is doubled if the ratio of non-empty children to all
527 	 * children in the *doubled* node is at least 'high'."
528 	 *
529 	 * 'high' in this instance is the variable 'inflate_threshold'. It
530 	 * is expressed as a percentage, so we multiply it with
531 	 * tnode_child_length() and instead of multiplying by 2 (since the
532 	 * child array will be doubled by inflate()) and multiplying
533 	 * the left-hand side by 100 (to handle the percentage thing) we
534 	 * multiply the left-hand side by 50.
535 	 *
536 	 * The left-hand side may look a bit weird: tnode_child_length(tn)
537 	 * - tn->empty_children is of course the number of non-null children
538 	 * in the current node. tn->full_children is the number of "full"
539 	 * children, that is non-null tnodes with a skip value of 0.
540 	 * All of those will be doubled in the resulting inflated tnode, so
541 	 * we just count them one extra time here.
542 	 *
543 	 * A clearer way to write this would be:
544 	 *
545 	 * to_be_doubled = tn->full_children;
546 	 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
547 	 *     tn->full_children;
548 	 *
549 	 * new_child_length = tnode_child_length(tn) * 2;
550 	 *
551 	 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
552 	 *      new_child_length;
553 	 * if (new_fill_factor >= inflate_threshold)
554 	 *
555 	 * ...and so on, tho it would mess up the while () loop.
556 	 *
557 	 * anyway,
558 	 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
559 	 *      inflate_threshold
560 	 *
561 	 * avoid a division:
562 	 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
563 	 *      inflate_threshold * new_child_length
564 	 *
565 	 * expand not_to_be_doubled and to_be_doubled, and shorten:
566 	 * 100 * (tnode_child_length(tn) - tn->empty_children +
567 	 *    tn->full_children) >= inflate_threshold * new_child_length
568 	 *
569 	 * expand new_child_length:
570 	 * 100 * (tnode_child_length(tn) - tn->empty_children +
571 	 *    tn->full_children) >=
572 	 *      inflate_threshold * tnode_child_length(tn) * 2
573 	 *
574 	 * shorten again:
575 	 * 50 * (tn->full_children + tnode_child_length(tn) -
576 	 *    tn->empty_children) >= inflate_threshold *
577 	 *    tnode_child_length(tn)
578 	 *
579 	 */
580 
581 	check_tnode(tn);
582 
583 	/* Keep root node larger  */
584 
585 	if (!tn->parent)
586 		inflate_threshold_use = inflate_threshold_root;
587 	else
588 		inflate_threshold_use = inflate_threshold;
589 
590 	err = 0;
591 	max_resize = 10;
592 	while ((tn->full_children > 0 &&  max_resize-- &&
593 		50 * (tn->full_children + tnode_child_length(tn)
594 		      - tn->empty_children)
595 		>= inflate_threshold_use * tnode_child_length(tn))) {
596 
597 		old_tn = tn;
598 		tn = inflate(t, tn);
599 
600 		if (IS_ERR(tn)) {
601 			tn = old_tn;
602 #ifdef CONFIG_IP_FIB_TRIE_STATS
603 			t->stats.resize_node_skipped++;
604 #endif
605 			break;
606 		}
607 	}
608 
609 	if (max_resize < 0) {
610 		if (!tn->parent)
611 			pr_warning("Fix inflate_threshold_root."
612 				   " Now=%d size=%d bits\n",
613 				   inflate_threshold_root, tn->bits);
614 		else
615 			pr_warning("Fix inflate_threshold."
616 				   " Now=%d size=%d bits\n",
617 				   inflate_threshold, tn->bits);
618 	}
619 
620 	check_tnode(tn);
621 
622 	/*
623 	 * Halve as long as the number of empty children in this
624 	 * node is above threshold.
625 	 */
626 
627 
628 	/* Keep root node larger  */
629 
630 	if (!tn->parent)
631 		halve_threshold_use = halve_threshold_root;
632 	else
633 		halve_threshold_use = halve_threshold;
634 
635 	err = 0;
636 	max_resize = 10;
637 	while (tn->bits > 1 &&  max_resize-- &&
638 	       100 * (tnode_child_length(tn) - tn->empty_children) <
639 	       halve_threshold_use * tnode_child_length(tn)) {
640 
641 		old_tn = tn;
642 		tn = halve(t, tn);
643 		if (IS_ERR(tn)) {
644 			tn = old_tn;
645 #ifdef CONFIG_IP_FIB_TRIE_STATS
646 			t->stats.resize_node_skipped++;
647 #endif
648 			break;
649 		}
650 	}
651 
652 	if (max_resize < 0) {
653 		if (!tn->parent)
654 			pr_warning("Fix halve_threshold_root."
655 				   " Now=%d size=%d bits\n",
656 				   halve_threshold_root, tn->bits);
657 		else
658 			pr_warning("Fix halve_threshold."
659 				   " Now=%d size=%d bits\n",
660 				   halve_threshold, tn->bits);
661 	}
662 
663 	/* Only one child remains */
664 	if (tn->empty_children == tnode_child_length(tn) - 1)
665 		for (i = 0; i < tnode_child_length(tn); i++) {
666 			struct node *n;
667 
668 			n = tn->child[i];
669 			if (!n)
670 				continue;
671 
672 			/* compress one level */
673 
674 			node_set_parent(n, NULL);
675 			tnode_free(tn);
676 			return n;
677 		}
678 
679 	return (struct node *) tn;
680 }
681 
682 static struct tnode *inflate(struct trie *t, struct tnode *tn)
683 {
684 	struct tnode *oldtnode = tn;
685 	int olen = tnode_child_length(tn);
686 	int i;
687 
688 	pr_debug("In inflate\n");
689 
690 	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
691 
692 	if (!tn)
693 		return ERR_PTR(-ENOMEM);
694 
695 	/*
696 	 * Preallocate and store tnodes before the actual work so we
697 	 * don't get into an inconsistent state if memory allocation
698 	 * fails. In case of failure we return the oldnode and  inflate
699 	 * of tnode is ignored.
700 	 */
701 
702 	for (i = 0; i < olen; i++) {
703 		struct tnode *inode;
704 
705 		inode = (struct tnode *) tnode_get_child(oldtnode, i);
706 		if (inode &&
707 		    IS_TNODE(inode) &&
708 		    inode->pos == oldtnode->pos + oldtnode->bits &&
709 		    inode->bits > 1) {
710 			struct tnode *left, *right;
711 			t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
712 
713 			left = tnode_new(inode->key&(~m), inode->pos + 1,
714 					 inode->bits - 1);
715 			if (!left)
716 				goto nomem;
717 
718 			right = tnode_new(inode->key|m, inode->pos + 1,
719 					  inode->bits - 1);
720 
721 			if (!right) {
722 				tnode_free(left);
723 				goto nomem;
724 			}
725 
726 			put_child(t, tn, 2*i, (struct node *) left);
727 			put_child(t, tn, 2*i+1, (struct node *) right);
728 		}
729 	}
730 
731 	for (i = 0; i < olen; i++) {
732 		struct tnode *inode;
733 		struct node *node = tnode_get_child(oldtnode, i);
734 		struct tnode *left, *right;
735 		int size, j;
736 
737 		/* An empty child */
738 		if (node == NULL)
739 			continue;
740 
741 		/* A leaf or an internal node with skipped bits */
742 
743 		if (IS_LEAF(node) || ((struct tnode *) node)->pos >
744 		   tn->pos + tn->bits - 1) {
745 			if (tkey_extract_bits(node->key,
746 					      oldtnode->pos + oldtnode->bits,
747 					      1) == 0)
748 				put_child(t, tn, 2*i, node);
749 			else
750 				put_child(t, tn, 2*i+1, node);
751 			continue;
752 		}
753 
754 		/* An internal node with two children */
755 		inode = (struct tnode *) node;
756 
757 		if (inode->bits == 1) {
758 			put_child(t, tn, 2*i, inode->child[0]);
759 			put_child(t, tn, 2*i+1, inode->child[1]);
760 
761 			tnode_free(inode);
762 			continue;
763 		}
764 
765 		/* An internal node with more than two children */
766 
767 		/* We will replace this node 'inode' with two new
768 		 * ones, 'left' and 'right', each with half of the
769 		 * original children. The two new nodes will have
770 		 * a position one bit further down the key and this
771 		 * means that the "significant" part of their keys
772 		 * (see the discussion near the top of this file)
773 		 * will differ by one bit, which will be "0" in
774 		 * left's key and "1" in right's key. Since we are
775 		 * moving the key position by one step, the bit that
776 		 * we are moving away from - the bit at position
777 		 * (inode->pos) - is the one that will differ between
778 		 * left and right. So... we synthesize that bit in the
779 		 * two  new keys.
780 		 * The mask 'm' below will be a single "one" bit at
781 		 * the position (inode->pos)
782 		 */
783 
784 		/* Use the old key, but set the new significant
785 		 *   bit to zero.
786 		 */
787 
788 		left = (struct tnode *) tnode_get_child(tn, 2*i);
789 		put_child(t, tn, 2*i, NULL);
790 
791 		BUG_ON(!left);
792 
793 		right = (struct tnode *) tnode_get_child(tn, 2*i+1);
794 		put_child(t, tn, 2*i+1, NULL);
795 
796 		BUG_ON(!right);
797 
798 		size = tnode_child_length(left);
799 		for (j = 0; j < size; j++) {
800 			put_child(t, left, j, inode->child[j]);
801 			put_child(t, right, j, inode->child[j + size]);
802 		}
803 		put_child(t, tn, 2*i, resize(t, left));
804 		put_child(t, tn, 2*i+1, resize(t, right));
805 
806 		tnode_free(inode);
807 	}
808 	tnode_free(oldtnode);
809 	return tn;
810 nomem:
811 	{
812 		int size = tnode_child_length(tn);
813 		int j;
814 
815 		for (j = 0; j < size; j++)
816 			if (tn->child[j])
817 				tnode_free((struct tnode *)tn->child[j]);
818 
819 		tnode_free(tn);
820 
821 		return ERR_PTR(-ENOMEM);
822 	}
823 }
824 
825 static struct tnode *halve(struct trie *t, struct tnode *tn)
826 {
827 	struct tnode *oldtnode = tn;
828 	struct node *left, *right;
829 	int i;
830 	int olen = tnode_child_length(tn);
831 
832 	pr_debug("In halve\n");
833 
834 	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
835 
836 	if (!tn)
837 		return ERR_PTR(-ENOMEM);
838 
839 	/*
840 	 * Preallocate and store tnodes before the actual work so we
841 	 * don't get into an inconsistent state if memory allocation
842 	 * fails. In case of failure we return the oldnode and halve
843 	 * of tnode is ignored.
844 	 */
845 
846 	for (i = 0; i < olen; i += 2) {
847 		left = tnode_get_child(oldtnode, i);
848 		right = tnode_get_child(oldtnode, i+1);
849 
850 		/* Two nonempty children */
851 		if (left && right) {
852 			struct tnode *newn;
853 
854 			newn = tnode_new(left->key, tn->pos + tn->bits, 1);
855 
856 			if (!newn)
857 				goto nomem;
858 
859 			put_child(t, tn, i/2, (struct node *)newn);
860 		}
861 
862 	}
863 
864 	for (i = 0; i < olen; i += 2) {
865 		struct tnode *newBinNode;
866 
867 		left = tnode_get_child(oldtnode, i);
868 		right = tnode_get_child(oldtnode, i+1);
869 
870 		/* At least one of the children is empty */
871 		if (left == NULL) {
872 			if (right == NULL)    /* Both are empty */
873 				continue;
874 			put_child(t, tn, i/2, right);
875 			continue;
876 		}
877 
878 		if (right == NULL) {
879 			put_child(t, tn, i/2, left);
880 			continue;
881 		}
882 
883 		/* Two nonempty children */
884 		newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
885 		put_child(t, tn, i/2, NULL);
886 		put_child(t, newBinNode, 0, left);
887 		put_child(t, newBinNode, 1, right);
888 		put_child(t, tn, i/2, resize(t, newBinNode));
889 	}
890 	tnode_free(oldtnode);
891 	return tn;
892 nomem:
893 	{
894 		int size = tnode_child_length(tn);
895 		int j;
896 
897 		for (j = 0; j < size; j++)
898 			if (tn->child[j])
899 				tnode_free((struct tnode *)tn->child[j]);
900 
901 		tnode_free(tn);
902 
903 		return ERR_PTR(-ENOMEM);
904 	}
905 }
906 
907 /* readside must use rcu_read_lock currently dump routines
908  via get_fa_head and dump */
909 
910 static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
911 {
912 	struct hlist_head *head = &l->list;
913 	struct hlist_node *node;
914 	struct leaf_info *li;
915 
916 	hlist_for_each_entry_rcu(li, node, head, hlist)
917 		if (li->plen == plen)
918 			return li;
919 
920 	return NULL;
921 }
922 
923 static inline struct list_head *get_fa_head(struct leaf *l, int plen)
924 {
925 	struct leaf_info *li = find_leaf_info(l, plen);
926 
927 	if (!li)
928 		return NULL;
929 
930 	return &li->falh;
931 }
932 
933 static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
934 {
935 	struct leaf_info *li = NULL, *last = NULL;
936 	struct hlist_node *node;
937 
938 	if (hlist_empty(head)) {
939 		hlist_add_head_rcu(&new->hlist, head);
940 	} else {
941 		hlist_for_each_entry(li, node, head, hlist) {
942 			if (new->plen > li->plen)
943 				break;
944 
945 			last = li;
946 		}
947 		if (last)
948 			hlist_add_after_rcu(&last->hlist, &new->hlist);
949 		else
950 			hlist_add_before_rcu(&new->hlist, &li->hlist);
951 	}
952 }
953 
954 /* rcu_read_lock needs to be hold by caller from readside */
955 
956 static struct leaf *
957 fib_find_node(struct trie *t, u32 key)
958 {
959 	int pos;
960 	struct tnode *tn;
961 	struct node *n;
962 
963 	pos = 0;
964 	n = rcu_dereference(t->trie);
965 
966 	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
967 		tn = (struct tnode *) n;
968 
969 		check_tnode(tn);
970 
971 		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
972 			pos = tn->pos + tn->bits;
973 			n = tnode_get_child_rcu(tn,
974 						tkey_extract_bits(key,
975 								  tn->pos,
976 								  tn->bits));
977 		} else
978 			break;
979 	}
980 	/* Case we have found a leaf. Compare prefixes */
981 
982 	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
983 		return (struct leaf *)n;
984 
985 	return NULL;
986 }
987 
988 static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
989 {
990 	int wasfull;
991 	t_key cindex, key = tn->key;
992 	struct tnode *tp;
993 
994 	while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) {
995 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
996 		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
997 		tn = (struct tnode *) resize(t, (struct tnode *)tn);
998 
999 		tnode_put_child_reorg((struct tnode *)tp, cindex,
1000 				      (struct node *)tn, wasfull);
1001 
1002 		tp = node_parent((struct node *) tn);
1003 		if (!tp)
1004 			break;
1005 		tn = tp;
1006 	}
1007 
1008 	/* Handle last (top) tnode */
1009 	if (IS_TNODE(tn))
1010 		tn = (struct tnode *)resize(t, (struct tnode *)tn);
1011 
1012 	return (struct node *)tn;
1013 }
1014 
1015 /* only used from updater-side */
1016 
1017 static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1018 {
1019 	int pos, newpos;
1020 	struct tnode *tp = NULL, *tn = NULL;
1021 	struct node *n;
1022 	struct leaf *l;
1023 	int missbit;
1024 	struct list_head *fa_head = NULL;
1025 	struct leaf_info *li;
1026 	t_key cindex;
1027 
1028 	pos = 0;
1029 	n = t->trie;
1030 
1031 	/* If we point to NULL, stop. Either the tree is empty and we should
1032 	 * just put a new leaf in if, or we have reached an empty child slot,
1033 	 * and we should just put our new leaf in that.
1034 	 * If we point to a T_TNODE, check if it matches our key. Note that
1035 	 * a T_TNODE might be skipping any number of bits - its 'pos' need
1036 	 * not be the parent's 'pos'+'bits'!
1037 	 *
1038 	 * If it does match the current key, get pos/bits from it, extract
1039 	 * the index from our key, push the T_TNODE and walk the tree.
1040 	 *
1041 	 * If it doesn't, we have to replace it with a new T_TNODE.
1042 	 *
1043 	 * If we point to a T_LEAF, it might or might not have the same key
1044 	 * as we do. If it does, just change the value, update the T_LEAF's
1045 	 * value, and return it.
1046 	 * If it doesn't, we need to replace it with a T_TNODE.
1047 	 */
1048 
1049 	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
1050 		tn = (struct tnode *) n;
1051 
1052 		check_tnode(tn);
1053 
1054 		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
1055 			tp = tn;
1056 			pos = tn->pos + tn->bits;
1057 			n = tnode_get_child(tn,
1058 					    tkey_extract_bits(key,
1059 							      tn->pos,
1060 							      tn->bits));
1061 
1062 			BUG_ON(n && node_parent(n) != tn);
1063 		} else
1064 			break;
1065 	}
1066 
1067 	/*
1068 	 * n  ----> NULL, LEAF or TNODE
1069 	 *
1070 	 * tp is n's (parent) ----> NULL or TNODE
1071 	 */
1072 
1073 	BUG_ON(tp && IS_LEAF(tp));
1074 
1075 	/* Case 1: n is a leaf. Compare prefixes */
1076 
1077 	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1078 		l = (struct leaf *) n;
1079 		li = leaf_info_new(plen);
1080 
1081 		if (!li)
1082 			return NULL;
1083 
1084 		fa_head = &li->falh;
1085 		insert_leaf_info(&l->list, li);
1086 		goto done;
1087 	}
1088 	l = leaf_new();
1089 
1090 	if (!l)
1091 		return NULL;
1092 
1093 	l->key = key;
1094 	li = leaf_info_new(plen);
1095 
1096 	if (!li) {
1097 		free_leaf(l);
1098 		return NULL;
1099 	}
1100 
1101 	fa_head = &li->falh;
1102 	insert_leaf_info(&l->list, li);
1103 
1104 	if (t->trie && n == NULL) {
1105 		/* Case 2: n is NULL, and will just insert a new leaf */
1106 
1107 		node_set_parent((struct node *)l, tp);
1108 
1109 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1110 		put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
1111 	} else {
1112 		/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1113 		/*
1114 		 *  Add a new tnode here
1115 		 *  first tnode need some special handling
1116 		 */
1117 
1118 		if (tp)
1119 			pos = tp->pos+tp->bits;
1120 		else
1121 			pos = 0;
1122 
1123 		if (n) {
1124 			newpos = tkey_mismatch(key, pos, n->key);
1125 			tn = tnode_new(n->key, newpos, 1);
1126 		} else {
1127 			newpos = 0;
1128 			tn = tnode_new(key, newpos, 1); /* First tnode */
1129 		}
1130 
1131 		if (!tn) {
1132 			free_leaf_info(li);
1133 			free_leaf(l);
1134 			return NULL;
1135 		}
1136 
1137 		node_set_parent((struct node *)tn, tp);
1138 
1139 		missbit = tkey_extract_bits(key, newpos, 1);
1140 		put_child(t, tn, missbit, (struct node *)l);
1141 		put_child(t, tn, 1-missbit, n);
1142 
1143 		if (tp) {
1144 			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1145 			put_child(t, (struct tnode *)tp, cindex,
1146 				  (struct node *)tn);
1147 		} else {
1148 			rcu_assign_pointer(t->trie, (struct node *)tn);
1149 			tp = tn;
1150 		}
1151 	}
1152 
1153 	if (tp && tp->pos + tp->bits > 32)
1154 		pr_warning("fib_trie"
1155 			   " tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1156 			   tp, tp->pos, tp->bits, key, plen);
1157 
1158 	/* Rebalance the trie */
1159 
1160 	rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
1161 done:
1162 	return fa_head;
1163 }
1164 
1165 /*
1166  * Caller must hold RTNL.
1167  */
1168 static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg)
1169 {
1170 	struct trie *t = (struct trie *) tb->tb_data;
1171 	struct fib_alias *fa, *new_fa;
1172 	struct list_head *fa_head = NULL;
1173 	struct fib_info *fi;
1174 	int plen = cfg->fc_dst_len;
1175 	u8 tos = cfg->fc_tos;
1176 	u32 key, mask;
1177 	int err;
1178 	struct leaf *l;
1179 
1180 	if (plen > 32)
1181 		return -EINVAL;
1182 
1183 	key = ntohl(cfg->fc_dst);
1184 
1185 	pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
1186 
1187 	mask = ntohl(inet_make_mask(plen));
1188 
1189 	if (key & ~mask)
1190 		return -EINVAL;
1191 
1192 	key = key & mask;
1193 
1194 	fi = fib_create_info(cfg);
1195 	if (IS_ERR(fi)) {
1196 		err = PTR_ERR(fi);
1197 		goto err;
1198 	}
1199 
1200 	l = fib_find_node(t, key);
1201 	fa = NULL;
1202 
1203 	if (l) {
1204 		fa_head = get_fa_head(l, plen);
1205 		fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1206 	}
1207 
1208 	/* Now fa, if non-NULL, points to the first fib alias
1209 	 * with the same keys [prefix,tos,priority], if such key already
1210 	 * exists or to the node before which we will insert new one.
1211 	 *
1212 	 * If fa is NULL, we will need to allocate a new one and
1213 	 * insert to the head of f.
1214 	 *
1215 	 * If f is NULL, no fib node matched the destination key
1216 	 * and we need to allocate a new one of those as well.
1217 	 */
1218 
1219 	if (fa && fa->fa_tos == tos &&
1220 	    fa->fa_info->fib_priority == fi->fib_priority) {
1221 		struct fib_alias *fa_first, *fa_match;
1222 
1223 		err = -EEXIST;
1224 		if (cfg->fc_nlflags & NLM_F_EXCL)
1225 			goto out;
1226 
1227 		/* We have 2 goals:
1228 		 * 1. Find exact match for type, scope, fib_info to avoid
1229 		 * duplicate routes
1230 		 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
1231 		 */
1232 		fa_match = NULL;
1233 		fa_first = fa;
1234 		fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1235 		list_for_each_entry_continue(fa, fa_head, fa_list) {
1236 			if (fa->fa_tos != tos)
1237 				break;
1238 			if (fa->fa_info->fib_priority != fi->fib_priority)
1239 				break;
1240 			if (fa->fa_type == cfg->fc_type &&
1241 			    fa->fa_scope == cfg->fc_scope &&
1242 			    fa->fa_info == fi) {
1243 				fa_match = fa;
1244 				break;
1245 			}
1246 		}
1247 
1248 		if (cfg->fc_nlflags & NLM_F_REPLACE) {
1249 			struct fib_info *fi_drop;
1250 			u8 state;
1251 
1252 			fa = fa_first;
1253 			if (fa_match) {
1254 				if (fa == fa_match)
1255 					err = 0;
1256 				goto out;
1257 			}
1258 			err = -ENOBUFS;
1259 			new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1260 			if (new_fa == NULL)
1261 				goto out;
1262 
1263 			fi_drop = fa->fa_info;
1264 			new_fa->fa_tos = fa->fa_tos;
1265 			new_fa->fa_info = fi;
1266 			new_fa->fa_type = cfg->fc_type;
1267 			new_fa->fa_scope = cfg->fc_scope;
1268 			state = fa->fa_state;
1269 			new_fa->fa_state = state & ~FA_S_ACCESSED;
1270 
1271 			list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1272 			alias_free_mem_rcu(fa);
1273 
1274 			fib_release_info(fi_drop);
1275 			if (state & FA_S_ACCESSED)
1276 				rt_cache_flush(-1);
1277 			rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1278 				tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
1279 
1280 			goto succeeded;
1281 		}
1282 		/* Error if we find a perfect match which
1283 		 * uses the same scope, type, and nexthop
1284 		 * information.
1285 		 */
1286 		if (fa_match)
1287 			goto out;
1288 
1289 		if (!(cfg->fc_nlflags & NLM_F_APPEND))
1290 			fa = fa_first;
1291 	}
1292 	err = -ENOENT;
1293 	if (!(cfg->fc_nlflags & NLM_F_CREATE))
1294 		goto out;
1295 
1296 	err = -ENOBUFS;
1297 	new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1298 	if (new_fa == NULL)
1299 		goto out;
1300 
1301 	new_fa->fa_info = fi;
1302 	new_fa->fa_tos = tos;
1303 	new_fa->fa_type = cfg->fc_type;
1304 	new_fa->fa_scope = cfg->fc_scope;
1305 	new_fa->fa_state = 0;
1306 	/*
1307 	 * Insert new entry to the list.
1308 	 */
1309 
1310 	if (!fa_head) {
1311 		fa_head = fib_insert_node(t, key, plen);
1312 		if (unlikely(!fa_head)) {
1313 			err = -ENOMEM;
1314 			goto out_free_new_fa;
1315 		}
1316 	}
1317 
1318 	list_add_tail_rcu(&new_fa->fa_list,
1319 			  (fa ? &fa->fa_list : fa_head));
1320 
1321 	rt_cache_flush(-1);
1322 	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
1323 		  &cfg->fc_nlinfo, 0);
1324 succeeded:
1325 	return 0;
1326 
1327 out_free_new_fa:
1328 	kmem_cache_free(fn_alias_kmem, new_fa);
1329 out:
1330 	fib_release_info(fi);
1331 err:
1332 	return err;
1333 }
1334 
1335 /* should be called with rcu_read_lock */
1336 static int check_leaf(struct trie *t, struct leaf *l,
1337 		      t_key key,  const struct flowi *flp,
1338 		      struct fib_result *res)
1339 {
1340 	struct leaf_info *li;
1341 	struct hlist_head *hhead = &l->list;
1342 	struct hlist_node *node;
1343 
1344 	hlist_for_each_entry_rcu(li, node, hhead, hlist) {
1345 		int err;
1346 		int plen = li->plen;
1347 		__be32 mask = inet_make_mask(plen);
1348 
1349 		if (l->key != (key & ntohl(mask)))
1350 			continue;
1351 
1352 		err = fib_semantic_match(&li->falh, flp, res,
1353 					 htonl(l->key), mask, plen);
1354 
1355 #ifdef CONFIG_IP_FIB_TRIE_STATS
1356 		if (err <= 0)
1357 			t->stats.semantic_match_passed++;
1358 		else
1359 			t->stats.semantic_match_miss++;
1360 #endif
1361 		if (err <= 0)
1362 			return plen;
1363 	}
1364 
1365 	return -1;
1366 }
1367 
1368 static int fn_trie_lookup(struct fib_table *tb, const struct flowi *flp,
1369 			  struct fib_result *res)
1370 {
1371 	struct trie *t = (struct trie *) tb->tb_data;
1372 	int plen, ret = 0;
1373 	struct node *n;
1374 	struct tnode *pn;
1375 	int pos, bits;
1376 	t_key key = ntohl(flp->fl4_dst);
1377 	int chopped_off;
1378 	t_key cindex = 0;
1379 	int current_prefix_length = KEYLENGTH;
1380 	struct tnode *cn;
1381 	t_key node_prefix, key_prefix, pref_mismatch;
1382 	int mp;
1383 
1384 	rcu_read_lock();
1385 
1386 	n = rcu_dereference(t->trie);
1387 	if (!n)
1388 		goto failed;
1389 
1390 #ifdef CONFIG_IP_FIB_TRIE_STATS
1391 	t->stats.gets++;
1392 #endif
1393 
1394 	/* Just a leaf? */
1395 	if (IS_LEAF(n)) {
1396 		plen = check_leaf(t, (struct leaf *)n, key, flp, res);
1397 		if (plen < 0)
1398 			goto failed;
1399 		ret = 0;
1400 		goto found;
1401 	}
1402 
1403 	pn = (struct tnode *) n;
1404 	chopped_off = 0;
1405 
1406 	while (pn) {
1407 		pos = pn->pos;
1408 		bits = pn->bits;
1409 
1410 		if (!chopped_off)
1411 			cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
1412 						   pos, bits);
1413 
1414 		n = tnode_get_child(pn, cindex);
1415 
1416 		if (n == NULL) {
1417 #ifdef CONFIG_IP_FIB_TRIE_STATS
1418 			t->stats.null_node_hit++;
1419 #endif
1420 			goto backtrace;
1421 		}
1422 
1423 		if (IS_LEAF(n)) {
1424 			plen = check_leaf(t, (struct leaf *)n, key, flp, res);
1425 			if (plen < 0)
1426 				goto backtrace;
1427 
1428 			ret = 0;
1429 			goto found;
1430 		}
1431 
1432 		cn = (struct tnode *)n;
1433 
1434 		/*
1435 		 * It's a tnode, and we can do some extra checks here if we
1436 		 * like, to avoid descending into a dead-end branch.
1437 		 * This tnode is in the parent's child array at index
1438 		 * key[p_pos..p_pos+p_bits] but potentially with some bits
1439 		 * chopped off, so in reality the index may be just a
1440 		 * subprefix, padded with zero at the end.
1441 		 * We can also take a look at any skipped bits in this
1442 		 * tnode - everything up to p_pos is supposed to be ok,
1443 		 * and the non-chopped bits of the index (se previous
1444 		 * paragraph) are also guaranteed ok, but the rest is
1445 		 * considered unknown.
1446 		 *
1447 		 * The skipped bits are key[pos+bits..cn->pos].
1448 		 */
1449 
1450 		/* If current_prefix_length < pos+bits, we are already doing
1451 		 * actual prefix  matching, which means everything from
1452 		 * pos+(bits-chopped_off) onward must be zero along some
1453 		 * branch of this subtree - otherwise there is *no* valid
1454 		 * prefix present. Here we can only check the skipped
1455 		 * bits. Remember, since we have already indexed into the
1456 		 * parent's child array, we know that the bits we chopped of
1457 		 * *are* zero.
1458 		 */
1459 
1460 		/* NOTA BENE: Checking only skipped bits
1461 		   for the new node here */
1462 
1463 		if (current_prefix_length < pos+bits) {
1464 			if (tkey_extract_bits(cn->key, current_prefix_length,
1465 						cn->pos - current_prefix_length)
1466 			    || !(cn->child[0]))
1467 				goto backtrace;
1468 		}
1469 
1470 		/*
1471 		 * If chopped_off=0, the index is fully validated and we
1472 		 * only need to look at the skipped bits for this, the new,
1473 		 * tnode. What we actually want to do is to find out if
1474 		 * these skipped bits match our key perfectly, or if we will
1475 		 * have to count on finding a matching prefix further down,
1476 		 * because if we do, we would like to have some way of
1477 		 * verifying the existence of such a prefix at this point.
1478 		 */
1479 
1480 		/* The only thing we can do at this point is to verify that
1481 		 * any such matching prefix can indeed be a prefix to our
1482 		 * key, and if the bits in the node we are inspecting that
1483 		 * do not match our key are not ZERO, this cannot be true.
1484 		 * Thus, find out where there is a mismatch (before cn->pos)
1485 		 * and verify that all the mismatching bits are zero in the
1486 		 * new tnode's key.
1487 		 */
1488 
1489 		/*
1490 		 * Note: We aren't very concerned about the piece of
1491 		 * the key that precede pn->pos+pn->bits, since these
1492 		 * have already been checked. The bits after cn->pos
1493 		 * aren't checked since these are by definition
1494 		 * "unknown" at this point. Thus, what we want to see
1495 		 * is if we are about to enter the "prefix matching"
1496 		 * state, and in that case verify that the skipped
1497 		 * bits that will prevail throughout this subtree are
1498 		 * zero, as they have to be if we are to find a
1499 		 * matching prefix.
1500 		 */
1501 
1502 		node_prefix = mask_pfx(cn->key, cn->pos);
1503 		key_prefix = mask_pfx(key, cn->pos);
1504 		pref_mismatch = key_prefix^node_prefix;
1505 		mp = 0;
1506 
1507 		/*
1508 		 * In short: If skipped bits in this node do not match
1509 		 * the search key, enter the "prefix matching"
1510 		 * state.directly.
1511 		 */
1512 		if (pref_mismatch) {
1513 			while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1514 				mp++;
1515 				pref_mismatch = pref_mismatch << 1;
1516 			}
1517 			key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1518 
1519 			if (key_prefix != 0)
1520 				goto backtrace;
1521 
1522 			if (current_prefix_length >= cn->pos)
1523 				current_prefix_length = mp;
1524 		}
1525 
1526 		pn = (struct tnode *)n; /* Descend */
1527 		chopped_off = 0;
1528 		continue;
1529 
1530 backtrace:
1531 		chopped_off++;
1532 
1533 		/* As zero don't change the child key (cindex) */
1534 		while ((chopped_off <= pn->bits)
1535 		       && !(cindex & (1<<(chopped_off-1))))
1536 			chopped_off++;
1537 
1538 		/* Decrease current_... with bits chopped off */
1539 		if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1540 			current_prefix_length = pn->pos + pn->bits
1541 				- chopped_off;
1542 
1543 		/*
1544 		 * Either we do the actual chop off according or if we have
1545 		 * chopped off all bits in this tnode walk up to our parent.
1546 		 */
1547 
1548 		if (chopped_off <= pn->bits) {
1549 			cindex &= ~(1 << (chopped_off-1));
1550 		} else {
1551 			struct tnode *parent = node_parent((struct node *) pn);
1552 			if (!parent)
1553 				goto failed;
1554 
1555 			/* Get Child's index */
1556 			cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
1557 			pn = parent;
1558 			chopped_off = 0;
1559 
1560 #ifdef CONFIG_IP_FIB_TRIE_STATS
1561 			t->stats.backtrack++;
1562 #endif
1563 			goto backtrace;
1564 		}
1565 	}
1566 failed:
1567 	ret = 1;
1568 found:
1569 	rcu_read_unlock();
1570 	return ret;
1571 }
1572 
1573 /*
1574  * Remove the leaf and return parent.
1575  */
1576 static void trie_leaf_remove(struct trie *t, struct leaf *l)
1577 {
1578 	struct tnode *tp = node_parent((struct node *) l);
1579 
1580 	pr_debug("entering trie_leaf_remove(%p)\n", l);
1581 
1582 	if (tp) {
1583 		t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
1584 		put_child(t, (struct tnode *)tp, cindex, NULL);
1585 		rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
1586 	} else
1587 		rcu_assign_pointer(t->trie, NULL);
1588 
1589 	free_leaf(l);
1590 }
1591 
1592 /*
1593  * Caller must hold RTNL.
1594  */
1595 static int fn_trie_delete(struct fib_table *tb, struct fib_config *cfg)
1596 {
1597 	struct trie *t = (struct trie *) tb->tb_data;
1598 	u32 key, mask;
1599 	int plen = cfg->fc_dst_len;
1600 	u8 tos = cfg->fc_tos;
1601 	struct fib_alias *fa, *fa_to_delete;
1602 	struct list_head *fa_head;
1603 	struct leaf *l;
1604 	struct leaf_info *li;
1605 
1606 	if (plen > 32)
1607 		return -EINVAL;
1608 
1609 	key = ntohl(cfg->fc_dst);
1610 	mask = ntohl(inet_make_mask(plen));
1611 
1612 	if (key & ~mask)
1613 		return -EINVAL;
1614 
1615 	key = key & mask;
1616 	l = fib_find_node(t, key);
1617 
1618 	if (!l)
1619 		return -ESRCH;
1620 
1621 	fa_head = get_fa_head(l, plen);
1622 	fa = fib_find_alias(fa_head, tos, 0);
1623 
1624 	if (!fa)
1625 		return -ESRCH;
1626 
1627 	pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1628 
1629 	fa_to_delete = NULL;
1630 	fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1631 	list_for_each_entry_continue(fa, fa_head, fa_list) {
1632 		struct fib_info *fi = fa->fa_info;
1633 
1634 		if (fa->fa_tos != tos)
1635 			break;
1636 
1637 		if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1638 		    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1639 		     fa->fa_scope == cfg->fc_scope) &&
1640 		    (!cfg->fc_protocol ||
1641 		     fi->fib_protocol == cfg->fc_protocol) &&
1642 		    fib_nh_match(cfg, fi) == 0) {
1643 			fa_to_delete = fa;
1644 			break;
1645 		}
1646 	}
1647 
1648 	if (!fa_to_delete)
1649 		return -ESRCH;
1650 
1651 	fa = fa_to_delete;
1652 	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
1653 		  &cfg->fc_nlinfo, 0);
1654 
1655 	l = fib_find_node(t, key);
1656 	li = find_leaf_info(l, plen);
1657 
1658 	list_del_rcu(&fa->fa_list);
1659 
1660 	if (list_empty(fa_head)) {
1661 		hlist_del_rcu(&li->hlist);
1662 		free_leaf_info(li);
1663 	}
1664 
1665 	if (hlist_empty(&l->list))
1666 		trie_leaf_remove(t, l);
1667 
1668 	if (fa->fa_state & FA_S_ACCESSED)
1669 		rt_cache_flush(-1);
1670 
1671 	fib_release_info(fa->fa_info);
1672 	alias_free_mem_rcu(fa);
1673 	return 0;
1674 }
1675 
1676 static int trie_flush_list(struct list_head *head)
1677 {
1678 	struct fib_alias *fa, *fa_node;
1679 	int found = 0;
1680 
1681 	list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1682 		struct fib_info *fi = fa->fa_info;
1683 
1684 		if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1685 			list_del_rcu(&fa->fa_list);
1686 			fib_release_info(fa->fa_info);
1687 			alias_free_mem_rcu(fa);
1688 			found++;
1689 		}
1690 	}
1691 	return found;
1692 }
1693 
1694 static int trie_flush_leaf(struct leaf *l)
1695 {
1696 	int found = 0;
1697 	struct hlist_head *lih = &l->list;
1698 	struct hlist_node *node, *tmp;
1699 	struct leaf_info *li = NULL;
1700 
1701 	hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1702 		found += trie_flush_list(&li->falh);
1703 
1704 		if (list_empty(&li->falh)) {
1705 			hlist_del_rcu(&li->hlist);
1706 			free_leaf_info(li);
1707 		}
1708 	}
1709 	return found;
1710 }
1711 
1712 /*
1713  * Scan for the next right leaf starting at node p->child[idx]
1714  * Since we have back pointer, no recursion necessary.
1715  */
1716 static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c)
1717 {
1718 	do {
1719 		t_key idx;
1720 
1721 		if (c)
1722 			idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
1723 		else
1724 			idx = 0;
1725 
1726 		while (idx < 1u << p->bits) {
1727 			c = tnode_get_child_rcu(p, idx++);
1728 			if (!c)
1729 				continue;
1730 
1731 			if (IS_LEAF(c)) {
1732 				prefetch(p->child[idx]);
1733 				return (struct leaf *) c;
1734 			}
1735 
1736 			/* Rescan start scanning in new node */
1737 			p = (struct tnode *) c;
1738 			idx = 0;
1739 		}
1740 
1741 		/* Node empty, walk back up to parent */
1742 		c = (struct node *) p;
1743 	} while ( (p = node_parent_rcu(c)) != NULL);
1744 
1745 	return NULL; /* Root of trie */
1746 }
1747 
1748 static struct leaf *trie_firstleaf(struct trie *t)
1749 {
1750 	struct tnode *n = (struct tnode *) rcu_dereference(t->trie);
1751 
1752 	if (!n)
1753 		return NULL;
1754 
1755 	if (IS_LEAF(n))          /* trie is just a leaf */
1756 		return (struct leaf *) n;
1757 
1758 	return leaf_walk_rcu(n, NULL);
1759 }
1760 
1761 static struct leaf *trie_nextleaf(struct leaf *l)
1762 {
1763 	struct node *c = (struct node *) l;
1764 	struct tnode *p = node_parent(c);
1765 
1766 	if (!p)
1767 		return NULL;	/* trie with just one leaf */
1768 
1769 	return leaf_walk_rcu(p, c);
1770 }
1771 
1772 static struct leaf *trie_leafindex(struct trie *t, int index)
1773 {
1774 	struct leaf *l = trie_firstleaf(t);
1775 
1776 	while (l && index-- > 0)
1777 		l = trie_nextleaf(l);
1778 
1779 	return l;
1780 }
1781 
1782 
1783 /*
1784  * Caller must hold RTNL.
1785  */
1786 static int fn_trie_flush(struct fib_table *tb)
1787 {
1788 	struct trie *t = (struct trie *) tb->tb_data;
1789 	struct leaf *l, *ll = NULL;
1790 	int found = 0;
1791 
1792 	for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
1793 		found += trie_flush_leaf(l);
1794 
1795 		if (ll && hlist_empty(&ll->list))
1796 			trie_leaf_remove(t, ll);
1797 		ll = l;
1798 	}
1799 
1800 	if (ll && hlist_empty(&ll->list))
1801 		trie_leaf_remove(t, ll);
1802 
1803 	pr_debug("trie_flush found=%d\n", found);
1804 	return found;
1805 }
1806 
1807 static void fn_trie_select_default(struct fib_table *tb,
1808 				   const struct flowi *flp,
1809 				   struct fib_result *res)
1810 {
1811 	struct trie *t = (struct trie *) tb->tb_data;
1812 	int order, last_idx;
1813 	struct fib_info *fi = NULL;
1814 	struct fib_info *last_resort;
1815 	struct fib_alias *fa = NULL;
1816 	struct list_head *fa_head;
1817 	struct leaf *l;
1818 
1819 	last_idx = -1;
1820 	last_resort = NULL;
1821 	order = -1;
1822 
1823 	rcu_read_lock();
1824 
1825 	l = fib_find_node(t, 0);
1826 	if (!l)
1827 		goto out;
1828 
1829 	fa_head = get_fa_head(l, 0);
1830 	if (!fa_head)
1831 		goto out;
1832 
1833 	if (list_empty(fa_head))
1834 		goto out;
1835 
1836 	list_for_each_entry_rcu(fa, fa_head, fa_list) {
1837 		struct fib_info *next_fi = fa->fa_info;
1838 
1839 		if (fa->fa_scope != res->scope ||
1840 		    fa->fa_type != RTN_UNICAST)
1841 			continue;
1842 
1843 		if (next_fi->fib_priority > res->fi->fib_priority)
1844 			break;
1845 		if (!next_fi->fib_nh[0].nh_gw ||
1846 		    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1847 			continue;
1848 		fa->fa_state |= FA_S_ACCESSED;
1849 
1850 		if (fi == NULL) {
1851 			if (next_fi != res->fi)
1852 				break;
1853 		} else if (!fib_detect_death(fi, order, &last_resort,
1854 					     &last_idx, tb->tb_default)) {
1855 			fib_result_assign(res, fi);
1856 			tb->tb_default = order;
1857 			goto out;
1858 		}
1859 		fi = next_fi;
1860 		order++;
1861 	}
1862 	if (order <= 0 || fi == NULL) {
1863 		tb->tb_default = -1;
1864 		goto out;
1865 	}
1866 
1867 	if (!fib_detect_death(fi, order, &last_resort, &last_idx,
1868 				tb->tb_default)) {
1869 		fib_result_assign(res, fi);
1870 		tb->tb_default = order;
1871 		goto out;
1872 	}
1873 	if (last_idx >= 0)
1874 		fib_result_assign(res, last_resort);
1875 	tb->tb_default = last_idx;
1876 out:
1877 	rcu_read_unlock();
1878 }
1879 
1880 static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
1881 			   struct fib_table *tb,
1882 			   struct sk_buff *skb, struct netlink_callback *cb)
1883 {
1884 	int i, s_i;
1885 	struct fib_alias *fa;
1886 	__be32 xkey = htonl(key);
1887 
1888 	s_i = cb->args[5];
1889 	i = 0;
1890 
1891 	/* rcu_read_lock is hold by caller */
1892 
1893 	list_for_each_entry_rcu(fa, fah, fa_list) {
1894 		if (i < s_i) {
1895 			i++;
1896 			continue;
1897 		}
1898 
1899 		if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1900 				  cb->nlh->nlmsg_seq,
1901 				  RTM_NEWROUTE,
1902 				  tb->tb_id,
1903 				  fa->fa_type,
1904 				  fa->fa_scope,
1905 				  xkey,
1906 				  plen,
1907 				  fa->fa_tos,
1908 				  fa->fa_info, NLM_F_MULTI) < 0) {
1909 			cb->args[5] = i;
1910 			return -1;
1911 		}
1912 		i++;
1913 	}
1914 	cb->args[5] = i;
1915 	return skb->len;
1916 }
1917 
1918 static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
1919 			struct sk_buff *skb, struct netlink_callback *cb)
1920 {
1921 	struct leaf_info *li;
1922 	struct hlist_node *node;
1923 	int i, s_i;
1924 
1925 	s_i = cb->args[4];
1926 	i = 0;
1927 
1928 	/* rcu_read_lock is hold by caller */
1929 	hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
1930 		if (i < s_i) {
1931 			i++;
1932 			continue;
1933 		}
1934 
1935 		if (i > s_i)
1936 			cb->args[5] = 0;
1937 
1938 		if (list_empty(&li->falh))
1939 			continue;
1940 
1941 		if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
1942 			cb->args[4] = i;
1943 			return -1;
1944 		}
1945 		i++;
1946 	}
1947 
1948 	cb->args[4] = i;
1949 	return skb->len;
1950 }
1951 
1952 static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb,
1953 			struct netlink_callback *cb)
1954 {
1955 	struct leaf *l;
1956 	struct trie *t = (struct trie *) tb->tb_data;
1957 	t_key key = cb->args[2];
1958 	int count = cb->args[3];
1959 
1960 	rcu_read_lock();
1961 	/* Dump starting at last key.
1962 	 * Note: 0.0.0.0/0 (ie default) is first key.
1963 	 */
1964 	if (count == 0)
1965 		l = trie_firstleaf(t);
1966 	else {
1967 		/* Normally, continue from last key, but if that is missing
1968 		 * fallback to using slow rescan
1969 		 */
1970 		l = fib_find_node(t, key);
1971 		if (!l)
1972 			l = trie_leafindex(t, count);
1973 	}
1974 
1975 	while (l) {
1976 		cb->args[2] = l->key;
1977 		if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
1978 			cb->args[3] = count;
1979 			rcu_read_unlock();
1980 			return -1;
1981 		}
1982 
1983 		++count;
1984 		l = trie_nextleaf(l);
1985 		memset(&cb->args[4], 0,
1986 		       sizeof(cb->args) - 4*sizeof(cb->args[0]));
1987 	}
1988 	cb->args[3] = count;
1989 	rcu_read_unlock();
1990 
1991 	return skb->len;
1992 }
1993 
1994 void __init fib_hash_init(void)
1995 {
1996 	fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1997 					  sizeof(struct fib_alias),
1998 					  0, SLAB_PANIC, NULL);
1999 
2000 	trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
2001 					   max(sizeof(struct leaf),
2002 					       sizeof(struct leaf_info)),
2003 					   0, SLAB_PANIC, NULL);
2004 }
2005 
2006 
2007 /* Fix more generic FIB names for init later */
2008 struct fib_table *fib_hash_table(u32 id)
2009 {
2010 	struct fib_table *tb;
2011 	struct trie *t;
2012 
2013 	tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
2014 		     GFP_KERNEL);
2015 	if (tb == NULL)
2016 		return NULL;
2017 
2018 	tb->tb_id = id;
2019 	tb->tb_default = -1;
2020 	tb->tb_lookup = fn_trie_lookup;
2021 	tb->tb_insert = fn_trie_insert;
2022 	tb->tb_delete = fn_trie_delete;
2023 	tb->tb_flush = fn_trie_flush;
2024 	tb->tb_select_default = fn_trie_select_default;
2025 	tb->tb_dump = fn_trie_dump;
2026 
2027 	t = (struct trie *) tb->tb_data;
2028 	memset(t, 0, sizeof(*t));
2029 
2030 	if (id == RT_TABLE_LOCAL)
2031 		pr_info("IPv4 FIB: Using LC-trie version %s\n", VERSION);
2032 
2033 	return tb;
2034 }
2035 
2036 #ifdef CONFIG_PROC_FS
2037 /* Depth first Trie walk iterator */
2038 struct fib_trie_iter {
2039 	struct seq_net_private p;
2040 	struct fib_table *tb;
2041 	struct tnode *tnode;
2042 	unsigned index;
2043 	unsigned depth;
2044 };
2045 
2046 static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
2047 {
2048 	struct tnode *tn = iter->tnode;
2049 	unsigned cindex = iter->index;
2050 	struct tnode *p;
2051 
2052 	/* A single entry routing table */
2053 	if (!tn)
2054 		return NULL;
2055 
2056 	pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
2057 		 iter->tnode, iter->index, iter->depth);
2058 rescan:
2059 	while (cindex < (1<<tn->bits)) {
2060 		struct node *n = tnode_get_child_rcu(tn, cindex);
2061 
2062 		if (n) {
2063 			if (IS_LEAF(n)) {
2064 				iter->tnode = tn;
2065 				iter->index = cindex + 1;
2066 			} else {
2067 				/* push down one level */
2068 				iter->tnode = (struct tnode *) n;
2069 				iter->index = 0;
2070 				++iter->depth;
2071 			}
2072 			return n;
2073 		}
2074 
2075 		++cindex;
2076 	}
2077 
2078 	/* Current node exhausted, pop back up */
2079 	p = node_parent_rcu((struct node *)tn);
2080 	if (p) {
2081 		cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
2082 		tn = p;
2083 		--iter->depth;
2084 		goto rescan;
2085 	}
2086 
2087 	/* got root? */
2088 	return NULL;
2089 }
2090 
2091 static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
2092 				       struct trie *t)
2093 {
2094 	struct node *n;
2095 
2096 	if (!t)
2097 		return NULL;
2098 
2099 	n = rcu_dereference(t->trie);
2100 	if (!n)
2101 		return NULL;
2102 
2103 	if (IS_TNODE(n)) {
2104 		iter->tnode = (struct tnode *) n;
2105 		iter->index = 0;
2106 		iter->depth = 1;
2107 	} else {
2108 		iter->tnode = NULL;
2109 		iter->index = 0;
2110 		iter->depth = 0;
2111 	}
2112 
2113 	return n;
2114 }
2115 
2116 static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2117 {
2118 	struct node *n;
2119 	struct fib_trie_iter iter;
2120 
2121 	memset(s, 0, sizeof(*s));
2122 
2123 	rcu_read_lock();
2124 	for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
2125 		if (IS_LEAF(n)) {
2126 			struct leaf *l = (struct leaf *)n;
2127 			struct leaf_info *li;
2128 			struct hlist_node *tmp;
2129 
2130 			s->leaves++;
2131 			s->totdepth += iter.depth;
2132 			if (iter.depth > s->maxdepth)
2133 				s->maxdepth = iter.depth;
2134 
2135 			hlist_for_each_entry_rcu(li, tmp, &l->list, hlist)
2136 				++s->prefixes;
2137 		} else {
2138 			const struct tnode *tn = (const struct tnode *) n;
2139 			int i;
2140 
2141 			s->tnodes++;
2142 			if (tn->bits < MAX_STAT_DEPTH)
2143 				s->nodesizes[tn->bits]++;
2144 
2145 			for (i = 0; i < (1<<tn->bits); i++)
2146 				if (!tn->child[i])
2147 					s->nullpointers++;
2148 		}
2149 	}
2150 	rcu_read_unlock();
2151 }
2152 
2153 /*
2154  *	This outputs /proc/net/fib_triestats
2155  */
2156 static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2157 {
2158 	unsigned i, max, pointers, bytes, avdepth;
2159 
2160 	if (stat->leaves)
2161 		avdepth = stat->totdepth*100 / stat->leaves;
2162 	else
2163 		avdepth = 0;
2164 
2165 	seq_printf(seq, "\tAver depth:     %u.%02d\n",
2166 		   avdepth / 100, avdepth % 100);
2167 	seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);
2168 
2169 	seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
2170 	bytes = sizeof(struct leaf) * stat->leaves;
2171 
2172 	seq_printf(seq, "\tPrefixes:       %u\n", stat->prefixes);
2173 	bytes += sizeof(struct leaf_info) * stat->prefixes;
2174 
2175 	seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
2176 	bytes += sizeof(struct tnode) * stat->tnodes;
2177 
2178 	max = MAX_STAT_DEPTH;
2179 	while (max > 0 && stat->nodesizes[max-1] == 0)
2180 		max--;
2181 
2182 	pointers = 0;
2183 	for (i = 1; i <= max; i++)
2184 		if (stat->nodesizes[i] != 0) {
2185 			seq_printf(seq, "  %u: %u",  i, stat->nodesizes[i]);
2186 			pointers += (1<<i) * stat->nodesizes[i];
2187 		}
2188 	seq_putc(seq, '\n');
2189 	seq_printf(seq, "\tPointers: %u\n", pointers);
2190 
2191 	bytes += sizeof(struct node *) * pointers;
2192 	seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2193 	seq_printf(seq, "Total size: %u  kB\n", (bytes + 1023) / 1024);
2194 }
2195 
2196 #ifdef CONFIG_IP_FIB_TRIE_STATS
2197 static void trie_show_usage(struct seq_file *seq,
2198 			    const struct trie_use_stats *stats)
2199 {
2200 	seq_printf(seq, "\nCounters:\n---------\n");
2201 	seq_printf(seq, "gets = %u\n", stats->gets);
2202 	seq_printf(seq, "backtracks = %u\n", stats->backtrack);
2203 	seq_printf(seq, "semantic match passed = %u\n",
2204 		   stats->semantic_match_passed);
2205 	seq_printf(seq, "semantic match miss = %u\n",
2206 		   stats->semantic_match_miss);
2207 	seq_printf(seq, "null node hit= %u\n", stats->null_node_hit);
2208 	seq_printf(seq, "skipped node resize = %u\n\n",
2209 		   stats->resize_node_skipped);
2210 }
2211 #endif /*  CONFIG_IP_FIB_TRIE_STATS */
2212 
2213 static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
2214 {
2215 	if (tb->tb_id == RT_TABLE_LOCAL)
2216 		seq_puts(seq, "Local:\n");
2217 	else if (tb->tb_id == RT_TABLE_MAIN)
2218 		seq_puts(seq, "Main:\n");
2219 	else
2220 		seq_printf(seq, "Id %d:\n", tb->tb_id);
2221 }
2222 
2223 
2224 static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2225 {
2226 	struct net *net = (struct net *)seq->private;
2227 	unsigned int h;
2228 
2229 	seq_printf(seq,
2230 		   "Basic info: size of leaf:"
2231 		   " %Zd bytes, size of tnode: %Zd bytes.\n",
2232 		   sizeof(struct leaf), sizeof(struct tnode));
2233 
2234 	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2235 		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2236 		struct hlist_node *node;
2237 		struct fib_table *tb;
2238 
2239 		hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
2240 			struct trie *t = (struct trie *) tb->tb_data;
2241 			struct trie_stat stat;
2242 
2243 			if (!t)
2244 				continue;
2245 
2246 			fib_table_print(seq, tb);
2247 
2248 			trie_collect_stats(t, &stat);
2249 			trie_show_stats(seq, &stat);
2250 #ifdef CONFIG_IP_FIB_TRIE_STATS
2251 			trie_show_usage(seq, &t->stats);
2252 #endif
2253 		}
2254 	}
2255 
2256 	return 0;
2257 }
2258 
2259 static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2260 {
2261 	int err;
2262 	struct net *net;
2263 
2264 	net = get_proc_net(inode);
2265 	if (net == NULL)
2266 		return -ENXIO;
2267 	err = single_open(file, fib_triestat_seq_show, net);
2268 	if (err < 0) {
2269 		put_net(net);
2270 		return err;
2271 	}
2272 	return 0;
2273 }
2274 
2275 static int fib_triestat_seq_release(struct inode *ino, struct file *f)
2276 {
2277 	struct seq_file *seq = f->private_data;
2278 	put_net(seq->private);
2279 	return single_release(ino, f);
2280 }
2281 
2282 static const struct file_operations fib_triestat_fops = {
2283 	.owner	= THIS_MODULE,
2284 	.open	= fib_triestat_seq_open,
2285 	.read	= seq_read,
2286 	.llseek	= seq_lseek,
2287 	.release = fib_triestat_seq_release,
2288 };
2289 
2290 static struct node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
2291 {
2292 	struct fib_trie_iter *iter = seq->private;
2293 	struct net *net = seq_file_net(seq);
2294 	loff_t idx = 0;
2295 	unsigned int h;
2296 
2297 	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2298 		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2299 		struct hlist_node *node;
2300 		struct fib_table *tb;
2301 
2302 		hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
2303 			struct node *n;
2304 
2305 			for (n = fib_trie_get_first(iter,
2306 						    (struct trie *) tb->tb_data);
2307 			     n; n = fib_trie_get_next(iter))
2308 				if (pos == idx++) {
2309 					iter->tb = tb;
2310 					return n;
2311 				}
2312 		}
2313 	}
2314 
2315 	return NULL;
2316 }
2317 
2318 static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2319 	__acquires(RCU)
2320 {
2321 	rcu_read_lock();
2322 	return fib_trie_get_idx(seq, *pos);
2323 }
2324 
2325 static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2326 {
2327 	struct fib_trie_iter *iter = seq->private;
2328 	struct net *net = seq_file_net(seq);
2329 	struct fib_table *tb = iter->tb;
2330 	struct hlist_node *tb_node;
2331 	unsigned int h;
2332 	struct node *n;
2333 
2334 	++*pos;
2335 	/* next node in same table */
2336 	n = fib_trie_get_next(iter);
2337 	if (n)
2338 		return n;
2339 
2340 	/* walk rest of this hash chain */
2341 	h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
2342 	while ( (tb_node = rcu_dereference(tb->tb_hlist.next)) ) {
2343 		tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
2344 		n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2345 		if (n)
2346 			goto found;
2347 	}
2348 
2349 	/* new hash chain */
2350 	while (++h < FIB_TABLE_HASHSZ) {
2351 		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2352 		hlist_for_each_entry_rcu(tb, tb_node, head, tb_hlist) {
2353 			n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2354 			if (n)
2355 				goto found;
2356 		}
2357 	}
2358 	return NULL;
2359 
2360 found:
2361 	iter->tb = tb;
2362 	return n;
2363 }
2364 
2365 static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2366 	__releases(RCU)
2367 {
2368 	rcu_read_unlock();
2369 }
2370 
2371 static void seq_indent(struct seq_file *seq, int n)
2372 {
2373 	while (n-- > 0) seq_puts(seq, "   ");
2374 }
2375 
2376 static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
2377 {
2378 	switch (s) {
2379 	case RT_SCOPE_UNIVERSE: return "universe";
2380 	case RT_SCOPE_SITE:	return "site";
2381 	case RT_SCOPE_LINK:	return "link";
2382 	case RT_SCOPE_HOST:	return "host";
2383 	case RT_SCOPE_NOWHERE:	return "nowhere";
2384 	default:
2385 		snprintf(buf, len, "scope=%d", s);
2386 		return buf;
2387 	}
2388 }
2389 
2390 static const char *rtn_type_names[__RTN_MAX] = {
2391 	[RTN_UNSPEC] = "UNSPEC",
2392 	[RTN_UNICAST] = "UNICAST",
2393 	[RTN_LOCAL] = "LOCAL",
2394 	[RTN_BROADCAST] = "BROADCAST",
2395 	[RTN_ANYCAST] = "ANYCAST",
2396 	[RTN_MULTICAST] = "MULTICAST",
2397 	[RTN_BLACKHOLE] = "BLACKHOLE",
2398 	[RTN_UNREACHABLE] = "UNREACHABLE",
2399 	[RTN_PROHIBIT] = "PROHIBIT",
2400 	[RTN_THROW] = "THROW",
2401 	[RTN_NAT] = "NAT",
2402 	[RTN_XRESOLVE] = "XRESOLVE",
2403 };
2404 
2405 static inline const char *rtn_type(char *buf, size_t len, unsigned t)
2406 {
2407 	if (t < __RTN_MAX && rtn_type_names[t])
2408 		return rtn_type_names[t];
2409 	snprintf(buf, len, "type %u", t);
2410 	return buf;
2411 }
2412 
2413 /* Pretty print the trie */
2414 static int fib_trie_seq_show(struct seq_file *seq, void *v)
2415 {
2416 	const struct fib_trie_iter *iter = seq->private;
2417 	struct node *n = v;
2418 
2419 	if (!node_parent_rcu(n))
2420 		fib_table_print(seq, iter->tb);
2421 
2422 	if (IS_TNODE(n)) {
2423 		struct tnode *tn = (struct tnode *) n;
2424 		__be32 prf = htonl(mask_pfx(tn->key, tn->pos));
2425 
2426 		seq_indent(seq, iter->depth-1);
2427 		seq_printf(seq, "  +-- " NIPQUAD_FMT "/%d %d %d %d\n",
2428 			   NIPQUAD(prf), tn->pos, tn->bits, tn->full_children,
2429 			   tn->empty_children);
2430 
2431 	} else {
2432 		struct leaf *l = (struct leaf *) n;
2433 		struct leaf_info *li;
2434 		struct hlist_node *node;
2435 		__be32 val = htonl(l->key);
2436 
2437 		seq_indent(seq, iter->depth);
2438 		seq_printf(seq, "  |-- " NIPQUAD_FMT "\n", NIPQUAD(val));
2439 
2440 		hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
2441 			struct fib_alias *fa;
2442 
2443 			list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2444 				char buf1[32], buf2[32];
2445 
2446 				seq_indent(seq, iter->depth+1);
2447 				seq_printf(seq, "  /%d %s %s", li->plen,
2448 					   rtn_scope(buf1, sizeof(buf1),
2449 						     fa->fa_scope),
2450 					   rtn_type(buf2, sizeof(buf2),
2451 						    fa->fa_type));
2452 				if (fa->fa_tos)
2453 					seq_printf(seq, " tos=%d", fa->fa_tos);
2454 				seq_putc(seq, '\n');
2455 			}
2456 		}
2457 	}
2458 
2459 	return 0;
2460 }
2461 
2462 static const struct seq_operations fib_trie_seq_ops = {
2463 	.start  = fib_trie_seq_start,
2464 	.next   = fib_trie_seq_next,
2465 	.stop   = fib_trie_seq_stop,
2466 	.show   = fib_trie_seq_show,
2467 };
2468 
2469 static int fib_trie_seq_open(struct inode *inode, struct file *file)
2470 {
2471 	return seq_open_net(inode, file, &fib_trie_seq_ops,
2472 			    sizeof(struct fib_trie_iter));
2473 }
2474 
2475 static const struct file_operations fib_trie_fops = {
2476 	.owner  = THIS_MODULE,
2477 	.open   = fib_trie_seq_open,
2478 	.read   = seq_read,
2479 	.llseek = seq_lseek,
2480 	.release = seq_release_net,
2481 };
2482 
2483 struct fib_route_iter {
2484 	struct seq_net_private p;
2485 	struct trie *main_trie;
2486 	loff_t	pos;
2487 	t_key	key;
2488 };
2489 
2490 static struct leaf *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
2491 {
2492 	struct leaf *l = NULL;
2493 	struct trie *t = iter->main_trie;
2494 
2495 	/* use cache location of last found key */
2496 	if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
2497 		pos -= iter->pos;
2498 	else {
2499 		iter->pos = 0;
2500 		l = trie_firstleaf(t);
2501 	}
2502 
2503 	while (l && pos-- > 0) {
2504 		iter->pos++;
2505 		l = trie_nextleaf(l);
2506 	}
2507 
2508 	if (l)
2509 		iter->key = pos;	/* remember it */
2510 	else
2511 		iter->pos = 0;		/* forget it */
2512 
2513 	return l;
2514 }
2515 
2516 static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
2517 	__acquires(RCU)
2518 {
2519 	struct fib_route_iter *iter = seq->private;
2520 	struct fib_table *tb;
2521 
2522 	rcu_read_lock();
2523 	tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
2524 	if (!tb)
2525 		return NULL;
2526 
2527 	iter->main_trie = (struct trie *) tb->tb_data;
2528 	if (*pos == 0)
2529 		return SEQ_START_TOKEN;
2530 	else
2531 		return fib_route_get_idx(iter, *pos - 1);
2532 }
2533 
2534 static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2535 {
2536 	struct fib_route_iter *iter = seq->private;
2537 	struct leaf *l = v;
2538 
2539 	++*pos;
2540 	if (v == SEQ_START_TOKEN) {
2541 		iter->pos = 0;
2542 		l = trie_firstleaf(iter->main_trie);
2543 	} else {
2544 		iter->pos++;
2545 		l = trie_nextleaf(l);
2546 	}
2547 
2548 	if (l)
2549 		iter->key = l->key;
2550 	else
2551 		iter->pos = 0;
2552 	return l;
2553 }
2554 
2555 static void fib_route_seq_stop(struct seq_file *seq, void *v)
2556 	__releases(RCU)
2557 {
2558 	rcu_read_unlock();
2559 }
2560 
2561 static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2562 {
2563 	static unsigned type2flags[RTN_MAX + 1] = {
2564 		[7] = RTF_REJECT, [8] = RTF_REJECT,
2565 	};
2566 	unsigned flags = type2flags[type];
2567 
2568 	if (fi && fi->fib_nh->nh_gw)
2569 		flags |= RTF_GATEWAY;
2570 	if (mask == htonl(0xFFFFFFFF))
2571 		flags |= RTF_HOST;
2572 	flags |= RTF_UP;
2573 	return flags;
2574 }
2575 
2576 /*
2577  *	This outputs /proc/net/route.
2578  *	The format of the file is not supposed to be changed
2579  * 	and needs to be same as fib_hash output to avoid breaking
2580  *	legacy utilities
2581  */
2582 static int fib_route_seq_show(struct seq_file *seq, void *v)
2583 {
2584 	struct leaf *l = v;
2585 	struct leaf_info *li;
2586 	struct hlist_node *node;
2587 
2588 	if (v == SEQ_START_TOKEN) {
2589 		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2590 			   "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2591 			   "\tWindow\tIRTT");
2592 		return 0;
2593 	}
2594 
2595 	hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
2596 		struct fib_alias *fa;
2597 		__be32 mask, prefix;
2598 
2599 		mask = inet_make_mask(li->plen);
2600 		prefix = htonl(l->key);
2601 
2602 		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2603 			const struct fib_info *fi = fa->fa_info;
2604 			unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
2605 			char bf[128];
2606 
2607 			if (fa->fa_type == RTN_BROADCAST
2608 			    || fa->fa_type == RTN_MULTICAST)
2609 				continue;
2610 
2611 			if (fi)
2612 				snprintf(bf, sizeof(bf),
2613 					 "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
2614 					 fi->fib_dev ? fi->fib_dev->name : "*",
2615 					 prefix,
2616 					 fi->fib_nh->nh_gw, flags, 0, 0,
2617 					 fi->fib_priority,
2618 					 mask,
2619 					 (fi->fib_advmss ?
2620 					  fi->fib_advmss + 40 : 0),
2621 					 fi->fib_window,
2622 					 fi->fib_rtt >> 3);
2623 			else
2624 				snprintf(bf, sizeof(bf),
2625 					 "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
2626 					 prefix, 0, flags, 0, 0, 0,
2627 					 mask, 0, 0, 0);
2628 
2629 			seq_printf(seq, "%-127s\n", bf);
2630 		}
2631 	}
2632 
2633 	return 0;
2634 }
2635 
2636 static const struct seq_operations fib_route_seq_ops = {
2637 	.start  = fib_route_seq_start,
2638 	.next   = fib_route_seq_next,
2639 	.stop   = fib_route_seq_stop,
2640 	.show   = fib_route_seq_show,
2641 };
2642 
2643 static int fib_route_seq_open(struct inode *inode, struct file *file)
2644 {
2645 	return seq_open_net(inode, file, &fib_route_seq_ops,
2646 			    sizeof(struct fib_route_iter));
2647 }
2648 
2649 static const struct file_operations fib_route_fops = {
2650 	.owner  = THIS_MODULE,
2651 	.open   = fib_route_seq_open,
2652 	.read   = seq_read,
2653 	.llseek = seq_lseek,
2654 	.release = seq_release_net,
2655 };
2656 
2657 int __net_init fib_proc_init(struct net *net)
2658 {
2659 	if (!proc_net_fops_create(net, "fib_trie", S_IRUGO, &fib_trie_fops))
2660 		goto out1;
2661 
2662 	if (!proc_net_fops_create(net, "fib_triestat", S_IRUGO,
2663 				  &fib_triestat_fops))
2664 		goto out2;
2665 
2666 	if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_route_fops))
2667 		goto out3;
2668 
2669 	return 0;
2670 
2671 out3:
2672 	proc_net_remove(net, "fib_triestat");
2673 out2:
2674 	proc_net_remove(net, "fib_trie");
2675 out1:
2676 	return -ENOMEM;
2677 }
2678 
2679 void __net_exit fib_proc_exit(struct net *net)
2680 {
2681 	proc_net_remove(net, "fib_trie");
2682 	proc_net_remove(net, "fib_triestat");
2683 	proc_net_remove(net, "route");
2684 }
2685 
2686 #endif /* CONFIG_PROC_FS */
2687