xref: /openbmc/linux/net/ipv4/fib_trie.c (revision 78c99ba1)
1 /*
2  *   This program is free software; you can redistribute it and/or
3  *   modify it under the terms of the GNU General Public License
4  *   as published by the Free Software Foundation; either version
5  *   2 of the License, or (at your option) any later version.
6  *
7  *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8  *     & Swedish University of Agricultural Sciences.
9  *
10  *   Jens Laas <jens.laas@data.slu.se> Swedish University of
11  *     Agricultural Sciences.
12  *
13  *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
14  *
15  * This work is based on the LPC-trie which is originally descibed in:
16  *
17  * An experimental study of compression methods for dynamic tries
18  * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19  * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
20  *
21  *
22  * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23  * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24  *
25  *
26  * Code from fib_hash has been reused which includes the following header:
27  *
28  *
29  * INET		An implementation of the TCP/IP protocol suite for the LINUX
30  *		operating system.  INET is implemented using the  BSD Socket
31  *		interface as the means of communication with the user level.
32  *
33  *		IPv4 FIB: lookup engine and maintenance routines.
34  *
35  *
36  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
37  *
38  *		This program is free software; you can redistribute it and/or
39  *		modify it under the terms of the GNU General Public License
40  *		as published by the Free Software Foundation; either version
41  *		2 of the License, or (at your option) any later version.
42  *
43  * Substantial contributions to this work comes from:
44  *
45  *		David S. Miller, <davem@davemloft.net>
46  *		Stephen Hemminger <shemminger@osdl.org>
47  *		Paul E. McKenney <paulmck@us.ibm.com>
48  *		Patrick McHardy <kaber@trash.net>
49  */
50 
51 #define VERSION "0.408"
52 
53 #include <asm/uaccess.h>
54 #include <asm/system.h>
55 #include <linux/bitops.h>
56 #include <linux/types.h>
57 #include <linux/kernel.h>
58 #include <linux/mm.h>
59 #include <linux/string.h>
60 #include <linux/socket.h>
61 #include <linux/sockios.h>
62 #include <linux/errno.h>
63 #include <linux/in.h>
64 #include <linux/inet.h>
65 #include <linux/inetdevice.h>
66 #include <linux/netdevice.h>
67 #include <linux/if_arp.h>
68 #include <linux/proc_fs.h>
69 #include <linux/rcupdate.h>
70 #include <linux/skbuff.h>
71 #include <linux/netlink.h>
72 #include <linux/init.h>
73 #include <linux/list.h>
74 #include <net/net_namespace.h>
75 #include <net/ip.h>
76 #include <net/protocol.h>
77 #include <net/route.h>
78 #include <net/tcp.h>
79 #include <net/sock.h>
80 #include <net/ip_fib.h>
81 #include "fib_lookup.h"
82 
83 #define MAX_STAT_DEPTH 32
84 
85 #define KEYLENGTH (8*sizeof(t_key))
86 
87 typedef unsigned int t_key;
88 
89 #define T_TNODE 0
90 #define T_LEAF  1
91 #define NODE_TYPE_MASK	0x1UL
92 #define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
93 
94 #define IS_TNODE(n) (!(n->parent & T_LEAF))
95 #define IS_LEAF(n) (n->parent & T_LEAF)
96 
97 struct node {
98 	unsigned long parent;
99 	t_key key;
100 };
101 
102 struct leaf {
103 	unsigned long parent;
104 	t_key key;
105 	struct hlist_head list;
106 	struct rcu_head rcu;
107 };
108 
109 struct leaf_info {
110 	struct hlist_node hlist;
111 	struct rcu_head rcu;
112 	int plen;
113 	struct list_head falh;
114 };
115 
116 struct tnode {
117 	unsigned long parent;
118 	t_key key;
119 	unsigned char pos;		/* 2log(KEYLENGTH) bits needed */
120 	unsigned char bits;		/* 2log(KEYLENGTH) bits needed */
121 	unsigned int full_children;	/* KEYLENGTH bits needed */
122 	unsigned int empty_children;	/* KEYLENGTH bits needed */
123 	union {
124 		struct rcu_head rcu;
125 		struct work_struct work;
126 	};
127 	struct node *child[0];
128 };
129 
130 #ifdef CONFIG_IP_FIB_TRIE_STATS
131 struct trie_use_stats {
132 	unsigned int gets;
133 	unsigned int backtrack;
134 	unsigned int semantic_match_passed;
135 	unsigned int semantic_match_miss;
136 	unsigned int null_node_hit;
137 	unsigned int resize_node_skipped;
138 };
139 #endif
140 
141 struct trie_stat {
142 	unsigned int totdepth;
143 	unsigned int maxdepth;
144 	unsigned int tnodes;
145 	unsigned int leaves;
146 	unsigned int nullpointers;
147 	unsigned int prefixes;
148 	unsigned int nodesizes[MAX_STAT_DEPTH];
149 };
150 
151 struct trie {
152 	struct node *trie;
153 #ifdef CONFIG_IP_FIB_TRIE_STATS
154 	struct trie_use_stats stats;
155 #endif
156 };
157 
158 static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
159 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
160 				  int wasfull);
161 static struct node *resize(struct trie *t, struct tnode *tn);
162 static struct tnode *inflate(struct trie *t, struct tnode *tn);
163 static struct tnode *halve(struct trie *t, struct tnode *tn);
164 
165 static struct kmem_cache *fn_alias_kmem __read_mostly;
166 static struct kmem_cache *trie_leaf_kmem __read_mostly;
167 
168 static inline struct tnode *node_parent(struct node *node)
169 {
170 	return (struct tnode *)(node->parent & ~NODE_TYPE_MASK);
171 }
172 
173 static inline struct tnode *node_parent_rcu(struct node *node)
174 {
175 	struct tnode *ret = node_parent(node);
176 
177 	return rcu_dereference(ret);
178 }
179 
180 /* Same as rcu_assign_pointer
181  * but that macro() assumes that value is a pointer.
182  */
183 static inline void node_set_parent(struct node *node, struct tnode *ptr)
184 {
185 	smp_wmb();
186 	node->parent = (unsigned long)ptr | NODE_TYPE(node);
187 }
188 
189 static inline struct node *tnode_get_child(struct tnode *tn, unsigned int i)
190 {
191 	BUG_ON(i >= 1U << tn->bits);
192 
193 	return tn->child[i];
194 }
195 
196 static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i)
197 {
198 	struct node *ret = tnode_get_child(tn, i);
199 
200 	return rcu_dereference(ret);
201 }
202 
203 static inline int tnode_child_length(const struct tnode *tn)
204 {
205 	return 1 << tn->bits;
206 }
207 
208 static inline t_key mask_pfx(t_key k, unsigned short l)
209 {
210 	return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
211 }
212 
213 static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
214 {
215 	if (offset < KEYLENGTH)
216 		return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
217 	else
218 		return 0;
219 }
220 
221 static inline int tkey_equals(t_key a, t_key b)
222 {
223 	return a == b;
224 }
225 
226 static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
227 {
228 	if (bits == 0 || offset >= KEYLENGTH)
229 		return 1;
230 	bits = bits > KEYLENGTH ? KEYLENGTH : bits;
231 	return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
232 }
233 
234 static inline int tkey_mismatch(t_key a, int offset, t_key b)
235 {
236 	t_key diff = a ^ b;
237 	int i = offset;
238 
239 	if (!diff)
240 		return 0;
241 	while ((diff << i) >> (KEYLENGTH-1) == 0)
242 		i++;
243 	return i;
244 }
245 
246 /*
247   To understand this stuff, an understanding of keys and all their bits is
248   necessary. Every node in the trie has a key associated with it, but not
249   all of the bits in that key are significant.
250 
251   Consider a node 'n' and its parent 'tp'.
252 
253   If n is a leaf, every bit in its key is significant. Its presence is
254   necessitated by path compression, since during a tree traversal (when
255   searching for a leaf - unless we are doing an insertion) we will completely
256   ignore all skipped bits we encounter. Thus we need to verify, at the end of
257   a potentially successful search, that we have indeed been walking the
258   correct key path.
259 
260   Note that we can never "miss" the correct key in the tree if present by
261   following the wrong path. Path compression ensures that segments of the key
262   that are the same for all keys with a given prefix are skipped, but the
263   skipped part *is* identical for each node in the subtrie below the skipped
264   bit! trie_insert() in this implementation takes care of that - note the
265   call to tkey_sub_equals() in trie_insert().
266 
267   if n is an internal node - a 'tnode' here, the various parts of its key
268   have many different meanings.
269 
270   Example:
271   _________________________________________________________________
272   | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
273   -----------------------------------------------------------------
274     0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
275 
276   _________________________________________________________________
277   | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
278   -----------------------------------------------------------------
279    16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
280 
281   tp->pos = 7
282   tp->bits = 3
283   n->pos = 15
284   n->bits = 4
285 
286   First, let's just ignore the bits that come before the parent tp, that is
287   the bits from 0 to (tp->pos-1). They are *known* but at this point we do
288   not use them for anything.
289 
290   The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
291   index into the parent's child array. That is, they will be used to find
292   'n' among tp's children.
293 
294   The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
295   for the node n.
296 
297   All the bits we have seen so far are significant to the node n. The rest
298   of the bits are really not needed or indeed known in n->key.
299 
300   The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
301   n's child array, and will of course be different for each child.
302 
303 
304   The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
305   at this point.
306 
307 */
308 
309 static inline void check_tnode(const struct tnode *tn)
310 {
311 	WARN_ON(tn && tn->pos+tn->bits > 32);
312 }
313 
314 static const int halve_threshold = 25;
315 static const int inflate_threshold = 50;
316 static const int halve_threshold_root = 8;
317 static const int inflate_threshold_root = 15;
318 
319 
320 static void __alias_free_mem(struct rcu_head *head)
321 {
322 	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
323 	kmem_cache_free(fn_alias_kmem, fa);
324 }
325 
326 static inline void alias_free_mem_rcu(struct fib_alias *fa)
327 {
328 	call_rcu(&fa->rcu, __alias_free_mem);
329 }
330 
331 static void __leaf_free_rcu(struct rcu_head *head)
332 {
333 	struct leaf *l = container_of(head, struct leaf, rcu);
334 	kmem_cache_free(trie_leaf_kmem, l);
335 }
336 
337 static inline void free_leaf(struct leaf *l)
338 {
339 	call_rcu_bh(&l->rcu, __leaf_free_rcu);
340 }
341 
342 static void __leaf_info_free_rcu(struct rcu_head *head)
343 {
344 	kfree(container_of(head, struct leaf_info, rcu));
345 }
346 
347 static inline void free_leaf_info(struct leaf_info *leaf)
348 {
349 	call_rcu(&leaf->rcu, __leaf_info_free_rcu);
350 }
351 
352 static struct tnode *tnode_alloc(size_t size)
353 {
354 	if (size <= PAGE_SIZE)
355 		return kzalloc(size, GFP_KERNEL);
356 	else
357 		return __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL);
358 }
359 
360 static void __tnode_vfree(struct work_struct *arg)
361 {
362 	struct tnode *tn = container_of(arg, struct tnode, work);
363 	vfree(tn);
364 }
365 
366 static void __tnode_free_rcu(struct rcu_head *head)
367 {
368 	struct tnode *tn = container_of(head, struct tnode, rcu);
369 	size_t size = sizeof(struct tnode) +
370 		      (sizeof(struct node *) << tn->bits);
371 
372 	if (size <= PAGE_SIZE)
373 		kfree(tn);
374 	else {
375 		INIT_WORK(&tn->work, __tnode_vfree);
376 		schedule_work(&tn->work);
377 	}
378 }
379 
380 static inline void tnode_free(struct tnode *tn)
381 {
382 	if (IS_LEAF(tn))
383 		free_leaf((struct leaf *) tn);
384 	else
385 		call_rcu(&tn->rcu, __tnode_free_rcu);
386 }
387 
388 static struct leaf *leaf_new(void)
389 {
390 	struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
391 	if (l) {
392 		l->parent = T_LEAF;
393 		INIT_HLIST_HEAD(&l->list);
394 	}
395 	return l;
396 }
397 
398 static struct leaf_info *leaf_info_new(int plen)
399 {
400 	struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
401 	if (li) {
402 		li->plen = plen;
403 		INIT_LIST_HEAD(&li->falh);
404 	}
405 	return li;
406 }
407 
408 static struct tnode *tnode_new(t_key key, int pos, int bits)
409 {
410 	size_t sz = sizeof(struct tnode) + (sizeof(struct node *) << bits);
411 	struct tnode *tn = tnode_alloc(sz);
412 
413 	if (tn) {
414 		tn->parent = T_TNODE;
415 		tn->pos = pos;
416 		tn->bits = bits;
417 		tn->key = key;
418 		tn->full_children = 0;
419 		tn->empty_children = 1<<bits;
420 	}
421 
422 	pr_debug("AT %p s=%u %lu\n", tn, (unsigned int) sizeof(struct tnode),
423 		 (unsigned long) (sizeof(struct node) << bits));
424 	return tn;
425 }
426 
427 /*
428  * Check whether a tnode 'n' is "full", i.e. it is an internal node
429  * and no bits are skipped. See discussion in dyntree paper p. 6
430  */
431 
432 static inline int tnode_full(const struct tnode *tn, const struct node *n)
433 {
434 	if (n == NULL || IS_LEAF(n))
435 		return 0;
436 
437 	return ((struct tnode *) n)->pos == tn->pos + tn->bits;
438 }
439 
440 static inline void put_child(struct trie *t, struct tnode *tn, int i,
441 			     struct node *n)
442 {
443 	tnode_put_child_reorg(tn, i, n, -1);
444 }
445 
446  /*
447   * Add a child at position i overwriting the old value.
448   * Update the value of full_children and empty_children.
449   */
450 
451 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
452 				  int wasfull)
453 {
454 	struct node *chi = tn->child[i];
455 	int isfull;
456 
457 	BUG_ON(i >= 1<<tn->bits);
458 
459 	/* update emptyChildren */
460 	if (n == NULL && chi != NULL)
461 		tn->empty_children++;
462 	else if (n != NULL && chi == NULL)
463 		tn->empty_children--;
464 
465 	/* update fullChildren */
466 	if (wasfull == -1)
467 		wasfull = tnode_full(tn, chi);
468 
469 	isfull = tnode_full(tn, n);
470 	if (wasfull && !isfull)
471 		tn->full_children--;
472 	else if (!wasfull && isfull)
473 		tn->full_children++;
474 
475 	if (n)
476 		node_set_parent(n, tn);
477 
478 	rcu_assign_pointer(tn->child[i], n);
479 }
480 
481 static struct node *resize(struct trie *t, struct tnode *tn)
482 {
483 	int i;
484 	int err = 0;
485 	struct tnode *old_tn;
486 	int inflate_threshold_use;
487 	int halve_threshold_use;
488 	int max_resize;
489 
490 	if (!tn)
491 		return NULL;
492 
493 	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
494 		 tn, inflate_threshold, halve_threshold);
495 
496 	/* No children */
497 	if (tn->empty_children == tnode_child_length(tn)) {
498 		tnode_free(tn);
499 		return NULL;
500 	}
501 	/* One child */
502 	if (tn->empty_children == tnode_child_length(tn) - 1)
503 		for (i = 0; i < tnode_child_length(tn); i++) {
504 			struct node *n;
505 
506 			n = tn->child[i];
507 			if (!n)
508 				continue;
509 
510 			/* compress one level */
511 			node_set_parent(n, NULL);
512 			tnode_free(tn);
513 			return n;
514 		}
515 	/*
516 	 * Double as long as the resulting node has a number of
517 	 * nonempty nodes that are above the threshold.
518 	 */
519 
520 	/*
521 	 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
522 	 * the Helsinki University of Technology and Matti Tikkanen of Nokia
523 	 * Telecommunications, page 6:
524 	 * "A node is doubled if the ratio of non-empty children to all
525 	 * children in the *doubled* node is at least 'high'."
526 	 *
527 	 * 'high' in this instance is the variable 'inflate_threshold'. It
528 	 * is expressed as a percentage, so we multiply it with
529 	 * tnode_child_length() and instead of multiplying by 2 (since the
530 	 * child array will be doubled by inflate()) and multiplying
531 	 * the left-hand side by 100 (to handle the percentage thing) we
532 	 * multiply the left-hand side by 50.
533 	 *
534 	 * The left-hand side may look a bit weird: tnode_child_length(tn)
535 	 * - tn->empty_children is of course the number of non-null children
536 	 * in the current node. tn->full_children is the number of "full"
537 	 * children, that is non-null tnodes with a skip value of 0.
538 	 * All of those will be doubled in the resulting inflated tnode, so
539 	 * we just count them one extra time here.
540 	 *
541 	 * A clearer way to write this would be:
542 	 *
543 	 * to_be_doubled = tn->full_children;
544 	 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
545 	 *     tn->full_children;
546 	 *
547 	 * new_child_length = tnode_child_length(tn) * 2;
548 	 *
549 	 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
550 	 *      new_child_length;
551 	 * if (new_fill_factor >= inflate_threshold)
552 	 *
553 	 * ...and so on, tho it would mess up the while () loop.
554 	 *
555 	 * anyway,
556 	 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
557 	 *      inflate_threshold
558 	 *
559 	 * avoid a division:
560 	 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
561 	 *      inflate_threshold * new_child_length
562 	 *
563 	 * expand not_to_be_doubled and to_be_doubled, and shorten:
564 	 * 100 * (tnode_child_length(tn) - tn->empty_children +
565 	 *    tn->full_children) >= inflate_threshold * new_child_length
566 	 *
567 	 * expand new_child_length:
568 	 * 100 * (tnode_child_length(tn) - tn->empty_children +
569 	 *    tn->full_children) >=
570 	 *      inflate_threshold * tnode_child_length(tn) * 2
571 	 *
572 	 * shorten again:
573 	 * 50 * (tn->full_children + tnode_child_length(tn) -
574 	 *    tn->empty_children) >= inflate_threshold *
575 	 *    tnode_child_length(tn)
576 	 *
577 	 */
578 
579 	check_tnode(tn);
580 
581 	/* Keep root node larger  */
582 
583 	if (!tn->parent)
584 		inflate_threshold_use = inflate_threshold_root;
585 	else
586 		inflate_threshold_use = inflate_threshold;
587 
588 	err = 0;
589 	max_resize = 10;
590 	while ((tn->full_children > 0 &&  max_resize-- &&
591 		50 * (tn->full_children + tnode_child_length(tn)
592 		      - tn->empty_children)
593 		>= inflate_threshold_use * tnode_child_length(tn))) {
594 
595 		old_tn = tn;
596 		tn = inflate(t, tn);
597 
598 		if (IS_ERR(tn)) {
599 			tn = old_tn;
600 #ifdef CONFIG_IP_FIB_TRIE_STATS
601 			t->stats.resize_node_skipped++;
602 #endif
603 			break;
604 		}
605 	}
606 
607 	if (max_resize < 0) {
608 		if (!tn->parent)
609 			pr_warning("Fix inflate_threshold_root."
610 				   " Now=%d size=%d bits\n",
611 				   inflate_threshold_root, tn->bits);
612 		else
613 			pr_warning("Fix inflate_threshold."
614 				   " Now=%d size=%d bits\n",
615 				   inflate_threshold, tn->bits);
616 	}
617 
618 	check_tnode(tn);
619 
620 	/*
621 	 * Halve as long as the number of empty children in this
622 	 * node is above threshold.
623 	 */
624 
625 
626 	/* Keep root node larger  */
627 
628 	if (!tn->parent)
629 		halve_threshold_use = halve_threshold_root;
630 	else
631 		halve_threshold_use = halve_threshold;
632 
633 	err = 0;
634 	max_resize = 10;
635 	while (tn->bits > 1 &&  max_resize-- &&
636 	       100 * (tnode_child_length(tn) - tn->empty_children) <
637 	       halve_threshold_use * tnode_child_length(tn)) {
638 
639 		old_tn = tn;
640 		tn = halve(t, tn);
641 		if (IS_ERR(tn)) {
642 			tn = old_tn;
643 #ifdef CONFIG_IP_FIB_TRIE_STATS
644 			t->stats.resize_node_skipped++;
645 #endif
646 			break;
647 		}
648 	}
649 
650 	if (max_resize < 0) {
651 		if (!tn->parent)
652 			pr_warning("Fix halve_threshold_root."
653 				   " Now=%d size=%d bits\n",
654 				   halve_threshold_root, tn->bits);
655 		else
656 			pr_warning("Fix halve_threshold."
657 				   " Now=%d size=%d bits\n",
658 				   halve_threshold, tn->bits);
659 	}
660 
661 	/* Only one child remains */
662 	if (tn->empty_children == tnode_child_length(tn) - 1)
663 		for (i = 0; i < tnode_child_length(tn); i++) {
664 			struct node *n;
665 
666 			n = tn->child[i];
667 			if (!n)
668 				continue;
669 
670 			/* compress one level */
671 
672 			node_set_parent(n, NULL);
673 			tnode_free(tn);
674 			return n;
675 		}
676 
677 	return (struct node *) tn;
678 }
679 
680 static struct tnode *inflate(struct trie *t, struct tnode *tn)
681 {
682 	struct tnode *oldtnode = tn;
683 	int olen = tnode_child_length(tn);
684 	int i;
685 
686 	pr_debug("In inflate\n");
687 
688 	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
689 
690 	if (!tn)
691 		return ERR_PTR(-ENOMEM);
692 
693 	/*
694 	 * Preallocate and store tnodes before the actual work so we
695 	 * don't get into an inconsistent state if memory allocation
696 	 * fails. In case of failure we return the oldnode and  inflate
697 	 * of tnode is ignored.
698 	 */
699 
700 	for (i = 0; i < olen; i++) {
701 		struct tnode *inode;
702 
703 		inode = (struct tnode *) tnode_get_child(oldtnode, i);
704 		if (inode &&
705 		    IS_TNODE(inode) &&
706 		    inode->pos == oldtnode->pos + oldtnode->bits &&
707 		    inode->bits > 1) {
708 			struct tnode *left, *right;
709 			t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
710 
711 			left = tnode_new(inode->key&(~m), inode->pos + 1,
712 					 inode->bits - 1);
713 			if (!left)
714 				goto nomem;
715 
716 			right = tnode_new(inode->key|m, inode->pos + 1,
717 					  inode->bits - 1);
718 
719 			if (!right) {
720 				tnode_free(left);
721 				goto nomem;
722 			}
723 
724 			put_child(t, tn, 2*i, (struct node *) left);
725 			put_child(t, tn, 2*i+1, (struct node *) right);
726 		}
727 	}
728 
729 	for (i = 0; i < olen; i++) {
730 		struct tnode *inode;
731 		struct node *node = tnode_get_child(oldtnode, i);
732 		struct tnode *left, *right;
733 		int size, j;
734 
735 		/* An empty child */
736 		if (node == NULL)
737 			continue;
738 
739 		/* A leaf or an internal node with skipped bits */
740 
741 		if (IS_LEAF(node) || ((struct tnode *) node)->pos >
742 		   tn->pos + tn->bits - 1) {
743 			if (tkey_extract_bits(node->key,
744 					      oldtnode->pos + oldtnode->bits,
745 					      1) == 0)
746 				put_child(t, tn, 2*i, node);
747 			else
748 				put_child(t, tn, 2*i+1, node);
749 			continue;
750 		}
751 
752 		/* An internal node with two children */
753 		inode = (struct tnode *) node;
754 
755 		if (inode->bits == 1) {
756 			put_child(t, tn, 2*i, inode->child[0]);
757 			put_child(t, tn, 2*i+1, inode->child[1]);
758 
759 			tnode_free(inode);
760 			continue;
761 		}
762 
763 		/* An internal node with more than two children */
764 
765 		/* We will replace this node 'inode' with two new
766 		 * ones, 'left' and 'right', each with half of the
767 		 * original children. The two new nodes will have
768 		 * a position one bit further down the key and this
769 		 * means that the "significant" part of their keys
770 		 * (see the discussion near the top of this file)
771 		 * will differ by one bit, which will be "0" in
772 		 * left's key and "1" in right's key. Since we are
773 		 * moving the key position by one step, the bit that
774 		 * we are moving away from - the bit at position
775 		 * (inode->pos) - is the one that will differ between
776 		 * left and right. So... we synthesize that bit in the
777 		 * two  new keys.
778 		 * The mask 'm' below will be a single "one" bit at
779 		 * the position (inode->pos)
780 		 */
781 
782 		/* Use the old key, but set the new significant
783 		 *   bit to zero.
784 		 */
785 
786 		left = (struct tnode *) tnode_get_child(tn, 2*i);
787 		put_child(t, tn, 2*i, NULL);
788 
789 		BUG_ON(!left);
790 
791 		right = (struct tnode *) tnode_get_child(tn, 2*i+1);
792 		put_child(t, tn, 2*i+1, NULL);
793 
794 		BUG_ON(!right);
795 
796 		size = tnode_child_length(left);
797 		for (j = 0; j < size; j++) {
798 			put_child(t, left, j, inode->child[j]);
799 			put_child(t, right, j, inode->child[j + size]);
800 		}
801 		put_child(t, tn, 2*i, resize(t, left));
802 		put_child(t, tn, 2*i+1, resize(t, right));
803 
804 		tnode_free(inode);
805 	}
806 	tnode_free(oldtnode);
807 	return tn;
808 nomem:
809 	{
810 		int size = tnode_child_length(tn);
811 		int j;
812 
813 		for (j = 0; j < size; j++)
814 			if (tn->child[j])
815 				tnode_free((struct tnode *)tn->child[j]);
816 
817 		tnode_free(tn);
818 
819 		return ERR_PTR(-ENOMEM);
820 	}
821 }
822 
823 static struct tnode *halve(struct trie *t, struct tnode *tn)
824 {
825 	struct tnode *oldtnode = tn;
826 	struct node *left, *right;
827 	int i;
828 	int olen = tnode_child_length(tn);
829 
830 	pr_debug("In halve\n");
831 
832 	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
833 
834 	if (!tn)
835 		return ERR_PTR(-ENOMEM);
836 
837 	/*
838 	 * Preallocate and store tnodes before the actual work so we
839 	 * don't get into an inconsistent state if memory allocation
840 	 * fails. In case of failure we return the oldnode and halve
841 	 * of tnode is ignored.
842 	 */
843 
844 	for (i = 0; i < olen; i += 2) {
845 		left = tnode_get_child(oldtnode, i);
846 		right = tnode_get_child(oldtnode, i+1);
847 
848 		/* Two nonempty children */
849 		if (left && right) {
850 			struct tnode *newn;
851 
852 			newn = tnode_new(left->key, tn->pos + tn->bits, 1);
853 
854 			if (!newn)
855 				goto nomem;
856 
857 			put_child(t, tn, i/2, (struct node *)newn);
858 		}
859 
860 	}
861 
862 	for (i = 0; i < olen; i += 2) {
863 		struct tnode *newBinNode;
864 
865 		left = tnode_get_child(oldtnode, i);
866 		right = tnode_get_child(oldtnode, i+1);
867 
868 		/* At least one of the children is empty */
869 		if (left == NULL) {
870 			if (right == NULL)    /* Both are empty */
871 				continue;
872 			put_child(t, tn, i/2, right);
873 			continue;
874 		}
875 
876 		if (right == NULL) {
877 			put_child(t, tn, i/2, left);
878 			continue;
879 		}
880 
881 		/* Two nonempty children */
882 		newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
883 		put_child(t, tn, i/2, NULL);
884 		put_child(t, newBinNode, 0, left);
885 		put_child(t, newBinNode, 1, right);
886 		put_child(t, tn, i/2, resize(t, newBinNode));
887 	}
888 	tnode_free(oldtnode);
889 	return tn;
890 nomem:
891 	{
892 		int size = tnode_child_length(tn);
893 		int j;
894 
895 		for (j = 0; j < size; j++)
896 			if (tn->child[j])
897 				tnode_free((struct tnode *)tn->child[j]);
898 
899 		tnode_free(tn);
900 
901 		return ERR_PTR(-ENOMEM);
902 	}
903 }
904 
905 /* readside must use rcu_read_lock currently dump routines
906  via get_fa_head and dump */
907 
908 static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
909 {
910 	struct hlist_head *head = &l->list;
911 	struct hlist_node *node;
912 	struct leaf_info *li;
913 
914 	hlist_for_each_entry_rcu(li, node, head, hlist)
915 		if (li->plen == plen)
916 			return li;
917 
918 	return NULL;
919 }
920 
921 static inline struct list_head *get_fa_head(struct leaf *l, int plen)
922 {
923 	struct leaf_info *li = find_leaf_info(l, plen);
924 
925 	if (!li)
926 		return NULL;
927 
928 	return &li->falh;
929 }
930 
931 static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
932 {
933 	struct leaf_info *li = NULL, *last = NULL;
934 	struct hlist_node *node;
935 
936 	if (hlist_empty(head)) {
937 		hlist_add_head_rcu(&new->hlist, head);
938 	} else {
939 		hlist_for_each_entry(li, node, head, hlist) {
940 			if (new->plen > li->plen)
941 				break;
942 
943 			last = li;
944 		}
945 		if (last)
946 			hlist_add_after_rcu(&last->hlist, &new->hlist);
947 		else
948 			hlist_add_before_rcu(&new->hlist, &li->hlist);
949 	}
950 }
951 
952 /* rcu_read_lock needs to be hold by caller from readside */
953 
954 static struct leaf *
955 fib_find_node(struct trie *t, u32 key)
956 {
957 	int pos;
958 	struct tnode *tn;
959 	struct node *n;
960 
961 	pos = 0;
962 	n = rcu_dereference(t->trie);
963 
964 	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
965 		tn = (struct tnode *) n;
966 
967 		check_tnode(tn);
968 
969 		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
970 			pos = tn->pos + tn->bits;
971 			n = tnode_get_child_rcu(tn,
972 						tkey_extract_bits(key,
973 								  tn->pos,
974 								  tn->bits));
975 		} else
976 			break;
977 	}
978 	/* Case we have found a leaf. Compare prefixes */
979 
980 	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
981 		return (struct leaf *)n;
982 
983 	return NULL;
984 }
985 
986 static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
987 {
988 	int wasfull;
989 	t_key cindex, key;
990 	struct tnode *tp;
991 
992 	preempt_disable();
993 	key = tn->key;
994 
995 	while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) {
996 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
997 		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
998 		tn = (struct tnode *) resize(t, (struct tnode *)tn);
999 
1000 		tnode_put_child_reorg((struct tnode *)tp, cindex,
1001 				      (struct node *)tn, wasfull);
1002 
1003 		tp = node_parent((struct node *) tn);
1004 		if (!tp)
1005 			break;
1006 		tn = tp;
1007 	}
1008 
1009 	/* Handle last (top) tnode */
1010 	if (IS_TNODE(tn))
1011 		tn = (struct tnode *)resize(t, (struct tnode *)tn);
1012 
1013 	preempt_enable();
1014 	return (struct node *)tn;
1015 }
1016 
1017 /* only used from updater-side */
1018 
1019 static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1020 {
1021 	int pos, newpos;
1022 	struct tnode *tp = NULL, *tn = NULL;
1023 	struct node *n;
1024 	struct leaf *l;
1025 	int missbit;
1026 	struct list_head *fa_head = NULL;
1027 	struct leaf_info *li;
1028 	t_key cindex;
1029 
1030 	pos = 0;
1031 	n = t->trie;
1032 
1033 	/* If we point to NULL, stop. Either the tree is empty and we should
1034 	 * just put a new leaf in if, or we have reached an empty child slot,
1035 	 * and we should just put our new leaf in that.
1036 	 * If we point to a T_TNODE, check if it matches our key. Note that
1037 	 * a T_TNODE might be skipping any number of bits - its 'pos' need
1038 	 * not be the parent's 'pos'+'bits'!
1039 	 *
1040 	 * If it does match the current key, get pos/bits from it, extract
1041 	 * the index from our key, push the T_TNODE and walk the tree.
1042 	 *
1043 	 * If it doesn't, we have to replace it with a new T_TNODE.
1044 	 *
1045 	 * If we point to a T_LEAF, it might or might not have the same key
1046 	 * as we do. If it does, just change the value, update the T_LEAF's
1047 	 * value, and return it.
1048 	 * If it doesn't, we need to replace it with a T_TNODE.
1049 	 */
1050 
1051 	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
1052 		tn = (struct tnode *) n;
1053 
1054 		check_tnode(tn);
1055 
1056 		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
1057 			tp = tn;
1058 			pos = tn->pos + tn->bits;
1059 			n = tnode_get_child(tn,
1060 					    tkey_extract_bits(key,
1061 							      tn->pos,
1062 							      tn->bits));
1063 
1064 			BUG_ON(n && node_parent(n) != tn);
1065 		} else
1066 			break;
1067 	}
1068 
1069 	/*
1070 	 * n  ----> NULL, LEAF or TNODE
1071 	 *
1072 	 * tp is n's (parent) ----> NULL or TNODE
1073 	 */
1074 
1075 	BUG_ON(tp && IS_LEAF(tp));
1076 
1077 	/* Case 1: n is a leaf. Compare prefixes */
1078 
1079 	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1080 		l = (struct leaf *) n;
1081 		li = leaf_info_new(plen);
1082 
1083 		if (!li)
1084 			return NULL;
1085 
1086 		fa_head = &li->falh;
1087 		insert_leaf_info(&l->list, li);
1088 		goto done;
1089 	}
1090 	l = leaf_new();
1091 
1092 	if (!l)
1093 		return NULL;
1094 
1095 	l->key = key;
1096 	li = leaf_info_new(plen);
1097 
1098 	if (!li) {
1099 		free_leaf(l);
1100 		return NULL;
1101 	}
1102 
1103 	fa_head = &li->falh;
1104 	insert_leaf_info(&l->list, li);
1105 
1106 	if (t->trie && n == NULL) {
1107 		/* Case 2: n is NULL, and will just insert a new leaf */
1108 
1109 		node_set_parent((struct node *)l, tp);
1110 
1111 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1112 		put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
1113 	} else {
1114 		/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1115 		/*
1116 		 *  Add a new tnode here
1117 		 *  first tnode need some special handling
1118 		 */
1119 
1120 		if (tp)
1121 			pos = tp->pos+tp->bits;
1122 		else
1123 			pos = 0;
1124 
1125 		if (n) {
1126 			newpos = tkey_mismatch(key, pos, n->key);
1127 			tn = tnode_new(n->key, newpos, 1);
1128 		} else {
1129 			newpos = 0;
1130 			tn = tnode_new(key, newpos, 1); /* First tnode */
1131 		}
1132 
1133 		if (!tn) {
1134 			free_leaf_info(li);
1135 			free_leaf(l);
1136 			return NULL;
1137 		}
1138 
1139 		node_set_parent((struct node *)tn, tp);
1140 
1141 		missbit = tkey_extract_bits(key, newpos, 1);
1142 		put_child(t, tn, missbit, (struct node *)l);
1143 		put_child(t, tn, 1-missbit, n);
1144 
1145 		if (tp) {
1146 			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1147 			put_child(t, (struct tnode *)tp, cindex,
1148 				  (struct node *)tn);
1149 		} else {
1150 			rcu_assign_pointer(t->trie, (struct node *)tn);
1151 			tp = tn;
1152 		}
1153 	}
1154 
1155 	if (tp && tp->pos + tp->bits > 32)
1156 		pr_warning("fib_trie"
1157 			   " tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1158 			   tp, tp->pos, tp->bits, key, plen);
1159 
1160 	/* Rebalance the trie */
1161 
1162 	rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
1163 done:
1164 	return fa_head;
1165 }
1166 
1167 /*
1168  * Caller must hold RTNL.
1169  */
1170 static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg)
1171 {
1172 	struct trie *t = (struct trie *) tb->tb_data;
1173 	struct fib_alias *fa, *new_fa;
1174 	struct list_head *fa_head = NULL;
1175 	struct fib_info *fi;
1176 	int plen = cfg->fc_dst_len;
1177 	u8 tos = cfg->fc_tos;
1178 	u32 key, mask;
1179 	int err;
1180 	struct leaf *l;
1181 
1182 	if (plen > 32)
1183 		return -EINVAL;
1184 
1185 	key = ntohl(cfg->fc_dst);
1186 
1187 	pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
1188 
1189 	mask = ntohl(inet_make_mask(plen));
1190 
1191 	if (key & ~mask)
1192 		return -EINVAL;
1193 
1194 	key = key & mask;
1195 
1196 	fi = fib_create_info(cfg);
1197 	if (IS_ERR(fi)) {
1198 		err = PTR_ERR(fi);
1199 		goto err;
1200 	}
1201 
1202 	l = fib_find_node(t, key);
1203 	fa = NULL;
1204 
1205 	if (l) {
1206 		fa_head = get_fa_head(l, plen);
1207 		fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1208 	}
1209 
1210 	/* Now fa, if non-NULL, points to the first fib alias
1211 	 * with the same keys [prefix,tos,priority], if such key already
1212 	 * exists or to the node before which we will insert new one.
1213 	 *
1214 	 * If fa is NULL, we will need to allocate a new one and
1215 	 * insert to the head of f.
1216 	 *
1217 	 * If f is NULL, no fib node matched the destination key
1218 	 * and we need to allocate a new one of those as well.
1219 	 */
1220 
1221 	if (fa && fa->fa_tos == tos &&
1222 	    fa->fa_info->fib_priority == fi->fib_priority) {
1223 		struct fib_alias *fa_first, *fa_match;
1224 
1225 		err = -EEXIST;
1226 		if (cfg->fc_nlflags & NLM_F_EXCL)
1227 			goto out;
1228 
1229 		/* We have 2 goals:
1230 		 * 1. Find exact match for type, scope, fib_info to avoid
1231 		 * duplicate routes
1232 		 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
1233 		 */
1234 		fa_match = NULL;
1235 		fa_first = fa;
1236 		fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1237 		list_for_each_entry_continue(fa, fa_head, fa_list) {
1238 			if (fa->fa_tos != tos)
1239 				break;
1240 			if (fa->fa_info->fib_priority != fi->fib_priority)
1241 				break;
1242 			if (fa->fa_type == cfg->fc_type &&
1243 			    fa->fa_scope == cfg->fc_scope &&
1244 			    fa->fa_info == fi) {
1245 				fa_match = fa;
1246 				break;
1247 			}
1248 		}
1249 
1250 		if (cfg->fc_nlflags & NLM_F_REPLACE) {
1251 			struct fib_info *fi_drop;
1252 			u8 state;
1253 
1254 			fa = fa_first;
1255 			if (fa_match) {
1256 				if (fa == fa_match)
1257 					err = 0;
1258 				goto out;
1259 			}
1260 			err = -ENOBUFS;
1261 			new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1262 			if (new_fa == NULL)
1263 				goto out;
1264 
1265 			fi_drop = fa->fa_info;
1266 			new_fa->fa_tos = fa->fa_tos;
1267 			new_fa->fa_info = fi;
1268 			new_fa->fa_type = cfg->fc_type;
1269 			new_fa->fa_scope = cfg->fc_scope;
1270 			state = fa->fa_state;
1271 			new_fa->fa_state = state & ~FA_S_ACCESSED;
1272 
1273 			list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1274 			alias_free_mem_rcu(fa);
1275 
1276 			fib_release_info(fi_drop);
1277 			if (state & FA_S_ACCESSED)
1278 				rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1279 			rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1280 				tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
1281 
1282 			goto succeeded;
1283 		}
1284 		/* Error if we find a perfect match which
1285 		 * uses the same scope, type, and nexthop
1286 		 * information.
1287 		 */
1288 		if (fa_match)
1289 			goto out;
1290 
1291 		if (!(cfg->fc_nlflags & NLM_F_APPEND))
1292 			fa = fa_first;
1293 	}
1294 	err = -ENOENT;
1295 	if (!(cfg->fc_nlflags & NLM_F_CREATE))
1296 		goto out;
1297 
1298 	err = -ENOBUFS;
1299 	new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1300 	if (new_fa == NULL)
1301 		goto out;
1302 
1303 	new_fa->fa_info = fi;
1304 	new_fa->fa_tos = tos;
1305 	new_fa->fa_type = cfg->fc_type;
1306 	new_fa->fa_scope = cfg->fc_scope;
1307 	new_fa->fa_state = 0;
1308 	/*
1309 	 * Insert new entry to the list.
1310 	 */
1311 
1312 	if (!fa_head) {
1313 		fa_head = fib_insert_node(t, key, plen);
1314 		if (unlikely(!fa_head)) {
1315 			err = -ENOMEM;
1316 			goto out_free_new_fa;
1317 		}
1318 	}
1319 
1320 	list_add_tail_rcu(&new_fa->fa_list,
1321 			  (fa ? &fa->fa_list : fa_head));
1322 
1323 	rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1324 	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
1325 		  &cfg->fc_nlinfo, 0);
1326 succeeded:
1327 	return 0;
1328 
1329 out_free_new_fa:
1330 	kmem_cache_free(fn_alias_kmem, new_fa);
1331 out:
1332 	fib_release_info(fi);
1333 err:
1334 	return err;
1335 }
1336 
1337 /* should be called with rcu_read_lock */
1338 static int check_leaf(struct trie *t, struct leaf *l,
1339 		      t_key key,  const struct flowi *flp,
1340 		      struct fib_result *res)
1341 {
1342 	struct leaf_info *li;
1343 	struct hlist_head *hhead = &l->list;
1344 	struct hlist_node *node;
1345 
1346 	hlist_for_each_entry_rcu(li, node, hhead, hlist) {
1347 		int err;
1348 		int plen = li->plen;
1349 		__be32 mask = inet_make_mask(plen);
1350 
1351 		if (l->key != (key & ntohl(mask)))
1352 			continue;
1353 
1354 		err = fib_semantic_match(&li->falh, flp, res,
1355 					 htonl(l->key), mask, plen);
1356 
1357 #ifdef CONFIG_IP_FIB_TRIE_STATS
1358 		if (err <= 0)
1359 			t->stats.semantic_match_passed++;
1360 		else
1361 			t->stats.semantic_match_miss++;
1362 #endif
1363 		if (err <= 0)
1364 			return err;
1365 	}
1366 
1367 	return 1;
1368 }
1369 
1370 static int fn_trie_lookup(struct fib_table *tb, const struct flowi *flp,
1371 			  struct fib_result *res)
1372 {
1373 	struct trie *t = (struct trie *) tb->tb_data;
1374 	int ret;
1375 	struct node *n;
1376 	struct tnode *pn;
1377 	int pos, bits;
1378 	t_key key = ntohl(flp->fl4_dst);
1379 	int chopped_off;
1380 	t_key cindex = 0;
1381 	int current_prefix_length = KEYLENGTH;
1382 	struct tnode *cn;
1383 	t_key node_prefix, key_prefix, pref_mismatch;
1384 	int mp;
1385 
1386 	rcu_read_lock();
1387 
1388 	n = rcu_dereference(t->trie);
1389 	if (!n)
1390 		goto failed;
1391 
1392 #ifdef CONFIG_IP_FIB_TRIE_STATS
1393 	t->stats.gets++;
1394 #endif
1395 
1396 	/* Just a leaf? */
1397 	if (IS_LEAF(n)) {
1398 		ret = check_leaf(t, (struct leaf *)n, key, flp, res);
1399 		goto found;
1400 	}
1401 
1402 	pn = (struct tnode *) n;
1403 	chopped_off = 0;
1404 
1405 	while (pn) {
1406 		pos = pn->pos;
1407 		bits = pn->bits;
1408 
1409 		if (!chopped_off)
1410 			cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
1411 						   pos, bits);
1412 
1413 		n = tnode_get_child(pn, cindex);
1414 
1415 		if (n == NULL) {
1416 #ifdef CONFIG_IP_FIB_TRIE_STATS
1417 			t->stats.null_node_hit++;
1418 #endif
1419 			goto backtrace;
1420 		}
1421 
1422 		if (IS_LEAF(n)) {
1423 			ret = check_leaf(t, (struct leaf *)n, key, flp, res);
1424 			if (ret > 0)
1425 				goto backtrace;
1426 			goto found;
1427 		}
1428 
1429 		cn = (struct tnode *)n;
1430 
1431 		/*
1432 		 * It's a tnode, and we can do some extra checks here if we
1433 		 * like, to avoid descending into a dead-end branch.
1434 		 * This tnode is in the parent's child array at index
1435 		 * key[p_pos..p_pos+p_bits] but potentially with some bits
1436 		 * chopped off, so in reality the index may be just a
1437 		 * subprefix, padded with zero at the end.
1438 		 * We can also take a look at any skipped bits in this
1439 		 * tnode - everything up to p_pos is supposed to be ok,
1440 		 * and the non-chopped bits of the index (se previous
1441 		 * paragraph) are also guaranteed ok, but the rest is
1442 		 * considered unknown.
1443 		 *
1444 		 * The skipped bits are key[pos+bits..cn->pos].
1445 		 */
1446 
1447 		/* If current_prefix_length < pos+bits, we are already doing
1448 		 * actual prefix  matching, which means everything from
1449 		 * pos+(bits-chopped_off) onward must be zero along some
1450 		 * branch of this subtree - otherwise there is *no* valid
1451 		 * prefix present. Here we can only check the skipped
1452 		 * bits. Remember, since we have already indexed into the
1453 		 * parent's child array, we know that the bits we chopped of
1454 		 * *are* zero.
1455 		 */
1456 
1457 		/* NOTA BENE: Checking only skipped bits
1458 		   for the new node here */
1459 
1460 		if (current_prefix_length < pos+bits) {
1461 			if (tkey_extract_bits(cn->key, current_prefix_length,
1462 						cn->pos - current_prefix_length)
1463 			    || !(cn->child[0]))
1464 				goto backtrace;
1465 		}
1466 
1467 		/*
1468 		 * If chopped_off=0, the index is fully validated and we
1469 		 * only need to look at the skipped bits for this, the new,
1470 		 * tnode. What we actually want to do is to find out if
1471 		 * these skipped bits match our key perfectly, or if we will
1472 		 * have to count on finding a matching prefix further down,
1473 		 * because if we do, we would like to have some way of
1474 		 * verifying the existence of such a prefix at this point.
1475 		 */
1476 
1477 		/* The only thing we can do at this point is to verify that
1478 		 * any such matching prefix can indeed be a prefix to our
1479 		 * key, and if the bits in the node we are inspecting that
1480 		 * do not match our key are not ZERO, this cannot be true.
1481 		 * Thus, find out where there is a mismatch (before cn->pos)
1482 		 * and verify that all the mismatching bits are zero in the
1483 		 * new tnode's key.
1484 		 */
1485 
1486 		/*
1487 		 * Note: We aren't very concerned about the piece of
1488 		 * the key that precede pn->pos+pn->bits, since these
1489 		 * have already been checked. The bits after cn->pos
1490 		 * aren't checked since these are by definition
1491 		 * "unknown" at this point. Thus, what we want to see
1492 		 * is if we are about to enter the "prefix matching"
1493 		 * state, and in that case verify that the skipped
1494 		 * bits that will prevail throughout this subtree are
1495 		 * zero, as they have to be if we are to find a
1496 		 * matching prefix.
1497 		 */
1498 
1499 		node_prefix = mask_pfx(cn->key, cn->pos);
1500 		key_prefix = mask_pfx(key, cn->pos);
1501 		pref_mismatch = key_prefix^node_prefix;
1502 		mp = 0;
1503 
1504 		/*
1505 		 * In short: If skipped bits in this node do not match
1506 		 * the search key, enter the "prefix matching"
1507 		 * state.directly.
1508 		 */
1509 		if (pref_mismatch) {
1510 			while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1511 				mp++;
1512 				pref_mismatch = pref_mismatch << 1;
1513 			}
1514 			key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1515 
1516 			if (key_prefix != 0)
1517 				goto backtrace;
1518 
1519 			if (current_prefix_length >= cn->pos)
1520 				current_prefix_length = mp;
1521 		}
1522 
1523 		pn = (struct tnode *)n; /* Descend */
1524 		chopped_off = 0;
1525 		continue;
1526 
1527 backtrace:
1528 		chopped_off++;
1529 
1530 		/* As zero don't change the child key (cindex) */
1531 		while ((chopped_off <= pn->bits)
1532 		       && !(cindex & (1<<(chopped_off-1))))
1533 			chopped_off++;
1534 
1535 		/* Decrease current_... with bits chopped off */
1536 		if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1537 			current_prefix_length = pn->pos + pn->bits
1538 				- chopped_off;
1539 
1540 		/*
1541 		 * Either we do the actual chop off according or if we have
1542 		 * chopped off all bits in this tnode walk up to our parent.
1543 		 */
1544 
1545 		if (chopped_off <= pn->bits) {
1546 			cindex &= ~(1 << (chopped_off-1));
1547 		} else {
1548 			struct tnode *parent = node_parent((struct node *) pn);
1549 			if (!parent)
1550 				goto failed;
1551 
1552 			/* Get Child's index */
1553 			cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
1554 			pn = parent;
1555 			chopped_off = 0;
1556 
1557 #ifdef CONFIG_IP_FIB_TRIE_STATS
1558 			t->stats.backtrack++;
1559 #endif
1560 			goto backtrace;
1561 		}
1562 	}
1563 failed:
1564 	ret = 1;
1565 found:
1566 	rcu_read_unlock();
1567 	return ret;
1568 }
1569 
1570 /*
1571  * Remove the leaf and return parent.
1572  */
1573 static void trie_leaf_remove(struct trie *t, struct leaf *l)
1574 {
1575 	struct tnode *tp = node_parent((struct node *) l);
1576 
1577 	pr_debug("entering trie_leaf_remove(%p)\n", l);
1578 
1579 	if (tp) {
1580 		t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
1581 		put_child(t, (struct tnode *)tp, cindex, NULL);
1582 		rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
1583 	} else
1584 		rcu_assign_pointer(t->trie, NULL);
1585 
1586 	free_leaf(l);
1587 }
1588 
1589 /*
1590  * Caller must hold RTNL.
1591  */
1592 static int fn_trie_delete(struct fib_table *tb, struct fib_config *cfg)
1593 {
1594 	struct trie *t = (struct trie *) tb->tb_data;
1595 	u32 key, mask;
1596 	int plen = cfg->fc_dst_len;
1597 	u8 tos = cfg->fc_tos;
1598 	struct fib_alias *fa, *fa_to_delete;
1599 	struct list_head *fa_head;
1600 	struct leaf *l;
1601 	struct leaf_info *li;
1602 
1603 	if (plen > 32)
1604 		return -EINVAL;
1605 
1606 	key = ntohl(cfg->fc_dst);
1607 	mask = ntohl(inet_make_mask(plen));
1608 
1609 	if (key & ~mask)
1610 		return -EINVAL;
1611 
1612 	key = key & mask;
1613 	l = fib_find_node(t, key);
1614 
1615 	if (!l)
1616 		return -ESRCH;
1617 
1618 	fa_head = get_fa_head(l, plen);
1619 	fa = fib_find_alias(fa_head, tos, 0);
1620 
1621 	if (!fa)
1622 		return -ESRCH;
1623 
1624 	pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1625 
1626 	fa_to_delete = NULL;
1627 	fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1628 	list_for_each_entry_continue(fa, fa_head, fa_list) {
1629 		struct fib_info *fi = fa->fa_info;
1630 
1631 		if (fa->fa_tos != tos)
1632 			break;
1633 
1634 		if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1635 		    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1636 		     fa->fa_scope == cfg->fc_scope) &&
1637 		    (!cfg->fc_protocol ||
1638 		     fi->fib_protocol == cfg->fc_protocol) &&
1639 		    fib_nh_match(cfg, fi) == 0) {
1640 			fa_to_delete = fa;
1641 			break;
1642 		}
1643 	}
1644 
1645 	if (!fa_to_delete)
1646 		return -ESRCH;
1647 
1648 	fa = fa_to_delete;
1649 	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
1650 		  &cfg->fc_nlinfo, 0);
1651 
1652 	l = fib_find_node(t, key);
1653 	li = find_leaf_info(l, plen);
1654 
1655 	list_del_rcu(&fa->fa_list);
1656 
1657 	if (list_empty(fa_head)) {
1658 		hlist_del_rcu(&li->hlist);
1659 		free_leaf_info(li);
1660 	}
1661 
1662 	if (hlist_empty(&l->list))
1663 		trie_leaf_remove(t, l);
1664 
1665 	if (fa->fa_state & FA_S_ACCESSED)
1666 		rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1667 
1668 	fib_release_info(fa->fa_info);
1669 	alias_free_mem_rcu(fa);
1670 	return 0;
1671 }
1672 
1673 static int trie_flush_list(struct list_head *head)
1674 {
1675 	struct fib_alias *fa, *fa_node;
1676 	int found = 0;
1677 
1678 	list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1679 		struct fib_info *fi = fa->fa_info;
1680 
1681 		if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1682 			list_del_rcu(&fa->fa_list);
1683 			fib_release_info(fa->fa_info);
1684 			alias_free_mem_rcu(fa);
1685 			found++;
1686 		}
1687 	}
1688 	return found;
1689 }
1690 
1691 static int trie_flush_leaf(struct leaf *l)
1692 {
1693 	int found = 0;
1694 	struct hlist_head *lih = &l->list;
1695 	struct hlist_node *node, *tmp;
1696 	struct leaf_info *li = NULL;
1697 
1698 	hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1699 		found += trie_flush_list(&li->falh);
1700 
1701 		if (list_empty(&li->falh)) {
1702 			hlist_del_rcu(&li->hlist);
1703 			free_leaf_info(li);
1704 		}
1705 	}
1706 	return found;
1707 }
1708 
1709 /*
1710  * Scan for the next right leaf starting at node p->child[idx]
1711  * Since we have back pointer, no recursion necessary.
1712  */
1713 static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c)
1714 {
1715 	do {
1716 		t_key idx;
1717 
1718 		if (c)
1719 			idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
1720 		else
1721 			idx = 0;
1722 
1723 		while (idx < 1u << p->bits) {
1724 			c = tnode_get_child_rcu(p, idx++);
1725 			if (!c)
1726 				continue;
1727 
1728 			if (IS_LEAF(c)) {
1729 				prefetch(p->child[idx]);
1730 				return (struct leaf *) c;
1731 			}
1732 
1733 			/* Rescan start scanning in new node */
1734 			p = (struct tnode *) c;
1735 			idx = 0;
1736 		}
1737 
1738 		/* Node empty, walk back up to parent */
1739 		c = (struct node *) p;
1740 	} while ( (p = node_parent_rcu(c)) != NULL);
1741 
1742 	return NULL; /* Root of trie */
1743 }
1744 
1745 static struct leaf *trie_firstleaf(struct trie *t)
1746 {
1747 	struct tnode *n = (struct tnode *) rcu_dereference(t->trie);
1748 
1749 	if (!n)
1750 		return NULL;
1751 
1752 	if (IS_LEAF(n))          /* trie is just a leaf */
1753 		return (struct leaf *) n;
1754 
1755 	return leaf_walk_rcu(n, NULL);
1756 }
1757 
1758 static struct leaf *trie_nextleaf(struct leaf *l)
1759 {
1760 	struct node *c = (struct node *) l;
1761 	struct tnode *p = node_parent(c);
1762 
1763 	if (!p)
1764 		return NULL;	/* trie with just one leaf */
1765 
1766 	return leaf_walk_rcu(p, c);
1767 }
1768 
1769 static struct leaf *trie_leafindex(struct trie *t, int index)
1770 {
1771 	struct leaf *l = trie_firstleaf(t);
1772 
1773 	while (l && index-- > 0)
1774 		l = trie_nextleaf(l);
1775 
1776 	return l;
1777 }
1778 
1779 
1780 /*
1781  * Caller must hold RTNL.
1782  */
1783 static int fn_trie_flush(struct fib_table *tb)
1784 {
1785 	struct trie *t = (struct trie *) tb->tb_data;
1786 	struct leaf *l, *ll = NULL;
1787 	int found = 0;
1788 
1789 	for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
1790 		found += trie_flush_leaf(l);
1791 
1792 		if (ll && hlist_empty(&ll->list))
1793 			trie_leaf_remove(t, ll);
1794 		ll = l;
1795 	}
1796 
1797 	if (ll && hlist_empty(&ll->list))
1798 		trie_leaf_remove(t, ll);
1799 
1800 	pr_debug("trie_flush found=%d\n", found);
1801 	return found;
1802 }
1803 
1804 static void fn_trie_select_default(struct fib_table *tb,
1805 				   const struct flowi *flp,
1806 				   struct fib_result *res)
1807 {
1808 	struct trie *t = (struct trie *) tb->tb_data;
1809 	int order, last_idx;
1810 	struct fib_info *fi = NULL;
1811 	struct fib_info *last_resort;
1812 	struct fib_alias *fa = NULL;
1813 	struct list_head *fa_head;
1814 	struct leaf *l;
1815 
1816 	last_idx = -1;
1817 	last_resort = NULL;
1818 	order = -1;
1819 
1820 	rcu_read_lock();
1821 
1822 	l = fib_find_node(t, 0);
1823 	if (!l)
1824 		goto out;
1825 
1826 	fa_head = get_fa_head(l, 0);
1827 	if (!fa_head)
1828 		goto out;
1829 
1830 	if (list_empty(fa_head))
1831 		goto out;
1832 
1833 	list_for_each_entry_rcu(fa, fa_head, fa_list) {
1834 		struct fib_info *next_fi = fa->fa_info;
1835 
1836 		if (fa->fa_scope != res->scope ||
1837 		    fa->fa_type != RTN_UNICAST)
1838 			continue;
1839 
1840 		if (next_fi->fib_priority > res->fi->fib_priority)
1841 			break;
1842 		if (!next_fi->fib_nh[0].nh_gw ||
1843 		    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1844 			continue;
1845 		fa->fa_state |= FA_S_ACCESSED;
1846 
1847 		if (fi == NULL) {
1848 			if (next_fi != res->fi)
1849 				break;
1850 		} else if (!fib_detect_death(fi, order, &last_resort,
1851 					     &last_idx, tb->tb_default)) {
1852 			fib_result_assign(res, fi);
1853 			tb->tb_default = order;
1854 			goto out;
1855 		}
1856 		fi = next_fi;
1857 		order++;
1858 	}
1859 	if (order <= 0 || fi == NULL) {
1860 		tb->tb_default = -1;
1861 		goto out;
1862 	}
1863 
1864 	if (!fib_detect_death(fi, order, &last_resort, &last_idx,
1865 				tb->tb_default)) {
1866 		fib_result_assign(res, fi);
1867 		tb->tb_default = order;
1868 		goto out;
1869 	}
1870 	if (last_idx >= 0)
1871 		fib_result_assign(res, last_resort);
1872 	tb->tb_default = last_idx;
1873 out:
1874 	rcu_read_unlock();
1875 }
1876 
1877 static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
1878 			   struct fib_table *tb,
1879 			   struct sk_buff *skb, struct netlink_callback *cb)
1880 {
1881 	int i, s_i;
1882 	struct fib_alias *fa;
1883 	__be32 xkey = htonl(key);
1884 
1885 	s_i = cb->args[5];
1886 	i = 0;
1887 
1888 	/* rcu_read_lock is hold by caller */
1889 
1890 	list_for_each_entry_rcu(fa, fah, fa_list) {
1891 		if (i < s_i) {
1892 			i++;
1893 			continue;
1894 		}
1895 
1896 		if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1897 				  cb->nlh->nlmsg_seq,
1898 				  RTM_NEWROUTE,
1899 				  tb->tb_id,
1900 				  fa->fa_type,
1901 				  fa->fa_scope,
1902 				  xkey,
1903 				  plen,
1904 				  fa->fa_tos,
1905 				  fa->fa_info, NLM_F_MULTI) < 0) {
1906 			cb->args[5] = i;
1907 			return -1;
1908 		}
1909 		i++;
1910 	}
1911 	cb->args[5] = i;
1912 	return skb->len;
1913 }
1914 
1915 static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
1916 			struct sk_buff *skb, struct netlink_callback *cb)
1917 {
1918 	struct leaf_info *li;
1919 	struct hlist_node *node;
1920 	int i, s_i;
1921 
1922 	s_i = cb->args[4];
1923 	i = 0;
1924 
1925 	/* rcu_read_lock is hold by caller */
1926 	hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
1927 		if (i < s_i) {
1928 			i++;
1929 			continue;
1930 		}
1931 
1932 		if (i > s_i)
1933 			cb->args[5] = 0;
1934 
1935 		if (list_empty(&li->falh))
1936 			continue;
1937 
1938 		if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
1939 			cb->args[4] = i;
1940 			return -1;
1941 		}
1942 		i++;
1943 	}
1944 
1945 	cb->args[4] = i;
1946 	return skb->len;
1947 }
1948 
1949 static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb,
1950 			struct netlink_callback *cb)
1951 {
1952 	struct leaf *l;
1953 	struct trie *t = (struct trie *) tb->tb_data;
1954 	t_key key = cb->args[2];
1955 	int count = cb->args[3];
1956 
1957 	rcu_read_lock();
1958 	/* Dump starting at last key.
1959 	 * Note: 0.0.0.0/0 (ie default) is first key.
1960 	 */
1961 	if (count == 0)
1962 		l = trie_firstleaf(t);
1963 	else {
1964 		/* Normally, continue from last key, but if that is missing
1965 		 * fallback to using slow rescan
1966 		 */
1967 		l = fib_find_node(t, key);
1968 		if (!l)
1969 			l = trie_leafindex(t, count);
1970 	}
1971 
1972 	while (l) {
1973 		cb->args[2] = l->key;
1974 		if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
1975 			cb->args[3] = count;
1976 			rcu_read_unlock();
1977 			return -1;
1978 		}
1979 
1980 		++count;
1981 		l = trie_nextleaf(l);
1982 		memset(&cb->args[4], 0,
1983 		       sizeof(cb->args) - 4*sizeof(cb->args[0]));
1984 	}
1985 	cb->args[3] = count;
1986 	rcu_read_unlock();
1987 
1988 	return skb->len;
1989 }
1990 
1991 void __init fib_hash_init(void)
1992 {
1993 	fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1994 					  sizeof(struct fib_alias),
1995 					  0, SLAB_PANIC, NULL);
1996 
1997 	trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
1998 					   max(sizeof(struct leaf),
1999 					       sizeof(struct leaf_info)),
2000 					   0, SLAB_PANIC, NULL);
2001 }
2002 
2003 
2004 /* Fix more generic FIB names for init later */
2005 struct fib_table *fib_hash_table(u32 id)
2006 {
2007 	struct fib_table *tb;
2008 	struct trie *t;
2009 
2010 	tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
2011 		     GFP_KERNEL);
2012 	if (tb == NULL)
2013 		return NULL;
2014 
2015 	tb->tb_id = id;
2016 	tb->tb_default = -1;
2017 	tb->tb_lookup = fn_trie_lookup;
2018 	tb->tb_insert = fn_trie_insert;
2019 	tb->tb_delete = fn_trie_delete;
2020 	tb->tb_flush = fn_trie_flush;
2021 	tb->tb_select_default = fn_trie_select_default;
2022 	tb->tb_dump = fn_trie_dump;
2023 
2024 	t = (struct trie *) tb->tb_data;
2025 	memset(t, 0, sizeof(*t));
2026 
2027 	if (id == RT_TABLE_LOCAL)
2028 		pr_info("IPv4 FIB: Using LC-trie version %s\n", VERSION);
2029 
2030 	return tb;
2031 }
2032 
2033 #ifdef CONFIG_PROC_FS
2034 /* Depth first Trie walk iterator */
2035 struct fib_trie_iter {
2036 	struct seq_net_private p;
2037 	struct fib_table *tb;
2038 	struct tnode *tnode;
2039 	unsigned index;
2040 	unsigned depth;
2041 };
2042 
2043 static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
2044 {
2045 	struct tnode *tn = iter->tnode;
2046 	unsigned cindex = iter->index;
2047 	struct tnode *p;
2048 
2049 	/* A single entry routing table */
2050 	if (!tn)
2051 		return NULL;
2052 
2053 	pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
2054 		 iter->tnode, iter->index, iter->depth);
2055 rescan:
2056 	while (cindex < (1<<tn->bits)) {
2057 		struct node *n = tnode_get_child_rcu(tn, cindex);
2058 
2059 		if (n) {
2060 			if (IS_LEAF(n)) {
2061 				iter->tnode = tn;
2062 				iter->index = cindex + 1;
2063 			} else {
2064 				/* push down one level */
2065 				iter->tnode = (struct tnode *) n;
2066 				iter->index = 0;
2067 				++iter->depth;
2068 			}
2069 			return n;
2070 		}
2071 
2072 		++cindex;
2073 	}
2074 
2075 	/* Current node exhausted, pop back up */
2076 	p = node_parent_rcu((struct node *)tn);
2077 	if (p) {
2078 		cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
2079 		tn = p;
2080 		--iter->depth;
2081 		goto rescan;
2082 	}
2083 
2084 	/* got root? */
2085 	return NULL;
2086 }
2087 
2088 static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
2089 				       struct trie *t)
2090 {
2091 	struct node *n;
2092 
2093 	if (!t)
2094 		return NULL;
2095 
2096 	n = rcu_dereference(t->trie);
2097 	if (!n)
2098 		return NULL;
2099 
2100 	if (IS_TNODE(n)) {
2101 		iter->tnode = (struct tnode *) n;
2102 		iter->index = 0;
2103 		iter->depth = 1;
2104 	} else {
2105 		iter->tnode = NULL;
2106 		iter->index = 0;
2107 		iter->depth = 0;
2108 	}
2109 
2110 	return n;
2111 }
2112 
2113 static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2114 {
2115 	struct node *n;
2116 	struct fib_trie_iter iter;
2117 
2118 	memset(s, 0, sizeof(*s));
2119 
2120 	rcu_read_lock();
2121 	for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
2122 		if (IS_LEAF(n)) {
2123 			struct leaf *l = (struct leaf *)n;
2124 			struct leaf_info *li;
2125 			struct hlist_node *tmp;
2126 
2127 			s->leaves++;
2128 			s->totdepth += iter.depth;
2129 			if (iter.depth > s->maxdepth)
2130 				s->maxdepth = iter.depth;
2131 
2132 			hlist_for_each_entry_rcu(li, tmp, &l->list, hlist)
2133 				++s->prefixes;
2134 		} else {
2135 			const struct tnode *tn = (const struct tnode *) n;
2136 			int i;
2137 
2138 			s->tnodes++;
2139 			if (tn->bits < MAX_STAT_DEPTH)
2140 				s->nodesizes[tn->bits]++;
2141 
2142 			for (i = 0; i < (1<<tn->bits); i++)
2143 				if (!tn->child[i])
2144 					s->nullpointers++;
2145 		}
2146 	}
2147 	rcu_read_unlock();
2148 }
2149 
2150 /*
2151  *	This outputs /proc/net/fib_triestats
2152  */
2153 static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2154 {
2155 	unsigned i, max, pointers, bytes, avdepth;
2156 
2157 	if (stat->leaves)
2158 		avdepth = stat->totdepth*100 / stat->leaves;
2159 	else
2160 		avdepth = 0;
2161 
2162 	seq_printf(seq, "\tAver depth:     %u.%02d\n",
2163 		   avdepth / 100, avdepth % 100);
2164 	seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);
2165 
2166 	seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
2167 	bytes = sizeof(struct leaf) * stat->leaves;
2168 
2169 	seq_printf(seq, "\tPrefixes:       %u\n", stat->prefixes);
2170 	bytes += sizeof(struct leaf_info) * stat->prefixes;
2171 
2172 	seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
2173 	bytes += sizeof(struct tnode) * stat->tnodes;
2174 
2175 	max = MAX_STAT_DEPTH;
2176 	while (max > 0 && stat->nodesizes[max-1] == 0)
2177 		max--;
2178 
2179 	pointers = 0;
2180 	for (i = 1; i <= max; i++)
2181 		if (stat->nodesizes[i] != 0) {
2182 			seq_printf(seq, "  %u: %u",  i, stat->nodesizes[i]);
2183 			pointers += (1<<i) * stat->nodesizes[i];
2184 		}
2185 	seq_putc(seq, '\n');
2186 	seq_printf(seq, "\tPointers: %u\n", pointers);
2187 
2188 	bytes += sizeof(struct node *) * pointers;
2189 	seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2190 	seq_printf(seq, "Total size: %u  kB\n", (bytes + 1023) / 1024);
2191 }
2192 
2193 #ifdef CONFIG_IP_FIB_TRIE_STATS
2194 static void trie_show_usage(struct seq_file *seq,
2195 			    const struct trie_use_stats *stats)
2196 {
2197 	seq_printf(seq, "\nCounters:\n---------\n");
2198 	seq_printf(seq, "gets = %u\n", stats->gets);
2199 	seq_printf(seq, "backtracks = %u\n", stats->backtrack);
2200 	seq_printf(seq, "semantic match passed = %u\n",
2201 		   stats->semantic_match_passed);
2202 	seq_printf(seq, "semantic match miss = %u\n",
2203 		   stats->semantic_match_miss);
2204 	seq_printf(seq, "null node hit= %u\n", stats->null_node_hit);
2205 	seq_printf(seq, "skipped node resize = %u\n\n",
2206 		   stats->resize_node_skipped);
2207 }
2208 #endif /*  CONFIG_IP_FIB_TRIE_STATS */
2209 
2210 static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
2211 {
2212 	if (tb->tb_id == RT_TABLE_LOCAL)
2213 		seq_puts(seq, "Local:\n");
2214 	else if (tb->tb_id == RT_TABLE_MAIN)
2215 		seq_puts(seq, "Main:\n");
2216 	else
2217 		seq_printf(seq, "Id %d:\n", tb->tb_id);
2218 }
2219 
2220 
2221 static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2222 {
2223 	struct net *net = (struct net *)seq->private;
2224 	unsigned int h;
2225 
2226 	seq_printf(seq,
2227 		   "Basic info: size of leaf:"
2228 		   " %Zd bytes, size of tnode: %Zd bytes.\n",
2229 		   sizeof(struct leaf), sizeof(struct tnode));
2230 
2231 	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2232 		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2233 		struct hlist_node *node;
2234 		struct fib_table *tb;
2235 
2236 		hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
2237 			struct trie *t = (struct trie *) tb->tb_data;
2238 			struct trie_stat stat;
2239 
2240 			if (!t)
2241 				continue;
2242 
2243 			fib_table_print(seq, tb);
2244 
2245 			trie_collect_stats(t, &stat);
2246 			trie_show_stats(seq, &stat);
2247 #ifdef CONFIG_IP_FIB_TRIE_STATS
2248 			trie_show_usage(seq, &t->stats);
2249 #endif
2250 		}
2251 	}
2252 
2253 	return 0;
2254 }
2255 
2256 static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2257 {
2258 	return single_open_net(inode, file, fib_triestat_seq_show);
2259 }
2260 
2261 static const struct file_operations fib_triestat_fops = {
2262 	.owner	= THIS_MODULE,
2263 	.open	= fib_triestat_seq_open,
2264 	.read	= seq_read,
2265 	.llseek	= seq_lseek,
2266 	.release = single_release_net,
2267 };
2268 
2269 static struct node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
2270 {
2271 	struct fib_trie_iter *iter = seq->private;
2272 	struct net *net = seq_file_net(seq);
2273 	loff_t idx = 0;
2274 	unsigned int h;
2275 
2276 	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2277 		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2278 		struct hlist_node *node;
2279 		struct fib_table *tb;
2280 
2281 		hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
2282 			struct node *n;
2283 
2284 			for (n = fib_trie_get_first(iter,
2285 						    (struct trie *) tb->tb_data);
2286 			     n; n = fib_trie_get_next(iter))
2287 				if (pos == idx++) {
2288 					iter->tb = tb;
2289 					return n;
2290 				}
2291 		}
2292 	}
2293 
2294 	return NULL;
2295 }
2296 
2297 static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2298 	__acquires(RCU)
2299 {
2300 	rcu_read_lock();
2301 	return fib_trie_get_idx(seq, *pos);
2302 }
2303 
2304 static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2305 {
2306 	struct fib_trie_iter *iter = seq->private;
2307 	struct net *net = seq_file_net(seq);
2308 	struct fib_table *tb = iter->tb;
2309 	struct hlist_node *tb_node;
2310 	unsigned int h;
2311 	struct node *n;
2312 
2313 	++*pos;
2314 	/* next node in same table */
2315 	n = fib_trie_get_next(iter);
2316 	if (n)
2317 		return n;
2318 
2319 	/* walk rest of this hash chain */
2320 	h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
2321 	while ( (tb_node = rcu_dereference(tb->tb_hlist.next)) ) {
2322 		tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
2323 		n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2324 		if (n)
2325 			goto found;
2326 	}
2327 
2328 	/* new hash chain */
2329 	while (++h < FIB_TABLE_HASHSZ) {
2330 		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2331 		hlist_for_each_entry_rcu(tb, tb_node, head, tb_hlist) {
2332 			n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2333 			if (n)
2334 				goto found;
2335 		}
2336 	}
2337 	return NULL;
2338 
2339 found:
2340 	iter->tb = tb;
2341 	return n;
2342 }
2343 
2344 static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2345 	__releases(RCU)
2346 {
2347 	rcu_read_unlock();
2348 }
2349 
2350 static void seq_indent(struct seq_file *seq, int n)
2351 {
2352 	while (n-- > 0) seq_puts(seq, "   ");
2353 }
2354 
2355 static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
2356 {
2357 	switch (s) {
2358 	case RT_SCOPE_UNIVERSE: return "universe";
2359 	case RT_SCOPE_SITE:	return "site";
2360 	case RT_SCOPE_LINK:	return "link";
2361 	case RT_SCOPE_HOST:	return "host";
2362 	case RT_SCOPE_NOWHERE:	return "nowhere";
2363 	default:
2364 		snprintf(buf, len, "scope=%d", s);
2365 		return buf;
2366 	}
2367 }
2368 
2369 static const char *rtn_type_names[__RTN_MAX] = {
2370 	[RTN_UNSPEC] = "UNSPEC",
2371 	[RTN_UNICAST] = "UNICAST",
2372 	[RTN_LOCAL] = "LOCAL",
2373 	[RTN_BROADCAST] = "BROADCAST",
2374 	[RTN_ANYCAST] = "ANYCAST",
2375 	[RTN_MULTICAST] = "MULTICAST",
2376 	[RTN_BLACKHOLE] = "BLACKHOLE",
2377 	[RTN_UNREACHABLE] = "UNREACHABLE",
2378 	[RTN_PROHIBIT] = "PROHIBIT",
2379 	[RTN_THROW] = "THROW",
2380 	[RTN_NAT] = "NAT",
2381 	[RTN_XRESOLVE] = "XRESOLVE",
2382 };
2383 
2384 static inline const char *rtn_type(char *buf, size_t len, unsigned t)
2385 {
2386 	if (t < __RTN_MAX && rtn_type_names[t])
2387 		return rtn_type_names[t];
2388 	snprintf(buf, len, "type %u", t);
2389 	return buf;
2390 }
2391 
2392 /* Pretty print the trie */
2393 static int fib_trie_seq_show(struct seq_file *seq, void *v)
2394 {
2395 	const struct fib_trie_iter *iter = seq->private;
2396 	struct node *n = v;
2397 
2398 	if (!node_parent_rcu(n))
2399 		fib_table_print(seq, iter->tb);
2400 
2401 	if (IS_TNODE(n)) {
2402 		struct tnode *tn = (struct tnode *) n;
2403 		__be32 prf = htonl(mask_pfx(tn->key, tn->pos));
2404 
2405 		seq_indent(seq, iter->depth-1);
2406 		seq_printf(seq, "  +-- %pI4/%d %d %d %d\n",
2407 			   &prf, tn->pos, tn->bits, tn->full_children,
2408 			   tn->empty_children);
2409 
2410 	} else {
2411 		struct leaf *l = (struct leaf *) n;
2412 		struct leaf_info *li;
2413 		struct hlist_node *node;
2414 		__be32 val = htonl(l->key);
2415 
2416 		seq_indent(seq, iter->depth);
2417 		seq_printf(seq, "  |-- %pI4\n", &val);
2418 
2419 		hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
2420 			struct fib_alias *fa;
2421 
2422 			list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2423 				char buf1[32], buf2[32];
2424 
2425 				seq_indent(seq, iter->depth+1);
2426 				seq_printf(seq, "  /%d %s %s", li->plen,
2427 					   rtn_scope(buf1, sizeof(buf1),
2428 						     fa->fa_scope),
2429 					   rtn_type(buf2, sizeof(buf2),
2430 						    fa->fa_type));
2431 				if (fa->fa_tos)
2432 					seq_printf(seq, " tos=%d", fa->fa_tos);
2433 				seq_putc(seq, '\n');
2434 			}
2435 		}
2436 	}
2437 
2438 	return 0;
2439 }
2440 
2441 static const struct seq_operations fib_trie_seq_ops = {
2442 	.start  = fib_trie_seq_start,
2443 	.next   = fib_trie_seq_next,
2444 	.stop   = fib_trie_seq_stop,
2445 	.show   = fib_trie_seq_show,
2446 };
2447 
2448 static int fib_trie_seq_open(struct inode *inode, struct file *file)
2449 {
2450 	return seq_open_net(inode, file, &fib_trie_seq_ops,
2451 			    sizeof(struct fib_trie_iter));
2452 }
2453 
2454 static const struct file_operations fib_trie_fops = {
2455 	.owner  = THIS_MODULE,
2456 	.open   = fib_trie_seq_open,
2457 	.read   = seq_read,
2458 	.llseek = seq_lseek,
2459 	.release = seq_release_net,
2460 };
2461 
2462 struct fib_route_iter {
2463 	struct seq_net_private p;
2464 	struct trie *main_trie;
2465 	loff_t	pos;
2466 	t_key	key;
2467 };
2468 
2469 static struct leaf *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
2470 {
2471 	struct leaf *l = NULL;
2472 	struct trie *t = iter->main_trie;
2473 
2474 	/* use cache location of last found key */
2475 	if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
2476 		pos -= iter->pos;
2477 	else {
2478 		iter->pos = 0;
2479 		l = trie_firstleaf(t);
2480 	}
2481 
2482 	while (l && pos-- > 0) {
2483 		iter->pos++;
2484 		l = trie_nextleaf(l);
2485 	}
2486 
2487 	if (l)
2488 		iter->key = pos;	/* remember it */
2489 	else
2490 		iter->pos = 0;		/* forget it */
2491 
2492 	return l;
2493 }
2494 
2495 static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
2496 	__acquires(RCU)
2497 {
2498 	struct fib_route_iter *iter = seq->private;
2499 	struct fib_table *tb;
2500 
2501 	rcu_read_lock();
2502 	tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
2503 	if (!tb)
2504 		return NULL;
2505 
2506 	iter->main_trie = (struct trie *) tb->tb_data;
2507 	if (*pos == 0)
2508 		return SEQ_START_TOKEN;
2509 	else
2510 		return fib_route_get_idx(iter, *pos - 1);
2511 }
2512 
2513 static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2514 {
2515 	struct fib_route_iter *iter = seq->private;
2516 	struct leaf *l = v;
2517 
2518 	++*pos;
2519 	if (v == SEQ_START_TOKEN) {
2520 		iter->pos = 0;
2521 		l = trie_firstleaf(iter->main_trie);
2522 	} else {
2523 		iter->pos++;
2524 		l = trie_nextleaf(l);
2525 	}
2526 
2527 	if (l)
2528 		iter->key = l->key;
2529 	else
2530 		iter->pos = 0;
2531 	return l;
2532 }
2533 
2534 static void fib_route_seq_stop(struct seq_file *seq, void *v)
2535 	__releases(RCU)
2536 {
2537 	rcu_read_unlock();
2538 }
2539 
2540 static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2541 {
2542 	static unsigned type2flags[RTN_MAX + 1] = {
2543 		[7] = RTF_REJECT, [8] = RTF_REJECT,
2544 	};
2545 	unsigned flags = type2flags[type];
2546 
2547 	if (fi && fi->fib_nh->nh_gw)
2548 		flags |= RTF_GATEWAY;
2549 	if (mask == htonl(0xFFFFFFFF))
2550 		flags |= RTF_HOST;
2551 	flags |= RTF_UP;
2552 	return flags;
2553 }
2554 
2555 /*
2556  *	This outputs /proc/net/route.
2557  *	The format of the file is not supposed to be changed
2558  * 	and needs to be same as fib_hash output to avoid breaking
2559  *	legacy utilities
2560  */
2561 static int fib_route_seq_show(struct seq_file *seq, void *v)
2562 {
2563 	struct leaf *l = v;
2564 	struct leaf_info *li;
2565 	struct hlist_node *node;
2566 
2567 	if (v == SEQ_START_TOKEN) {
2568 		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2569 			   "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2570 			   "\tWindow\tIRTT");
2571 		return 0;
2572 	}
2573 
2574 	hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
2575 		struct fib_alias *fa;
2576 		__be32 mask, prefix;
2577 
2578 		mask = inet_make_mask(li->plen);
2579 		prefix = htonl(l->key);
2580 
2581 		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2582 			const struct fib_info *fi = fa->fa_info;
2583 			unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
2584 			int len;
2585 
2586 			if (fa->fa_type == RTN_BROADCAST
2587 			    || fa->fa_type == RTN_MULTICAST)
2588 				continue;
2589 
2590 			if (fi)
2591 				seq_printf(seq,
2592 					 "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
2593 					 "%d\t%08X\t%d\t%u\t%u%n",
2594 					 fi->fib_dev ? fi->fib_dev->name : "*",
2595 					 prefix,
2596 					 fi->fib_nh->nh_gw, flags, 0, 0,
2597 					 fi->fib_priority,
2598 					 mask,
2599 					 (fi->fib_advmss ?
2600 					  fi->fib_advmss + 40 : 0),
2601 					 fi->fib_window,
2602 					 fi->fib_rtt >> 3, &len);
2603 			else
2604 				seq_printf(seq,
2605 					 "*\t%08X\t%08X\t%04X\t%d\t%u\t"
2606 					 "%d\t%08X\t%d\t%u\t%u%n",
2607 					 prefix, 0, flags, 0, 0, 0,
2608 					 mask, 0, 0, 0, &len);
2609 
2610 			seq_printf(seq, "%*s\n", 127 - len, "");
2611 		}
2612 	}
2613 
2614 	return 0;
2615 }
2616 
2617 static const struct seq_operations fib_route_seq_ops = {
2618 	.start  = fib_route_seq_start,
2619 	.next   = fib_route_seq_next,
2620 	.stop   = fib_route_seq_stop,
2621 	.show   = fib_route_seq_show,
2622 };
2623 
2624 static int fib_route_seq_open(struct inode *inode, struct file *file)
2625 {
2626 	return seq_open_net(inode, file, &fib_route_seq_ops,
2627 			    sizeof(struct fib_route_iter));
2628 }
2629 
2630 static const struct file_operations fib_route_fops = {
2631 	.owner  = THIS_MODULE,
2632 	.open   = fib_route_seq_open,
2633 	.read   = seq_read,
2634 	.llseek = seq_lseek,
2635 	.release = seq_release_net,
2636 };
2637 
2638 int __net_init fib_proc_init(struct net *net)
2639 {
2640 	if (!proc_net_fops_create(net, "fib_trie", S_IRUGO, &fib_trie_fops))
2641 		goto out1;
2642 
2643 	if (!proc_net_fops_create(net, "fib_triestat", S_IRUGO,
2644 				  &fib_triestat_fops))
2645 		goto out2;
2646 
2647 	if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_route_fops))
2648 		goto out3;
2649 
2650 	return 0;
2651 
2652 out3:
2653 	proc_net_remove(net, "fib_triestat");
2654 out2:
2655 	proc_net_remove(net, "fib_trie");
2656 out1:
2657 	return -ENOMEM;
2658 }
2659 
2660 void __net_exit fib_proc_exit(struct net *net)
2661 {
2662 	proc_net_remove(net, "fib_trie");
2663 	proc_net_remove(net, "fib_triestat");
2664 	proc_net_remove(net, "route");
2665 }
2666 
2667 #endif /* CONFIG_PROC_FS */
2668