xref: /openbmc/linux/net/ipv4/fib_trie.c (revision 96de0e252cedffad61b3cb5e05662c591898e69a)
1 /*
2  *   This program is free software; you can redistribute it and/or
3  *   modify it under the terms of the GNU General Public License
4  *   as published by the Free Software Foundation; either version
5  *   2 of the License, or (at your option) any later version.
6  *
7  *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8  *     & Swedish University of Agricultural Sciences.
9  *
10  *   Jens Laas <jens.laas@data.slu.se> Swedish University of
11  *     Agricultural Sciences.
12  *
13  *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
14  *
15  * This work is based on the LPC-trie which is originally descibed in:
16  *
17  * An experimental study of compression methods for dynamic tries
18  * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19  * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
20  *
21  *
22  * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23  * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24  *
25  * Version:	$Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $
26  *
27  *
28  * Code from fib_hash has been reused which includes the following header:
29  *
30  *
31  * INET		An implementation of the TCP/IP protocol suite for the LINUX
32  *		operating system.  INET is implemented using the  BSD Socket
33  *		interface as the means of communication with the user level.
34  *
35  *		IPv4 FIB: lookup engine and maintenance routines.
36  *
37  *
38  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
39  *
40  *		This program is free software; you can redistribute it and/or
41  *		modify it under the terms of the GNU General Public License
42  *		as published by the Free Software Foundation; either version
43  *		2 of the License, or (at your option) any later version.
44  *
45  * Substantial contributions to this work comes from:
46  *
47  *		David S. Miller, <davem@davemloft.net>
48  *		Stephen Hemminger <shemminger@osdl.org>
49  *		Paul E. McKenney <paulmck@us.ibm.com>
50  *		Patrick McHardy <kaber@trash.net>
51  */
52 
53 #define VERSION "0.408"
54 
55 #include <asm/uaccess.h>
56 #include <asm/system.h>
57 #include <linux/bitops.h>
58 #include <linux/types.h>
59 #include <linux/kernel.h>
60 #include <linux/mm.h>
61 #include <linux/string.h>
62 #include <linux/socket.h>
63 #include <linux/sockios.h>
64 #include <linux/errno.h>
65 #include <linux/in.h>
66 #include <linux/inet.h>
67 #include <linux/inetdevice.h>
68 #include <linux/netdevice.h>
69 #include <linux/if_arp.h>
70 #include <linux/proc_fs.h>
71 #include <linux/rcupdate.h>
72 #include <linux/skbuff.h>
73 #include <linux/netlink.h>
74 #include <linux/init.h>
75 #include <linux/list.h>
76 #include <net/net_namespace.h>
77 #include <net/ip.h>
78 #include <net/protocol.h>
79 #include <net/route.h>
80 #include <net/tcp.h>
81 #include <net/sock.h>
82 #include <net/ip_fib.h>
83 #include "fib_lookup.h"
84 
85 #undef CONFIG_IP_FIB_TRIE_STATS
86 #define MAX_STAT_DEPTH 32
87 
88 #define KEYLENGTH (8*sizeof(t_key))
89 
90 typedef unsigned int t_key;
91 
92 #define T_TNODE 0
93 #define T_LEAF  1
94 #define NODE_TYPE_MASK	0x1UL
95 #define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
96 
97 #define IS_TNODE(n) (!(n->parent & T_LEAF))
98 #define IS_LEAF(n) (n->parent & T_LEAF)
99 
100 struct node {
101 	t_key key;
102 	unsigned long parent;
103 };
104 
105 struct leaf {
106 	t_key key;
107 	unsigned long parent;
108 	struct hlist_head list;
109 	struct rcu_head rcu;
110 };
111 
112 struct leaf_info {
113 	struct hlist_node hlist;
114 	struct rcu_head rcu;
115 	int plen;
116 	struct list_head falh;
117 };
118 
119 struct tnode {
120 	t_key key;
121 	unsigned long parent;
122 	unsigned short pos:5;		/* 2log(KEYLENGTH) bits needed */
123 	unsigned short bits:5;		/* 2log(KEYLENGTH) bits needed */
124 	unsigned short full_children;	/* KEYLENGTH bits needed */
125 	unsigned short empty_children;	/* KEYLENGTH bits needed */
126 	struct rcu_head rcu;
127 	struct node *child[0];
128 };
129 
130 #ifdef CONFIG_IP_FIB_TRIE_STATS
131 struct trie_use_stats {
132 	unsigned int gets;
133 	unsigned int backtrack;
134 	unsigned int semantic_match_passed;
135 	unsigned int semantic_match_miss;
136 	unsigned int null_node_hit;
137 	unsigned int resize_node_skipped;
138 };
139 #endif
140 
141 struct trie_stat {
142 	unsigned int totdepth;
143 	unsigned int maxdepth;
144 	unsigned int tnodes;
145 	unsigned int leaves;
146 	unsigned int nullpointers;
147 	unsigned int nodesizes[MAX_STAT_DEPTH];
148 };
149 
150 struct trie {
151 	struct node *trie;
152 #ifdef CONFIG_IP_FIB_TRIE_STATS
153 	struct trie_use_stats stats;
154 #endif
155 	int size;
156 	unsigned int revision;
157 };
158 
159 static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
160 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
161 static struct node *resize(struct trie *t, struct tnode *tn);
162 static struct tnode *inflate(struct trie *t, struct tnode *tn);
163 static struct tnode *halve(struct trie *t, struct tnode *tn);
164 static void tnode_free(struct tnode *tn);
165 
166 static struct kmem_cache *fn_alias_kmem __read_mostly;
167 static struct trie *trie_local = NULL, *trie_main = NULL;
168 
169 static inline struct tnode *node_parent(struct node *node)
170 {
171 	struct tnode *ret;
172 
173 	ret = (struct tnode *)(node->parent & ~NODE_TYPE_MASK);
174 	return rcu_dereference(ret);
175 }
176 
177 static inline void node_set_parent(struct node *node, struct tnode *ptr)
178 {
179 	rcu_assign_pointer(node->parent,
180 			   (unsigned long)ptr | NODE_TYPE(node));
181 }
182 
183 /* rcu_read_lock needs to be hold by caller from readside */
184 
185 static inline struct node *tnode_get_child(struct tnode *tn, int i)
186 {
187 	BUG_ON(i >= 1 << tn->bits);
188 
189 	return rcu_dereference(tn->child[i]);
190 }
191 
192 static inline int tnode_child_length(const struct tnode *tn)
193 {
194 	return 1 << tn->bits;
195 }
196 
197 static inline t_key mask_pfx(t_key k, unsigned short l)
198 {
199 	return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
200 }
201 
202 static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
203 {
204 	if (offset < KEYLENGTH)
205 		return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
206 	else
207 		return 0;
208 }
209 
210 static inline int tkey_equals(t_key a, t_key b)
211 {
212 	return a == b;
213 }
214 
215 static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
216 {
217 	if (bits == 0 || offset >= KEYLENGTH)
218 		return 1;
219 	bits = bits > KEYLENGTH ? KEYLENGTH : bits;
220 	return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
221 }
222 
223 static inline int tkey_mismatch(t_key a, int offset, t_key b)
224 {
225 	t_key diff = a ^ b;
226 	int i = offset;
227 
228 	if (!diff)
229 		return 0;
230 	while ((diff << i) >> (KEYLENGTH-1) == 0)
231 		i++;
232 	return i;
233 }
234 
235 /*
236   To understand this stuff, an understanding of keys and all their bits is
237   necessary. Every node in the trie has a key associated with it, but not
238   all of the bits in that key are significant.
239 
240   Consider a node 'n' and its parent 'tp'.
241 
242   If n is a leaf, every bit in its key is significant. Its presence is
243   necessitated by path compression, since during a tree traversal (when
244   searching for a leaf - unless we are doing an insertion) we will completely
245   ignore all skipped bits we encounter. Thus we need to verify, at the end of
246   a potentially successful search, that we have indeed been walking the
247   correct key path.
248 
249   Note that we can never "miss" the correct key in the tree if present by
250   following the wrong path. Path compression ensures that segments of the key
251   that are the same for all keys with a given prefix are skipped, but the
252   skipped part *is* identical for each node in the subtrie below the skipped
253   bit! trie_insert() in this implementation takes care of that - note the
254   call to tkey_sub_equals() in trie_insert().
255 
256   if n is an internal node - a 'tnode' here, the various parts of its key
257   have many different meanings.
258 
259   Example:
260   _________________________________________________________________
261   | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
262   -----------------------------------------------------------------
263     0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
264 
265   _________________________________________________________________
266   | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
267   -----------------------------------------------------------------
268    16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
269 
270   tp->pos = 7
271   tp->bits = 3
272   n->pos = 15
273   n->bits = 4
274 
275   First, let's just ignore the bits that come before the parent tp, that is
276   the bits from 0 to (tp->pos-1). They are *known* but at this point we do
277   not use them for anything.
278 
279   The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
280   index into the parent's child array. That is, they will be used to find
281   'n' among tp's children.
282 
283   The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
284   for the node n.
285 
286   All the bits we have seen so far are significant to the node n. The rest
287   of the bits are really not needed or indeed known in n->key.
288 
289   The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
290   n's child array, and will of course be different for each child.
291 
292 
293   The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
294   at this point.
295 
296 */
297 
298 static inline void check_tnode(const struct tnode *tn)
299 {
300 	WARN_ON(tn && tn->pos+tn->bits > 32);
301 }
302 
303 static int halve_threshold = 25;
304 static int inflate_threshold = 50;
305 static int halve_threshold_root = 8;
306 static int inflate_threshold_root = 15;
307 
308 
309 static void __alias_free_mem(struct rcu_head *head)
310 {
311 	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
312 	kmem_cache_free(fn_alias_kmem, fa);
313 }
314 
315 static inline void alias_free_mem_rcu(struct fib_alias *fa)
316 {
317 	call_rcu(&fa->rcu, __alias_free_mem);
318 }
319 
320 static void __leaf_free_rcu(struct rcu_head *head)
321 {
322 	kfree(container_of(head, struct leaf, rcu));
323 }
324 
325 static void __leaf_info_free_rcu(struct rcu_head *head)
326 {
327 	kfree(container_of(head, struct leaf_info, rcu));
328 }
329 
330 static inline void free_leaf_info(struct leaf_info *leaf)
331 {
332 	call_rcu(&leaf->rcu, __leaf_info_free_rcu);
333 }
334 
335 static struct tnode *tnode_alloc(unsigned int size)
336 {
337 	struct page *pages;
338 
339 	if (size <= PAGE_SIZE)
340 		return kcalloc(size, 1, GFP_KERNEL);
341 
342 	pages = alloc_pages(GFP_KERNEL|__GFP_ZERO, get_order(size));
343 	if (!pages)
344 		return NULL;
345 
346 	return page_address(pages);
347 }
348 
349 static void __tnode_free_rcu(struct rcu_head *head)
350 {
351 	struct tnode *tn = container_of(head, struct tnode, rcu);
352 	unsigned int size = sizeof(struct tnode) +
353 		(1 << tn->bits) * sizeof(struct node *);
354 
355 	if (size <= PAGE_SIZE)
356 		kfree(tn);
357 	else
358 		free_pages((unsigned long)tn, get_order(size));
359 }
360 
361 static inline void tnode_free(struct tnode *tn)
362 {
363 	if (IS_LEAF(tn)) {
364 		struct leaf *l = (struct leaf *) tn;
365 		call_rcu_bh(&l->rcu, __leaf_free_rcu);
366 	} else
367 		call_rcu(&tn->rcu, __tnode_free_rcu);
368 }
369 
370 static struct leaf *leaf_new(void)
371 {
372 	struct leaf *l = kmalloc(sizeof(struct leaf),  GFP_KERNEL);
373 	if (l) {
374 		l->parent = T_LEAF;
375 		INIT_HLIST_HEAD(&l->list);
376 	}
377 	return l;
378 }
379 
380 static struct leaf_info *leaf_info_new(int plen)
381 {
382 	struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
383 	if (li) {
384 		li->plen = plen;
385 		INIT_LIST_HEAD(&li->falh);
386 	}
387 	return li;
388 }
389 
390 static struct tnode* tnode_new(t_key key, int pos, int bits)
391 {
392 	int nchildren = 1<<bits;
393 	int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
394 	struct tnode *tn = tnode_alloc(sz);
395 
396 	if (tn) {
397 		memset(tn, 0, sz);
398 		tn->parent = T_TNODE;
399 		tn->pos = pos;
400 		tn->bits = bits;
401 		tn->key = key;
402 		tn->full_children = 0;
403 		tn->empty_children = 1<<bits;
404 	}
405 
406 	pr_debug("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
407 		 (unsigned int) (sizeof(struct node) * 1<<bits));
408 	return tn;
409 }
410 
411 /*
412  * Check whether a tnode 'n' is "full", i.e. it is an internal node
413  * and no bits are skipped. See discussion in dyntree paper p. 6
414  */
415 
416 static inline int tnode_full(const struct tnode *tn, const struct node *n)
417 {
418 	if (n == NULL || IS_LEAF(n))
419 		return 0;
420 
421 	return ((struct tnode *) n)->pos == tn->pos + tn->bits;
422 }
423 
424 static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n)
425 {
426 	tnode_put_child_reorg(tn, i, n, -1);
427 }
428 
429  /*
430   * Add a child at position i overwriting the old value.
431   * Update the value of full_children and empty_children.
432   */
433 
434 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull)
435 {
436 	struct node *chi = tn->child[i];
437 	int isfull;
438 
439 	BUG_ON(i >= 1<<tn->bits);
440 
441 
442 	/* update emptyChildren */
443 	if (n == NULL && chi != NULL)
444 		tn->empty_children++;
445 	else if (n != NULL && chi == NULL)
446 		tn->empty_children--;
447 
448 	/* update fullChildren */
449 	if (wasfull == -1)
450 		wasfull = tnode_full(tn, chi);
451 
452 	isfull = tnode_full(tn, n);
453 	if (wasfull && !isfull)
454 		tn->full_children--;
455 	else if (!wasfull && isfull)
456 		tn->full_children++;
457 
458 	if (n)
459 		node_set_parent(n, tn);
460 
461 	rcu_assign_pointer(tn->child[i], n);
462 }
463 
464 static struct node *resize(struct trie *t, struct tnode *tn)
465 {
466 	int i;
467 	int err = 0;
468 	struct tnode *old_tn;
469 	int inflate_threshold_use;
470 	int halve_threshold_use;
471 	int max_resize;
472 
473 	if (!tn)
474 		return NULL;
475 
476 	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
477 		 tn, inflate_threshold, halve_threshold);
478 
479 	/* No children */
480 	if (tn->empty_children == tnode_child_length(tn)) {
481 		tnode_free(tn);
482 		return NULL;
483 	}
484 	/* One child */
485 	if (tn->empty_children == tnode_child_length(tn) - 1)
486 		for (i = 0; i < tnode_child_length(tn); i++) {
487 			struct node *n;
488 
489 			n = tn->child[i];
490 			if (!n)
491 				continue;
492 
493 			/* compress one level */
494 			node_set_parent(n, NULL);
495 			tnode_free(tn);
496 			return n;
497 		}
498 	/*
499 	 * Double as long as the resulting node has a number of
500 	 * nonempty nodes that are above the threshold.
501 	 */
502 
503 	/*
504 	 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
505 	 * the Helsinki University of Technology and Matti Tikkanen of Nokia
506 	 * Telecommunications, page 6:
507 	 * "A node is doubled if the ratio of non-empty children to all
508 	 * children in the *doubled* node is at least 'high'."
509 	 *
510 	 * 'high' in this instance is the variable 'inflate_threshold'. It
511 	 * is expressed as a percentage, so we multiply it with
512 	 * tnode_child_length() and instead of multiplying by 2 (since the
513 	 * child array will be doubled by inflate()) and multiplying
514 	 * the left-hand side by 100 (to handle the percentage thing) we
515 	 * multiply the left-hand side by 50.
516 	 *
517 	 * The left-hand side may look a bit weird: tnode_child_length(tn)
518 	 * - tn->empty_children is of course the number of non-null children
519 	 * in the current node. tn->full_children is the number of "full"
520 	 * children, that is non-null tnodes with a skip value of 0.
521 	 * All of those will be doubled in the resulting inflated tnode, so
522 	 * we just count them one extra time here.
523 	 *
524 	 * A clearer way to write this would be:
525 	 *
526 	 * to_be_doubled = tn->full_children;
527 	 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
528 	 *     tn->full_children;
529 	 *
530 	 * new_child_length = tnode_child_length(tn) * 2;
531 	 *
532 	 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
533 	 *      new_child_length;
534 	 * if (new_fill_factor >= inflate_threshold)
535 	 *
536 	 * ...and so on, tho it would mess up the while () loop.
537 	 *
538 	 * anyway,
539 	 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
540 	 *      inflate_threshold
541 	 *
542 	 * avoid a division:
543 	 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
544 	 *      inflate_threshold * new_child_length
545 	 *
546 	 * expand not_to_be_doubled and to_be_doubled, and shorten:
547 	 * 100 * (tnode_child_length(tn) - tn->empty_children +
548 	 *    tn->full_children) >= inflate_threshold * new_child_length
549 	 *
550 	 * expand new_child_length:
551 	 * 100 * (tnode_child_length(tn) - tn->empty_children +
552 	 *    tn->full_children) >=
553 	 *      inflate_threshold * tnode_child_length(tn) * 2
554 	 *
555 	 * shorten again:
556 	 * 50 * (tn->full_children + tnode_child_length(tn) -
557 	 *    tn->empty_children) >= inflate_threshold *
558 	 *    tnode_child_length(tn)
559 	 *
560 	 */
561 
562 	check_tnode(tn);
563 
564 	/* Keep root node larger  */
565 
566 	if (!tn->parent)
567 		inflate_threshold_use = inflate_threshold_root;
568 	else
569 		inflate_threshold_use = inflate_threshold;
570 
571 	err = 0;
572 	max_resize = 10;
573 	while ((tn->full_children > 0 &&  max_resize-- &&
574 	       50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
575 				inflate_threshold_use * tnode_child_length(tn))) {
576 
577 		old_tn = tn;
578 		tn = inflate(t, tn);
579 		if (IS_ERR(tn)) {
580 			tn = old_tn;
581 #ifdef CONFIG_IP_FIB_TRIE_STATS
582 			t->stats.resize_node_skipped++;
583 #endif
584 			break;
585 		}
586 	}
587 
588 	if (max_resize < 0) {
589 		if (!tn->parent)
590 			printk(KERN_WARNING "Fix inflate_threshold_root. Now=%d size=%d bits\n",
591 			       inflate_threshold_root, tn->bits);
592 		else
593 			printk(KERN_WARNING "Fix inflate_threshold. Now=%d size=%d bits\n",
594 			       inflate_threshold, tn->bits);
595 	}
596 
597 	check_tnode(tn);
598 
599 	/*
600 	 * Halve as long as the number of empty children in this
601 	 * node is above threshold.
602 	 */
603 
604 
605 	/* Keep root node larger  */
606 
607 	if (!tn->parent)
608 		halve_threshold_use = halve_threshold_root;
609 	else
610 		halve_threshold_use = halve_threshold;
611 
612 	err = 0;
613 	max_resize = 10;
614 	while (tn->bits > 1 &&  max_resize-- &&
615 	       100 * (tnode_child_length(tn) - tn->empty_children) <
616 	       halve_threshold_use * tnode_child_length(tn)) {
617 
618 		old_tn = tn;
619 		tn = halve(t, tn);
620 		if (IS_ERR(tn)) {
621 			tn = old_tn;
622 #ifdef CONFIG_IP_FIB_TRIE_STATS
623 			t->stats.resize_node_skipped++;
624 #endif
625 			break;
626 		}
627 	}
628 
629 	if (max_resize < 0) {
630 		if (!tn->parent)
631 			printk(KERN_WARNING "Fix halve_threshold_root. Now=%d size=%d bits\n",
632 			       halve_threshold_root, tn->bits);
633 		else
634 			printk(KERN_WARNING "Fix halve_threshold. Now=%d size=%d bits\n",
635 			       halve_threshold, tn->bits);
636 	}
637 
638 	/* Only one child remains */
639 	if (tn->empty_children == tnode_child_length(tn) - 1)
640 		for (i = 0; i < tnode_child_length(tn); i++) {
641 			struct node *n;
642 
643 			n = tn->child[i];
644 			if (!n)
645 				continue;
646 
647 			/* compress one level */
648 
649 			node_set_parent(n, NULL);
650 			tnode_free(tn);
651 			return n;
652 		}
653 
654 	return (struct node *) tn;
655 }
656 
657 static struct tnode *inflate(struct trie *t, struct tnode *tn)
658 {
659 	struct tnode *inode;
660 	struct tnode *oldtnode = tn;
661 	int olen = tnode_child_length(tn);
662 	int i;
663 
664 	pr_debug("In inflate\n");
665 
666 	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
667 
668 	if (!tn)
669 		return ERR_PTR(-ENOMEM);
670 
671 	/*
672 	 * Preallocate and store tnodes before the actual work so we
673 	 * don't get into an inconsistent state if memory allocation
674 	 * fails. In case of failure we return the oldnode and  inflate
675 	 * of tnode is ignored.
676 	 */
677 
678 	for (i = 0; i < olen; i++) {
679 		struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i);
680 
681 		if (inode &&
682 		    IS_TNODE(inode) &&
683 		    inode->pos == oldtnode->pos + oldtnode->bits &&
684 		    inode->bits > 1) {
685 			struct tnode *left, *right;
686 			t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
687 
688 			left = tnode_new(inode->key&(~m), inode->pos + 1,
689 					 inode->bits - 1);
690 			if (!left)
691 				goto nomem;
692 
693 			right = tnode_new(inode->key|m, inode->pos + 1,
694 					  inode->bits - 1);
695 
696 			if (!right) {
697 				tnode_free(left);
698 				goto nomem;
699 			}
700 
701 			put_child(t, tn, 2*i, (struct node *) left);
702 			put_child(t, tn, 2*i+1, (struct node *) right);
703 		}
704 	}
705 
706 	for (i = 0; i < olen; i++) {
707 		struct node *node = tnode_get_child(oldtnode, i);
708 		struct tnode *left, *right;
709 		int size, j;
710 
711 		/* An empty child */
712 		if (node == NULL)
713 			continue;
714 
715 		/* A leaf or an internal node with skipped bits */
716 
717 		if (IS_LEAF(node) || ((struct tnode *) node)->pos >
718 		   tn->pos + tn->bits - 1) {
719 			if (tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits,
720 					     1) == 0)
721 				put_child(t, tn, 2*i, node);
722 			else
723 				put_child(t, tn, 2*i+1, node);
724 			continue;
725 		}
726 
727 		/* An internal node with two children */
728 		inode = (struct tnode *) node;
729 
730 		if (inode->bits == 1) {
731 			put_child(t, tn, 2*i, inode->child[0]);
732 			put_child(t, tn, 2*i+1, inode->child[1]);
733 
734 			tnode_free(inode);
735 			continue;
736 		}
737 
738 		/* An internal node with more than two children */
739 
740 		/* We will replace this node 'inode' with two new
741 		 * ones, 'left' and 'right', each with half of the
742 		 * original children. The two new nodes will have
743 		 * a position one bit further down the key and this
744 		 * means that the "significant" part of their keys
745 		 * (see the discussion near the top of this file)
746 		 * will differ by one bit, which will be "0" in
747 		 * left's key and "1" in right's key. Since we are
748 		 * moving the key position by one step, the bit that
749 		 * we are moving away from - the bit at position
750 		 * (inode->pos) - is the one that will differ between
751 		 * left and right. So... we synthesize that bit in the
752 		 * two  new keys.
753 		 * The mask 'm' below will be a single "one" bit at
754 		 * the position (inode->pos)
755 		 */
756 
757 		/* Use the old key, but set the new significant
758 		 *   bit to zero.
759 		 */
760 
761 		left = (struct tnode *) tnode_get_child(tn, 2*i);
762 		put_child(t, tn, 2*i, NULL);
763 
764 		BUG_ON(!left);
765 
766 		right = (struct tnode *) tnode_get_child(tn, 2*i+1);
767 		put_child(t, tn, 2*i+1, NULL);
768 
769 		BUG_ON(!right);
770 
771 		size = tnode_child_length(left);
772 		for (j = 0; j < size; j++) {
773 			put_child(t, left, j, inode->child[j]);
774 			put_child(t, right, j, inode->child[j + size]);
775 		}
776 		put_child(t, tn, 2*i, resize(t, left));
777 		put_child(t, tn, 2*i+1, resize(t, right));
778 
779 		tnode_free(inode);
780 	}
781 	tnode_free(oldtnode);
782 	return tn;
783 nomem:
784 	{
785 		int size = tnode_child_length(tn);
786 		int j;
787 
788 		for (j = 0; j < size; j++)
789 			if (tn->child[j])
790 				tnode_free((struct tnode *)tn->child[j]);
791 
792 		tnode_free(tn);
793 
794 		return ERR_PTR(-ENOMEM);
795 	}
796 }
797 
798 static struct tnode *halve(struct trie *t, struct tnode *tn)
799 {
800 	struct tnode *oldtnode = tn;
801 	struct node *left, *right;
802 	int i;
803 	int olen = tnode_child_length(tn);
804 
805 	pr_debug("In halve\n");
806 
807 	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
808 
809 	if (!tn)
810 		return ERR_PTR(-ENOMEM);
811 
812 	/*
813 	 * Preallocate and store tnodes before the actual work so we
814 	 * don't get into an inconsistent state if memory allocation
815 	 * fails. In case of failure we return the oldnode and halve
816 	 * of tnode is ignored.
817 	 */
818 
819 	for (i = 0; i < olen; i += 2) {
820 		left = tnode_get_child(oldtnode, i);
821 		right = tnode_get_child(oldtnode, i+1);
822 
823 		/* Two nonempty children */
824 		if (left && right) {
825 			struct tnode *newn;
826 
827 			newn = tnode_new(left->key, tn->pos + tn->bits, 1);
828 
829 			if (!newn)
830 				goto nomem;
831 
832 			put_child(t, tn, i/2, (struct node *)newn);
833 		}
834 
835 	}
836 
837 	for (i = 0; i < olen; i += 2) {
838 		struct tnode *newBinNode;
839 
840 		left = tnode_get_child(oldtnode, i);
841 		right = tnode_get_child(oldtnode, i+1);
842 
843 		/* At least one of the children is empty */
844 		if (left == NULL) {
845 			if (right == NULL)    /* Both are empty */
846 				continue;
847 			put_child(t, tn, i/2, right);
848 			continue;
849 		}
850 
851 		if (right == NULL) {
852 			put_child(t, tn, i/2, left);
853 			continue;
854 		}
855 
856 		/* Two nonempty children */
857 		newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
858 		put_child(t, tn, i/2, NULL);
859 		put_child(t, newBinNode, 0, left);
860 		put_child(t, newBinNode, 1, right);
861 		put_child(t, tn, i/2, resize(t, newBinNode));
862 	}
863 	tnode_free(oldtnode);
864 	return tn;
865 nomem:
866 	{
867 		int size = tnode_child_length(tn);
868 		int j;
869 
870 		for (j = 0; j < size; j++)
871 			if (tn->child[j])
872 				tnode_free((struct tnode *)tn->child[j]);
873 
874 		tnode_free(tn);
875 
876 		return ERR_PTR(-ENOMEM);
877 	}
878 }
879 
880 static void trie_init(struct trie *t)
881 {
882 	if (!t)
883 		return;
884 
885 	t->size = 0;
886 	rcu_assign_pointer(t->trie, NULL);
887 	t->revision = 0;
888 #ifdef CONFIG_IP_FIB_TRIE_STATS
889 	memset(&t->stats, 0, sizeof(struct trie_use_stats));
890 #endif
891 }
892 
893 /* readside must use rcu_read_lock currently dump routines
894  via get_fa_head and dump */
895 
896 static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
897 {
898 	struct hlist_head *head = &l->list;
899 	struct hlist_node *node;
900 	struct leaf_info *li;
901 
902 	hlist_for_each_entry_rcu(li, node, head, hlist)
903 		if (li->plen == plen)
904 			return li;
905 
906 	return NULL;
907 }
908 
909 static inline struct list_head * get_fa_head(struct leaf *l, int plen)
910 {
911 	struct leaf_info *li = find_leaf_info(l, plen);
912 
913 	if (!li)
914 		return NULL;
915 
916 	return &li->falh;
917 }
918 
919 static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
920 {
921 	struct leaf_info *li = NULL, *last = NULL;
922 	struct hlist_node *node;
923 
924 	if (hlist_empty(head)) {
925 		hlist_add_head_rcu(&new->hlist, head);
926 	} else {
927 		hlist_for_each_entry(li, node, head, hlist) {
928 			if (new->plen > li->plen)
929 				break;
930 
931 			last = li;
932 		}
933 		if (last)
934 			hlist_add_after_rcu(&last->hlist, &new->hlist);
935 		else
936 			hlist_add_before_rcu(&new->hlist, &li->hlist);
937 	}
938 }
939 
940 /* rcu_read_lock needs to be hold by caller from readside */
941 
942 static struct leaf *
943 fib_find_node(struct trie *t, u32 key)
944 {
945 	int pos;
946 	struct tnode *tn;
947 	struct node *n;
948 
949 	pos = 0;
950 	n = rcu_dereference(t->trie);
951 
952 	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
953 		tn = (struct tnode *) n;
954 
955 		check_tnode(tn);
956 
957 		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
958 			pos = tn->pos + tn->bits;
959 			n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
960 		} else
961 			break;
962 	}
963 	/* Case we have found a leaf. Compare prefixes */
964 
965 	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
966 		return (struct leaf *)n;
967 
968 	return NULL;
969 }
970 
971 static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
972 {
973 	int wasfull;
974 	t_key cindex, key = tn->key;
975 	struct tnode *tp;
976 
977 	while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) {
978 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
979 		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
980 		tn = (struct tnode *) resize (t, (struct tnode *)tn);
981 		tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull);
982 
983 		tp = node_parent((struct node *) tn);
984 		if (!tp)
985 			break;
986 		tn = tp;
987 	}
988 
989 	/* Handle last (top) tnode */
990 	if (IS_TNODE(tn))
991 		tn = (struct tnode*) resize(t, (struct tnode *)tn);
992 
993 	return (struct node*) tn;
994 }
995 
996 /* only used from updater-side */
997 
998 static  struct list_head *
999 fib_insert_node(struct trie *t, int *err, u32 key, int plen)
1000 {
1001 	int pos, newpos;
1002 	struct tnode *tp = NULL, *tn = NULL;
1003 	struct node *n;
1004 	struct leaf *l;
1005 	int missbit;
1006 	struct list_head *fa_head = NULL;
1007 	struct leaf_info *li;
1008 	t_key cindex;
1009 
1010 	pos = 0;
1011 	n = t->trie;
1012 
1013 	/* If we point to NULL, stop. Either the tree is empty and we should
1014 	 * just put a new leaf in if, or we have reached an empty child slot,
1015 	 * and we should just put our new leaf in that.
1016 	 * If we point to a T_TNODE, check if it matches our key. Note that
1017 	 * a T_TNODE might be skipping any number of bits - its 'pos' need
1018 	 * not be the parent's 'pos'+'bits'!
1019 	 *
1020 	 * If it does match the current key, get pos/bits from it, extract
1021 	 * the index from our key, push the T_TNODE and walk the tree.
1022 	 *
1023 	 * If it doesn't, we have to replace it with a new T_TNODE.
1024 	 *
1025 	 * If we point to a T_LEAF, it might or might not have the same key
1026 	 * as we do. If it does, just change the value, update the T_LEAF's
1027 	 * value, and return it.
1028 	 * If it doesn't, we need to replace it with a T_TNODE.
1029 	 */
1030 
1031 	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
1032 		tn = (struct tnode *) n;
1033 
1034 		check_tnode(tn);
1035 
1036 		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
1037 			tp = tn;
1038 			pos = tn->pos + tn->bits;
1039 			n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
1040 
1041 			BUG_ON(n && node_parent(n) != tn);
1042 		} else
1043 			break;
1044 	}
1045 
1046 	/*
1047 	 * n  ----> NULL, LEAF or TNODE
1048 	 *
1049 	 * tp is n's (parent) ----> NULL or TNODE
1050 	 */
1051 
1052 	BUG_ON(tp && IS_LEAF(tp));
1053 
1054 	/* Case 1: n is a leaf. Compare prefixes */
1055 
1056 	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1057 		struct leaf *l = (struct leaf *) n;
1058 
1059 		li = leaf_info_new(plen);
1060 
1061 		if (!li) {
1062 			*err = -ENOMEM;
1063 			goto err;
1064 		}
1065 
1066 		fa_head = &li->falh;
1067 		insert_leaf_info(&l->list, li);
1068 		goto done;
1069 	}
1070 	t->size++;
1071 	l = leaf_new();
1072 
1073 	if (!l) {
1074 		*err = -ENOMEM;
1075 		goto err;
1076 	}
1077 
1078 	l->key = key;
1079 	li = leaf_info_new(plen);
1080 
1081 	if (!li) {
1082 		tnode_free((struct tnode *) l);
1083 		*err = -ENOMEM;
1084 		goto err;
1085 	}
1086 
1087 	fa_head = &li->falh;
1088 	insert_leaf_info(&l->list, li);
1089 
1090 	if (t->trie && n == NULL) {
1091 		/* Case 2: n is NULL, and will just insert a new leaf */
1092 
1093 		node_set_parent((struct node *)l, tp);
1094 
1095 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1096 		put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
1097 	} else {
1098 		/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1099 		/*
1100 		 *  Add a new tnode here
1101 		 *  first tnode need some special handling
1102 		 */
1103 
1104 		if (tp)
1105 			pos = tp->pos+tp->bits;
1106 		else
1107 			pos = 0;
1108 
1109 		if (n) {
1110 			newpos = tkey_mismatch(key, pos, n->key);
1111 			tn = tnode_new(n->key, newpos, 1);
1112 		} else {
1113 			newpos = 0;
1114 			tn = tnode_new(key, newpos, 1); /* First tnode */
1115 		}
1116 
1117 		if (!tn) {
1118 			free_leaf_info(li);
1119 			tnode_free((struct tnode *) l);
1120 			*err = -ENOMEM;
1121 			goto err;
1122 		}
1123 
1124 		node_set_parent((struct node *)tn, tp);
1125 
1126 		missbit = tkey_extract_bits(key, newpos, 1);
1127 		put_child(t, tn, missbit, (struct node *)l);
1128 		put_child(t, tn, 1-missbit, n);
1129 
1130 		if (tp) {
1131 			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1132 			put_child(t, (struct tnode *)tp, cindex, (struct node *)tn);
1133 		} else {
1134 			rcu_assign_pointer(t->trie, (struct node *)tn); /* First tnode */
1135 			tp = tn;
1136 		}
1137 	}
1138 
1139 	if (tp && tp->pos + tp->bits > 32)
1140 		printk(KERN_WARNING "fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1141 		       tp, tp->pos, tp->bits, key, plen);
1142 
1143 	/* Rebalance the trie */
1144 
1145 	rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
1146 done:
1147 	t->revision++;
1148 err:
1149 	return fa_head;
1150 }
1151 
1152 /*
1153  * Caller must hold RTNL.
1154  */
1155 static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg)
1156 {
1157 	struct trie *t = (struct trie *) tb->tb_data;
1158 	struct fib_alias *fa, *new_fa;
1159 	struct list_head *fa_head = NULL;
1160 	struct fib_info *fi;
1161 	int plen = cfg->fc_dst_len;
1162 	u8 tos = cfg->fc_tos;
1163 	u32 key, mask;
1164 	int err;
1165 	struct leaf *l;
1166 
1167 	if (plen > 32)
1168 		return -EINVAL;
1169 
1170 	key = ntohl(cfg->fc_dst);
1171 
1172 	pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
1173 
1174 	mask = ntohl(inet_make_mask(plen));
1175 
1176 	if (key & ~mask)
1177 		return -EINVAL;
1178 
1179 	key = key & mask;
1180 
1181 	fi = fib_create_info(cfg);
1182 	if (IS_ERR(fi)) {
1183 		err = PTR_ERR(fi);
1184 		goto err;
1185 	}
1186 
1187 	l = fib_find_node(t, key);
1188 	fa = NULL;
1189 
1190 	if (l) {
1191 		fa_head = get_fa_head(l, plen);
1192 		fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1193 	}
1194 
1195 	/* Now fa, if non-NULL, points to the first fib alias
1196 	 * with the same keys [prefix,tos,priority], if such key already
1197 	 * exists or to the node before which we will insert new one.
1198 	 *
1199 	 * If fa is NULL, we will need to allocate a new one and
1200 	 * insert to the head of f.
1201 	 *
1202 	 * If f is NULL, no fib node matched the destination key
1203 	 * and we need to allocate a new one of those as well.
1204 	 */
1205 
1206 	if (fa && fa->fa_info->fib_priority == fi->fib_priority) {
1207 		struct fib_alias *fa_orig;
1208 
1209 		err = -EEXIST;
1210 		if (cfg->fc_nlflags & NLM_F_EXCL)
1211 			goto out;
1212 
1213 		if (cfg->fc_nlflags & NLM_F_REPLACE) {
1214 			struct fib_info *fi_drop;
1215 			u8 state;
1216 
1217 			err = -ENOBUFS;
1218 			new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1219 			if (new_fa == NULL)
1220 				goto out;
1221 
1222 			fi_drop = fa->fa_info;
1223 			new_fa->fa_tos = fa->fa_tos;
1224 			new_fa->fa_info = fi;
1225 			new_fa->fa_type = cfg->fc_type;
1226 			new_fa->fa_scope = cfg->fc_scope;
1227 			state = fa->fa_state;
1228 			new_fa->fa_state &= ~FA_S_ACCESSED;
1229 
1230 			list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1231 			alias_free_mem_rcu(fa);
1232 
1233 			fib_release_info(fi_drop);
1234 			if (state & FA_S_ACCESSED)
1235 				rt_cache_flush(-1);
1236 			rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1237 				tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
1238 
1239 			goto succeeded;
1240 		}
1241 		/* Error if we find a perfect match which
1242 		 * uses the same scope, type, and nexthop
1243 		 * information.
1244 		 */
1245 		fa_orig = fa;
1246 		list_for_each_entry(fa, fa_orig->fa_list.prev, fa_list) {
1247 			if (fa->fa_tos != tos)
1248 				break;
1249 			if (fa->fa_info->fib_priority != fi->fib_priority)
1250 				break;
1251 			if (fa->fa_type == cfg->fc_type &&
1252 			    fa->fa_scope == cfg->fc_scope &&
1253 			    fa->fa_info == fi) {
1254 				goto out;
1255 			}
1256 		}
1257 		if (!(cfg->fc_nlflags & NLM_F_APPEND))
1258 			fa = fa_orig;
1259 	}
1260 	err = -ENOENT;
1261 	if (!(cfg->fc_nlflags & NLM_F_CREATE))
1262 		goto out;
1263 
1264 	err = -ENOBUFS;
1265 	new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1266 	if (new_fa == NULL)
1267 		goto out;
1268 
1269 	new_fa->fa_info = fi;
1270 	new_fa->fa_tos = tos;
1271 	new_fa->fa_type = cfg->fc_type;
1272 	new_fa->fa_scope = cfg->fc_scope;
1273 	new_fa->fa_state = 0;
1274 	/*
1275 	 * Insert new entry to the list.
1276 	 */
1277 
1278 	if (!fa_head) {
1279 		err = 0;
1280 		fa_head = fib_insert_node(t, &err, key, plen);
1281 		if (err)
1282 			goto out_free_new_fa;
1283 	}
1284 
1285 	list_add_tail_rcu(&new_fa->fa_list,
1286 			  (fa ? &fa->fa_list : fa_head));
1287 
1288 	rt_cache_flush(-1);
1289 	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
1290 		  &cfg->fc_nlinfo, 0);
1291 succeeded:
1292 	return 0;
1293 
1294 out_free_new_fa:
1295 	kmem_cache_free(fn_alias_kmem, new_fa);
1296 out:
1297 	fib_release_info(fi);
1298 err:
1299 	return err;
1300 }
1301 
1302 
1303 /* should be called with rcu_read_lock */
1304 static inline int check_leaf(struct trie *t, struct leaf *l,
1305 			     t_key key, int *plen, const struct flowi *flp,
1306 			     struct fib_result *res)
1307 {
1308 	int err, i;
1309 	__be32 mask;
1310 	struct leaf_info *li;
1311 	struct hlist_head *hhead = &l->list;
1312 	struct hlist_node *node;
1313 
1314 	hlist_for_each_entry_rcu(li, node, hhead, hlist) {
1315 		i = li->plen;
1316 		mask = inet_make_mask(i);
1317 		if (l->key != (key & ntohl(mask)))
1318 			continue;
1319 
1320 		if ((err = fib_semantic_match(&li->falh, flp, res, htonl(l->key), mask, i)) <= 0) {
1321 			*plen = i;
1322 #ifdef CONFIG_IP_FIB_TRIE_STATS
1323 			t->stats.semantic_match_passed++;
1324 #endif
1325 			return err;
1326 		}
1327 #ifdef CONFIG_IP_FIB_TRIE_STATS
1328 		t->stats.semantic_match_miss++;
1329 #endif
1330 	}
1331 	return 1;
1332 }
1333 
1334 static int
1335 fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1336 {
1337 	struct trie *t = (struct trie *) tb->tb_data;
1338 	int plen, ret = 0;
1339 	struct node *n;
1340 	struct tnode *pn;
1341 	int pos, bits;
1342 	t_key key = ntohl(flp->fl4_dst);
1343 	int chopped_off;
1344 	t_key cindex = 0;
1345 	int current_prefix_length = KEYLENGTH;
1346 	struct tnode *cn;
1347 	t_key node_prefix, key_prefix, pref_mismatch;
1348 	int mp;
1349 
1350 	rcu_read_lock();
1351 
1352 	n = rcu_dereference(t->trie);
1353 	if (!n)
1354 		goto failed;
1355 
1356 #ifdef CONFIG_IP_FIB_TRIE_STATS
1357 	t->stats.gets++;
1358 #endif
1359 
1360 	/* Just a leaf? */
1361 	if (IS_LEAF(n)) {
1362 		if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
1363 			goto found;
1364 		goto failed;
1365 	}
1366 	pn = (struct tnode *) n;
1367 	chopped_off = 0;
1368 
1369 	while (pn) {
1370 		pos = pn->pos;
1371 		bits = pn->bits;
1372 
1373 		if (!chopped_off)
1374 			cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
1375 						   pos, bits);
1376 
1377 		n = tnode_get_child(pn, cindex);
1378 
1379 		if (n == NULL) {
1380 #ifdef CONFIG_IP_FIB_TRIE_STATS
1381 			t->stats.null_node_hit++;
1382 #endif
1383 			goto backtrace;
1384 		}
1385 
1386 		if (IS_LEAF(n)) {
1387 			if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
1388 				goto found;
1389 			else
1390 				goto backtrace;
1391 		}
1392 
1393 #define HL_OPTIMIZE
1394 #ifdef HL_OPTIMIZE
1395 		cn = (struct tnode *)n;
1396 
1397 		/*
1398 		 * It's a tnode, and we can do some extra checks here if we
1399 		 * like, to avoid descending into a dead-end branch.
1400 		 * This tnode is in the parent's child array at index
1401 		 * key[p_pos..p_pos+p_bits] but potentially with some bits
1402 		 * chopped off, so in reality the index may be just a
1403 		 * subprefix, padded with zero at the end.
1404 		 * We can also take a look at any skipped bits in this
1405 		 * tnode - everything up to p_pos is supposed to be ok,
1406 		 * and the non-chopped bits of the index (se previous
1407 		 * paragraph) are also guaranteed ok, but the rest is
1408 		 * considered unknown.
1409 		 *
1410 		 * The skipped bits are key[pos+bits..cn->pos].
1411 		 */
1412 
1413 		/* If current_prefix_length < pos+bits, we are already doing
1414 		 * actual prefix  matching, which means everything from
1415 		 * pos+(bits-chopped_off) onward must be zero along some
1416 		 * branch of this subtree - otherwise there is *no* valid
1417 		 * prefix present. Here we can only check the skipped
1418 		 * bits. Remember, since we have already indexed into the
1419 		 * parent's child array, we know that the bits we chopped of
1420 		 * *are* zero.
1421 		 */
1422 
1423 		/* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */
1424 
1425 		if (current_prefix_length < pos+bits) {
1426 			if (tkey_extract_bits(cn->key, current_prefix_length,
1427 						cn->pos - current_prefix_length) != 0 ||
1428 			    !(cn->child[0]))
1429 				goto backtrace;
1430 		}
1431 
1432 		/*
1433 		 * If chopped_off=0, the index is fully validated and we
1434 		 * only need to look at the skipped bits for this, the new,
1435 		 * tnode. What we actually want to do is to find out if
1436 		 * these skipped bits match our key perfectly, or if we will
1437 		 * have to count on finding a matching prefix further down,
1438 		 * because if we do, we would like to have some way of
1439 		 * verifying the existence of such a prefix at this point.
1440 		 */
1441 
1442 		/* The only thing we can do at this point is to verify that
1443 		 * any such matching prefix can indeed be a prefix to our
1444 		 * key, and if the bits in the node we are inspecting that
1445 		 * do not match our key are not ZERO, this cannot be true.
1446 		 * Thus, find out where there is a mismatch (before cn->pos)
1447 		 * and verify that all the mismatching bits are zero in the
1448 		 * new tnode's key.
1449 		 */
1450 
1451 		/* Note: We aren't very concerned about the piece of the key
1452 		 * that precede pn->pos+pn->bits, since these have already been
1453 		 * checked. The bits after cn->pos aren't checked since these are
1454 		 * by definition "unknown" at this point. Thus, what we want to
1455 		 * see is if we are about to enter the "prefix matching" state,
1456 		 * and in that case verify that the skipped bits that will prevail
1457 		 * throughout this subtree are zero, as they have to be if we are
1458 		 * to find a matching prefix.
1459 		 */
1460 
1461 		node_prefix = mask_pfx(cn->key, cn->pos);
1462 		key_prefix = mask_pfx(key, cn->pos);
1463 		pref_mismatch = key_prefix^node_prefix;
1464 		mp = 0;
1465 
1466 		/* In short: If skipped bits in this node do not match the search
1467 		 * key, enter the "prefix matching" state.directly.
1468 		 */
1469 		if (pref_mismatch) {
1470 			while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1471 				mp++;
1472 				pref_mismatch = pref_mismatch <<1;
1473 			}
1474 			key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1475 
1476 			if (key_prefix != 0)
1477 				goto backtrace;
1478 
1479 			if (current_prefix_length >= cn->pos)
1480 				current_prefix_length = mp;
1481 		}
1482 #endif
1483 		pn = (struct tnode *)n; /* Descend */
1484 		chopped_off = 0;
1485 		continue;
1486 
1487 backtrace:
1488 		chopped_off++;
1489 
1490 		/* As zero don't change the child key (cindex) */
1491 		while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1))))
1492 			chopped_off++;
1493 
1494 		/* Decrease current_... with bits chopped off */
1495 		if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1496 			current_prefix_length = pn->pos + pn->bits - chopped_off;
1497 
1498 		/*
1499 		 * Either we do the actual chop off according or if we have
1500 		 * chopped off all bits in this tnode walk up to our parent.
1501 		 */
1502 
1503 		if (chopped_off <= pn->bits) {
1504 			cindex &= ~(1 << (chopped_off-1));
1505 		} else {
1506 			struct tnode *parent = node_parent((struct node *) pn);
1507 			if (!parent)
1508 				goto failed;
1509 
1510 			/* Get Child's index */
1511 			cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
1512 			pn = parent;
1513 			chopped_off = 0;
1514 
1515 #ifdef CONFIG_IP_FIB_TRIE_STATS
1516 			t->stats.backtrack++;
1517 #endif
1518 			goto backtrace;
1519 		}
1520 	}
1521 failed:
1522 	ret = 1;
1523 found:
1524 	rcu_read_unlock();
1525 	return ret;
1526 }
1527 
1528 /* only called from updater side */
1529 static int trie_leaf_remove(struct trie *t, t_key key)
1530 {
1531 	t_key cindex;
1532 	struct tnode *tp = NULL;
1533 	struct node *n = t->trie;
1534 	struct leaf *l;
1535 
1536 	pr_debug("entering trie_leaf_remove(%p)\n", n);
1537 
1538 	/* Note that in the case skipped bits, those bits are *not* checked!
1539 	 * When we finish this, we will have NULL or a T_LEAF, and the
1540 	 * T_LEAF may or may not match our key.
1541 	 */
1542 
1543 	while (n != NULL && IS_TNODE(n)) {
1544 		struct tnode *tn = (struct tnode *) n;
1545 		check_tnode(tn);
1546 		n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits));
1547 
1548 		BUG_ON(n && node_parent(n) != tn);
1549 	}
1550 	l = (struct leaf *) n;
1551 
1552 	if (!n || !tkey_equals(l->key, key))
1553 		return 0;
1554 
1555 	/*
1556 	 * Key found.
1557 	 * Remove the leaf and rebalance the tree
1558 	 */
1559 
1560 	t->revision++;
1561 	t->size--;
1562 
1563 	tp = node_parent(n);
1564 	tnode_free((struct tnode *) n);
1565 
1566 	if (tp) {
1567 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1568 		put_child(t, (struct tnode *)tp, cindex, NULL);
1569 		rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
1570 	} else
1571 		rcu_assign_pointer(t->trie, NULL);
1572 
1573 	return 1;
1574 }
1575 
1576 /*
1577  * Caller must hold RTNL.
1578  */
1579 static int fn_trie_delete(struct fib_table *tb, struct fib_config *cfg)
1580 {
1581 	struct trie *t = (struct trie *) tb->tb_data;
1582 	u32 key, mask;
1583 	int plen = cfg->fc_dst_len;
1584 	u8 tos = cfg->fc_tos;
1585 	struct fib_alias *fa, *fa_to_delete;
1586 	struct list_head *fa_head;
1587 	struct leaf *l;
1588 	struct leaf_info *li;
1589 
1590 	if (plen > 32)
1591 		return -EINVAL;
1592 
1593 	key = ntohl(cfg->fc_dst);
1594 	mask = ntohl(inet_make_mask(plen));
1595 
1596 	if (key & ~mask)
1597 		return -EINVAL;
1598 
1599 	key = key & mask;
1600 	l = fib_find_node(t, key);
1601 
1602 	if (!l)
1603 		return -ESRCH;
1604 
1605 	fa_head = get_fa_head(l, plen);
1606 	fa = fib_find_alias(fa_head, tos, 0);
1607 
1608 	if (!fa)
1609 		return -ESRCH;
1610 
1611 	pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1612 
1613 	fa_to_delete = NULL;
1614 	fa_head = fa->fa_list.prev;
1615 
1616 	list_for_each_entry(fa, fa_head, fa_list) {
1617 		struct fib_info *fi = fa->fa_info;
1618 
1619 		if (fa->fa_tos != tos)
1620 			break;
1621 
1622 		if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1623 		    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1624 		     fa->fa_scope == cfg->fc_scope) &&
1625 		    (!cfg->fc_protocol ||
1626 		     fi->fib_protocol == cfg->fc_protocol) &&
1627 		    fib_nh_match(cfg, fi) == 0) {
1628 			fa_to_delete = fa;
1629 			break;
1630 		}
1631 	}
1632 
1633 	if (!fa_to_delete)
1634 		return -ESRCH;
1635 
1636 	fa = fa_to_delete;
1637 	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
1638 		  &cfg->fc_nlinfo, 0);
1639 
1640 	l = fib_find_node(t, key);
1641 	li = find_leaf_info(l, plen);
1642 
1643 	list_del_rcu(&fa->fa_list);
1644 
1645 	if (list_empty(fa_head)) {
1646 		hlist_del_rcu(&li->hlist);
1647 		free_leaf_info(li);
1648 	}
1649 
1650 	if (hlist_empty(&l->list))
1651 		trie_leaf_remove(t, key);
1652 
1653 	if (fa->fa_state & FA_S_ACCESSED)
1654 		rt_cache_flush(-1);
1655 
1656 	fib_release_info(fa->fa_info);
1657 	alias_free_mem_rcu(fa);
1658 	return 0;
1659 }
1660 
1661 static int trie_flush_list(struct trie *t, struct list_head *head)
1662 {
1663 	struct fib_alias *fa, *fa_node;
1664 	int found = 0;
1665 
1666 	list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1667 		struct fib_info *fi = fa->fa_info;
1668 
1669 		if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1670 			list_del_rcu(&fa->fa_list);
1671 			fib_release_info(fa->fa_info);
1672 			alias_free_mem_rcu(fa);
1673 			found++;
1674 		}
1675 	}
1676 	return found;
1677 }
1678 
1679 static int trie_flush_leaf(struct trie *t, struct leaf *l)
1680 {
1681 	int found = 0;
1682 	struct hlist_head *lih = &l->list;
1683 	struct hlist_node *node, *tmp;
1684 	struct leaf_info *li = NULL;
1685 
1686 	hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1687 		found += trie_flush_list(t, &li->falh);
1688 
1689 		if (list_empty(&li->falh)) {
1690 			hlist_del_rcu(&li->hlist);
1691 			free_leaf_info(li);
1692 		}
1693 	}
1694 	return found;
1695 }
1696 
1697 /* rcu_read_lock needs to be hold by caller from readside */
1698 
1699 static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
1700 {
1701 	struct node *c = (struct node *) thisleaf;
1702 	struct tnode *p;
1703 	int idx;
1704 	struct node *trie = rcu_dereference(t->trie);
1705 
1706 	if (c == NULL) {
1707 		if (trie == NULL)
1708 			return NULL;
1709 
1710 		if (IS_LEAF(trie))          /* trie w. just a leaf */
1711 			return (struct leaf *) trie;
1712 
1713 		p = (struct tnode*) trie;  /* Start */
1714 	} else
1715 		p = node_parent(c);
1716 
1717 	while (p) {
1718 		int pos, last;
1719 
1720 		/*  Find the next child of the parent */
1721 		if (c)
1722 			pos = 1 + tkey_extract_bits(c->key, p->pos, p->bits);
1723 		else
1724 			pos = 0;
1725 
1726 		last = 1 << p->bits;
1727 		for (idx = pos; idx < last ; idx++) {
1728 			c = rcu_dereference(p->child[idx]);
1729 
1730 			if (!c)
1731 				continue;
1732 
1733 			/* Decend if tnode */
1734 			while (IS_TNODE(c)) {
1735 				p = (struct tnode *) c;
1736 				idx = 0;
1737 
1738 				/* Rightmost non-NULL branch */
1739 				if (p && IS_TNODE(p))
1740 					while (!(c = rcu_dereference(p->child[idx]))
1741 					       && idx < (1<<p->bits)) idx++;
1742 
1743 				/* Done with this tnode? */
1744 				if (idx >= (1 << p->bits) || !c)
1745 					goto up;
1746 			}
1747 			return (struct leaf *) c;
1748 		}
1749 up:
1750 		/* No more children go up one step  */
1751 		c = (struct node *) p;
1752 		p = node_parent(c);
1753 	}
1754 	return NULL; /* Ready. Root of trie */
1755 }
1756 
1757 /*
1758  * Caller must hold RTNL.
1759  */
1760 static int fn_trie_flush(struct fib_table *tb)
1761 {
1762 	struct trie *t = (struct trie *) tb->tb_data;
1763 	struct leaf *ll = NULL, *l = NULL;
1764 	int found = 0, h;
1765 
1766 	t->revision++;
1767 
1768 	for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
1769 		found += trie_flush_leaf(t, l);
1770 
1771 		if (ll && hlist_empty(&ll->list))
1772 			trie_leaf_remove(t, ll->key);
1773 		ll = l;
1774 	}
1775 
1776 	if (ll && hlist_empty(&ll->list))
1777 		trie_leaf_remove(t, ll->key);
1778 
1779 	pr_debug("trie_flush found=%d\n", found);
1780 	return found;
1781 }
1782 
1783 static int trie_last_dflt = -1;
1784 
1785 static void
1786 fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1787 {
1788 	struct trie *t = (struct trie *) tb->tb_data;
1789 	int order, last_idx;
1790 	struct fib_info *fi = NULL;
1791 	struct fib_info *last_resort;
1792 	struct fib_alias *fa = NULL;
1793 	struct list_head *fa_head;
1794 	struct leaf *l;
1795 
1796 	last_idx = -1;
1797 	last_resort = NULL;
1798 	order = -1;
1799 
1800 	rcu_read_lock();
1801 
1802 	l = fib_find_node(t, 0);
1803 	if (!l)
1804 		goto out;
1805 
1806 	fa_head = get_fa_head(l, 0);
1807 	if (!fa_head)
1808 		goto out;
1809 
1810 	if (list_empty(fa_head))
1811 		goto out;
1812 
1813 	list_for_each_entry_rcu(fa, fa_head, fa_list) {
1814 		struct fib_info *next_fi = fa->fa_info;
1815 
1816 		if (fa->fa_scope != res->scope ||
1817 		    fa->fa_type != RTN_UNICAST)
1818 			continue;
1819 
1820 		if (next_fi->fib_priority > res->fi->fib_priority)
1821 			break;
1822 		if (!next_fi->fib_nh[0].nh_gw ||
1823 		    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1824 			continue;
1825 		fa->fa_state |= FA_S_ACCESSED;
1826 
1827 		if (fi == NULL) {
1828 			if (next_fi != res->fi)
1829 				break;
1830 		} else if (!fib_detect_death(fi, order, &last_resort,
1831 					     &last_idx, &trie_last_dflt)) {
1832 			if (res->fi)
1833 				fib_info_put(res->fi);
1834 			res->fi = fi;
1835 			atomic_inc(&fi->fib_clntref);
1836 			trie_last_dflt = order;
1837 			goto out;
1838 		}
1839 		fi = next_fi;
1840 		order++;
1841 	}
1842 	if (order <= 0 || fi == NULL) {
1843 		trie_last_dflt = -1;
1844 		goto out;
1845 	}
1846 
1847 	if (!fib_detect_death(fi, order, &last_resort, &last_idx, &trie_last_dflt)) {
1848 		if (res->fi)
1849 			fib_info_put(res->fi);
1850 		res->fi = fi;
1851 		atomic_inc(&fi->fib_clntref);
1852 		trie_last_dflt = order;
1853 		goto out;
1854 	}
1855 	if (last_idx >= 0) {
1856 		if (res->fi)
1857 			fib_info_put(res->fi);
1858 		res->fi = last_resort;
1859 		if (last_resort)
1860 			atomic_inc(&last_resort->fib_clntref);
1861 	}
1862 	trie_last_dflt = last_idx;
1863  out:;
1864 	rcu_read_unlock();
1865 }
1866 
1867 static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb,
1868 			   struct sk_buff *skb, struct netlink_callback *cb)
1869 {
1870 	int i, s_i;
1871 	struct fib_alias *fa;
1872 
1873 	__be32 xkey = htonl(key);
1874 
1875 	s_i = cb->args[4];
1876 	i = 0;
1877 
1878 	/* rcu_read_lock is hold by caller */
1879 
1880 	list_for_each_entry_rcu(fa, fah, fa_list) {
1881 		if (i < s_i) {
1882 			i++;
1883 			continue;
1884 		}
1885 		BUG_ON(!fa->fa_info);
1886 
1887 		if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1888 				  cb->nlh->nlmsg_seq,
1889 				  RTM_NEWROUTE,
1890 				  tb->tb_id,
1891 				  fa->fa_type,
1892 				  fa->fa_scope,
1893 				  xkey,
1894 				  plen,
1895 				  fa->fa_tos,
1896 				  fa->fa_info, 0) < 0) {
1897 			cb->args[4] = i;
1898 			return -1;
1899 		}
1900 		i++;
1901 	}
1902 	cb->args[4] = i;
1903 	return skb->len;
1904 }
1905 
1906 static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb,
1907 			     struct netlink_callback *cb)
1908 {
1909 	int h, s_h;
1910 	struct list_head *fa_head;
1911 	struct leaf *l = NULL;
1912 
1913 	s_h = cb->args[3];
1914 
1915 	for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
1916 		if (h < s_h)
1917 			continue;
1918 		if (h > s_h)
1919 			memset(&cb->args[4], 0,
1920 			       sizeof(cb->args) - 4*sizeof(cb->args[0]));
1921 
1922 		fa_head = get_fa_head(l, plen);
1923 
1924 		if (!fa_head)
1925 			continue;
1926 
1927 		if (list_empty(fa_head))
1928 			continue;
1929 
1930 		if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) {
1931 			cb->args[3] = h;
1932 			return -1;
1933 		}
1934 	}
1935 	cb->args[3] = h;
1936 	return skb->len;
1937 }
1938 
1939 static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb)
1940 {
1941 	int m, s_m;
1942 	struct trie *t = (struct trie *) tb->tb_data;
1943 
1944 	s_m = cb->args[2];
1945 
1946 	rcu_read_lock();
1947 	for (m = 0; m <= 32; m++) {
1948 		if (m < s_m)
1949 			continue;
1950 		if (m > s_m)
1951 			memset(&cb->args[3], 0,
1952 				sizeof(cb->args) - 3*sizeof(cb->args[0]));
1953 
1954 		if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) {
1955 			cb->args[2] = m;
1956 			goto out;
1957 		}
1958 	}
1959 	rcu_read_unlock();
1960 	cb->args[2] = m;
1961 	return skb->len;
1962 out:
1963 	rcu_read_unlock();
1964 	return -1;
1965 }
1966 
1967 /* Fix more generic FIB names for init later */
1968 
1969 #ifdef CONFIG_IP_MULTIPLE_TABLES
1970 struct fib_table * fib_hash_init(u32 id)
1971 #else
1972 struct fib_table * __init fib_hash_init(u32 id)
1973 #endif
1974 {
1975 	struct fib_table *tb;
1976 	struct trie *t;
1977 
1978 	if (fn_alias_kmem == NULL)
1979 		fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1980 						  sizeof(struct fib_alias),
1981 						  0, SLAB_HWCACHE_ALIGN,
1982 						  NULL);
1983 
1984 	tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
1985 		     GFP_KERNEL);
1986 	if (tb == NULL)
1987 		return NULL;
1988 
1989 	tb->tb_id = id;
1990 	tb->tb_lookup = fn_trie_lookup;
1991 	tb->tb_insert = fn_trie_insert;
1992 	tb->tb_delete = fn_trie_delete;
1993 	tb->tb_flush = fn_trie_flush;
1994 	tb->tb_select_default = fn_trie_select_default;
1995 	tb->tb_dump = fn_trie_dump;
1996 	memset(tb->tb_data, 0, sizeof(struct trie));
1997 
1998 	t = (struct trie *) tb->tb_data;
1999 
2000 	trie_init(t);
2001 
2002 	if (id == RT_TABLE_LOCAL)
2003 		trie_local = t;
2004 	else if (id == RT_TABLE_MAIN)
2005 		trie_main = t;
2006 
2007 	if (id == RT_TABLE_LOCAL)
2008 		printk(KERN_INFO "IPv4 FIB: Using LC-trie version %s\n", VERSION);
2009 
2010 	return tb;
2011 }
2012 
2013 #ifdef CONFIG_PROC_FS
2014 /* Depth first Trie walk iterator */
2015 struct fib_trie_iter {
2016 	struct tnode *tnode;
2017 	struct trie *trie;
2018 	unsigned index;
2019 	unsigned depth;
2020 };
2021 
2022 static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
2023 {
2024 	struct tnode *tn = iter->tnode;
2025 	unsigned cindex = iter->index;
2026 	struct tnode *p;
2027 
2028 	/* A single entry routing table */
2029 	if (!tn)
2030 		return NULL;
2031 
2032 	pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
2033 		 iter->tnode, iter->index, iter->depth);
2034 rescan:
2035 	while (cindex < (1<<tn->bits)) {
2036 		struct node *n = tnode_get_child(tn, cindex);
2037 
2038 		if (n) {
2039 			if (IS_LEAF(n)) {
2040 				iter->tnode = tn;
2041 				iter->index = cindex + 1;
2042 			} else {
2043 				/* push down one level */
2044 				iter->tnode = (struct tnode *) n;
2045 				iter->index = 0;
2046 				++iter->depth;
2047 			}
2048 			return n;
2049 		}
2050 
2051 		++cindex;
2052 	}
2053 
2054 	/* Current node exhausted, pop back up */
2055 	p = node_parent((struct node *)tn);
2056 	if (p) {
2057 		cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
2058 		tn = p;
2059 		--iter->depth;
2060 		goto rescan;
2061 	}
2062 
2063 	/* got root? */
2064 	return NULL;
2065 }
2066 
2067 static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
2068 				       struct trie *t)
2069 {
2070 	struct node *n ;
2071 
2072 	if (!t)
2073 		return NULL;
2074 
2075 	n = rcu_dereference(t->trie);
2076 
2077 	if (!iter)
2078 		return NULL;
2079 
2080 	if (n) {
2081 		if (IS_TNODE(n)) {
2082 			iter->tnode = (struct tnode *) n;
2083 			iter->trie = t;
2084 			iter->index = 0;
2085 			iter->depth = 1;
2086 		} else {
2087 			iter->tnode = NULL;
2088 			iter->trie  = t;
2089 			iter->index = 0;
2090 			iter->depth = 0;
2091 		}
2092 		return n;
2093 	}
2094 	return NULL;
2095 }
2096 
2097 static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2098 {
2099 	struct node *n;
2100 	struct fib_trie_iter iter;
2101 
2102 	memset(s, 0, sizeof(*s));
2103 
2104 	rcu_read_lock();
2105 	for (n = fib_trie_get_first(&iter, t); n;
2106 	     n = fib_trie_get_next(&iter)) {
2107 		if (IS_LEAF(n)) {
2108 			s->leaves++;
2109 			s->totdepth += iter.depth;
2110 			if (iter.depth > s->maxdepth)
2111 				s->maxdepth = iter.depth;
2112 		} else {
2113 			const struct tnode *tn = (const struct tnode *) n;
2114 			int i;
2115 
2116 			s->tnodes++;
2117 			if (tn->bits < MAX_STAT_DEPTH)
2118 				s->nodesizes[tn->bits]++;
2119 
2120 			for (i = 0; i < (1<<tn->bits); i++)
2121 				if (!tn->child[i])
2122 					s->nullpointers++;
2123 		}
2124 	}
2125 	rcu_read_unlock();
2126 }
2127 
2128 /*
2129  *	This outputs /proc/net/fib_triestats
2130  */
2131 static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2132 {
2133 	unsigned i, max, pointers, bytes, avdepth;
2134 
2135 	if (stat->leaves)
2136 		avdepth = stat->totdepth*100 / stat->leaves;
2137 	else
2138 		avdepth = 0;
2139 
2140 	seq_printf(seq, "\tAver depth:     %d.%02d\n", avdepth / 100, avdepth % 100 );
2141 	seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);
2142 
2143 	seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
2144 
2145 	bytes = sizeof(struct leaf) * stat->leaves;
2146 	seq_printf(seq, "\tInternal nodes: %d\n\t", stat->tnodes);
2147 	bytes += sizeof(struct tnode) * stat->tnodes;
2148 
2149 	max = MAX_STAT_DEPTH;
2150 	while (max > 0 && stat->nodesizes[max-1] == 0)
2151 		max--;
2152 
2153 	pointers = 0;
2154 	for (i = 1; i <= max; i++)
2155 		if (stat->nodesizes[i] != 0) {
2156 			seq_printf(seq, "  %d: %d",  i, stat->nodesizes[i]);
2157 			pointers += (1<<i) * stat->nodesizes[i];
2158 		}
2159 	seq_putc(seq, '\n');
2160 	seq_printf(seq, "\tPointers: %d\n", pointers);
2161 
2162 	bytes += sizeof(struct node *) * pointers;
2163 	seq_printf(seq, "Null ptrs: %d\n", stat->nullpointers);
2164 	seq_printf(seq, "Total size: %d  kB\n", (bytes + 1023) / 1024);
2165 
2166 #ifdef CONFIG_IP_FIB_TRIE_STATS
2167 	seq_printf(seq, "Counters:\n---------\n");
2168 	seq_printf(seq,"gets = %d\n", t->stats.gets);
2169 	seq_printf(seq,"backtracks = %d\n", t->stats.backtrack);
2170 	seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed);
2171 	seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss);
2172 	seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit);
2173 	seq_printf(seq,"skipped node resize = %d\n", t->stats.resize_node_skipped);
2174 #ifdef CLEAR_STATS
2175 	memset(&(t->stats), 0, sizeof(t->stats));
2176 #endif
2177 #endif /*  CONFIG_IP_FIB_TRIE_STATS */
2178 }
2179 
2180 static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2181 {
2182 	struct trie_stat *stat;
2183 
2184 	stat = kmalloc(sizeof(*stat), GFP_KERNEL);
2185 	if (!stat)
2186 		return -ENOMEM;
2187 
2188 	seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n",
2189 		   sizeof(struct leaf), sizeof(struct tnode));
2190 
2191 	if (trie_local) {
2192 		seq_printf(seq, "Local:\n");
2193 		trie_collect_stats(trie_local, stat);
2194 		trie_show_stats(seq, stat);
2195 	}
2196 
2197 	if (trie_main) {
2198 		seq_printf(seq, "Main:\n");
2199 		trie_collect_stats(trie_main, stat);
2200 		trie_show_stats(seq, stat);
2201 	}
2202 	kfree(stat);
2203 
2204 	return 0;
2205 }
2206 
2207 static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2208 {
2209 	return single_open(file, fib_triestat_seq_show, NULL);
2210 }
2211 
2212 static const struct file_operations fib_triestat_fops = {
2213 	.owner	= THIS_MODULE,
2214 	.open	= fib_triestat_seq_open,
2215 	.read	= seq_read,
2216 	.llseek	= seq_lseek,
2217 	.release = single_release,
2218 };
2219 
2220 static struct node *fib_trie_get_idx(struct fib_trie_iter *iter,
2221 				      loff_t pos)
2222 {
2223 	loff_t idx = 0;
2224 	struct node *n;
2225 
2226 	for (n = fib_trie_get_first(iter, trie_local);
2227 	     n; ++idx, n = fib_trie_get_next(iter)) {
2228 		if (pos == idx)
2229 			return n;
2230 	}
2231 
2232 	for (n = fib_trie_get_first(iter, trie_main);
2233 	     n; ++idx, n = fib_trie_get_next(iter)) {
2234 		if (pos == idx)
2235 			return n;
2236 	}
2237 	return NULL;
2238 }
2239 
2240 static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2241 {
2242 	rcu_read_lock();
2243 	if (*pos == 0)
2244 		return SEQ_START_TOKEN;
2245 	return fib_trie_get_idx(seq->private, *pos - 1);
2246 }
2247 
2248 static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2249 {
2250 	struct fib_trie_iter *iter = seq->private;
2251 	void *l = v;
2252 
2253 	++*pos;
2254 	if (v == SEQ_START_TOKEN)
2255 		return fib_trie_get_idx(iter, 0);
2256 
2257 	v = fib_trie_get_next(iter);
2258 	BUG_ON(v == l);
2259 	if (v)
2260 		return v;
2261 
2262 	/* continue scan in next trie */
2263 	if (iter->trie == trie_local)
2264 		return fib_trie_get_first(iter, trie_main);
2265 
2266 	return NULL;
2267 }
2268 
2269 static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2270 {
2271 	rcu_read_unlock();
2272 }
2273 
2274 static void seq_indent(struct seq_file *seq, int n)
2275 {
2276 	while (n-- > 0) seq_puts(seq, "   ");
2277 }
2278 
2279 static inline const char *rtn_scope(enum rt_scope_t s)
2280 {
2281 	static char buf[32];
2282 
2283 	switch (s) {
2284 	case RT_SCOPE_UNIVERSE: return "universe";
2285 	case RT_SCOPE_SITE:	return "site";
2286 	case RT_SCOPE_LINK:	return "link";
2287 	case RT_SCOPE_HOST:	return "host";
2288 	case RT_SCOPE_NOWHERE:	return "nowhere";
2289 	default:
2290 		snprintf(buf, sizeof(buf), "scope=%d", s);
2291 		return buf;
2292 	}
2293 }
2294 
2295 static const char *rtn_type_names[__RTN_MAX] = {
2296 	[RTN_UNSPEC] = "UNSPEC",
2297 	[RTN_UNICAST] = "UNICAST",
2298 	[RTN_LOCAL] = "LOCAL",
2299 	[RTN_BROADCAST] = "BROADCAST",
2300 	[RTN_ANYCAST] = "ANYCAST",
2301 	[RTN_MULTICAST] = "MULTICAST",
2302 	[RTN_BLACKHOLE] = "BLACKHOLE",
2303 	[RTN_UNREACHABLE] = "UNREACHABLE",
2304 	[RTN_PROHIBIT] = "PROHIBIT",
2305 	[RTN_THROW] = "THROW",
2306 	[RTN_NAT] = "NAT",
2307 	[RTN_XRESOLVE] = "XRESOLVE",
2308 };
2309 
2310 static inline const char *rtn_type(unsigned t)
2311 {
2312 	static char buf[32];
2313 
2314 	if (t < __RTN_MAX && rtn_type_names[t])
2315 		return rtn_type_names[t];
2316 	snprintf(buf, sizeof(buf), "type %d", t);
2317 	return buf;
2318 }
2319 
2320 /* Pretty print the trie */
2321 static int fib_trie_seq_show(struct seq_file *seq, void *v)
2322 {
2323 	const struct fib_trie_iter *iter = seq->private;
2324 	struct node *n = v;
2325 
2326 	if (v == SEQ_START_TOKEN)
2327 		return 0;
2328 
2329 	if (!node_parent(n)) {
2330 		if (iter->trie == trie_local)
2331 			seq_puts(seq, "<local>:\n");
2332 		else
2333 			seq_puts(seq, "<main>:\n");
2334 	}
2335 
2336 	if (IS_TNODE(n)) {
2337 		struct tnode *tn = (struct tnode *) n;
2338 		__be32 prf = htonl(mask_pfx(tn->key, tn->pos));
2339 
2340 		seq_indent(seq, iter->depth-1);
2341 		seq_printf(seq, "  +-- %d.%d.%d.%d/%d %d %d %d\n",
2342 			   NIPQUAD(prf), tn->pos, tn->bits, tn->full_children,
2343 			   tn->empty_children);
2344 
2345 	} else {
2346 		struct leaf *l = (struct leaf *) n;
2347 		int i;
2348 		__be32 val = htonl(l->key);
2349 
2350 		seq_indent(seq, iter->depth);
2351 		seq_printf(seq, "  |-- %d.%d.%d.%d\n", NIPQUAD(val));
2352 		for (i = 32; i >= 0; i--) {
2353 			struct leaf_info *li = find_leaf_info(l, i);
2354 			if (li) {
2355 				struct fib_alias *fa;
2356 				list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2357 					seq_indent(seq, iter->depth+1);
2358 					seq_printf(seq, "  /%d %s %s", i,
2359 						   rtn_scope(fa->fa_scope),
2360 						   rtn_type(fa->fa_type));
2361 					if (fa->fa_tos)
2362 						seq_printf(seq, "tos =%d\n",
2363 							   fa->fa_tos);
2364 					seq_putc(seq, '\n');
2365 				}
2366 			}
2367 		}
2368 	}
2369 
2370 	return 0;
2371 }
2372 
2373 static const struct seq_operations fib_trie_seq_ops = {
2374 	.start  = fib_trie_seq_start,
2375 	.next   = fib_trie_seq_next,
2376 	.stop   = fib_trie_seq_stop,
2377 	.show   = fib_trie_seq_show,
2378 };
2379 
2380 static int fib_trie_seq_open(struct inode *inode, struct file *file)
2381 {
2382 	return seq_open_private(file, &fib_trie_seq_ops,
2383 			sizeof(struct fib_trie_iter));
2384 }
2385 
2386 static const struct file_operations fib_trie_fops = {
2387 	.owner  = THIS_MODULE,
2388 	.open   = fib_trie_seq_open,
2389 	.read   = seq_read,
2390 	.llseek = seq_lseek,
2391 	.release = seq_release_private,
2392 };
2393 
2394 static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2395 {
2396 	static unsigned type2flags[RTN_MAX + 1] = {
2397 		[7] = RTF_REJECT, [8] = RTF_REJECT,
2398 	};
2399 	unsigned flags = type2flags[type];
2400 
2401 	if (fi && fi->fib_nh->nh_gw)
2402 		flags |= RTF_GATEWAY;
2403 	if (mask == htonl(0xFFFFFFFF))
2404 		flags |= RTF_HOST;
2405 	flags |= RTF_UP;
2406 	return flags;
2407 }
2408 
2409 /*
2410  *	This outputs /proc/net/route.
2411  *	The format of the file is not supposed to be changed
2412  * 	and needs to be same as fib_hash output to avoid breaking
2413  *	legacy utilities
2414  */
2415 static int fib_route_seq_show(struct seq_file *seq, void *v)
2416 {
2417 	const struct fib_trie_iter *iter = seq->private;
2418 	struct leaf *l = v;
2419 	int i;
2420 	char bf[128];
2421 
2422 	if (v == SEQ_START_TOKEN) {
2423 		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2424 			   "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2425 			   "\tWindow\tIRTT");
2426 		return 0;
2427 	}
2428 
2429 	if (iter->trie == trie_local)
2430 		return 0;
2431 	if (IS_TNODE(l))
2432 		return 0;
2433 
2434 	for (i=32; i>=0; i--) {
2435 		struct leaf_info *li = find_leaf_info(l, i);
2436 		struct fib_alias *fa;
2437 		__be32 mask, prefix;
2438 
2439 		if (!li)
2440 			continue;
2441 
2442 		mask = inet_make_mask(li->plen);
2443 		prefix = htonl(l->key);
2444 
2445 		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2446 			const struct fib_info *fi = fa->fa_info;
2447 			unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
2448 
2449 			if (fa->fa_type == RTN_BROADCAST
2450 			    || fa->fa_type == RTN_MULTICAST)
2451 				continue;
2452 
2453 			if (fi)
2454 				snprintf(bf, sizeof(bf),
2455 					 "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
2456 					 fi->fib_dev ? fi->fib_dev->name : "*",
2457 					 prefix,
2458 					 fi->fib_nh->nh_gw, flags, 0, 0,
2459 					 fi->fib_priority,
2460 					 mask,
2461 					 (fi->fib_advmss ? fi->fib_advmss + 40 : 0),
2462 					 fi->fib_window,
2463 					 fi->fib_rtt >> 3);
2464 			else
2465 				snprintf(bf, sizeof(bf),
2466 					 "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
2467 					 prefix, 0, flags, 0, 0, 0,
2468 					 mask, 0, 0, 0);
2469 
2470 			seq_printf(seq, "%-127s\n", bf);
2471 		}
2472 	}
2473 
2474 	return 0;
2475 }
2476 
2477 static const struct seq_operations fib_route_seq_ops = {
2478 	.start  = fib_trie_seq_start,
2479 	.next   = fib_trie_seq_next,
2480 	.stop   = fib_trie_seq_stop,
2481 	.show   = fib_route_seq_show,
2482 };
2483 
2484 static int fib_route_seq_open(struct inode *inode, struct file *file)
2485 {
2486 	return seq_open_private(file, &fib_route_seq_ops,
2487 			sizeof(struct fib_trie_iter));
2488 }
2489 
2490 static const struct file_operations fib_route_fops = {
2491 	.owner  = THIS_MODULE,
2492 	.open   = fib_route_seq_open,
2493 	.read   = seq_read,
2494 	.llseek = seq_lseek,
2495 	.release = seq_release_private,
2496 };
2497 
2498 int __init fib_proc_init(void)
2499 {
2500 	if (!proc_net_fops_create(&init_net, "fib_trie", S_IRUGO, &fib_trie_fops))
2501 		goto out1;
2502 
2503 	if (!proc_net_fops_create(&init_net, "fib_triestat", S_IRUGO, &fib_triestat_fops))
2504 		goto out2;
2505 
2506 	if (!proc_net_fops_create(&init_net, "route", S_IRUGO, &fib_route_fops))
2507 		goto out3;
2508 
2509 	return 0;
2510 
2511 out3:
2512 	proc_net_remove(&init_net, "fib_triestat");
2513 out2:
2514 	proc_net_remove(&init_net, "fib_trie");
2515 out1:
2516 	return -ENOMEM;
2517 }
2518 
2519 void __init fib_proc_exit(void)
2520 {
2521 	proc_net_remove(&init_net, "fib_trie");
2522 	proc_net_remove(&init_net, "fib_triestat");
2523 	proc_net_remove(&init_net, "route");
2524 }
2525 
2526 #endif /* CONFIG_PROC_FS */
2527