xref: /openbmc/linux/net/ipv4/fib_trie.c (revision 87c2ce3b)
1 /*
2  *   This program is free software; you can redistribute it and/or
3  *   modify it under the terms of the GNU General Public License
4  *   as published by the Free Software Foundation; either version
5  *   2 of the License, or (at your option) any later version.
6  *
7  *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8  *     & Swedish University of Agricultural Sciences.
9  *
10  *   Jens Laas <jens.laas@data.slu.se> Swedish University of
11  *     Agricultural Sciences.
12  *
13  *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
14  *
15  * This work is based on the LPC-trie which is originally described in:
16  *
17  * An experimental study of compression methods for dynamic tries
18  * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19  * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
20  *
21  *
22  * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23  * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24  *
25  * Version:	$Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $
26  *
27  *
28  * Code from fib_hash has been reused which includes the following header:
29  *
30  *
31  * INET		An implementation of the TCP/IP protocol suite for the LINUX
32  *		operating system.  INET is implemented using the  BSD Socket
33  *		interface as the means of communication with the user level.
34  *
35  *		IPv4 FIB: lookup engine and maintenance routines.
36  *
37  *
38  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
39  *
40  *		This program is free software; you can redistribute it and/or
41  *		modify it under the terms of the GNU General Public License
42  *		as published by the Free Software Foundation; either version
43  *		2 of the License, or (at your option) any later version.
44  *
45  * Substantial contributions to this work come from:
46  *
47  *		David S. Miller, <davem@davemloft.net>
48  *		Stephen Hemminger <shemminger@osdl.org>
49  *		Paul E. McKenney <paulmck@us.ibm.com>
50  *		Patrick McHardy <kaber@trash.net>
51  */
52 
53 #define VERSION "0.404"
54 
55 #include <linux/config.h>
56 #include <asm/uaccess.h>
57 #include <asm/system.h>
58 #include <asm/bitops.h>
59 #include <linux/types.h>
60 #include <linux/kernel.h>
61 #include <linux/sched.h>
62 #include <linux/mm.h>
63 #include <linux/string.h>
64 #include <linux/socket.h>
65 #include <linux/sockios.h>
66 #include <linux/errno.h>
67 #include <linux/in.h>
68 #include <linux/inet.h>
69 #include <linux/inetdevice.h>
70 #include <linux/netdevice.h>
71 #include <linux/if_arp.h>
72 #include <linux/proc_fs.h>
73 #include <linux/rcupdate.h>
74 #include <linux/skbuff.h>
75 #include <linux/netlink.h>
76 #include <linux/init.h>
77 #include <linux/list.h>
78 #include <net/ip.h>
79 #include <net/protocol.h>
80 #include <net/route.h>
81 #include <net/tcp.h>
82 #include <net/sock.h>
83 #include <net/ip_fib.h>
84 #include "fib_lookup.h"
85 
86 #undef CONFIG_IP_FIB_TRIE_STATS
87 #define MAX_CHILDS 16384
88 
89 #define KEYLENGTH (8*sizeof(t_key))
90 #define MASK_PFX(k, l) (((l)==0)?0:(k >> (KEYLENGTH-l)) << (KEYLENGTH-l))
91 #define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH - bits) >> offset))
92 
93 typedef unsigned int t_key;
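/*
 * Worked example (editor's illustration, not part of the original source):
 * with t_key being a 32-bit unsigned int, KEYLENGTH is 32, so
 *
 *   MASK_PFX(0xC0A80102, 24) == 0xC0A80100  (keep the top 24 bits:
 *                                            192.168.1.2/24 -> 192.168.1.0)
 *   TKEY_GET_MASK(8, 4)      == 0x00F00000  (four one-bits at positions 8..11,
 *                                            counting bit 0 as the MSB)
 */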
94 
95 #define T_TNODE 0
96 #define T_LEAF  1
97 #define NODE_TYPE_MASK	0x1UL
98 #define NODE_PARENT(node) \
99 	((struct tnode *)rcu_dereference(((node)->parent & ~NODE_TYPE_MASK)))
100 
101 #define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
102 
103 #define NODE_SET_PARENT(node, ptr)		\
104 	rcu_assign_pointer((node)->parent,	\
105 			   ((unsigned long)(ptr)) | NODE_TYPE(node))
106 
107 #define IS_TNODE(n) (!(n->parent & T_LEAF))
108 #define IS_LEAF(n) (n->parent & T_LEAF)
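/*
 * Editor's note: struct leaf and struct tnode share their first two fields
 * (key, parent), and the low bit of ->parent doubles as the node type tag
 * (T_LEAF/T_TNODE); this works because nodes are allocated with at least
 * word alignment, so the low pointer bit is otherwise always zero.
 * NODE_PARENT() masks the tag off before using the pointer and
 * NODE_SET_PARENT() preserves it when re-parenting.
 */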
109 
110 struct node {
111 	t_key key;
112 	unsigned long parent;
113 };
114 
115 struct leaf {
116 	t_key key;
117 	unsigned long parent;
118 	struct hlist_head list;
119 	struct rcu_head rcu;
120 };
121 
122 struct leaf_info {
123 	struct hlist_node hlist;
124 	struct rcu_head rcu;
125 	int plen;
126 	struct list_head falh;
127 };
128 
129 struct tnode {
130 	t_key key;
131 	unsigned long parent;
132 	unsigned short pos:5;		/* 2log(KEYLENGTH) bits needed */
133 	unsigned short bits:5;		/* 2log(KEYLENGTH) bits needed */
134 	unsigned short full_children;	/* KEYLENGTH bits needed */
135 	unsigned short empty_children;	/* KEYLENGTH bits needed */
136 	struct rcu_head rcu;
137 	struct node *child[0];
138 };
139 
140 #ifdef CONFIG_IP_FIB_TRIE_STATS
141 struct trie_use_stats {
142 	unsigned int gets;
143 	unsigned int backtrack;
144 	unsigned int semantic_match_passed;
145 	unsigned int semantic_match_miss;
146 	unsigned int null_node_hit;
147 	unsigned int resize_node_skipped;
148 };
149 #endif
150 
151 struct trie_stat {
152 	unsigned int totdepth;
153 	unsigned int maxdepth;
154 	unsigned int tnodes;
155 	unsigned int leaves;
156 	unsigned int nullpointers;
157 	unsigned int nodesizes[MAX_CHILDS];
158 };
159 
160 struct trie {
161 	struct node *trie;
162 #ifdef CONFIG_IP_FIB_TRIE_STATS
163 	struct trie_use_stats stats;
164 #endif
165 	int size;
166 	unsigned int revision;
167 };
168 
169 static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
170 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
171 static struct node *resize(struct trie *t, struct tnode *tn);
172 static struct tnode *inflate(struct trie *t, struct tnode *tn);
173 static struct tnode *halve(struct trie *t, struct tnode *tn);
174 static void tnode_free(struct tnode *tn);
175 
176 static kmem_cache_t *fn_alias_kmem __read_mostly;
177 static struct trie *trie_local = NULL, *trie_main = NULL;
178 
179 
180 /* rcu_read_lock must be held by the caller on the read side */
181 
182 static inline struct node *tnode_get_child(struct tnode *tn, int i)
183 {
184 	BUG_ON(i >= 1 << tn->bits);
185 
186 	return rcu_dereference(tn->child[i]);
187 }
188 
189 static inline int tnode_child_length(const struct tnode *tn)
190 {
191 	return 1 << tn->bits;
192 }
193 
194 static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
195 {
196 	if (offset < KEYLENGTH)
197 		return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
198 	else
199 		return 0;
200 }
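/*
 * Worked example (editor's illustration): with KEYLENGTH == 32,
 * tkey_extract_bits(0xC0A80102, 8, 8) first shifts the key left by 8
 * (giving 0xA8010200) and then right by 24, yielding 0xA8 (168) - the
 * second octet of 192.168.1.2.
 */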
201 
202 static inline int tkey_equals(t_key a, t_key b)
203 {
204 	return a == b;
205 }
206 
207 static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
208 {
209 	if (bits == 0 || offset >= KEYLENGTH)
210 		return 1;
211 	bits = bits > KEYLENGTH ? KEYLENGTH : bits;
212 	return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
213 }
214 
215 static inline int tkey_mismatch(t_key a, int offset, t_key b)
216 {
217 	t_key diff = a ^ b;
218 	int i = offset;
219 
220 	if (!diff)
221 		return 0;
222 	while ((diff << i) >> (KEYLENGTH-1) == 0)
223 		i++;
224 	return i;
225 }
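/*
 * Worked example (editor's illustration): tkey_mismatch(0xC0A80000, 0,
 * 0xC0A90000) computes diff = 0x00010000 and returns 15, the position
 * (counting bit 0 as the MSB) of the first bit in which 192.168.0.0 and
 * 192.169.0.0 differ.
 */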
226 
227 /*
228   To understand this stuff, an understanding of keys and all their bits is
229   necessary. Every node in the trie has a key associated with it, but not
230   all of the bits in that key are significant.
231 
232   Consider a node 'n' and its parent 'tp'.
233 
234   If n is a leaf, every bit in its key is significant. Its presence is
235   necessitated by path compression, since during a tree traversal (when
236   searching for a leaf - unless we are doing an insertion) we will completely
237   ignore all skipped bits we encounter. Thus we need to verify, at the end of
238   a potentially successful search, that we have indeed been walking the
239   correct key path.
240 
241   Note that we can never "miss" the correct key in the tree if present by
242   following the wrong path. Path compression ensures that segments of the key
243   that are the same for all keys with a given prefix are skipped, but the
244   skipped part *is* identical for each node in the subtrie below the skipped
245   bit! fib_insert_node() in this implementation takes care of that - note the
246   call to tkey_sub_equals() in fib_insert_node().
247 
248   if n is an internal node - a 'tnode' here, the various parts of its key
249   have many different meanings.
250 
251   Example:
252   _________________________________________________________________
253   | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
254   -----------------------------------------------------------------
255     0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
256 
257   _________________________________________________________________
258   | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
259   -----------------------------------------------------------------
260    16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
261 
262   tp->pos = 7
263   tp->bits = 3
264   n->pos = 15
265   n->bits = 4
266 
267   First, let's just ignore the bits that come before the parent tp, that is
268   the bits from 0 to (tp->pos-1). They are *known* but at this point we do
269   not use them for anything.
270 
271   The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
272   index into the parent's child array. That is, they will be used to find
273   'n' among tp's children.
274 
275   The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
276   for the node n.
277 
278   All the bits we have seen so far are significant to the node n. The rest
279   of the bits are really not needed or indeed known in n->key.
280 
281   The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
282   n's child array, and will of course be different for each child.
283 
284 
285   The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
286   at this point.
287 
288 */
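/*
 * Editor's note, tying the figure above to the helper functions: with
 * tp->pos = 7 and tp->bits = 3, n is found at index
 * tkey_extract_bits(n->key, 7, 3) in tp's child array (the "N" bits);
 * with n->pos = 15 and n->bits = 4, descending from n uses
 * tkey_extract_bits(key, 15, 4) (the "C" bits); and the skipped "S" bits,
 * positions 10..14, are exactly the range that tkey_sub_equals() re-checks
 * while walking down during insertion.
 */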
289 
290 static inline void check_tnode(const struct tnode *tn)
291 {
292 	WARN_ON(tn && tn->pos+tn->bits > 32);
293 }
294 
295 static int halve_threshold = 25;
296 static int inflate_threshold = 50;
297 static int halve_threshold_root = 15;
298 static int inflate_threshold_root = 25;
299 
300 
301 static void __alias_free_mem(struct rcu_head *head)
302 {
303 	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
304 	kmem_cache_free(fn_alias_kmem, fa);
305 }
306 
307 static inline void alias_free_mem_rcu(struct fib_alias *fa)
308 {
309 	call_rcu(&fa->rcu, __alias_free_mem);
310 }
311 
312 static void __leaf_free_rcu(struct rcu_head *head)
313 {
314 	kfree(container_of(head, struct leaf, rcu));
315 }
316 
317 static inline void free_leaf(struct leaf *leaf)
318 {
319 	call_rcu(&leaf->rcu, __leaf_free_rcu);
320 }
321 
322 static void __leaf_info_free_rcu(struct rcu_head *head)
323 {
324 	kfree(container_of(head, struct leaf_info, rcu));
325 }
326 
327 static inline void free_leaf_info(struct leaf_info *leaf)
328 {
329 	call_rcu(&leaf->rcu, __leaf_info_free_rcu);
330 }
331 
332 static struct tnode *tnode_alloc(unsigned int size)
333 {
334 	struct page *pages;
335 
336 	if (size <= PAGE_SIZE)
337 		return kcalloc(size, 1, GFP_KERNEL);
338 
339 	pages = alloc_pages(GFP_KERNEL|__GFP_ZERO, get_order(size));
340 	if (!pages)
341 		return NULL;
342 
343 	return page_address(pages);
344 }
345 
346 static void __tnode_free_rcu(struct rcu_head *head)
347 {
348 	struct tnode *tn = container_of(head, struct tnode, rcu);
349 	unsigned int size = sizeof(struct tnode) +
350 		(1 << tn->bits) * sizeof(struct node *);
351 
352 	if (size <= PAGE_SIZE)
353 		kfree(tn);
354 	else
355 		free_pages((unsigned long)tn, get_order(size));
356 }
357 
358 static inline void tnode_free(struct tnode *tn)
359 {
360 	call_rcu(&tn->rcu, __tnode_free_rcu);
361 }
362 
363 static struct leaf *leaf_new(void)
364 {
365 	struct leaf *l = kmalloc(sizeof(struct leaf),  GFP_KERNEL);
366 	if (l) {
367 		l->parent = T_LEAF;
368 		INIT_HLIST_HEAD(&l->list);
369 	}
370 	return l;
371 }
372 
373 static struct leaf_info *leaf_info_new(int plen)
374 {
375 	struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
376 	if (li) {
377 		li->plen = plen;
378 		INIT_LIST_HEAD(&li->falh);
379 	}
380 	return li;
381 }
382 
383 static struct tnode* tnode_new(t_key key, int pos, int bits)
384 {
385 	int nchildren = 1<<bits;
386 	int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
387 	struct tnode *tn = tnode_alloc(sz);
388 
389 	if (tn) {
390 		memset(tn, 0, sz);
391 		tn->parent = T_TNODE;
392 		tn->pos = pos;
393 		tn->bits = bits;
394 		tn->key = key;
395 		tn->full_children = 0;
396 		tn->empty_children = 1<<bits;
397 	}
398 
399 	pr_debug("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
400 		 (unsigned int) (sizeof(struct node) * 1<<bits));
401 	return tn;
402 }
403 
404 /*
405  * Check whether the child 'n' of tnode 'tn' is "full", i.e. it is an
406  * internal node with no skipped bits. See discussion in dyntree paper p. 6
407  */
408 
409 static inline int tnode_full(const struct tnode *tn, const struct node *n)
410 {
411 	if (n == NULL || IS_LEAF(n))
412 		return 0;
413 
414 	return ((struct tnode *) n)->pos == tn->pos + tn->bits;
415 }
416 
417 static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n)
418 {
419 	tnode_put_child_reorg(tn, i, n, -1);
420 }
421 
422  /*
423   * Add a child at position i overwriting the old value.
424   * Update the value of full_children and empty_children.
425   */
426 
427 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull)
428 {
429 	struct node *chi = tn->child[i];
430 	int isfull;
431 
432 	BUG_ON(i >= 1<<tn->bits);
433 
434 
435 	/* update emptyChildren */
436 	if (n == NULL && chi != NULL)
437 		tn->empty_children++;
438 	else if (n != NULL && chi == NULL)
439 		tn->empty_children--;
440 
441 	/* update fullChildren */
442 	if (wasfull == -1)
443 		wasfull = tnode_full(tn, chi);
444 
445 	isfull = tnode_full(tn, n);
446 	if (wasfull && !isfull)
447 		tn->full_children--;
448 	else if (!wasfull && isfull)
449 		tn->full_children++;
450 
451 	if (n)
452 		NODE_SET_PARENT(n, tn);
453 
454 	rcu_assign_pointer(tn->child[i], n);
455 }
456 
457 static struct node *resize(struct trie *t, struct tnode *tn)
458 {
459 	int i;
460 	int err = 0;
461 	struct tnode *old_tn;
462 	int inflate_threshold_use;
463 	int halve_threshold_use;
464 
465  	if (!tn)
466 		return NULL;
467 
468 	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
469 		 tn, inflate_threshold, halve_threshold);
470 
471 	/* No children */
472 	if (tn->empty_children == tnode_child_length(tn)) {
473 		tnode_free(tn);
474 		return NULL;
475 	}
476 	/* One child */
477 	if (tn->empty_children == tnode_child_length(tn) - 1)
478 		for (i = 0; i < tnode_child_length(tn); i++) {
479 			struct node *n;
480 
481 			n = tn->child[i];
482 			if (!n)
483 				continue;
484 
485 			/* compress one level */
486 			NODE_SET_PARENT(n, NULL);
487 			tnode_free(tn);
488 			return n;
489 		}
490 	/*
491 	 * Double as long as the ratio of non-empty children in the
492 	 * resulting node stays above the threshold.
493 	 */
494 
495 	/*
496 	 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
497 	 * the Helsinki University of Technology and Matti Tikkanen of Nokia
498 	 * Telecommunications, page 6:
499 	 * "A node is doubled if the ratio of non-empty children to all
500 	 * children in the *doubled* node is at least 'high'."
501 	 *
502 	 * 'high' in this instance is the variable 'inflate_threshold'. It
503 	 * is expressed as a percentage, so we multiply it with
504 	 * tnode_child_length() and instead of multiplying by 2 (since the
505 	 * child array will be doubled by inflate()) and multiplying
506 	 * the left-hand side by 100 (to handle the percentage thing) we
507 	 * multiply the left-hand side by 50.
508 	 *
509 	 * The left-hand side may look a bit weird: tnode_child_length(tn)
510 	 * - tn->empty_children is of course the number of non-null children
511 	 * in the current node. tn->full_children is the number of "full"
512 	 * children, that is non-null tnodes with a skip value of 0.
513 	 * All of those will be doubled in the resulting inflated tnode, so
514 	 * we just count them one extra time here.
515 	 *
516 	 * A clearer way to write this would be:
517 	 *
518 	 * to_be_doubled = tn->full_children;
519 	 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
520 	 *     tn->full_children;
521 	 *
522 	 * new_child_length = tnode_child_length(tn) * 2;
523 	 *
524 	 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
525 	 *      new_child_length;
526 	 * if (new_fill_factor >= inflate_threshold)
527 	 *
528 	 * ...and so on, tho it would mess up the while () loop.
529 	 *
530 	 * anyway,
531 	 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
532 	 *      inflate_threshold
533 	 *
534 	 * avoid a division:
535 	 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
536 	 *      inflate_threshold * new_child_length
537 	 *
538 	 * expand not_to_be_doubled and to_be_doubled, and shorten:
539 	 * 100 * (tnode_child_length(tn) - tn->empty_children +
540 	 *    tn->full_children) >= inflate_threshold * new_child_length
541 	 *
542 	 * expand new_child_length:
543 	 * 100 * (tnode_child_length(tn) - tn->empty_children +
544 	 *    tn->full_children) >=
545 	 *      inflate_threshold * tnode_child_length(tn) * 2
546 	 *
547 	 * shorten again:
548 	 * 50 * (tn->full_children + tnode_child_length(tn) -
549 	 *    tn->empty_children) >= inflate_threshold *
550 	 *    tnode_child_length(tn)
551 	 *
552 	 */
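	/*
	 * Numeric example (editor's illustration): for a tnode with bits = 4
	 * (16 slots), empty_children = 4 and full_children = 6, and with
	 * inflate_threshold = 50, the test below reads
	 *   50 * (6 + 16 - 4) = 900  >=  50 * 16 = 800
	 * so the node is inflated; the doubled node would be 56% full
	 * (100 * (6 + 2*6) / 32), which is indeed at least 50%.
	 */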
553 
554 	check_tnode(tn);
555 
556 	/* Keep root node larger  */
557 
558 	if (!tn->parent)
559 		inflate_threshold_use = inflate_threshold_root;
560 	else
561 		inflate_threshold_use = inflate_threshold;
562 
563 	err = 0;
564 	while ((tn->full_children > 0 &&
565 	       50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
566 				inflate_threshold_use * tnode_child_length(tn))) {
567 
568 		old_tn = tn;
569 		tn = inflate(t, tn);
570 		if (IS_ERR(tn)) {
571 			tn = old_tn;
572 #ifdef CONFIG_IP_FIB_TRIE_STATS
573 			t->stats.resize_node_skipped++;
574 #endif
575 			break;
576 		}
577 	}
578 
579 	check_tnode(tn);
580 
581 	/*
582 	 * Halve as long as the number of empty children in this
583 	 * node is above the threshold.
584 	 */
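	/*
	 * Numeric example (editor's illustration): for a tnode with bits = 4
	 * (16 slots) and empty_children = 13, and with halve_threshold = 25,
	 *   100 * (16 - 13) = 300  <  25 * 16 = 400
	 * so the node is halved - it is only about 19% full, below the 25% floor.
	 */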
585 
586 
587 	/* Keep root node larger  */
588 
589 	if (!tn->parent)
590 		halve_threshold_use = halve_threshold_root;
591 	else
592 		halve_threshold_use = halve_threshold;
593 
594 	err = 0;
595 	while (tn->bits > 1 &&
596 	       100 * (tnode_child_length(tn) - tn->empty_children) <
597 	       halve_threshold_use * tnode_child_length(tn)) {
598 
599 		old_tn = tn;
600 		tn = halve(t, tn);
601 		if (IS_ERR(tn)) {
602 			tn = old_tn;
603 #ifdef CONFIG_IP_FIB_TRIE_STATS
604 			t->stats.resize_node_skipped++;
605 #endif
606 			break;
607 		}
608 	}
609 
610 
611 	/* Only one child remains */
612 	if (tn->empty_children == tnode_child_length(tn) - 1)
613 		for (i = 0; i < tnode_child_length(tn); i++) {
614 			struct node *n;
615 
616 			n = tn->child[i];
617 			if (!n)
618 				continue;
619 
620 			/* compress one level */
621 
622 			NODE_SET_PARENT(n, NULL);
623 			tnode_free(tn);
624 			return n;
625 		}
626 
627 	return (struct node *) tn;
628 }
629 
630 static struct tnode *inflate(struct trie *t, struct tnode *tn)
631 {
632 	struct tnode *inode;
633 	struct tnode *oldtnode = tn;
634 	int olen = tnode_child_length(tn);
635 	int i;
636 
637 	pr_debug("In inflate\n");
638 
639 	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
640 
641 	if (!tn)
642 		return ERR_PTR(-ENOMEM);
643 
644 	/*
645 	 * Preallocate and store tnodes before the actual work so we
646 	 * don't get into an inconsistent state if memory allocation
647 	 * fails. In case of failure we return the old node and the
648 	 * inflate of the tnode is ignored.
649 	 */
650 
651 	for (i = 0; i < olen; i++) {
652 		struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i);
653 
654 		if (inode &&
655 		    IS_TNODE(inode) &&
656 		    inode->pos == oldtnode->pos + oldtnode->bits &&
657 		    inode->bits > 1) {
658 			struct tnode *left, *right;
659 			t_key m = TKEY_GET_MASK(inode->pos, 1);
660 
661 			left = tnode_new(inode->key&(~m), inode->pos + 1,
662 					 inode->bits - 1);
663 			if (!left)
664 				goto nomem;
665 
666 			right = tnode_new(inode->key|m, inode->pos + 1,
667 					  inode->bits - 1);
668 
669 			if (!right) {
670 				tnode_free(left);
671 				goto nomem;
672 			}
673 
674 			put_child(t, tn, 2*i, (struct node *) left);
675 			put_child(t, tn, 2*i+1, (struct node *) right);
676 		}
677 	}
678 
679 	for (i = 0; i < olen; i++) {
680 		struct node *node = tnode_get_child(oldtnode, i);
681 		struct tnode *left, *right;
682 		int size, j;
683 
684 		/* An empty child */
685 		if (node == NULL)
686 			continue;
687 
688 		/* A leaf or an internal node with skipped bits */
689 
690 		if (IS_LEAF(node) || ((struct tnode *) node)->pos >
691 		   tn->pos + tn->bits - 1) {
692 			if (tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits,
693 					     1) == 0)
694 				put_child(t, tn, 2*i, node);
695 			else
696 				put_child(t, tn, 2*i+1, node);
697 			continue;
698 		}
699 
700 		/* An internal node with two children */
701 		inode = (struct tnode *) node;
702 
703 		if (inode->bits == 1) {
704 			put_child(t, tn, 2*i, inode->child[0]);
705 			put_child(t, tn, 2*i+1, inode->child[1]);
706 
707 			tnode_free(inode);
708 			continue;
709 		}
710 
711 		/* An internal node with more than two children */
712 
713 		/* We will replace this node 'inode' with two new
714 		 * ones, 'left' and 'right', each with half of the
715 		 * original children. The two new nodes will have
716 		 * a position one bit further down the key and this
717 		 * means that the "significant" part of their keys
718 		 * (see the discussion near the top of this file)
719 		 * will differ by one bit, which will be "0" in
720 		 * left's key and "1" in right's key. Since we are
721 		 * moving the key position by one step, the bit that
722 		 * we are moving away from - the bit at position
723 		 * (inode->pos) - is the one that will differ between
724 		 * left and right. So... we synthesize that bit in the
725 		 * two  new keys.
726 		 * The mask 'm' below will be a single "one" bit at
727 		 * the position (inode->pos)
728 		 */
729 
730 		/* Use the old key, but set the new significant
731 		 *   bit to zero.
732 		 */
733 
734 		left = (struct tnode *) tnode_get_child(tn, 2*i);
735 		put_child(t, tn, 2*i, NULL);
736 
737 		BUG_ON(!left);
738 
739 		right = (struct tnode *) tnode_get_child(tn, 2*i+1);
740 		put_child(t, tn, 2*i+1, NULL);
741 
742 		BUG_ON(!right);
743 
744 		size = tnode_child_length(left);
745 		for (j = 0; j < size; j++) {
746 			put_child(t, left, j, inode->child[j]);
747 			put_child(t, right, j, inode->child[j + size]);
748 		}
749 		put_child(t, tn, 2*i, resize(t, left));
750 		put_child(t, tn, 2*i+1, resize(t, right));
751 
752 		tnode_free(inode);
753 	}
754 	tnode_free(oldtnode);
755 	return tn;
756 nomem:
757 	{
758 		int size = tnode_child_length(tn);
759 		int j;
760 
761 		for (j = 0; j < size; j++)
762 			if (tn->child[j])
763 				tnode_free((struct tnode *)tn->child[j]);
764 
765 		tnode_free(tn);
766 
767 		return ERR_PTR(-ENOMEM);
768 	}
769 }
770 
771 static struct tnode *halve(struct trie *t, struct tnode *tn)
772 {
773 	struct tnode *oldtnode = tn;
774 	struct node *left, *right;
775 	int i;
776 	int olen = tnode_child_length(tn);
777 
778 	pr_debug("In halve\n");
779 
780 	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
781 
782 	if (!tn)
783 		return ERR_PTR(-ENOMEM);
784 
785 	/*
786 	 * Preallocate and store tnodes before the actual work so we
787 	 * don't get into an inconsistent state if memory allocation
788 	 * fails. In case of failure we return the old node and the
789 	 * halve of the tnode is ignored.
790 	 */
791 
792 	for (i = 0; i < olen; i += 2) {
793 		left = tnode_get_child(oldtnode, i);
794 		right = tnode_get_child(oldtnode, i+1);
795 
796 		/* Two nonempty children */
797 		if (left && right) {
798 			struct tnode *newn;
799 
800 			newn = tnode_new(left->key, tn->pos + tn->bits, 1);
801 
802 			if (!newn)
803 				goto nomem;
804 
805 			put_child(t, tn, i/2, (struct node *)newn);
806 		}
807 
808 	}
809 
810 	for (i = 0; i < olen; i += 2) {
811 		struct tnode *newBinNode;
812 
813 		left = tnode_get_child(oldtnode, i);
814 		right = tnode_get_child(oldtnode, i+1);
815 
816 		/* At least one of the children is empty */
817 		if (left == NULL) {
818 			if (right == NULL)    /* Both are empty */
819 				continue;
820 			put_child(t, tn, i/2, right);
821 			continue;
822 		}
823 
824 		if (right == NULL) {
825 			put_child(t, tn, i/2, left);
826 			continue;
827 		}
828 
829 		/* Two nonempty children */
830 		newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
831 		put_child(t, tn, i/2, NULL);
832 		put_child(t, newBinNode, 0, left);
833 		put_child(t, newBinNode, 1, right);
834 		put_child(t, tn, i/2, resize(t, newBinNode));
835 	}
836 	tnode_free(oldtnode);
837 	return tn;
838 nomem:
839 	{
840 		int size = tnode_child_length(tn);
841 		int j;
842 
843 		for (j = 0; j < size; j++)
844 			if (tn->child[j])
845 				tnode_free((struct tnode *)tn->child[j]);
846 
847 		tnode_free(tn);
848 
849 		return ERR_PTR(-ENOMEM);
850 	}
851 }
852 
853 static void trie_init(struct trie *t)
854 {
855 	if (!t)
856 		return;
857 
858 	t->size = 0;
859 	rcu_assign_pointer(t->trie, NULL);
860 	t->revision = 0;
861 #ifdef CONFIG_IP_FIB_TRIE_STATS
862 	memset(&t->stats, 0, sizeof(struct trie_use_stats));
863 #endif
864 }
865 
866 /* The read side must hold rcu_read_lock; currently the dump routines
867    use this via get_fa_head and dump */
868 
869 static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
870 {
871 	struct hlist_head *head = &l->list;
872 	struct hlist_node *node;
873 	struct leaf_info *li;
874 
875 	hlist_for_each_entry_rcu(li, node, head, hlist)
876 		if (li->plen == plen)
877 			return li;
878 
879 	return NULL;
880 }
881 
882 static inline struct list_head * get_fa_head(struct leaf *l, int plen)
883 {
884 	struct leaf_info *li = find_leaf_info(l, plen);
885 
886 	if (!li)
887 		return NULL;
888 
889 	return &li->falh;
890 }
891 
892 static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
893 {
894 	struct leaf_info *li = NULL, *last = NULL;
895 	struct hlist_node *node;
896 
897 	if (hlist_empty(head)) {
898 		hlist_add_head_rcu(&new->hlist, head);
899 	} else {
900 		hlist_for_each_entry(li, node, head, hlist) {
901 			if (new->plen > li->plen)
902 				break;
903 
904 			last = li;
905 		}
906 		if (last)
907 			hlist_add_after_rcu(&last->hlist, &new->hlist);
908 		else
909 			hlist_add_before_rcu(&new->hlist, &li->hlist);
910 	}
911 }
912 
913 /* rcu_read_lock must be held by the caller on the read side */
914 
915 static struct leaf *
916 fib_find_node(struct trie *t, u32 key)
917 {
918 	int pos;
919 	struct tnode *tn;
920 	struct node *n;
921 
922 	pos = 0;
923 	n = rcu_dereference(t->trie);
924 
925 	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
926 		tn = (struct tnode *) n;
927 
928 		check_tnode(tn);
929 
930 		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
931 			pos = tn->pos + tn->bits;
932 			n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
933 		} else
934 			break;
935 	}
936 	/* Case we have found a leaf. Compare prefixes */
937 
938 	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
939 		return (struct leaf *)n;
940 
941 	return NULL;
942 }
943 
944 static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
945 {
946 	int wasfull;
947 	t_key cindex, key;
948 	struct tnode *tp = NULL;
949 
950 	key = tn->key;
951 
952 	while (tn != NULL && NODE_PARENT(tn) != NULL) {
953 
954 		tp = NODE_PARENT(tn);
955 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
956 		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
957 		tn = (struct tnode *) resize(t, (struct tnode *)tn);
958 		tnode_put_child_reorg((struct tnode *)tp, cindex, (struct node *)tn, wasfull);
959 
960 		if (!NODE_PARENT(tn))
961 			break;
962 
963 		tn = NODE_PARENT(tn);
964 	}
965 	/* Handle last (top) tnode */
966 	if (IS_TNODE(tn))
967 		tn = (struct tnode*) resize(t, (struct tnode *)tn);
968 
969 	return (struct node*) tn;
970 }
971 
972 /* only used from updater-side */
973 
974 static  struct list_head *
975 fib_insert_node(struct trie *t, int *err, u32 key, int plen)
976 {
977 	int pos, newpos;
978 	struct tnode *tp = NULL, *tn = NULL;
979 	struct node *n;
980 	struct leaf *l;
981 	int missbit;
982 	struct list_head *fa_head = NULL;
983 	struct leaf_info *li;
984 	t_key cindex;
985 
986 	pos = 0;
987 	n = t->trie;
988 
989 	/* If we point to NULL, stop. Either the tree is empty and we should
990 	 * just put a new leaf in it, or we have reached an empty child slot,
991 	 * and we should just put our new leaf in that.
992 	 * If we point to a T_TNODE, check if it matches our key. Note that
993 	 * a T_TNODE might be skipping any number of bits - its 'pos' need
994 	 * not be the parent's 'pos'+'bits'!
995 	 *
996 	 * If it does match the current key, get pos/bits from it, extract
997 	 * the index from our key, push the T_TNODE and walk the tree.
998 	 *
999 	 * If it doesn't, we have to replace it with a new T_TNODE.
1000 	 *
1001 	 * If we point to a T_LEAF, it might or might not have the same key
1002 	 * as we do. If it does, just update the T_LEAF's value and
1003 	 * return it.
1004 	 * If it doesn't, we need to replace it with a T_TNODE.
1005 	 */
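	/*
	 * Worked example (editor's illustration): suppose the trie holds a
	 * single leaf with key 0xC0A80100 (192.168.1.0) and we insert key
	 * 0x0A000000 (10.0.0.0).  The two keys first differ at bit 0 (the
	 * MSB), so tkey_mismatch() returns 0 and a new tnode with pos = 0,
	 * bits = 1 is created; missbit = tkey_extract_bits(0x0A000000, 0, 1)
	 * is 0, so the new leaf goes into child[0] and the old leaf into
	 * child[1].
	 */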
1006 
1007 	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
1008 		tn = (struct tnode *) n;
1009 
1010 		check_tnode(tn);
1011 
1012 		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
1013 			tp = tn;
1014 			pos = tn->pos + tn->bits;
1015 			n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
1016 
1017 			BUG_ON(n && NODE_PARENT(n) != tn);
1018 		} else
1019 			break;
1020 	}
1021 
1022 	/*
1023 	 * n  ----> NULL, LEAF or TNODE
1024 	 *
1025 	 * tp is n's (parent) ----> NULL or TNODE
1026 	 */
1027 
1028 	BUG_ON(tp && IS_LEAF(tp));
1029 
1030 	/* Case 1: n is a leaf. Compare prefixes */
1031 
1032 	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1033 		struct leaf *l = (struct leaf *) n;
1034 
1035 		li = leaf_info_new(plen);
1036 
1037 		if (!li) {
1038 			*err = -ENOMEM;
1039 			goto err;
1040 		}
1041 
1042 		fa_head = &li->falh;
1043 		insert_leaf_info(&l->list, li);
1044 		goto done;
1045 	}
1046 	t->size++;
1047 	l = leaf_new();
1048 
1049 	if (!l) {
1050 		*err = -ENOMEM;
1051 		goto err;
1052 	}
1053 
1054 	l->key = key;
1055 	li = leaf_info_new(plen);
1056 
1057 	if (!li) {
1058 		tnode_free((struct tnode *) l);
1059 		*err = -ENOMEM;
1060 		goto err;
1061 	}
1062 
1063 	fa_head = &li->falh;
1064 	insert_leaf_info(&l->list, li);
1065 
1066 	if (t->trie && n == NULL) {
1067 		/* Case 2: n is NULL, and will just insert a new leaf */
1068 
1069 		NODE_SET_PARENT(l, tp);
1070 
1071 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1072 		put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
1073 	} else {
1074 		/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1075 		/*
1076 		 *  Add a new tnode here
1077 		 *  the first tnode needs some special handling
1078 		 */
1079 
1080 		if (tp)
1081 			pos = tp->pos+tp->bits;
1082 		else
1083 			pos = 0;
1084 
1085 		if (n) {
1086 			newpos = tkey_mismatch(key, pos, n->key);
1087 			tn = tnode_new(n->key, newpos, 1);
1088 		} else {
1089 			newpos = 0;
1090 			tn = tnode_new(key, newpos, 1); /* First tnode */
1091 		}
1092 
1093 		if (!tn) {
1094 			free_leaf_info(li);
1095 			tnode_free((struct tnode *) l);
1096 			*err = -ENOMEM;
1097 			goto err;
1098 		}
1099 
1100 		NODE_SET_PARENT(tn, tp);
1101 
1102 		missbit = tkey_extract_bits(key, newpos, 1);
1103 		put_child(t, tn, missbit, (struct node *)l);
1104 		put_child(t, tn, 1-missbit, n);
1105 
1106 		if (tp) {
1107 			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1108 			put_child(t, (struct tnode *)tp, cindex, (struct node *)tn);
1109 		} else {
1110 			rcu_assign_pointer(t->trie, (struct node *)tn); /* First tnode */
1111 			tp = tn;
1112 		}
1113 	}
1114 
1115 	if (tp && tp->pos + tp->bits > 32)
1116 		printk(KERN_WARNING "fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1117 		       tp, tp->pos, tp->bits, key, plen);
1118 
1119 	/* Rebalance the trie */
1120 
1121 	rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
1122 done:
1123 	t->revision++;
1124 err:
1125 	return fa_head;
1126 }
1127 
1128 static int
1129 fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1130 	       struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
1131 {
1132 	struct trie *t = (struct trie *) tb->tb_data;
1133 	struct fib_alias *fa, *new_fa;
1134 	struct list_head *fa_head = NULL;
1135 	struct fib_info *fi;
1136 	int plen = r->rtm_dst_len;
1137 	int type = r->rtm_type;
1138 	u8 tos = r->rtm_tos;
1139 	u32 key, mask;
1140 	int err;
1141 	struct leaf *l;
1142 
1143 	if (plen > 32)
1144 		return -EINVAL;
1145 
1146 	key = 0;
1147 	if (rta->rta_dst)
1148 		memcpy(&key, rta->rta_dst, 4);
1149 
1150 	key = ntohl(key);
1151 
1152 	pr_debug("Insert table=%d %08x/%d\n", tb->tb_id, key, plen);
1153 
1154 	mask = ntohl(inet_make_mask(plen));
1155 
1156 	if (key & ~mask)
1157 		return -EINVAL;
1158 
1159 	key = key & mask;
1160 
1161 	fi = fib_create_info(r, rta, nlhdr, &err);
1162 
1163 	if (!fi)
1164 		goto err;
1165 
1166 	l = fib_find_node(t, key);
1167 	fa = NULL;
1168 
1169 	if (l) {
1170 		fa_head = get_fa_head(l, plen);
1171 		fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1172 	}
1173 
1174 	/* Now fa, if non-NULL, points to the first fib alias
1175 	 * with the same keys [prefix,tos,priority], if such key already
1176 	 * exists or to the node before which we will insert the new one.
1177 	 *
1178 	 * If fa is NULL, we will need to allocate a new one and
1179 	 * insert it at the head of f.
1180 	 *
1181 	 * If f is NULL, no fib node matched the destination key
1182 	 * and we need to allocate a new one of those as well.
1183 	 */
1184 
1185 	if (fa && fa->fa_info->fib_priority == fi->fib_priority) {
1186 		struct fib_alias *fa_orig;
1187 
1188 		err = -EEXIST;
1189 		if (nlhdr->nlmsg_flags & NLM_F_EXCL)
1190 			goto out;
1191 
1192 		if (nlhdr->nlmsg_flags & NLM_F_REPLACE) {
1193 			struct fib_info *fi_drop;
1194 			u8 state;
1195 
1196 			err = -ENOBUFS;
1197 			new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL);
1198 			if (new_fa == NULL)
1199 				goto out;
1200 
1201 			fi_drop = fa->fa_info;
1202 			new_fa->fa_tos = fa->fa_tos;
1203 			new_fa->fa_info = fi;
1204 			new_fa->fa_type = type;
1205 			new_fa->fa_scope = r->rtm_scope;
1206 			state = fa->fa_state;
1207 			new_fa->fa_state &= ~FA_S_ACCESSED;
1208 
1209 			list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1210 			alias_free_mem_rcu(fa);
1211 
1212 			fib_release_info(fi_drop);
1213 			if (state & FA_S_ACCESSED)
1214 				rt_cache_flush(-1);
1215 
1216 			goto succeeded;
1217 		}
1218 		/* Error if we find a perfect match which
1219 		 * uses the same scope, type, and nexthop
1220 		 * information.
1221 		 */
1222 		fa_orig = fa;
1223 		list_for_each_entry(fa, fa_orig->fa_list.prev, fa_list) {
1224 			if (fa->fa_tos != tos)
1225 				break;
1226 			if (fa->fa_info->fib_priority != fi->fib_priority)
1227 				break;
1228 			if (fa->fa_type == type &&
1229 			    fa->fa_scope == r->rtm_scope &&
1230 			    fa->fa_info == fi) {
1231 				goto out;
1232 			}
1233 		}
1234 		if (!(nlhdr->nlmsg_flags & NLM_F_APPEND))
1235 			fa = fa_orig;
1236 	}
1237 	err = -ENOENT;
1238 	if (!(nlhdr->nlmsg_flags & NLM_F_CREATE))
1239 		goto out;
1240 
1241 	err = -ENOBUFS;
1242 	new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL);
1243 	if (new_fa == NULL)
1244 		goto out;
1245 
1246 	new_fa->fa_info = fi;
1247 	new_fa->fa_tos = tos;
1248 	new_fa->fa_type = type;
1249 	new_fa->fa_scope = r->rtm_scope;
1250 	new_fa->fa_state = 0;
1251 	/*
1252 	 * Insert new entry to the list.
1253 	 */
1254 
1255 	if (!fa_head) {
1256 		err = 0;
1257 		fa_head = fib_insert_node(t, &err, key, plen);
1258 		if (err)
1259 			goto out_free_new_fa;
1260 	}
1261 
1262 	list_add_tail_rcu(&new_fa->fa_list,
1263 			  (fa ? &fa->fa_list : fa_head));
1264 
1265 	rt_cache_flush(-1);
1266 	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, req);
1267 succeeded:
1268 	return 0;
1269 
1270 out_free_new_fa:
1271 	kmem_cache_free(fn_alias_kmem, new_fa);
1272 out:
1273 	fib_release_info(fi);
1274 err:
1275 	return err;
1276 }
1277 
1278 
1279 /* should be called with rcu_read_lock */
1280 static inline int check_leaf(struct trie *t, struct leaf *l,
1281 			     t_key key, int *plen, const struct flowi *flp,
1282 			     struct fib_result *res)
1283 {
1284 	int err, i;
1285 	t_key mask;
1286 	struct leaf_info *li;
1287 	struct hlist_head *hhead = &l->list;
1288 	struct hlist_node *node;
1289 
1290 	hlist_for_each_entry_rcu(li, node, hhead, hlist) {
1291 		i = li->plen;
1292 		mask = ntohl(inet_make_mask(i));
1293 		if (l->key != (key & mask))
1294 			continue;
1295 
1296 		if ((err = fib_semantic_match(&li->falh, flp, res, l->key, mask, i)) <= 0) {
1297 			*plen = i;
1298 #ifdef CONFIG_IP_FIB_TRIE_STATS
1299 			t->stats.semantic_match_passed++;
1300 #endif
1301 			return err;
1302 		}
1303 #ifdef CONFIG_IP_FIB_TRIE_STATS
1304 		t->stats.semantic_match_miss++;
1305 #endif
1306 	}
1307 	return 1;
1308 }
1309 
1310 static int
1311 fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1312 {
1313 	struct trie *t = (struct trie *) tb->tb_data;
1314 	int plen, ret = 0;
1315 	struct node *n;
1316 	struct tnode *pn;
1317 	int pos, bits;
1318 	t_key key = ntohl(flp->fl4_dst);
1319 	int chopped_off;
1320 	t_key cindex = 0;
1321 	int current_prefix_length = KEYLENGTH;
1322 	struct tnode *cn;
1323 	t_key node_prefix, key_prefix, pref_mismatch;
1324 	int mp;
1325 
1326 	rcu_read_lock();
1327 
1328 	n = rcu_dereference(t->trie);
1329 	if (!n)
1330 		goto failed;
1331 
1332 #ifdef CONFIG_IP_FIB_TRIE_STATS
1333 	t->stats.gets++;
1334 #endif
1335 
1336 	/* Just a leaf? */
1337 	if (IS_LEAF(n)) {
1338 		if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
1339 			goto found;
1340 		goto failed;
1341 	}
1342 	pn = (struct tnode *) n;
1343 	chopped_off = 0;
1344 
1345 	while (pn) {
1346 		pos = pn->pos;
1347 		bits = pn->bits;
1348 
1349 		if (!chopped_off)
1350 			cindex = tkey_extract_bits(MASK_PFX(key, current_prefix_length), pos, bits);
1351 
1352 		n = tnode_get_child(pn, cindex);
1353 
1354 		if (n == NULL) {
1355 #ifdef CONFIG_IP_FIB_TRIE_STATS
1356 			t->stats.null_node_hit++;
1357 #endif
1358 			goto backtrace;
1359 		}
1360 
1361 		if (IS_LEAF(n)) {
1362 			if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
1363 				goto found;
1364 			else
1365 				goto backtrace;
1366 		}
1367 
1368 #define HL_OPTIMIZE
1369 #ifdef HL_OPTIMIZE
1370 		cn = (struct tnode *)n;
1371 
1372 		/*
1373 		 * It's a tnode, and we can do some extra checks here if we
1374 		 * like, to avoid descending into a dead-end branch.
1375 		 * This tnode is in the parent's child array at index
1376 		 * key[p_pos..p_pos+p_bits] but potentially with some bits
1377 		 * chopped off, so in reality the index may be just a
1378 		 * subprefix, padded with zero at the end.
1379 		 * We can also take a look at any skipped bits in this
1380 		 * tnode - everything up to p_pos is supposed to be ok,
1381 		 * and the non-chopped bits of the index (see previous
1382 		 * paragraph) are also guaranteed ok, but the rest is
1383 		 * considered unknown.
1384 		 *
1385 		 * The skipped bits are key[pos+bits..cn->pos].
1386 		 */
1387 
1388 		/* If current_prefix_length < pos+bits, we are already doing
1389 		 * actual prefix  matching, which means everything from
1390 		 * pos+(bits-chopped_off) onward must be zero along some
1391 		 * branch of this subtree - otherwise there is *no* valid
1392 		 * prefix present. Here we can only check the skipped
1393 		 * bits. Remember, since we have already indexed into the
1394 		 * parent's child array, we know that the bits we chopped off
1395 		 * *are* zero.
1396 		 */
1397 
1398 		/* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */
1399 
1400 		if (current_prefix_length < pos+bits) {
1401 			if (tkey_extract_bits(cn->key, current_prefix_length,
1402 						cn->pos - current_prefix_length) != 0 ||
1403 			    !(cn->child[0]))
1404 				goto backtrace;
1405 		}
1406 
1407 		/*
1408 		 * If chopped_off=0, the index is fully validated and we
1409 		 * only need to look at the skipped bits for this, the new,
1410 		 * tnode. What we actually want to do is to find out if
1411 		 * these skipped bits match our key perfectly, or if we will
1412 		 * have to count on finding a matching prefix further down,
1413 		 * because if we do, we would like to have some way of
1414 		 * verifying the existence of such a prefix at this point.
1415 		 */
1416 
1417 		/* The only thing we can do at this point is to verify that
1418 		 * any such matching prefix can indeed be a prefix to our
1419 		 * key, and if the bits in the node we are inspecting that
1420 		 * do not match our key are not ZERO, this cannot be true.
1421 		 * Thus, find out where there is a mismatch (before cn->pos)
1422 		 * and verify that all the mismatching bits are zero in the
1423 		 * new tnode's key.
1424 		 */
1425 
1426 		/* Note: We aren't very concerned about the pieces of the key
1427 		 * that precede pn->pos+pn->bits, since these have already been
1428 		 * checked. The bits after cn->pos aren't checked since these are
1429 		 * by definition "unknown" at this point. Thus, what we want to
1430 		 * see is if we are about to enter the "prefix matching" state,
1431 		 * and in that case verify that the skipped bits that will prevail
1432 		 * throughout this subtree are zero, as they have to be if we are
1433 		 * to find a matching prefix.
1434 		 */
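		/*
		 * Worked example (editor's illustration): suppose cn->pos = 20,
		 * cn->key = 0xC0A90000 and the search key is 0xC0A80000.  The
		 * two 20-bit prefixes differ at bit 15 (pref_mismatch becomes
		 * 0x00010000, so mp ends up as 15); bits 15..19 of cn->key are
		 * 10000 in binary, i.e. non-zero, so the node's skipped bits
		 * carry a one where our key has a zero - no prefix of the key
		 * can live below cn, and we backtrack.
		 */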
1435 
1436 		node_prefix = MASK_PFX(cn->key, cn->pos);
1437 		key_prefix = MASK_PFX(key, cn->pos);
1438 		pref_mismatch = key_prefix^node_prefix;
1439 		mp = 0;
1440 
1441 		/* In short: If skipped bits in this node do not match the search
1442 		 * key, enter the "prefix matching" state directly.
1443 		 */
1444 		if (pref_mismatch) {
1445 			while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1446 				mp++;
1447 				pref_mismatch = pref_mismatch <<1;
1448 			}
1449 			key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1450 
1451 			if (key_prefix != 0)
1452 				goto backtrace;
1453 
1454 			if (current_prefix_length >= cn->pos)
1455 				current_prefix_length = mp;
1456 		}
1457 #endif
1458 		pn = (struct tnode *)n; /* Descend */
1459 		chopped_off = 0;
1460 		continue;
1461 
1462 backtrace:
1463 		chopped_off++;
1464 
1465 		/* Chopping off trailing zero bits does not change the child key (cindex) */
1466 		while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1))))
1467 			chopped_off++;
1468 
1469 		/* Decrease current_... with bits chopped off */
1470 		if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1471 			current_prefix_length = pn->pos + pn->bits - chopped_off;
1472 
1473 		/*
1474 		 * Either we do the actual chop off accordingly, or, if we have
1475 		 * chopped off all bits in this tnode, we walk up to our parent.
1476 		 */
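		/*
		 * Worked example (editor's illustration): with pn->bits = 4 and
		 * cindex = 0110 (binary), the first backtrack skips the trailing
		 * zero bit (chopped_off becomes 2) and clears bit 1, so the next
		 * probe uses cindex = 0100 - i.e. we retry the subtree in which
		 * the chopped low-order bits of the index are zero.
		 */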
1477 
1478 		if (chopped_off <= pn->bits) {
1479 			cindex &= ~(1 << (chopped_off-1));
1480 		} else {
1481 			if (NODE_PARENT(pn) == NULL)
1482 				goto failed;
1483 
1484 			/* Get Child's index */
1485 			cindex = tkey_extract_bits(pn->key, NODE_PARENT(pn)->pos, NODE_PARENT(pn)->bits);
1486 			pn = NODE_PARENT(pn);
1487 			chopped_off = 0;
1488 
1489 #ifdef CONFIG_IP_FIB_TRIE_STATS
1490 			t->stats.backtrack++;
1491 #endif
1492 			goto backtrace;
1493 		}
1494 	}
1495 failed:
1496 	ret = 1;
1497 found:
1498 	rcu_read_unlock();
1499 	return ret;
1500 }
1501 
1502 /* only called from updater side */
1503 static int trie_leaf_remove(struct trie *t, t_key key)
1504 {
1505 	t_key cindex;
1506 	struct tnode *tp = NULL;
1507 	struct node *n = t->trie;
1508 	struct leaf *l;
1509 
1510 	pr_debug("entering trie_leaf_remove(%p)\n", n);
1511 
1512 	/* Note that in the case of skipped bits, those bits are *not* checked!
1513 	 * When we finish this, we will have NULL or a T_LEAF, and the
1514 	 * T_LEAF may or may not match our key.
1515 	 */
1516 
1517 	while (n != NULL && IS_TNODE(n)) {
1518 		struct tnode *tn = (struct tnode *) n;
1519 		check_tnode(tn);
1520 		n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
1521 
1522 		BUG_ON(n && NODE_PARENT(n) != tn);
1523 	}
1524 	l = (struct leaf *) n;
1525 
1526 	if (!n || !tkey_equals(l->key, key))
1527 		return 0;
1528 
1529 	/*
1530 	 * Key found.
1531 	 * Remove the leaf and rebalance the tree
1532 	 */
1533 
1534 	t->revision++;
1535 	t->size--;
1536 
1537 	preempt_disable();
1538 	tp = NODE_PARENT(n);
1539 	tnode_free((struct tnode *) n);
1540 
1541 	if (tp) {
1542 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1543 		put_child(t, (struct tnode *)tp, cindex, NULL);
1544 		rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
1545 	} else
1546 		rcu_assign_pointer(t->trie, NULL);
1547 	preempt_enable();
1548 
1549 	return 1;
1550 }
1551 
1552 static int
1553 fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1554 		struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
1555 {
1556 	struct trie *t = (struct trie *) tb->tb_data;
1557 	u32 key, mask;
1558 	int plen = r->rtm_dst_len;
1559 	u8 tos = r->rtm_tos;
1560 	struct fib_alias *fa, *fa_to_delete;
1561 	struct list_head *fa_head;
1562 	struct leaf *l;
1563 	struct leaf_info *li;
1564 
1565 
1566 	if (plen > 32)
1567 		return -EINVAL;
1568 
1569 	key = 0;
1570 	if (rta->rta_dst)
1571 		memcpy(&key, rta->rta_dst, 4);
1572 
1573 	key = ntohl(key);
1574 	mask = ntohl(inet_make_mask(plen));
1575 
1576 	if (key & ~mask)
1577 		return -EINVAL;
1578 
1579 	key = key & mask;
1580 	l = fib_find_node(t, key);
1581 
1582 	if (!l)
1583 		return -ESRCH;
1584 
1585 	fa_head = get_fa_head(l, plen);
1586 	fa = fib_find_alias(fa_head, tos, 0);
1587 
1588 	if (!fa)
1589 		return -ESRCH;
1590 
1591 	pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1592 
1593 	fa_to_delete = NULL;
1594 	fa_head = fa->fa_list.prev;
1595 
1596 	list_for_each_entry(fa, fa_head, fa_list) {
1597 		struct fib_info *fi = fa->fa_info;
1598 
1599 		if (fa->fa_tos != tos)
1600 			break;
1601 
1602 		if ((!r->rtm_type ||
1603 		     fa->fa_type == r->rtm_type) &&
1604 		    (r->rtm_scope == RT_SCOPE_NOWHERE ||
1605 		     fa->fa_scope == r->rtm_scope) &&
1606 		    (!r->rtm_protocol ||
1607 		     fi->fib_protocol == r->rtm_protocol) &&
1608 		    fib_nh_match(r, nlhdr, rta, fi) == 0) {
1609 			fa_to_delete = fa;
1610 			break;
1611 		}
1612 	}
1613 
1614 	if (!fa_to_delete)
1615 		return -ESRCH;
1616 
1617 	fa = fa_to_delete;
1618 	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, nlhdr, req);
1619 
1620 	l = fib_find_node(t, key);
1621 	li = find_leaf_info(l, plen);
1622 
1623 	list_del_rcu(&fa->fa_list);
1624 
1625 	if (list_empty(fa_head)) {
1626 		hlist_del_rcu(&li->hlist);
1627 		free_leaf_info(li);
1628 	}
1629 
1630 	if (hlist_empty(&l->list))
1631 		trie_leaf_remove(t, key);
1632 
1633 	if (fa->fa_state & FA_S_ACCESSED)
1634 		rt_cache_flush(-1);
1635 
1636 	fib_release_info(fa->fa_info);
1637 	alias_free_mem_rcu(fa);
1638 	return 0;
1639 }
1640 
1641 static int trie_flush_list(struct trie *t, struct list_head *head)
1642 {
1643 	struct fib_alias *fa, *fa_node;
1644 	int found = 0;
1645 
1646 	list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1647 		struct fib_info *fi = fa->fa_info;
1648 
1649 		if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1650 			list_del_rcu(&fa->fa_list);
1651 			fib_release_info(fa->fa_info);
1652 			alias_free_mem_rcu(fa);
1653 			found++;
1654 		}
1655 	}
1656 	return found;
1657 }
1658 
1659 static int trie_flush_leaf(struct trie *t, struct leaf *l)
1660 {
1661 	int found = 0;
1662 	struct hlist_head *lih = &l->list;
1663 	struct hlist_node *node, *tmp;
1664 	struct leaf_info *li = NULL;
1665 
1666 	hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1667 		found += trie_flush_list(t, &li->falh);
1668 
1669 		if (list_empty(&li->falh)) {
1670 			hlist_del_rcu(&li->hlist);
1671 			free_leaf_info(li);
1672 		}
1673 	}
1674 	return found;
1675 }
1676 
1677 /* rcu_read_lock must be held by the caller on the read side */
1678 
1679 static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
1680 {
1681 	struct node *c = (struct node *) thisleaf;
1682 	struct tnode *p;
1683 	int idx;
1684 	struct node *trie = rcu_dereference(t->trie);
1685 
1686 	if (c == NULL) {
1687 		if (trie == NULL)
1688 			return NULL;
1689 
1690 		if (IS_LEAF(trie))          /* trie w. just a leaf */
1691 			return (struct leaf *) trie;
1692 
1693 		p = (struct tnode*) trie;  /* Start */
1694 	} else
1695 		p = (struct tnode *) NODE_PARENT(c);
1696 
1697 	while (p) {
1698 		int pos, last;
1699 
1700 		/*  Find the next child of the parent */
1701 		if (c)
1702 			pos = 1 + tkey_extract_bits(c->key, p->pos, p->bits);
1703 		else
1704 			pos = 0;
1705 
1706 		last = 1 << p->bits;
1707 		for (idx = pos; idx < last ; idx++) {
1708 			c = rcu_dereference(p->child[idx]);
1709 
1710 			if (!c)
1711 				continue;
1712 
1713 			/* Descend if tnode */
1714 			while (IS_TNODE(c)) {
1715 				p = (struct tnode *) c;
1716   				idx = 0;
1717 
1718 				/* Leftmost non-NULL branch */
1719 				if (p && IS_TNODE(p))
1720 					while (idx < (1<<p->bits) &&
1721 					       !(c = rcu_dereference(p->child[idx]))) idx++;
1722 
1723 				/* Done with this tnode? */
1724 				if (idx >= (1 << p->bits) || !c)
1725 					goto up;
1726 			}
1727 			return (struct leaf *) c;
1728 		}
1729 up:
1730 		/* No more children, go up one step */
1731 		c = (struct node *) p;
1732 		p = (struct tnode *) NODE_PARENT(p);
1733 	}
1734 	return NULL; /* Ready. Root of trie */
1735 }
1736 
1737 static int fn_trie_flush(struct fib_table *tb)
1738 {
1739 	struct trie *t = (struct trie *) tb->tb_data;
1740 	struct leaf *ll = NULL, *l = NULL;
1741 	int found = 0, h;
1742 
1743 	t->revision++;
1744 
1745 	for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
1746 		found += trie_flush_leaf(t, l);
1747 
1748 		if (ll && hlist_empty(&ll->list))
1749 			trie_leaf_remove(t, ll->key);
1750 		ll = l;
1751 	}
1752 
1753 	if (ll && hlist_empty(&ll->list))
1754 		trie_leaf_remove(t, ll->key);
1755 
1756 	pr_debug("trie_flush found=%d\n", found);
1757 	return found;
1758 }
1759 
1760 static int trie_last_dflt = -1;
1761 
1762 static void
1763 fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1764 {
1765 	struct trie *t = (struct trie *) tb->tb_data;
1766 	int order, last_idx;
1767 	struct fib_info *fi = NULL;
1768 	struct fib_info *last_resort;
1769 	struct fib_alias *fa = NULL;
1770 	struct list_head *fa_head;
1771 	struct leaf *l;
1772 
1773 	last_idx = -1;
1774 	last_resort = NULL;
1775 	order = -1;
1776 
1777 	rcu_read_lock();
1778 
1779 	l = fib_find_node(t, 0);
1780 	if (!l)
1781 		goto out;
1782 
1783 	fa_head = get_fa_head(l, 0);
1784 	if (!fa_head)
1785 		goto out;
1786 
1787 	if (list_empty(fa_head))
1788 		goto out;
1789 
1790 	list_for_each_entry_rcu(fa, fa_head, fa_list) {
1791 		struct fib_info *next_fi = fa->fa_info;
1792 
1793 		if (fa->fa_scope != res->scope ||
1794 		    fa->fa_type != RTN_UNICAST)
1795 			continue;
1796 
1797 		if (next_fi->fib_priority > res->fi->fib_priority)
1798 			break;
1799 		if (!next_fi->fib_nh[0].nh_gw ||
1800 		    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1801 			continue;
1802 		fa->fa_state |= FA_S_ACCESSED;
1803 
1804 		if (fi == NULL) {
1805 			if (next_fi != res->fi)
1806 				break;
1807 		} else if (!fib_detect_death(fi, order, &last_resort,
1808 					     &last_idx, &trie_last_dflt)) {
1809 			if (res->fi)
1810 				fib_info_put(res->fi);
1811 			res->fi = fi;
1812 			atomic_inc(&fi->fib_clntref);
1813 			trie_last_dflt = order;
1814 			goto out;
1815 		}
1816 		fi = next_fi;
1817 		order++;
1818 	}
1819 	if (order <= 0 || fi == NULL) {
1820 		trie_last_dflt = -1;
1821 		goto out;
1822 	}
1823 
1824 	if (!fib_detect_death(fi, order, &last_resort, &last_idx, &trie_last_dflt)) {
1825 		if (res->fi)
1826 			fib_info_put(res->fi);
1827 		res->fi = fi;
1828 		atomic_inc(&fi->fib_clntref);
1829 		trie_last_dflt = order;
1830 		goto out;
1831 	}
1832 	if (last_idx >= 0) {
1833 		if (res->fi)
1834 			fib_info_put(res->fi);
1835 		res->fi = last_resort;
1836 		if (last_resort)
1837 			atomic_inc(&last_resort->fib_clntref);
1838 	}
1839 	trie_last_dflt = last_idx;
1840  out:;
1841 	rcu_read_unlock();
1842 }
1843 
1844 static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb,
1845 			   struct sk_buff *skb, struct netlink_callback *cb)
1846 {
1847 	int i, s_i;
1848 	struct fib_alias *fa;
1849 
1850 	u32 xkey = htonl(key);
1851 
1852 	s_i = cb->args[3];
1853 	i = 0;
1854 
1855 	/* rcu_read_lock is held by the caller */
1856 
1857 	list_for_each_entry_rcu(fa, fah, fa_list) {
1858 		if (i < s_i) {
1859 			i++;
1860 			continue;
1861 		}
1862 		BUG_ON(!fa->fa_info);
1863 
1864 		if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1865 				  cb->nlh->nlmsg_seq,
1866 				  RTM_NEWROUTE,
1867 				  tb->tb_id,
1868 				  fa->fa_type,
1869 				  fa->fa_scope,
1870 				  &xkey,
1871 				  plen,
1872 				  fa->fa_tos,
1873 				  fa->fa_info, 0) < 0) {
1874 			cb->args[3] = i;
1875 			return -1;
1876 		}
1877 		i++;
1878 	}
1879 	cb->args[3] = i;
1880 	return skb->len;
1881 }
1882 
1883 static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb,
1884 			     struct netlink_callback *cb)
1885 {
1886 	int h, s_h;
1887 	struct list_head *fa_head;
1888 	struct leaf *l = NULL;
1889 
1890 	s_h = cb->args[2];
1891 
1892 	for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
1893 		if (h < s_h)
1894 			continue;
1895 		if (h > s_h)
1896 			memset(&cb->args[3], 0,
1897 			       sizeof(cb->args) - 3*sizeof(cb->args[0]));
1898 
1899 		fa_head = get_fa_head(l, plen);
1900 
1901 		if (!fa_head)
1902 			continue;
1903 
1904 		if (list_empty(fa_head))
1905 			continue;
1906 
1907 		if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) {
1908 			cb->args[2] = h;
1909 			return -1;
1910 		}
1911 	}
1912 	cb->args[2] = h;
1913 	return skb->len;
1914 }
1915 
1916 static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb)
1917 {
1918 	int m, s_m;
1919 	struct trie *t = (struct trie *) tb->tb_data;
1920 
1921 	s_m = cb->args[1];
1922 
1923 	rcu_read_lock();
1924 	for (m = 0; m <= 32; m++) {
1925 		if (m < s_m)
1926 			continue;
1927 		if (m > s_m)
1928 			memset(&cb->args[2], 0,
1929 				sizeof(cb->args) - 2*sizeof(cb->args[0]));
1930 
1931 		if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) {
1932 			cb->args[1] = m;
1933 			goto out;
1934 		}
1935 	}
1936 	rcu_read_unlock();
1937 	cb->args[1] = m;
1938 	return skb->len;
1939 out:
1940 	rcu_read_unlock();
1941 	return -1;
1942 }
1943 
1944 /* Fix more generic FIB names for init later */
1945 
1946 #ifdef CONFIG_IP_MULTIPLE_TABLES
1947 struct fib_table * fib_hash_init(int id)
1948 #else
1949 struct fib_table * __init fib_hash_init(int id)
1950 #endif
1951 {
1952 	struct fib_table *tb;
1953 	struct trie *t;
1954 
1955 	if (fn_alias_kmem == NULL)
1956 		fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1957 						  sizeof(struct fib_alias),
1958 						  0, SLAB_HWCACHE_ALIGN,
1959 						  NULL, NULL);
1960 
1961 	tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
1962 		     GFP_KERNEL);
1963 	if (tb == NULL)
1964 		return NULL;
1965 
1966 	tb->tb_id = id;
1967 	tb->tb_lookup = fn_trie_lookup;
1968 	tb->tb_insert = fn_trie_insert;
1969 	tb->tb_delete = fn_trie_delete;
1970 	tb->tb_flush = fn_trie_flush;
1971 	tb->tb_select_default = fn_trie_select_default;
1972 	tb->tb_dump = fn_trie_dump;
1973 	memset(tb->tb_data, 0, sizeof(struct trie));
1974 
1975 	t = (struct trie *) tb->tb_data;
1976 
1977 	trie_init(t);
1978 
1979 	if (id == RT_TABLE_LOCAL)
1980 		trie_local = t;
1981 	else if (id == RT_TABLE_MAIN)
1982 		trie_main = t;
1983 
1984 	if (id == RT_TABLE_LOCAL)
1985 		printk(KERN_INFO "IPv4 FIB: Using LC-trie version %s\n", VERSION);
1986 
1987 	return tb;
1988 }
1989 
1990 #ifdef CONFIG_PROC_FS
1991 /* Depth-first trie walk iterator */
1992 struct fib_trie_iter {
1993 	struct tnode *tnode;
1994 	struct trie *trie;
1995 	unsigned index;
1996 	unsigned depth;
1997 };
1998 
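/*
 * Advance the depth-first walk one node: scan the current tnode's children
 * starting at iter->index, return the first child found (descending into it
 * if it is a tnode), and pop back to the parent when the children are
 * exhausted, resuming just after the child we came from.
 */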
1999 static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
2000 {
2001 	struct tnode *tn = iter->tnode;
2002 	unsigned cindex = iter->index;
2003 	struct tnode *p;
2004 
2005 	pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
2006 		 iter->tnode, iter->index, iter->depth);
2007 rescan:
2008 	while (cindex < (1<<tn->bits)) {
2009 		struct node *n = tnode_get_child(tn, cindex);
2010 
2011 		if (n) {
2012 			if (IS_LEAF(n)) {
2013 				iter->tnode = tn;
2014 				iter->index = cindex + 1;
2015 			} else {
2016 				/* push down one level */
2017 				iter->tnode = (struct tnode *) n;
2018 				iter->index = 0;
2019 				++iter->depth;
2020 			}
2021 			return n;
2022 		}
2023 
2024 		++cindex;
2025 	}
2026 
2027 	/* Current node exhausted, pop back up */
2028 	p = NODE_PARENT(tn);
2029 	if (p) {
2030 		cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
2031 		tn = p;
2032 		--iter->depth;
2033 		goto rescan;
2034 	}
2035 
2036 	/* no parent: we were at the root, the walk is complete */
2037 	return NULL;
2038 }
2039 
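/*
 * Position the iterator at the root of @t and return it.  Returns NULL if
 * the trie is empty or its root is not a tnode.
 */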
2040 static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
2041 				       struct trie *t)
2042 {
2043 	struct node *n = rcu_dereference(t->trie);
2044 
2045 	if (n && IS_TNODE(n)) {
2046 		iter->tnode = (struct tnode *) n;
2047 		iter->trie = t;
2048 		iter->index = 0;
2049 		iter->depth = 1;
2050 		return n;
2051 	}
2052 	return NULL;
2053 }
2054 
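/*
 * Walk the whole trie under the RCU read lock and fill in @s: leaf count,
 * total and maximum leaf depth, tnode count, a histogram of tnode sizes
 * indexed by bits, and the number of NULL child pointers.
 */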
2055 static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2056 {
2057 	struct node *n;
2058 	struct fib_trie_iter iter;
2059 
2060 	memset(s, 0, sizeof(*s));
2061 
2062 	rcu_read_lock();
2063 	for (n = fib_trie_get_first(&iter, t); n;
2064 	     n = fib_trie_get_next(&iter)) {
2065 		if (IS_LEAF(n)) {
2066 			s->leaves++;
2067 			s->totdepth += iter.depth;
2068 			if (iter.depth > s->maxdepth)
2069 				s->maxdepth = iter.depth;
2070 		} else {
2071 			const struct tnode *tn = (const struct tnode *) n;
2072 			int i;
2073 
2074 			s->tnodes++;
2075 			s->nodesizes[tn->bits]++;
2076 			for (i = 0; i < (1<<tn->bits); i++)
2077 				if (!tn->child[i])
2078 					s->nullpointers++;
2079 		}
2080 	}
2081 	rcu_read_unlock();
2082 }
2083 
2084 /*
2085  *	This outputs /proc/net/fib_triestats
2086  */
2087 static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2088 {
2089 	unsigned i, max, pointers, bytes, avdepth;
2090 
2091 	if (stat->leaves)
2092 		avdepth = stat->totdepth*100 / stat->leaves;
2093 	else
2094 		avdepth = 0;
2095 
2096 	seq_printf(seq, "\tAver depth:     %d.%02d\n", avdepth / 100, avdepth % 100 );
2097 	seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);
2098 
2099 	seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
2100 
2101 	bytes = sizeof(struct leaf) * stat->leaves;
2102 	seq_printf(seq, "\tInternal nodes: %d\n\t", stat->tnodes);
2103 	bytes += sizeof(struct tnode) * stat->tnodes;
2104 
2105 	max = MAX_CHILDS-1;
2106 	while (max > 0 && stat->nodesizes[max] == 0)
2107 		max--;
2108 
2109 	pointers = 0;
2110 	for (i = 1; i <= max; i++)
2111 		if (stat->nodesizes[i] != 0) {
2112 			seq_printf(seq, "  %d: %d",  i, stat->nodesizes[i]);
2113 			pointers += (1<<i) * stat->nodesizes[i];
2114 		}
2115 	seq_putc(seq, '\n');
2116 	seq_printf(seq, "\tPointers: %d\n", pointers);
2117 
2118 	bytes += sizeof(struct node *) * pointers;
2119 	seq_printf(seq, "Null ptrs: %d\n", stat->nullpointers);
2120 	seq_printf(seq, "Total size: %d  kB\n", (bytes + 1023) / 1024);
2121 
2122 #ifdef CONFIG_IP_FIB_TRIE_STATS
2123 	seq_printf(seq, "Counters:\n---------\n");
2124 	seq_printf(seq, "gets = %d\n", t->stats.gets);
2125 	seq_printf(seq, "backtracks = %d\n", t->stats.backtrack);
2126 	seq_printf(seq, "semantic match passed = %d\n", t->stats.semantic_match_passed);
2127 	seq_printf(seq, "semantic match miss = %d\n", t->stats.semantic_match_miss);
2128 	seq_printf(seq, "null node hit = %d\n", t->stats.null_node_hit);
2129 	seq_printf(seq, "skipped node resize = %d\n", t->stats.resize_node_skipped);
2130 #ifdef CLEAR_STATS
2131 	memset(&(t->stats), 0, sizeof(t->stats));
2132 #endif
2133 #endif /*  CONFIG_IP_FIB_TRIE_STATS */
2134 }
2135 
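/* Collect and print the statistics for the local and main tries */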
2136 static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2137 {
2138 	struct trie_stat *stat;
2139 
2140 	stat = kmalloc(sizeof(*stat), GFP_KERNEL);
2141 	if (!stat)
2142 		return -ENOMEM;
2143 
2144 	seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n",
2145 		   sizeof(struct leaf), sizeof(struct tnode));
2146 
2147 	if (trie_local) {
2148 		seq_printf(seq, "Local:\n");
2149 		trie_collect_stats(trie_local, stat);
2150 		trie_show_stats(seq, stat);
2151 	}
2152 
2153 	if (trie_main) {
2154 		seq_printf(seq, "Main:\n");
2155 		trie_collect_stats(trie_main, stat);
2156 		trie_show_stats(seq, stat);
2157 	}
2158 	kfree(stat);
2159 
2160 	return 0;
2161 }
2162 
2163 static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2164 {
2165 	return single_open(file, fib_triestat_seq_show, NULL);
2166 }
2167 
2168 static struct file_operations fib_triestat_fops = {
2169 	.owner	= THIS_MODULE,
2170 	.open	= fib_triestat_seq_open,
2171 	.read	= seq_read,
2172 	.llseek	= seq_lseek,
2173 	.release = single_release,
2174 };
2175 
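/*
 * Return the node at position @pos in a combined walk over trie_local
 * followed by trie_main, or NULL when @pos is past the end.
 */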
2176 static struct node *fib_trie_get_idx(struct fib_trie_iter *iter,
2177 				      loff_t pos)
2178 {
2179 	loff_t idx = 0;
2180 	struct node *n;
2181 
2182 	for (n = fib_trie_get_first(iter, trie_local);
2183 	     n; ++idx, n = fib_trie_get_next(iter)) {
2184 		if (pos == idx)
2185 			return n;
2186 	}
2187 
2188 	for (n = fib_trie_get_first(iter, trie_main);
2189 	     n; ++idx, n = fib_trie_get_next(iter)) {
2190 		if (pos == idx)
2191 			return n;
2192 	}
2193 	return NULL;
2194 }
2195 
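/*
 * seq_file iteration: start() takes the RCU read lock and positions the
 * walk (*pos == 0 returns SEQ_START_TOKEN), next() advances it and crosses
 * over from the local trie to the main trie, and stop() drops the lock.
 */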
2196 static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2197 {
2198 	rcu_read_lock();
2199 	if (*pos == 0)
2200 		return SEQ_START_TOKEN;
2201 	return fib_trie_get_idx(seq->private, *pos - 1);
2202 }
2203 
2204 static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2205 {
2206 	struct fib_trie_iter *iter = seq->private;
2207 	void *l = v;
2208 
2209 	++*pos;
2210 	if (v == SEQ_START_TOKEN)
2211 		return fib_trie_get_idx(iter, 0);
2212 
2213 	v = fib_trie_get_next(iter);
2214 	BUG_ON(v == l);
2215 	if (v)
2216 		return v;
2217 
2218 	/* continue scan in next trie */
2219 	if (iter->trie == trie_local)
2220 		return fib_trie_get_first(iter, trie_main);
2221 
2222 	return NULL;
2223 }
2224 
2225 static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2226 {
2227 	rcu_read_unlock();
2228 }
2229 
2230 static void seq_indent(struct seq_file *seq, int n)
2231 {
2232 	while (n-- > 0) seq_puts(seq, "   ");
2233 }
2234 
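/* Human-readable name of a route scope; unknown values use a static buffer */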
2235 static inline const char *rtn_scope(enum rt_scope_t s)
2236 {
2237 	static char buf[32];
2238 
2239 	switch(s) {
2240 	case RT_SCOPE_UNIVERSE: return "universe";
2241 	case RT_SCOPE_SITE:	return "site";
2242 	case RT_SCOPE_LINK:	return "link";
2243 	case RT_SCOPE_HOST:	return "host";
2244 	case RT_SCOPE_NOWHERE:	return "nowhere";
2245 	default:
2246 		snprintf(buf, sizeof(buf), "scope=%d", s);
2247 		return buf;
2248 	}
2249 }
2250 
2251 static const char *rtn_type_names[__RTN_MAX] = {
2252 	[RTN_UNSPEC] = "UNSPEC",
2253 	[RTN_UNICAST] = "UNICAST",
2254 	[RTN_LOCAL] = "LOCAL",
2255 	[RTN_BROADCAST] = "BROADCAST",
2256 	[RTN_ANYCAST] = "ANYCAST",
2257 	[RTN_MULTICAST] = "MULTICAST",
2258 	[RTN_BLACKHOLE] = "BLACKHOLE",
2259 	[RTN_UNREACHABLE] = "UNREACHABLE",
2260 	[RTN_PROHIBIT] = "PROHIBIT",
2261 	[RTN_THROW] = "THROW",
2262 	[RTN_NAT] = "NAT",
2263 	[RTN_XRESOLVE] = "XRESOLVE",
2264 };
2265 
2266 static inline const char *rtn_type(unsigned t)
2267 {
2268 	static char buf[32];
2269 
2270 	if (t < __RTN_MAX && rtn_type_names[t])
2271 		return rtn_type_names[t];
2272 	snprintf(buf, sizeof(buf), "type %d", t);
2273 	return buf;
2274 }
2275 
2276 /* Pretty print the trie */
2277 static int fib_trie_seq_show(struct seq_file *seq, void *v)
2278 {
2279 	const struct fib_trie_iter *iter = seq->private;
2280 	struct node *n = v;
2281 
2282 	if (v == SEQ_START_TOKEN)
2283 		return 0;
2284 
2285 	if (IS_TNODE(n)) {
2286 		struct tnode *tn = (struct tnode *) n;
2287 		t_key prf = ntohl(MASK_PFX(tn->key, tn->pos));
2288 
2289 		if (!NODE_PARENT(n)) {
2290 			if (iter->trie == trie_local)
2291 				seq_puts(seq, "<local>:\n");
2292 			else
2293 				seq_puts(seq, "<main>:\n");
2294 		}
2295 		seq_indent(seq, iter->depth-1);
2296 		seq_printf(seq, "  +-- %d.%d.%d.%d/%d %d %d %d\n",
2297 			   NIPQUAD(prf), tn->pos, tn->bits, tn->full_children,
2298 			   tn->empty_children);
2299 
2300 	} else {
2301 		struct leaf *l = (struct leaf *) n;
2302 		int i;
2303 		u32 val = ntohl(l->key);
2304 
2305 		seq_indent(seq, iter->depth);
2306 		seq_printf(seq, "  |-- %d.%d.%d.%d\n", NIPQUAD(val));
2307 		for (i = 32; i >= 0; i--) {
2308 			struct leaf_info *li = find_leaf_info(l, i);
2309 			if (li) {
2310 				struct fib_alias *fa;
2311 				list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2312 					seq_indent(seq, iter->depth+1);
2313 					seq_printf(seq, "  /%d %s %s", i,
2314 						   rtn_scope(fa->fa_scope),
2315 						   rtn_type(fa->fa_type));
2316 					if (fa->fa_tos)
2317 						seq_printf(seq, "tos =%d\n",
2318 							   fa->fa_tos);
2319 					seq_putc(seq, '\n');
2320 				}
2321 			}
2322 		}
2323 	}
2324 
2325 	return 0;
2326 }
2327 
2328 static struct seq_operations fib_trie_seq_ops = {
2329 	.start  = fib_trie_seq_start,
2330 	.next   = fib_trie_seq_next,
2331 	.stop   = fib_trie_seq_stop,
2332 	.show   = fib_trie_seq_show,
2333 };
2334 
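/* Allocate a zeroed iterator and attach it as the seq_file private data */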
2335 static int fib_trie_seq_open(struct inode *inode, struct file *file)
2336 {
2337 	struct seq_file *seq;
2338 	int rc = -ENOMEM;
2339 	struct fib_trie_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
2340 
2341 	if (!s)
2342 		goto out;
2343 
2344 	rc = seq_open(file, &fib_trie_seq_ops);
2345 	if (rc)
2346 		goto out_kfree;
2347 
2348 	seq	     = file->private_data;
2349 	seq->private = s;
2350 	memset(s, 0, sizeof(*s));
2351 out:
2352 	return rc;
2353 out_kfree:
2354 	kfree(s);
2355 	goto out;
2356 }
2357 
2358 static struct file_operations fib_trie_fops = {
2359 	.owner  = THIS_MODULE,
2360 	.open   = fib_trie_seq_open,
2361 	.read   = seq_read,
2362 	.llseek = seq_lseek,
2363 	.release = seq_release_private,
2364 };
2365 
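/*
 * Translate a route type, netmask and fib_info into the legacy RTF_*
 * flag word shown in /proc/net/route.
 */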
2366 static unsigned fib_flag_trans(int type, u32 mask, const struct fib_info *fi)
2367 {
2368 	static unsigned type2flags[RTN_MAX + 1] = {
2369 		[RTN_UNREACHABLE] = RTF_REJECT, [RTN_PROHIBIT] = RTF_REJECT,
2370 	};
2371 	unsigned flags = type2flags[type];
2372 
2373 	if (fi && fi->fib_nh->nh_gw)
2374 		flags |= RTF_GATEWAY;
2375 	if (mask == 0xFFFFFFFF)
2376 		flags |= RTF_HOST;
2377 	flags |= RTF_UP;
2378 	return flags;
2379 }
2380 
2381 /*
2382  *	This outputs /proc/net/route.
2383  *	The format of the file is not supposed to be changed
1884  * 	and needs to be the same as the fib_hash output to avoid breaking
2385  *	legacy utilities
2386  */
2387 static int fib_route_seq_show(struct seq_file *seq, void *v)
2388 {
2389 	const struct fib_trie_iter *iter = seq->private;
2390 	struct leaf *l = v;
2391 	int i;
2392 	char bf[128];
2393 
2394 	if (v == SEQ_START_TOKEN) {
2395 		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2396 			   "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2397 			   "\tWindow\tIRTT");
2398 		return 0;
2399 	}
2400 
2401 	if (iter->trie == trie_local)
2402 		return 0;
2403 	if (IS_TNODE(l))
2404 		return 0;
2405 
2406 	for (i = 32; i >= 0; i--) {
2407 		struct leaf_info *li = find_leaf_info(l, i);
2408 		struct fib_alias *fa;
2409 		u32 mask, prefix;
2410 
2411 		if (!li)
2412 			continue;
2413 
2414 		mask = inet_make_mask(li->plen);
2415 		prefix = htonl(l->key);
2416 
2417 		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2418 			const struct fib_info *fi = fa->fa_info;
2419 			unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
2420 
2421 			if (fa->fa_type == RTN_BROADCAST
2422 			    || fa->fa_type == RTN_MULTICAST)
2423 				continue;
2424 
2425 			if (fi)
2426 				snprintf(bf, sizeof(bf),
2427 					 "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
2428 					 fi->fib_dev ? fi->fib_dev->name : "*",
2429 					 prefix,
2430 					 fi->fib_nh->nh_gw, flags, 0, 0,
2431 					 fi->fib_priority,
2432 					 mask,
2433 					 (fi->fib_advmss ? fi->fib_advmss + 40 : 0),
2434 					 fi->fib_window,
2435 					 fi->fib_rtt >> 3);
2436 			else
2437 				snprintf(bf, sizeof(bf),
2438 					 "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
2439 					 prefix, 0, flags, 0, 0, 0,
2440 					 mask, 0, 0, 0);
2441 
2442 			seq_printf(seq, "%-127s\n", bf);
2443 		}
2444 	}
2445 
2446 	return 0;
2447 }
2448 
2449 static struct seq_operations fib_route_seq_ops = {
2450 	.start  = fib_trie_seq_start,
2451 	.next   = fib_trie_seq_next,
2452 	.stop   = fib_trie_seq_stop,
2453 	.show   = fib_route_seq_show,
2454 };
2455 
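/* Same setup as fib_trie_seq_open(), but using the /proc/net/route show op */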
2456 static int fib_route_seq_open(struct inode *inode, struct file *file)
2457 {
2458 	struct seq_file *seq;
2459 	int rc = -ENOMEM;
2460 	struct fib_trie_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
2461 
2462 	if (!s)
2463 		goto out;
2464 
2465 	rc = seq_open(file, &fib_route_seq_ops);
2466 	if (rc)
2467 		goto out_kfree;
2468 
2469 	seq	     = file->private_data;
2470 	seq->private = s;
2471 	memset(s, 0, sizeof(*s));
2472 out:
2473 	return rc;
2474 out_kfree:
2475 	kfree(s);
2476 	goto out;
2477 }
2478 
2479 static struct file_operations fib_route_fops = {
2480 	.owner  = THIS_MODULE,
2481 	.open   = fib_route_seq_open,
2482 	.read   = seq_read,
2483 	.llseek = seq_lseek,
2484 	.release = seq_release_private,
2485 };
2486 
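/*
 * Create the /proc/net entries (fib_trie, fib_triestat and route),
 * removing the ones already registered if a later registration fails.
 */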
2487 int __init fib_proc_init(void)
2488 {
2489 	if (!proc_net_fops_create("fib_trie", S_IRUGO, &fib_trie_fops))
2490 		goto out1;
2491 
2492 	if (!proc_net_fops_create("fib_triestat", S_IRUGO, &fib_triestat_fops))
2493 		goto out2;
2494 
2495 	if (!proc_net_fops_create("route", S_IRUGO, &fib_route_fops))
2496 		goto out3;
2497 
2498 	return 0;
2499 
2500 out3:
2501 	proc_net_remove("fib_triestat");
2502 out2:
2503 	proc_net_remove("fib_trie");
2504 out1:
2505 	return -ENOMEM;
2506 }
2507 
2508 void __init fib_proc_exit(void)
2509 {
2510 	proc_net_remove("fib_trie");
2511 	proc_net_remove("fib_triestat");
2512 	proc_net_remove("route");
2513 }
2514 
2515 #endif /* CONFIG_PROC_FS */
2516