xref: /openbmc/linux/net/ipv4/fib_trie.c (revision d5cb9783536a41df9f9cba5b0a1d78047ed787f7)
1 /*
2  *   This program is free software; you can redistribute it and/or
3  *   modify it under the terms of the GNU General Public License
4  *   as published by the Free Software Foundation; either version
5  *   2 of the License, or (at your option) any later version.
6  *
7  *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8  *     & Swedish University of Agricultural Sciences.
9  *
10  *   Jens Laas <jens.laas@data.slu.se> Swedish University of
11  *     Agricultural Sciences.
12  *
13  *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
14  *
15  * This work is based on the LPC-trie which is originally descibed in:
16  *
17  * An experimental study of compression methods for dynamic tries
18  * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19  * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
20  *
21  *
22  * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23  * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24  *
25  * Version:	$Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $
26  *
27  *
28  * Code from fib_hash has been reused which includes the following header:
29  *
30  *
31  * INET		An implementation of the TCP/IP protocol suite for the LINUX
32  *		operating system.  INET is implemented using the  BSD Socket
33  *		interface as the means of communication with the user level.
34  *
35  *		IPv4 FIB: lookup engine and maintenance routines.
36  *
37  *
38  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
39  *
40  *		This program is free software; you can redistribute it and/or
41  *		modify it under the terms of the GNU General Public License
42  *		as published by the Free Software Foundation; either version
43  *		2 of the License, or (at your option) any later version.
44  */
45 
46 #define VERSION "0.404"
47 
48 #include <linux/config.h>
49 #include <asm/uaccess.h>
50 #include <asm/system.h>
51 #include <asm/bitops.h>
52 #include <linux/types.h>
53 #include <linux/kernel.h>
54 #include <linux/sched.h>
55 #include <linux/mm.h>
56 #include <linux/string.h>
57 #include <linux/socket.h>
58 #include <linux/sockios.h>
59 #include <linux/errno.h>
60 #include <linux/in.h>
61 #include <linux/inet.h>
62 #include <linux/netdevice.h>
63 #include <linux/if_arp.h>
64 #include <linux/proc_fs.h>
65 #include <linux/rcupdate.h>
66 #include <linux/skbuff.h>
67 #include <linux/netlink.h>
68 #include <linux/init.h>
69 #include <linux/list.h>
70 #include <net/ip.h>
71 #include <net/protocol.h>
72 #include <net/route.h>
73 #include <net/tcp.h>
74 #include <net/sock.h>
75 #include <net/ip_fib.h>
76 #include "fib_lookup.h"
77 
78 #undef CONFIG_IP_FIB_TRIE_STATS
79 #define MAX_CHILDS 16384
80 
81 #define KEYLENGTH (8*sizeof(t_key))
82 #define MASK_PFX(k, l) (((l)==0)?0:(k >> (KEYLENGTH-l)) << (KEYLENGTH-l))
83 #define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH - bits) >> offset))
84 
85 typedef unsigned int t_key;
86 
87 #define T_TNODE 0
88 #define T_LEAF  1
89 #define NODE_TYPE_MASK	0x1UL
90 #define NODE_PARENT(node) \
91 	((struct tnode *)rcu_dereference(((node)->parent & ~NODE_TYPE_MASK)))
92 
93 #define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
94 
95 #define NODE_SET_PARENT(node, ptr)		\
96 	rcu_assign_pointer((node)->parent,	\
97 			   ((unsigned long)(ptr)) | NODE_TYPE(node))
98 
99 #define IS_TNODE(n) (!(n->parent & T_LEAF))
100 #define IS_LEAF(n) (n->parent & T_LEAF)
101 
102 struct node {
103 	t_key key;
104 	unsigned long parent;
105 };
106 
107 struct leaf {
108 	t_key key;
109 	unsigned long parent;
110 	struct hlist_head list;
111 	struct rcu_head rcu;
112 };
113 
114 struct leaf_info {
115 	struct hlist_node hlist;
116 	struct rcu_head rcu;
117 	int plen;
118 	struct list_head falh;
119 };
120 
121 struct tnode {
122 	t_key key;
123 	unsigned long parent;
124 	unsigned short pos:5;		/* 2log(KEYLENGTH) bits needed */
125 	unsigned short bits:5;		/* 2log(KEYLENGTH) bits needed */
126 	unsigned short full_children;	/* KEYLENGTH bits needed */
127 	unsigned short empty_children;	/* KEYLENGTH bits needed */
128 	struct rcu_head rcu;
129 	struct node *child[0];
130 };
131 
132 #ifdef CONFIG_IP_FIB_TRIE_STATS
133 struct trie_use_stats {
134 	unsigned int gets;
135 	unsigned int backtrack;
136 	unsigned int semantic_match_passed;
137 	unsigned int semantic_match_miss;
138 	unsigned int null_node_hit;
139 	unsigned int resize_node_skipped;
140 };
141 #endif
142 
143 struct trie_stat {
144 	unsigned int totdepth;
145 	unsigned int maxdepth;
146 	unsigned int tnodes;
147 	unsigned int leaves;
148 	unsigned int nullpointers;
149 	unsigned int nodesizes[MAX_CHILDS];
150 };
151 
152 struct trie {
153 	struct node *trie;
154 #ifdef CONFIG_IP_FIB_TRIE_STATS
155 	struct trie_use_stats stats;
156 #endif
157 	int size;
158 	unsigned int revision;
159 };
160 
161 static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
162 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
163 static struct node *resize(struct trie *t, struct tnode *tn);
164 static struct tnode *inflate(struct trie *t, struct tnode *tn);
165 static struct tnode *halve(struct trie *t, struct tnode *tn);
166 static void tnode_free(struct tnode *tn);
167 
168 static kmem_cache_t *fn_alias_kmem __read_mostly;
169 static struct trie *trie_local = NULL, *trie_main = NULL;
170 
171 
172 /* rcu_read_lock needs to be hold by caller from readside */
173 
174 static inline struct node *tnode_get_child(struct tnode *tn, int i)
175 {
176 	BUG_ON(i >= 1 << tn->bits);
177 
178 	return rcu_dereference(tn->child[i]);
179 }
180 
181 static inline int tnode_child_length(const struct tnode *tn)
182 {
183 	return 1 << tn->bits;
184 }
185 
186 static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
187 {
188 	if (offset < KEYLENGTH)
189 		return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
190 	else
191 		return 0;
192 }
193 
194 static inline int tkey_equals(t_key a, t_key b)
195 {
196 	return a == b;
197 }
198 
199 static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
200 {
201 	if (bits == 0 || offset >= KEYLENGTH)
202 		return 1;
203 	bits = bits > KEYLENGTH ? KEYLENGTH : bits;
204 	return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
205 }
206 
207 static inline int tkey_mismatch(t_key a, int offset, t_key b)
208 {
209 	t_key diff = a ^ b;
210 	int i = offset;
211 
212 	if (!diff)
213 		return 0;
214 	while ((diff << i) >> (KEYLENGTH-1) == 0)
215 		i++;
216 	return i;
217 }
218 
219 /*
220   To understand this stuff, an understanding of keys and all their bits is
221   necessary. Every node in the trie has a key associated with it, but not
222   all of the bits in that key are significant.
223 
224   Consider a node 'n' and its parent 'tp'.
225 
226   If n is a leaf, every bit in its key is significant. Its presence is
227   necessitated by path compression, since during a tree traversal (when
228   searching for a leaf - unless we are doing an insertion) we will completely
229   ignore all skipped bits we encounter. Thus we need to verify, at the end of
230   a potentially successful search, that we have indeed been walking the
231   correct key path.
232 
233   Note that we can never "miss" the correct key in the tree if present by
234   following the wrong path. Path compression ensures that segments of the key
235   that are the same for all keys with a given prefix are skipped, but the
236   skipped part *is* identical for each node in the subtrie below the skipped
237   bit! trie_insert() in this implementation takes care of that - note the
238   call to tkey_sub_equals() in trie_insert().
239 
240   if n is an internal node - a 'tnode' here, the various parts of its key
241   have many different meanings.
242 
243   Example:
244   _________________________________________________________________
245   | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
246   -----------------------------------------------------------------
247     0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
248 
249   _________________________________________________________________
250   | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
251   -----------------------------------------------------------------
252    16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
253 
254   tp->pos = 7
255   tp->bits = 3
256   n->pos = 15
257   n->bits = 4
258 
259   First, let's just ignore the bits that come before the parent tp, that is
260   the bits from 0 to (tp->pos-1). They are *known* but at this point we do
261   not use them for anything.
262 
263   The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
264   index into the parent's child array. That is, they will be used to find
265   'n' among tp's children.
266 
267   The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
268   for the node n.
269 
270   All the bits we have seen so far are significant to the node n. The rest
271   of the bits are really not needed or indeed known in n->key.
272 
273   The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
274   n's child array, and will of course be different for each child.
275 
276 
277   The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
278   at this point.
279 
280 */
281 
282 static inline void check_tnode(const struct tnode *tn)
283 {
284 	WARN_ON(tn && tn->pos+tn->bits > 32);
285 }
286 
287 static int halve_threshold = 25;
288 static int inflate_threshold = 50;
289 static int halve_threshold_root = 15;
290 static int inflate_threshold_root = 25;
291 
292 
293 static void __alias_free_mem(struct rcu_head *head)
294 {
295 	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
296 	kmem_cache_free(fn_alias_kmem, fa);
297 }
298 
299 static inline void alias_free_mem_rcu(struct fib_alias *fa)
300 {
301 	call_rcu(&fa->rcu, __alias_free_mem);
302 }
303 
304 static void __leaf_free_rcu(struct rcu_head *head)
305 {
306 	kfree(container_of(head, struct leaf, rcu));
307 }
308 
309 static inline void free_leaf(struct leaf *leaf)
310 {
311 	call_rcu(&leaf->rcu, __leaf_free_rcu);
312 }
313 
314 static void __leaf_info_free_rcu(struct rcu_head *head)
315 {
316 	kfree(container_of(head, struct leaf_info, rcu));
317 }
318 
319 static inline void free_leaf_info(struct leaf_info *leaf)
320 {
321 	call_rcu(&leaf->rcu, __leaf_info_free_rcu);
322 }
323 
324 static struct tnode *tnode_alloc(unsigned int size)
325 {
326 	struct page *pages;
327 
328 	if (size <= PAGE_SIZE)
329 		return kcalloc(size, 1, GFP_KERNEL);
330 
331 	pages = alloc_pages(GFP_KERNEL|__GFP_ZERO, get_order(size));
332 	if (!pages)
333 		return NULL;
334 
335 	return page_address(pages);
336 }
337 
338 static void __tnode_free_rcu(struct rcu_head *head)
339 {
340 	struct tnode *tn = container_of(head, struct tnode, rcu);
341 	unsigned int size = sizeof(struct tnode) +
342 		(1 << tn->bits) * sizeof(struct node *);
343 
344 	if (size <= PAGE_SIZE)
345 		kfree(tn);
346 	else
347 		free_pages((unsigned long)tn, get_order(size));
348 }
349 
350 static inline void tnode_free(struct tnode *tn)
351 {
352 	call_rcu(&tn->rcu, __tnode_free_rcu);
353 }
354 
355 static struct leaf *leaf_new(void)
356 {
357 	struct leaf *l = kmalloc(sizeof(struct leaf),  GFP_KERNEL);
358 	if (l) {
359 		l->parent = T_LEAF;
360 		INIT_HLIST_HEAD(&l->list);
361 	}
362 	return l;
363 }
364 
365 static struct leaf_info *leaf_info_new(int plen)
366 {
367 	struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
368 	if (li) {
369 		li->plen = plen;
370 		INIT_LIST_HEAD(&li->falh);
371 	}
372 	return li;
373 }
374 
375 static struct tnode* tnode_new(t_key key, int pos, int bits)
376 {
377 	int nchildren = 1<<bits;
378 	int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
379 	struct tnode *tn = tnode_alloc(sz);
380 
381 	if (tn) {
382 		memset(tn, 0, sz);
383 		tn->parent = T_TNODE;
384 		tn->pos = pos;
385 		tn->bits = bits;
386 		tn->key = key;
387 		tn->full_children = 0;
388 		tn->empty_children = 1<<bits;
389 	}
390 
391 	pr_debug("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
392 		 (unsigned int) (sizeof(struct node) * 1<<bits));
393 	return tn;
394 }
395 
396 /*
397  * Check whether a tnode 'n' is "full", i.e. it is an internal node
398  * and no bits are skipped. See discussion in dyntree paper p. 6
399  */
400 
401 static inline int tnode_full(const struct tnode *tn, const struct node *n)
402 {
403 	if (n == NULL || IS_LEAF(n))
404 		return 0;
405 
406 	return ((struct tnode *) n)->pos == tn->pos + tn->bits;
407 }
408 
409 static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n)
410 {
411 	tnode_put_child_reorg(tn, i, n, -1);
412 }
413 
414  /*
415   * Add a child at position i overwriting the old value.
416   * Update the value of full_children and empty_children.
417   */
418 
419 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull)
420 {
421 	struct node *chi = tn->child[i];
422 	int isfull;
423 
424 	BUG_ON(i >= 1<<tn->bits);
425 
426 
427 	/* update emptyChildren */
428 	if (n == NULL && chi != NULL)
429 		tn->empty_children++;
430 	else if (n != NULL && chi == NULL)
431 		tn->empty_children--;
432 
433 	/* update fullChildren */
434 	if (wasfull == -1)
435 		wasfull = tnode_full(tn, chi);
436 
437 	isfull = tnode_full(tn, n);
438 	if (wasfull && !isfull)
439 		tn->full_children--;
440 	else if (!wasfull && isfull)
441 		tn->full_children++;
442 
443 	if (n)
444 		NODE_SET_PARENT(n, tn);
445 
446 	rcu_assign_pointer(tn->child[i], n);
447 }
448 
449 static struct node *resize(struct trie *t, struct tnode *tn)
450 {
451 	int i;
452 	int err = 0;
453 	struct tnode *old_tn;
454 	int inflate_threshold_use;
455 	int halve_threshold_use;
456 
457  	if (!tn)
458 		return NULL;
459 
460 	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
461 		 tn, inflate_threshold, halve_threshold);
462 
463 	/* No children */
464 	if (tn->empty_children == tnode_child_length(tn)) {
465 		tnode_free(tn);
466 		return NULL;
467 	}
468 	/* One child */
469 	if (tn->empty_children == tnode_child_length(tn) - 1)
470 		for (i = 0; i < tnode_child_length(tn); i++) {
471 			struct node *n;
472 
473 			n = tn->child[i];
474 			if (!n)
475 				continue;
476 
477 			/* compress one level */
478 			NODE_SET_PARENT(n, NULL);
479 			tnode_free(tn);
480 			return n;
481 		}
482 	/*
483 	 * Double as long as the resulting node has a number of
484 	 * nonempty nodes that are above the threshold.
485 	 */
486 
487 	/*
488 	 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
489 	 * the Helsinki University of Technology and Matti Tikkanen of Nokia
490 	 * Telecommunications, page 6:
491 	 * "A node is doubled if the ratio of non-empty children to all
492 	 * children in the *doubled* node is at least 'high'."
493 	 *
494 	 * 'high' in this instance is the variable 'inflate_threshold'. It
495 	 * is expressed as a percentage, so we multiply it with
496 	 * tnode_child_length() and instead of multiplying by 2 (since the
497 	 * child array will be doubled by inflate()) and multiplying
498 	 * the left-hand side by 100 (to handle the percentage thing) we
499 	 * multiply the left-hand side by 50.
500 	 *
501 	 * The left-hand side may look a bit weird: tnode_child_length(tn)
502 	 * - tn->empty_children is of course the number of non-null children
503 	 * in the current node. tn->full_children is the number of "full"
504 	 * children, that is non-null tnodes with a skip value of 0.
505 	 * All of those will be doubled in the resulting inflated tnode, so
506 	 * we just count them one extra time here.
507 	 *
508 	 * A clearer way to write this would be:
509 	 *
510 	 * to_be_doubled = tn->full_children;
511 	 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
512 	 *     tn->full_children;
513 	 *
514 	 * new_child_length = tnode_child_length(tn) * 2;
515 	 *
516 	 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
517 	 *      new_child_length;
518 	 * if (new_fill_factor >= inflate_threshold)
519 	 *
520 	 * ...and so on, tho it would mess up the while () loop.
521 	 *
522 	 * anyway,
523 	 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
524 	 *      inflate_threshold
525 	 *
526 	 * avoid a division:
527 	 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
528 	 *      inflate_threshold * new_child_length
529 	 *
530 	 * expand not_to_be_doubled and to_be_doubled, and shorten:
531 	 * 100 * (tnode_child_length(tn) - tn->empty_children +
532 	 *    tn->full_children) >= inflate_threshold * new_child_length
533 	 *
534 	 * expand new_child_length:
535 	 * 100 * (tnode_child_length(tn) - tn->empty_children +
536 	 *    tn->full_children) >=
537 	 *      inflate_threshold * tnode_child_length(tn) * 2
538 	 *
539 	 * shorten again:
540 	 * 50 * (tn->full_children + tnode_child_length(tn) -
541 	 *    tn->empty_children) >= inflate_threshold *
542 	 *    tnode_child_length(tn)
543 	 *
544 	 */
545 
546 	check_tnode(tn);
547 
548 	/* Keep root node larger  */
549 
550 	if(!tn->parent)
551 		inflate_threshold_use = inflate_threshold_root;
552 	else
553 		inflate_threshold_use = inflate_threshold;
554 
555 	err = 0;
556 	while ((tn->full_children > 0 &&
557 	       50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
558 				inflate_threshold_use * tnode_child_length(tn))) {
559 
560 		old_tn = tn;
561 		tn = inflate(t, tn);
562 		if (IS_ERR(tn)) {
563 			tn = old_tn;
564 #ifdef CONFIG_IP_FIB_TRIE_STATS
565 			t->stats.resize_node_skipped++;
566 #endif
567 			break;
568 		}
569 	}
570 
571 	check_tnode(tn);
572 
573 	/*
574 	 * Halve as long as the number of empty children in this
575 	 * node is above threshold.
576 	 */
577 
578 
579 	/* Keep root node larger  */
580 
581 	if(!tn->parent)
582 		halve_threshold_use = halve_threshold_root;
583 	else
584 		halve_threshold_use = halve_threshold;
585 
586 	err = 0;
587 	while (tn->bits > 1 &&
588 	       100 * (tnode_child_length(tn) - tn->empty_children) <
589 	       halve_threshold_use * tnode_child_length(tn)) {
590 
591 		old_tn = tn;
592 		tn = halve(t, tn);
593 		if (IS_ERR(tn)) {
594 			tn = old_tn;
595 #ifdef CONFIG_IP_FIB_TRIE_STATS
596 			t->stats.resize_node_skipped++;
597 #endif
598 			break;
599 		}
600 	}
601 
602 
603 	/* Only one child remains */
604 	if (tn->empty_children == tnode_child_length(tn) - 1)
605 		for (i = 0; i < tnode_child_length(tn); i++) {
606 			struct node *n;
607 
608 			n = tn->child[i];
609 			if (!n)
610 				continue;
611 
612 			/* compress one level */
613 
614 			NODE_SET_PARENT(n, NULL);
615 			tnode_free(tn);
616 			return n;
617 		}
618 
619 	return (struct node *) tn;
620 }
621 
622 static struct tnode *inflate(struct trie *t, struct tnode *tn)
623 {
624 	struct tnode *inode;
625 	struct tnode *oldtnode = tn;
626 	int olen = tnode_child_length(tn);
627 	int i;
628 
629 	pr_debug("In inflate\n");
630 
631 	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
632 
633 	if (!tn)
634 		return ERR_PTR(-ENOMEM);
635 
636 	/*
637 	 * Preallocate and store tnodes before the actual work so we
638 	 * don't get into an inconsistent state if memory allocation
639 	 * fails. In case of failure we return the oldnode and  inflate
640 	 * of tnode is ignored.
641 	 */
642 
643 	for (i = 0; i < olen; i++) {
644 		struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i);
645 
646 		if (inode &&
647 		    IS_TNODE(inode) &&
648 		    inode->pos == oldtnode->pos + oldtnode->bits &&
649 		    inode->bits > 1) {
650 			struct tnode *left, *right;
651 			t_key m = TKEY_GET_MASK(inode->pos, 1);
652 
653 			left = tnode_new(inode->key&(~m), inode->pos + 1,
654 					 inode->bits - 1);
655 			if (!left)
656 				goto nomem;
657 
658 			right = tnode_new(inode->key|m, inode->pos + 1,
659 					  inode->bits - 1);
660 
661                         if (!right) {
662 				tnode_free(left);
663 				goto nomem;
664                         }
665 
666 			put_child(t, tn, 2*i, (struct node *) left);
667 			put_child(t, tn, 2*i+1, (struct node *) right);
668 		}
669 	}
670 
671 	for (i = 0; i < olen; i++) {
672 		struct node *node = tnode_get_child(oldtnode, i);
673 		struct tnode *left, *right;
674 		int size, j;
675 
676 		/* An empty child */
677 		if (node == NULL)
678 			continue;
679 
680 		/* A leaf or an internal node with skipped bits */
681 
682 		if (IS_LEAF(node) || ((struct tnode *) node)->pos >
683 		   tn->pos + tn->bits - 1) {
684 			if (tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits,
685 					     1) == 0)
686 				put_child(t, tn, 2*i, node);
687 			else
688 				put_child(t, tn, 2*i+1, node);
689 			continue;
690 		}
691 
692 		/* An internal node with two children */
693 		inode = (struct tnode *) node;
694 
695 		if (inode->bits == 1) {
696 			put_child(t, tn, 2*i, inode->child[0]);
697 			put_child(t, tn, 2*i+1, inode->child[1]);
698 
699 			tnode_free(inode);
700 			continue;
701 		}
702 
703 		/* An internal node with more than two children */
704 
705 		/* We will replace this node 'inode' with two new
706 		 * ones, 'left' and 'right', each with half of the
707 		 * original children. The two new nodes will have
708 		 * a position one bit further down the key and this
709 		 * means that the "significant" part of their keys
710 		 * (see the discussion near the top of this file)
711 		 * will differ by one bit, which will be "0" in
712 		 * left's key and "1" in right's key. Since we are
713 		 * moving the key position by one step, the bit that
714 		 * we are moving away from - the bit at position
715 		 * (inode->pos) - is the one that will differ between
716 		 * left and right. So... we synthesize that bit in the
717 		 * two  new keys.
718 		 * The mask 'm' below will be a single "one" bit at
719 		 * the position (inode->pos)
720 		 */
721 
722 		/* Use the old key, but set the new significant
723 		 *   bit to zero.
724 		 */
725 
726 		left = (struct tnode *) tnode_get_child(tn, 2*i);
727 		put_child(t, tn, 2*i, NULL);
728 
729 		BUG_ON(!left);
730 
731 		right = (struct tnode *) tnode_get_child(tn, 2*i+1);
732 		put_child(t, tn, 2*i+1, NULL);
733 
734 		BUG_ON(!right);
735 
736 		size = tnode_child_length(left);
737 		for (j = 0; j < size; j++) {
738 			put_child(t, left, j, inode->child[j]);
739 			put_child(t, right, j, inode->child[j + size]);
740 		}
741 		put_child(t, tn, 2*i, resize(t, left));
742 		put_child(t, tn, 2*i+1, resize(t, right));
743 
744 		tnode_free(inode);
745 	}
746 	tnode_free(oldtnode);
747 	return tn;
748 nomem:
749 	{
750 		int size = tnode_child_length(tn);
751 		int j;
752 
753 		for (j = 0; j < size; j++)
754 			if (tn->child[j])
755 				tnode_free((struct tnode *)tn->child[j]);
756 
757 		tnode_free(tn);
758 
759 		return ERR_PTR(-ENOMEM);
760 	}
761 }
762 
763 static struct tnode *halve(struct trie *t, struct tnode *tn)
764 {
765 	struct tnode *oldtnode = tn;
766 	struct node *left, *right;
767 	int i;
768 	int olen = tnode_child_length(tn);
769 
770 	pr_debug("In halve\n");
771 
772 	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
773 
774 	if (!tn)
775 		return ERR_PTR(-ENOMEM);
776 
777 	/*
778 	 * Preallocate and store tnodes before the actual work so we
779 	 * don't get into an inconsistent state if memory allocation
780 	 * fails. In case of failure we return the oldnode and halve
781 	 * of tnode is ignored.
782 	 */
783 
784 	for (i = 0; i < olen; i += 2) {
785 		left = tnode_get_child(oldtnode, i);
786 		right = tnode_get_child(oldtnode, i+1);
787 
788 		/* Two nonempty children */
789 		if (left && right) {
790 			struct tnode *newn;
791 
792 			newn = tnode_new(left->key, tn->pos + tn->bits, 1);
793 
794 			if (!newn)
795 				goto nomem;
796 
797 			put_child(t, tn, i/2, (struct node *)newn);
798 		}
799 
800 	}
801 
802 	for (i = 0; i < olen; i += 2) {
803 		struct tnode *newBinNode;
804 
805 		left = tnode_get_child(oldtnode, i);
806 		right = tnode_get_child(oldtnode, i+1);
807 
808 		/* At least one of the children is empty */
809 		if (left == NULL) {
810 			if (right == NULL)    /* Both are empty */
811 				continue;
812 			put_child(t, tn, i/2, right);
813 			continue;
814 		}
815 
816 		if (right == NULL) {
817 			put_child(t, tn, i/2, left);
818 			continue;
819 		}
820 
821 		/* Two nonempty children */
822 		newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
823 		put_child(t, tn, i/2, NULL);
824 		put_child(t, newBinNode, 0, left);
825 		put_child(t, newBinNode, 1, right);
826 		put_child(t, tn, i/2, resize(t, newBinNode));
827 	}
828 	tnode_free(oldtnode);
829 	return tn;
830 nomem:
831 	{
832 		int size = tnode_child_length(tn);
833 		int j;
834 
835 		for (j = 0; j < size; j++)
836 			if (tn->child[j])
837 				tnode_free((struct tnode *)tn->child[j]);
838 
839 		tnode_free(tn);
840 
841 		return ERR_PTR(-ENOMEM);
842 	}
843 }
844 
845 static void trie_init(struct trie *t)
846 {
847 	if (!t)
848 		return;
849 
850 	t->size = 0;
851 	rcu_assign_pointer(t->trie, NULL);
852 	t->revision = 0;
853 #ifdef CONFIG_IP_FIB_TRIE_STATS
854 	memset(&t->stats, 0, sizeof(struct trie_use_stats));
855 #endif
856 }
857 
858 /* readside must use rcu_read_lock currently dump routines
859  via get_fa_head and dump */
860 
861 static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
862 {
863 	struct hlist_head *head = &l->list;
864 	struct hlist_node *node;
865 	struct leaf_info *li;
866 
867 	hlist_for_each_entry_rcu(li, node, head, hlist)
868 		if (li->plen == plen)
869 			return li;
870 
871 	return NULL;
872 }
873 
874 static inline struct list_head * get_fa_head(struct leaf *l, int plen)
875 {
876 	struct leaf_info *li = find_leaf_info(l, plen);
877 
878 	if (!li)
879 		return NULL;
880 
881 	return &li->falh;
882 }
883 
884 static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
885 {
886         struct leaf_info *li = NULL, *last = NULL;
887         struct hlist_node *node;
888 
889         if (hlist_empty(head)) {
890                 hlist_add_head_rcu(&new->hlist, head);
891         } else {
892                 hlist_for_each_entry(li, node, head, hlist) {
893                         if (new->plen > li->plen)
894                                 break;
895 
896                         last = li;
897                 }
898                 if (last)
899                         hlist_add_after_rcu(&last->hlist, &new->hlist);
900                 else
901                         hlist_add_before_rcu(&new->hlist, &li->hlist);
902         }
903 }
904 
905 /* rcu_read_lock needs to be hold by caller from readside */
906 
907 static struct leaf *
908 fib_find_node(struct trie *t, u32 key)
909 {
910 	int pos;
911 	struct tnode *tn;
912 	struct node *n;
913 
914 	pos = 0;
915 	n = rcu_dereference(t->trie);
916 
917 	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
918 		tn = (struct tnode *) n;
919 
920 		check_tnode(tn);
921 
922 		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
923 			pos = tn->pos + tn->bits;
924 			n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
925 		} else
926 			break;
927 	}
928 	/* Case we have found a leaf. Compare prefixes */
929 
930 	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
931 		return (struct leaf *)n;
932 
933 	return NULL;
934 }
935 
936 static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
937 {
938 	int wasfull;
939 	t_key cindex, key;
940 	struct tnode *tp = NULL;
941 
942 	key = tn->key;
943 
944 	while (tn != NULL && NODE_PARENT(tn) != NULL) {
945 
946 		tp = NODE_PARENT(tn);
947 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
948 		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
949 		tn = (struct tnode *) resize (t, (struct tnode *)tn);
950 		tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull);
951 
952 		if (!NODE_PARENT(tn))
953 			break;
954 
955 		tn = NODE_PARENT(tn);
956 	}
957 	/* Handle last (top) tnode */
958 	if (IS_TNODE(tn))
959 		tn = (struct tnode*) resize(t, (struct tnode *)tn);
960 
961 	return (struct node*) tn;
962 }
963 
964 /* only used from updater-side */
965 
966 static  struct list_head *
967 fib_insert_node(struct trie *t, int *err, u32 key, int plen)
968 {
969 	int pos, newpos;
970 	struct tnode *tp = NULL, *tn = NULL;
971 	struct node *n;
972 	struct leaf *l;
973 	int missbit;
974 	struct list_head *fa_head = NULL;
975 	struct leaf_info *li;
976 	t_key cindex;
977 
978 	pos = 0;
979 	n = t->trie;
980 
981 	/* If we point to NULL, stop. Either the tree is empty and we should
982 	 * just put a new leaf in if, or we have reached an empty child slot,
983 	 * and we should just put our new leaf in that.
984 	 * If we point to a T_TNODE, check if it matches our key. Note that
985 	 * a T_TNODE might be skipping any number of bits - its 'pos' need
986 	 * not be the parent's 'pos'+'bits'!
987 	 *
988 	 * If it does match the current key, get pos/bits from it, extract
989 	 * the index from our key, push the T_TNODE and walk the tree.
990 	 *
991 	 * If it doesn't, we have to replace it with a new T_TNODE.
992 	 *
993 	 * If we point to a T_LEAF, it might or might not have the same key
994 	 * as we do. If it does, just change the value, update the T_LEAF's
995 	 * value, and return it.
996 	 * If it doesn't, we need to replace it with a T_TNODE.
997 	 */
998 
999 	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
1000 		tn = (struct tnode *) n;
1001 
1002 		check_tnode(tn);
1003 
1004 		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
1005 			tp = tn;
1006 			pos = tn->pos + tn->bits;
1007 			n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
1008 
1009 			BUG_ON(n && NODE_PARENT(n) != tn);
1010 		} else
1011 			break;
1012 	}
1013 
1014 	/*
1015 	 * n  ----> NULL, LEAF or TNODE
1016 	 *
1017 	 * tp is n's (parent) ----> NULL or TNODE
1018 	 */
1019 
1020 	BUG_ON(tp && IS_LEAF(tp));
1021 
1022 	/* Case 1: n is a leaf. Compare prefixes */
1023 
1024 	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1025 		struct leaf *l = (struct leaf *) n;
1026 
1027 		li = leaf_info_new(plen);
1028 
1029 		if (!li) {
1030 			*err = -ENOMEM;
1031 			goto err;
1032 		}
1033 
1034 		fa_head = &li->falh;
1035 		insert_leaf_info(&l->list, li);
1036 		goto done;
1037 	}
1038 	t->size++;
1039 	l = leaf_new();
1040 
1041 	if (!l) {
1042 		*err = -ENOMEM;
1043 		goto err;
1044 	}
1045 
1046 	l->key = key;
1047 	li = leaf_info_new(plen);
1048 
1049 	if (!li) {
1050 		tnode_free((struct tnode *) l);
1051 		*err = -ENOMEM;
1052 		goto err;
1053 	}
1054 
1055 	fa_head = &li->falh;
1056 	insert_leaf_info(&l->list, li);
1057 
1058 	if (t->trie && n == NULL) {
1059 		/* Case 2: n is NULL, and will just insert a new leaf */
1060 
1061 		NODE_SET_PARENT(l, tp);
1062 
1063 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1064 		put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
1065 	} else {
1066 		/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1067 		/*
1068 		 *  Add a new tnode here
1069 		 *  first tnode need some special handling
1070 		 */
1071 
1072 		if (tp)
1073 			pos = tp->pos+tp->bits;
1074 		else
1075 			pos = 0;
1076 
1077 		if (n) {
1078 			newpos = tkey_mismatch(key, pos, n->key);
1079 			tn = tnode_new(n->key, newpos, 1);
1080 		} else {
1081 			newpos = 0;
1082 			tn = tnode_new(key, newpos, 1); /* First tnode */
1083 		}
1084 
1085 		if (!tn) {
1086 			free_leaf_info(li);
1087 			tnode_free((struct tnode *) l);
1088 			*err = -ENOMEM;
1089 			goto err;
1090 		}
1091 
1092 		NODE_SET_PARENT(tn, tp);
1093 
1094 		missbit = tkey_extract_bits(key, newpos, 1);
1095 		put_child(t, tn, missbit, (struct node *)l);
1096 		put_child(t, tn, 1-missbit, n);
1097 
1098 		if (tp) {
1099 			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1100 			put_child(t, (struct tnode *)tp, cindex, (struct node *)tn);
1101 		} else {
1102 			rcu_assign_pointer(t->trie, (struct node *)tn); /* First tnode */
1103 			tp = tn;
1104 		}
1105 	}
1106 
1107 	if (tp && tp->pos + tp->bits > 32)
1108 		printk(KERN_WARNING "fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1109 		       tp, tp->pos, tp->bits, key, plen);
1110 
1111 	/* Rebalance the trie */
1112 
1113 	rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
1114 done:
1115 	t->revision++;
1116 err:
1117 	return fa_head;
1118 }
1119 
1120 static int
1121 fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1122 	       struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
1123 {
1124 	struct trie *t = (struct trie *) tb->tb_data;
1125 	struct fib_alias *fa, *new_fa;
1126 	struct list_head *fa_head = NULL;
1127 	struct fib_info *fi;
1128 	int plen = r->rtm_dst_len;
1129 	int type = r->rtm_type;
1130 	u8 tos = r->rtm_tos;
1131 	u32 key, mask;
1132 	int err;
1133 	struct leaf *l;
1134 
1135 	if (plen > 32)
1136 		return -EINVAL;
1137 
1138 	key = 0;
1139 	if (rta->rta_dst)
1140 		memcpy(&key, rta->rta_dst, 4);
1141 
1142 	key = ntohl(key);
1143 
1144 	pr_debug("Insert table=%d %08x/%d\n", tb->tb_id, key, plen);
1145 
1146 	mask = ntohl(inet_make_mask(plen));
1147 
1148 	if (key & ~mask)
1149 		return -EINVAL;
1150 
1151 	key = key & mask;
1152 
1153 	fi = fib_create_info(r, rta, nlhdr, &err);
1154 
1155 	if (!fi)
1156 		goto err;
1157 
1158 	l = fib_find_node(t, key);
1159 	fa = NULL;
1160 
1161 	if (l) {
1162 		fa_head = get_fa_head(l, plen);
1163 		fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1164 	}
1165 
1166 	/* Now fa, if non-NULL, points to the first fib alias
1167 	 * with the same keys [prefix,tos,priority], if such key already
1168 	 * exists or to the node before which we will insert new one.
1169 	 *
1170 	 * If fa is NULL, we will need to allocate a new one and
1171 	 * insert to the head of f.
1172 	 *
1173 	 * If f is NULL, no fib node matched the destination key
1174 	 * and we need to allocate a new one of those as well.
1175 	 */
1176 
1177 	if (fa && fa->fa_info->fib_priority == fi->fib_priority) {
1178 		struct fib_alias *fa_orig;
1179 
1180 		err = -EEXIST;
1181 		if (nlhdr->nlmsg_flags & NLM_F_EXCL)
1182 			goto out;
1183 
1184 		if (nlhdr->nlmsg_flags & NLM_F_REPLACE) {
1185 			struct fib_info *fi_drop;
1186 			u8 state;
1187 
1188 			err = -ENOBUFS;
1189 			new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL);
1190 			if (new_fa == NULL)
1191 				goto out;
1192 
1193 			fi_drop = fa->fa_info;
1194 			new_fa->fa_tos = fa->fa_tos;
1195 			new_fa->fa_info = fi;
1196 			new_fa->fa_type = type;
1197 			new_fa->fa_scope = r->rtm_scope;
1198 			state = fa->fa_state;
1199 			new_fa->fa_state &= ~FA_S_ACCESSED;
1200 
1201 			list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1202 			alias_free_mem_rcu(fa);
1203 
1204 			fib_release_info(fi_drop);
1205 			if (state & FA_S_ACCESSED)
1206 				rt_cache_flush(-1);
1207 
1208 			goto succeeded;
1209 		}
1210 		/* Error if we find a perfect match which
1211 		 * uses the same scope, type, and nexthop
1212 		 * information.
1213 		 */
1214 		fa_orig = fa;
1215 		list_for_each_entry(fa, fa_orig->fa_list.prev, fa_list) {
1216 			if (fa->fa_tos != tos)
1217 				break;
1218 			if (fa->fa_info->fib_priority != fi->fib_priority)
1219 				break;
1220 			if (fa->fa_type == type &&
1221 			    fa->fa_scope == r->rtm_scope &&
1222 			    fa->fa_info == fi) {
1223 				goto out;
1224 			}
1225 		}
1226 		if (!(nlhdr->nlmsg_flags & NLM_F_APPEND))
1227 			fa = fa_orig;
1228 	}
1229 	err = -ENOENT;
1230 	if (!(nlhdr->nlmsg_flags & NLM_F_CREATE))
1231 		goto out;
1232 
1233 	err = -ENOBUFS;
1234 	new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL);
1235 	if (new_fa == NULL)
1236 		goto out;
1237 
1238 	new_fa->fa_info = fi;
1239 	new_fa->fa_tos = tos;
1240 	new_fa->fa_type = type;
1241 	new_fa->fa_scope = r->rtm_scope;
1242 	new_fa->fa_state = 0;
1243 	/*
1244 	 * Insert new entry to the list.
1245 	 */
1246 
1247 	if (!fa_head) {
1248 		fa_head = fib_insert_node(t, &err, key, plen);
1249 		err = 0;
1250 		if (err)
1251 			goto out_free_new_fa;
1252 	}
1253 
1254 	list_add_tail_rcu(&new_fa->fa_list,
1255 			  (fa ? &fa->fa_list : fa_head));
1256 
1257 	rt_cache_flush(-1);
1258 	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, req);
1259 succeeded:
1260 	return 0;
1261 
1262 out_free_new_fa:
1263 	kmem_cache_free(fn_alias_kmem, new_fa);
1264 out:
1265 	fib_release_info(fi);
1266 err:
1267 	return err;
1268 }
1269 
1270 
1271 /* should be called with rcu_read_lock */
1272 static inline int check_leaf(struct trie *t, struct leaf *l,
1273 			     t_key key, int *plen, const struct flowi *flp,
1274 			     struct fib_result *res)
1275 {
1276 	int err, i;
1277 	t_key mask;
1278 	struct leaf_info *li;
1279 	struct hlist_head *hhead = &l->list;
1280 	struct hlist_node *node;
1281 
1282 	hlist_for_each_entry_rcu(li, node, hhead, hlist) {
1283 		i = li->plen;
1284 		mask = ntohl(inet_make_mask(i));
1285 		if (l->key != (key & mask))
1286 			continue;
1287 
1288 		if ((err = fib_semantic_match(&li->falh, flp, res, l->key, mask, i)) <= 0) {
1289 			*plen = i;
1290 #ifdef CONFIG_IP_FIB_TRIE_STATS
1291 			t->stats.semantic_match_passed++;
1292 #endif
1293 			return err;
1294 		}
1295 #ifdef CONFIG_IP_FIB_TRIE_STATS
1296 		t->stats.semantic_match_miss++;
1297 #endif
1298 	}
1299 	return 1;
1300 }
1301 
1302 static int
1303 fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1304 {
1305 	struct trie *t = (struct trie *) tb->tb_data;
1306 	int plen, ret = 0;
1307 	struct node *n;
1308 	struct tnode *pn;
1309 	int pos, bits;
1310 	t_key key = ntohl(flp->fl4_dst);
1311 	int chopped_off;
1312 	t_key cindex = 0;
1313 	int current_prefix_length = KEYLENGTH;
1314 	struct tnode *cn;
1315 	t_key node_prefix, key_prefix, pref_mismatch;
1316 	int mp;
1317 
1318 	rcu_read_lock();
1319 
1320 	n = rcu_dereference(t->trie);
1321 	if (!n)
1322 		goto failed;
1323 
1324 #ifdef CONFIG_IP_FIB_TRIE_STATS
1325 	t->stats.gets++;
1326 #endif
1327 
1328 	/* Just a leaf? */
1329 	if (IS_LEAF(n)) {
1330 		if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
1331 			goto found;
1332 		goto failed;
1333 	}
1334 	pn = (struct tnode *) n;
1335 	chopped_off = 0;
1336 
1337 	while (pn) {
1338 		pos = pn->pos;
1339 		bits = pn->bits;
1340 
1341 		if (!chopped_off)
1342 			cindex = tkey_extract_bits(MASK_PFX(key, current_prefix_length), pos, bits);
1343 
1344 		n = tnode_get_child(pn, cindex);
1345 
1346 		if (n == NULL) {
1347 #ifdef CONFIG_IP_FIB_TRIE_STATS
1348 			t->stats.null_node_hit++;
1349 #endif
1350 			goto backtrace;
1351 		}
1352 
1353 		if (IS_LEAF(n)) {
1354 			if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
1355 				goto found;
1356 			else
1357 				goto backtrace;
1358 		}
1359 
1360 #define HL_OPTIMIZE
1361 #ifdef HL_OPTIMIZE
1362 		cn = (struct tnode *)n;
1363 
1364 		/*
1365 		 * It's a tnode, and we can do some extra checks here if we
1366 		 * like, to avoid descending into a dead-end branch.
1367 		 * This tnode is in the parent's child array at index
1368 		 * key[p_pos..p_pos+p_bits] but potentially with some bits
1369 		 * chopped off, so in reality the index may be just a
1370 		 * subprefix, padded with zero at the end.
1371 		 * We can also take a look at any skipped bits in this
1372 		 * tnode - everything up to p_pos is supposed to be ok,
1373 		 * and the non-chopped bits of the index (se previous
1374 		 * paragraph) are also guaranteed ok, but the rest is
1375 		 * considered unknown.
1376 		 *
1377 		 * The skipped bits are key[pos+bits..cn->pos].
1378 		 */
1379 
1380 		/* If current_prefix_length < pos+bits, we are already doing
1381 		 * actual prefix  matching, which means everything from
1382 		 * pos+(bits-chopped_off) onward must be zero along some
1383 		 * branch of this subtree - otherwise there is *no* valid
1384 		 * prefix present. Here we can only check the skipped
1385 		 * bits. Remember, since we have already indexed into the
1386 		 * parent's child array, we know that the bits we chopped of
1387 		 * *are* zero.
1388 		 */
1389 
1390 		/* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */
1391 
1392 		if (current_prefix_length < pos+bits) {
1393 			if (tkey_extract_bits(cn->key, current_prefix_length,
1394 						cn->pos - current_prefix_length) != 0 ||
1395 			    !(cn->child[0]))
1396 				goto backtrace;
1397 		}
1398 
1399 		/*
1400 		 * If chopped_off=0, the index is fully validated and we
1401 		 * only need to look at the skipped bits for this, the new,
1402 		 * tnode. What we actually want to do is to find out if
1403 		 * these skipped bits match our key perfectly, or if we will
1404 		 * have to count on finding a matching prefix further down,
1405 		 * because if we do, we would like to have some way of
1406 		 * verifying the existence of such a prefix at this point.
1407 		 */
1408 
1409 		/* The only thing we can do at this point is to verify that
1410 		 * any such matching prefix can indeed be a prefix to our
1411 		 * key, and if the bits in the node we are inspecting that
1412 		 * do not match our key are not ZERO, this cannot be true.
1413 		 * Thus, find out where there is a mismatch (before cn->pos)
1414 		 * and verify that all the mismatching bits are zero in the
1415 		 * new tnode's key.
1416 		 */
1417 
1418 		/* Note: We aren't very concerned about the piece of the key
1419 		 * that precede pn->pos+pn->bits, since these have already been
1420 		 * checked. The bits after cn->pos aren't checked since these are
1421 		 * by definition "unknown" at this point. Thus, what we want to
1422 		 * see is if we are about to enter the "prefix matching" state,
1423 		 * and in that case verify that the skipped bits that will prevail
1424 		 * throughout this subtree are zero, as they have to be if we are
1425 		 * to find a matching prefix.
1426 		 */
1427 
1428 		node_prefix = MASK_PFX(cn->key, cn->pos);
1429 		key_prefix = MASK_PFX(key, cn->pos);
1430 		pref_mismatch = key_prefix^node_prefix;
1431 		mp = 0;
1432 
1433 		/* In short: If skipped bits in this node do not match the search
1434 		 * key, enter the "prefix matching" state.directly.
1435 		 */
1436 		if (pref_mismatch) {
1437 			while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1438 				mp++;
1439 				pref_mismatch = pref_mismatch <<1;
1440 			}
1441 			key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1442 
1443 			if (key_prefix != 0)
1444 				goto backtrace;
1445 
1446 			if (current_prefix_length >= cn->pos)
1447 				current_prefix_length = mp;
1448 		}
1449 #endif
1450 		pn = (struct tnode *)n; /* Descend */
1451 		chopped_off = 0;
1452 		continue;
1453 
1454 backtrace:
1455 		chopped_off++;
1456 
1457 		/* As zero don't change the child key (cindex) */
1458 		while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1))))
1459 			chopped_off++;
1460 
1461 		/* Decrease current_... with bits chopped off */
1462 		if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1463 			current_prefix_length = pn->pos + pn->bits - chopped_off;
1464 
1465 		/*
1466 		 * Either we do the actual chop off according or if we have
1467 		 * chopped off all bits in this tnode walk up to our parent.
1468 		 */
1469 
1470 		if (chopped_off <= pn->bits) {
1471 			cindex &= ~(1 << (chopped_off-1));
1472 		} else {
1473 			if (NODE_PARENT(pn) == NULL)
1474 				goto failed;
1475 
1476 			/* Get Child's index */
1477 			cindex = tkey_extract_bits(pn->key, NODE_PARENT(pn)->pos, NODE_PARENT(pn)->bits);
1478 			pn = NODE_PARENT(pn);
1479 			chopped_off = 0;
1480 
1481 #ifdef CONFIG_IP_FIB_TRIE_STATS
1482 			t->stats.backtrack++;
1483 #endif
1484 			goto backtrace;
1485 		}
1486 	}
1487 failed:
1488 	ret = 1;
1489 found:
1490 	rcu_read_unlock();
1491 	return ret;
1492 }
1493 
1494 /* only called from updater side */
1495 static int trie_leaf_remove(struct trie *t, t_key key)
1496 {
1497 	t_key cindex;
1498 	struct tnode *tp = NULL;
1499 	struct node *n = t->trie;
1500 	struct leaf *l;
1501 
1502 	pr_debug("entering trie_leaf_remove(%p)\n", n);
1503 
1504 	/* Note that in the case skipped bits, those bits are *not* checked!
1505 	 * When we finish this, we will have NULL or a T_LEAF, and the
1506 	 * T_LEAF may or may not match our key.
1507 	 */
1508 
1509 	while (n != NULL && IS_TNODE(n)) {
1510 		struct tnode *tn = (struct tnode *) n;
1511 		check_tnode(tn);
1512 		n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits));
1513 
1514 		BUG_ON(n && NODE_PARENT(n) != tn);
1515 	}
1516 	l = (struct leaf *) n;
1517 
1518 	if (!n || !tkey_equals(l->key, key))
1519 		return 0;
1520 
1521 	/*
1522 	 * Key found.
1523 	 * Remove the leaf and rebalance the tree
1524 	 */
1525 
1526 	t->revision++;
1527 	t->size--;
1528 
1529 	preempt_disable();
1530 	tp = NODE_PARENT(n);
1531 	tnode_free((struct tnode *) n);
1532 
1533 	if (tp) {
1534 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1535 		put_child(t, (struct tnode *)tp, cindex, NULL);
1536 		rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
1537 	} else
1538 		rcu_assign_pointer(t->trie, NULL);
1539 	preempt_enable();
1540 
1541 	return 1;
1542 }
1543 
1544 static int
1545 fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1546 		struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
1547 {
1548 	struct trie *t = (struct trie *) tb->tb_data;
1549 	u32 key, mask;
1550 	int plen = r->rtm_dst_len;
1551 	u8 tos = r->rtm_tos;
1552 	struct fib_alias *fa, *fa_to_delete;
1553 	struct list_head *fa_head;
1554 	struct leaf *l;
1555 	struct leaf_info *li;
1556 
1557 
1558 	if (plen > 32)
1559 		return -EINVAL;
1560 
1561 	key = 0;
1562 	if (rta->rta_dst)
1563 		memcpy(&key, rta->rta_dst, 4);
1564 
1565 	key = ntohl(key);
1566 	mask = ntohl(inet_make_mask(plen));
1567 
1568 	if (key & ~mask)
1569 		return -EINVAL;
1570 
1571 	key = key & mask;
1572 	l = fib_find_node(t, key);
1573 
1574 	if (!l)
1575 		return -ESRCH;
1576 
1577 	fa_head = get_fa_head(l, plen);
1578 	fa = fib_find_alias(fa_head, tos, 0);
1579 
1580 	if (!fa)
1581 		return -ESRCH;
1582 
1583 	pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1584 
1585 	fa_to_delete = NULL;
1586 	fa_head = fa->fa_list.prev;
1587 
1588 	list_for_each_entry(fa, fa_head, fa_list) {
1589 		struct fib_info *fi = fa->fa_info;
1590 
1591 		if (fa->fa_tos != tos)
1592 			break;
1593 
1594 		if ((!r->rtm_type ||
1595 		     fa->fa_type == r->rtm_type) &&
1596 		    (r->rtm_scope == RT_SCOPE_NOWHERE ||
1597 		     fa->fa_scope == r->rtm_scope) &&
1598 		    (!r->rtm_protocol ||
1599 		     fi->fib_protocol == r->rtm_protocol) &&
1600 		    fib_nh_match(r, nlhdr, rta, fi) == 0) {
1601 			fa_to_delete = fa;
1602 			break;
1603 		}
1604 	}
1605 
1606 	if (!fa_to_delete)
1607 		return -ESRCH;
1608 
1609 	fa = fa_to_delete;
1610 	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, nlhdr, req);
1611 
1612 	l = fib_find_node(t, key);
1613 	li = find_leaf_info(l, plen);
1614 
1615 	list_del_rcu(&fa->fa_list);
1616 
1617 	if (list_empty(fa_head)) {
1618 		hlist_del_rcu(&li->hlist);
1619 		free_leaf_info(li);
1620 	}
1621 
1622 	if (hlist_empty(&l->list))
1623 		trie_leaf_remove(t, key);
1624 
1625 	if (fa->fa_state & FA_S_ACCESSED)
1626 		rt_cache_flush(-1);
1627 
1628 	fib_release_info(fa->fa_info);
1629 	alias_free_mem_rcu(fa);
1630 	return 0;
1631 }
1632 
1633 static int trie_flush_list(struct trie *t, struct list_head *head)
1634 {
1635 	struct fib_alias *fa, *fa_node;
1636 	int found = 0;
1637 
1638 	list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1639 		struct fib_info *fi = fa->fa_info;
1640 
1641 		if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1642 			list_del_rcu(&fa->fa_list);
1643 			fib_release_info(fa->fa_info);
1644 			alias_free_mem_rcu(fa);
1645 			found++;
1646 		}
1647 	}
1648 	return found;
1649 }
1650 
1651 static int trie_flush_leaf(struct trie *t, struct leaf *l)
1652 {
1653 	int found = 0;
1654 	struct hlist_head *lih = &l->list;
1655 	struct hlist_node *node, *tmp;
1656 	struct leaf_info *li = NULL;
1657 
1658 	hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1659 		found += trie_flush_list(t, &li->falh);
1660 
1661 		if (list_empty(&li->falh)) {
1662 			hlist_del_rcu(&li->hlist);
1663 			free_leaf_info(li);
1664 		}
1665 	}
1666 	return found;
1667 }
1668 
1669 /* rcu_read_lock needs to be hold by caller from readside */
1670 
1671 static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
1672 {
1673 	struct node *c = (struct node *) thisleaf;
1674 	struct tnode *p;
1675 	int idx;
1676 	struct node *trie = rcu_dereference(t->trie);
1677 
1678 	if (c == NULL) {
1679 		if (trie == NULL)
1680 			return NULL;
1681 
1682 		if (IS_LEAF(trie))          /* trie w. just a leaf */
1683 			return (struct leaf *) trie;
1684 
1685 		p = (struct tnode*) trie;  /* Start */
1686 	} else
1687 		p = (struct tnode *) NODE_PARENT(c);
1688 
1689 	while (p) {
1690 		int pos, last;
1691 
1692 		/*  Find the next child of the parent */
1693 		if (c)
1694 			pos = 1 + tkey_extract_bits(c->key, p->pos, p->bits);
1695 		else
1696 			pos = 0;
1697 
1698 		last = 1 << p->bits;
1699 		for (idx = pos; idx < last ; idx++) {
1700 			c = rcu_dereference(p->child[idx]);
1701 
1702 			if (!c)
1703 				continue;
1704 
1705 			/* Decend if tnode */
1706 			while (IS_TNODE(c)) {
1707 				p = (struct tnode *) c;
1708   				idx = 0;
1709 
1710 				/* Rightmost non-NULL branch */
1711 				if (p && IS_TNODE(p))
1712 					while (!(c = rcu_dereference(p->child[idx]))
1713 					       && idx < (1<<p->bits)) idx++;
1714 
1715 				/* Done with this tnode? */
1716 				if (idx >= (1 << p->bits) || !c)
1717 					goto up;
1718 			}
1719 			return (struct leaf *) c;
1720 		}
1721 up:
1722 		/* No more children go up one step  */
1723 		c = (struct node *) p;
1724 		p = (struct tnode *) NODE_PARENT(p);
1725 	}
1726 	return NULL; /* Ready. Root of trie */
1727 }
1728 
1729 static int fn_trie_flush(struct fib_table *tb)
1730 {
1731 	struct trie *t = (struct trie *) tb->tb_data;
1732 	struct leaf *ll = NULL, *l = NULL;
1733 	int found = 0, h;
1734 
1735 	t->revision++;
1736 
1737 	for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
1738 		found += trie_flush_leaf(t, l);
1739 
1740 		if (ll && hlist_empty(&ll->list))
1741 			trie_leaf_remove(t, ll->key);
1742 		ll = l;
1743 	}
1744 
1745 	if (ll && hlist_empty(&ll->list))
1746 		trie_leaf_remove(t, ll->key);
1747 
1748 	pr_debug("trie_flush found=%d\n", found);
1749 	return found;
1750 }
1751 
1752 static int trie_last_dflt = -1;
1753 
1754 static void
1755 fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1756 {
1757 	struct trie *t = (struct trie *) tb->tb_data;
1758 	int order, last_idx;
1759 	struct fib_info *fi = NULL;
1760 	struct fib_info *last_resort;
1761 	struct fib_alias *fa = NULL;
1762 	struct list_head *fa_head;
1763 	struct leaf *l;
1764 
1765 	last_idx = -1;
1766 	last_resort = NULL;
1767 	order = -1;
1768 
1769 	rcu_read_lock();
1770 
1771 	l = fib_find_node(t, 0);
1772 	if (!l)
1773 		goto out;
1774 
1775 	fa_head = get_fa_head(l, 0);
1776 	if (!fa_head)
1777 		goto out;
1778 
1779 	if (list_empty(fa_head))
1780 		goto out;
1781 
1782 	list_for_each_entry_rcu(fa, fa_head, fa_list) {
1783 		struct fib_info *next_fi = fa->fa_info;
1784 
1785 		if (fa->fa_scope != res->scope ||
1786 		    fa->fa_type != RTN_UNICAST)
1787 			continue;
1788 
1789 		if (next_fi->fib_priority > res->fi->fib_priority)
1790 			break;
1791 		if (!next_fi->fib_nh[0].nh_gw ||
1792 		    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1793 			continue;
1794 		fa->fa_state |= FA_S_ACCESSED;
1795 
1796 		if (fi == NULL) {
1797 			if (next_fi != res->fi)
1798 				break;
1799 		} else if (!fib_detect_death(fi, order, &last_resort,
1800 					     &last_idx, &trie_last_dflt)) {
1801 			if (res->fi)
1802 				fib_info_put(res->fi);
1803 			res->fi = fi;
1804 			atomic_inc(&fi->fib_clntref);
1805 			trie_last_dflt = order;
1806 			goto out;
1807 		}
1808 		fi = next_fi;
1809 		order++;
1810 	}
1811 	if (order <= 0 || fi == NULL) {
1812 		trie_last_dflt = -1;
1813 		goto out;
1814 	}
1815 
1816 	if (!fib_detect_death(fi, order, &last_resort, &last_idx, &trie_last_dflt)) {
1817 		if (res->fi)
1818 			fib_info_put(res->fi);
1819 		res->fi = fi;
1820 		atomic_inc(&fi->fib_clntref);
1821 		trie_last_dflt = order;
1822 		goto out;
1823 	}
1824 	if (last_idx >= 0) {
1825 		if (res->fi)
1826 			fib_info_put(res->fi);
1827 		res->fi = last_resort;
1828 		if (last_resort)
1829 			atomic_inc(&last_resort->fib_clntref);
1830 	}
1831 	trie_last_dflt = last_idx;
1832  out:;
1833 	rcu_read_unlock();
1834 }
1835 
1836 static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb,
1837 			   struct sk_buff *skb, struct netlink_callback *cb)
1838 {
1839 	int i, s_i;
1840 	struct fib_alias *fa;
1841 
1842 	u32 xkey = htonl(key);
1843 
1844 	s_i = cb->args[3];
1845 	i = 0;
1846 
1847 	/* rcu_read_lock is hold by caller */
1848 
1849 	list_for_each_entry_rcu(fa, fah, fa_list) {
1850 		if (i < s_i) {
1851 			i++;
1852 			continue;
1853 		}
1854 		BUG_ON(!fa->fa_info);
1855 
1856 		if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1857 				  cb->nlh->nlmsg_seq,
1858 				  RTM_NEWROUTE,
1859 				  tb->tb_id,
1860 				  fa->fa_type,
1861 				  fa->fa_scope,
1862 				  &xkey,
1863 				  plen,
1864 				  fa->fa_tos,
1865 				  fa->fa_info, 0) < 0) {
1866 			cb->args[3] = i;
1867 			return -1;
1868 		}
1869 		i++;
1870 	}
1871 	cb->args[3] = i;
1872 	return skb->len;
1873 }
1874 
1875 static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb,
1876 			     struct netlink_callback *cb)
1877 {
1878 	int h, s_h;
1879 	struct list_head *fa_head;
1880 	struct leaf *l = NULL;
1881 
1882 	s_h = cb->args[2];
1883 
1884 	for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
1885 		if (h < s_h)
1886 			continue;
1887 		if (h > s_h)
1888 			memset(&cb->args[3], 0,
1889 			       sizeof(cb->args) - 3*sizeof(cb->args[0]));
1890 
1891 		fa_head = get_fa_head(l, plen);
1892 
1893 		if (!fa_head)
1894 			continue;
1895 
1896 		if (list_empty(fa_head))
1897 			continue;
1898 
1899 		if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) {
1900 			cb->args[2] = h;
1901 			return -1;
1902 		}
1903 	}
1904 	cb->args[2] = h;
1905 	return skb->len;
1906 }
1907 
1908 static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb)
1909 {
1910 	int m, s_m;
1911 	struct trie *t = (struct trie *) tb->tb_data;
1912 
1913 	s_m = cb->args[1];
1914 
1915 	rcu_read_lock();
1916 	for (m = 0; m <= 32; m++) {
1917 		if (m < s_m)
1918 			continue;
1919 		if (m > s_m)
1920 			memset(&cb->args[2], 0,
1921 				sizeof(cb->args) - 2*sizeof(cb->args[0]));
1922 
1923 		if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) {
1924 			cb->args[1] = m;
1925 			goto out;
1926 		}
1927 	}
1928 	rcu_read_unlock();
1929 	cb->args[1] = m;
1930 	return skb->len;
1931 out:
1932 	rcu_read_unlock();
1933 	return -1;
1934 }
1935 
1936 /* Fix more generic FIB names for init later */
1937 
1938 #ifdef CONFIG_IP_MULTIPLE_TABLES
1939 struct fib_table * fib_hash_init(int id)
1940 #else
1941 struct fib_table * __init fib_hash_init(int id)
1942 #endif
1943 {
1944 	struct fib_table *tb;
1945 	struct trie *t;
1946 
1947 	if (fn_alias_kmem == NULL)
1948 		fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1949 						  sizeof(struct fib_alias),
1950 						  0, SLAB_HWCACHE_ALIGN,
1951 						  NULL, NULL);
1952 
1953 	tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
1954 		     GFP_KERNEL);
1955 	if (tb == NULL)
1956 		return NULL;
1957 
1958 	tb->tb_id = id;
1959 	tb->tb_lookup = fn_trie_lookup;
1960 	tb->tb_insert = fn_trie_insert;
1961 	tb->tb_delete = fn_trie_delete;
1962 	tb->tb_flush = fn_trie_flush;
1963 	tb->tb_select_default = fn_trie_select_default;
1964 	tb->tb_dump = fn_trie_dump;
1965 	memset(tb->tb_data, 0, sizeof(struct trie));
1966 
1967 	t = (struct trie *) tb->tb_data;
1968 
1969 	trie_init(t);
1970 
1971 	if (id == RT_TABLE_LOCAL)
1972 		trie_local = t;
1973 	else if (id == RT_TABLE_MAIN)
1974 		trie_main = t;
1975 
1976 	if (id == RT_TABLE_LOCAL)
1977 		printk(KERN_INFO "IPv4 FIB: Using LC-trie version %s\n", VERSION);
1978 
1979 	return tb;
1980 }
1981 
1982 #ifdef CONFIG_PROC_FS
1983 /* Depth first Trie walk iterator */
1984 struct fib_trie_iter {
1985 	struct tnode *tnode;
1986 	struct trie *trie;
1987 	unsigned index;
1988 	unsigned depth;
1989 };
1990 
1991 static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
1992 {
1993 	struct tnode *tn = iter->tnode;
1994 	unsigned cindex = iter->index;
1995 	struct tnode *p;
1996 
1997 	pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
1998 		 iter->tnode, iter->index, iter->depth);
1999 rescan:
2000 	while (cindex < (1<<tn->bits)) {
2001 		struct node *n = tnode_get_child(tn, cindex);
2002 
2003 		if (n) {
2004 			if (IS_LEAF(n)) {
2005 				iter->tnode = tn;
2006 				iter->index = cindex + 1;
2007 			} else {
2008 				/* push down one level */
2009 				iter->tnode = (struct tnode *) n;
2010 				iter->index = 0;
2011 				++iter->depth;
2012 			}
2013 			return n;
2014 		}
2015 
2016 		++cindex;
2017 	}
2018 
2019 	/* Current node exhausted, pop back up */
2020 	p = NODE_PARENT(tn);
2021 	if (p) {
2022 		cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
2023 		tn = p;
2024 		--iter->depth;
2025 		goto rescan;
2026 	}
2027 
2028 	/* got root? */
2029 	return NULL;
2030 }
2031 
2032 static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
2033 				       struct trie *t)
2034 {
2035 	struct node *n = rcu_dereference(t->trie);
2036 
2037 	if (n && IS_TNODE(n)) {
2038 		iter->tnode = (struct tnode *) n;
2039 		iter->trie = t;
2040 		iter->index = 0;
2041 		iter->depth = 1;
2042 		return n;
2043 	}
2044 	return NULL;
2045 }
2046 
2047 static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2048 {
2049 	struct node *n;
2050 	struct fib_trie_iter iter;
2051 
2052 	memset(s, 0, sizeof(*s));
2053 
2054 	rcu_read_lock();
2055 	for (n = fib_trie_get_first(&iter, t); n;
2056 	     n = fib_trie_get_next(&iter)) {
2057 		if (IS_LEAF(n)) {
2058 			s->leaves++;
2059 			s->totdepth += iter.depth;
2060 			if (iter.depth > s->maxdepth)
2061 				s->maxdepth = iter.depth;
2062 		} else {
2063 			const struct tnode *tn = (const struct tnode *) n;
2064 			int i;
2065 
2066 			s->tnodes++;
2067 			s->nodesizes[tn->bits]++;
2068 			for (i = 0; i < (1<<tn->bits); i++)
2069 				if (!tn->child[i])
2070 					s->nullpointers++;
2071 		}
2072 	}
2073 	rcu_read_unlock();
2074 }
2075 
2076 /*
2077  *	This outputs /proc/net/fib_triestats
2078  */
2079 static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2080 {
2081 	unsigned i, max, pointers, bytes, avdepth;
2082 
2083 	if (stat->leaves)
2084 		avdepth = stat->totdepth*100 / stat->leaves;
2085 	else
2086 		avdepth = 0;
2087 
2088 	seq_printf(seq, "\tAver depth:     %d.%02d\n", avdepth / 100, avdepth % 100 );
2089 	seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);
2090 
2091 	seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
2092 
2093 	bytes = sizeof(struct leaf) * stat->leaves;
2094 	seq_printf(seq, "\tInternal nodes: %d\n\t", stat->tnodes);
2095 	bytes += sizeof(struct tnode) * stat->tnodes;
2096 
2097 	max = MAX_CHILDS-1;
2098 	while (max >= 0 && stat->nodesizes[max] == 0)
2099 		max--;
2100 
2101 	pointers = 0;
2102 	for (i = 1; i <= max; i++)
2103 		if (stat->nodesizes[i] != 0) {
2104 			seq_printf(seq, "  %d: %d",  i, stat->nodesizes[i]);
2105 			pointers += (1<<i) * stat->nodesizes[i];
2106 		}
2107 	seq_putc(seq, '\n');
2108 	seq_printf(seq, "\tPointers: %d\n", pointers);
2109 
2110 	bytes += sizeof(struct node *) * pointers;
2111 	seq_printf(seq, "Null ptrs: %d\n", stat->nullpointers);
2112 	seq_printf(seq, "Total size: %d  kB\n", (bytes + 1023) / 1024);
2113 
2114 #ifdef CONFIG_IP_FIB_TRIE_STATS
2115 	seq_printf(seq, "Counters:\n---------\n");
2116 	seq_printf(seq,"gets = %d\n", t->stats.gets);
2117 	seq_printf(seq,"backtracks = %d\n", t->stats.backtrack);
2118 	seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed);
2119 	seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss);
2120 	seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit);
2121 	seq_printf(seq,"skipped node resize = %d\n", t->stats.resize_node_skipped);
2122 #ifdef CLEAR_STATS
2123 	memset(&(t->stats), 0, sizeof(t->stats));
2124 #endif
2125 #endif /*  CONFIG_IP_FIB_TRIE_STATS */
2126 }
2127 
2128 static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2129 {
2130 	struct trie_stat *stat;
2131 
2132 	stat = kmalloc(sizeof(*stat), GFP_KERNEL);
2133 	if (!stat)
2134 		return -ENOMEM;
2135 
2136 	seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n",
2137 		   sizeof(struct leaf), sizeof(struct tnode));
2138 
2139 	if (trie_local) {
2140 		seq_printf(seq, "Local:\n");
2141 		trie_collect_stats(trie_local, stat);
2142 		trie_show_stats(seq, stat);
2143 	}
2144 
2145 	if (trie_main) {
2146 		seq_printf(seq, "Main:\n");
2147 		trie_collect_stats(trie_main, stat);
2148 		trie_show_stats(seq, stat);
2149 	}
2150 	kfree(stat);
2151 
2152 	return 0;
2153 }
2154 
2155 static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2156 {
2157 	return single_open(file, fib_triestat_seq_show, NULL);
2158 }
2159 
2160 static struct file_operations fib_triestat_fops = {
2161 	.owner	= THIS_MODULE,
2162 	.open	= fib_triestat_seq_open,
2163 	.read	= seq_read,
2164 	.llseek	= seq_lseek,
2165 	.release = single_release,
2166 };
2167 
2168 static struct node *fib_trie_get_idx(struct fib_trie_iter *iter,
2169 				      loff_t pos)
2170 {
2171 	loff_t idx = 0;
2172 	struct node *n;
2173 
2174 	for (n = fib_trie_get_first(iter, trie_local);
2175 	     n; ++idx, n = fib_trie_get_next(iter)) {
2176 		if (pos == idx)
2177 			return n;
2178 	}
2179 
2180 	for (n = fib_trie_get_first(iter, trie_main);
2181 	     n; ++idx, n = fib_trie_get_next(iter)) {
2182 		if (pos == idx)
2183 			return n;
2184 	}
2185 	return NULL;
2186 }
2187 
2188 static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2189 {
2190 	rcu_read_lock();
2191 	if (*pos == 0)
2192 		return SEQ_START_TOKEN;
2193 	return fib_trie_get_idx(seq->private, *pos - 1);
2194 }
2195 
2196 static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2197 {
2198 	struct fib_trie_iter *iter = seq->private;
2199 	void *l = v;
2200 
2201 	++*pos;
2202 	if (v == SEQ_START_TOKEN)
2203 		return fib_trie_get_idx(iter, 0);
2204 
2205 	v = fib_trie_get_next(iter);
2206 	BUG_ON(v == l);
2207 	if (v)
2208 		return v;
2209 
2210 	/* continue scan in next trie */
2211 	if (iter->trie == trie_local)
2212 		return fib_trie_get_first(iter, trie_main);
2213 
2214 	return NULL;
2215 }
2216 
2217 static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2218 {
2219 	rcu_read_unlock();
2220 }
2221 
2222 static void seq_indent(struct seq_file *seq, int n)
2223 {
2224 	while (n-- > 0) seq_puts(seq, "   ");
2225 }
2226 
2227 static inline const char *rtn_scope(enum rt_scope_t s)
2228 {
2229 	static char buf[32];
2230 
2231 	switch(s) {
2232 	case RT_SCOPE_UNIVERSE: return "universe";
2233 	case RT_SCOPE_SITE:	return "site";
2234 	case RT_SCOPE_LINK:	return "link";
2235 	case RT_SCOPE_HOST:	return "host";
2236 	case RT_SCOPE_NOWHERE:	return "nowhere";
2237 	default:
2238 		snprintf(buf, sizeof(buf), "scope=%d", s);
2239 		return buf;
2240 	}
2241 }
2242 
2243 static const char *rtn_type_names[__RTN_MAX] = {
2244 	[RTN_UNSPEC] = "UNSPEC",
2245 	[RTN_UNICAST] = "UNICAST",
2246 	[RTN_LOCAL] = "LOCAL",
2247 	[RTN_BROADCAST] = "BROADCAST",
2248 	[RTN_ANYCAST] = "ANYCAST",
2249 	[RTN_MULTICAST] = "MULTICAST",
2250 	[RTN_BLACKHOLE] = "BLACKHOLE",
2251 	[RTN_UNREACHABLE] = "UNREACHABLE",
2252 	[RTN_PROHIBIT] = "PROHIBIT",
2253 	[RTN_THROW] = "THROW",
2254 	[RTN_NAT] = "NAT",
2255 	[RTN_XRESOLVE] = "XRESOLVE",
2256 };
2257 
2258 static inline const char *rtn_type(unsigned t)
2259 {
2260 	static char buf[32];
2261 
2262 	if (t < __RTN_MAX && rtn_type_names[t])
2263 		return rtn_type_names[t];
2264 	snprintf(buf, sizeof(buf), "type %d", t);
2265 	return buf;
2266 }
2267 
2268 /* Pretty print the trie */
2269 static int fib_trie_seq_show(struct seq_file *seq, void *v)
2270 {
2271 	const struct fib_trie_iter *iter = seq->private;
2272 	struct node *n = v;
2273 
2274 	if (v == SEQ_START_TOKEN)
2275 		return 0;
2276 
2277 	if (IS_TNODE(n)) {
2278 		struct tnode *tn = (struct tnode *) n;
2279 		t_key prf = ntohl(MASK_PFX(tn->key, tn->pos));
2280 
2281 		if (!NODE_PARENT(n)) {
2282 			if (iter->trie == trie_local)
2283 				seq_puts(seq, "<local>:\n");
2284 			else
2285 				seq_puts(seq, "<main>:\n");
2286 		}
2287 		seq_indent(seq, iter->depth-1);
2288 		seq_printf(seq, "  +-- %d.%d.%d.%d/%d %d %d %d\n",
2289 			   NIPQUAD(prf), tn->pos, tn->bits, tn->full_children,
2290 			   tn->empty_children);
2291 
2292 	} else {
2293 		struct leaf *l = (struct leaf *) n;
2294 		int i;
2295 		u32 val = ntohl(l->key);
2296 
2297 		seq_indent(seq, iter->depth);
2298 		seq_printf(seq, "  |-- %d.%d.%d.%d\n", NIPQUAD(val));
2299 		for (i = 32; i >= 0; i--) {
2300 			struct leaf_info *li = find_leaf_info(l, i);
2301 			if (li) {
2302 				struct fib_alias *fa;
2303 				list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2304 					seq_indent(seq, iter->depth+1);
2305 					seq_printf(seq, "  /%d %s %s", i,
2306 						   rtn_scope(fa->fa_scope),
2307 						   rtn_type(fa->fa_type));
2308 					if (fa->fa_tos)
2309 						seq_printf(seq, "tos =%d\n",
2310 							   fa->fa_tos);
2311 					seq_putc(seq, '\n');
2312 				}
2313 			}
2314 		}
2315 	}
2316 
2317 	return 0;
2318 }
2319 
2320 static struct seq_operations fib_trie_seq_ops = {
2321 	.start  = fib_trie_seq_start,
2322 	.next   = fib_trie_seq_next,
2323 	.stop   = fib_trie_seq_stop,
2324 	.show   = fib_trie_seq_show,
2325 };
2326 
2327 static int fib_trie_seq_open(struct inode *inode, struct file *file)
2328 {
2329 	struct seq_file *seq;
2330 	int rc = -ENOMEM;
2331 	struct fib_trie_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
2332 
2333 	if (!s)
2334 		goto out;
2335 
2336 	rc = seq_open(file, &fib_trie_seq_ops);
2337 	if (rc)
2338 		goto out_kfree;
2339 
2340 	seq	     = file->private_data;
2341 	seq->private = s;
2342 	memset(s, 0, sizeof(*s));
2343 out:
2344 	return rc;
2345 out_kfree:
2346 	kfree(s);
2347 	goto out;
2348 }
2349 
2350 static struct file_operations fib_trie_fops = {
2351 	.owner  = THIS_MODULE,
2352 	.open   = fib_trie_seq_open,
2353 	.read   = seq_read,
2354 	.llseek = seq_lseek,
2355 	.release = seq_release_private,
2356 };
2357 
2358 static unsigned fib_flag_trans(int type, u32 mask, const struct fib_info *fi)
2359 {
2360 	static unsigned type2flags[RTN_MAX + 1] = {
2361 		[7] = RTF_REJECT, [8] = RTF_REJECT,
2362 	};
2363 	unsigned flags = type2flags[type];
2364 
2365 	if (fi && fi->fib_nh->nh_gw)
2366 		flags |= RTF_GATEWAY;
2367 	if (mask == 0xFFFFFFFF)
2368 		flags |= RTF_HOST;
2369 	flags |= RTF_UP;
2370 	return flags;
2371 }
2372 
2373 /*
2374  *	This outputs /proc/net/route.
2375  *	The format of the file is not supposed to be changed
2376  * 	and needs to be same as fib_hash output to avoid breaking
2377  *	legacy utilities
2378  */
2379 static int fib_route_seq_show(struct seq_file *seq, void *v)
2380 {
2381 	struct leaf *l = v;
2382 	int i;
2383 	char bf[128];
2384 
2385 	if (v == SEQ_START_TOKEN) {
2386 		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2387 			   "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2388 			   "\tWindow\tIRTT");
2389 		return 0;
2390 	}
2391 
2392 	if (IS_TNODE(l))
2393 		return 0;
2394 
2395 	for (i=32; i>=0; i--) {
2396 		struct leaf_info *li = find_leaf_info(l, i);
2397 		struct fib_alias *fa;
2398 		u32 mask, prefix;
2399 
2400 		if (!li)
2401 			continue;
2402 
2403 		mask = inet_make_mask(li->plen);
2404 		prefix = htonl(l->key);
2405 
2406 		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2407 			const struct fib_info *fi = fa->fa_info;
2408 			unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
2409 
2410 			if (fa->fa_type == RTN_BROADCAST
2411 			    || fa->fa_type == RTN_MULTICAST)
2412 				continue;
2413 
2414 			if (fi)
2415 				snprintf(bf, sizeof(bf),
2416 					 "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
2417 					 fi->fib_dev ? fi->fib_dev->name : "*",
2418 					 prefix,
2419 					 fi->fib_nh->nh_gw, flags, 0, 0,
2420 					 fi->fib_priority,
2421 					 mask,
2422 					 (fi->fib_advmss ? fi->fib_advmss + 40 : 0),
2423 					 fi->fib_window,
2424 					 fi->fib_rtt >> 3);
2425 			else
2426 				snprintf(bf, sizeof(bf),
2427 					 "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
2428 					 prefix, 0, flags, 0, 0, 0,
2429 					 mask, 0, 0, 0);
2430 
2431 			seq_printf(seq, "%-127s\n", bf);
2432 		}
2433 	}
2434 
2435 	return 0;
2436 }
2437 
2438 static struct seq_operations fib_route_seq_ops = {
2439 	.start  = fib_trie_seq_start,
2440 	.next   = fib_trie_seq_next,
2441 	.stop   = fib_trie_seq_stop,
2442 	.show   = fib_route_seq_show,
2443 };
2444 
2445 static int fib_route_seq_open(struct inode *inode, struct file *file)
2446 {
2447 	struct seq_file *seq;
2448 	int rc = -ENOMEM;
2449 	struct fib_trie_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
2450 
2451 	if (!s)
2452 		goto out;
2453 
2454 	rc = seq_open(file, &fib_route_seq_ops);
2455 	if (rc)
2456 		goto out_kfree;
2457 
2458 	seq	     = file->private_data;
2459 	seq->private = s;
2460 	memset(s, 0, sizeof(*s));
2461 out:
2462 	return rc;
2463 out_kfree:
2464 	kfree(s);
2465 	goto out;
2466 }
2467 
2468 static struct file_operations fib_route_fops = {
2469 	.owner  = THIS_MODULE,
2470 	.open   = fib_route_seq_open,
2471 	.read   = seq_read,
2472 	.llseek = seq_lseek,
2473 	.release = seq_release_private,
2474 };
2475 
2476 int __init fib_proc_init(void)
2477 {
2478 	if (!proc_net_fops_create("fib_trie", S_IRUGO, &fib_trie_fops))
2479 		goto out1;
2480 
2481 	if (!proc_net_fops_create("fib_triestat", S_IRUGO, &fib_triestat_fops))
2482 		goto out2;
2483 
2484 	if (!proc_net_fops_create("route", S_IRUGO, &fib_route_fops))
2485 		goto out3;
2486 
2487 	return 0;
2488 
2489 out3:
2490 	proc_net_remove("fib_triestat");
2491 out2:
2492 	proc_net_remove("fib_trie");
2493 out1:
2494 	return -ENOMEM;
2495 }
2496 
2497 void __init fib_proc_exit(void)
2498 {
2499 	proc_net_remove("fib_trie");
2500 	proc_net_remove("fib_triestat");
2501 	proc_net_remove("route");
2502 }
2503 
2504 #endif /* CONFIG_PROC_FS */
2505