xref: /openbmc/linux/net/ipv4/fib_trie.c (revision 64c70b1c)
1 /*
2  *   This program is free software; you can redistribute it and/or
3  *   modify it under the terms of the GNU General Public License
4  *   as published by the Free Software Foundation; either version
5  *   2 of the License, or (at your option) any later version.
6  *
7  *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8  *     & Swedish University of Agricultural Sciences.
9  *
10  *   Jens Laas <jens.laas@data.slu.se> Swedish University of
11  *     Agricultural Sciences.
12  *
13  *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
14  *
15  * This work is based on the LPC-trie which is originally described in:
16  *
17  * An experimental study of compression methods for dynamic tries
18  * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19  * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
20  *
21  *
22  * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23  * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24  *
25  * Version:	$Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $
26  *
27  *
28  * Code from fib_hash has been reused which includes the following header:
29  *
30  *
31  * INET		An implementation of the TCP/IP protocol suite for the LINUX
32  *		operating system.  INET is implemented using the  BSD Socket
33  *		interface as the means of communication with the user level.
34  *
35  *		IPv4 FIB: lookup engine and maintenance routines.
36  *
37  *
38  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
39  *
40  *		This program is free software; you can redistribute it and/or
41  *		modify it under the terms of the GNU General Public License
42  *		as published by the Free Software Foundation; either version
43  *		2 of the License, or (at your option) any later version.
44  *
45  * Substantial contributions to this work come from:
46  *
47  *		David S. Miller, <davem@davemloft.net>
48  *		Stephen Hemminger <shemminger@osdl.org>
49  *		Paul E. McKenney <paulmck@us.ibm.com>
50  *		Patrick McHardy <kaber@trash.net>
51  */
52 
53 #define VERSION "0.408"
54 
55 #include <asm/uaccess.h>
56 #include <asm/system.h>
57 #include <asm/bitops.h>
58 #include <linux/types.h>
59 #include <linux/kernel.h>
60 #include <linux/mm.h>
61 #include <linux/string.h>
62 #include <linux/socket.h>
63 #include <linux/sockios.h>
64 #include <linux/errno.h>
65 #include <linux/in.h>
66 #include <linux/inet.h>
67 #include <linux/inetdevice.h>
68 #include <linux/netdevice.h>
69 #include <linux/if_arp.h>
70 #include <linux/proc_fs.h>
71 #include <linux/rcupdate.h>
72 #include <linux/skbuff.h>
73 #include <linux/netlink.h>
74 #include <linux/init.h>
75 #include <linux/list.h>
76 #include <net/ip.h>
77 #include <net/protocol.h>
78 #include <net/route.h>
79 #include <net/tcp.h>
80 #include <net/sock.h>
81 #include <net/ip_fib.h>
82 #include "fib_lookup.h"
83 
84 #undef CONFIG_IP_FIB_TRIE_STATS
85 #define MAX_STAT_DEPTH 32
86 
87 #define KEYLENGTH (8*sizeof(t_key))
88 #define MASK_PFX(k, l) (((l)==0)?0:(k >> (KEYLENGTH-l)) << (KEYLENGTH-l))
89 #define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH - bits) >> offset))
90 
91 typedef unsigned int t_key;
92 
93 #define T_TNODE 0
94 #define T_LEAF  1
95 #define NODE_TYPE_MASK	0x1UL
96 #define NODE_PARENT(node) \
97 	((struct tnode *)rcu_dereference(((node)->parent & ~NODE_TYPE_MASK)))
98 
99 #define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
100 
101 #define NODE_SET_PARENT(node, ptr)		\
102 	rcu_assign_pointer((node)->parent,	\
103 			   ((unsigned long)(ptr)) | NODE_TYPE(node))
104 
105 #define IS_TNODE(n) (!(n->parent & T_LEAF))
106 #define IS_LEAF(n) (n->parent & T_LEAF)
107 
108 struct node {
109 	t_key key;
110 	unsigned long parent;
111 };
112 
113 struct leaf {
114 	t_key key;
115 	unsigned long parent;
116 	struct hlist_head list;
117 	struct rcu_head rcu;
118 };
119 
120 struct leaf_info {
121 	struct hlist_node hlist;
122 	struct rcu_head rcu;
123 	int plen;
124 	struct list_head falh;
125 };
126 
127 struct tnode {
128 	t_key key;
129 	unsigned long parent;
130 	unsigned short pos:5;		/* log2(KEYLENGTH) bits needed */
131 	unsigned short bits:5;		/* log2(KEYLENGTH) bits needed */
132 	unsigned short full_children;	/* KEYLENGTH bits needed */
133 	unsigned short empty_children;	/* KEYLENGTH bits needed */
134 	struct rcu_head rcu;
135 	struct node *child[0];
136 };
137 
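/*
 * Illustrative sizing note (figures assume a 64-bit build and are
 * approximate): the child pointer array is allocated together with the
 * tnode itself, so a node with 'bits' index bits occupies
 *
 *	sizeof(struct tnode) + (1 << bits) * sizeof(struct node *)
 *
 * bytes. With bits = 4 the child array alone is 16 * 8 = 128 bytes;
 * tnode_new() and __tnode_free_rcu() below compute exactly this size.
 */
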
138 #ifdef CONFIG_IP_FIB_TRIE_STATS
139 struct trie_use_stats {
140 	unsigned int gets;
141 	unsigned int backtrack;
142 	unsigned int semantic_match_passed;
143 	unsigned int semantic_match_miss;
144 	unsigned int null_node_hit;
145 	unsigned int resize_node_skipped;
146 };
147 #endif
148 
149 struct trie_stat {
150 	unsigned int totdepth;
151 	unsigned int maxdepth;
152 	unsigned int tnodes;
153 	unsigned int leaves;
154 	unsigned int nullpointers;
155 	unsigned int nodesizes[MAX_STAT_DEPTH];
156 };
157 
158 struct trie {
159 	struct node *trie;
160 #ifdef CONFIG_IP_FIB_TRIE_STATS
161 	struct trie_use_stats stats;
162 #endif
163 	int size;
164 	unsigned int revision;
165 };
166 
167 static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
168 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
169 static struct node *resize(struct trie *t, struct tnode *tn);
170 static struct tnode *inflate(struct trie *t, struct tnode *tn);
171 static struct tnode *halve(struct trie *t, struct tnode *tn);
172 static void tnode_free(struct tnode *tn);
173 
174 static struct kmem_cache *fn_alias_kmem __read_mostly;
175 static struct trie *trie_local = NULL, *trie_main = NULL;
176 
177 
178 /* rcu_read_lock must be held by the caller on the read side */
179 
180 static inline struct node *tnode_get_child(struct tnode *tn, int i)
181 {
182 	BUG_ON(i >= 1 << tn->bits);
183 
184 	return rcu_dereference(tn->child[i]);
185 }
186 
187 static inline int tnode_child_length(const struct tnode *tn)
188 {
189 	return 1 << tn->bits;
190 }
191 
192 static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
193 {
194 	if (offset < KEYLENGTH)
195 		return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
196 	else
197 		return 0;
198 }
199 
200 static inline int tkey_equals(t_key a, t_key b)
201 {
202 	return a == b;
203 }
204 
205 static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
206 {
207 	if (bits == 0 || offset >= KEYLENGTH)
208 		return 1;
209 	bits = bits > KEYLENGTH ? KEYLENGTH : bits;
210 	return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
211 }
212 
213 static inline int tkey_mismatch(t_key a, int offset, t_key b)
214 {
215 	t_key diff = a ^ b;
216 	int i = offset;
217 
218 	if (!diff)
219 		return 0;
220 	while ((diff << i) >> (KEYLENGTH-1) == 0)
221 		i++;
222 	return i;
223 }
224 
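/*
 * Worked examples for the key helpers above, assuming KEYLENGTH == 32
 * and IPv4 keys in host byte order (illustrative values only):
 *
 *	key = 0xC0A80100			(192.168.1.0)
 *	tkey_extract_bits(key, 0, 8)  == 0xC0	(first octet)
 *	tkey_extract_bits(key, 8, 8)  == 0xA8	(second octet)
 *	MASK_PFX(key, 24)             == 0xC0A80100
 *	tkey_mismatch(key, 0, 0xC0A80000) == 23	(first differing bit,
 *						 counted from the MSB)
 */
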
225 /*
226   To understand this stuff, an understanding of keys and all their bits is
227   necessary. Every node in the trie has a key associated with it, but not
228   all of the bits in that key are significant.
229 
230   Consider a node 'n' and its parent 'tp'.
231 
232   If n is a leaf, every bit in its key is significant. Its presence is
233   necessitated by path compression, since during a tree traversal (when
234   searching for a leaf - unless we are doing an insertion) we will completely
235   ignore all skipped bits we encounter. Thus we need to verify, at the end of
236   a potentially successful search, that we have indeed been walking the
237   correct key path.
238 
239   Note that if the correct key is present in the tree, we can never "miss" it
240   by following the wrong path. Path compression ensures that segments of the key
241   that are the same for all keys with a given prefix are skipped, but the
242   skipped part *is* identical for each node in the subtrie below the skipped
243   bit! trie_insert() in this implementation takes care of that - note the
244   call to tkey_sub_equals() in trie_insert().
245 
246   If n is an internal node - a 'tnode' here - the various parts of its key
247   have many different meanings.
248 
249   Example:
250   _________________________________________________________________
251   | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
252   -----------------------------------------------------------------
253     0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
254 
255   _________________________________________________________________
256   | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
257   -----------------------------------------------------------------
258    16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
259 
260   tp->pos = 7
261   tp->bits = 3
262   n->pos = 15
263   n->bits = 4
264 
265   First, let's just ignore the bits that come before the parent tp, that is
266   the bits from 0 to (tp->pos-1). They are *known* but at this point we do
267   not use them for anything.
268 
269   The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
270   index into the parent's child array. That is, they will be used to find
271   'n' among tp's children.
272 
273   The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
274   for the node n.
275 
276   All the bits we have seen so far are significant to the node n. The rest
277   of the bits are really not needed or indeed known in n->key.
278 
279   The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
280   n's child array, and will of course be different for each child.
281 
282 
283   The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
284   at this point.
285 
286 */
287 
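/*
 * Tying the example above to the helpers: with tp->pos == 7 and
 * tp->bits == 3, n sits in tp's child array at index
 * tkey_extract_bits(n->key, 7, 3) - the "N" bits. With n->pos == 15 and
 * n->bits == 4, a search key selects one of n's 16 children via
 * tkey_extract_bits(key, 15, 4) - the "C" bits. The skipped "S" bits
 * (10..14) are not inspected during the descent; they are verified
 * later, e.g. by tkey_sub_equals() during insert and by the final key
 * comparison at a leaf during lookup.
 */
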
288 static inline void check_tnode(const struct tnode *tn)
289 {
290 	WARN_ON(tn && tn->pos+tn->bits > 32);
291 }
292 
293 static int halve_threshold = 25;
294 static int inflate_threshold = 50;
295 static int halve_threshold_root = 8;
296 static int inflate_threshold_root = 15;
297 
298 
299 static void __alias_free_mem(struct rcu_head *head)
300 {
301 	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
302 	kmem_cache_free(fn_alias_kmem, fa);
303 }
304 
305 static inline void alias_free_mem_rcu(struct fib_alias *fa)
306 {
307 	call_rcu(&fa->rcu, __alias_free_mem);
308 }
309 
310 static void __leaf_free_rcu(struct rcu_head *head)
311 {
312 	kfree(container_of(head, struct leaf, rcu));
313 }
314 
315 static void __leaf_info_free_rcu(struct rcu_head *head)
316 {
317 	kfree(container_of(head, struct leaf_info, rcu));
318 }
319 
320 static inline void free_leaf_info(struct leaf_info *leaf)
321 {
322 	call_rcu(&leaf->rcu, __leaf_info_free_rcu);
323 }
324 
325 static struct tnode *tnode_alloc(unsigned int size)
326 {
327 	struct page *pages;
328 
329 	if (size <= PAGE_SIZE)
330 		return kcalloc(size, 1, GFP_KERNEL);
331 
332 	pages = alloc_pages(GFP_KERNEL|__GFP_ZERO, get_order(size));
333 	if (!pages)
334 		return NULL;
335 
336 	return page_address(pages);
337 }
338 
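/*
 * Rough sizing example for tnode_alloc() (assuming 4 KiB pages and
 * 8-byte pointers): the allocation fits in a single page up to roughly
 * 500 child slots, i.e. up to about bits == 8; from bits == 9
 * (512 children) onward tnode_alloc() falls back to whole pages, and
 * __tnode_free_rcu() below frees the memory the same way.
 */
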
339 static void __tnode_free_rcu(struct rcu_head *head)
340 {
341 	struct tnode *tn = container_of(head, struct tnode, rcu);
342 	unsigned int size = sizeof(struct tnode) +
343 		(1 << tn->bits) * sizeof(struct node *);
344 
345 	if (size <= PAGE_SIZE)
346 		kfree(tn);
347 	else
348 		free_pages((unsigned long)tn, get_order(size));
349 }
350 
351 static inline void tnode_free(struct tnode *tn)
352 {
353 	if (IS_LEAF(tn)) {
354 		struct leaf *l = (struct leaf *) tn;
355 		call_rcu_bh(&l->rcu, __leaf_free_rcu);
356 	} else
357 		call_rcu(&tn->rcu, __tnode_free_rcu);
358 }
359 
360 static struct leaf *leaf_new(void)
361 {
362 	struct leaf *l = kmalloc(sizeof(struct leaf),  GFP_KERNEL);
363 	if (l) {
364 		l->parent = T_LEAF;
365 		INIT_HLIST_HEAD(&l->list);
366 	}
367 	return l;
368 }
369 
370 static struct leaf_info *leaf_info_new(int plen)
371 {
372 	struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
373 	if (li) {
374 		li->plen = plen;
375 		INIT_LIST_HEAD(&li->falh);
376 	}
377 	return li;
378 }
379 
380 static struct tnode* tnode_new(t_key key, int pos, int bits)
381 {
382 	int nchildren = 1<<bits;
383 	int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
384 	struct tnode *tn = tnode_alloc(sz);
385 
386 	if (tn) {
387 		memset(tn, 0, sz);
388 		tn->parent = T_TNODE;
389 		tn->pos = pos;
390 		tn->bits = bits;
391 		tn->key = key;
392 		tn->full_children = 0;
393 		tn->empty_children = 1<<bits;
394 	}
395 
396 	pr_debug("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
397 		 (unsigned int) (sizeof(struct node) * 1<<bits));
398 	return tn;
399 }
400 
401 /*
402  * Check whether the child 'n' of tnode 'tn' is "full", i.e. it is an
403  * internal node and no bits are skipped. See discussion in dyntree paper p. 6
404  */
405 
406 static inline int tnode_full(const struct tnode *tn, const struct node *n)
407 {
408 	if (n == NULL || IS_LEAF(n))
409 		return 0;
410 
411 	return ((struct tnode *) n)->pos == tn->pos + tn->bits;
412 }
413 
414 static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n)
415 {
416 	tnode_put_child_reorg(tn, i, n, -1);
417 }
418 
419  /*
420   * Add a child at position i overwriting the old value.
421   * Update the value of full_children and empty_children.
422   */
423 
424 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull)
425 {
426 	struct node *chi = tn->child[i];
427 	int isfull;
428 
429 	BUG_ON(i >= 1<<tn->bits);
430 
431 
432 	/* update emptyChildren */
433 	if (n == NULL && chi != NULL)
434 		tn->empty_children++;
435 	else if (n != NULL && chi == NULL)
436 		tn->empty_children--;
437 
438 	/* update fullChildren */
439 	if (wasfull == -1)
440 		wasfull = tnode_full(tn, chi);
441 
442 	isfull = tnode_full(tn, n);
443 	if (wasfull && !isfull)
444 		tn->full_children--;
445 	else if (!wasfull && isfull)
446 		tn->full_children++;
447 
448 	if (n)
449 		NODE_SET_PARENT(n, tn);
450 
451 	rcu_assign_pointer(tn->child[i], n);
452 }
453 
454 static struct node *resize(struct trie *t, struct tnode *tn)
455 {
456 	int i;
457 	int err = 0;
458 	struct tnode *old_tn;
459 	int inflate_threshold_use;
460 	int halve_threshold_use;
461 	int max_resize;
462 
463 	if (!tn)
464 		return NULL;
465 
466 	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
467 		 tn, inflate_threshold, halve_threshold);
468 
469 	/* No children */
470 	if (tn->empty_children == tnode_child_length(tn)) {
471 		tnode_free(tn);
472 		return NULL;
473 	}
474 	/* One child */
475 	if (tn->empty_children == tnode_child_length(tn) - 1)
476 		for (i = 0; i < tnode_child_length(tn); i++) {
477 			struct node *n;
478 
479 			n = tn->child[i];
480 			if (!n)
481 				continue;
482 
483 			/* compress one level */
484 			NODE_SET_PARENT(n, NULL);
485 			tnode_free(tn);
486 			return n;
487 		}
488 	/*
489 	 * Double as long as the resulting node's ratio of non-empty
490 	 * children stays above the threshold.
491 	 */
492 
493 	/*
494 	 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
495 	 * the Helsinki University of Technology and Matti Tikkanen of Nokia
496 	 * Telecommunications, page 6:
497 	 * "A node is doubled if the ratio of non-empty children to all
498 	 * children in the *doubled* node is at least 'high'."
499 	 *
500 	 * 'high' in this instance is the variable 'inflate_threshold'. It
501 	 * is expressed as a percentage, so we multiply it with
502 	 * tnode_child_length() and instead of multiplying by 2 (since the
503 	 * child array will be doubled by inflate()) and multiplying
504 	 * the left-hand side by 100 (to handle the percentage thing) we
505 	 * multiply the left-hand side by 50.
506 	 *
507 	 * The left-hand side may look a bit weird: tnode_child_length(tn)
508 	 * - tn->empty_children is of course the number of non-null children
509 	 * in the current node. tn->full_children is the number of "full"
510 	 * children, that is non-null tnodes with a skip value of 0.
511 	 * All of those will be doubled in the resulting inflated tnode, so
512 	 * we just count them one extra time here.
513 	 *
514 	 * A clearer way to write this would be:
515 	 *
516 	 * to_be_doubled = tn->full_children;
517 	 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
518 	 *     tn->full_children;
519 	 *
520 	 * new_child_length = tnode_child_length(tn) * 2;
521 	 *
522 	 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
523 	 *      new_child_length;
524 	 * if (new_fill_factor >= inflate_threshold)
525 	 *
526 	 * ...and so on, though it would mess up the while () loop.
527 	 *
528 	 * anyway,
529 	 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
530 	 *      inflate_threshold
531 	 *
532 	 * avoid a division:
533 	 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
534 	 *      inflate_threshold * new_child_length
535 	 *
536 	 * expand not_to_be_doubled and to_be_doubled, and shorten:
537 	 * 100 * (tnode_child_length(tn) - tn->empty_children +
538 	 *    tn->full_children) >= inflate_threshold * new_child_length
539 	 *
540 	 * expand new_child_length:
541 	 * 100 * (tnode_child_length(tn) - tn->empty_children +
542 	 *    tn->full_children) >=
543 	 *      inflate_threshold * tnode_child_length(tn) * 2
544 	 *
545 	 * shorten again:
546 	 * 50 * (tn->full_children + tnode_child_length(tn) -
547 	 *    tn->empty_children) >= inflate_threshold *
548 	 *    tnode_child_length(tn)
549 	 *
550 	 */
551 
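	/*
	 * A concrete instance of the final inequality: take a tnode with
	 * bits = 4 (16 slots), empty_children = 4, full_children = 6 and
	 * inflate_threshold = 50. The left-hand side is
	 * 50 * (6 + 16 - 4) = 900, the right-hand side is 50 * 16 = 800,
	 * so the node is inflated. Equivalently, the doubled node would
	 * have (16 - 4 - 6) + 2 * 6 = 18 of its 32 slots filled, a fill
	 * factor of about 56%, which is >= 50%.
	 */
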
552 	check_tnode(tn);
553 
554 	/* Keep root node larger  */
555 
556 	if (!tn->parent)
557 		inflate_threshold_use = inflate_threshold_root;
558 	else
559 		inflate_threshold_use = inflate_threshold;
560 
561 	err = 0;
562 	max_resize = 10;
563 	while ((tn->full_children > 0 &&  max_resize-- &&
564 	       50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
565 				inflate_threshold_use * tnode_child_length(tn))) {
566 
567 		old_tn = tn;
568 		tn = inflate(t, tn);
569 		if (IS_ERR(tn)) {
570 			tn = old_tn;
571 #ifdef CONFIG_IP_FIB_TRIE_STATS
572 			t->stats.resize_node_skipped++;
573 #endif
574 			break;
575 		}
576 	}
577 
578 	if (max_resize < 0) {
579 		if (!tn->parent)
580 			printk(KERN_WARNING "Fix inflate_threshold_root. Now=%d size=%d bits\n",
581 			       inflate_threshold_root, tn->bits);
582 		else
583 			printk(KERN_WARNING "Fix inflate_threshold. Now=%d size=%d bits\n",
584 			       inflate_threshold, tn->bits);
585 	}
586 
587 	check_tnode(tn);
588 
589 	/*
590 	 * Halve as long as the number of empty children in this
591 	 * node is above threshold.
592 	 */
593 
594 
595 	/* Keep root node larger  */
596 
597 	if (!tn->parent)
598 		halve_threshold_use = halve_threshold_root;
599 	else
600 		halve_threshold_use = halve_threshold;
601 
602 	err = 0;
603 	max_resize = 10;
604 	while (tn->bits > 1 &&  max_resize-- &&
605 	       100 * (tnode_child_length(tn) - tn->empty_children) <
606 	       halve_threshold_use * tnode_child_length(tn)) {
607 
608 		old_tn = tn;
609 		tn = halve(t, tn);
610 		if (IS_ERR(tn)) {
611 			tn = old_tn;
612 #ifdef CONFIG_IP_FIB_TRIE_STATS
613 			t->stats.resize_node_skipped++;
614 #endif
615 			break;
616 		}
617 	}
618 
619 	if (max_resize < 0) {
620 		if (!tn->parent)
621 			printk(KERN_WARNING "Fix halve_threshold_root. Now=%d size=%d bits\n",
622 			       halve_threshold_root, tn->bits);
623 		else
624 			printk(KERN_WARNING "Fix halve_threshold. Now=%d size=%d bits\n",
625 			       halve_threshold, tn->bits);
626 	}
627 
628 	/* Only one child remains */
629 	if (tn->empty_children == tnode_child_length(tn) - 1)
630 		for (i = 0; i < tnode_child_length(tn); i++) {
631 			struct node *n;
632 
633 			n = tn->child[i];
634 			if (!n)
635 				continue;
636 
637 			/* compress one level */
638 
639 			NODE_SET_PARENT(n, NULL);
640 			tnode_free(tn);
641 			return n;
642 		}
643 
644 	return (struct node *) tn;
645 }
646 
647 static struct tnode *inflate(struct trie *t, struct tnode *tn)
648 {
649 	struct tnode *inode;
650 	struct tnode *oldtnode = tn;
651 	int olen = tnode_child_length(tn);
652 	int i;
653 
654 	pr_debug("In inflate\n");
655 
656 	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
657 
658 	if (!tn)
659 		return ERR_PTR(-ENOMEM);
660 
661 	/*
662 	 * Preallocate and store tnodes before the actual work so we
663 	 * don't get into an inconsistent state if memory allocation
664 	 * fails. In case of failure we return the old node and the
665 	 * inflate of the tnode is abandoned.
666 	 */
667 
668 	for (i = 0; i < olen; i++) {
669 		struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i);
670 
671 		if (inode &&
672 		    IS_TNODE(inode) &&
673 		    inode->pos == oldtnode->pos + oldtnode->bits &&
674 		    inode->bits > 1) {
675 			struct tnode *left, *right;
676 			t_key m = TKEY_GET_MASK(inode->pos, 1);
677 
678 			left = tnode_new(inode->key&(~m), inode->pos + 1,
679 					 inode->bits - 1);
680 			if (!left)
681 				goto nomem;
682 
683 			right = tnode_new(inode->key|m, inode->pos + 1,
684 					  inode->bits - 1);
685 
686 			if (!right) {
687 				tnode_free(left);
688 				goto nomem;
689 			}
690 
691 			put_child(t, tn, 2*i, (struct node *) left);
692 			put_child(t, tn, 2*i+1, (struct node *) right);
693 		}
694 	}
695 
696 	for (i = 0; i < olen; i++) {
697 		struct node *node = tnode_get_child(oldtnode, i);
698 		struct tnode *left, *right;
699 		int size, j;
700 
701 		/* An empty child */
702 		if (node == NULL)
703 			continue;
704 
705 		/* A leaf or an internal node with skipped bits */
706 
707 		if (IS_LEAF(node) || ((struct tnode *) node)->pos >
708 		   tn->pos + tn->bits - 1) {
709 			if (tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits,
710 					     1) == 0)
711 				put_child(t, tn, 2*i, node);
712 			else
713 				put_child(t, tn, 2*i+1, node);
714 			continue;
715 		}
716 
717 		/* An internal node with two children */
718 		inode = (struct tnode *) node;
719 
720 		if (inode->bits == 1) {
721 			put_child(t, tn, 2*i, inode->child[0]);
722 			put_child(t, tn, 2*i+1, inode->child[1]);
723 
724 			tnode_free(inode);
725 			continue;
726 		}
727 
728 		/* An internal node with more than two children */
729 
730 		/* We will replace this node 'inode' with two new
731 		 * ones, 'left' and 'right', each with half of the
732 		 * original children. The two new nodes will have
733 		 * a position one bit further down the key and this
734 		 * means that the "significant" part of their keys
735 		 * (see the discussion near the top of this file)
736 		 * will differ by one bit, which will be "0" in
737 		 * left's key and "1" in right's key. Since we are
738 		 * moving the key position by one step, the bit that
739 		 * we are moving away from - the bit at position
740 		 * (inode->pos) - is the one that will differ between
741 		 * left and right. So... we synthesize that bit in the
742 		 * two  new keys.
743 		 * The mask 'm' below will be a single "one" bit at
744 		 * the position (inode->pos)
745 		 */
746 
747 		/* Use the old key, but set the new significant
748 		 *   bit to zero.
749 		 */
750 
751 		left = (struct tnode *) tnode_get_child(tn, 2*i);
752 		put_child(t, tn, 2*i, NULL);
753 
754 		BUG_ON(!left);
755 
756 		right = (struct tnode *) tnode_get_child(tn, 2*i+1);
757 		put_child(t, tn, 2*i+1, NULL);
758 
759 		BUG_ON(!right);
760 
761 		size = tnode_child_length(left);
762 		for (j = 0; j < size; j++) {
763 			put_child(t, left, j, inode->child[j]);
764 			put_child(t, right, j, inode->child[j + size]);
765 		}
766 		put_child(t, tn, 2*i, resize(t, left));
767 		put_child(t, tn, 2*i+1, resize(t, right));
768 
769 		tnode_free(inode);
770 	}
771 	tnode_free(oldtnode);
772 	return tn;
773 nomem:
774 	{
775 		int size = tnode_child_length(tn);
776 		int j;
777 
778 		for (j = 0; j < size; j++)
779 			if (tn->child[j])
780 				tnode_free((struct tnode *)tn->child[j]);
781 
782 		tnode_free(tn);
783 
784 		return ERR_PTR(-ENOMEM);
785 	}
786 }
787 
788 static struct tnode *halve(struct trie *t, struct tnode *tn)
789 {
790 	struct tnode *oldtnode = tn;
791 	struct node *left, *right;
792 	int i;
793 	int olen = tnode_child_length(tn);
794 
795 	pr_debug("In halve\n");
796 
797 	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
798 
799 	if (!tn)
800 		return ERR_PTR(-ENOMEM);
801 
802 	/*
803 	 * Preallocate and store tnodes before the actual work so we
804 	 * don't get into an inconsistent state if memory allocation
805 	 * fails. In case of failure we return the old node and the
806 	 * halve of the tnode is abandoned.
807 	 */
808 
809 	for (i = 0; i < olen; i += 2) {
810 		left = tnode_get_child(oldtnode, i);
811 		right = tnode_get_child(oldtnode, i+1);
812 
813 		/* Two nonempty children */
814 		if (left && right) {
815 			struct tnode *newn;
816 
817 			newn = tnode_new(left->key, tn->pos + tn->bits, 1);
818 
819 			if (!newn)
820 				goto nomem;
821 
822 			put_child(t, tn, i/2, (struct node *)newn);
823 		}
824 
825 	}
826 
827 	for (i = 0; i < olen; i += 2) {
828 		struct tnode *newBinNode;
829 
830 		left = tnode_get_child(oldtnode, i);
831 		right = tnode_get_child(oldtnode, i+1);
832 
833 		/* At least one of the children is empty */
834 		if (left == NULL) {
835 			if (right == NULL)    /* Both are empty */
836 				continue;
837 			put_child(t, tn, i/2, right);
838 			continue;
839 		}
840 
841 		if (right == NULL) {
842 			put_child(t, tn, i/2, left);
843 			continue;
844 		}
845 
846 		/* Two nonempty children */
847 		newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
848 		put_child(t, tn, i/2, NULL);
849 		put_child(t, newBinNode, 0, left);
850 		put_child(t, newBinNode, 1, right);
851 		put_child(t, tn, i/2, resize(t, newBinNode));
852 	}
853 	tnode_free(oldtnode);
854 	return tn;
855 nomem:
856 	{
857 		int size = tnode_child_length(tn);
858 		int j;
859 
860 		for (j = 0; j < size; j++)
861 			if (tn->child[j])
862 				tnode_free((struct tnode *)tn->child[j]);
863 
864 		tnode_free(tn);
865 
866 		return ERR_PTR(-ENOMEM);
867 	}
868 }
869 
870 static void trie_init(struct trie *t)
871 {
872 	if (!t)
873 		return;
874 
875 	t->size = 0;
876 	rcu_assign_pointer(t->trie, NULL);
877 	t->revision = 0;
878 #ifdef CONFIG_IP_FIB_TRIE_STATS
879 	memset(&t->stats, 0, sizeof(struct trie_use_stats));
880 #endif
881 }
882 
883 /* The read side must hold rcu_read_lock; this currently applies to the
884  * dump routines via get_fa_head and dump */
885 
886 static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
887 {
888 	struct hlist_head *head = &l->list;
889 	struct hlist_node *node;
890 	struct leaf_info *li;
891 
892 	hlist_for_each_entry_rcu(li, node, head, hlist)
893 		if (li->plen == plen)
894 			return li;
895 
896 	return NULL;
897 }
898 
899 static inline struct list_head * get_fa_head(struct leaf *l, int plen)
900 {
901 	struct leaf_info *li = find_leaf_info(l, plen);
902 
903 	if (!li)
904 		return NULL;
905 
906 	return &li->falh;
907 }
908 
909 static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
910 {
911 	struct leaf_info *li = NULL, *last = NULL;
912 	struct hlist_node *node;
913 
914 	if (hlist_empty(head)) {
915 		hlist_add_head_rcu(&new->hlist, head);
916 	} else {
917 		hlist_for_each_entry(li, node, head, hlist) {
918 			if (new->plen > li->plen)
919 				break;
920 
921 			last = li;
922 		}
923 		if (last)
924 			hlist_add_after_rcu(&last->hlist, &new->hlist);
925 		else
926 			hlist_add_before_rcu(&new->hlist, &li->hlist);
927 	}
928 }
929 
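/*
 * Note: insert_leaf_info() above keeps the per-leaf list ordered by
 * decreasing prefix length, so readers that scan it (e.g. check_leaf()
 * during lookup) encounter the longest matching prefix first.
 */
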
930 /* rcu_read_lock must be held by the caller on the read side */
931 
932 static struct leaf *
933 fib_find_node(struct trie *t, u32 key)
934 {
935 	int pos;
936 	struct tnode *tn;
937 	struct node *n;
938 
939 	pos = 0;
940 	n = rcu_dereference(t->trie);
941 
942 	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
943 		tn = (struct tnode *) n;
944 
945 		check_tnode(tn);
946 
947 		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
948 			pos = tn->pos + tn->bits;
949 			n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
950 		} else
951 			break;
952 	}
953 	/* Case we have found a leaf. Compare prefixes */
954 
955 	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
956 		return (struct leaf *)n;
957 
958 	return NULL;
959 }
960 
961 static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
962 {
963 	int wasfull;
964 	t_key cindex, key;
965 	struct tnode *tp = NULL;
966 
967 	key = tn->key;
968 
969 	while (tn != NULL && NODE_PARENT(tn) != NULL) {
970 
971 		tp = NODE_PARENT(tn);
972 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
973 		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
974 		tn = (struct tnode *) resize (t, (struct tnode *)tn);
975 		tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull);
976 
977 		if (!NODE_PARENT(tn))
978 			break;
979 
980 		tn = NODE_PARENT(tn);
981 	}
982 	/* Handle last (top) tnode */
983 	if (IS_TNODE(tn))
984 		tn = (struct tnode*) resize(t, (struct tnode *)tn);
985 
986 	return (struct node*) tn;
987 }
988 
989 /* only used from updater-side */
990 
991 static  struct list_head *
992 fib_insert_node(struct trie *t, int *err, u32 key, int plen)
993 {
994 	int pos, newpos;
995 	struct tnode *tp = NULL, *tn = NULL;
996 	struct node *n;
997 	struct leaf *l;
998 	int missbit;
999 	struct list_head *fa_head = NULL;
1000 	struct leaf_info *li;
1001 	t_key cindex;
1002 
1003 	pos = 0;
1004 	n = t->trie;
1005 
1006 	/* If we point to NULL, stop. Either the tree is empty and we should
1007 	 * just put a new leaf in it, or we have reached an empty child slot,
1008 	 * and we should just put our new leaf in that.
1009 	 * If we point to a T_TNODE, check if it matches our key. Note that
1010 	 * a T_TNODE might be skipping any number of bits - its 'pos' need
1011 	 * not be the parent's 'pos'+'bits'!
1012 	 *
1013 	 * If it does match the current key, get pos/bits from it, extract
1014 	 * the index from our key, push the T_TNODE and walk the tree.
1015 	 *
1016 	 * If it doesn't, we have to replace it with a new T_TNODE.
1017 	 *
1018 	 * If we point to a T_LEAF, it might or might not have the same key
1019 	 * as we do. If it does, just change the value, update the T_LEAF's
1020 	 * value, and return it.
1021 	 * If it doesn't, we need to replace it with a T_TNODE.
1022 	 */
1023 
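	/*
	 * Example of the cases handled below: if the trie already holds a
	 * leaf for 192.168.0.0 and we insert 192.168.1.0, the walk stops
	 * where the two keys first differ (bit 23, as found by
	 * tkey_mismatch()); a new 1-bit tnode is created at that position
	 * (Case 3) and both leaves become its children. Inserting the
	 * exact same key hits Case 1, and landing on an empty child slot
	 * hits Case 2.
	 */
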
1024 	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
1025 		tn = (struct tnode *) n;
1026 
1027 		check_tnode(tn);
1028 
1029 		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
1030 			tp = tn;
1031 			pos = tn->pos + tn->bits;
1032 			n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
1033 
1034 			BUG_ON(n && NODE_PARENT(n) != tn);
1035 		} else
1036 			break;
1037 	}
1038 
1039 	/*
1040 	 * n  ----> NULL, LEAF or TNODE
1041 	 *
1042 	 * tp is n's parent ----> NULL or TNODE
1043 	 */
1044 
1045 	BUG_ON(tp && IS_LEAF(tp));
1046 
1047 	/* Case 1: n is a leaf. Compare prefixes */
1048 
1049 	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1050 		struct leaf *l = (struct leaf *) n;
1051 
1052 		li = leaf_info_new(plen);
1053 
1054 		if (!li) {
1055 			*err = -ENOMEM;
1056 			goto err;
1057 		}
1058 
1059 		fa_head = &li->falh;
1060 		insert_leaf_info(&l->list, li);
1061 		goto done;
1062 	}
1063 	t->size++;
1064 	l = leaf_new();
1065 
1066 	if (!l) {
1067 		*err = -ENOMEM;
1068 		goto err;
1069 	}
1070 
1071 	l->key = key;
1072 	li = leaf_info_new(plen);
1073 
1074 	if (!li) {
1075 		tnode_free((struct tnode *) l);
1076 		*err = -ENOMEM;
1077 		goto err;
1078 	}
1079 
1080 	fa_head = &li->falh;
1081 	insert_leaf_info(&l->list, li);
1082 
1083 	if (t->trie && n == NULL) {
1084 		/* Case 2: n is NULL, and we will just insert a new leaf */
1085 
1086 		NODE_SET_PARENT(l, tp);
1087 
1088 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1089 		put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
1090 	} else {
1091 		/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1092 		/*
1093 		 *  Add a new tnode here
1094 		 *  the first tnode needs some special handling
1095 		 */
1096 
1097 		if (tp)
1098 			pos = tp->pos+tp->bits;
1099 		else
1100 			pos = 0;
1101 
1102 		if (n) {
1103 			newpos = tkey_mismatch(key, pos, n->key);
1104 			tn = tnode_new(n->key, newpos, 1);
1105 		} else {
1106 			newpos = 0;
1107 			tn = tnode_new(key, newpos, 1); /* First tnode */
1108 		}
1109 
1110 		if (!tn) {
1111 			free_leaf_info(li);
1112 			tnode_free((struct tnode *) l);
1113 			*err = -ENOMEM;
1114 			goto err;
1115 		}
1116 
1117 		NODE_SET_PARENT(tn, tp);
1118 
1119 		missbit = tkey_extract_bits(key, newpos, 1);
1120 		put_child(t, tn, missbit, (struct node *)l);
1121 		put_child(t, tn, 1-missbit, n);
1122 
1123 		if (tp) {
1124 			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1125 			put_child(t, (struct tnode *)tp, cindex, (struct node *)tn);
1126 		} else {
1127 			rcu_assign_pointer(t->trie, (struct node *)tn); /* First tnode */
1128 			tp = tn;
1129 		}
1130 	}
1131 
1132 	if (tp && tp->pos + tp->bits > 32)
1133 		printk(KERN_WARNING "fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1134 		       tp, tp->pos, tp->bits, key, plen);
1135 
1136 	/* Rebalance the trie */
1137 
1138 	rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
1139 done:
1140 	t->revision++;
1141 err:
1142 	return fa_head;
1143 }
1144 
1145 /*
1146  * Caller must hold RTNL.
1147  */
1148 static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg)
1149 {
1150 	struct trie *t = (struct trie *) tb->tb_data;
1151 	struct fib_alias *fa, *new_fa;
1152 	struct list_head *fa_head = NULL;
1153 	struct fib_info *fi;
1154 	int plen = cfg->fc_dst_len;
1155 	u8 tos = cfg->fc_tos;
1156 	u32 key, mask;
1157 	int err;
1158 	struct leaf *l;
1159 
1160 	if (plen > 32)
1161 		return -EINVAL;
1162 
1163 	key = ntohl(cfg->fc_dst);
1164 
1165 	pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
1166 
1167 	mask = ntohl(inet_make_mask(plen));
1168 
1169 	if (key & ~mask)
1170 		return -EINVAL;
1171 
1172 	key = key & mask;
1173 
1174 	fi = fib_create_info(cfg);
1175 	if (IS_ERR(fi)) {
1176 		err = PTR_ERR(fi);
1177 		goto err;
1178 	}
1179 
1180 	l = fib_find_node(t, key);
1181 	fa = NULL;
1182 
1183 	if (l) {
1184 		fa_head = get_fa_head(l, plen);
1185 		fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1186 	}
1187 
1188 	/* Now fa, if non-NULL, points to the first fib alias
1189 	 * with the same keys [prefix,tos,priority], if such a key already
1190 	 * exists, or to the node before which we will insert the new one.
1191 	 *
1192 	 * If fa is NULL, we will need to allocate a new one and
1193 	 * insert to the head of f.
1194 	 *
1195 	 * If f is NULL, no fib node matched the destination key
1196 	 * and we need to allocate a new one of those as well.
1197 	 */
1198 
1199 	if (fa && fa->fa_info->fib_priority == fi->fib_priority) {
1200 		struct fib_alias *fa_orig;
1201 
1202 		err = -EEXIST;
1203 		if (cfg->fc_nlflags & NLM_F_EXCL)
1204 			goto out;
1205 
1206 		if (cfg->fc_nlflags & NLM_F_REPLACE) {
1207 			struct fib_info *fi_drop;
1208 			u8 state;
1209 
1210 			err = -ENOBUFS;
1211 			new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1212 			if (new_fa == NULL)
1213 				goto out;
1214 
1215 			fi_drop = fa->fa_info;
1216 			new_fa->fa_tos = fa->fa_tos;
1217 			new_fa->fa_info = fi;
1218 			new_fa->fa_type = cfg->fc_type;
1219 			new_fa->fa_scope = cfg->fc_scope;
1220 			state = fa->fa_state;
1221 			new_fa->fa_state &= ~FA_S_ACCESSED;
1222 
1223 			list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1224 			alias_free_mem_rcu(fa);
1225 
1226 			fib_release_info(fi_drop);
1227 			if (state & FA_S_ACCESSED)
1228 				rt_cache_flush(-1);
1229 			rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1230 				tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
1231 
1232 			goto succeeded;
1233 		}
1234 		/* Error if we find a perfect match which
1235 		 * uses the same scope, type, and nexthop
1236 		 * information.
1237 		 */
1238 		fa_orig = fa;
1239 		list_for_each_entry(fa, fa_orig->fa_list.prev, fa_list) {
1240 			if (fa->fa_tos != tos)
1241 				break;
1242 			if (fa->fa_info->fib_priority != fi->fib_priority)
1243 				break;
1244 			if (fa->fa_type == cfg->fc_type &&
1245 			    fa->fa_scope == cfg->fc_scope &&
1246 			    fa->fa_info == fi) {
1247 				goto out;
1248 			}
1249 		}
1250 		if (!(cfg->fc_nlflags & NLM_F_APPEND))
1251 			fa = fa_orig;
1252 	}
1253 	err = -ENOENT;
1254 	if (!(cfg->fc_nlflags & NLM_F_CREATE))
1255 		goto out;
1256 
1257 	err = -ENOBUFS;
1258 	new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1259 	if (new_fa == NULL)
1260 		goto out;
1261 
1262 	new_fa->fa_info = fi;
1263 	new_fa->fa_tos = tos;
1264 	new_fa->fa_type = cfg->fc_type;
1265 	new_fa->fa_scope = cfg->fc_scope;
1266 	new_fa->fa_state = 0;
1267 	/*
1268 	 * Insert new entry to the list.
1269 	 */
1270 
1271 	if (!fa_head) {
1272 		err = 0;
1273 		fa_head = fib_insert_node(t, &err, key, plen);
1274 		if (err)
1275 			goto out_free_new_fa;
1276 	}
1277 
1278 	list_add_tail_rcu(&new_fa->fa_list,
1279 			  (fa ? &fa->fa_list : fa_head));
1280 
1281 	rt_cache_flush(-1);
1282 	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
1283 		  &cfg->fc_nlinfo, 0);
1284 succeeded:
1285 	return 0;
1286 
1287 out_free_new_fa:
1288 	kmem_cache_free(fn_alias_kmem, new_fa);
1289 out:
1290 	fib_release_info(fi);
1291 err:
1292 	return err;
1293 }
1294 
1295 
1296 /* should be called with rcu_read_lock */
1297 static inline int check_leaf(struct trie *t, struct leaf *l,
1298 			     t_key key, int *plen, const struct flowi *flp,
1299 			     struct fib_result *res)
1300 {
1301 	int err, i;
1302 	__be32 mask;
1303 	struct leaf_info *li;
1304 	struct hlist_head *hhead = &l->list;
1305 	struct hlist_node *node;
1306 
1307 	hlist_for_each_entry_rcu(li, node, hhead, hlist) {
1308 		i = li->plen;
1309 		mask = inet_make_mask(i);
1310 		if (l->key != (key & ntohl(mask)))
1311 			continue;
1312 
1313 		if ((err = fib_semantic_match(&li->falh, flp, res, htonl(l->key), mask, i)) <= 0) {
1314 			*plen = i;
1315 #ifdef CONFIG_IP_FIB_TRIE_STATS
1316 			t->stats.semantic_match_passed++;
1317 #endif
1318 			return err;
1319 		}
1320 #ifdef CONFIG_IP_FIB_TRIE_STATS
1321 		t->stats.semantic_match_miss++;
1322 #endif
1323 	}
1324 	return 1;
1325 }
1326 
1327 static int
1328 fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1329 {
1330 	struct trie *t = (struct trie *) tb->tb_data;
1331 	int plen, ret = 0;
1332 	struct node *n;
1333 	struct tnode *pn;
1334 	int pos, bits;
1335 	t_key key = ntohl(flp->fl4_dst);
1336 	int chopped_off;
1337 	t_key cindex = 0;
1338 	int current_prefix_length = KEYLENGTH;
1339 	struct tnode *cn;
1340 	t_key node_prefix, key_prefix, pref_mismatch;
1341 	int mp;
1342 
1343 	rcu_read_lock();
1344 
1345 	n = rcu_dereference(t->trie);
1346 	if (!n)
1347 		goto failed;
1348 
1349 #ifdef CONFIG_IP_FIB_TRIE_STATS
1350 	t->stats.gets++;
1351 #endif
1352 
1353 	/* Just a leaf? */
1354 	if (IS_LEAF(n)) {
1355 		if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
1356 			goto found;
1357 		goto failed;
1358 	}
1359 	pn = (struct tnode *) n;
1360 	chopped_off = 0;
1361 
1362 	while (pn) {
1363 		pos = pn->pos;
1364 		bits = pn->bits;
1365 
1366 		if (!chopped_off)
1367 			cindex = tkey_extract_bits(MASK_PFX(key, current_prefix_length), pos, bits);
1368 
1369 		n = tnode_get_child(pn, cindex);
1370 
1371 		if (n == NULL) {
1372 #ifdef CONFIG_IP_FIB_TRIE_STATS
1373 			t->stats.null_node_hit++;
1374 #endif
1375 			goto backtrace;
1376 		}
1377 
1378 		if (IS_LEAF(n)) {
1379 			if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
1380 				goto found;
1381 			else
1382 				goto backtrace;
1383 		}
1384 
1385 #define HL_OPTIMIZE
1386 #ifdef HL_OPTIMIZE
1387 		cn = (struct tnode *)n;
1388 
1389 		/*
1390 		 * It's a tnode, and we can do some extra checks here if we
1391 		 * like, to avoid descending into a dead-end branch.
1392 		 * This tnode is in the parent's child array at index
1393 		 * key[p_pos..p_pos+p_bits] but potentially with some bits
1394 		 * chopped off, so in reality the index may be just a
1395 		 * subprefix, padded with zero at the end.
1396 		 * We can also take a look at any skipped bits in this
1397 		 * tnode - everything up to p_pos is supposed to be ok,
1398 		 * and the non-chopped bits of the index (see previous
1399 		 * paragraph) are also guaranteed ok, but the rest is
1400 		 * considered unknown.
1401 		 *
1402 		 * The skipped bits are key[pos+bits..cn->pos].
1403 		 */
1404 
1405 		/* If current_prefix_length < pos+bits, we are already doing
1406 		 * actual prefix  matching, which means everything from
1407 		 * pos+(bits-chopped_off) onward must be zero along some
1408 		 * branch of this subtree - otherwise there is *no* valid
1409 		 * prefix present. Here we can only check the skipped
1410 		 * bits. Remember, since we have already indexed into the
1411 		 * parent's child array, we know that the bits we chopped off
1412 		 * *are* zero.
1413 		 */
1414 
1415 		/* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */
1416 
1417 		if (current_prefix_length < pos+bits) {
1418 			if (tkey_extract_bits(cn->key, current_prefix_length,
1419 						cn->pos - current_prefix_length) != 0 ||
1420 			    !(cn->child[0]))
1421 				goto backtrace;
1422 		}
1423 
1424 		/*
1425 		 * If chopped_off=0, the index is fully validated and we
1426 		 * only need to look at the skipped bits for this, the new,
1427 		 * tnode. What we actually want to do is to find out if
1428 		 * these skipped bits match our key perfectly, or if we will
1429 		 * have to count on finding a matching prefix further down,
1430 		 * because if we do, we would like to have some way of
1431 		 * verifying the existence of such a prefix at this point.
1432 		 */
1433 
1434 		/* The only thing we can do at this point is to verify that
1435 		 * any such matching prefix can indeed be a prefix to our
1436 		 * key, and if the bits in the node we are inspecting that
1437 		 * do not match our key are not ZERO, this cannot be true.
1438 		 * Thus, find out where there is a mismatch (before cn->pos)
1439 		 * and verify that all the mismatching bits are zero in the
1440 		 * new tnode's key.
1441 		 */
1442 
1443 		/* Note: We aren't very concerned about the part of the key
1444 		 * that precedes pn->pos+pn->bits, since it has already been
1445 		 * checked. The bits after cn->pos aren't checked since these are
1446 		 * by definition "unknown" at this point. Thus, what we want to
1447 		 * see is if we are about to enter the "prefix matching" state,
1448 		 * and in that case verify that the skipped bits that will prevail
1449 		 * throughout this subtree are zero, as they have to be if we are
1450 		 * to find a matching prefix.
1451 		 */
1452 
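		/*
		 * Example: if cn->pos == 24, cn->key == 0xC0A80100 and the
		 * search key is 0xC0A80242, then node_prefix == 0xC0A80100,
		 * key_prefix == 0xC0A80200 and pref_mismatch == 0x00000300,
		 * so the loop below finds the first mismatching bit at
		 * mp == 22.
		 */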
1453 		node_prefix = MASK_PFX(cn->key, cn->pos);
1454 		key_prefix = MASK_PFX(key, cn->pos);
1455 		pref_mismatch = key_prefix^node_prefix;
1456 		mp = 0;
1457 
1458 		/* In short: If skipped bits in this node do not match the search
1459 		 * key, enter the "prefix matching" state directly.
1460 		 */
1461 		if (pref_mismatch) {
1462 			while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1463 				mp++;
1464 				pref_mismatch = pref_mismatch <<1;
1465 			}
1466 			key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1467 
1468 			if (key_prefix != 0)
1469 				goto backtrace;
1470 
1471 			if (current_prefix_length >= cn->pos)
1472 				current_prefix_length = mp;
1473 		}
1474 #endif
1475 		pn = (struct tnode *)n; /* Descend */
1476 		chopped_off = 0;
1477 		continue;
1478 
1479 backtrace:
1480 		chopped_off++;
1481 
1482 		/* Bits that are already zero don't change the child index (cindex), so skip them */
1483 		while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1))))
1484 			chopped_off++;
1485 
1486 		/* Decrease current_prefix_length to account for the chopped-off bits */
1487 		if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1488 			current_prefix_length = pn->pos + pn->bits - chopped_off;
1489 
1490 		/*
1491 		 * Either we do the actual chop-off here or, if we have
1492 		 * chopped off all bits in this tnode, walk up to our parent.
1493 		 */
1494 
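		/*
		 * Example: with pn->bits == 3 and cindex == 0b101, successive
		 * backtracks clear the lowest remaining set bit each time,
		 * retrying child 0b100, then 0b000, and finally (once
		 * chopped_off > pn->bits) climbing up to the parent tnode.
		 */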
1495 		if (chopped_off <= pn->bits) {
1496 			cindex &= ~(1 << (chopped_off-1));
1497 		} else {
1498 			if (NODE_PARENT(pn) == NULL)
1499 				goto failed;
1500 
1501 			/* Get Child's index */
1502 			cindex = tkey_extract_bits(pn->key, NODE_PARENT(pn)->pos, NODE_PARENT(pn)->bits);
1503 			pn = NODE_PARENT(pn);
1504 			chopped_off = 0;
1505 
1506 #ifdef CONFIG_IP_FIB_TRIE_STATS
1507 			t->stats.backtrack++;
1508 #endif
1509 			goto backtrace;
1510 		}
1511 	}
1512 failed:
1513 	ret = 1;
1514 found:
1515 	rcu_read_unlock();
1516 	return ret;
1517 }
1518 
1519 /* only called from updater side */
1520 static int trie_leaf_remove(struct trie *t, t_key key)
1521 {
1522 	t_key cindex;
1523 	struct tnode *tp = NULL;
1524 	struct node *n = t->trie;
1525 	struct leaf *l;
1526 
1527 	pr_debug("entering trie_leaf_remove(%p)\n", n);
1528 
1529 	/* Note that in the case of skipped bits, those bits are *not* checked!
1530 	 * When we finish this, we will have NULL or a T_LEAF, and the
1531 	 * T_LEAF may or may not match our key.
1532 	 */
1533 
1534 	while (n != NULL && IS_TNODE(n)) {
1535 		struct tnode *tn = (struct tnode *) n;
1536 		check_tnode(tn);
1537 		n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits));
1538 
1539 		BUG_ON(n && NODE_PARENT(n) != tn);
1540 	}
1541 	l = (struct leaf *) n;
1542 
1543 	if (!n || !tkey_equals(l->key, key))
1544 		return 0;
1545 
1546 	/*
1547 	 * Key found.
1548 	 * Remove the leaf and rebalance the tree
1549 	 */
1550 
1551 	t->revision++;
1552 	t->size--;
1553 
1554 	tp = NODE_PARENT(n);
1555 	tnode_free((struct tnode *) n);
1556 
1557 	if (tp) {
1558 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1559 		put_child(t, (struct tnode *)tp, cindex, NULL);
1560 		rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
1561 	} else
1562 		rcu_assign_pointer(t->trie, NULL);
1563 
1564 	return 1;
1565 }
1566 
1567 /*
1568  * Caller must hold RTNL.
1569  */
1570 static int fn_trie_delete(struct fib_table *tb, struct fib_config *cfg)
1571 {
1572 	struct trie *t = (struct trie *) tb->tb_data;
1573 	u32 key, mask;
1574 	int plen = cfg->fc_dst_len;
1575 	u8 tos = cfg->fc_tos;
1576 	struct fib_alias *fa, *fa_to_delete;
1577 	struct list_head *fa_head;
1578 	struct leaf *l;
1579 	struct leaf_info *li;
1580 
1581 	if (plen > 32)
1582 		return -EINVAL;
1583 
1584 	key = ntohl(cfg->fc_dst);
1585 	mask = ntohl(inet_make_mask(plen));
1586 
1587 	if (key & ~mask)
1588 		return -EINVAL;
1589 
1590 	key = key & mask;
1591 	l = fib_find_node(t, key);
1592 
1593 	if (!l)
1594 		return -ESRCH;
1595 
1596 	fa_head = get_fa_head(l, plen);
1597 	fa = fib_find_alias(fa_head, tos, 0);
1598 
1599 	if (!fa)
1600 		return -ESRCH;
1601 
1602 	pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1603 
1604 	fa_to_delete = NULL;
1605 	fa_head = fa->fa_list.prev;
1606 
1607 	list_for_each_entry(fa, fa_head, fa_list) {
1608 		struct fib_info *fi = fa->fa_info;
1609 
1610 		if (fa->fa_tos != tos)
1611 			break;
1612 
1613 		if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1614 		    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1615 		     fa->fa_scope == cfg->fc_scope) &&
1616 		    (!cfg->fc_protocol ||
1617 		     fi->fib_protocol == cfg->fc_protocol) &&
1618 		    fib_nh_match(cfg, fi) == 0) {
1619 			fa_to_delete = fa;
1620 			break;
1621 		}
1622 	}
1623 
1624 	if (!fa_to_delete)
1625 		return -ESRCH;
1626 
1627 	fa = fa_to_delete;
1628 	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
1629 		  &cfg->fc_nlinfo, 0);
1630 
1631 	l = fib_find_node(t, key);
1632 	li = find_leaf_info(l, plen);
1633 
1634 	list_del_rcu(&fa->fa_list);
1635 
1636 	if (list_empty(fa_head)) {
1637 		hlist_del_rcu(&li->hlist);
1638 		free_leaf_info(li);
1639 	}
1640 
1641 	if (hlist_empty(&l->list))
1642 		trie_leaf_remove(t, key);
1643 
1644 	if (fa->fa_state & FA_S_ACCESSED)
1645 		rt_cache_flush(-1);
1646 
1647 	fib_release_info(fa->fa_info);
1648 	alias_free_mem_rcu(fa);
1649 	return 0;
1650 }
1651 
1652 static int trie_flush_list(struct trie *t, struct list_head *head)
1653 {
1654 	struct fib_alias *fa, *fa_node;
1655 	int found = 0;
1656 
1657 	list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1658 		struct fib_info *fi = fa->fa_info;
1659 
1660 		if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1661 			list_del_rcu(&fa->fa_list);
1662 			fib_release_info(fa->fa_info);
1663 			alias_free_mem_rcu(fa);
1664 			found++;
1665 		}
1666 	}
1667 	return found;
1668 }
1669 
1670 static int trie_flush_leaf(struct trie *t, struct leaf *l)
1671 {
1672 	int found = 0;
1673 	struct hlist_head *lih = &l->list;
1674 	struct hlist_node *node, *tmp;
1675 	struct leaf_info *li = NULL;
1676 
1677 	hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1678 		found += trie_flush_list(t, &li->falh);
1679 
1680 		if (list_empty(&li->falh)) {
1681 			hlist_del_rcu(&li->hlist);
1682 			free_leaf_info(li);
1683 		}
1684 	}
1685 	return found;
1686 }
1687 
1688 /* rcu_read_lock must be held by the caller on the read side */
1689 
1690 static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
1691 {
1692 	struct node *c = (struct node *) thisleaf;
1693 	struct tnode *p;
1694 	int idx;
1695 	struct node *trie = rcu_dereference(t->trie);
1696 
1697 	if (c == NULL) {
1698 		if (trie == NULL)
1699 			return NULL;
1700 
1701 		if (IS_LEAF(trie))          /* trie w. just a leaf */
1702 			return (struct leaf *) trie;
1703 
1704 		p = (struct tnode*) trie;  /* Start */
1705 	} else
1706 		p = (struct tnode *) NODE_PARENT(c);
1707 
1708 	while (p) {
1709 		int pos, last;
1710 
1711 		/*  Find the next child of the parent */
1712 		if (c)
1713 			pos = 1 + tkey_extract_bits(c->key, p->pos, p->bits);
1714 		else
1715 			pos = 0;
1716 
1717 		last = 1 << p->bits;
1718 		for (idx = pos; idx < last ; idx++) {
1719 			c = rcu_dereference(p->child[idx]);
1720 
1721 			if (!c)
1722 				continue;
1723 
1724 			/* Descend if tnode */
1725 			while (IS_TNODE(c)) {
1726 				p = (struct tnode *) c;
1727 				idx = 0;
1728 
1729 				/* Leftmost (first) non-NULL branch */
1730 				if (p && IS_TNODE(p))
1731 					while (!(c = rcu_dereference(p->child[idx]))
1732 					       && idx < (1<<p->bits)) idx++;
1733 
1734 				/* Done with this tnode? */
1735 				if (idx >= (1 << p->bits) || !c)
1736 					goto up;
1737 			}
1738 			return (struct leaf *) c;
1739 		}
1740 up:
1741 		/* No more children, go up one step */
1742 		c = (struct node *) p;
1743 		p = (struct tnode *) NODE_PARENT(p);
1744 	}
1745 	return NULL; /* Ready. Root of trie */
1746 }
1747 
1748 /*
1749  * Caller must hold RTNL.
1750  */
1751 static int fn_trie_flush(struct fib_table *tb)
1752 {
1753 	struct trie *t = (struct trie *) tb->tb_data;
1754 	struct leaf *ll = NULL, *l = NULL;
1755 	int found = 0, h;
1756 
1757 	t->revision++;
1758 
1759 	for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
1760 		found += trie_flush_leaf(t, l);
1761 
1762 		if (ll && hlist_empty(&ll->list))
1763 			trie_leaf_remove(t, ll->key);
1764 		ll = l;
1765 	}
1766 
1767 	if (ll && hlist_empty(&ll->list))
1768 		trie_leaf_remove(t, ll->key);
1769 
1770 	pr_debug("trie_flush found=%d\n", found);
1771 	return found;
1772 }
1773 
1774 static int trie_last_dflt = -1;
1775 
1776 static void
1777 fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1778 {
1779 	struct trie *t = (struct trie *) tb->tb_data;
1780 	int order, last_idx;
1781 	struct fib_info *fi = NULL;
1782 	struct fib_info *last_resort;
1783 	struct fib_alias *fa = NULL;
1784 	struct list_head *fa_head;
1785 	struct leaf *l;
1786 
1787 	last_idx = -1;
1788 	last_resort = NULL;
1789 	order = -1;
1790 
1791 	rcu_read_lock();
1792 
1793 	l = fib_find_node(t, 0);
1794 	if (!l)
1795 		goto out;
1796 
1797 	fa_head = get_fa_head(l, 0);
1798 	if (!fa_head)
1799 		goto out;
1800 
1801 	if (list_empty(fa_head))
1802 		goto out;
1803 
1804 	list_for_each_entry_rcu(fa, fa_head, fa_list) {
1805 		struct fib_info *next_fi = fa->fa_info;
1806 
1807 		if (fa->fa_scope != res->scope ||
1808 		    fa->fa_type != RTN_UNICAST)
1809 			continue;
1810 
1811 		if (next_fi->fib_priority > res->fi->fib_priority)
1812 			break;
1813 		if (!next_fi->fib_nh[0].nh_gw ||
1814 		    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1815 			continue;
1816 		fa->fa_state |= FA_S_ACCESSED;
1817 
1818 		if (fi == NULL) {
1819 			if (next_fi != res->fi)
1820 				break;
1821 		} else if (!fib_detect_death(fi, order, &last_resort,
1822 					     &last_idx, &trie_last_dflt)) {
1823 			if (res->fi)
1824 				fib_info_put(res->fi);
1825 			res->fi = fi;
1826 			atomic_inc(&fi->fib_clntref);
1827 			trie_last_dflt = order;
1828 			goto out;
1829 		}
1830 		fi = next_fi;
1831 		order++;
1832 	}
1833 	if (order <= 0 || fi == NULL) {
1834 		trie_last_dflt = -1;
1835 		goto out;
1836 	}
1837 
1838 	if (!fib_detect_death(fi, order, &last_resort, &last_idx, &trie_last_dflt)) {
1839 		if (res->fi)
1840 			fib_info_put(res->fi);
1841 		res->fi = fi;
1842 		atomic_inc(&fi->fib_clntref);
1843 		trie_last_dflt = order;
1844 		goto out;
1845 	}
1846 	if (last_idx >= 0) {
1847 		if (res->fi)
1848 			fib_info_put(res->fi);
1849 		res->fi = last_resort;
1850 		if (last_resort)
1851 			atomic_inc(&last_resort->fib_clntref);
1852 	}
1853 	trie_last_dflt = last_idx;
1854  out:;
1855 	rcu_read_unlock();
1856 }
1857 
1858 static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb,
1859 			   struct sk_buff *skb, struct netlink_callback *cb)
1860 {
1861 	int i, s_i;
1862 	struct fib_alias *fa;
1863 
1864 	__be32 xkey = htonl(key);
1865 
1866 	s_i = cb->args[4];
1867 	i = 0;
1868 
1869 	/* rcu_read_lock is held by the caller */
1870 
1871 	list_for_each_entry_rcu(fa, fah, fa_list) {
1872 		if (i < s_i) {
1873 			i++;
1874 			continue;
1875 		}
1876 		BUG_ON(!fa->fa_info);
1877 
1878 		if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1879 				  cb->nlh->nlmsg_seq,
1880 				  RTM_NEWROUTE,
1881 				  tb->tb_id,
1882 				  fa->fa_type,
1883 				  fa->fa_scope,
1884 				  xkey,
1885 				  plen,
1886 				  fa->fa_tos,
1887 				  fa->fa_info, 0) < 0) {
1888 			cb->args[4] = i;
1889 			return -1;
1890 		}
1891 		i++;
1892 	}
1893 	cb->args[4] = i;
1894 	return skb->len;
1895 }
1896 
1897 static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb,
1898 			     struct netlink_callback *cb)
1899 {
1900 	int h, s_h;
1901 	struct list_head *fa_head;
1902 	struct leaf *l = NULL;
1903 
1904 	s_h = cb->args[3];
1905 
1906 	for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
1907 		if (h < s_h)
1908 			continue;
1909 		if (h > s_h)
1910 			memset(&cb->args[4], 0,
1911 			       sizeof(cb->args) - 4*sizeof(cb->args[0]));
1912 
1913 		fa_head = get_fa_head(l, plen);
1914 
1915 		if (!fa_head)
1916 			continue;
1917 
1918 		if (list_empty(fa_head))
1919 			continue;
1920 
1921 		if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) {
1922 			cb->args[3] = h;
1923 			return -1;
1924 		}
1925 	}
1926 	cb->args[3] = h;
1927 	return skb->len;
1928 }
1929 
1930 static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb)
1931 {
1932 	int m, s_m;
1933 	struct trie *t = (struct trie *) tb->tb_data;
1934 
1935 	s_m = cb->args[2];
1936 
1937 	rcu_read_lock();
1938 	for (m = 0; m <= 32; m++) {
1939 		if (m < s_m)
1940 			continue;
1941 		if (m > s_m)
1942 			memset(&cb->args[3], 0,
1943 				sizeof(cb->args) - 3*sizeof(cb->args[0]));
1944 
1945 		if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) {
1946 			cb->args[2] = m;
1947 			goto out;
1948 		}
1949 	}
1950 	rcu_read_unlock();
1951 	cb->args[2] = m;
1952 	return skb->len;
1953 out:
1954 	rcu_read_unlock();
1955 	return -1;
1956 }
1957 
1958 /* Fix more generic FIB names for init later */
1959 
1960 #ifdef CONFIG_IP_MULTIPLE_TABLES
1961 struct fib_table * fib_hash_init(u32 id)
1962 #else
1963 struct fib_table * __init fib_hash_init(u32 id)
1964 #endif
1965 {
1966 	struct fib_table *tb;
1967 	struct trie *t;
1968 
1969 	if (fn_alias_kmem == NULL)
1970 		fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1971 						  sizeof(struct fib_alias),
1972 						  0, SLAB_HWCACHE_ALIGN,
1973 						  NULL, NULL);
1974 
1975 	tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
1976 		     GFP_KERNEL);
1977 	if (tb == NULL)
1978 		return NULL;
1979 
1980 	tb->tb_id = id;
1981 	tb->tb_lookup = fn_trie_lookup;
1982 	tb->tb_insert = fn_trie_insert;
1983 	tb->tb_delete = fn_trie_delete;
1984 	tb->tb_flush = fn_trie_flush;
1985 	tb->tb_select_default = fn_trie_select_default;
1986 	tb->tb_dump = fn_trie_dump;
1987 	memset(tb->tb_data, 0, sizeof(struct trie));
1988 
1989 	t = (struct trie *) tb->tb_data;
1990 
1991 	trie_init(t);
1992 
1993 	if (id == RT_TABLE_LOCAL)
1994 		trie_local = t;
1995 	else if (id == RT_TABLE_MAIN)
1996 		trie_main = t;
1997 
1998 	if (id == RT_TABLE_LOCAL)
1999 		printk(KERN_INFO "IPv4 FIB: Using LC-trie version %s\n", VERSION);
2000 
2001 	return tb;
2002 }
2003 
2004 #ifdef CONFIG_PROC_FS
2005 /* Depth-first trie walk iterator */
2006 struct fib_trie_iter {
2007 	struct tnode *tnode;
2008 	struct trie *trie;
2009 	unsigned index;
2010 	unsigned depth;
2011 };
2012 
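/*
 * Advance the iterator one node in depth-first order without a stack:
 * scan the current tnode's children from iter->index upwards, descend
 * when a child tnode is found, and once all children are exhausted
 * climb to the parent and continue at the slot just after the one we
 * came from (recomputed with tkey_extract_bits()).
 */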
2013 static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
2014 {
2015 	struct tnode *tn = iter->tnode;
2016 	unsigned cindex = iter->index;
2017 	struct tnode *p;
2018 
2019 	/* Root is a single leaf (or the trie is empty): nothing to walk */
2020 	if (!tn)
2021 		return NULL;
2022 
2023 	pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
2024 		 iter->tnode, iter->index, iter->depth);
2025 rescan:
2026 	while (cindex < (1<<tn->bits)) {
2027 		struct node *n = tnode_get_child(tn, cindex);
2028 
2029 		if (n) {
2030 			if (IS_LEAF(n)) {
2031 				iter->tnode = tn;
2032 				iter->index = cindex + 1;
2033 			} else {
2034 				/* push down one level */
2035 				iter->tnode = (struct tnode *) n;
2036 				iter->index = 0;
2037 				++iter->depth;
2038 			}
2039 			return n;
2040 		}
2041 
2042 		++cindex;
2043 	}
2044 
2045 	/* Current node exhausted, pop back up */
2046 	p = NODE_PARENT(tn);
2047 	if (p) {
2048 		cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
2049 		tn = p;
2050 		--iter->depth;
2051 		goto rescan;
2052 	}
2053 
2054 	/* got root? */
2055 	return NULL;
2056 }
2057 
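/*
 * Position the iterator at the root of a trie.  A tnode root starts
 * the walk at depth 1; a lone leaf root is returned directly with
 * iter->tnode left NULL so the next call terminates the walk.
 */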
2058 static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
2059 				       struct trie *t)
2060 {
2061 	struct node *n;
2062 
2063 	if (!t)
2064 		return NULL;
2065 
2066 	n = rcu_dereference(t->trie);
2067 
2068 	if (!iter)
2069 		return NULL;
2070 
2071 	if (n) {
2072 		if (IS_TNODE(n)) {
2073 			iter->tnode = (struct tnode *) n;
2074 			iter->trie = t;
2075 			iter->index = 0;
2076 			iter->depth = 1;
2077 		} else {
2078 			iter->tnode = NULL;
2079 			iter->trie  = t;
2080 			iter->index = 0;
2081 			iter->depth = 0;
2082 		}
2083 		return n;
2084 	}
2085 	return NULL;
2086 }
2087 
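/*
 * Walk the whole trie under rcu_read_lock() and gather the numbers
 * shown in /proc/net/fib_triestats: leaf and tnode counts, leaf
 * depths, the tnode size histogram and unused (NULL) child slots.
 */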
2088 static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2089 {
2090 	struct node *n;
2091 	struct fib_trie_iter iter;
2092 
2093 	memset(s, 0, sizeof(*s));
2094 
2095 	rcu_read_lock();
2096 	for (n = fib_trie_get_first(&iter, t); n;
2097 	     n = fib_trie_get_next(&iter)) {
2098 		if (IS_LEAF(n)) {
2099 			s->leaves++;
2100 			s->totdepth += iter.depth;
2101 			if (iter.depth > s->maxdepth)
2102 				s->maxdepth = iter.depth;
2103 		} else {
2104 			const struct tnode *tn = (const struct tnode *) n;
2105 			int i;
2106 
2107 			s->tnodes++;
2108 			if (tn->bits < MAX_STAT_DEPTH)
2109 				s->nodesizes[tn->bits]++;
2110 
2111 			for (i = 0; i < (1<<tn->bits); i++)
2112 				if (!tn->child[i])
2113 					s->nullpointers++;
2114 		}
2115 	}
2116 	rcu_read_unlock();
2117 }
2118 
2119 /*
2120  *	This outputs /proc/net/fib_triestats
2121  */
2122 static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2123 {
2124 	unsigned i, max, pointers, bytes, avdepth;
2125 
2126 	if (stat->leaves)
2127 		avdepth = stat->totdepth*100 / stat->leaves;
2128 	else
2129 		avdepth = 0;
2130 
2131 	seq_printf(seq, "\tAver depth:     %d.%02d\n", avdepth / 100, avdepth % 100 );
2132 	seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);
2133 
2134 	seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
2135 
2136 	bytes = sizeof(struct leaf) * stat->leaves;
2137 	seq_printf(seq, "\tInternal nodes: %d\n\t", stat->tnodes);
2138 	bytes += sizeof(struct tnode) * stat->tnodes;
2139 
2140 	max = MAX_STAT_DEPTH;
2141 	while (max > 0 && stat->nodesizes[max-1] == 0)
2142 		max--;
2143 
2144 	pointers = 0;
2145 	for (i = 1; i < max; i++)
2146 		if (stat->nodesizes[i] != 0) {
2147 			seq_printf(seq, "  %d: %d",  i, stat->nodesizes[i]);
2148 			pointers += (1<<i) * stat->nodesizes[i];
2149 		}
2150 	seq_putc(seq, '\n');
2151 	seq_printf(seq, "\tPointers: %d\n", pointers);
2152 
2153 	bytes += sizeof(struct node *) * pointers;
2154 	seq_printf(seq, "Null ptrs: %d\n", stat->nullpointers);
2155 	seq_printf(seq, "Total size: %d  kB\n", (bytes + 1023) / 1024);
2156 
2157 #ifdef CONFIG_IP_FIB_TRIE_STATS
2158 	seq_printf(seq, "Counters:\n---------\n");
2159 	seq_printf(seq,"gets = %d\n", t->stats.gets);
2160 	seq_printf(seq,"backtracks = %d\n", t->stats.backtrack);
2161 	seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed);
2162 	seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss);
2163 	seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit);
2164 	seq_printf(seq,"skipped node resize = %d\n", t->stats.resize_node_skipped);
2165 #ifdef CLEAR_STATS
2166 	memset(&(t->stats), 0, sizeof(t->stats));
2167 #endif
2168 #endif /*  CONFIG_IP_FIB_TRIE_STATS */
2169 }
2170 
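/* Collect and print the statistics for the local and main tries. */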
2171 static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2172 {
2173 	struct trie_stat *stat;
2174 
2175 	stat = kmalloc(sizeof(*stat), GFP_KERNEL);
2176 	if (!stat)
2177 		return -ENOMEM;
2178 
2179 	seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n",
2180 		   sizeof(struct leaf), sizeof(struct tnode));
2181 
2182 	if (trie_local) {
2183 		seq_printf(seq, "Local:\n");
2184 		trie_collect_stats(trie_local, stat);
2185 		trie_show_stats(seq, stat);
2186 	}
2187 
2188 	if (trie_main) {
2189 		seq_printf(seq, "Main:\n");
2190 		trie_collect_stats(trie_main, stat);
2191 		trie_show_stats(seq, stat);
2192 	}
2193 	kfree(stat);
2194 
2195 	return 0;
2196 }
2197 
2198 static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2199 {
2200 	return single_open(file, fib_triestat_seq_show, NULL);
2201 }
2202 
2203 static const struct file_operations fib_triestat_fops = {
2204 	.owner	= THIS_MODULE,
2205 	.open	= fib_triestat_seq_open,
2206 	.read	= seq_read,
2207 	.llseek	= seq_lseek,
2208 	.release = single_release,
2209 };
2210 
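/*
 * Map a linear seq_file position onto a trie node: the local trie is
 * walked first, then the main trie, counting nodes until pos is hit.
 */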
2211 static struct node *fib_trie_get_idx(struct fib_trie_iter *iter,
2212 				      loff_t pos)
2213 {
2214 	loff_t idx = 0;
2215 	struct node *n;
2216 
2217 	for (n = fib_trie_get_first(iter, trie_local);
2218 	     n; ++idx, n = fib_trie_get_next(iter)) {
2219 		if (pos == idx)
2220 			return n;
2221 	}
2222 
2223 	for (n = fib_trie_get_first(iter, trie_main);
2224 	     n; ++idx, n = fib_trie_get_next(iter)) {
2225 		if (pos == idx)
2226 			return n;
2227 	}
2228 	return NULL;
2229 }
2230 
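/*
 * seq_file iteration: position 0 is SEQ_START_TOKEN so ->show can
 * emit a header line, position n+1 is the n-th node of the combined
 * local+main walk.  The RCU read lock taken here is dropped in
 * fib_trie_seq_stop().
 */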
2231 static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2232 {
2233 	rcu_read_lock();
2234 	if (*pos == 0)
2235 		return SEQ_START_TOKEN;
2236 	return fib_trie_get_idx(seq->private, *pos - 1);
2237 }
2238 
2239 static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2240 {
2241 	struct fib_trie_iter *iter = seq->private;
2242 	void *l = v;
2243 
2244 	++*pos;
2245 	if (v == SEQ_START_TOKEN)
2246 		return fib_trie_get_idx(iter, 0);
2247 
2248 	v = fib_trie_get_next(iter);
2249 	BUG_ON(v == l);
2250 	if (v)
2251 		return v;
2252 
2253 	/* continue scan in next trie */
2254 	if (iter->trie == trie_local)
2255 		return fib_trie_get_first(iter, trie_main);
2256 
2257 	return NULL;
2258 }
2259 
2260 static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2261 {
2262 	rcu_read_unlock();
2263 }
2264 
2265 static void seq_indent(struct seq_file *seq, int n)
2266 {
2267 	while (n-- > 0) seq_puts(seq, "   ");
2268 }
2269 
2270 static inline const char *rtn_scope(enum rt_scope_t s)
2271 {
2272 	static char buf[32];
2273 
2274 	switch (s) {
2275 	case RT_SCOPE_UNIVERSE: return "universe";
2276 	case RT_SCOPE_SITE:	return "site";
2277 	case RT_SCOPE_LINK:	return "link";
2278 	case RT_SCOPE_HOST:	return "host";
2279 	case RT_SCOPE_NOWHERE:	return "nowhere";
2280 	default:
2281 		snprintf(buf, sizeof(buf), "scope=%d", s);
2282 		return buf;
2283 	}
2284 }
2285 
2286 static const char *rtn_type_names[__RTN_MAX] = {
2287 	[RTN_UNSPEC] = "UNSPEC",
2288 	[RTN_UNICAST] = "UNICAST",
2289 	[RTN_LOCAL] = "LOCAL",
2290 	[RTN_BROADCAST] = "BROADCAST",
2291 	[RTN_ANYCAST] = "ANYCAST",
2292 	[RTN_MULTICAST] = "MULTICAST",
2293 	[RTN_BLACKHOLE] = "BLACKHOLE",
2294 	[RTN_UNREACHABLE] = "UNREACHABLE",
2295 	[RTN_PROHIBIT] = "PROHIBIT",
2296 	[RTN_THROW] = "THROW",
2297 	[RTN_NAT] = "NAT",
2298 	[RTN_XRESOLVE] = "XRESOLVE",
2299 };
2300 
2301 static inline const char *rtn_type(unsigned t)
2302 {
2303 	static char buf[32];
2304 
2305 	if (t < __RTN_MAX && rtn_type_names[t])
2306 		return rtn_type_names[t];
2307 	snprintf(buf, sizeof(buf), "type %d", t);
2308 	return buf;
2309 }
2310 
2311 /* Pretty print the trie */
2312 static int fib_trie_seq_show(struct seq_file *seq, void *v)
2313 {
2314 	const struct fib_trie_iter *iter = seq->private;
2315 	struct node *n = v;
2316 
2317 	if (v == SEQ_START_TOKEN)
2318 		return 0;
2319 
2320 	if (!NODE_PARENT(n)) {
2321 		if (iter->trie == trie_local)
2322 			seq_puts(seq, "<local>:\n");
2323 		else
2324 			seq_puts(seq, "<main>:\n");
2325 	}
2326 
2327 	if (IS_TNODE(n)) {
2328 		struct tnode *tn = (struct tnode *) n;
2329 		__be32 prf = htonl(MASK_PFX(tn->key, tn->pos));
2330 
2331 		seq_indent(seq, iter->depth-1);
2332 		seq_printf(seq, "  +-- %d.%d.%d.%d/%d %d %d %d\n",
2333 			   NIPQUAD(prf), tn->pos, tn->bits, tn->full_children,
2334 			   tn->empty_children);
2335 
2336 	} else {
2337 		struct leaf *l = (struct leaf *) n;
2338 		int i;
2339 		__be32 val = htonl(l->key);
2340 
2341 		seq_indent(seq, iter->depth);
2342 		seq_printf(seq, "  |-- %d.%d.%d.%d\n", NIPQUAD(val));
2343 		for (i = 32; i >= 0; i--) {
2344 			struct leaf_info *li = find_leaf_info(l, i);
2345 			if (li) {
2346 				struct fib_alias *fa;
2347 				list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2348 					seq_indent(seq, iter->depth+1);
2349 					seq_printf(seq, "  /%d %s %s", i,
2350 						   rtn_scope(fa->fa_scope),
2351 						   rtn_type(fa->fa_type));
2352 					if (fa->fa_tos)
2353 						seq_printf(seq, " tos=%d",
2354 							   fa->fa_tos);
2355 					seq_putc(seq, '\n');
2356 				}
2357 			}
2358 		}
2359 	}
2360 
2361 	return 0;
2362 }
2363 
2364 static const struct seq_operations fib_trie_seq_ops = {
2365 	.start  = fib_trie_seq_start,
2366 	.next   = fib_trie_seq_next,
2367 	.stop   = fib_trie_seq_stop,
2368 	.show   = fib_trie_seq_show,
2369 };
2370 
2371 static int fib_trie_seq_open(struct inode *inode, struct file *file)
2372 {
2373 	struct seq_file *seq;
2374 	int rc = -ENOMEM;
2375 	struct fib_trie_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
2376 
2377 	if (!s)
2378 		goto out;
2379 
2380 	rc = seq_open(file, &fib_trie_seq_ops);
2381 	if (rc)
2382 		goto out_kfree;
2383 
2384 	seq	     = file->private_data;
2385 	seq->private = s;
2386 	memset(s, 0, sizeof(*s));
2387 out:
2388 	return rc;
2389 out_kfree:
2390 	kfree(s);
2391 	goto out;
2392 }
2393 
2394 static const struct file_operations fib_trie_fops = {
2395 	.owner  = THIS_MODULE,
2396 	.open   = fib_trie_seq_open,
2397 	.read   = seq_read,
2398 	.llseek = seq_lseek,
2399 	.release = seq_release_private,
2400 };
2401 
2402 static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
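/*
 * Translate a route into the legacy RTF_* flag word shown in
 * /proc/net/route: reject routes (RTN_UNREACHABLE, RTN_PROHIBIT) get
 * RTF_REJECT, a nexthop gateway sets RTF_GATEWAY, a /32 mask sets
 * RTF_HOST, and RTF_UP is always set.
 */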
2403 {
2404 	static unsigned type2flags[RTN_MAX + 1] = {
2405 		[RTN_UNREACHABLE] = RTF_REJECT, [RTN_PROHIBIT] = RTF_REJECT,
2406 	};
2407 	unsigned flags = type2flags[type];
2408 
2409 	if (fi && fi->fib_nh->nh_gw)
2410 		flags |= RTF_GATEWAY;
2411 	if (mask == htonl(0xFFFFFFFF))
2412 		flags |= RTF_HOST;
2413 	flags |= RTF_UP;
2414 	return flags;
2415 }
2416 
2417 /*
2418  *	This outputs /proc/net/route.
2419  *	The format of the file is not supposed to be changed
2420  * 	and needs to be the same as the fib_hash output to avoid breaking
2421  *	legacy utilities
2422  */
2423 static int fib_route_seq_show(struct seq_file *seq, void *v)
2424 {
2425 	const struct fib_trie_iter *iter = seq->private;
2426 	struct leaf *l = v;
2427 	int i;
2428 	char bf[128];
2429 
2430 	if (v == SEQ_START_TOKEN) {
2431 		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2432 			   "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2433 			   "\tWindow\tIRTT");
2434 		return 0;
2435 	}
2436 
2437 	if (iter->trie == trie_local)
2438 		return 0;
2439 	if (IS_TNODE(l))
2440 		return 0;
2441 
2442 	for (i = 32; i >= 0; i--) {
2443 		struct leaf_info *li = find_leaf_info(l, i);
2444 		struct fib_alias *fa;
2445 		__be32 mask, prefix;
2446 
2447 		if (!li)
2448 			continue;
2449 
2450 		mask = inet_make_mask(li->plen);
2451 		prefix = htonl(l->key);
2452 
2453 		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2454 			const struct fib_info *fi = fa->fa_info;
2455 			unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
2456 
2457 			if (fa->fa_type == RTN_BROADCAST
2458 			    || fa->fa_type == RTN_MULTICAST)
2459 				continue;
2460 
2461 			if (fi)
2462 				snprintf(bf, sizeof(bf),
2463 					 "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
2464 					 fi->fib_dev ? fi->fib_dev->name : "*",
2465 					 prefix,
2466 					 fi->fib_nh->nh_gw, flags, 0, 0,
2467 					 fi->fib_priority,
2468 					 mask,
2469 					 (fi->fib_advmss ? fi->fib_advmss + 40 : 0),
2470 					 fi->fib_window,
2471 					 fi->fib_rtt >> 3);
2472 			else
2473 				snprintf(bf, sizeof(bf),
2474 					 "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
2475 					 prefix, 0, flags, 0, 0, 0,
2476 					 mask, 0, 0, 0);
2477 
2478 			seq_printf(seq, "%-127s\n", bf);
2479 		}
2480 	}
2481 
2482 	return 0;
2483 }
2484 
2485 static const struct seq_operations fib_route_seq_ops = {
2486 	.start  = fib_trie_seq_start,
2487 	.next   = fib_trie_seq_next,
2488 	.stop   = fib_trie_seq_stop,
2489 	.show   = fib_route_seq_show,
2490 };
2491 
2492 static int fib_route_seq_open(struct inode *inode, struct file *file)
2493 {
2494 	struct seq_file *seq;
2495 	int rc = -ENOMEM;
2496 	struct fib_trie_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
2497 
2498 	if (!s)
2499 		goto out;
2500 
2501 	rc = seq_open(file, &fib_route_seq_ops);
2502 	if (rc)
2503 		goto out_kfree;
2504 
2505 	seq	     = file->private_data;
2506 	seq->private = s;
2507 	memset(s, 0, sizeof(*s));
2508 out:
2509 	return rc;
2510 out_kfree:
2511 	kfree(s);
2512 	goto out;
2513 }
2514 
2515 static const struct file_operations fib_route_fops = {
2516 	.owner  = THIS_MODULE,
2517 	.open   = fib_route_seq_open,
2518 	.read   = seq_read,
2519 	.llseek = seq_lseek,
2520 	.release = seq_release_private,
2521 };
2522 
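/*
 * Create the /proc/net entries (fib_trie, fib_triestat and route),
 * unwinding the ones already registered if a later one fails.
 */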
2523 int __init fib_proc_init(void)
2524 {
2525 	if (!proc_net_fops_create("fib_trie", S_IRUGO, &fib_trie_fops))
2526 		goto out1;
2527 
2528 	if (!proc_net_fops_create("fib_triestat", S_IRUGO, &fib_triestat_fops))
2529 		goto out2;
2530 
2531 	if (!proc_net_fops_create("route", S_IRUGO, &fib_route_fops))
2532 		goto out3;
2533 
2534 	return 0;
2535 
2536 out3:
2537 	proc_net_remove("fib_triestat");
2538 out2:
2539 	proc_net_remove("fib_trie");
2540 out1:
2541 	return -ENOMEM;
2542 }
2543 
2544 void __init fib_proc_exit(void)
2545 {
2546 	proc_net_remove("fib_trie");
2547 	proc_net_remove("fib_triestat");
2548 	proc_net_remove("route");
2549 }
2550 
2551 #endif /* CONFIG_PROC_FS */
2552