xref: /openbmc/linux/net/ipv4/fib_trie.c (revision 643d1f7f)
1 /*
2  *   This program is free software; you can redistribute it and/or
3  *   modify it under the terms of the GNU General Public License
4  *   as published by the Free Software Foundation; either version
5  *   2 of the License, or (at your option) any later version.
6  *
7  *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8  *     & Swedish University of Agricultural Sciences.
9  *
10  *   Jens Laas <jens.laas@data.slu.se> Swedish University of
11  *     Agricultural Sciences.
12  *
13  *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
14  *
15  * This work is based on the LPC-trie which is originally descibed in:
16  *
17  * An experimental study of compression methods for dynamic tries
18  * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19  * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
20  *
21  *
22  * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23  * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24  *
25  * Version:	$Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $
26  *
27  *
28  * Code from fib_hash has been reused which includes the following header:
29  *
30  *
31  * INET		An implementation of the TCP/IP protocol suite for the LINUX
32  *		operating system.  INET is implemented using the  BSD Socket
33  *		interface as the means of communication with the user level.
34  *
35  *		IPv4 FIB: lookup engine and maintenance routines.
36  *
37  *
38  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
39  *
40  *		This program is free software; you can redistribute it and/or
41  *		modify it under the terms of the GNU General Public License
42  *		as published by the Free Software Foundation; either version
43  *		2 of the License, or (at your option) any later version.
44  *
45  * Substantial contributions to this work comes from:
46  *
47  *		David S. Miller, <davem@davemloft.net>
48  *		Stephen Hemminger <shemminger@osdl.org>
49  *		Paul E. McKenney <paulmck@us.ibm.com>
50  *		Patrick McHardy <kaber@trash.net>
51  */
52 
53 #define VERSION "0.408"
54 
55 #include <asm/uaccess.h>
56 #include <asm/system.h>
57 #include <linux/bitops.h>
58 #include <linux/types.h>
59 #include <linux/kernel.h>
60 #include <linux/mm.h>
61 #include <linux/string.h>
62 #include <linux/socket.h>
63 #include <linux/sockios.h>
64 #include <linux/errno.h>
65 #include <linux/in.h>
66 #include <linux/inet.h>
67 #include <linux/inetdevice.h>
68 #include <linux/netdevice.h>
69 #include <linux/if_arp.h>
70 #include <linux/proc_fs.h>
71 #include <linux/rcupdate.h>
72 #include <linux/skbuff.h>
73 #include <linux/netlink.h>
74 #include <linux/init.h>
75 #include <linux/list.h>
76 #include <net/net_namespace.h>
77 #include <net/ip.h>
78 #include <net/protocol.h>
79 #include <net/route.h>
80 #include <net/tcp.h>
81 #include <net/sock.h>
82 #include <net/ip_fib.h>
83 #include "fib_lookup.h"
84 
85 #define MAX_STAT_DEPTH 32
86 
87 #define KEYLENGTH (8*sizeof(t_key))
88 
89 typedef unsigned int t_key;
90 
91 #define T_TNODE 0
92 #define T_LEAF  1
93 #define NODE_TYPE_MASK	0x1UL
94 #define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
95 
96 #define IS_TNODE(n) (!(n->parent & T_LEAF))
97 #define IS_LEAF(n) (n->parent & T_LEAF)
98 
99 struct node {
100 	unsigned long parent;
101 	t_key key;
102 };
103 
104 struct leaf {
105 	unsigned long parent;
106 	t_key key;
107 	struct hlist_head list;
108 	struct rcu_head rcu;
109 };
110 
111 struct leaf_info {
112 	struct hlist_node hlist;
113 	struct rcu_head rcu;
114 	int plen;
115 	struct list_head falh;
116 };
117 
118 struct tnode {
119 	unsigned long parent;
120 	t_key key;
121 	unsigned char pos;		/* 2log(KEYLENGTH) bits needed */
122 	unsigned char bits;		/* 2log(KEYLENGTH) bits needed */
123 	unsigned int full_children;	/* KEYLENGTH bits needed */
124 	unsigned int empty_children;	/* KEYLENGTH bits needed */
125 	struct rcu_head rcu;
126 	struct node *child[0];
127 };
128 
129 #ifdef CONFIG_IP_FIB_TRIE_STATS
130 struct trie_use_stats {
131 	unsigned int gets;
132 	unsigned int backtrack;
133 	unsigned int semantic_match_passed;
134 	unsigned int semantic_match_miss;
135 	unsigned int null_node_hit;
136 	unsigned int resize_node_skipped;
137 };
138 #endif
139 
140 struct trie_stat {
141 	unsigned int totdepth;
142 	unsigned int maxdepth;
143 	unsigned int tnodes;
144 	unsigned int leaves;
145 	unsigned int nullpointers;
146 	unsigned int prefixes;
147 	unsigned int nodesizes[MAX_STAT_DEPTH];
148 };
149 
150 struct trie {
151 	struct node *trie;
152 #ifdef CONFIG_IP_FIB_TRIE_STATS
153 	struct trie_use_stats stats;
154 #endif
155 };
156 
157 static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
158 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
159 				  int wasfull);
160 static struct node *resize(struct trie *t, struct tnode *tn);
161 static struct tnode *inflate(struct trie *t, struct tnode *tn);
162 static struct tnode *halve(struct trie *t, struct tnode *tn);
163 static void tnode_free(struct tnode *tn);
164 
165 static struct kmem_cache *fn_alias_kmem __read_mostly;
166 static struct kmem_cache *trie_leaf_kmem __read_mostly;
167 
168 static inline struct tnode *node_parent(struct node *node)
169 {
170 	return (struct tnode *)(node->parent & ~NODE_TYPE_MASK);
171 }
172 
173 static inline struct tnode *node_parent_rcu(struct node *node)
174 {
175 	struct tnode *ret = node_parent(node);
176 
177 	return rcu_dereference(ret);
178 }
179 
180 static inline void node_set_parent(struct node *node, struct tnode *ptr)
181 {
182 	rcu_assign_pointer(node->parent,
183 			   (unsigned long)ptr | NODE_TYPE(node));
184 }
185 
186 static inline struct node *tnode_get_child(struct tnode *tn, unsigned int i)
187 {
188 	BUG_ON(i >= 1U << tn->bits);
189 
190 	return tn->child[i];
191 }
192 
193 static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i)
194 {
195 	struct node *ret = tnode_get_child(tn, i);
196 
197 	return rcu_dereference(ret);
198 }
199 
200 static inline int tnode_child_length(const struct tnode *tn)
201 {
202 	return 1 << tn->bits;
203 }
204 
205 static inline t_key mask_pfx(t_key k, unsigned short l)
206 {
207 	return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
208 }
209 
210 static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
211 {
212 	if (offset < KEYLENGTH)
213 		return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
214 	else
215 		return 0;
216 }
217 
218 static inline int tkey_equals(t_key a, t_key b)
219 {
220 	return a == b;
221 }
222 
223 static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
224 {
225 	if (bits == 0 || offset >= KEYLENGTH)
226 		return 1;
227 	bits = bits > KEYLENGTH ? KEYLENGTH : bits;
228 	return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
229 }
230 
231 static inline int tkey_mismatch(t_key a, int offset, t_key b)
232 {
233 	t_key diff = a ^ b;
234 	int i = offset;
235 
236 	if (!diff)
237 		return 0;
238 	while ((diff << i) >> (KEYLENGTH-1) == 0)
239 		i++;
240 	return i;
241 }
242 
243 /*
244   To understand this stuff, an understanding of keys and all their bits is
245   necessary. Every node in the trie has a key associated with it, but not
246   all of the bits in that key are significant.
247 
248   Consider a node 'n' and its parent 'tp'.
249 
250   If n is a leaf, every bit in its key is significant. Its presence is
251   necessitated by path compression, since during a tree traversal (when
252   searching for a leaf - unless we are doing an insertion) we will completely
253   ignore all skipped bits we encounter. Thus we need to verify, at the end of
254   a potentially successful search, that we have indeed been walking the
255   correct key path.
256 
257   Note that we can never "miss" the correct key in the tree if present by
258   following the wrong path. Path compression ensures that segments of the key
259   that are the same for all keys with a given prefix are skipped, but the
260   skipped part *is* identical for each node in the subtrie below the skipped
261   bit! trie_insert() in this implementation takes care of that - note the
262   call to tkey_sub_equals() in trie_insert().
263 
264   if n is an internal node - a 'tnode' here, the various parts of its key
265   have many different meanings.
266 
267   Example:
268   _________________________________________________________________
269   | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
270   -----------------------------------------------------------------
271     0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
272 
273   _________________________________________________________________
274   | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
275   -----------------------------------------------------------------
276    16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
277 
278   tp->pos = 7
279   tp->bits = 3
280   n->pos = 15
281   n->bits = 4
282 
283   First, let's just ignore the bits that come before the parent tp, that is
284   the bits from 0 to (tp->pos-1). They are *known* but at this point we do
285   not use them for anything.
286 
287   The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
288   index into the parent's child array. That is, they will be used to find
289   'n' among tp's children.
290 
291   The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
292   for the node n.
293 
294   All the bits we have seen so far are significant to the node n. The rest
295   of the bits are really not needed or indeed known in n->key.
296 
297   The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
298   n's child array, and will of course be different for each child.
299 
300 
301   The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
302   at this point.
303 
304 */
305 
306 static inline void check_tnode(const struct tnode *tn)
307 {
308 	WARN_ON(tn && tn->pos+tn->bits > 32);
309 }
310 
311 static const int halve_threshold = 25;
312 static const int inflate_threshold = 50;
313 static const int halve_threshold_root = 8;
314 static const int inflate_threshold_root = 15;
315 
316 
317 static void __alias_free_mem(struct rcu_head *head)
318 {
319 	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
320 	kmem_cache_free(fn_alias_kmem, fa);
321 }
322 
323 static inline void alias_free_mem_rcu(struct fib_alias *fa)
324 {
325 	call_rcu(&fa->rcu, __alias_free_mem);
326 }
327 
328 static void __leaf_free_rcu(struct rcu_head *head)
329 {
330 	struct leaf *l = container_of(head, struct leaf, rcu);
331 	kmem_cache_free(trie_leaf_kmem, l);
332 }
333 
334 static void __leaf_info_free_rcu(struct rcu_head *head)
335 {
336 	kfree(container_of(head, struct leaf_info, rcu));
337 }
338 
339 static inline void free_leaf_info(struct leaf_info *leaf)
340 {
341 	call_rcu(&leaf->rcu, __leaf_info_free_rcu);
342 }
343 
344 static struct tnode *tnode_alloc(size_t size)
345 {
346 	struct page *pages;
347 
348 	if (size <= PAGE_SIZE)
349 		return kzalloc(size, GFP_KERNEL);
350 
351 	pages = alloc_pages(GFP_KERNEL|__GFP_ZERO, get_order(size));
352 	if (!pages)
353 		return NULL;
354 
355 	return page_address(pages);
356 }
357 
358 static void __tnode_free_rcu(struct rcu_head *head)
359 {
360 	struct tnode *tn = container_of(head, struct tnode, rcu);
361 	size_t size = sizeof(struct tnode) +
362 		      (sizeof(struct node *) << tn->bits);
363 
364 	if (size <= PAGE_SIZE)
365 		kfree(tn);
366 	else
367 		free_pages((unsigned long)tn, get_order(size));
368 }
369 
370 static inline void tnode_free(struct tnode *tn)
371 {
372 	if (IS_LEAF(tn)) {
373 		struct leaf *l = (struct leaf *) tn;
374 		call_rcu_bh(&l->rcu, __leaf_free_rcu);
375 	} else
376 		call_rcu(&tn->rcu, __tnode_free_rcu);
377 }
378 
379 static struct leaf *leaf_new(void)
380 {
381 	struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
382 	if (l) {
383 		l->parent = T_LEAF;
384 		INIT_HLIST_HEAD(&l->list);
385 	}
386 	return l;
387 }
388 
389 static struct leaf_info *leaf_info_new(int plen)
390 {
391 	struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
392 	if (li) {
393 		li->plen = plen;
394 		INIT_LIST_HEAD(&li->falh);
395 	}
396 	return li;
397 }
398 
399 static struct tnode *tnode_new(t_key key, int pos, int bits)
400 {
401 	size_t sz = sizeof(struct tnode) + (sizeof(struct node *) << bits);
402 	struct tnode *tn = tnode_alloc(sz);
403 
404 	if (tn) {
405 		tn->parent = T_TNODE;
406 		tn->pos = pos;
407 		tn->bits = bits;
408 		tn->key = key;
409 		tn->full_children = 0;
410 		tn->empty_children = 1<<bits;
411 	}
412 
413 	pr_debug("AT %p s=%u %lu\n", tn, (unsigned int) sizeof(struct tnode),
414 		 (unsigned long) (sizeof(struct node) << bits));
415 	return tn;
416 }
417 
418 /*
419  * Check whether a tnode 'n' is "full", i.e. it is an internal node
420  * and no bits are skipped. See discussion in dyntree paper p. 6
421  */
422 
423 static inline int tnode_full(const struct tnode *tn, const struct node *n)
424 {
425 	if (n == NULL || IS_LEAF(n))
426 		return 0;
427 
428 	return ((struct tnode *) n)->pos == tn->pos + tn->bits;
429 }
430 
431 static inline void put_child(struct trie *t, struct tnode *tn, int i,
432 			     struct node *n)
433 {
434 	tnode_put_child_reorg(tn, i, n, -1);
435 }
436 
437  /*
438   * Add a child at position i overwriting the old value.
439   * Update the value of full_children and empty_children.
440   */
441 
442 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
443 				  int wasfull)
444 {
445 	struct node *chi = tn->child[i];
446 	int isfull;
447 
448 	BUG_ON(i >= 1<<tn->bits);
449 
450 	/* update emptyChildren */
451 	if (n == NULL && chi != NULL)
452 		tn->empty_children++;
453 	else if (n != NULL && chi == NULL)
454 		tn->empty_children--;
455 
456 	/* update fullChildren */
457 	if (wasfull == -1)
458 		wasfull = tnode_full(tn, chi);
459 
460 	isfull = tnode_full(tn, n);
461 	if (wasfull && !isfull)
462 		tn->full_children--;
463 	else if (!wasfull && isfull)
464 		tn->full_children++;
465 
466 	if (n)
467 		node_set_parent(n, tn);
468 
469 	rcu_assign_pointer(tn->child[i], n);
470 }
471 
472 static struct node *resize(struct trie *t, struct tnode *tn)
473 {
474 	int i;
475 	int err = 0;
476 	struct tnode *old_tn;
477 	int inflate_threshold_use;
478 	int halve_threshold_use;
479 	int max_resize;
480 
481 	if (!tn)
482 		return NULL;
483 
484 	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
485 		 tn, inflate_threshold, halve_threshold);
486 
487 	/* No children */
488 	if (tn->empty_children == tnode_child_length(tn)) {
489 		tnode_free(tn);
490 		return NULL;
491 	}
492 	/* One child */
493 	if (tn->empty_children == tnode_child_length(tn) - 1)
494 		for (i = 0; i < tnode_child_length(tn); i++) {
495 			struct node *n;
496 
497 			n = tn->child[i];
498 			if (!n)
499 				continue;
500 
501 			/* compress one level */
502 			node_set_parent(n, NULL);
503 			tnode_free(tn);
504 			return n;
505 		}
506 	/*
507 	 * Double as long as the resulting node has a number of
508 	 * nonempty nodes that are above the threshold.
509 	 */
510 
511 	/*
512 	 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
513 	 * the Helsinki University of Technology and Matti Tikkanen of Nokia
514 	 * Telecommunications, page 6:
515 	 * "A node is doubled if the ratio of non-empty children to all
516 	 * children in the *doubled* node is at least 'high'."
517 	 *
518 	 * 'high' in this instance is the variable 'inflate_threshold'. It
519 	 * is expressed as a percentage, so we multiply it with
520 	 * tnode_child_length() and instead of multiplying by 2 (since the
521 	 * child array will be doubled by inflate()) and multiplying
522 	 * the left-hand side by 100 (to handle the percentage thing) we
523 	 * multiply the left-hand side by 50.
524 	 *
525 	 * The left-hand side may look a bit weird: tnode_child_length(tn)
526 	 * - tn->empty_children is of course the number of non-null children
527 	 * in the current node. tn->full_children is the number of "full"
528 	 * children, that is non-null tnodes with a skip value of 0.
529 	 * All of those will be doubled in the resulting inflated tnode, so
530 	 * we just count them one extra time here.
531 	 *
532 	 * A clearer way to write this would be:
533 	 *
534 	 * to_be_doubled = tn->full_children;
535 	 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
536 	 *     tn->full_children;
537 	 *
538 	 * new_child_length = tnode_child_length(tn) * 2;
539 	 *
540 	 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
541 	 *      new_child_length;
542 	 * if (new_fill_factor >= inflate_threshold)
543 	 *
544 	 * ...and so on, tho it would mess up the while () loop.
545 	 *
546 	 * anyway,
547 	 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
548 	 *      inflate_threshold
549 	 *
550 	 * avoid a division:
551 	 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
552 	 *      inflate_threshold * new_child_length
553 	 *
554 	 * expand not_to_be_doubled and to_be_doubled, and shorten:
555 	 * 100 * (tnode_child_length(tn) - tn->empty_children +
556 	 *    tn->full_children) >= inflate_threshold * new_child_length
557 	 *
558 	 * expand new_child_length:
559 	 * 100 * (tnode_child_length(tn) - tn->empty_children +
560 	 *    tn->full_children) >=
561 	 *      inflate_threshold * tnode_child_length(tn) * 2
562 	 *
563 	 * shorten again:
564 	 * 50 * (tn->full_children + tnode_child_length(tn) -
565 	 *    tn->empty_children) >= inflate_threshold *
566 	 *    tnode_child_length(tn)
567 	 *
568 	 */
569 
570 	check_tnode(tn);
571 
572 	/* Keep root node larger  */
573 
574 	if (!tn->parent)
575 		inflate_threshold_use = inflate_threshold_root;
576 	else
577 		inflate_threshold_use = inflate_threshold;
578 
579 	err = 0;
580 	max_resize = 10;
581 	while ((tn->full_children > 0 &&  max_resize-- &&
582 		50 * (tn->full_children + tnode_child_length(tn)
583 		      - tn->empty_children)
584 		>= inflate_threshold_use * tnode_child_length(tn))) {
585 
586 		old_tn = tn;
587 		tn = inflate(t, tn);
588 
589 		if (IS_ERR(tn)) {
590 			tn = old_tn;
591 #ifdef CONFIG_IP_FIB_TRIE_STATS
592 			t->stats.resize_node_skipped++;
593 #endif
594 			break;
595 		}
596 	}
597 
598 	if (max_resize < 0) {
599 		if (!tn->parent)
600 			pr_warning("Fix inflate_threshold_root."
601 				   " Now=%d size=%d bits\n",
602 				   inflate_threshold_root, tn->bits);
603 		else
604 			pr_warning("Fix inflate_threshold."
605 				   " Now=%d size=%d bits\n",
606 				   inflate_threshold, tn->bits);
607 	}
608 
609 	check_tnode(tn);
610 
611 	/*
612 	 * Halve as long as the number of empty children in this
613 	 * node is above threshold.
614 	 */
615 
616 
617 	/* Keep root node larger  */
618 
619 	if (!tn->parent)
620 		halve_threshold_use = halve_threshold_root;
621 	else
622 		halve_threshold_use = halve_threshold;
623 
624 	err = 0;
625 	max_resize = 10;
626 	while (tn->bits > 1 &&  max_resize-- &&
627 	       100 * (tnode_child_length(tn) - tn->empty_children) <
628 	       halve_threshold_use * tnode_child_length(tn)) {
629 
630 		old_tn = tn;
631 		tn = halve(t, tn);
632 		if (IS_ERR(tn)) {
633 			tn = old_tn;
634 #ifdef CONFIG_IP_FIB_TRIE_STATS
635 			t->stats.resize_node_skipped++;
636 #endif
637 			break;
638 		}
639 	}
640 
641 	if (max_resize < 0) {
642 		if (!tn->parent)
643 			pr_warning("Fix halve_threshold_root."
644 				   " Now=%d size=%d bits\n",
645 				   halve_threshold_root, tn->bits);
646 		else
647 			pr_warning("Fix halve_threshold."
648 				   " Now=%d size=%d bits\n",
649 				   halve_threshold, tn->bits);
650 	}
651 
652 	/* Only one child remains */
653 	if (tn->empty_children == tnode_child_length(tn) - 1)
654 		for (i = 0; i < tnode_child_length(tn); i++) {
655 			struct node *n;
656 
657 			n = tn->child[i];
658 			if (!n)
659 				continue;
660 
661 			/* compress one level */
662 
663 			node_set_parent(n, NULL);
664 			tnode_free(tn);
665 			return n;
666 		}
667 
668 	return (struct node *) tn;
669 }
670 
671 static struct tnode *inflate(struct trie *t, struct tnode *tn)
672 {
673 	struct tnode *oldtnode = tn;
674 	int olen = tnode_child_length(tn);
675 	int i;
676 
677 	pr_debug("In inflate\n");
678 
679 	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
680 
681 	if (!tn)
682 		return ERR_PTR(-ENOMEM);
683 
684 	/*
685 	 * Preallocate and store tnodes before the actual work so we
686 	 * don't get into an inconsistent state if memory allocation
687 	 * fails. In case of failure we return the oldnode and  inflate
688 	 * of tnode is ignored.
689 	 */
690 
691 	for (i = 0; i < olen; i++) {
692 		struct tnode *inode;
693 
694 		inode = (struct tnode *) tnode_get_child(oldtnode, i);
695 		if (inode &&
696 		    IS_TNODE(inode) &&
697 		    inode->pos == oldtnode->pos + oldtnode->bits &&
698 		    inode->bits > 1) {
699 			struct tnode *left, *right;
700 			t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
701 
702 			left = tnode_new(inode->key&(~m), inode->pos + 1,
703 					 inode->bits - 1);
704 			if (!left)
705 				goto nomem;
706 
707 			right = tnode_new(inode->key|m, inode->pos + 1,
708 					  inode->bits - 1);
709 
710 			if (!right) {
711 				tnode_free(left);
712 				goto nomem;
713 			}
714 
715 			put_child(t, tn, 2*i, (struct node *) left);
716 			put_child(t, tn, 2*i+1, (struct node *) right);
717 		}
718 	}
719 
720 	for (i = 0; i < olen; i++) {
721 		struct tnode *inode;
722 		struct node *node = tnode_get_child(oldtnode, i);
723 		struct tnode *left, *right;
724 		int size, j;
725 
726 		/* An empty child */
727 		if (node == NULL)
728 			continue;
729 
730 		/* A leaf or an internal node with skipped bits */
731 
732 		if (IS_LEAF(node) || ((struct tnode *) node)->pos >
733 		   tn->pos + tn->bits - 1) {
734 			if (tkey_extract_bits(node->key,
735 					      oldtnode->pos + oldtnode->bits,
736 					      1) == 0)
737 				put_child(t, tn, 2*i, node);
738 			else
739 				put_child(t, tn, 2*i+1, node);
740 			continue;
741 		}
742 
743 		/* An internal node with two children */
744 		inode = (struct tnode *) node;
745 
746 		if (inode->bits == 1) {
747 			put_child(t, tn, 2*i, inode->child[0]);
748 			put_child(t, tn, 2*i+1, inode->child[1]);
749 
750 			tnode_free(inode);
751 			continue;
752 		}
753 
754 		/* An internal node with more than two children */
755 
756 		/* We will replace this node 'inode' with two new
757 		 * ones, 'left' and 'right', each with half of the
758 		 * original children. The two new nodes will have
759 		 * a position one bit further down the key and this
760 		 * means that the "significant" part of their keys
761 		 * (see the discussion near the top of this file)
762 		 * will differ by one bit, which will be "0" in
763 		 * left's key and "1" in right's key. Since we are
764 		 * moving the key position by one step, the bit that
765 		 * we are moving away from - the bit at position
766 		 * (inode->pos) - is the one that will differ between
767 		 * left and right. So... we synthesize that bit in the
768 		 * two  new keys.
769 		 * The mask 'm' below will be a single "one" bit at
770 		 * the position (inode->pos)
771 		 */
772 
773 		/* Use the old key, but set the new significant
774 		 *   bit to zero.
775 		 */
776 
777 		left = (struct tnode *) tnode_get_child(tn, 2*i);
778 		put_child(t, tn, 2*i, NULL);
779 
780 		BUG_ON(!left);
781 
782 		right = (struct tnode *) tnode_get_child(tn, 2*i+1);
783 		put_child(t, tn, 2*i+1, NULL);
784 
785 		BUG_ON(!right);
786 
787 		size = tnode_child_length(left);
788 		for (j = 0; j < size; j++) {
789 			put_child(t, left, j, inode->child[j]);
790 			put_child(t, right, j, inode->child[j + size]);
791 		}
792 		put_child(t, tn, 2*i, resize(t, left));
793 		put_child(t, tn, 2*i+1, resize(t, right));
794 
795 		tnode_free(inode);
796 	}
797 	tnode_free(oldtnode);
798 	return tn;
799 nomem:
800 	{
801 		int size = tnode_child_length(tn);
802 		int j;
803 
804 		for (j = 0; j < size; j++)
805 			if (tn->child[j])
806 				tnode_free((struct tnode *)tn->child[j]);
807 
808 		tnode_free(tn);
809 
810 		return ERR_PTR(-ENOMEM);
811 	}
812 }
813 
814 static struct tnode *halve(struct trie *t, struct tnode *tn)
815 {
816 	struct tnode *oldtnode = tn;
817 	struct node *left, *right;
818 	int i;
819 	int olen = tnode_child_length(tn);
820 
821 	pr_debug("In halve\n");
822 
823 	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
824 
825 	if (!tn)
826 		return ERR_PTR(-ENOMEM);
827 
828 	/*
829 	 * Preallocate and store tnodes before the actual work so we
830 	 * don't get into an inconsistent state if memory allocation
831 	 * fails. In case of failure we return the oldnode and halve
832 	 * of tnode is ignored.
833 	 */
834 
835 	for (i = 0; i < olen; i += 2) {
836 		left = tnode_get_child(oldtnode, i);
837 		right = tnode_get_child(oldtnode, i+1);
838 
839 		/* Two nonempty children */
840 		if (left && right) {
841 			struct tnode *newn;
842 
843 			newn = tnode_new(left->key, tn->pos + tn->bits, 1);
844 
845 			if (!newn)
846 				goto nomem;
847 
848 			put_child(t, tn, i/2, (struct node *)newn);
849 		}
850 
851 	}
852 
853 	for (i = 0; i < olen; i += 2) {
854 		struct tnode *newBinNode;
855 
856 		left = tnode_get_child(oldtnode, i);
857 		right = tnode_get_child(oldtnode, i+1);
858 
859 		/* At least one of the children is empty */
860 		if (left == NULL) {
861 			if (right == NULL)    /* Both are empty */
862 				continue;
863 			put_child(t, tn, i/2, right);
864 			continue;
865 		}
866 
867 		if (right == NULL) {
868 			put_child(t, tn, i/2, left);
869 			continue;
870 		}
871 
872 		/* Two nonempty children */
873 		newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
874 		put_child(t, tn, i/2, NULL);
875 		put_child(t, newBinNode, 0, left);
876 		put_child(t, newBinNode, 1, right);
877 		put_child(t, tn, i/2, resize(t, newBinNode));
878 	}
879 	tnode_free(oldtnode);
880 	return tn;
881 nomem:
882 	{
883 		int size = tnode_child_length(tn);
884 		int j;
885 
886 		for (j = 0; j < size; j++)
887 			if (tn->child[j])
888 				tnode_free((struct tnode *)tn->child[j]);
889 
890 		tnode_free(tn);
891 
892 		return ERR_PTR(-ENOMEM);
893 	}
894 }
895 
896 /* readside must use rcu_read_lock currently dump routines
897  via get_fa_head and dump */
898 
899 static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
900 {
901 	struct hlist_head *head = &l->list;
902 	struct hlist_node *node;
903 	struct leaf_info *li;
904 
905 	hlist_for_each_entry_rcu(li, node, head, hlist)
906 		if (li->plen == plen)
907 			return li;
908 
909 	return NULL;
910 }
911 
912 static inline struct list_head *get_fa_head(struct leaf *l, int plen)
913 {
914 	struct leaf_info *li = find_leaf_info(l, plen);
915 
916 	if (!li)
917 		return NULL;
918 
919 	return &li->falh;
920 }
921 
922 static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
923 {
924 	struct leaf_info *li = NULL, *last = NULL;
925 	struct hlist_node *node;
926 
927 	if (hlist_empty(head)) {
928 		hlist_add_head_rcu(&new->hlist, head);
929 	} else {
930 		hlist_for_each_entry(li, node, head, hlist) {
931 			if (new->plen > li->plen)
932 				break;
933 
934 			last = li;
935 		}
936 		if (last)
937 			hlist_add_after_rcu(&last->hlist, &new->hlist);
938 		else
939 			hlist_add_before_rcu(&new->hlist, &li->hlist);
940 	}
941 }
942 
943 /* rcu_read_lock needs to be hold by caller from readside */
944 
945 static struct leaf *
946 fib_find_node(struct trie *t, u32 key)
947 {
948 	int pos;
949 	struct tnode *tn;
950 	struct node *n;
951 
952 	pos = 0;
953 	n = rcu_dereference(t->trie);
954 
955 	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
956 		tn = (struct tnode *) n;
957 
958 		check_tnode(tn);
959 
960 		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
961 			pos = tn->pos + tn->bits;
962 			n = tnode_get_child_rcu(tn,
963 						tkey_extract_bits(key,
964 								  tn->pos,
965 								  tn->bits));
966 		} else
967 			break;
968 	}
969 	/* Case we have found a leaf. Compare prefixes */
970 
971 	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
972 		return (struct leaf *)n;
973 
974 	return NULL;
975 }
976 
977 static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
978 {
979 	int wasfull;
980 	t_key cindex, key = tn->key;
981 	struct tnode *tp;
982 
983 	while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) {
984 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
985 		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
986 		tn = (struct tnode *) resize(t, (struct tnode *)tn);
987 
988 		tnode_put_child_reorg((struct tnode *)tp, cindex,
989 				      (struct node *)tn, wasfull);
990 
991 		tp = node_parent((struct node *) tn);
992 		if (!tp)
993 			break;
994 		tn = tp;
995 	}
996 
997 	/* Handle last (top) tnode */
998 	if (IS_TNODE(tn))
999 		tn = (struct tnode *)resize(t, (struct tnode *)tn);
1000 
1001 	return (struct node *)tn;
1002 }
1003 
1004 /* only used from updater-side */
1005 
1006 static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1007 {
1008 	int pos, newpos;
1009 	struct tnode *tp = NULL, *tn = NULL;
1010 	struct node *n;
1011 	struct leaf *l;
1012 	int missbit;
1013 	struct list_head *fa_head = NULL;
1014 	struct leaf_info *li;
1015 	t_key cindex;
1016 
1017 	pos = 0;
1018 	n = t->trie;
1019 
1020 	/* If we point to NULL, stop. Either the tree is empty and we should
1021 	 * just put a new leaf in if, or we have reached an empty child slot,
1022 	 * and we should just put our new leaf in that.
1023 	 * If we point to a T_TNODE, check if it matches our key. Note that
1024 	 * a T_TNODE might be skipping any number of bits - its 'pos' need
1025 	 * not be the parent's 'pos'+'bits'!
1026 	 *
1027 	 * If it does match the current key, get pos/bits from it, extract
1028 	 * the index from our key, push the T_TNODE and walk the tree.
1029 	 *
1030 	 * If it doesn't, we have to replace it with a new T_TNODE.
1031 	 *
1032 	 * If we point to a T_LEAF, it might or might not have the same key
1033 	 * as we do. If it does, just change the value, update the T_LEAF's
1034 	 * value, and return it.
1035 	 * If it doesn't, we need to replace it with a T_TNODE.
1036 	 */
1037 
1038 	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
1039 		tn = (struct tnode *) n;
1040 
1041 		check_tnode(tn);
1042 
1043 		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
1044 			tp = tn;
1045 			pos = tn->pos + tn->bits;
1046 			n = tnode_get_child(tn,
1047 					    tkey_extract_bits(key,
1048 							      tn->pos,
1049 							      tn->bits));
1050 
1051 			BUG_ON(n && node_parent(n) != tn);
1052 		} else
1053 			break;
1054 	}
1055 
1056 	/*
1057 	 * n  ----> NULL, LEAF or TNODE
1058 	 *
1059 	 * tp is n's (parent) ----> NULL or TNODE
1060 	 */
1061 
1062 	BUG_ON(tp && IS_LEAF(tp));
1063 
1064 	/* Case 1: n is a leaf. Compare prefixes */
1065 
1066 	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1067 		l = (struct leaf *) n;
1068 		li = leaf_info_new(plen);
1069 
1070 		if (!li)
1071 			return NULL;
1072 
1073 		fa_head = &li->falh;
1074 		insert_leaf_info(&l->list, li);
1075 		goto done;
1076 	}
1077 	l = leaf_new();
1078 
1079 	if (!l)
1080 		return NULL;
1081 
1082 	l->key = key;
1083 	li = leaf_info_new(plen);
1084 
1085 	if (!li) {
1086 		tnode_free((struct tnode *) l);
1087 		return NULL;
1088 	}
1089 
1090 	fa_head = &li->falh;
1091 	insert_leaf_info(&l->list, li);
1092 
1093 	if (t->trie && n == NULL) {
1094 		/* Case 2: n is NULL, and will just insert a new leaf */
1095 
1096 		node_set_parent((struct node *)l, tp);
1097 
1098 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1099 		put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
1100 	} else {
1101 		/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1102 		/*
1103 		 *  Add a new tnode here
1104 		 *  first tnode need some special handling
1105 		 */
1106 
1107 		if (tp)
1108 			pos = tp->pos+tp->bits;
1109 		else
1110 			pos = 0;
1111 
1112 		if (n) {
1113 			newpos = tkey_mismatch(key, pos, n->key);
1114 			tn = tnode_new(n->key, newpos, 1);
1115 		} else {
1116 			newpos = 0;
1117 			tn = tnode_new(key, newpos, 1); /* First tnode */
1118 		}
1119 
1120 		if (!tn) {
1121 			free_leaf_info(li);
1122 			tnode_free((struct tnode *) l);
1123 			return NULL;
1124 		}
1125 
1126 		node_set_parent((struct node *)tn, tp);
1127 
1128 		missbit = tkey_extract_bits(key, newpos, 1);
1129 		put_child(t, tn, missbit, (struct node *)l);
1130 		put_child(t, tn, 1-missbit, n);
1131 
1132 		if (tp) {
1133 			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1134 			put_child(t, (struct tnode *)tp, cindex,
1135 				  (struct node *)tn);
1136 		} else {
1137 			rcu_assign_pointer(t->trie, (struct node *)tn);
1138 			tp = tn;
1139 		}
1140 	}
1141 
1142 	if (tp && tp->pos + tp->bits > 32)
1143 		pr_warning("fib_trie"
1144 			   " tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1145 			   tp, tp->pos, tp->bits, key, plen);
1146 
1147 	/* Rebalance the trie */
1148 
1149 	rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
1150 done:
1151 	return fa_head;
1152 }
1153 
1154 /*
1155  * Caller must hold RTNL.
1156  */
1157 static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg)
1158 {
1159 	struct trie *t = (struct trie *) tb->tb_data;
1160 	struct fib_alias *fa, *new_fa;
1161 	struct list_head *fa_head = NULL;
1162 	struct fib_info *fi;
1163 	int plen = cfg->fc_dst_len;
1164 	u8 tos = cfg->fc_tos;
1165 	u32 key, mask;
1166 	int err;
1167 	struct leaf *l;
1168 
1169 	if (plen > 32)
1170 		return -EINVAL;
1171 
1172 	key = ntohl(cfg->fc_dst);
1173 
1174 	pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
1175 
1176 	mask = ntohl(inet_make_mask(plen));
1177 
1178 	if (key & ~mask)
1179 		return -EINVAL;
1180 
1181 	key = key & mask;
1182 
1183 	fi = fib_create_info(cfg);
1184 	if (IS_ERR(fi)) {
1185 		err = PTR_ERR(fi);
1186 		goto err;
1187 	}
1188 
1189 	l = fib_find_node(t, key);
1190 	fa = NULL;
1191 
1192 	if (l) {
1193 		fa_head = get_fa_head(l, plen);
1194 		fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1195 	}
1196 
1197 	/* Now fa, if non-NULL, points to the first fib alias
1198 	 * with the same keys [prefix,tos,priority], if such key already
1199 	 * exists or to the node before which we will insert new one.
1200 	 *
1201 	 * If fa is NULL, we will need to allocate a new one and
1202 	 * insert to the head of f.
1203 	 *
1204 	 * If f is NULL, no fib node matched the destination key
1205 	 * and we need to allocate a new one of those as well.
1206 	 */
1207 
1208 	if (fa && fa->fa_tos == tos &&
1209 	    fa->fa_info->fib_priority == fi->fib_priority) {
1210 		struct fib_alias *fa_first, *fa_match;
1211 
1212 		err = -EEXIST;
1213 		if (cfg->fc_nlflags & NLM_F_EXCL)
1214 			goto out;
1215 
1216 		/* We have 2 goals:
1217 		 * 1. Find exact match for type, scope, fib_info to avoid
1218 		 * duplicate routes
1219 		 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
1220 		 */
1221 		fa_match = NULL;
1222 		fa_first = fa;
1223 		fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1224 		list_for_each_entry_continue(fa, fa_head, fa_list) {
1225 			if (fa->fa_tos != tos)
1226 				break;
1227 			if (fa->fa_info->fib_priority != fi->fib_priority)
1228 				break;
1229 			if (fa->fa_type == cfg->fc_type &&
1230 			    fa->fa_scope == cfg->fc_scope &&
1231 			    fa->fa_info == fi) {
1232 				fa_match = fa;
1233 				break;
1234 			}
1235 		}
1236 
1237 		if (cfg->fc_nlflags & NLM_F_REPLACE) {
1238 			struct fib_info *fi_drop;
1239 			u8 state;
1240 
1241 			fa = fa_first;
1242 			if (fa_match) {
1243 				if (fa == fa_match)
1244 					err = 0;
1245 				goto out;
1246 			}
1247 			err = -ENOBUFS;
1248 			new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1249 			if (new_fa == NULL)
1250 				goto out;
1251 
1252 			fi_drop = fa->fa_info;
1253 			new_fa->fa_tos = fa->fa_tos;
1254 			new_fa->fa_info = fi;
1255 			new_fa->fa_type = cfg->fc_type;
1256 			new_fa->fa_scope = cfg->fc_scope;
1257 			state = fa->fa_state;
1258 			new_fa->fa_state = state & ~FA_S_ACCESSED;
1259 
1260 			list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1261 			alias_free_mem_rcu(fa);
1262 
1263 			fib_release_info(fi_drop);
1264 			if (state & FA_S_ACCESSED)
1265 				rt_cache_flush(-1);
1266 			rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1267 				tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
1268 
1269 			goto succeeded;
1270 		}
1271 		/* Error if we find a perfect match which
1272 		 * uses the same scope, type, and nexthop
1273 		 * information.
1274 		 */
1275 		if (fa_match)
1276 			goto out;
1277 
1278 		if (!(cfg->fc_nlflags & NLM_F_APPEND))
1279 			fa = fa_first;
1280 	}
1281 	err = -ENOENT;
1282 	if (!(cfg->fc_nlflags & NLM_F_CREATE))
1283 		goto out;
1284 
1285 	err = -ENOBUFS;
1286 	new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1287 	if (new_fa == NULL)
1288 		goto out;
1289 
1290 	new_fa->fa_info = fi;
1291 	new_fa->fa_tos = tos;
1292 	new_fa->fa_type = cfg->fc_type;
1293 	new_fa->fa_scope = cfg->fc_scope;
1294 	new_fa->fa_state = 0;
1295 	/*
1296 	 * Insert new entry to the list.
1297 	 */
1298 
1299 	if (!fa_head) {
1300 		fa_head = fib_insert_node(t, key, plen);
1301 		if (unlikely(!fa_head)) {
1302 			err = -ENOMEM;
1303 			goto out_free_new_fa;
1304 		}
1305 	}
1306 
1307 	list_add_tail_rcu(&new_fa->fa_list,
1308 			  (fa ? &fa->fa_list : fa_head));
1309 
1310 	rt_cache_flush(-1);
1311 	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
1312 		  &cfg->fc_nlinfo, 0);
1313 succeeded:
1314 	return 0;
1315 
1316 out_free_new_fa:
1317 	kmem_cache_free(fn_alias_kmem, new_fa);
1318 out:
1319 	fib_release_info(fi);
1320 err:
1321 	return err;
1322 }
1323 
1324 /* should be called with rcu_read_lock */
1325 static int check_leaf(struct trie *t, struct leaf *l,
1326 		      t_key key,  const struct flowi *flp,
1327 		      struct fib_result *res)
1328 {
1329 	struct leaf_info *li;
1330 	struct hlist_head *hhead = &l->list;
1331 	struct hlist_node *node;
1332 
1333 	hlist_for_each_entry_rcu(li, node, hhead, hlist) {
1334 		int err;
1335 		int plen = li->plen;
1336 		__be32 mask = inet_make_mask(plen);
1337 
1338 		if (l->key != (key & ntohl(mask)))
1339 			continue;
1340 
1341 		err = fib_semantic_match(&li->falh, flp, res,
1342 					 htonl(l->key), mask, plen);
1343 
1344 #ifdef CONFIG_IP_FIB_TRIE_STATS
1345 		if (err <= 0)
1346 			t->stats.semantic_match_passed++;
1347 		else
1348 			t->stats.semantic_match_miss++;
1349 #endif
1350 		if (err <= 0)
1351 			return plen;
1352 	}
1353 
1354 	return -1;
1355 }
1356 
1357 static int fn_trie_lookup(struct fib_table *tb, const struct flowi *flp,
1358 			  struct fib_result *res)
1359 {
1360 	struct trie *t = (struct trie *) tb->tb_data;
1361 	int plen, ret = 0;
1362 	struct node *n;
1363 	struct tnode *pn;
1364 	int pos, bits;
1365 	t_key key = ntohl(flp->fl4_dst);
1366 	int chopped_off;
1367 	t_key cindex = 0;
1368 	int current_prefix_length = KEYLENGTH;
1369 	struct tnode *cn;
1370 	t_key node_prefix, key_prefix, pref_mismatch;
1371 	int mp;
1372 
1373 	rcu_read_lock();
1374 
1375 	n = rcu_dereference(t->trie);
1376 	if (!n)
1377 		goto failed;
1378 
1379 #ifdef CONFIG_IP_FIB_TRIE_STATS
1380 	t->stats.gets++;
1381 #endif
1382 
1383 	/* Just a leaf? */
1384 	if (IS_LEAF(n)) {
1385 		plen = check_leaf(t, (struct leaf *)n, key, flp, res);
1386 		if (plen < 0)
1387 			goto failed;
1388 		ret = 0;
1389 		goto found;
1390 	}
1391 
1392 	pn = (struct tnode *) n;
1393 	chopped_off = 0;
1394 
1395 	while (pn) {
1396 		pos = pn->pos;
1397 		bits = pn->bits;
1398 
1399 		if (!chopped_off)
1400 			cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
1401 						   pos, bits);
1402 
1403 		n = tnode_get_child(pn, cindex);
1404 
1405 		if (n == NULL) {
1406 #ifdef CONFIG_IP_FIB_TRIE_STATS
1407 			t->stats.null_node_hit++;
1408 #endif
1409 			goto backtrace;
1410 		}
1411 
1412 		if (IS_LEAF(n)) {
1413 			plen = check_leaf(t, (struct leaf *)n, key, flp, res);
1414 			if (plen < 0)
1415 				goto backtrace;
1416 
1417 			ret = 0;
1418 			goto found;
1419 		}
1420 
1421 		cn = (struct tnode *)n;
1422 
1423 		/*
1424 		 * It's a tnode, and we can do some extra checks here if we
1425 		 * like, to avoid descending into a dead-end branch.
1426 		 * This tnode is in the parent's child array at index
1427 		 * key[p_pos..p_pos+p_bits] but potentially with some bits
1428 		 * chopped off, so in reality the index may be just a
1429 		 * subprefix, padded with zero at the end.
1430 		 * We can also take a look at any skipped bits in this
1431 		 * tnode - everything up to p_pos is supposed to be ok,
1432 		 * and the non-chopped bits of the index (se previous
1433 		 * paragraph) are also guaranteed ok, but the rest is
1434 		 * considered unknown.
1435 		 *
1436 		 * The skipped bits are key[pos+bits..cn->pos].
1437 		 */
1438 
1439 		/* If current_prefix_length < pos+bits, we are already doing
1440 		 * actual prefix  matching, which means everything from
1441 		 * pos+(bits-chopped_off) onward must be zero along some
1442 		 * branch of this subtree - otherwise there is *no* valid
1443 		 * prefix present. Here we can only check the skipped
1444 		 * bits. Remember, since we have already indexed into the
1445 		 * parent's child array, we know that the bits we chopped of
1446 		 * *are* zero.
1447 		 */
1448 
1449 		/* NOTA BENE: Checking only skipped bits
1450 		   for the new node here */
1451 
1452 		if (current_prefix_length < pos+bits) {
1453 			if (tkey_extract_bits(cn->key, current_prefix_length,
1454 						cn->pos - current_prefix_length)
1455 			    || !(cn->child[0]))
1456 				goto backtrace;
1457 		}
1458 
1459 		/*
1460 		 * If chopped_off=0, the index is fully validated and we
1461 		 * only need to look at the skipped bits for this, the new,
1462 		 * tnode. What we actually want to do is to find out if
1463 		 * these skipped bits match our key perfectly, or if we will
1464 		 * have to count on finding a matching prefix further down,
1465 		 * because if we do, we would like to have some way of
1466 		 * verifying the existence of such a prefix at this point.
1467 		 */
1468 
1469 		/* The only thing we can do at this point is to verify that
1470 		 * any such matching prefix can indeed be a prefix to our
1471 		 * key, and if the bits in the node we are inspecting that
1472 		 * do not match our key are not ZERO, this cannot be true.
1473 		 * Thus, find out where there is a mismatch (before cn->pos)
1474 		 * and verify that all the mismatching bits are zero in the
1475 		 * new tnode's key.
1476 		 */
1477 
1478 		/*
1479 		 * Note: We aren't very concerned about the piece of
1480 		 * the key that precede pn->pos+pn->bits, since these
1481 		 * have already been checked. The bits after cn->pos
1482 		 * aren't checked since these are by definition
1483 		 * "unknown" at this point. Thus, what we want to see
1484 		 * is if we are about to enter the "prefix matching"
1485 		 * state, and in that case verify that the skipped
1486 		 * bits that will prevail throughout this subtree are
1487 		 * zero, as they have to be if we are to find a
1488 		 * matching prefix.
1489 		 */
1490 
1491 		node_prefix = mask_pfx(cn->key, cn->pos);
1492 		key_prefix = mask_pfx(key, cn->pos);
1493 		pref_mismatch = key_prefix^node_prefix;
1494 		mp = 0;
1495 
1496 		/*
1497 		 * In short: If skipped bits in this node do not match
1498 		 * the search key, enter the "prefix matching"
1499 		 * state.directly.
1500 		 */
1501 		if (pref_mismatch) {
1502 			while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1503 				mp++;
1504 				pref_mismatch = pref_mismatch << 1;
1505 			}
1506 			key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1507 
1508 			if (key_prefix != 0)
1509 				goto backtrace;
1510 
1511 			if (current_prefix_length >= cn->pos)
1512 				current_prefix_length = mp;
1513 		}
1514 
1515 		pn = (struct tnode *)n; /* Descend */
1516 		chopped_off = 0;
1517 		continue;
1518 
1519 backtrace:
1520 		chopped_off++;
1521 
1522 		/* As zero don't change the child key (cindex) */
1523 		while ((chopped_off <= pn->bits)
1524 		       && !(cindex & (1<<(chopped_off-1))))
1525 			chopped_off++;
1526 
1527 		/* Decrease current_... with bits chopped off */
1528 		if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1529 			current_prefix_length = pn->pos + pn->bits
1530 				- chopped_off;
1531 
1532 		/*
1533 		 * Either we do the actual chop off according or if we have
1534 		 * chopped off all bits in this tnode walk up to our parent.
1535 		 */
1536 
1537 		if (chopped_off <= pn->bits) {
1538 			cindex &= ~(1 << (chopped_off-1));
1539 		} else {
1540 			struct tnode *parent = node_parent((struct node *) pn);
1541 			if (!parent)
1542 				goto failed;
1543 
1544 			/* Get Child's index */
1545 			cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
1546 			pn = parent;
1547 			chopped_off = 0;
1548 
1549 #ifdef CONFIG_IP_FIB_TRIE_STATS
1550 			t->stats.backtrack++;
1551 #endif
1552 			goto backtrace;
1553 		}
1554 	}
1555 failed:
1556 	ret = 1;
1557 found:
1558 	rcu_read_unlock();
1559 	return ret;
1560 }
1561 
1562 /*
1563  * Remove the leaf and return parent.
1564  */
1565 static void trie_leaf_remove(struct trie *t, struct leaf *l)
1566 {
1567 	struct tnode *tp = node_parent((struct node *) l);
1568 
1569 	pr_debug("entering trie_leaf_remove(%p)\n", l);
1570 
1571 	if (tp) {
1572 		t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
1573 		put_child(t, (struct tnode *)tp, cindex, NULL);
1574 		rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
1575 	} else
1576 		rcu_assign_pointer(t->trie, NULL);
1577 
1578 	tnode_free((struct tnode *) l);
1579 }
1580 
1581 /*
1582  * Caller must hold RTNL.
1583  */
1584 static int fn_trie_delete(struct fib_table *tb, struct fib_config *cfg)
1585 {
1586 	struct trie *t = (struct trie *) tb->tb_data;
1587 	u32 key, mask;
1588 	int plen = cfg->fc_dst_len;
1589 	u8 tos = cfg->fc_tos;
1590 	struct fib_alias *fa, *fa_to_delete;
1591 	struct list_head *fa_head;
1592 	struct leaf *l;
1593 	struct leaf_info *li;
1594 
1595 	if (plen > 32)
1596 		return -EINVAL;
1597 
1598 	key = ntohl(cfg->fc_dst);
1599 	mask = ntohl(inet_make_mask(plen));
1600 
1601 	if (key & ~mask)
1602 		return -EINVAL;
1603 
1604 	key = key & mask;
1605 	l = fib_find_node(t, key);
1606 
1607 	if (!l)
1608 		return -ESRCH;
1609 
1610 	fa_head = get_fa_head(l, plen);
1611 	fa = fib_find_alias(fa_head, tos, 0);
1612 
1613 	if (!fa)
1614 		return -ESRCH;
1615 
1616 	pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1617 
1618 	fa_to_delete = NULL;
1619 	fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1620 	list_for_each_entry_continue(fa, fa_head, fa_list) {
1621 		struct fib_info *fi = fa->fa_info;
1622 
1623 		if (fa->fa_tos != tos)
1624 			break;
1625 
1626 		if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1627 		    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1628 		     fa->fa_scope == cfg->fc_scope) &&
1629 		    (!cfg->fc_protocol ||
1630 		     fi->fib_protocol == cfg->fc_protocol) &&
1631 		    fib_nh_match(cfg, fi) == 0) {
1632 			fa_to_delete = fa;
1633 			break;
1634 		}
1635 	}
1636 
1637 	if (!fa_to_delete)
1638 		return -ESRCH;
1639 
1640 	fa = fa_to_delete;
1641 	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
1642 		  &cfg->fc_nlinfo, 0);
1643 
1644 	l = fib_find_node(t, key);
1645 	li = find_leaf_info(l, plen);
1646 
1647 	list_del_rcu(&fa->fa_list);
1648 
1649 	if (list_empty(fa_head)) {
1650 		hlist_del_rcu(&li->hlist);
1651 		free_leaf_info(li);
1652 	}
1653 
1654 	if (hlist_empty(&l->list))
1655 		trie_leaf_remove(t, l);
1656 
1657 	if (fa->fa_state & FA_S_ACCESSED)
1658 		rt_cache_flush(-1);
1659 
1660 	fib_release_info(fa->fa_info);
1661 	alias_free_mem_rcu(fa);
1662 	return 0;
1663 }
1664 
1665 static int trie_flush_list(struct trie *t, struct list_head *head)
1666 {
1667 	struct fib_alias *fa, *fa_node;
1668 	int found = 0;
1669 
1670 	list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1671 		struct fib_info *fi = fa->fa_info;
1672 
1673 		if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1674 			list_del_rcu(&fa->fa_list);
1675 			fib_release_info(fa->fa_info);
1676 			alias_free_mem_rcu(fa);
1677 			found++;
1678 		}
1679 	}
1680 	return found;
1681 }
1682 
1683 static int trie_flush_leaf(struct trie *t, struct leaf *l)
1684 {
1685 	int found = 0;
1686 	struct hlist_head *lih = &l->list;
1687 	struct hlist_node *node, *tmp;
1688 	struct leaf_info *li = NULL;
1689 
1690 	hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1691 		found += trie_flush_list(t, &li->falh);
1692 
1693 		if (list_empty(&li->falh)) {
1694 			hlist_del_rcu(&li->hlist);
1695 			free_leaf_info(li);
1696 		}
1697 	}
1698 	return found;
1699 }
1700 
1701 /*
1702  * Scan for the next right leaf starting at node p->child[idx]
1703  * Since we have back pointer, no recursion necessary.
1704  */
1705 static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c)
1706 {
1707 	do {
1708 		t_key idx;
1709 
1710 		if (c)
1711 			idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
1712 		else
1713 			idx = 0;
1714 
1715 		while (idx < 1u << p->bits) {
1716 			c = tnode_get_child_rcu(p, idx++);
1717 			if (!c)
1718 				continue;
1719 
1720 			if (IS_LEAF(c)) {
1721 				prefetch(p->child[idx]);
1722 				return (struct leaf *) c;
1723 			}
1724 
1725 			/* Rescan start scanning in new node */
1726 			p = (struct tnode *) c;
1727 			idx = 0;
1728 		}
1729 
1730 		/* Node empty, walk back up to parent */
1731 		c = (struct node *) p;
1732 	} while ( (p = node_parent_rcu(c)) != NULL);
1733 
1734 	return NULL; /* Root of trie */
1735 }
1736 
1737 static struct leaf *trie_firstleaf(struct trie *t)
1738 {
1739 	struct tnode *n = (struct tnode *) rcu_dereference(t->trie);
1740 
1741 	if (!n)
1742 		return NULL;
1743 
1744 	if (IS_LEAF(n))          /* trie is just a leaf */
1745 		return (struct leaf *) n;
1746 
1747 	return leaf_walk_rcu(n, NULL);
1748 }
1749 
1750 static struct leaf *trie_nextleaf(struct leaf *l)
1751 {
1752 	struct node *c = (struct node *) l;
1753 	struct tnode *p = node_parent(c);
1754 
1755 	if (!p)
1756 		return NULL;	/* trie with just one leaf */
1757 
1758 	return leaf_walk_rcu(p, c);
1759 }
1760 
1761 static struct leaf *trie_leafindex(struct trie *t, int index)
1762 {
1763 	struct leaf *l = trie_firstleaf(t);
1764 
1765 	while (index-- > 0) {
1766 		l = trie_nextleaf(l);
1767 		if (!l)
1768 			break;
1769 	}
1770 	return l;
1771 }
1772 
1773 
1774 /*
1775  * Caller must hold RTNL.
1776  */
1777 static int fn_trie_flush(struct fib_table *tb)
1778 {
1779 	struct trie *t = (struct trie *) tb->tb_data;
1780 	struct leaf *l, *ll = NULL;
1781 	int found = 0;
1782 
1783 	for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
1784 		found += trie_flush_leaf(t, l);
1785 
1786 		if (ll && hlist_empty(&ll->list))
1787 			trie_leaf_remove(t, ll);
1788 		ll = l;
1789 	}
1790 
1791 	if (ll && hlist_empty(&ll->list))
1792 		trie_leaf_remove(t, ll);
1793 
1794 	pr_debug("trie_flush found=%d\n", found);
1795 	return found;
1796 }
1797 
1798 static void fn_trie_select_default(struct fib_table *tb,
1799 				   const struct flowi *flp,
1800 				   struct fib_result *res)
1801 {
1802 	struct trie *t = (struct trie *) tb->tb_data;
1803 	int order, last_idx;
1804 	struct fib_info *fi = NULL;
1805 	struct fib_info *last_resort;
1806 	struct fib_alias *fa = NULL;
1807 	struct list_head *fa_head;
1808 	struct leaf *l;
1809 
1810 	last_idx = -1;
1811 	last_resort = NULL;
1812 	order = -1;
1813 
1814 	rcu_read_lock();
1815 
1816 	l = fib_find_node(t, 0);
1817 	if (!l)
1818 		goto out;
1819 
1820 	fa_head = get_fa_head(l, 0);
1821 	if (!fa_head)
1822 		goto out;
1823 
1824 	if (list_empty(fa_head))
1825 		goto out;
1826 
1827 	list_for_each_entry_rcu(fa, fa_head, fa_list) {
1828 		struct fib_info *next_fi = fa->fa_info;
1829 
1830 		if (fa->fa_scope != res->scope ||
1831 		    fa->fa_type != RTN_UNICAST)
1832 			continue;
1833 
1834 		if (next_fi->fib_priority > res->fi->fib_priority)
1835 			break;
1836 		if (!next_fi->fib_nh[0].nh_gw ||
1837 		    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1838 			continue;
1839 		fa->fa_state |= FA_S_ACCESSED;
1840 
1841 		if (fi == NULL) {
1842 			if (next_fi != res->fi)
1843 				break;
1844 		} else if (!fib_detect_death(fi, order, &last_resort,
1845 					     &last_idx, tb->tb_default)) {
1846 			fib_result_assign(res, fi);
1847 			tb->tb_default = order;
1848 			goto out;
1849 		}
1850 		fi = next_fi;
1851 		order++;
1852 	}
1853 	if (order <= 0 || fi == NULL) {
1854 		tb->tb_default = -1;
1855 		goto out;
1856 	}
1857 
1858 	if (!fib_detect_death(fi, order, &last_resort, &last_idx,
1859 				tb->tb_default)) {
1860 		fib_result_assign(res, fi);
1861 		tb->tb_default = order;
1862 		goto out;
1863 	}
1864 	if (last_idx >= 0)
1865 		fib_result_assign(res, last_resort);
1866 	tb->tb_default = last_idx;
1867 out:
1868 	rcu_read_unlock();
1869 }
1870 
1871 static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
1872 			   struct fib_table *tb,
1873 			   struct sk_buff *skb, struct netlink_callback *cb)
1874 {
1875 	int i, s_i;
1876 	struct fib_alias *fa;
1877 	__be32 xkey = htonl(key);
1878 
1879 	s_i = cb->args[5];
1880 	i = 0;
1881 
1882 	/* rcu_read_lock is hold by caller */
1883 
1884 	list_for_each_entry_rcu(fa, fah, fa_list) {
1885 		if (i < s_i) {
1886 			i++;
1887 			continue;
1888 		}
1889 
1890 		if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1891 				  cb->nlh->nlmsg_seq,
1892 				  RTM_NEWROUTE,
1893 				  tb->tb_id,
1894 				  fa->fa_type,
1895 				  fa->fa_scope,
1896 				  xkey,
1897 				  plen,
1898 				  fa->fa_tos,
1899 				  fa->fa_info, NLM_F_MULTI) < 0) {
1900 			cb->args[5] = i;
1901 			return -1;
1902 		}
1903 		i++;
1904 	}
1905 	cb->args[5] = i;
1906 	return skb->len;
1907 }
1908 
1909 static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
1910 			struct sk_buff *skb, struct netlink_callback *cb)
1911 {
1912 	struct leaf_info *li;
1913 	struct hlist_node *node;
1914 	int i, s_i;
1915 
1916 	s_i = cb->args[4];
1917 	i = 0;
1918 
1919 	/* rcu_read_lock is hold by caller */
1920 	hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
1921 		if (i < s_i) {
1922 			i++;
1923 			continue;
1924 		}
1925 
1926 		if (i > s_i)
1927 			cb->args[5] = 0;
1928 
1929 		if (list_empty(&li->falh))
1930 			continue;
1931 
1932 		if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
1933 			cb->args[4] = i;
1934 			return -1;
1935 		}
1936 		i++;
1937 	}
1938 
1939 	cb->args[4] = i;
1940 	return skb->len;
1941 }
1942 
1943 static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb,
1944 			struct netlink_callback *cb)
1945 {
1946 	struct leaf *l;
1947 	struct trie *t = (struct trie *) tb->tb_data;
1948 	t_key key = cb->args[2];
1949 	int count = cb->args[3];
1950 
1951 	rcu_read_lock();
1952 	/* Dump starting at last key.
1953 	 * Note: 0.0.0.0/0 (ie default) is first key.
1954 	 */
1955 	if (count == 0)
1956 		l = trie_firstleaf(t);
1957 	else {
1958 		/* Normally, continue from last key, but if that is missing
1959 		 * fallback to using slow rescan
1960 		 */
1961 		l = fib_find_node(t, key);
1962 		if (!l)
1963 			l = trie_leafindex(t, count);
1964 	}
1965 
1966 	while (l) {
1967 		cb->args[2] = l->key;
1968 		if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
1969 			cb->args[3] = count;
1970 			rcu_read_unlock();
1971 			return -1;
1972 		}
1973 
1974 		++count;
1975 		l = trie_nextleaf(l);
1976 		memset(&cb->args[4], 0,
1977 		       sizeof(cb->args) - 4*sizeof(cb->args[0]));
1978 	}
1979 	cb->args[3] = count;
1980 	rcu_read_unlock();
1981 
1982 	return skb->len;
1983 }
1984 
1985 void __init fib_hash_init(void)
1986 {
1987 	fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1988 					  sizeof(struct fib_alias),
1989 					  0, SLAB_PANIC, NULL);
1990 
1991 	trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
1992 					   max(sizeof(struct leaf),
1993 					       sizeof(struct leaf_info)),
1994 					   0, SLAB_PANIC, NULL);
1995 }
1996 
1997 
1998 /* Fix more generic FIB names for init later */
1999 struct fib_table *fib_hash_table(u32 id)
2000 {
2001 	struct fib_table *tb;
2002 	struct trie *t;
2003 
2004 	tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
2005 		     GFP_KERNEL);
2006 	if (tb == NULL)
2007 		return NULL;
2008 
2009 	tb->tb_id = id;
2010 	tb->tb_default = -1;
2011 	tb->tb_lookup = fn_trie_lookup;
2012 	tb->tb_insert = fn_trie_insert;
2013 	tb->tb_delete = fn_trie_delete;
2014 	tb->tb_flush = fn_trie_flush;
2015 	tb->tb_select_default = fn_trie_select_default;
2016 	tb->tb_dump = fn_trie_dump;
2017 
2018 	t = (struct trie *) tb->tb_data;
2019 	memset(t, 0, sizeof(*t));
2020 
2021 	if (id == RT_TABLE_LOCAL)
2022 		pr_info("IPv4 FIB: Using LC-trie version %s\n", VERSION);
2023 
2024 	return tb;
2025 }
2026 
2027 #ifdef CONFIG_PROC_FS
2028 /* Depth first Trie walk iterator */
2029 struct fib_trie_iter {
2030 	struct seq_net_private p;
2031 	struct trie *trie_local, *trie_main;
2032 	struct tnode *tnode;
2033 	struct trie *trie;
2034 	unsigned index;
2035 	unsigned depth;
2036 };
2037 
2038 static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
2039 {
2040 	struct tnode *tn = iter->tnode;
2041 	unsigned cindex = iter->index;
2042 	struct tnode *p;
2043 
2044 	/* A single entry routing table */
2045 	if (!tn)
2046 		return NULL;
2047 
2048 	pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
2049 		 iter->tnode, iter->index, iter->depth);
2050 rescan:
2051 	while (cindex < (1<<tn->bits)) {
2052 		struct node *n = tnode_get_child_rcu(tn, cindex);
2053 
2054 		if (n) {
2055 			if (IS_LEAF(n)) {
2056 				iter->tnode = tn;
2057 				iter->index = cindex + 1;
2058 			} else {
2059 				/* push down one level */
2060 				iter->tnode = (struct tnode *) n;
2061 				iter->index = 0;
2062 				++iter->depth;
2063 			}
2064 			return n;
2065 		}
2066 
2067 		++cindex;
2068 	}
2069 
2070 	/* Current node exhausted, pop back up */
2071 	p = node_parent_rcu((struct node *)tn);
2072 	if (p) {
2073 		cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
2074 		tn = p;
2075 		--iter->depth;
2076 		goto rescan;
2077 	}
2078 
2079 	/* got root? */
2080 	return NULL;
2081 }
2082 
2083 static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
2084 				       struct trie *t)
2085 {
2086 	struct node *n ;
2087 
2088 	if (!t)
2089 		return NULL;
2090 
2091 	n = rcu_dereference(t->trie);
2092 
2093 	if (!iter)
2094 		return NULL;
2095 
2096 	if (n) {
2097 		if (IS_TNODE(n)) {
2098 			iter->tnode = (struct tnode *) n;
2099 			iter->trie = t;
2100 			iter->index = 0;
2101 			iter->depth = 1;
2102 		} else {
2103 			iter->tnode = NULL;
2104 			iter->trie  = t;
2105 			iter->index = 0;
2106 			iter->depth = 0;
2107 		}
2108 		return n;
2109 	}
2110 	return NULL;
2111 }
2112 
2113 static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2114 {
2115 	struct node *n;
2116 	struct fib_trie_iter iter;
2117 
2118 	memset(s, 0, sizeof(*s));
2119 
2120 	rcu_read_lock();
2121 	for (n = fib_trie_get_first(&iter, t); n;
2122 	     n = fib_trie_get_next(&iter)) {
2123 		if (IS_LEAF(n)) {
2124 			struct leaf *l = (struct leaf *)n;
2125 			struct leaf_info *li;
2126 			struct hlist_node *tmp;
2127 
2128 			s->leaves++;
2129 			s->totdepth += iter.depth;
2130 			if (iter.depth > s->maxdepth)
2131 				s->maxdepth = iter.depth;
2132 
2133 			hlist_for_each_entry_rcu(li, tmp, &l->list, hlist)
2134 				++s->prefixes;
2135 		} else {
2136 			const struct tnode *tn = (const struct tnode *) n;
2137 			int i;
2138 
2139 			s->tnodes++;
2140 			if (tn->bits < MAX_STAT_DEPTH)
2141 				s->nodesizes[tn->bits]++;
2142 
2143 			for (i = 0; i < (1<<tn->bits); i++)
2144 				if (!tn->child[i])
2145 					s->nullpointers++;
2146 		}
2147 	}
2148 	rcu_read_unlock();
2149 }
2150 
2151 /*
2152  *	This outputs /proc/net/fib_triestats
2153  */
2154 static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2155 {
2156 	unsigned i, max, pointers, bytes, avdepth;
2157 
2158 	if (stat->leaves)
2159 		avdepth = stat->totdepth*100 / stat->leaves;
2160 	else
2161 		avdepth = 0;
2162 
2163 	seq_printf(seq, "\tAver depth:     %u.%02d\n",
2164 		   avdepth / 100, avdepth % 100);
2165 	seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);
2166 
2167 	seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
2168 	bytes = sizeof(struct leaf) * stat->leaves;
2169 
2170 	seq_printf(seq, "\tPrefixes:       %u\n", stat->prefixes);
2171 	bytes += sizeof(struct leaf_info) * stat->prefixes;
2172 
2173 	seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
2174 	bytes += sizeof(struct tnode) * stat->tnodes;
2175 
2176 	max = MAX_STAT_DEPTH;
2177 	while (max > 0 && stat->nodesizes[max-1] == 0)
2178 		max--;
2179 
2180 	pointers = 0;
2181 	for (i = 1; i <= max; i++)
2182 		if (stat->nodesizes[i] != 0) {
2183 			seq_printf(seq, "  %u: %u",  i, stat->nodesizes[i]);
2184 			pointers += (1<<i) * stat->nodesizes[i];
2185 		}
2186 	seq_putc(seq, '\n');
2187 	seq_printf(seq, "\tPointers: %u\n", pointers);
2188 
2189 	bytes += sizeof(struct node *) * pointers;
2190 	seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2191 	seq_printf(seq, "Total size: %u  kB\n", (bytes + 1023) / 1024);
2192 }
2193 
2194 #ifdef CONFIG_IP_FIB_TRIE_STATS
2195 static void trie_show_usage(struct seq_file *seq,
2196 			    const struct trie_use_stats *stats)
2197 {
2198 	seq_printf(seq, "\nCounters:\n---------\n");
2199 	seq_printf(seq, "gets = %u\n", stats->gets);
2200 	seq_printf(seq, "backtracks = %u\n", stats->backtrack);
2201 	seq_printf(seq, "semantic match passed = %u\n",
2202 		   stats->semantic_match_passed);
2203 	seq_printf(seq, "semantic match miss = %u\n",
2204 		   stats->semantic_match_miss);
2205 	seq_printf(seq, "null node hit= %u\n", stats->null_node_hit);
2206 	seq_printf(seq, "skipped node resize = %u\n\n",
2207 		   stats->resize_node_skipped);
2208 }
2209 #endif /*  CONFIG_IP_FIB_TRIE_STATS */
2210 
2211 static void fib_trie_show(struct seq_file *seq, const char *name,
2212 			  struct trie *trie)
2213 {
2214 	struct trie_stat stat;
2215 
2216 	trie_collect_stats(trie, &stat);
2217 	seq_printf(seq, "%s:\n", name);
2218 	trie_show_stats(seq, &stat);
2219 #ifdef CONFIG_IP_FIB_TRIE_STATS
2220 	trie_show_usage(seq, &trie->stats);
2221 #endif
2222 }
2223 
2224 static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2225 {
2226 	struct net *net = (struct net *)seq->private;
2227 	struct fib_table *tb;
2228 
2229 	seq_printf(seq,
2230 		   "Basic info: size of leaf:"
2231 		   " %Zd bytes, size of tnode: %Zd bytes.\n",
2232 		   sizeof(struct leaf), sizeof(struct tnode));
2233 
2234 	tb = fib_get_table(net, RT_TABLE_LOCAL);
2235 	if (tb)
2236 		fib_trie_show(seq, "Local", (struct trie *) tb->tb_data);
2237 
2238 	tb = fib_get_table(net, RT_TABLE_MAIN);
2239 	if (tb)
2240 		fib_trie_show(seq, "Main", (struct trie *) tb->tb_data);
2241 
2242 	return 0;
2243 }
2244 
2245 static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2246 {
2247 	int err;
2248 	struct net *net;
2249 
2250 	net = get_proc_net(inode);
2251 	if (net == NULL)
2252 		return -ENXIO;
2253 	err = single_open(file, fib_triestat_seq_show, net);
2254 	if (err < 0) {
2255 		put_net(net);
2256 		return err;
2257 	}
2258 	return 0;
2259 }
2260 
2261 static int fib_triestat_seq_release(struct inode *ino, struct file *f)
2262 {
2263 	struct seq_file *seq = f->private_data;
2264 	put_net(seq->private);
2265 	return single_release(ino, f);
2266 }
2267 
2268 static const struct file_operations fib_triestat_fops = {
2269 	.owner	= THIS_MODULE,
2270 	.open	= fib_triestat_seq_open,
2271 	.read	= seq_read,
2272 	.llseek	= seq_lseek,
2273 	.release = fib_triestat_seq_release,
2274 };
2275 
2276 static struct node *fib_trie_get_idx(struct fib_trie_iter *iter,
2277 				      loff_t pos)
2278 {
2279 	loff_t idx = 0;
2280 	struct node *n;
2281 
2282 	for (n = fib_trie_get_first(iter, iter->trie_local);
2283 	     n; ++idx, n = fib_trie_get_next(iter)) {
2284 		if (pos == idx)
2285 			return n;
2286 	}
2287 
2288 	for (n = fib_trie_get_first(iter, iter->trie_main);
2289 	     n; ++idx, n = fib_trie_get_next(iter)) {
2290 		if (pos == idx)
2291 			return n;
2292 	}
2293 	return NULL;
2294 }
2295 
2296 static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2297 	__acquires(RCU)
2298 {
2299 	struct fib_trie_iter *iter = seq->private;
2300 	struct fib_table *tb;
2301 
2302 	if (!iter->trie_local) {
2303 		tb = fib_get_table(iter->p.net, RT_TABLE_LOCAL);
2304 		if (tb)
2305 			iter->trie_local = (struct trie *) tb->tb_data;
2306 	}
2307 	if (!iter->trie_main) {
2308 		tb = fib_get_table(iter->p.net, RT_TABLE_MAIN);
2309 		if (tb)
2310 			iter->trie_main = (struct trie *) tb->tb_data;
2311 	}
2312 	rcu_read_lock();
2313 	if (*pos == 0)
2314 		return SEQ_START_TOKEN;
2315 	return fib_trie_get_idx(iter, *pos - 1);
2316 }
2317 
2318 static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2319 {
2320 	struct fib_trie_iter *iter = seq->private;
2321 	void *l = v;
2322 
2323 	++*pos;
2324 	if (v == SEQ_START_TOKEN)
2325 		return fib_trie_get_idx(iter, 0);
2326 
2327 	v = fib_trie_get_next(iter);
2328 	BUG_ON(v == l);
2329 	if (v)
2330 		return v;
2331 
2332 	/* continue scan in next trie */
2333 	if (iter->trie == iter->trie_local)
2334 		return fib_trie_get_first(iter, iter->trie_main);
2335 
2336 	return NULL;
2337 }
2338 
2339 static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2340 	__releases(RCU)
2341 {
2342 	rcu_read_unlock();
2343 }
2344 
2345 static void seq_indent(struct seq_file *seq, int n)
2346 {
2347 	while (n-- > 0) seq_puts(seq, "   ");
2348 }
2349 
2350 static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
2351 {
2352 	switch (s) {
2353 	case RT_SCOPE_UNIVERSE: return "universe";
2354 	case RT_SCOPE_SITE:	return "site";
2355 	case RT_SCOPE_LINK:	return "link";
2356 	case RT_SCOPE_HOST:	return "host";
2357 	case RT_SCOPE_NOWHERE:	return "nowhere";
2358 	default:
2359 		snprintf(buf, len, "scope=%d", s);
2360 		return buf;
2361 	}
2362 }
2363 
2364 static const char *rtn_type_names[__RTN_MAX] = {
2365 	[RTN_UNSPEC] = "UNSPEC",
2366 	[RTN_UNICAST] = "UNICAST",
2367 	[RTN_LOCAL] = "LOCAL",
2368 	[RTN_BROADCAST] = "BROADCAST",
2369 	[RTN_ANYCAST] = "ANYCAST",
2370 	[RTN_MULTICAST] = "MULTICAST",
2371 	[RTN_BLACKHOLE] = "BLACKHOLE",
2372 	[RTN_UNREACHABLE] = "UNREACHABLE",
2373 	[RTN_PROHIBIT] = "PROHIBIT",
2374 	[RTN_THROW] = "THROW",
2375 	[RTN_NAT] = "NAT",
2376 	[RTN_XRESOLVE] = "XRESOLVE",
2377 };
2378 
2379 static inline const char *rtn_type(char *buf, size_t len, unsigned t)
2380 {
2381 	if (t < __RTN_MAX && rtn_type_names[t])
2382 		return rtn_type_names[t];
2383 	snprintf(buf, len, "type %u", t);
2384 	return buf;
2385 }
2386 
2387 /* Pretty print the trie */
2388 static int fib_trie_seq_show(struct seq_file *seq, void *v)
2389 {
2390 	const struct fib_trie_iter *iter = seq->private;
2391 	struct node *n = v;
2392 
2393 	if (v == SEQ_START_TOKEN)
2394 		return 0;
2395 
2396 	if (!node_parent_rcu(n)) {
2397 		if (iter->trie == iter->trie_local)
2398 			seq_puts(seq, "<local>:\n");
2399 		else
2400 			seq_puts(seq, "<main>:\n");
2401 	}
2402 
2403 	if (IS_TNODE(n)) {
2404 		struct tnode *tn = (struct tnode *) n;
2405 		__be32 prf = htonl(mask_pfx(tn->key, tn->pos));
2406 
2407 		seq_indent(seq, iter->depth-1);
2408 		seq_printf(seq, "  +-- %d.%d.%d.%d/%d %d %d %d\n",
2409 			   NIPQUAD(prf), tn->pos, tn->bits, tn->full_children,
2410 			   tn->empty_children);
2411 
2412 	} else {
2413 		struct leaf *l = (struct leaf *) n;
2414 		struct leaf_info *li;
2415 		struct hlist_node *node;
2416 		__be32 val = htonl(l->key);
2417 
2418 		seq_indent(seq, iter->depth);
2419 		seq_printf(seq, "  |-- %d.%d.%d.%d\n", NIPQUAD(val));
2420 
2421 		hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
2422 			struct fib_alias *fa;
2423 
2424 			list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2425 				char buf1[32], buf2[32];
2426 
2427 				seq_indent(seq, iter->depth+1);
2428 				seq_printf(seq, "  /%d %s %s", li->plen,
2429 					   rtn_scope(buf1, sizeof(buf1),
2430 						     fa->fa_scope),
2431 					   rtn_type(buf2, sizeof(buf2),
2432 						    fa->fa_type));
2433 				if (fa->fa_tos)
2434 					seq_printf(seq, "tos =%d\n",
2435 						   fa->fa_tos);
2436 				seq_putc(seq, '\n');
2437 			}
2438 		}
2439 	}
2440 
2441 	return 0;
2442 }
2443 
2444 static const struct seq_operations fib_trie_seq_ops = {
2445 	.start  = fib_trie_seq_start,
2446 	.next   = fib_trie_seq_next,
2447 	.stop   = fib_trie_seq_stop,
2448 	.show   = fib_trie_seq_show,
2449 };
2450 
2451 static int fib_trie_seq_open(struct inode *inode, struct file *file)
2452 {
2453 	return seq_open_net(inode, file, &fib_trie_seq_ops,
2454 			    sizeof(struct fib_trie_iter));
2455 }
2456 
2457 static const struct file_operations fib_trie_fops = {
2458 	.owner  = THIS_MODULE,
2459 	.open   = fib_trie_seq_open,
2460 	.read   = seq_read,
2461 	.llseek = seq_lseek,
2462 	.release = seq_release_net,
2463 };
2464 
2465 static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2466 {
2467 	static unsigned type2flags[RTN_MAX + 1] = {
2468 		[7] = RTF_REJECT, [8] = RTF_REJECT,
2469 	};
2470 	unsigned flags = type2flags[type];
2471 
2472 	if (fi && fi->fib_nh->nh_gw)
2473 		flags |= RTF_GATEWAY;
2474 	if (mask == htonl(0xFFFFFFFF))
2475 		flags |= RTF_HOST;
2476 	flags |= RTF_UP;
2477 	return flags;
2478 }
2479 
2480 /*
2481  *	This outputs /proc/net/route.
2482  *	The format of the file is not supposed to be changed
2483  * 	and needs to be same as fib_hash output to avoid breaking
2484  *	legacy utilities
2485  */
2486 static int fib_route_seq_show(struct seq_file *seq, void *v)
2487 {
2488 	const struct fib_trie_iter *iter = seq->private;
2489 	struct leaf *l = v;
2490 	struct leaf_info *li;
2491 	struct hlist_node *node;
2492 
2493 	if (v == SEQ_START_TOKEN) {
2494 		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2495 			   "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2496 			   "\tWindow\tIRTT");
2497 		return 0;
2498 	}
2499 
2500 	if (iter->trie == iter->trie_local)
2501 		return 0;
2502 
2503 	if (IS_TNODE(l))
2504 		return 0;
2505 
2506 	hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
2507 		struct fib_alias *fa;
2508 		__be32 mask, prefix;
2509 
2510 		mask = inet_make_mask(li->plen);
2511 		prefix = htonl(l->key);
2512 
2513 		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2514 			const struct fib_info *fi = fa->fa_info;
2515 			unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
2516 			char bf[128];
2517 
2518 			if (fa->fa_type == RTN_BROADCAST
2519 			    || fa->fa_type == RTN_MULTICAST)
2520 				continue;
2521 
2522 			if (fi)
2523 				snprintf(bf, sizeof(bf),
2524 					 "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
2525 					 fi->fib_dev ? fi->fib_dev->name : "*",
2526 					 prefix,
2527 					 fi->fib_nh->nh_gw, flags, 0, 0,
2528 					 fi->fib_priority,
2529 					 mask,
2530 					 (fi->fib_advmss ?
2531 					  fi->fib_advmss + 40 : 0),
2532 					 fi->fib_window,
2533 					 fi->fib_rtt >> 3);
2534 			else
2535 				snprintf(bf, sizeof(bf),
2536 					 "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
2537 					 prefix, 0, flags, 0, 0, 0,
2538 					 mask, 0, 0, 0);
2539 
2540 			seq_printf(seq, "%-127s\n", bf);
2541 		}
2542 	}
2543 
2544 	return 0;
2545 }
2546 
2547 static const struct seq_operations fib_route_seq_ops = {
2548 	.start  = fib_trie_seq_start,
2549 	.next   = fib_trie_seq_next,
2550 	.stop   = fib_trie_seq_stop,
2551 	.show   = fib_route_seq_show,
2552 };
2553 
2554 static int fib_route_seq_open(struct inode *inode, struct file *file)
2555 {
2556 	return seq_open_net(inode, file, &fib_route_seq_ops,
2557 			    sizeof(struct fib_trie_iter));
2558 }
2559 
2560 static const struct file_operations fib_route_fops = {
2561 	.owner  = THIS_MODULE,
2562 	.open   = fib_route_seq_open,
2563 	.read   = seq_read,
2564 	.llseek = seq_lseek,
2565 	.release = seq_release_net,
2566 };
2567 
2568 int __net_init fib_proc_init(struct net *net)
2569 {
2570 	if (!proc_net_fops_create(net, "fib_trie", S_IRUGO, &fib_trie_fops))
2571 		goto out1;
2572 
2573 	if (!proc_net_fops_create(net, "fib_triestat", S_IRUGO,
2574 				  &fib_triestat_fops))
2575 		goto out2;
2576 
2577 	if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_route_fops))
2578 		goto out3;
2579 
2580 	return 0;
2581 
2582 out3:
2583 	proc_net_remove(net, "fib_triestat");
2584 out2:
2585 	proc_net_remove(net, "fib_trie");
2586 out1:
2587 	return -ENOMEM;
2588 }
2589 
2590 void __net_exit fib_proc_exit(struct net *net)
2591 {
2592 	proc_net_remove(net, "fib_trie");
2593 	proc_net_remove(net, "fib_triestat");
2594 	proc_net_remove(net, "route");
2595 }
2596 
2597 #endif /* CONFIG_PROC_FS */
2598