xref: /openbmc/linux/net/ipv4/fib_trie.c (revision e8e0929d)
1 /*
2  *   This program is free software; you can redistribute it and/or
3  *   modify it under the terms of the GNU General Public License
4  *   as published by the Free Software Foundation; either version
5  *   2 of the License, or (at your option) any later version.
6  *
7  *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8  *     & Swedish University of Agricultural Sciences.
9  *
10  *   Jens Laas <jens.laas@data.slu.se> Swedish University of
11  *     Agricultural Sciences.
12  *
13  *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
14  *
15  * This work is based on the LPC-trie which is originally descibed in:
16  *
17  * An experimental study of compression methods for dynamic tries
18  * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19  * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
20  *
21  *
22  * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23  * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24  *
25  *
26  * Code from fib_hash has been reused which includes the following header:
27  *
28  *
29  * INET		An implementation of the TCP/IP protocol suite for the LINUX
30  *		operating system.  INET is implemented using the  BSD Socket
31  *		interface as the means of communication with the user level.
32  *
33  *		IPv4 FIB: lookup engine and maintenance routines.
34  *
35  *
36  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
37  *
38  *		This program is free software; you can redistribute it and/or
39  *		modify it under the terms of the GNU General Public License
40  *		as published by the Free Software Foundation; either version
41  *		2 of the License, or (at your option) any later version.
42  *
43  * Substantial contributions to this work comes from:
44  *
45  *		David S. Miller, <davem@davemloft.net>
46  *		Stephen Hemminger <shemminger@osdl.org>
47  *		Paul E. McKenney <paulmck@us.ibm.com>
48  *		Patrick McHardy <kaber@trash.net>
49  */
50 
51 #define VERSION "0.409"
52 
53 #include <asm/uaccess.h>
54 #include <asm/system.h>
55 #include <linux/bitops.h>
56 #include <linux/types.h>
57 #include <linux/kernel.h>
58 #include <linux/mm.h>
59 #include <linux/string.h>
60 #include <linux/socket.h>
61 #include <linux/sockios.h>
62 #include <linux/errno.h>
63 #include <linux/in.h>
64 #include <linux/inet.h>
65 #include <linux/inetdevice.h>
66 #include <linux/netdevice.h>
67 #include <linux/if_arp.h>
68 #include <linux/proc_fs.h>
69 #include <linux/rcupdate.h>
70 #include <linux/skbuff.h>
71 #include <linux/netlink.h>
72 #include <linux/init.h>
73 #include <linux/list.h>
74 #include <net/net_namespace.h>
75 #include <net/ip.h>
76 #include <net/protocol.h>
77 #include <net/route.h>
78 #include <net/tcp.h>
79 #include <net/sock.h>
80 #include <net/ip_fib.h>
81 #include "fib_lookup.h"
82 
83 #define MAX_STAT_DEPTH 32
84 
85 #define KEYLENGTH (8*sizeof(t_key))
86 
87 typedef unsigned int t_key;
88 
89 #define T_TNODE 0
90 #define T_LEAF  1
91 #define NODE_TYPE_MASK	0x1UL
92 #define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
93 
94 #define IS_TNODE(n) (!(n->parent & T_LEAF))
95 #define IS_LEAF(n) (n->parent & T_LEAF)
96 
97 struct node {
98 	unsigned long parent;
99 	t_key key;
100 };
101 
102 struct leaf {
103 	unsigned long parent;
104 	t_key key;
105 	struct hlist_head list;
106 	struct rcu_head rcu;
107 };
108 
109 struct leaf_info {
110 	struct hlist_node hlist;
111 	struct rcu_head rcu;
112 	int plen;
113 	struct list_head falh;
114 };
115 
116 struct tnode {
117 	unsigned long parent;
118 	t_key key;
119 	unsigned char pos;		/* 2log(KEYLENGTH) bits needed */
120 	unsigned char bits;		/* 2log(KEYLENGTH) bits needed */
121 	unsigned int full_children;	/* KEYLENGTH bits needed */
122 	unsigned int empty_children;	/* KEYLENGTH bits needed */
123 	union {
124 		struct rcu_head rcu;
125 		struct work_struct work;
126 		struct tnode *tnode_free;
127 	};
128 	struct node *child[0];
129 };
130 
131 #ifdef CONFIG_IP_FIB_TRIE_STATS
132 struct trie_use_stats {
133 	unsigned int gets;
134 	unsigned int backtrack;
135 	unsigned int semantic_match_passed;
136 	unsigned int semantic_match_miss;
137 	unsigned int null_node_hit;
138 	unsigned int resize_node_skipped;
139 };
140 #endif
141 
142 struct trie_stat {
143 	unsigned int totdepth;
144 	unsigned int maxdepth;
145 	unsigned int tnodes;
146 	unsigned int leaves;
147 	unsigned int nullpointers;
148 	unsigned int prefixes;
149 	unsigned int nodesizes[MAX_STAT_DEPTH];
150 };
151 
152 struct trie {
153 	struct node *trie;
154 #ifdef CONFIG_IP_FIB_TRIE_STATS
155 	struct trie_use_stats stats;
156 #endif
157 };
158 
159 static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
160 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
161 				  int wasfull);
162 static struct node *resize(struct trie *t, struct tnode *tn);
163 static struct tnode *inflate(struct trie *t, struct tnode *tn);
164 static struct tnode *halve(struct trie *t, struct tnode *tn);
165 /* tnodes to free after resize(); protected by RTNL */
166 static struct tnode *tnode_free_head;
167 static size_t tnode_free_size;
168 
169 /*
170  * synchronize_rcu after call_rcu for that many pages; it should be especially
171  * useful before resizing the root node with PREEMPT_NONE configs; the value was
172  * obtained experimentally, aiming to avoid visible slowdown.
173  */
174 static const int sync_pages = 128;
175 
176 static struct kmem_cache *fn_alias_kmem __read_mostly;
177 static struct kmem_cache *trie_leaf_kmem __read_mostly;
178 
179 static inline struct tnode *node_parent(struct node *node)
180 {
181 	return (struct tnode *)(node->parent & ~NODE_TYPE_MASK);
182 }
183 
184 static inline struct tnode *node_parent_rcu(struct node *node)
185 {
186 	struct tnode *ret = node_parent(node);
187 
188 	return rcu_dereference(ret);
189 }
190 
191 /* Same as rcu_assign_pointer
192  * but that macro() assumes that value is a pointer.
193  */
194 static inline void node_set_parent(struct node *node, struct tnode *ptr)
195 {
196 	smp_wmb();
197 	node->parent = (unsigned long)ptr | NODE_TYPE(node);
198 }
199 
200 static inline struct node *tnode_get_child(struct tnode *tn, unsigned int i)
201 {
202 	BUG_ON(i >= 1U << tn->bits);
203 
204 	return tn->child[i];
205 }
206 
207 static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i)
208 {
209 	struct node *ret = tnode_get_child(tn, i);
210 
211 	return rcu_dereference(ret);
212 }
213 
214 static inline int tnode_child_length(const struct tnode *tn)
215 {
216 	return 1 << tn->bits;
217 }
218 
219 static inline t_key mask_pfx(t_key k, unsigned short l)
220 {
221 	return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
222 }
223 
224 static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
225 {
226 	if (offset < KEYLENGTH)
227 		return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
228 	else
229 		return 0;
230 }
231 
232 static inline int tkey_equals(t_key a, t_key b)
233 {
234 	return a == b;
235 }
236 
237 static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
238 {
239 	if (bits == 0 || offset >= KEYLENGTH)
240 		return 1;
241 	bits = bits > KEYLENGTH ? KEYLENGTH : bits;
242 	return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
243 }
244 
245 static inline int tkey_mismatch(t_key a, int offset, t_key b)
246 {
247 	t_key diff = a ^ b;
248 	int i = offset;
249 
250 	if (!diff)
251 		return 0;
252 	while ((diff << i) >> (KEYLENGTH-1) == 0)
253 		i++;
254 	return i;
255 }
256 
257 /*
258   To understand this stuff, an understanding of keys and all their bits is
259   necessary. Every node in the trie has a key associated with it, but not
260   all of the bits in that key are significant.
261 
262   Consider a node 'n' and its parent 'tp'.
263 
264   If n is a leaf, every bit in its key is significant. Its presence is
265   necessitated by path compression, since during a tree traversal (when
266   searching for a leaf - unless we are doing an insertion) we will completely
267   ignore all skipped bits we encounter. Thus we need to verify, at the end of
268   a potentially successful search, that we have indeed been walking the
269   correct key path.
270 
271   Note that we can never "miss" the correct key in the tree if present by
272   following the wrong path. Path compression ensures that segments of the key
273   that are the same for all keys with a given prefix are skipped, but the
274   skipped part *is* identical for each node in the subtrie below the skipped
275   bit! trie_insert() in this implementation takes care of that - note the
276   call to tkey_sub_equals() in trie_insert().
277 
278   if n is an internal node - a 'tnode' here, the various parts of its key
279   have many different meanings.
280 
281   Example:
282   _________________________________________________________________
283   | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
284   -----------------------------------------------------------------
285     0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
286 
287   _________________________________________________________________
288   | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
289   -----------------------------------------------------------------
290    16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
291 
292   tp->pos = 7
293   tp->bits = 3
294   n->pos = 15
295   n->bits = 4
296 
297   First, let's just ignore the bits that come before the parent tp, that is
298   the bits from 0 to (tp->pos-1). They are *known* but at this point we do
299   not use them for anything.
300 
301   The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
302   index into the parent's child array. That is, they will be used to find
303   'n' among tp's children.
304 
305   The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
306   for the node n.
307 
308   All the bits we have seen so far are significant to the node n. The rest
309   of the bits are really not needed or indeed known in n->key.
310 
311   The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
312   n's child array, and will of course be different for each child.
313 
314 
315   The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
316   at this point.
317 
318 */
319 
320 static inline void check_tnode(const struct tnode *tn)
321 {
322 	WARN_ON(tn && tn->pos+tn->bits > 32);
323 }
324 
325 static const int halve_threshold = 25;
326 static const int inflate_threshold = 50;
327 static const int halve_threshold_root = 15;
328 static const int inflate_threshold_root = 30;
329 
330 static void __alias_free_mem(struct rcu_head *head)
331 {
332 	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
333 	kmem_cache_free(fn_alias_kmem, fa);
334 }
335 
336 static inline void alias_free_mem_rcu(struct fib_alias *fa)
337 {
338 	call_rcu(&fa->rcu, __alias_free_mem);
339 }
340 
341 static void __leaf_free_rcu(struct rcu_head *head)
342 {
343 	struct leaf *l = container_of(head, struct leaf, rcu);
344 	kmem_cache_free(trie_leaf_kmem, l);
345 }
346 
347 static inline void free_leaf(struct leaf *l)
348 {
349 	call_rcu_bh(&l->rcu, __leaf_free_rcu);
350 }
351 
352 static void __leaf_info_free_rcu(struct rcu_head *head)
353 {
354 	kfree(container_of(head, struct leaf_info, rcu));
355 }
356 
357 static inline void free_leaf_info(struct leaf_info *leaf)
358 {
359 	call_rcu(&leaf->rcu, __leaf_info_free_rcu);
360 }
361 
362 static struct tnode *tnode_alloc(size_t size)
363 {
364 	if (size <= PAGE_SIZE)
365 		return kzalloc(size, GFP_KERNEL);
366 	else
367 		return __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL);
368 }
369 
370 static void __tnode_vfree(struct work_struct *arg)
371 {
372 	struct tnode *tn = container_of(arg, struct tnode, work);
373 	vfree(tn);
374 }
375 
376 static void __tnode_free_rcu(struct rcu_head *head)
377 {
378 	struct tnode *tn = container_of(head, struct tnode, rcu);
379 	size_t size = sizeof(struct tnode) +
380 		      (sizeof(struct node *) << tn->bits);
381 
382 	if (size <= PAGE_SIZE)
383 		kfree(tn);
384 	else {
385 		INIT_WORK(&tn->work, __tnode_vfree);
386 		schedule_work(&tn->work);
387 	}
388 }
389 
390 static inline void tnode_free(struct tnode *tn)
391 {
392 	if (IS_LEAF(tn))
393 		free_leaf((struct leaf *) tn);
394 	else
395 		call_rcu(&tn->rcu, __tnode_free_rcu);
396 }
397 
398 static void tnode_free_safe(struct tnode *tn)
399 {
400 	BUG_ON(IS_LEAF(tn));
401 	tn->tnode_free = tnode_free_head;
402 	tnode_free_head = tn;
403 	tnode_free_size += sizeof(struct tnode) +
404 			   (sizeof(struct node *) << tn->bits);
405 }
406 
407 static void tnode_free_flush(void)
408 {
409 	struct tnode *tn;
410 
411 	while ((tn = tnode_free_head)) {
412 		tnode_free_head = tn->tnode_free;
413 		tn->tnode_free = NULL;
414 		tnode_free(tn);
415 	}
416 
417 	if (tnode_free_size >= PAGE_SIZE * sync_pages) {
418 		tnode_free_size = 0;
419 		synchronize_rcu();
420 	}
421 }
422 
423 static struct leaf *leaf_new(void)
424 {
425 	struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
426 	if (l) {
427 		l->parent = T_LEAF;
428 		INIT_HLIST_HEAD(&l->list);
429 	}
430 	return l;
431 }
432 
433 static struct leaf_info *leaf_info_new(int plen)
434 {
435 	struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
436 	if (li) {
437 		li->plen = plen;
438 		INIT_LIST_HEAD(&li->falh);
439 	}
440 	return li;
441 }
442 
443 static struct tnode *tnode_new(t_key key, int pos, int bits)
444 {
445 	size_t sz = sizeof(struct tnode) + (sizeof(struct node *) << bits);
446 	struct tnode *tn = tnode_alloc(sz);
447 
448 	if (tn) {
449 		tn->parent = T_TNODE;
450 		tn->pos = pos;
451 		tn->bits = bits;
452 		tn->key = key;
453 		tn->full_children = 0;
454 		tn->empty_children = 1<<bits;
455 	}
456 
457 	pr_debug("AT %p s=%u %lu\n", tn, (unsigned int) sizeof(struct tnode),
458 		 (unsigned long) (sizeof(struct node) << bits));
459 	return tn;
460 }
461 
462 /*
463  * Check whether a tnode 'n' is "full", i.e. it is an internal node
464  * and no bits are skipped. See discussion in dyntree paper p. 6
465  */
466 
467 static inline int tnode_full(const struct tnode *tn, const struct node *n)
468 {
469 	if (n == NULL || IS_LEAF(n))
470 		return 0;
471 
472 	return ((struct tnode *) n)->pos == tn->pos + tn->bits;
473 }
474 
475 static inline void put_child(struct trie *t, struct tnode *tn, int i,
476 			     struct node *n)
477 {
478 	tnode_put_child_reorg(tn, i, n, -1);
479 }
480 
481  /*
482   * Add a child at position i overwriting the old value.
483   * Update the value of full_children and empty_children.
484   */
485 
486 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
487 				  int wasfull)
488 {
489 	struct node *chi = tn->child[i];
490 	int isfull;
491 
492 	BUG_ON(i >= 1<<tn->bits);
493 
494 	/* update emptyChildren */
495 	if (n == NULL && chi != NULL)
496 		tn->empty_children++;
497 	else if (n != NULL && chi == NULL)
498 		tn->empty_children--;
499 
500 	/* update fullChildren */
501 	if (wasfull == -1)
502 		wasfull = tnode_full(tn, chi);
503 
504 	isfull = tnode_full(tn, n);
505 	if (wasfull && !isfull)
506 		tn->full_children--;
507 	else if (!wasfull && isfull)
508 		tn->full_children++;
509 
510 	if (n)
511 		node_set_parent(n, tn);
512 
513 	rcu_assign_pointer(tn->child[i], n);
514 }
515 
516 #define MAX_WORK 10
517 static struct node *resize(struct trie *t, struct tnode *tn)
518 {
519 	int i;
520 	struct tnode *old_tn;
521 	int inflate_threshold_use;
522 	int halve_threshold_use;
523 	int max_work;
524 
525 	if (!tn)
526 		return NULL;
527 
528 	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
529 		 tn, inflate_threshold, halve_threshold);
530 
531 	/* No children */
532 	if (tn->empty_children == tnode_child_length(tn)) {
533 		tnode_free_safe(tn);
534 		return NULL;
535 	}
536 	/* One child */
537 	if (tn->empty_children == tnode_child_length(tn) - 1)
538 		goto one_child;
539 	/*
540 	 * Double as long as the resulting node has a number of
541 	 * nonempty nodes that are above the threshold.
542 	 */
543 
544 	/*
545 	 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
546 	 * the Helsinki University of Technology and Matti Tikkanen of Nokia
547 	 * Telecommunications, page 6:
548 	 * "A node is doubled if the ratio of non-empty children to all
549 	 * children in the *doubled* node is at least 'high'."
550 	 *
551 	 * 'high' in this instance is the variable 'inflate_threshold'. It
552 	 * is expressed as a percentage, so we multiply it with
553 	 * tnode_child_length() and instead of multiplying by 2 (since the
554 	 * child array will be doubled by inflate()) and multiplying
555 	 * the left-hand side by 100 (to handle the percentage thing) we
556 	 * multiply the left-hand side by 50.
557 	 *
558 	 * The left-hand side may look a bit weird: tnode_child_length(tn)
559 	 * - tn->empty_children is of course the number of non-null children
560 	 * in the current node. tn->full_children is the number of "full"
561 	 * children, that is non-null tnodes with a skip value of 0.
562 	 * All of those will be doubled in the resulting inflated tnode, so
563 	 * we just count them one extra time here.
564 	 *
565 	 * A clearer way to write this would be:
566 	 *
567 	 * to_be_doubled = tn->full_children;
568 	 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
569 	 *     tn->full_children;
570 	 *
571 	 * new_child_length = tnode_child_length(tn) * 2;
572 	 *
573 	 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
574 	 *      new_child_length;
575 	 * if (new_fill_factor >= inflate_threshold)
576 	 *
577 	 * ...and so on, tho it would mess up the while () loop.
578 	 *
579 	 * anyway,
580 	 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
581 	 *      inflate_threshold
582 	 *
583 	 * avoid a division:
584 	 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
585 	 *      inflate_threshold * new_child_length
586 	 *
587 	 * expand not_to_be_doubled and to_be_doubled, and shorten:
588 	 * 100 * (tnode_child_length(tn) - tn->empty_children +
589 	 *    tn->full_children) >= inflate_threshold * new_child_length
590 	 *
591 	 * expand new_child_length:
592 	 * 100 * (tnode_child_length(tn) - tn->empty_children +
593 	 *    tn->full_children) >=
594 	 *      inflate_threshold * tnode_child_length(tn) * 2
595 	 *
596 	 * shorten again:
597 	 * 50 * (tn->full_children + tnode_child_length(tn) -
598 	 *    tn->empty_children) >= inflate_threshold *
599 	 *    tnode_child_length(tn)
600 	 *
601 	 */
602 
603 	check_tnode(tn);
604 
605 	/* Keep root node larger  */
606 
607 	if (!node_parent((struct node*) tn)) {
608 		inflate_threshold_use = inflate_threshold_root;
609 		halve_threshold_use = halve_threshold_root;
610 	}
611 	else {
612 		inflate_threshold_use = inflate_threshold;
613 		halve_threshold_use = halve_threshold;
614 	}
615 
616 	max_work = MAX_WORK;
617 	while ((tn->full_children > 0 &&  max_work-- &&
618 		50 * (tn->full_children + tnode_child_length(tn)
619 		      - tn->empty_children)
620 		>= inflate_threshold_use * tnode_child_length(tn))) {
621 
622 		old_tn = tn;
623 		tn = inflate(t, tn);
624 
625 		if (IS_ERR(tn)) {
626 			tn = old_tn;
627 #ifdef CONFIG_IP_FIB_TRIE_STATS
628 			t->stats.resize_node_skipped++;
629 #endif
630 			break;
631 		}
632 	}
633 
634 	check_tnode(tn);
635 
636 	/* Return if at least one inflate is run */
637 	if( max_work != MAX_WORK)
638 		return (struct node *) tn;
639 
640 	/*
641 	 * Halve as long as the number of empty children in this
642 	 * node is above threshold.
643 	 */
644 
645 	max_work = MAX_WORK;
646 	while (tn->bits > 1 &&  max_work-- &&
647 	       100 * (tnode_child_length(tn) - tn->empty_children) <
648 	       halve_threshold_use * tnode_child_length(tn)) {
649 
650 		old_tn = tn;
651 		tn = halve(t, tn);
652 		if (IS_ERR(tn)) {
653 			tn = old_tn;
654 #ifdef CONFIG_IP_FIB_TRIE_STATS
655 			t->stats.resize_node_skipped++;
656 #endif
657 			break;
658 		}
659 	}
660 
661 
662 	/* Only one child remains */
663 	if (tn->empty_children == tnode_child_length(tn) - 1) {
664 one_child:
665 		for (i = 0; i < tnode_child_length(tn); i++) {
666 			struct node *n;
667 
668 			n = tn->child[i];
669 			if (!n)
670 				continue;
671 
672 			/* compress one level */
673 
674 			node_set_parent(n, NULL);
675 			tnode_free_safe(tn);
676 			return n;
677 		}
678 	}
679 	return (struct node *) tn;
680 }
681 
682 static struct tnode *inflate(struct trie *t, struct tnode *tn)
683 {
684 	struct tnode *oldtnode = tn;
685 	int olen = tnode_child_length(tn);
686 	int i;
687 
688 	pr_debug("In inflate\n");
689 
690 	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
691 
692 	if (!tn)
693 		return ERR_PTR(-ENOMEM);
694 
695 	/*
696 	 * Preallocate and store tnodes before the actual work so we
697 	 * don't get into an inconsistent state if memory allocation
698 	 * fails. In case of failure we return the oldnode and  inflate
699 	 * of tnode is ignored.
700 	 */
701 
702 	for (i = 0; i < olen; i++) {
703 		struct tnode *inode;
704 
705 		inode = (struct tnode *) tnode_get_child(oldtnode, i);
706 		if (inode &&
707 		    IS_TNODE(inode) &&
708 		    inode->pos == oldtnode->pos + oldtnode->bits &&
709 		    inode->bits > 1) {
710 			struct tnode *left, *right;
711 			t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
712 
713 			left = tnode_new(inode->key&(~m), inode->pos + 1,
714 					 inode->bits - 1);
715 			if (!left)
716 				goto nomem;
717 
718 			right = tnode_new(inode->key|m, inode->pos + 1,
719 					  inode->bits - 1);
720 
721 			if (!right) {
722 				tnode_free(left);
723 				goto nomem;
724 			}
725 
726 			put_child(t, tn, 2*i, (struct node *) left);
727 			put_child(t, tn, 2*i+1, (struct node *) right);
728 		}
729 	}
730 
731 	for (i = 0; i < olen; i++) {
732 		struct tnode *inode;
733 		struct node *node = tnode_get_child(oldtnode, i);
734 		struct tnode *left, *right;
735 		int size, j;
736 
737 		/* An empty child */
738 		if (node == NULL)
739 			continue;
740 
741 		/* A leaf or an internal node with skipped bits */
742 
743 		if (IS_LEAF(node) || ((struct tnode *) node)->pos >
744 		   tn->pos + tn->bits - 1) {
745 			if (tkey_extract_bits(node->key,
746 					      oldtnode->pos + oldtnode->bits,
747 					      1) == 0)
748 				put_child(t, tn, 2*i, node);
749 			else
750 				put_child(t, tn, 2*i+1, node);
751 			continue;
752 		}
753 
754 		/* An internal node with two children */
755 		inode = (struct tnode *) node;
756 
757 		if (inode->bits == 1) {
758 			put_child(t, tn, 2*i, inode->child[0]);
759 			put_child(t, tn, 2*i+1, inode->child[1]);
760 
761 			tnode_free_safe(inode);
762 			continue;
763 		}
764 
765 		/* An internal node with more than two children */
766 
767 		/* We will replace this node 'inode' with two new
768 		 * ones, 'left' and 'right', each with half of the
769 		 * original children. The two new nodes will have
770 		 * a position one bit further down the key and this
771 		 * means that the "significant" part of their keys
772 		 * (see the discussion near the top of this file)
773 		 * will differ by one bit, which will be "0" in
774 		 * left's key and "1" in right's key. Since we are
775 		 * moving the key position by one step, the bit that
776 		 * we are moving away from - the bit at position
777 		 * (inode->pos) - is the one that will differ between
778 		 * left and right. So... we synthesize that bit in the
779 		 * two  new keys.
780 		 * The mask 'm' below will be a single "one" bit at
781 		 * the position (inode->pos)
782 		 */
783 
784 		/* Use the old key, but set the new significant
785 		 *   bit to zero.
786 		 */
787 
788 		left = (struct tnode *) tnode_get_child(tn, 2*i);
789 		put_child(t, tn, 2*i, NULL);
790 
791 		BUG_ON(!left);
792 
793 		right = (struct tnode *) tnode_get_child(tn, 2*i+1);
794 		put_child(t, tn, 2*i+1, NULL);
795 
796 		BUG_ON(!right);
797 
798 		size = tnode_child_length(left);
799 		for (j = 0; j < size; j++) {
800 			put_child(t, left, j, inode->child[j]);
801 			put_child(t, right, j, inode->child[j + size]);
802 		}
803 		put_child(t, tn, 2*i, resize(t, left));
804 		put_child(t, tn, 2*i+1, resize(t, right));
805 
806 		tnode_free_safe(inode);
807 	}
808 	tnode_free_safe(oldtnode);
809 	return tn;
810 nomem:
811 	{
812 		int size = tnode_child_length(tn);
813 		int j;
814 
815 		for (j = 0; j < size; j++)
816 			if (tn->child[j])
817 				tnode_free((struct tnode *)tn->child[j]);
818 
819 		tnode_free(tn);
820 
821 		return ERR_PTR(-ENOMEM);
822 	}
823 }
824 
825 static struct tnode *halve(struct trie *t, struct tnode *tn)
826 {
827 	struct tnode *oldtnode = tn;
828 	struct node *left, *right;
829 	int i;
830 	int olen = tnode_child_length(tn);
831 
832 	pr_debug("In halve\n");
833 
834 	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
835 
836 	if (!tn)
837 		return ERR_PTR(-ENOMEM);
838 
839 	/*
840 	 * Preallocate and store tnodes before the actual work so we
841 	 * don't get into an inconsistent state if memory allocation
842 	 * fails. In case of failure we return the oldnode and halve
843 	 * of tnode is ignored.
844 	 */
845 
846 	for (i = 0; i < olen; i += 2) {
847 		left = tnode_get_child(oldtnode, i);
848 		right = tnode_get_child(oldtnode, i+1);
849 
850 		/* Two nonempty children */
851 		if (left && right) {
852 			struct tnode *newn;
853 
854 			newn = tnode_new(left->key, tn->pos + tn->bits, 1);
855 
856 			if (!newn)
857 				goto nomem;
858 
859 			put_child(t, tn, i/2, (struct node *)newn);
860 		}
861 
862 	}
863 
864 	for (i = 0; i < olen; i += 2) {
865 		struct tnode *newBinNode;
866 
867 		left = tnode_get_child(oldtnode, i);
868 		right = tnode_get_child(oldtnode, i+1);
869 
870 		/* At least one of the children is empty */
871 		if (left == NULL) {
872 			if (right == NULL)    /* Both are empty */
873 				continue;
874 			put_child(t, tn, i/2, right);
875 			continue;
876 		}
877 
878 		if (right == NULL) {
879 			put_child(t, tn, i/2, left);
880 			continue;
881 		}
882 
883 		/* Two nonempty children */
884 		newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
885 		put_child(t, tn, i/2, NULL);
886 		put_child(t, newBinNode, 0, left);
887 		put_child(t, newBinNode, 1, right);
888 		put_child(t, tn, i/2, resize(t, newBinNode));
889 	}
890 	tnode_free_safe(oldtnode);
891 	return tn;
892 nomem:
893 	{
894 		int size = tnode_child_length(tn);
895 		int j;
896 
897 		for (j = 0; j < size; j++)
898 			if (tn->child[j])
899 				tnode_free((struct tnode *)tn->child[j]);
900 
901 		tnode_free(tn);
902 
903 		return ERR_PTR(-ENOMEM);
904 	}
905 }
906 
907 /* readside must use rcu_read_lock currently dump routines
908  via get_fa_head and dump */
909 
910 static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
911 {
912 	struct hlist_head *head = &l->list;
913 	struct hlist_node *node;
914 	struct leaf_info *li;
915 
916 	hlist_for_each_entry_rcu(li, node, head, hlist)
917 		if (li->plen == plen)
918 			return li;
919 
920 	return NULL;
921 }
922 
923 static inline struct list_head *get_fa_head(struct leaf *l, int plen)
924 {
925 	struct leaf_info *li = find_leaf_info(l, plen);
926 
927 	if (!li)
928 		return NULL;
929 
930 	return &li->falh;
931 }
932 
933 static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
934 {
935 	struct leaf_info *li = NULL, *last = NULL;
936 	struct hlist_node *node;
937 
938 	if (hlist_empty(head)) {
939 		hlist_add_head_rcu(&new->hlist, head);
940 	} else {
941 		hlist_for_each_entry(li, node, head, hlist) {
942 			if (new->plen > li->plen)
943 				break;
944 
945 			last = li;
946 		}
947 		if (last)
948 			hlist_add_after_rcu(&last->hlist, &new->hlist);
949 		else
950 			hlist_add_before_rcu(&new->hlist, &li->hlist);
951 	}
952 }
953 
954 /* rcu_read_lock needs to be hold by caller from readside */
955 
956 static struct leaf *
957 fib_find_node(struct trie *t, u32 key)
958 {
959 	int pos;
960 	struct tnode *tn;
961 	struct node *n;
962 
963 	pos = 0;
964 	n = rcu_dereference(t->trie);
965 
966 	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
967 		tn = (struct tnode *) n;
968 
969 		check_tnode(tn);
970 
971 		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
972 			pos = tn->pos + tn->bits;
973 			n = tnode_get_child_rcu(tn,
974 						tkey_extract_bits(key,
975 								  tn->pos,
976 								  tn->bits));
977 		} else
978 			break;
979 	}
980 	/* Case we have found a leaf. Compare prefixes */
981 
982 	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
983 		return (struct leaf *)n;
984 
985 	return NULL;
986 }
987 
988 static void trie_rebalance(struct trie *t, struct tnode *tn)
989 {
990 	int wasfull;
991 	t_key cindex, key;
992 	struct tnode *tp;
993 
994 	key = tn->key;
995 
996 	while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) {
997 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
998 		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
999 		tn = (struct tnode *) resize(t, (struct tnode *)tn);
1000 
1001 		tnode_put_child_reorg((struct tnode *)tp, cindex,
1002 				      (struct node *)tn, wasfull);
1003 
1004 		tp = node_parent((struct node *) tn);
1005 		if (!tp)
1006 			rcu_assign_pointer(t->trie, (struct node *)tn);
1007 
1008 		tnode_free_flush();
1009 		if (!tp)
1010 			break;
1011 		tn = tp;
1012 	}
1013 
1014 	/* Handle last (top) tnode */
1015 	if (IS_TNODE(tn))
1016 		tn = (struct tnode *)resize(t, (struct tnode *)tn);
1017 
1018 	rcu_assign_pointer(t->trie, (struct node *)tn);
1019 	tnode_free_flush();
1020 
1021 	return;
1022 }
1023 
1024 /* only used from updater-side */
1025 
1026 static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1027 {
1028 	int pos, newpos;
1029 	struct tnode *tp = NULL, *tn = NULL;
1030 	struct node *n;
1031 	struct leaf *l;
1032 	int missbit;
1033 	struct list_head *fa_head = NULL;
1034 	struct leaf_info *li;
1035 	t_key cindex;
1036 
1037 	pos = 0;
1038 	n = t->trie;
1039 
1040 	/* If we point to NULL, stop. Either the tree is empty and we should
1041 	 * just put a new leaf in if, or we have reached an empty child slot,
1042 	 * and we should just put our new leaf in that.
1043 	 * If we point to a T_TNODE, check if it matches our key. Note that
1044 	 * a T_TNODE might be skipping any number of bits - its 'pos' need
1045 	 * not be the parent's 'pos'+'bits'!
1046 	 *
1047 	 * If it does match the current key, get pos/bits from it, extract
1048 	 * the index from our key, push the T_TNODE and walk the tree.
1049 	 *
1050 	 * If it doesn't, we have to replace it with a new T_TNODE.
1051 	 *
1052 	 * If we point to a T_LEAF, it might or might not have the same key
1053 	 * as we do. If it does, just change the value, update the T_LEAF's
1054 	 * value, and return it.
1055 	 * If it doesn't, we need to replace it with a T_TNODE.
1056 	 */
1057 
1058 	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
1059 		tn = (struct tnode *) n;
1060 
1061 		check_tnode(tn);
1062 
1063 		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
1064 			tp = tn;
1065 			pos = tn->pos + tn->bits;
1066 			n = tnode_get_child(tn,
1067 					    tkey_extract_bits(key,
1068 							      tn->pos,
1069 							      tn->bits));
1070 
1071 			BUG_ON(n && node_parent(n) != tn);
1072 		} else
1073 			break;
1074 	}
1075 
1076 	/*
1077 	 * n  ----> NULL, LEAF or TNODE
1078 	 *
1079 	 * tp is n's (parent) ----> NULL or TNODE
1080 	 */
1081 
1082 	BUG_ON(tp && IS_LEAF(tp));
1083 
1084 	/* Case 1: n is a leaf. Compare prefixes */
1085 
1086 	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1087 		l = (struct leaf *) n;
1088 		li = leaf_info_new(plen);
1089 
1090 		if (!li)
1091 			return NULL;
1092 
1093 		fa_head = &li->falh;
1094 		insert_leaf_info(&l->list, li);
1095 		goto done;
1096 	}
1097 	l = leaf_new();
1098 
1099 	if (!l)
1100 		return NULL;
1101 
1102 	l->key = key;
1103 	li = leaf_info_new(plen);
1104 
1105 	if (!li) {
1106 		free_leaf(l);
1107 		return NULL;
1108 	}
1109 
1110 	fa_head = &li->falh;
1111 	insert_leaf_info(&l->list, li);
1112 
1113 	if (t->trie && n == NULL) {
1114 		/* Case 2: n is NULL, and will just insert a new leaf */
1115 
1116 		node_set_parent((struct node *)l, tp);
1117 
1118 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1119 		put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
1120 	} else {
1121 		/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1122 		/*
1123 		 *  Add a new tnode here
1124 		 *  first tnode need some special handling
1125 		 */
1126 
1127 		if (tp)
1128 			pos = tp->pos+tp->bits;
1129 		else
1130 			pos = 0;
1131 
1132 		if (n) {
1133 			newpos = tkey_mismatch(key, pos, n->key);
1134 			tn = tnode_new(n->key, newpos, 1);
1135 		} else {
1136 			newpos = 0;
1137 			tn = tnode_new(key, newpos, 1); /* First tnode */
1138 		}
1139 
1140 		if (!tn) {
1141 			free_leaf_info(li);
1142 			free_leaf(l);
1143 			return NULL;
1144 		}
1145 
1146 		node_set_parent((struct node *)tn, tp);
1147 
1148 		missbit = tkey_extract_bits(key, newpos, 1);
1149 		put_child(t, tn, missbit, (struct node *)l);
1150 		put_child(t, tn, 1-missbit, n);
1151 
1152 		if (tp) {
1153 			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1154 			put_child(t, (struct tnode *)tp, cindex,
1155 				  (struct node *)tn);
1156 		} else {
1157 			rcu_assign_pointer(t->trie, (struct node *)tn);
1158 			tp = tn;
1159 		}
1160 	}
1161 
1162 	if (tp && tp->pos + tp->bits > 32)
1163 		pr_warning("fib_trie"
1164 			   " tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1165 			   tp, tp->pos, tp->bits, key, plen);
1166 
1167 	/* Rebalance the trie */
1168 
1169 	trie_rebalance(t, tp);
1170 done:
1171 	return fa_head;
1172 }
1173 
1174 /*
1175  * Caller must hold RTNL.
1176  */
1177 static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg)
1178 {
1179 	struct trie *t = (struct trie *) tb->tb_data;
1180 	struct fib_alias *fa, *new_fa;
1181 	struct list_head *fa_head = NULL;
1182 	struct fib_info *fi;
1183 	int plen = cfg->fc_dst_len;
1184 	u8 tos = cfg->fc_tos;
1185 	u32 key, mask;
1186 	int err;
1187 	struct leaf *l;
1188 
1189 	if (plen > 32)
1190 		return -EINVAL;
1191 
1192 	key = ntohl(cfg->fc_dst);
1193 
1194 	pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
1195 
1196 	mask = ntohl(inet_make_mask(plen));
1197 
1198 	if (key & ~mask)
1199 		return -EINVAL;
1200 
1201 	key = key & mask;
1202 
1203 	fi = fib_create_info(cfg);
1204 	if (IS_ERR(fi)) {
1205 		err = PTR_ERR(fi);
1206 		goto err;
1207 	}
1208 
1209 	l = fib_find_node(t, key);
1210 	fa = NULL;
1211 
1212 	if (l) {
1213 		fa_head = get_fa_head(l, plen);
1214 		fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1215 	}
1216 
1217 	/* Now fa, if non-NULL, points to the first fib alias
1218 	 * with the same keys [prefix,tos,priority], if such key already
1219 	 * exists or to the node before which we will insert new one.
1220 	 *
1221 	 * If fa is NULL, we will need to allocate a new one and
1222 	 * insert to the head of f.
1223 	 *
1224 	 * If f is NULL, no fib node matched the destination key
1225 	 * and we need to allocate a new one of those as well.
1226 	 */
1227 
1228 	if (fa && fa->fa_tos == tos &&
1229 	    fa->fa_info->fib_priority == fi->fib_priority) {
1230 		struct fib_alias *fa_first, *fa_match;
1231 
1232 		err = -EEXIST;
1233 		if (cfg->fc_nlflags & NLM_F_EXCL)
1234 			goto out;
1235 
1236 		/* We have 2 goals:
1237 		 * 1. Find exact match for type, scope, fib_info to avoid
1238 		 * duplicate routes
1239 		 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
1240 		 */
1241 		fa_match = NULL;
1242 		fa_first = fa;
1243 		fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1244 		list_for_each_entry_continue(fa, fa_head, fa_list) {
1245 			if (fa->fa_tos != tos)
1246 				break;
1247 			if (fa->fa_info->fib_priority != fi->fib_priority)
1248 				break;
1249 			if (fa->fa_type == cfg->fc_type &&
1250 			    fa->fa_scope == cfg->fc_scope &&
1251 			    fa->fa_info == fi) {
1252 				fa_match = fa;
1253 				break;
1254 			}
1255 		}
1256 
1257 		if (cfg->fc_nlflags & NLM_F_REPLACE) {
1258 			struct fib_info *fi_drop;
1259 			u8 state;
1260 
1261 			fa = fa_first;
1262 			if (fa_match) {
1263 				if (fa == fa_match)
1264 					err = 0;
1265 				goto out;
1266 			}
1267 			err = -ENOBUFS;
1268 			new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1269 			if (new_fa == NULL)
1270 				goto out;
1271 
1272 			fi_drop = fa->fa_info;
1273 			new_fa->fa_tos = fa->fa_tos;
1274 			new_fa->fa_info = fi;
1275 			new_fa->fa_type = cfg->fc_type;
1276 			new_fa->fa_scope = cfg->fc_scope;
1277 			state = fa->fa_state;
1278 			new_fa->fa_state = state & ~FA_S_ACCESSED;
1279 
1280 			list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1281 			alias_free_mem_rcu(fa);
1282 
1283 			fib_release_info(fi_drop);
1284 			if (state & FA_S_ACCESSED)
1285 				rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1286 			rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1287 				tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
1288 
1289 			goto succeeded;
1290 		}
1291 		/* Error if we find a perfect match which
1292 		 * uses the same scope, type, and nexthop
1293 		 * information.
1294 		 */
1295 		if (fa_match)
1296 			goto out;
1297 
1298 		if (!(cfg->fc_nlflags & NLM_F_APPEND))
1299 			fa = fa_first;
1300 	}
1301 	err = -ENOENT;
1302 	if (!(cfg->fc_nlflags & NLM_F_CREATE))
1303 		goto out;
1304 
1305 	err = -ENOBUFS;
1306 	new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1307 	if (new_fa == NULL)
1308 		goto out;
1309 
1310 	new_fa->fa_info = fi;
1311 	new_fa->fa_tos = tos;
1312 	new_fa->fa_type = cfg->fc_type;
1313 	new_fa->fa_scope = cfg->fc_scope;
1314 	new_fa->fa_state = 0;
1315 	/*
1316 	 * Insert new entry to the list.
1317 	 */
1318 
1319 	if (!fa_head) {
1320 		fa_head = fib_insert_node(t, key, plen);
1321 		if (unlikely(!fa_head)) {
1322 			err = -ENOMEM;
1323 			goto out_free_new_fa;
1324 		}
1325 	}
1326 
1327 	list_add_tail_rcu(&new_fa->fa_list,
1328 			  (fa ? &fa->fa_list : fa_head));
1329 
1330 	rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1331 	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
1332 		  &cfg->fc_nlinfo, 0);
1333 succeeded:
1334 	return 0;
1335 
1336 out_free_new_fa:
1337 	kmem_cache_free(fn_alias_kmem, new_fa);
1338 out:
1339 	fib_release_info(fi);
1340 err:
1341 	return err;
1342 }
1343 
1344 /* should be called with rcu_read_lock */
1345 static int check_leaf(struct trie *t, struct leaf *l,
1346 		      t_key key,  const struct flowi *flp,
1347 		      struct fib_result *res)
1348 {
1349 	struct leaf_info *li;
1350 	struct hlist_head *hhead = &l->list;
1351 	struct hlist_node *node;
1352 
1353 	hlist_for_each_entry_rcu(li, node, hhead, hlist) {
1354 		int err;
1355 		int plen = li->plen;
1356 		__be32 mask = inet_make_mask(plen);
1357 
1358 		if (l->key != (key & ntohl(mask)))
1359 			continue;
1360 
1361 		err = fib_semantic_match(&li->falh, flp, res, plen);
1362 
1363 #ifdef CONFIG_IP_FIB_TRIE_STATS
1364 		if (err <= 0)
1365 			t->stats.semantic_match_passed++;
1366 		else
1367 			t->stats.semantic_match_miss++;
1368 #endif
1369 		if (err <= 0)
1370 			return err;
1371 	}
1372 
1373 	return 1;
1374 }
1375 
1376 static int fn_trie_lookup(struct fib_table *tb, const struct flowi *flp,
1377 			  struct fib_result *res)
1378 {
1379 	struct trie *t = (struct trie *) tb->tb_data;
1380 	int ret;
1381 	struct node *n;
1382 	struct tnode *pn;
1383 	int pos, bits;
1384 	t_key key = ntohl(flp->fl4_dst);
1385 	int chopped_off;
1386 	t_key cindex = 0;
1387 	int current_prefix_length = KEYLENGTH;
1388 	struct tnode *cn;
1389 	t_key node_prefix, key_prefix, pref_mismatch;
1390 	int mp;
1391 
1392 	rcu_read_lock();
1393 
1394 	n = rcu_dereference(t->trie);
1395 	if (!n)
1396 		goto failed;
1397 
1398 #ifdef CONFIG_IP_FIB_TRIE_STATS
1399 	t->stats.gets++;
1400 #endif
1401 
1402 	/* Just a leaf? */
1403 	if (IS_LEAF(n)) {
1404 		ret = check_leaf(t, (struct leaf *)n, key, flp, res);
1405 		goto found;
1406 	}
1407 
1408 	pn = (struct tnode *) n;
1409 	chopped_off = 0;
1410 
1411 	while (pn) {
1412 		pos = pn->pos;
1413 		bits = pn->bits;
1414 
1415 		if (!chopped_off)
1416 			cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
1417 						   pos, bits);
1418 
1419 		n = tnode_get_child_rcu(pn, cindex);
1420 
1421 		if (n == NULL) {
1422 #ifdef CONFIG_IP_FIB_TRIE_STATS
1423 			t->stats.null_node_hit++;
1424 #endif
1425 			goto backtrace;
1426 		}
1427 
1428 		if (IS_LEAF(n)) {
1429 			ret = check_leaf(t, (struct leaf *)n, key, flp, res);
1430 			if (ret > 0)
1431 				goto backtrace;
1432 			goto found;
1433 		}
1434 
1435 		cn = (struct tnode *)n;
1436 
1437 		/*
1438 		 * It's a tnode, and we can do some extra checks here if we
1439 		 * like, to avoid descending into a dead-end branch.
1440 		 * This tnode is in the parent's child array at index
1441 		 * key[p_pos..p_pos+p_bits] but potentially with some bits
1442 		 * chopped off, so in reality the index may be just a
1443 		 * subprefix, padded with zero at the end.
1444 		 * We can also take a look at any skipped bits in this
1445 		 * tnode - everything up to p_pos is supposed to be ok,
1446 		 * and the non-chopped bits of the index (se previous
1447 		 * paragraph) are also guaranteed ok, but the rest is
1448 		 * considered unknown.
1449 		 *
1450 		 * The skipped bits are key[pos+bits..cn->pos].
1451 		 */
1452 
1453 		/* If current_prefix_length < pos+bits, we are already doing
1454 		 * actual prefix  matching, which means everything from
1455 		 * pos+(bits-chopped_off) onward must be zero along some
1456 		 * branch of this subtree - otherwise there is *no* valid
1457 		 * prefix present. Here we can only check the skipped
1458 		 * bits. Remember, since we have already indexed into the
1459 		 * parent's child array, we know that the bits we chopped of
1460 		 * *are* zero.
1461 		 */
1462 
1463 		/* NOTA BENE: Checking only skipped bits
1464 		   for the new node here */
1465 
1466 		if (current_prefix_length < pos+bits) {
1467 			if (tkey_extract_bits(cn->key, current_prefix_length,
1468 						cn->pos - current_prefix_length)
1469 			    || !(cn->child[0]))
1470 				goto backtrace;
1471 		}
1472 
1473 		/*
1474 		 * If chopped_off=0, the index is fully validated and we
1475 		 * only need to look at the skipped bits for this, the new,
1476 		 * tnode. What we actually want to do is to find out if
1477 		 * these skipped bits match our key perfectly, or if we will
1478 		 * have to count on finding a matching prefix further down,
1479 		 * because if we do, we would like to have some way of
1480 		 * verifying the existence of such a prefix at this point.
1481 		 */
1482 
1483 		/* The only thing we can do at this point is to verify that
1484 		 * any such matching prefix can indeed be a prefix to our
1485 		 * key, and if the bits in the node we are inspecting that
1486 		 * do not match our key are not ZERO, this cannot be true.
1487 		 * Thus, find out where there is a mismatch (before cn->pos)
1488 		 * and verify that all the mismatching bits are zero in the
1489 		 * new tnode's key.
1490 		 */
1491 
1492 		/*
1493 		 * Note: We aren't very concerned about the piece of
1494 		 * the key that precede pn->pos+pn->bits, since these
1495 		 * have already been checked. The bits after cn->pos
1496 		 * aren't checked since these are by definition
1497 		 * "unknown" at this point. Thus, what we want to see
1498 		 * is if we are about to enter the "prefix matching"
1499 		 * state, and in that case verify that the skipped
1500 		 * bits that will prevail throughout this subtree are
1501 		 * zero, as they have to be if we are to find a
1502 		 * matching prefix.
1503 		 */
1504 
1505 		node_prefix = mask_pfx(cn->key, cn->pos);
1506 		key_prefix = mask_pfx(key, cn->pos);
1507 		pref_mismatch = key_prefix^node_prefix;
1508 		mp = 0;
1509 
1510 		/*
1511 		 * In short: If skipped bits in this node do not match
1512 		 * the search key, enter the "prefix matching"
1513 		 * state.directly.
1514 		 */
1515 		if (pref_mismatch) {
1516 			while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1517 				mp++;
1518 				pref_mismatch = pref_mismatch << 1;
1519 			}
1520 			key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1521 
1522 			if (key_prefix != 0)
1523 				goto backtrace;
1524 
1525 			if (current_prefix_length >= cn->pos)
1526 				current_prefix_length = mp;
1527 		}
1528 
1529 		pn = (struct tnode *)n; /* Descend */
1530 		chopped_off = 0;
1531 		continue;
1532 
1533 backtrace:
1534 		chopped_off++;
1535 
1536 		/* As zero don't change the child key (cindex) */
1537 		while ((chopped_off <= pn->bits)
1538 		       && !(cindex & (1<<(chopped_off-1))))
1539 			chopped_off++;
1540 
1541 		/* Decrease current_... with bits chopped off */
1542 		if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1543 			current_prefix_length = pn->pos + pn->bits
1544 				- chopped_off;
1545 
1546 		/*
1547 		 * Either we do the actual chop off according or if we have
1548 		 * chopped off all bits in this tnode walk up to our parent.
1549 		 */
1550 
1551 		if (chopped_off <= pn->bits) {
1552 			cindex &= ~(1 << (chopped_off-1));
1553 		} else {
1554 			struct tnode *parent = node_parent_rcu((struct node *) pn);
1555 			if (!parent)
1556 				goto failed;
1557 
1558 			/* Get Child's index */
1559 			cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
1560 			pn = parent;
1561 			chopped_off = 0;
1562 
1563 #ifdef CONFIG_IP_FIB_TRIE_STATS
1564 			t->stats.backtrack++;
1565 #endif
1566 			goto backtrace;
1567 		}
1568 	}
1569 failed:
1570 	ret = 1;
1571 found:
1572 	rcu_read_unlock();
1573 	return ret;
1574 }
1575 
1576 /*
1577  * Remove the leaf and return parent.
1578  */
1579 static void trie_leaf_remove(struct trie *t, struct leaf *l)
1580 {
1581 	struct tnode *tp = node_parent((struct node *) l);
1582 
1583 	pr_debug("entering trie_leaf_remove(%p)\n", l);
1584 
1585 	if (tp) {
1586 		t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
1587 		put_child(t, (struct tnode *)tp, cindex, NULL);
1588 		trie_rebalance(t, tp);
1589 	} else
1590 		rcu_assign_pointer(t->trie, NULL);
1591 
1592 	free_leaf(l);
1593 }
1594 
1595 /*
1596  * Caller must hold RTNL.
1597  */
1598 static int fn_trie_delete(struct fib_table *tb, struct fib_config *cfg)
1599 {
1600 	struct trie *t = (struct trie *) tb->tb_data;
1601 	u32 key, mask;
1602 	int plen = cfg->fc_dst_len;
1603 	u8 tos = cfg->fc_tos;
1604 	struct fib_alias *fa, *fa_to_delete;
1605 	struct list_head *fa_head;
1606 	struct leaf *l;
1607 	struct leaf_info *li;
1608 
1609 	if (plen > 32)
1610 		return -EINVAL;
1611 
1612 	key = ntohl(cfg->fc_dst);
1613 	mask = ntohl(inet_make_mask(plen));
1614 
1615 	if (key & ~mask)
1616 		return -EINVAL;
1617 
1618 	key = key & mask;
1619 	l = fib_find_node(t, key);
1620 
1621 	if (!l)
1622 		return -ESRCH;
1623 
1624 	fa_head = get_fa_head(l, plen);
1625 	fa = fib_find_alias(fa_head, tos, 0);
1626 
1627 	if (!fa)
1628 		return -ESRCH;
1629 
1630 	pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1631 
1632 	fa_to_delete = NULL;
1633 	fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1634 	list_for_each_entry_continue(fa, fa_head, fa_list) {
1635 		struct fib_info *fi = fa->fa_info;
1636 
1637 		if (fa->fa_tos != tos)
1638 			break;
1639 
1640 		if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1641 		    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1642 		     fa->fa_scope == cfg->fc_scope) &&
1643 		    (!cfg->fc_protocol ||
1644 		     fi->fib_protocol == cfg->fc_protocol) &&
1645 		    fib_nh_match(cfg, fi) == 0) {
1646 			fa_to_delete = fa;
1647 			break;
1648 		}
1649 	}
1650 
1651 	if (!fa_to_delete)
1652 		return -ESRCH;
1653 
1654 	fa = fa_to_delete;
1655 	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
1656 		  &cfg->fc_nlinfo, 0);
1657 
1658 	l = fib_find_node(t, key);
1659 	li = find_leaf_info(l, plen);
1660 
1661 	list_del_rcu(&fa->fa_list);
1662 
1663 	if (list_empty(fa_head)) {
1664 		hlist_del_rcu(&li->hlist);
1665 		free_leaf_info(li);
1666 	}
1667 
1668 	if (hlist_empty(&l->list))
1669 		trie_leaf_remove(t, l);
1670 
1671 	if (fa->fa_state & FA_S_ACCESSED)
1672 		rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1673 
1674 	fib_release_info(fa->fa_info);
1675 	alias_free_mem_rcu(fa);
1676 	return 0;
1677 }
1678 
1679 static int trie_flush_list(struct list_head *head)
1680 {
1681 	struct fib_alias *fa, *fa_node;
1682 	int found = 0;
1683 
1684 	list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1685 		struct fib_info *fi = fa->fa_info;
1686 
1687 		if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1688 			list_del_rcu(&fa->fa_list);
1689 			fib_release_info(fa->fa_info);
1690 			alias_free_mem_rcu(fa);
1691 			found++;
1692 		}
1693 	}
1694 	return found;
1695 }
1696 
1697 static int trie_flush_leaf(struct leaf *l)
1698 {
1699 	int found = 0;
1700 	struct hlist_head *lih = &l->list;
1701 	struct hlist_node *node, *tmp;
1702 	struct leaf_info *li = NULL;
1703 
1704 	hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1705 		found += trie_flush_list(&li->falh);
1706 
1707 		if (list_empty(&li->falh)) {
1708 			hlist_del_rcu(&li->hlist);
1709 			free_leaf_info(li);
1710 		}
1711 	}
1712 	return found;
1713 }
1714 
1715 /*
1716  * Scan for the next right leaf starting at node p->child[idx]
1717  * Since we have back pointer, no recursion necessary.
1718  */
1719 static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c)
1720 {
1721 	do {
1722 		t_key idx;
1723 
1724 		if (c)
1725 			idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
1726 		else
1727 			idx = 0;
1728 
1729 		while (idx < 1u << p->bits) {
1730 			c = tnode_get_child_rcu(p, idx++);
1731 			if (!c)
1732 				continue;
1733 
1734 			if (IS_LEAF(c)) {
1735 				prefetch(p->child[idx]);
1736 				return (struct leaf *) c;
1737 			}
1738 
1739 			/* Rescan start scanning in new node */
1740 			p = (struct tnode *) c;
1741 			idx = 0;
1742 		}
1743 
1744 		/* Node empty, walk back up to parent */
1745 		c = (struct node *) p;
1746 	} while ( (p = node_parent_rcu(c)) != NULL);
1747 
1748 	return NULL; /* Root of trie */
1749 }
1750 
1751 static struct leaf *trie_firstleaf(struct trie *t)
1752 {
1753 	struct tnode *n = (struct tnode *) rcu_dereference(t->trie);
1754 
1755 	if (!n)
1756 		return NULL;
1757 
1758 	if (IS_LEAF(n))          /* trie is just a leaf */
1759 		return (struct leaf *) n;
1760 
1761 	return leaf_walk_rcu(n, NULL);
1762 }
1763 
1764 static struct leaf *trie_nextleaf(struct leaf *l)
1765 {
1766 	struct node *c = (struct node *) l;
1767 	struct tnode *p = node_parent_rcu(c);
1768 
1769 	if (!p)
1770 		return NULL;	/* trie with just one leaf */
1771 
1772 	return leaf_walk_rcu(p, c);
1773 }
1774 
1775 static struct leaf *trie_leafindex(struct trie *t, int index)
1776 {
1777 	struct leaf *l = trie_firstleaf(t);
1778 
1779 	while (l && index-- > 0)
1780 		l = trie_nextleaf(l);
1781 
1782 	return l;
1783 }
1784 
1785 
1786 /*
1787  * Caller must hold RTNL.
1788  */
1789 static int fn_trie_flush(struct fib_table *tb)
1790 {
1791 	struct trie *t = (struct trie *) tb->tb_data;
1792 	struct leaf *l, *ll = NULL;
1793 	int found = 0;
1794 
1795 	for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
1796 		found += trie_flush_leaf(l);
1797 
1798 		if (ll && hlist_empty(&ll->list))
1799 			trie_leaf_remove(t, ll);
1800 		ll = l;
1801 	}
1802 
1803 	if (ll && hlist_empty(&ll->list))
1804 		trie_leaf_remove(t, ll);
1805 
1806 	pr_debug("trie_flush found=%d\n", found);
1807 	return found;
1808 }
1809 
1810 static void fn_trie_select_default(struct fib_table *tb,
1811 				   const struct flowi *flp,
1812 				   struct fib_result *res)
1813 {
1814 	struct trie *t = (struct trie *) tb->tb_data;
1815 	int order, last_idx;
1816 	struct fib_info *fi = NULL;
1817 	struct fib_info *last_resort;
1818 	struct fib_alias *fa = NULL;
1819 	struct list_head *fa_head;
1820 	struct leaf *l;
1821 
1822 	last_idx = -1;
1823 	last_resort = NULL;
1824 	order = -1;
1825 
1826 	rcu_read_lock();
1827 
1828 	l = fib_find_node(t, 0);
1829 	if (!l)
1830 		goto out;
1831 
1832 	fa_head = get_fa_head(l, 0);
1833 	if (!fa_head)
1834 		goto out;
1835 
1836 	if (list_empty(fa_head))
1837 		goto out;
1838 
1839 	list_for_each_entry_rcu(fa, fa_head, fa_list) {
1840 		struct fib_info *next_fi = fa->fa_info;
1841 
1842 		if (fa->fa_scope != res->scope ||
1843 		    fa->fa_type != RTN_UNICAST)
1844 			continue;
1845 
1846 		if (next_fi->fib_priority > res->fi->fib_priority)
1847 			break;
1848 		if (!next_fi->fib_nh[0].nh_gw ||
1849 		    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1850 			continue;
1851 		fa->fa_state |= FA_S_ACCESSED;
1852 
1853 		if (fi == NULL) {
1854 			if (next_fi != res->fi)
1855 				break;
1856 		} else if (!fib_detect_death(fi, order, &last_resort,
1857 					     &last_idx, tb->tb_default)) {
1858 			fib_result_assign(res, fi);
1859 			tb->tb_default = order;
1860 			goto out;
1861 		}
1862 		fi = next_fi;
1863 		order++;
1864 	}
1865 	if (order <= 0 || fi == NULL) {
1866 		tb->tb_default = -1;
1867 		goto out;
1868 	}
1869 
1870 	if (!fib_detect_death(fi, order, &last_resort, &last_idx,
1871 				tb->tb_default)) {
1872 		fib_result_assign(res, fi);
1873 		tb->tb_default = order;
1874 		goto out;
1875 	}
1876 	if (last_idx >= 0)
1877 		fib_result_assign(res, last_resort);
1878 	tb->tb_default = last_idx;
1879 out:
1880 	rcu_read_unlock();
1881 }
1882 
1883 static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
1884 			   struct fib_table *tb,
1885 			   struct sk_buff *skb, struct netlink_callback *cb)
1886 {
1887 	int i, s_i;
1888 	struct fib_alias *fa;
1889 	__be32 xkey = htonl(key);
1890 
1891 	s_i = cb->args[5];
1892 	i = 0;
1893 
1894 	/* rcu_read_lock is hold by caller */
1895 
1896 	list_for_each_entry_rcu(fa, fah, fa_list) {
1897 		if (i < s_i) {
1898 			i++;
1899 			continue;
1900 		}
1901 
1902 		if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1903 				  cb->nlh->nlmsg_seq,
1904 				  RTM_NEWROUTE,
1905 				  tb->tb_id,
1906 				  fa->fa_type,
1907 				  fa->fa_scope,
1908 				  xkey,
1909 				  plen,
1910 				  fa->fa_tos,
1911 				  fa->fa_info, NLM_F_MULTI) < 0) {
1912 			cb->args[5] = i;
1913 			return -1;
1914 		}
1915 		i++;
1916 	}
1917 	cb->args[5] = i;
1918 	return skb->len;
1919 }
1920 
1921 static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
1922 			struct sk_buff *skb, struct netlink_callback *cb)
1923 {
1924 	struct leaf_info *li;
1925 	struct hlist_node *node;
1926 	int i, s_i;
1927 
1928 	s_i = cb->args[4];
1929 	i = 0;
1930 
1931 	/* rcu_read_lock is hold by caller */
1932 	hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
1933 		if (i < s_i) {
1934 			i++;
1935 			continue;
1936 		}
1937 
1938 		if (i > s_i)
1939 			cb->args[5] = 0;
1940 
1941 		if (list_empty(&li->falh))
1942 			continue;
1943 
1944 		if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
1945 			cb->args[4] = i;
1946 			return -1;
1947 		}
1948 		i++;
1949 	}
1950 
1951 	cb->args[4] = i;
1952 	return skb->len;
1953 }
1954 
1955 static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb,
1956 			struct netlink_callback *cb)
1957 {
1958 	struct leaf *l;
1959 	struct trie *t = (struct trie *) tb->tb_data;
1960 	t_key key = cb->args[2];
1961 	int count = cb->args[3];
1962 
1963 	rcu_read_lock();
1964 	/* Dump starting at last key.
1965 	 * Note: 0.0.0.0/0 (ie default) is first key.
1966 	 */
1967 	if (count == 0)
1968 		l = trie_firstleaf(t);
1969 	else {
1970 		/* Normally, continue from last key, but if that is missing
1971 		 * fallback to using slow rescan
1972 		 */
1973 		l = fib_find_node(t, key);
1974 		if (!l)
1975 			l = trie_leafindex(t, count);
1976 	}
1977 
1978 	while (l) {
1979 		cb->args[2] = l->key;
1980 		if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
1981 			cb->args[3] = count;
1982 			rcu_read_unlock();
1983 			return -1;
1984 		}
1985 
1986 		++count;
1987 		l = trie_nextleaf(l);
1988 		memset(&cb->args[4], 0,
1989 		       sizeof(cb->args) - 4*sizeof(cb->args[0]));
1990 	}
1991 	cb->args[3] = count;
1992 	rcu_read_unlock();
1993 
1994 	return skb->len;
1995 }
1996 
1997 void __init fib_hash_init(void)
1998 {
1999 	fn_alias_kmem = kmem_cache_create("ip_fib_alias",
2000 					  sizeof(struct fib_alias),
2001 					  0, SLAB_PANIC, NULL);
2002 
2003 	trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
2004 					   max(sizeof(struct leaf),
2005 					       sizeof(struct leaf_info)),
2006 					   0, SLAB_PANIC, NULL);
2007 }
2008 
2009 
2010 /* Fix more generic FIB names for init later */
2011 struct fib_table *fib_hash_table(u32 id)
2012 {
2013 	struct fib_table *tb;
2014 	struct trie *t;
2015 
2016 	tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
2017 		     GFP_KERNEL);
2018 	if (tb == NULL)
2019 		return NULL;
2020 
2021 	tb->tb_id = id;
2022 	tb->tb_default = -1;
2023 	tb->tb_lookup = fn_trie_lookup;
2024 	tb->tb_insert = fn_trie_insert;
2025 	tb->tb_delete = fn_trie_delete;
2026 	tb->tb_flush = fn_trie_flush;
2027 	tb->tb_select_default = fn_trie_select_default;
2028 	tb->tb_dump = fn_trie_dump;
2029 
2030 	t = (struct trie *) tb->tb_data;
2031 	memset(t, 0, sizeof(*t));
2032 
2033 	if (id == RT_TABLE_LOCAL)
2034 		pr_info("IPv4 FIB: Using LC-trie version %s\n", VERSION);
2035 
2036 	return tb;
2037 }
2038 
2039 #ifdef CONFIG_PROC_FS
2040 /* Depth first Trie walk iterator */
2041 struct fib_trie_iter {
2042 	struct seq_net_private p;
2043 	struct fib_table *tb;
2044 	struct tnode *tnode;
2045 	unsigned index;
2046 	unsigned depth;
2047 };
2048 
2049 static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
2050 {
2051 	struct tnode *tn = iter->tnode;
2052 	unsigned cindex = iter->index;
2053 	struct tnode *p;
2054 
2055 	/* A single entry routing table */
2056 	if (!tn)
2057 		return NULL;
2058 
2059 	pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
2060 		 iter->tnode, iter->index, iter->depth);
2061 rescan:
2062 	while (cindex < (1<<tn->bits)) {
2063 		struct node *n = tnode_get_child_rcu(tn, cindex);
2064 
2065 		if (n) {
2066 			if (IS_LEAF(n)) {
2067 				iter->tnode = tn;
2068 				iter->index = cindex + 1;
2069 			} else {
2070 				/* push down one level */
2071 				iter->tnode = (struct tnode *) n;
2072 				iter->index = 0;
2073 				++iter->depth;
2074 			}
2075 			return n;
2076 		}
2077 
2078 		++cindex;
2079 	}
2080 
2081 	/* Current node exhausted, pop back up */
2082 	p = node_parent_rcu((struct node *)tn);
2083 	if (p) {
2084 		cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
2085 		tn = p;
2086 		--iter->depth;
2087 		goto rescan;
2088 	}
2089 
2090 	/* got root? */
2091 	return NULL;
2092 }
2093 
2094 static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
2095 				       struct trie *t)
2096 {
2097 	struct node *n;
2098 
2099 	if (!t)
2100 		return NULL;
2101 
2102 	n = rcu_dereference(t->trie);
2103 	if (!n)
2104 		return NULL;
2105 
2106 	if (IS_TNODE(n)) {
2107 		iter->tnode = (struct tnode *) n;
2108 		iter->index = 0;
2109 		iter->depth = 1;
2110 	} else {
2111 		iter->tnode = NULL;
2112 		iter->index = 0;
2113 		iter->depth = 0;
2114 	}
2115 
2116 	return n;
2117 }
2118 
2119 static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2120 {
2121 	struct node *n;
2122 	struct fib_trie_iter iter;
2123 
2124 	memset(s, 0, sizeof(*s));
2125 
2126 	rcu_read_lock();
2127 	for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
2128 		if (IS_LEAF(n)) {
2129 			struct leaf *l = (struct leaf *)n;
2130 			struct leaf_info *li;
2131 			struct hlist_node *tmp;
2132 
2133 			s->leaves++;
2134 			s->totdepth += iter.depth;
2135 			if (iter.depth > s->maxdepth)
2136 				s->maxdepth = iter.depth;
2137 
2138 			hlist_for_each_entry_rcu(li, tmp, &l->list, hlist)
2139 				++s->prefixes;
2140 		} else {
2141 			const struct tnode *tn = (const struct tnode *) n;
2142 			int i;
2143 
2144 			s->tnodes++;
2145 			if (tn->bits < MAX_STAT_DEPTH)
2146 				s->nodesizes[tn->bits]++;
2147 
2148 			for (i = 0; i < (1<<tn->bits); i++)
2149 				if (!tn->child[i])
2150 					s->nullpointers++;
2151 		}
2152 	}
2153 	rcu_read_unlock();
2154 }
2155 
2156 /*
2157  *	This outputs /proc/net/fib_triestats
2158  */
2159 static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2160 {
2161 	unsigned i, max, pointers, bytes, avdepth;
2162 
2163 	if (stat->leaves)
2164 		avdepth = stat->totdepth*100 / stat->leaves;
2165 	else
2166 		avdepth = 0;
2167 
2168 	seq_printf(seq, "\tAver depth:     %u.%02d\n",
2169 		   avdepth / 100, avdepth % 100);
2170 	seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);
2171 
2172 	seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
2173 	bytes = sizeof(struct leaf) * stat->leaves;
2174 
2175 	seq_printf(seq, "\tPrefixes:       %u\n", stat->prefixes);
2176 	bytes += sizeof(struct leaf_info) * stat->prefixes;
2177 
2178 	seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
2179 	bytes += sizeof(struct tnode) * stat->tnodes;
2180 
2181 	max = MAX_STAT_DEPTH;
2182 	while (max > 0 && stat->nodesizes[max-1] == 0)
2183 		max--;
2184 
2185 	pointers = 0;
2186 	for (i = 1; i <= max; i++)
2187 		if (stat->nodesizes[i] != 0) {
2188 			seq_printf(seq, "  %u: %u",  i, stat->nodesizes[i]);
2189 			pointers += (1<<i) * stat->nodesizes[i];
2190 		}
2191 	seq_putc(seq, '\n');
2192 	seq_printf(seq, "\tPointers: %u\n", pointers);
2193 
2194 	bytes += sizeof(struct node *) * pointers;
2195 	seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2196 	seq_printf(seq, "Total size: %u  kB\n", (bytes + 1023) / 1024);
2197 }
2198 
2199 #ifdef CONFIG_IP_FIB_TRIE_STATS
2200 static void trie_show_usage(struct seq_file *seq,
2201 			    const struct trie_use_stats *stats)
2202 {
2203 	seq_printf(seq, "\nCounters:\n---------\n");
2204 	seq_printf(seq, "gets = %u\n", stats->gets);
2205 	seq_printf(seq, "backtracks = %u\n", stats->backtrack);
2206 	seq_printf(seq, "semantic match passed = %u\n",
2207 		   stats->semantic_match_passed);
2208 	seq_printf(seq, "semantic match miss = %u\n",
2209 		   stats->semantic_match_miss);
2210 	seq_printf(seq, "null node hit= %u\n", stats->null_node_hit);
2211 	seq_printf(seq, "skipped node resize = %u\n\n",
2212 		   stats->resize_node_skipped);
2213 }
2214 #endif /*  CONFIG_IP_FIB_TRIE_STATS */
2215 
2216 static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
2217 {
2218 	if (tb->tb_id == RT_TABLE_LOCAL)
2219 		seq_puts(seq, "Local:\n");
2220 	else if (tb->tb_id == RT_TABLE_MAIN)
2221 		seq_puts(seq, "Main:\n");
2222 	else
2223 		seq_printf(seq, "Id %d:\n", tb->tb_id);
2224 }
2225 
2226 
2227 static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2228 {
2229 	struct net *net = (struct net *)seq->private;
2230 	unsigned int h;
2231 
2232 	seq_printf(seq,
2233 		   "Basic info: size of leaf:"
2234 		   " %Zd bytes, size of tnode: %Zd bytes.\n",
2235 		   sizeof(struct leaf), sizeof(struct tnode));
2236 
2237 	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2238 		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2239 		struct hlist_node *node;
2240 		struct fib_table *tb;
2241 
2242 		hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
2243 			struct trie *t = (struct trie *) tb->tb_data;
2244 			struct trie_stat stat;
2245 
2246 			if (!t)
2247 				continue;
2248 
2249 			fib_table_print(seq, tb);
2250 
2251 			trie_collect_stats(t, &stat);
2252 			trie_show_stats(seq, &stat);
2253 #ifdef CONFIG_IP_FIB_TRIE_STATS
2254 			trie_show_usage(seq, &t->stats);
2255 #endif
2256 		}
2257 	}
2258 
2259 	return 0;
2260 }
2261 
2262 static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2263 {
2264 	return single_open_net(inode, file, fib_triestat_seq_show);
2265 }
2266 
2267 static const struct file_operations fib_triestat_fops = {
2268 	.owner	= THIS_MODULE,
2269 	.open	= fib_triestat_seq_open,
2270 	.read	= seq_read,
2271 	.llseek	= seq_lseek,
2272 	.release = single_release_net,
2273 };
2274 
2275 static struct node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
2276 {
2277 	struct fib_trie_iter *iter = seq->private;
2278 	struct net *net = seq_file_net(seq);
2279 	loff_t idx = 0;
2280 	unsigned int h;
2281 
2282 	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2283 		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2284 		struct hlist_node *node;
2285 		struct fib_table *tb;
2286 
2287 		hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
2288 			struct node *n;
2289 
2290 			for (n = fib_trie_get_first(iter,
2291 						    (struct trie *) tb->tb_data);
2292 			     n; n = fib_trie_get_next(iter))
2293 				if (pos == idx++) {
2294 					iter->tb = tb;
2295 					return n;
2296 				}
2297 		}
2298 	}
2299 
2300 	return NULL;
2301 }
2302 
2303 static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2304 	__acquires(RCU)
2305 {
2306 	rcu_read_lock();
2307 	return fib_trie_get_idx(seq, *pos);
2308 }
2309 
2310 static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2311 {
2312 	struct fib_trie_iter *iter = seq->private;
2313 	struct net *net = seq_file_net(seq);
2314 	struct fib_table *tb = iter->tb;
2315 	struct hlist_node *tb_node;
2316 	unsigned int h;
2317 	struct node *n;
2318 
2319 	++*pos;
2320 	/* next node in same table */
2321 	n = fib_trie_get_next(iter);
2322 	if (n)
2323 		return n;
2324 
2325 	/* walk rest of this hash chain */
2326 	h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
2327 	while ( (tb_node = rcu_dereference(tb->tb_hlist.next)) ) {
2328 		tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
2329 		n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2330 		if (n)
2331 			goto found;
2332 	}
2333 
2334 	/* new hash chain */
2335 	while (++h < FIB_TABLE_HASHSZ) {
2336 		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2337 		hlist_for_each_entry_rcu(tb, tb_node, head, tb_hlist) {
2338 			n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2339 			if (n)
2340 				goto found;
2341 		}
2342 	}
2343 	return NULL;
2344 
2345 found:
2346 	iter->tb = tb;
2347 	return n;
2348 }
2349 
2350 static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2351 	__releases(RCU)
2352 {
2353 	rcu_read_unlock();
2354 }
2355 
2356 static void seq_indent(struct seq_file *seq, int n)
2357 {
2358 	while (n-- > 0) seq_puts(seq, "   ");
2359 }
2360 
2361 static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
2362 {
2363 	switch (s) {
2364 	case RT_SCOPE_UNIVERSE: return "universe";
2365 	case RT_SCOPE_SITE:	return "site";
2366 	case RT_SCOPE_LINK:	return "link";
2367 	case RT_SCOPE_HOST:	return "host";
2368 	case RT_SCOPE_NOWHERE:	return "nowhere";
2369 	default:
2370 		snprintf(buf, len, "scope=%d", s);
2371 		return buf;
2372 	}
2373 }
2374 
2375 static const char *const rtn_type_names[__RTN_MAX] = {
2376 	[RTN_UNSPEC] = "UNSPEC",
2377 	[RTN_UNICAST] = "UNICAST",
2378 	[RTN_LOCAL] = "LOCAL",
2379 	[RTN_BROADCAST] = "BROADCAST",
2380 	[RTN_ANYCAST] = "ANYCAST",
2381 	[RTN_MULTICAST] = "MULTICAST",
2382 	[RTN_BLACKHOLE] = "BLACKHOLE",
2383 	[RTN_UNREACHABLE] = "UNREACHABLE",
2384 	[RTN_PROHIBIT] = "PROHIBIT",
2385 	[RTN_THROW] = "THROW",
2386 	[RTN_NAT] = "NAT",
2387 	[RTN_XRESOLVE] = "XRESOLVE",
2388 };
2389 
2390 static inline const char *rtn_type(char *buf, size_t len, unsigned t)
2391 {
2392 	if (t < __RTN_MAX && rtn_type_names[t])
2393 		return rtn_type_names[t];
2394 	snprintf(buf, len, "type %u", t);
2395 	return buf;
2396 }
2397 
2398 /* Pretty print the trie */
2399 static int fib_trie_seq_show(struct seq_file *seq, void *v)
2400 {
2401 	const struct fib_trie_iter *iter = seq->private;
2402 	struct node *n = v;
2403 
2404 	if (!node_parent_rcu(n))
2405 		fib_table_print(seq, iter->tb);
2406 
2407 	if (IS_TNODE(n)) {
2408 		struct tnode *tn = (struct tnode *) n;
2409 		__be32 prf = htonl(mask_pfx(tn->key, tn->pos));
2410 
2411 		seq_indent(seq, iter->depth-1);
2412 		seq_printf(seq, "  +-- %pI4/%d %d %d %d\n",
2413 			   &prf, tn->pos, tn->bits, tn->full_children,
2414 			   tn->empty_children);
2415 
2416 	} else {
2417 		struct leaf *l = (struct leaf *) n;
2418 		struct leaf_info *li;
2419 		struct hlist_node *node;
2420 		__be32 val = htonl(l->key);
2421 
2422 		seq_indent(seq, iter->depth);
2423 		seq_printf(seq, "  |-- %pI4\n", &val);
2424 
2425 		hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
2426 			struct fib_alias *fa;
2427 
2428 			list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2429 				char buf1[32], buf2[32];
2430 
2431 				seq_indent(seq, iter->depth+1);
2432 				seq_printf(seq, "  /%d %s %s", li->plen,
2433 					   rtn_scope(buf1, sizeof(buf1),
2434 						     fa->fa_scope),
2435 					   rtn_type(buf2, sizeof(buf2),
2436 						    fa->fa_type));
2437 				if (fa->fa_tos)
2438 					seq_printf(seq, " tos=%d", fa->fa_tos);
2439 				seq_putc(seq, '\n');
2440 			}
2441 		}
2442 	}
2443 
2444 	return 0;
2445 }
2446 
2447 static const struct seq_operations fib_trie_seq_ops = {
2448 	.start  = fib_trie_seq_start,
2449 	.next   = fib_trie_seq_next,
2450 	.stop   = fib_trie_seq_stop,
2451 	.show   = fib_trie_seq_show,
2452 };
2453 
2454 static int fib_trie_seq_open(struct inode *inode, struct file *file)
2455 {
2456 	return seq_open_net(inode, file, &fib_trie_seq_ops,
2457 			    sizeof(struct fib_trie_iter));
2458 }
2459 
2460 static const struct file_operations fib_trie_fops = {
2461 	.owner  = THIS_MODULE,
2462 	.open   = fib_trie_seq_open,
2463 	.read   = seq_read,
2464 	.llseek = seq_lseek,
2465 	.release = seq_release_net,
2466 };
2467 
2468 struct fib_route_iter {
2469 	struct seq_net_private p;
2470 	struct trie *main_trie;
2471 	loff_t	pos;
2472 	t_key	key;
2473 };
2474 
2475 static struct leaf *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
2476 {
2477 	struct leaf *l = NULL;
2478 	struct trie *t = iter->main_trie;
2479 
2480 	/* use cache location of last found key */
2481 	if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
2482 		pos -= iter->pos;
2483 	else {
2484 		iter->pos = 0;
2485 		l = trie_firstleaf(t);
2486 	}
2487 
2488 	while (l && pos-- > 0) {
2489 		iter->pos++;
2490 		l = trie_nextleaf(l);
2491 	}
2492 
2493 	if (l)
2494 		iter->key = pos;	/* remember it */
2495 	else
2496 		iter->pos = 0;		/* forget it */
2497 
2498 	return l;
2499 }
2500 
2501 static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
2502 	__acquires(RCU)
2503 {
2504 	struct fib_route_iter *iter = seq->private;
2505 	struct fib_table *tb;
2506 
2507 	rcu_read_lock();
2508 	tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
2509 	if (!tb)
2510 		return NULL;
2511 
2512 	iter->main_trie = (struct trie *) tb->tb_data;
2513 	if (*pos == 0)
2514 		return SEQ_START_TOKEN;
2515 	else
2516 		return fib_route_get_idx(iter, *pos - 1);
2517 }
2518 
2519 static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2520 {
2521 	struct fib_route_iter *iter = seq->private;
2522 	struct leaf *l = v;
2523 
2524 	++*pos;
2525 	if (v == SEQ_START_TOKEN) {
2526 		iter->pos = 0;
2527 		l = trie_firstleaf(iter->main_trie);
2528 	} else {
2529 		iter->pos++;
2530 		l = trie_nextleaf(l);
2531 	}
2532 
2533 	if (l)
2534 		iter->key = l->key;
2535 	else
2536 		iter->pos = 0;
2537 	return l;
2538 }
2539 
2540 static void fib_route_seq_stop(struct seq_file *seq, void *v)
2541 	__releases(RCU)
2542 {
2543 	rcu_read_unlock();
2544 }
2545 
2546 static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2547 {
2548 	static unsigned type2flags[RTN_MAX + 1] = {
2549 		[7] = RTF_REJECT, [8] = RTF_REJECT,
2550 	};
2551 	unsigned flags = type2flags[type];
2552 
2553 	if (fi && fi->fib_nh->nh_gw)
2554 		flags |= RTF_GATEWAY;
2555 	if (mask == htonl(0xFFFFFFFF))
2556 		flags |= RTF_HOST;
2557 	flags |= RTF_UP;
2558 	return flags;
2559 }
2560 
2561 /*
2562  *	This outputs /proc/net/route.
2563  *	The format of the file is not supposed to be changed
2564  * 	and needs to be same as fib_hash output to avoid breaking
2565  *	legacy utilities
2566  */
2567 static int fib_route_seq_show(struct seq_file *seq, void *v)
2568 {
2569 	struct leaf *l = v;
2570 	struct leaf_info *li;
2571 	struct hlist_node *node;
2572 
2573 	if (v == SEQ_START_TOKEN) {
2574 		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2575 			   "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2576 			   "\tWindow\tIRTT");
2577 		return 0;
2578 	}
2579 
2580 	hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
2581 		struct fib_alias *fa;
2582 		__be32 mask, prefix;
2583 
2584 		mask = inet_make_mask(li->plen);
2585 		prefix = htonl(l->key);
2586 
2587 		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2588 			const struct fib_info *fi = fa->fa_info;
2589 			unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
2590 			int len;
2591 
2592 			if (fa->fa_type == RTN_BROADCAST
2593 			    || fa->fa_type == RTN_MULTICAST)
2594 				continue;
2595 
2596 			if (fi)
2597 				seq_printf(seq,
2598 					 "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
2599 					 "%d\t%08X\t%d\t%u\t%u%n",
2600 					 fi->fib_dev ? fi->fib_dev->name : "*",
2601 					 prefix,
2602 					 fi->fib_nh->nh_gw, flags, 0, 0,
2603 					 fi->fib_priority,
2604 					 mask,
2605 					 (fi->fib_advmss ?
2606 					  fi->fib_advmss + 40 : 0),
2607 					 fi->fib_window,
2608 					 fi->fib_rtt >> 3, &len);
2609 			else
2610 				seq_printf(seq,
2611 					 "*\t%08X\t%08X\t%04X\t%d\t%u\t"
2612 					 "%d\t%08X\t%d\t%u\t%u%n",
2613 					 prefix, 0, flags, 0, 0, 0,
2614 					 mask, 0, 0, 0, &len);
2615 
2616 			seq_printf(seq, "%*s\n", 127 - len, "");
2617 		}
2618 	}
2619 
2620 	return 0;
2621 }
2622 
2623 static const struct seq_operations fib_route_seq_ops = {
2624 	.start  = fib_route_seq_start,
2625 	.next   = fib_route_seq_next,
2626 	.stop   = fib_route_seq_stop,
2627 	.show   = fib_route_seq_show,
2628 };
2629 
2630 static int fib_route_seq_open(struct inode *inode, struct file *file)
2631 {
2632 	return seq_open_net(inode, file, &fib_route_seq_ops,
2633 			    sizeof(struct fib_route_iter));
2634 }
2635 
2636 static const struct file_operations fib_route_fops = {
2637 	.owner  = THIS_MODULE,
2638 	.open   = fib_route_seq_open,
2639 	.read   = seq_read,
2640 	.llseek = seq_lseek,
2641 	.release = seq_release_net,
2642 };
2643 
2644 int __net_init fib_proc_init(struct net *net)
2645 {
2646 	if (!proc_net_fops_create(net, "fib_trie", S_IRUGO, &fib_trie_fops))
2647 		goto out1;
2648 
2649 	if (!proc_net_fops_create(net, "fib_triestat", S_IRUGO,
2650 				  &fib_triestat_fops))
2651 		goto out2;
2652 
2653 	if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_route_fops))
2654 		goto out3;
2655 
2656 	return 0;
2657 
2658 out3:
2659 	proc_net_remove(net, "fib_triestat");
2660 out2:
2661 	proc_net_remove(net, "fib_trie");
2662 out1:
2663 	return -ENOMEM;
2664 }
2665 
2666 void __net_exit fib_proc_exit(struct net *net)
2667 {
2668 	proc_net_remove(net, "fib_trie");
2669 	proc_net_remove(net, "fib_triestat");
2670 	proc_net_remove(net, "route");
2671 }
2672 
2673 #endif /* CONFIG_PROC_FS */
2674