xref: /openbmc/linux/net/ipv4/fib_trie.c (revision 8b5a1f9c)
1 /*
2  *   This program is free software; you can redistribute it and/or
3  *   modify it under the terms of the GNU General Public License
4  *   as published by the Free Software Foundation; either version
5  *   2 of the License, or (at your option) any later version.
6  *
7  *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8  *     & Swedish University of Agricultural Sciences.
9  *
10  *   Jens Laas <jens.laas@data.slu.se> Swedish University of
11  *     Agricultural Sciences.
12  *
13  *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
14  *
15  * This work is based on the LPC-trie which is originally described in:
16  *
17  * An experimental study of compression methods for dynamic tries
18  * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19  * http://www.csc.kth.se/~snilsson/software/dyntrie2/
20  *
21  *
22  * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23  * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24  *
25  *
26  * Code from fib_hash has been reused which includes the following header:
27  *
28  *
29  * INET		An implementation of the TCP/IP protocol suite for the LINUX
30  *		operating system.  INET is implemented using the  BSD Socket
31  *		interface as the means of communication with the user level.
32  *
33  *		IPv4 FIB: lookup engine and maintenance routines.
34  *
35  *
36  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
37  *
38  *		This program is free software; you can redistribute it and/or
39  *		modify it under the terms of the GNU General Public License
40  *		as published by the Free Software Foundation; either version
41  *		2 of the License, or (at your option) any later version.
42  *
43  * Substantial contributions to this work comes from:
44  *
45  *		David S. Miller, <davem@davemloft.net>
46  *		Stephen Hemminger <shemminger@osdl.org>
47  *		Paul E. McKenney <paulmck@us.ibm.com>
48  *		Patrick McHardy <kaber@trash.net>
49  */
50 
51 #define VERSION "0.409"
52 
53 #include <asm/uaccess.h>
54 #include <linux/bitops.h>
55 #include <linux/types.h>
56 #include <linux/kernel.h>
57 #include <linux/mm.h>
58 #include <linux/string.h>
59 #include <linux/socket.h>
60 #include <linux/sockios.h>
61 #include <linux/errno.h>
62 #include <linux/in.h>
63 #include <linux/inet.h>
64 #include <linux/inetdevice.h>
65 #include <linux/netdevice.h>
66 #include <linux/if_arp.h>
67 #include <linux/proc_fs.h>
68 #include <linux/rcupdate.h>
69 #include <linux/skbuff.h>
70 #include <linux/netlink.h>
71 #include <linux/init.h>
72 #include <linux/list.h>
73 #include <linux/slab.h>
74 #include <linux/prefetch.h>
75 #include <linux/export.h>
76 #include <net/net_namespace.h>
77 #include <net/ip.h>
78 #include <net/protocol.h>
79 #include <net/route.h>
80 #include <net/tcp.h>
81 #include <net/sock.h>
82 #include <net/ip_fib.h>
83 #include "fib_lookup.h"
84 
85 #define MAX_STAT_DEPTH 32
86 
87 #define KEYLENGTH (8*sizeof(t_key))
88 
89 typedef unsigned int t_key;
90 
91 #define T_TNODE 0
92 #define T_LEAF  1
93 #define NODE_TYPE_MASK	0x1UL
94 #define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
95 
96 #define IS_TNODE(n) (!(n->parent & T_LEAF))
97 #define IS_LEAF(n) (n->parent & T_LEAF)
98 
99 struct rt_trie_node {
100 	unsigned long parent;
101 	t_key key;
102 };
103 
104 struct leaf {
105 	unsigned long parent;
106 	t_key key;
107 	struct hlist_head list;
108 	struct rcu_head rcu;
109 };
110 
111 struct leaf_info {
112 	struct hlist_node hlist;
113 	int plen;
114 	u32 mask_plen; /* ntohl(inet_make_mask(plen)) */
115 	struct list_head falh;
116 	struct rcu_head rcu;
117 };
118 
119 struct tnode {
120 	unsigned long parent;
121 	t_key key;
122 	unsigned char pos;		/* 2log(KEYLENGTH) bits needed */
123 	unsigned char bits;		/* 2log(KEYLENGTH) bits needed */
124 	unsigned int full_children;	/* KEYLENGTH bits needed */
125 	unsigned int empty_children;	/* KEYLENGTH bits needed */
126 	union {
127 		struct rcu_head rcu;
128 		struct tnode *tnode_free;
129 	};
130 	struct rt_trie_node __rcu *child[0];
131 };
132 
133 #ifdef CONFIG_IP_FIB_TRIE_STATS
134 struct trie_use_stats {
135 	unsigned int gets;
136 	unsigned int backtrack;
137 	unsigned int semantic_match_passed;
138 	unsigned int semantic_match_miss;
139 	unsigned int null_node_hit;
140 	unsigned int resize_node_skipped;
141 };
142 #endif
143 
144 struct trie_stat {
145 	unsigned int totdepth;
146 	unsigned int maxdepth;
147 	unsigned int tnodes;
148 	unsigned int leaves;
149 	unsigned int nullpointers;
150 	unsigned int prefixes;
151 	unsigned int nodesizes[MAX_STAT_DEPTH];
152 };
153 
154 struct trie {
155 	struct rt_trie_node __rcu *trie;
156 #ifdef CONFIG_IP_FIB_TRIE_STATS
157 	struct trie_use_stats stats;
158 #endif
159 };
160 
161 static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
162 				  int wasfull);
163 static struct rt_trie_node *resize(struct trie *t, struct tnode *tn);
164 static struct tnode *inflate(struct trie *t, struct tnode *tn);
165 static struct tnode *halve(struct trie *t, struct tnode *tn);
166 /* tnodes to free after resize(); protected by RTNL */
167 static struct tnode *tnode_free_head;
168 static size_t tnode_free_size;
169 
170 /*
171  * synchronize_rcu after call_rcu for that many pages; it should be especially
172  * useful before resizing the root node with PREEMPT_NONE configs; the value was
173  * obtained experimentally, aiming to avoid visible slowdown.
174  */
175 static const int sync_pages = 128;
176 
177 static struct kmem_cache *fn_alias_kmem __read_mostly;
178 static struct kmem_cache *trie_leaf_kmem __read_mostly;
179 
180 /*
181  * caller must hold RTNL
182  */
183 static inline struct tnode *node_parent(const struct rt_trie_node *node)
184 {
185 	unsigned long parent;
186 
187 	parent = rcu_dereference_index_check(node->parent, lockdep_rtnl_is_held());
188 
189 	return (struct tnode *)(parent & ~NODE_TYPE_MASK);
190 }
191 
192 /*
193  * caller must hold RCU read lock or RTNL
194  */
195 static inline struct tnode *node_parent_rcu(const struct rt_trie_node *node)
196 {
197 	unsigned long parent;
198 
199 	parent = rcu_dereference_index_check(node->parent, rcu_read_lock_held() ||
200 							   lockdep_rtnl_is_held());
201 
202 	return (struct tnode *)(parent & ~NODE_TYPE_MASK);
203 }
204 
205 /* Same as rcu_assign_pointer
206  * but that macro() assumes that value is a pointer.
207  */
208 static inline void node_set_parent(struct rt_trie_node *node, struct tnode *ptr)
209 {
210 	smp_wmb();
211 	node->parent = (unsigned long)ptr | NODE_TYPE(node);
212 }
213 
214 /*
215  * caller must hold RTNL
216  */
217 static inline struct rt_trie_node *tnode_get_child(const struct tnode *tn, unsigned int i)
218 {
219 	BUG_ON(i >= 1U << tn->bits);
220 
221 	return rtnl_dereference(tn->child[i]);
222 }
223 
224 /*
225  * caller must hold RCU read lock or RTNL
226  */
227 static inline struct rt_trie_node *tnode_get_child_rcu(const struct tnode *tn, unsigned int i)
228 {
229 	BUG_ON(i >= 1U << tn->bits);
230 
231 	return rcu_dereference_rtnl(tn->child[i]);
232 }
233 
234 static inline int tnode_child_length(const struct tnode *tn)
235 {
236 	return 1 << tn->bits;
237 }
238 
239 static inline t_key mask_pfx(t_key k, unsigned int l)
240 {
241 	return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
242 }
243 
244 static inline t_key tkey_extract_bits(t_key a, unsigned int offset, unsigned int bits)
245 {
246 	if (offset < KEYLENGTH)
247 		return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
248 	else
249 		return 0;
250 }
251 
252 static inline int tkey_equals(t_key a, t_key b)
253 {
254 	return a == b;
255 }
256 
257 static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
258 {
259 	if (bits == 0 || offset >= KEYLENGTH)
260 		return 1;
261 	bits = bits > KEYLENGTH ? KEYLENGTH : bits;
262 	return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
263 }
264 
265 static inline int tkey_mismatch(t_key a, int offset, t_key b)
266 {
267 	t_key diff = a ^ b;
268 	int i = offset;
269 
270 	if (!diff)
271 		return 0;
272 	while ((diff << i) >> (KEYLENGTH-1) == 0)
273 		i++;
274 	return i;
275 }
276 
277 /*
278   To understand this stuff, an understanding of keys and all their bits is
279   necessary. Every node in the trie has a key associated with it, but not
280   all of the bits in that key are significant.
281 
282   Consider a node 'n' and its parent 'tp'.
283 
284   If n is a leaf, every bit in its key is significant. Its presence is
285   necessitated by path compression, since during a tree traversal (when
286   searching for a leaf - unless we are doing an insertion) we will completely
287   ignore all skipped bits we encounter. Thus we need to verify, at the end of
288   a potentially successful search, that we have indeed been walking the
289   correct key path.
290 
291   Note that we can never "miss" the correct key in the tree if present by
292   following the wrong path. Path compression ensures that segments of the key
293   that are the same for all keys with a given prefix are skipped, but the
294   skipped part *is* identical for each node in the subtrie below the skipped
295   bit! trie_insert() in this implementation takes care of that - note the
296   call to tkey_sub_equals() in trie_insert().
297 
298   if n is an internal node - a 'tnode' here, the various parts of its key
299   have many different meanings.
300 
301   Example:
302   _________________________________________________________________
303   | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
304   -----------------------------------------------------------------
305     0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
306 
307   _________________________________________________________________
308   | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
309   -----------------------------------------------------------------
310    16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
311 
312   tp->pos = 7
313   tp->bits = 3
314   n->pos = 15
315   n->bits = 4
316 
317   First, let's just ignore the bits that come before the parent tp, that is
318   the bits from 0 to (tp->pos-1). They are *known* but at this point we do
319   not use them for anything.
320 
321   The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
322   index into the parent's child array. That is, they will be used to find
323   'n' among tp's children.
324 
325   The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
326   for the node n.
327 
328   All the bits we have seen so far are significant to the node n. The rest
329   of the bits are really not needed or indeed known in n->key.
330 
331   The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
332   n's child array, and will of course be different for each child.
333 
334 
335   The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
336   at this point.
337 
338 */
339 
340 static inline void check_tnode(const struct tnode *tn)
341 {
342 	WARN_ON(tn && tn->pos+tn->bits > 32);
343 }
344 
345 static const int halve_threshold = 25;
346 static const int inflate_threshold = 50;
347 static const int halve_threshold_root = 15;
348 static const int inflate_threshold_root = 30;
349 
350 static void __alias_free_mem(struct rcu_head *head)
351 {
352 	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
353 	kmem_cache_free(fn_alias_kmem, fa);
354 }
355 
356 static inline void alias_free_mem_rcu(struct fib_alias *fa)
357 {
358 	call_rcu(&fa->rcu, __alias_free_mem);
359 }
360 
361 static void __leaf_free_rcu(struct rcu_head *head)
362 {
363 	struct leaf *l = container_of(head, struct leaf, rcu);
364 	kmem_cache_free(trie_leaf_kmem, l);
365 }
366 
367 static inline void free_leaf(struct leaf *l)
368 {
369 	call_rcu(&l->rcu, __leaf_free_rcu);
370 }
371 
372 static inline void free_leaf_info(struct leaf_info *leaf)
373 {
374 	kfree_rcu(leaf, rcu);
375 }
376 
377 static struct tnode *tnode_alloc(size_t size)
378 {
379 	if (size <= PAGE_SIZE)
380 		return kzalloc(size, GFP_KERNEL);
381 	else
382 		return vzalloc(size);
383 }
384 
385 static void __tnode_free_rcu(struct rcu_head *head)
386 {
387 	struct tnode *tn = container_of(head, struct tnode, rcu);
388 	size_t size = sizeof(struct tnode) +
389 		      (sizeof(struct rt_trie_node *) << tn->bits);
390 
391 	if (size <= PAGE_SIZE)
392 		kfree(tn);
393 	else
394 		vfree(tn);
395 }
396 
397 static inline void tnode_free(struct tnode *tn)
398 {
399 	if (IS_LEAF(tn))
400 		free_leaf((struct leaf *) tn);
401 	else
402 		call_rcu(&tn->rcu, __tnode_free_rcu);
403 }
404 
405 static void tnode_free_safe(struct tnode *tn)
406 {
407 	BUG_ON(IS_LEAF(tn));
408 	tn->tnode_free = tnode_free_head;
409 	tnode_free_head = tn;
410 	tnode_free_size += sizeof(struct tnode) +
411 			   (sizeof(struct rt_trie_node *) << tn->bits);
412 }
413 
414 static void tnode_free_flush(void)
415 {
416 	struct tnode *tn;
417 
418 	while ((tn = tnode_free_head)) {
419 		tnode_free_head = tn->tnode_free;
420 		tn->tnode_free = NULL;
421 		tnode_free(tn);
422 	}
423 
424 	if (tnode_free_size >= PAGE_SIZE * sync_pages) {
425 		tnode_free_size = 0;
426 		synchronize_rcu();
427 	}
428 }
429 
430 static struct leaf *leaf_new(void)
431 {
432 	struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
433 	if (l) {
434 		l->parent = T_LEAF;
435 		INIT_HLIST_HEAD(&l->list);
436 	}
437 	return l;
438 }
439 
440 static struct leaf_info *leaf_info_new(int plen)
441 {
442 	struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
443 	if (li) {
444 		li->plen = plen;
445 		li->mask_plen = ntohl(inet_make_mask(plen));
446 		INIT_LIST_HEAD(&li->falh);
447 	}
448 	return li;
449 }
450 
451 static struct tnode *tnode_new(t_key key, int pos, int bits)
452 {
453 	size_t sz = sizeof(struct tnode) + (sizeof(struct rt_trie_node *) << bits);
454 	struct tnode *tn = tnode_alloc(sz);
455 
456 	if (tn) {
457 		tn->parent = T_TNODE;
458 		tn->pos = pos;
459 		tn->bits = bits;
460 		tn->key = key;
461 		tn->full_children = 0;
462 		tn->empty_children = 1<<bits;
463 	}
464 
465 	pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
466 		 sizeof(struct rt_trie_node *) << bits);
467 	return tn;
468 }
469 
470 /*
471  * Check whether a tnode 'n' is "full", i.e. it is an internal node
472  * and no bits are skipped. See discussion in dyntree paper p. 6
473  */
474 
475 static inline int tnode_full(const struct tnode *tn, const struct rt_trie_node *n)
476 {
477 	if (n == NULL || IS_LEAF(n))
478 		return 0;
479 
480 	return ((struct tnode *) n)->pos == tn->pos + tn->bits;
481 }
482 
483 static inline void put_child(struct tnode *tn, int i,
484 			     struct rt_trie_node *n)
485 {
486 	tnode_put_child_reorg(tn, i, n, -1);
487 }
488 
489  /*
490   * Add a child at position i overwriting the old value.
491   * Update the value of full_children and empty_children.
492   */
493 
494 static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
495 				  int wasfull)
496 {
497 	struct rt_trie_node *chi = rtnl_dereference(tn->child[i]);
498 	int isfull;
499 
500 	BUG_ON(i >= 1<<tn->bits);
501 
502 	/* update emptyChildren */
503 	if (n == NULL && chi != NULL)
504 		tn->empty_children++;
505 	else if (n != NULL && chi == NULL)
506 		tn->empty_children--;
507 
508 	/* update fullChildren */
509 	if (wasfull == -1)
510 		wasfull = tnode_full(tn, chi);
511 
512 	isfull = tnode_full(tn, n);
513 	if (wasfull && !isfull)
514 		tn->full_children--;
515 	else if (!wasfull && isfull)
516 		tn->full_children++;
517 
518 	if (n)
519 		node_set_parent(n, tn);
520 
521 	rcu_assign_pointer(tn->child[i], n);
522 }
523 
524 #define MAX_WORK 10
525 static struct rt_trie_node *resize(struct trie *t, struct tnode *tn)
526 {
527 	int i;
528 	struct tnode *old_tn;
529 	int inflate_threshold_use;
530 	int halve_threshold_use;
531 	int max_work;
532 
533 	if (!tn)
534 		return NULL;
535 
536 	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
537 		 tn, inflate_threshold, halve_threshold);
538 
539 	/* No children */
540 	if (tn->empty_children == tnode_child_length(tn)) {
541 		tnode_free_safe(tn);
542 		return NULL;
543 	}
544 	/* One child */
545 	if (tn->empty_children == tnode_child_length(tn) - 1)
546 		goto one_child;
547 	/*
548 	 * Double as long as the resulting node has a number of
549 	 * nonempty nodes that are above the threshold.
550 	 */
551 
552 	/*
553 	 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
554 	 * the Helsinki University of Technology and Matti Tikkanen of Nokia
555 	 * Telecommunications, page 6:
556 	 * "A node is doubled if the ratio of non-empty children to all
557 	 * children in the *doubled* node is at least 'high'."
558 	 *
559 	 * 'high' in this instance is the variable 'inflate_threshold'. It
560 	 * is expressed as a percentage, so we multiply it with
561 	 * tnode_child_length() and instead of multiplying by 2 (since the
562 	 * child array will be doubled by inflate()) and multiplying
563 	 * the left-hand side by 100 (to handle the percentage thing) we
564 	 * multiply the left-hand side by 50.
565 	 *
566 	 * The left-hand side may look a bit weird: tnode_child_length(tn)
567 	 * - tn->empty_children is of course the number of non-null children
568 	 * in the current node. tn->full_children is the number of "full"
569 	 * children, that is non-null tnodes with a skip value of 0.
570 	 * All of those will be doubled in the resulting inflated tnode, so
571 	 * we just count them one extra time here.
572 	 *
573 	 * A clearer way to write this would be:
574 	 *
575 	 * to_be_doubled = tn->full_children;
576 	 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
577 	 *     tn->full_children;
578 	 *
579 	 * new_child_length = tnode_child_length(tn) * 2;
580 	 *
581 	 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
582 	 *      new_child_length;
583 	 * if (new_fill_factor >= inflate_threshold)
584 	 *
585 	 * ...and so on, tho it would mess up the while () loop.
586 	 *
587 	 * anyway,
588 	 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
589 	 *      inflate_threshold
590 	 *
591 	 * avoid a division:
592 	 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
593 	 *      inflate_threshold * new_child_length
594 	 *
595 	 * expand not_to_be_doubled and to_be_doubled, and shorten:
596 	 * 100 * (tnode_child_length(tn) - tn->empty_children +
597 	 *    tn->full_children) >= inflate_threshold * new_child_length
598 	 *
599 	 * expand new_child_length:
600 	 * 100 * (tnode_child_length(tn) - tn->empty_children +
601 	 *    tn->full_children) >=
602 	 *      inflate_threshold * tnode_child_length(tn) * 2
603 	 *
604 	 * shorten again:
605 	 * 50 * (tn->full_children + tnode_child_length(tn) -
606 	 *    tn->empty_children) >= inflate_threshold *
607 	 *    tnode_child_length(tn)
608 	 *
609 	 */
610 
611 	check_tnode(tn);
612 
613 	/* Keep root node larger  */
614 
615 	if (!node_parent((struct rt_trie_node *)tn)) {
616 		inflate_threshold_use = inflate_threshold_root;
617 		halve_threshold_use = halve_threshold_root;
618 	} else {
619 		inflate_threshold_use = inflate_threshold;
620 		halve_threshold_use = halve_threshold;
621 	}
622 
623 	max_work = MAX_WORK;
624 	while ((tn->full_children > 0 &&  max_work-- &&
625 		50 * (tn->full_children + tnode_child_length(tn)
626 		      - tn->empty_children)
627 		>= inflate_threshold_use * tnode_child_length(tn))) {
628 
629 		old_tn = tn;
630 		tn = inflate(t, tn);
631 
632 		if (IS_ERR(tn)) {
633 			tn = old_tn;
634 #ifdef CONFIG_IP_FIB_TRIE_STATS
635 			t->stats.resize_node_skipped++;
636 #endif
637 			break;
638 		}
639 	}
640 
641 	check_tnode(tn);
642 
643 	/* Return if at least one inflate is run */
644 	if (max_work != MAX_WORK)
645 		return (struct rt_trie_node *) tn;
646 
647 	/*
648 	 * Halve as long as the number of empty children in this
649 	 * node is above threshold.
650 	 */
651 
652 	max_work = MAX_WORK;
653 	while (tn->bits > 1 &&  max_work-- &&
654 	       100 * (tnode_child_length(tn) - tn->empty_children) <
655 	       halve_threshold_use * tnode_child_length(tn)) {
656 
657 		old_tn = tn;
658 		tn = halve(t, tn);
659 		if (IS_ERR(tn)) {
660 			tn = old_tn;
661 #ifdef CONFIG_IP_FIB_TRIE_STATS
662 			t->stats.resize_node_skipped++;
663 #endif
664 			break;
665 		}
666 	}
667 
668 
669 	/* Only one child remains */
670 	if (tn->empty_children == tnode_child_length(tn) - 1) {
671 one_child:
672 		for (i = 0; i < tnode_child_length(tn); i++) {
673 			struct rt_trie_node *n;
674 
675 			n = rtnl_dereference(tn->child[i]);
676 			if (!n)
677 				continue;
678 
679 			/* compress one level */
680 
681 			node_set_parent(n, NULL);
682 			tnode_free_safe(tn);
683 			return n;
684 		}
685 	}
686 	return (struct rt_trie_node *) tn;
687 }
688 
689 
690 static void tnode_clean_free(struct tnode *tn)
691 {
692 	int i;
693 	struct tnode *tofree;
694 
695 	for (i = 0; i < tnode_child_length(tn); i++) {
696 		tofree = (struct tnode *)rtnl_dereference(tn->child[i]);
697 		if (tofree)
698 			tnode_free(tofree);
699 	}
700 	tnode_free(tn);
701 }
702 
703 static struct tnode *inflate(struct trie *t, struct tnode *tn)
704 {
705 	struct tnode *oldtnode = tn;
706 	int olen = tnode_child_length(tn);
707 	int i;
708 
709 	pr_debug("In inflate\n");
710 
711 	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
712 
713 	if (!tn)
714 		return ERR_PTR(-ENOMEM);
715 
716 	/*
717 	 * Preallocate and store tnodes before the actual work so we
718 	 * don't get into an inconsistent state if memory allocation
719 	 * fails. In case of failure we return the oldnode and  inflate
720 	 * of tnode is ignored.
721 	 */
722 
723 	for (i = 0; i < olen; i++) {
724 		struct tnode *inode;
725 
726 		inode = (struct tnode *) tnode_get_child(oldtnode, i);
727 		if (inode &&
728 		    IS_TNODE(inode) &&
729 		    inode->pos == oldtnode->pos + oldtnode->bits &&
730 		    inode->bits > 1) {
731 			struct tnode *left, *right;
732 			t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
733 
734 			left = tnode_new(inode->key&(~m), inode->pos + 1,
735 					 inode->bits - 1);
736 			if (!left)
737 				goto nomem;
738 
739 			right = tnode_new(inode->key|m, inode->pos + 1,
740 					  inode->bits - 1);
741 
742 			if (!right) {
743 				tnode_free(left);
744 				goto nomem;
745 			}
746 
747 			put_child(tn, 2*i, (struct rt_trie_node *) left);
748 			put_child(tn, 2*i+1, (struct rt_trie_node *) right);
749 		}
750 	}
751 
752 	for (i = 0; i < olen; i++) {
753 		struct tnode *inode;
754 		struct rt_trie_node *node = tnode_get_child(oldtnode, i);
755 		struct tnode *left, *right;
756 		int size, j;
757 
758 		/* An empty child */
759 		if (node == NULL)
760 			continue;
761 
762 		/* A leaf or an internal node with skipped bits */
763 
764 		if (IS_LEAF(node) || ((struct tnode *) node)->pos >
765 		   tn->pos + tn->bits - 1) {
766 			if (tkey_extract_bits(node->key,
767 					      oldtnode->pos + oldtnode->bits,
768 					      1) == 0)
769 				put_child(tn, 2*i, node);
770 			else
771 				put_child(tn, 2*i+1, node);
772 			continue;
773 		}
774 
775 		/* An internal node with two children */
776 		inode = (struct tnode *) node;
777 
778 		if (inode->bits == 1) {
779 			put_child(tn, 2*i, rtnl_dereference(inode->child[0]));
780 			put_child(tn, 2*i+1, rtnl_dereference(inode->child[1]));
781 
782 			tnode_free_safe(inode);
783 			continue;
784 		}
785 
786 		/* An internal node with more than two children */
787 
788 		/* We will replace this node 'inode' with two new
789 		 * ones, 'left' and 'right', each with half of the
790 		 * original children. The two new nodes will have
791 		 * a position one bit further down the key and this
792 		 * means that the "significant" part of their keys
793 		 * (see the discussion near the top of this file)
794 		 * will differ by one bit, which will be "0" in
795 		 * left's key and "1" in right's key. Since we are
796 		 * moving the key position by one step, the bit that
797 		 * we are moving away from - the bit at position
798 		 * (inode->pos) - is the one that will differ between
799 		 * left and right. So... we synthesize that bit in the
800 		 * two  new keys.
801 		 * The mask 'm' below will be a single "one" bit at
802 		 * the position (inode->pos)
803 		 */
804 
805 		/* Use the old key, but set the new significant
806 		 *   bit to zero.
807 		 */
808 
809 		left = (struct tnode *) tnode_get_child(tn, 2*i);
810 		put_child(tn, 2*i, NULL);
811 
812 		BUG_ON(!left);
813 
814 		right = (struct tnode *) tnode_get_child(tn, 2*i+1);
815 		put_child(tn, 2*i+1, NULL);
816 
817 		BUG_ON(!right);
818 
819 		size = tnode_child_length(left);
820 		for (j = 0; j < size; j++) {
821 			put_child(left, j, rtnl_dereference(inode->child[j]));
822 			put_child(right, j, rtnl_dereference(inode->child[j + size]));
823 		}
824 		put_child(tn, 2*i, resize(t, left));
825 		put_child(tn, 2*i+1, resize(t, right));
826 
827 		tnode_free_safe(inode);
828 	}
829 	tnode_free_safe(oldtnode);
830 	return tn;
831 nomem:
832 	tnode_clean_free(tn);
833 	return ERR_PTR(-ENOMEM);
834 }
835 
836 static struct tnode *halve(struct trie *t, struct tnode *tn)
837 {
838 	struct tnode *oldtnode = tn;
839 	struct rt_trie_node *left, *right;
840 	int i;
841 	int olen = tnode_child_length(tn);
842 
843 	pr_debug("In halve\n");
844 
845 	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
846 
847 	if (!tn)
848 		return ERR_PTR(-ENOMEM);
849 
850 	/*
851 	 * Preallocate and store tnodes before the actual work so we
852 	 * don't get into an inconsistent state if memory allocation
853 	 * fails. In case of failure we return the oldnode and halve
854 	 * of tnode is ignored.
855 	 */
856 
857 	for (i = 0; i < olen; i += 2) {
858 		left = tnode_get_child(oldtnode, i);
859 		right = tnode_get_child(oldtnode, i+1);
860 
861 		/* Two nonempty children */
862 		if (left && right) {
863 			struct tnode *newn;
864 
865 			newn = tnode_new(left->key, tn->pos + tn->bits, 1);
866 
867 			if (!newn)
868 				goto nomem;
869 
870 			put_child(tn, i/2, (struct rt_trie_node *)newn);
871 		}
872 
873 	}
874 
875 	for (i = 0; i < olen; i += 2) {
876 		struct tnode *newBinNode;
877 
878 		left = tnode_get_child(oldtnode, i);
879 		right = tnode_get_child(oldtnode, i+1);
880 
881 		/* At least one of the children is empty */
882 		if (left == NULL) {
883 			if (right == NULL)    /* Both are empty */
884 				continue;
885 			put_child(tn, i/2, right);
886 			continue;
887 		}
888 
889 		if (right == NULL) {
890 			put_child(tn, i/2, left);
891 			continue;
892 		}
893 
894 		/* Two nonempty children */
895 		newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
896 		put_child(tn, i/2, NULL);
897 		put_child(newBinNode, 0, left);
898 		put_child(newBinNode, 1, right);
899 		put_child(tn, i/2, resize(t, newBinNode));
900 	}
901 	tnode_free_safe(oldtnode);
902 	return tn;
903 nomem:
904 	tnode_clean_free(tn);
905 	return ERR_PTR(-ENOMEM);
906 }
907 
908 /* readside must use rcu_read_lock currently dump routines
909  via get_fa_head and dump */
910 
911 static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
912 {
913 	struct hlist_head *head = &l->list;
914 	struct leaf_info *li;
915 
916 	hlist_for_each_entry_rcu(li, head, hlist)
917 		if (li->plen == plen)
918 			return li;
919 
920 	return NULL;
921 }
922 
923 static inline struct list_head *get_fa_head(struct leaf *l, int plen)
924 {
925 	struct leaf_info *li = find_leaf_info(l, plen);
926 
927 	if (!li)
928 		return NULL;
929 
930 	return &li->falh;
931 }
932 
933 static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
934 {
935 	struct leaf_info *li = NULL, *last = NULL;
936 
937 	if (hlist_empty(head)) {
938 		hlist_add_head_rcu(&new->hlist, head);
939 	} else {
940 		hlist_for_each_entry(li, head, hlist) {
941 			if (new->plen > li->plen)
942 				break;
943 
944 			last = li;
945 		}
946 		if (last)
947 			hlist_add_after_rcu(&last->hlist, &new->hlist);
948 		else
949 			hlist_add_before_rcu(&new->hlist, &li->hlist);
950 	}
951 }
952 
953 /* rcu_read_lock needs to be hold by caller from readside */
954 
955 static struct leaf *
956 fib_find_node(struct trie *t, u32 key)
957 {
958 	int pos;
959 	struct tnode *tn;
960 	struct rt_trie_node *n;
961 
962 	pos = 0;
963 	n = rcu_dereference_rtnl(t->trie);
964 
965 	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
966 		tn = (struct tnode *) n;
967 
968 		check_tnode(tn);
969 
970 		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
971 			pos = tn->pos + tn->bits;
972 			n = tnode_get_child_rcu(tn,
973 						tkey_extract_bits(key,
974 								  tn->pos,
975 								  tn->bits));
976 		} else
977 			break;
978 	}
979 	/* Case we have found a leaf. Compare prefixes */
980 
981 	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
982 		return (struct leaf *)n;
983 
984 	return NULL;
985 }
986 
987 static void trie_rebalance(struct trie *t, struct tnode *tn)
988 {
989 	int wasfull;
990 	t_key cindex, key;
991 	struct tnode *tp;
992 
993 	key = tn->key;
994 
995 	while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) {
996 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
997 		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
998 		tn = (struct tnode *)resize(t, tn);
999 
1000 		tnode_put_child_reorg(tp, cindex,
1001 				      (struct rt_trie_node *)tn, wasfull);
1002 
1003 		tp = node_parent((struct rt_trie_node *) tn);
1004 		if (!tp)
1005 			rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1006 
1007 		tnode_free_flush();
1008 		if (!tp)
1009 			break;
1010 		tn = tp;
1011 	}
1012 
1013 	/* Handle last (top) tnode */
1014 	if (IS_TNODE(tn))
1015 		tn = (struct tnode *)resize(t, tn);
1016 
1017 	rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1018 	tnode_free_flush();
1019 }
1020 
1021 /* only used from updater-side */
1022 
1023 static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1024 {
1025 	int pos, newpos;
1026 	struct tnode *tp = NULL, *tn = NULL;
1027 	struct rt_trie_node *n;
1028 	struct leaf *l;
1029 	int missbit;
1030 	struct list_head *fa_head = NULL;
1031 	struct leaf_info *li;
1032 	t_key cindex;
1033 
1034 	pos = 0;
1035 	n = rtnl_dereference(t->trie);
1036 
1037 	/* If we point to NULL, stop. Either the tree is empty and we should
1038 	 * just put a new leaf in if, or we have reached an empty child slot,
1039 	 * and we should just put our new leaf in that.
1040 	 * If we point to a T_TNODE, check if it matches our key. Note that
1041 	 * a T_TNODE might be skipping any number of bits - its 'pos' need
1042 	 * not be the parent's 'pos'+'bits'!
1043 	 *
1044 	 * If it does match the current key, get pos/bits from it, extract
1045 	 * the index from our key, push the T_TNODE and walk the tree.
1046 	 *
1047 	 * If it doesn't, we have to replace it with a new T_TNODE.
1048 	 *
1049 	 * If we point to a T_LEAF, it might or might not have the same key
1050 	 * as we do. If it does, just change the value, update the T_LEAF's
1051 	 * value, and return it.
1052 	 * If it doesn't, we need to replace it with a T_TNODE.
1053 	 */
1054 
1055 	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
1056 		tn = (struct tnode *) n;
1057 
1058 		check_tnode(tn);
1059 
1060 		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
1061 			tp = tn;
1062 			pos = tn->pos + tn->bits;
1063 			n = tnode_get_child(tn,
1064 					    tkey_extract_bits(key,
1065 							      tn->pos,
1066 							      tn->bits));
1067 
1068 			BUG_ON(n && node_parent(n) != tn);
1069 		} else
1070 			break;
1071 	}
1072 
1073 	/*
1074 	 * n  ----> NULL, LEAF or TNODE
1075 	 *
1076 	 * tp is n's (parent) ----> NULL or TNODE
1077 	 */
1078 
1079 	BUG_ON(tp && IS_LEAF(tp));
1080 
1081 	/* Case 1: n is a leaf. Compare prefixes */
1082 
1083 	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1084 		l = (struct leaf *) n;
1085 		li = leaf_info_new(plen);
1086 
1087 		if (!li)
1088 			return NULL;
1089 
1090 		fa_head = &li->falh;
1091 		insert_leaf_info(&l->list, li);
1092 		goto done;
1093 	}
1094 	l = leaf_new();
1095 
1096 	if (!l)
1097 		return NULL;
1098 
1099 	l->key = key;
1100 	li = leaf_info_new(plen);
1101 
1102 	if (!li) {
1103 		free_leaf(l);
1104 		return NULL;
1105 	}
1106 
1107 	fa_head = &li->falh;
1108 	insert_leaf_info(&l->list, li);
1109 
1110 	if (t->trie && n == NULL) {
1111 		/* Case 2: n is NULL, and will just insert a new leaf */
1112 
1113 		node_set_parent((struct rt_trie_node *)l, tp);
1114 
1115 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1116 		put_child(tp, cindex, (struct rt_trie_node *)l);
1117 	} else {
1118 		/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1119 		/*
1120 		 *  Add a new tnode here
1121 		 *  first tnode need some special handling
1122 		 */
1123 
1124 		if (tp)
1125 			pos = tp->pos+tp->bits;
1126 		else
1127 			pos = 0;
1128 
1129 		if (n) {
1130 			newpos = tkey_mismatch(key, pos, n->key);
1131 			tn = tnode_new(n->key, newpos, 1);
1132 		} else {
1133 			newpos = 0;
1134 			tn = tnode_new(key, newpos, 1); /* First tnode */
1135 		}
1136 
1137 		if (!tn) {
1138 			free_leaf_info(li);
1139 			free_leaf(l);
1140 			return NULL;
1141 		}
1142 
1143 		node_set_parent((struct rt_trie_node *)tn, tp);
1144 
1145 		missbit = tkey_extract_bits(key, newpos, 1);
1146 		put_child(tn, missbit, (struct rt_trie_node *)l);
1147 		put_child(tn, 1-missbit, n);
1148 
1149 		if (tp) {
1150 			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1151 			put_child(tp, cindex, (struct rt_trie_node *)tn);
1152 		} else {
1153 			rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1154 			tp = tn;
1155 		}
1156 	}
1157 
1158 	if (tp && tp->pos + tp->bits > 32)
1159 		pr_warn("fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1160 			tp, tp->pos, tp->bits, key, plen);
1161 
1162 	/* Rebalance the trie */
1163 
1164 	trie_rebalance(t, tp);
1165 done:
1166 	return fa_head;
1167 }
1168 
1169 /*
1170  * Caller must hold RTNL.
1171  */
1172 int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
1173 {
1174 	struct trie *t = (struct trie *) tb->tb_data;
1175 	struct fib_alias *fa, *new_fa;
1176 	struct list_head *fa_head = NULL;
1177 	struct fib_info *fi;
1178 	int plen = cfg->fc_dst_len;
1179 	u8 tos = cfg->fc_tos;
1180 	u32 key, mask;
1181 	int err;
1182 	struct leaf *l;
1183 
1184 	if (plen > 32)
1185 		return -EINVAL;
1186 
1187 	key = ntohl(cfg->fc_dst);
1188 
1189 	pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
1190 
1191 	mask = ntohl(inet_make_mask(plen));
1192 
1193 	if (key & ~mask)
1194 		return -EINVAL;
1195 
1196 	key = key & mask;
1197 
1198 	fi = fib_create_info(cfg);
1199 	if (IS_ERR(fi)) {
1200 		err = PTR_ERR(fi);
1201 		goto err;
1202 	}
1203 
1204 	l = fib_find_node(t, key);
1205 	fa = NULL;
1206 
1207 	if (l) {
1208 		fa_head = get_fa_head(l, plen);
1209 		fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1210 	}
1211 
1212 	/* Now fa, if non-NULL, points to the first fib alias
1213 	 * with the same keys [prefix,tos,priority], if such key already
1214 	 * exists or to the node before which we will insert new one.
1215 	 *
1216 	 * If fa is NULL, we will need to allocate a new one and
1217 	 * insert to the head of f.
1218 	 *
1219 	 * If f is NULL, no fib node matched the destination key
1220 	 * and we need to allocate a new one of those as well.
1221 	 */
1222 
1223 	if (fa && fa->fa_tos == tos &&
1224 	    fa->fa_info->fib_priority == fi->fib_priority) {
1225 		struct fib_alias *fa_first, *fa_match;
1226 
1227 		err = -EEXIST;
1228 		if (cfg->fc_nlflags & NLM_F_EXCL)
1229 			goto out;
1230 
1231 		/* We have 2 goals:
1232 		 * 1. Find exact match for type, scope, fib_info to avoid
1233 		 * duplicate routes
1234 		 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
1235 		 */
1236 		fa_match = NULL;
1237 		fa_first = fa;
1238 		fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1239 		list_for_each_entry_continue(fa, fa_head, fa_list) {
1240 			if (fa->fa_tos != tos)
1241 				break;
1242 			if (fa->fa_info->fib_priority != fi->fib_priority)
1243 				break;
1244 			if (fa->fa_type == cfg->fc_type &&
1245 			    fa->fa_info == fi) {
1246 				fa_match = fa;
1247 				break;
1248 			}
1249 		}
1250 
1251 		if (cfg->fc_nlflags & NLM_F_REPLACE) {
1252 			struct fib_info *fi_drop;
1253 			u8 state;
1254 
1255 			fa = fa_first;
1256 			if (fa_match) {
1257 				if (fa == fa_match)
1258 					err = 0;
1259 				goto out;
1260 			}
1261 			err = -ENOBUFS;
1262 			new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1263 			if (new_fa == NULL)
1264 				goto out;
1265 
1266 			fi_drop = fa->fa_info;
1267 			new_fa->fa_tos = fa->fa_tos;
1268 			new_fa->fa_info = fi;
1269 			new_fa->fa_type = cfg->fc_type;
1270 			state = fa->fa_state;
1271 			new_fa->fa_state = state & ~FA_S_ACCESSED;
1272 
1273 			list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1274 			alias_free_mem_rcu(fa);
1275 
1276 			fib_release_info(fi_drop);
1277 			if (state & FA_S_ACCESSED)
1278 				rt_cache_flush(cfg->fc_nlinfo.nl_net);
1279 			rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1280 				tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
1281 
1282 			goto succeeded;
1283 		}
1284 		/* Error if we find a perfect match which
1285 		 * uses the same scope, type, and nexthop
1286 		 * information.
1287 		 */
1288 		if (fa_match)
1289 			goto out;
1290 
1291 		if (!(cfg->fc_nlflags & NLM_F_APPEND))
1292 			fa = fa_first;
1293 	}
1294 	err = -ENOENT;
1295 	if (!(cfg->fc_nlflags & NLM_F_CREATE))
1296 		goto out;
1297 
1298 	err = -ENOBUFS;
1299 	new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1300 	if (new_fa == NULL)
1301 		goto out;
1302 
1303 	new_fa->fa_info = fi;
1304 	new_fa->fa_tos = tos;
1305 	new_fa->fa_type = cfg->fc_type;
1306 	new_fa->fa_state = 0;
1307 	/*
1308 	 * Insert new entry to the list.
1309 	 */
1310 
1311 	if (!fa_head) {
1312 		fa_head = fib_insert_node(t, key, plen);
1313 		if (unlikely(!fa_head)) {
1314 			err = -ENOMEM;
1315 			goto out_free_new_fa;
1316 		}
1317 	}
1318 
1319 	if (!plen)
1320 		tb->tb_num_default++;
1321 
1322 	list_add_tail_rcu(&new_fa->fa_list,
1323 			  (fa ? &fa->fa_list : fa_head));
1324 
1325 	rt_cache_flush(cfg->fc_nlinfo.nl_net);
1326 	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
1327 		  &cfg->fc_nlinfo, 0);
1328 succeeded:
1329 	return 0;
1330 
1331 out_free_new_fa:
1332 	kmem_cache_free(fn_alias_kmem, new_fa);
1333 out:
1334 	fib_release_info(fi);
1335 err:
1336 	return err;
1337 }
1338 
1339 /* should be called with rcu_read_lock */
1340 static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l,
1341 		      t_key key,  const struct flowi4 *flp,
1342 		      struct fib_result *res, int fib_flags)
1343 {
1344 	struct leaf_info *li;
1345 	struct hlist_head *hhead = &l->list;
1346 
1347 	hlist_for_each_entry_rcu(li, hhead, hlist) {
1348 		struct fib_alias *fa;
1349 
1350 		if (l->key != (key & li->mask_plen))
1351 			continue;
1352 
1353 		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
1354 			struct fib_info *fi = fa->fa_info;
1355 			int nhsel, err;
1356 
1357 			if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
1358 				continue;
1359 			if (fi->fib_dead)
1360 				continue;
1361 			if (fa->fa_info->fib_scope < flp->flowi4_scope)
1362 				continue;
1363 			fib_alias_accessed(fa);
1364 			err = fib_props[fa->fa_type].error;
1365 			if (err) {
1366 #ifdef CONFIG_IP_FIB_TRIE_STATS
1367 				t->stats.semantic_match_passed++;
1368 #endif
1369 				return err;
1370 			}
1371 			if (fi->fib_flags & RTNH_F_DEAD)
1372 				continue;
1373 			for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
1374 				const struct fib_nh *nh = &fi->fib_nh[nhsel];
1375 
1376 				if (nh->nh_flags & RTNH_F_DEAD)
1377 					continue;
1378 				if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
1379 					continue;
1380 
1381 #ifdef CONFIG_IP_FIB_TRIE_STATS
1382 				t->stats.semantic_match_passed++;
1383 #endif
1384 				res->prefixlen = li->plen;
1385 				res->nh_sel = nhsel;
1386 				res->type = fa->fa_type;
1387 				res->scope = fa->fa_info->fib_scope;
1388 				res->fi = fi;
1389 				res->table = tb;
1390 				res->fa_head = &li->falh;
1391 				if (!(fib_flags & FIB_LOOKUP_NOREF))
1392 					atomic_inc(&fi->fib_clntref);
1393 				return 0;
1394 			}
1395 		}
1396 
1397 #ifdef CONFIG_IP_FIB_TRIE_STATS
1398 		t->stats.semantic_match_miss++;
1399 #endif
1400 	}
1401 
1402 	return 1;
1403 }
1404 
1405 int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
1406 		     struct fib_result *res, int fib_flags)
1407 {
1408 	struct trie *t = (struct trie *) tb->tb_data;
1409 	int ret;
1410 	struct rt_trie_node *n;
1411 	struct tnode *pn;
1412 	unsigned int pos, bits;
1413 	t_key key = ntohl(flp->daddr);
1414 	unsigned int chopped_off;
1415 	t_key cindex = 0;
1416 	unsigned int current_prefix_length = KEYLENGTH;
1417 	struct tnode *cn;
1418 	t_key pref_mismatch;
1419 
1420 	rcu_read_lock();
1421 
1422 	n = rcu_dereference(t->trie);
1423 	if (!n)
1424 		goto failed;
1425 
1426 #ifdef CONFIG_IP_FIB_TRIE_STATS
1427 	t->stats.gets++;
1428 #endif
1429 
1430 	/* Just a leaf? */
1431 	if (IS_LEAF(n)) {
1432 		ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
1433 		goto found;
1434 	}
1435 
1436 	pn = (struct tnode *) n;
1437 	chopped_off = 0;
1438 
1439 	while (pn) {
1440 		pos = pn->pos;
1441 		bits = pn->bits;
1442 
1443 		if (!chopped_off)
1444 			cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
1445 						   pos, bits);
1446 
1447 		n = tnode_get_child_rcu(pn, cindex);
1448 
1449 		if (n == NULL) {
1450 #ifdef CONFIG_IP_FIB_TRIE_STATS
1451 			t->stats.null_node_hit++;
1452 #endif
1453 			goto backtrace;
1454 		}
1455 
1456 		if (IS_LEAF(n)) {
1457 			ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
1458 			if (ret > 0)
1459 				goto backtrace;
1460 			goto found;
1461 		}
1462 
1463 		cn = (struct tnode *)n;
1464 
1465 		/*
1466 		 * It's a tnode, and we can do some extra checks here if we
1467 		 * like, to avoid descending into a dead-end branch.
1468 		 * This tnode is in the parent's child array at index
1469 		 * key[p_pos..p_pos+p_bits] but potentially with some bits
1470 		 * chopped off, so in reality the index may be just a
1471 		 * subprefix, padded with zero at the end.
1472 		 * We can also take a look at any skipped bits in this
1473 		 * tnode - everything up to p_pos is supposed to be ok,
1474 		 * and the non-chopped bits of the index (se previous
1475 		 * paragraph) are also guaranteed ok, but the rest is
1476 		 * considered unknown.
1477 		 *
1478 		 * The skipped bits are key[pos+bits..cn->pos].
1479 		 */
1480 
1481 		/* If current_prefix_length < pos+bits, we are already doing
1482 		 * actual prefix  matching, which means everything from
1483 		 * pos+(bits-chopped_off) onward must be zero along some
1484 		 * branch of this subtree - otherwise there is *no* valid
1485 		 * prefix present. Here we can only check the skipped
1486 		 * bits. Remember, since we have already indexed into the
1487 		 * parent's child array, we know that the bits we chopped of
1488 		 * *are* zero.
1489 		 */
1490 
1491 		/* NOTA BENE: Checking only skipped bits
1492 		   for the new node here */
1493 
1494 		if (current_prefix_length < pos+bits) {
1495 			if (tkey_extract_bits(cn->key, current_prefix_length,
1496 						cn->pos - current_prefix_length)
1497 			    || !(cn->child[0]))
1498 				goto backtrace;
1499 		}
1500 
1501 		/*
1502 		 * If chopped_off=0, the index is fully validated and we
1503 		 * only need to look at the skipped bits for this, the new,
1504 		 * tnode. What we actually want to do is to find out if
1505 		 * these skipped bits match our key perfectly, or if we will
1506 		 * have to count on finding a matching prefix further down,
1507 		 * because if we do, we would like to have some way of
1508 		 * verifying the existence of such a prefix at this point.
1509 		 */
1510 
1511 		/* The only thing we can do at this point is to verify that
1512 		 * any such matching prefix can indeed be a prefix to our
1513 		 * key, and if the bits in the node we are inspecting that
1514 		 * do not match our key are not ZERO, this cannot be true.
1515 		 * Thus, find out where there is a mismatch (before cn->pos)
1516 		 * and verify that all the mismatching bits are zero in the
1517 		 * new tnode's key.
1518 		 */
1519 
1520 		/*
1521 		 * Note: We aren't very concerned about the piece of
1522 		 * the key that precede pn->pos+pn->bits, since these
1523 		 * have already been checked. The bits after cn->pos
1524 		 * aren't checked since these are by definition
1525 		 * "unknown" at this point. Thus, what we want to see
1526 		 * is if we are about to enter the "prefix matching"
1527 		 * state, and in that case verify that the skipped
1528 		 * bits that will prevail throughout this subtree are
1529 		 * zero, as they have to be if we are to find a
1530 		 * matching prefix.
1531 		 */
1532 
1533 		pref_mismatch = mask_pfx(cn->key ^ key, cn->pos);
1534 
1535 		/*
1536 		 * In short: If skipped bits in this node do not match
1537 		 * the search key, enter the "prefix matching"
1538 		 * state.directly.
1539 		 */
1540 		if (pref_mismatch) {
1541 			/* fls(x) = __fls(x) + 1 */
1542 			int mp = KEYLENGTH - __fls(pref_mismatch) - 1;
1543 
1544 			if (tkey_extract_bits(cn->key, mp, cn->pos - mp) != 0)
1545 				goto backtrace;
1546 
1547 			if (current_prefix_length >= cn->pos)
1548 				current_prefix_length = mp;
1549 		}
1550 
1551 		pn = (struct tnode *)n; /* Descend */
1552 		chopped_off = 0;
1553 		continue;
1554 
1555 backtrace:
1556 		chopped_off++;
1557 
1558 		/* As zero don't change the child key (cindex) */
1559 		while ((chopped_off <= pn->bits)
1560 		       && !(cindex & (1<<(chopped_off-1))))
1561 			chopped_off++;
1562 
1563 		/* Decrease current_... with bits chopped off */
1564 		if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1565 			current_prefix_length = pn->pos + pn->bits
1566 				- chopped_off;
1567 
1568 		/*
1569 		 * Either we do the actual chop off according or if we have
1570 		 * chopped off all bits in this tnode walk up to our parent.
1571 		 */
1572 
1573 		if (chopped_off <= pn->bits) {
1574 			cindex &= ~(1 << (chopped_off-1));
1575 		} else {
1576 			struct tnode *parent = node_parent_rcu((struct rt_trie_node *) pn);
1577 			if (!parent)
1578 				goto failed;
1579 
1580 			/* Get Child's index */
1581 			cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
1582 			pn = parent;
1583 			chopped_off = 0;
1584 
1585 #ifdef CONFIG_IP_FIB_TRIE_STATS
1586 			t->stats.backtrack++;
1587 #endif
1588 			goto backtrace;
1589 		}
1590 	}
1591 failed:
1592 	ret = 1;
1593 found:
1594 	rcu_read_unlock();
1595 	return ret;
1596 }
1597 EXPORT_SYMBOL_GPL(fib_table_lookup);
1598 
1599 /*
1600  * Remove the leaf and return parent.
1601  */
1602 static void trie_leaf_remove(struct trie *t, struct leaf *l)
1603 {
1604 	struct tnode *tp = node_parent((struct rt_trie_node *) l);
1605 
1606 	pr_debug("entering trie_leaf_remove(%p)\n", l);
1607 
1608 	if (tp) {
1609 		t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
1610 		put_child(tp, cindex, NULL);
1611 		trie_rebalance(t, tp);
1612 	} else
1613 		RCU_INIT_POINTER(t->trie, NULL);
1614 
1615 	free_leaf(l);
1616 }
1617 
1618 /*
1619  * Caller must hold RTNL.
1620  */
1621 int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
1622 {
1623 	struct trie *t = (struct trie *) tb->tb_data;
1624 	u32 key, mask;
1625 	int plen = cfg->fc_dst_len;
1626 	u8 tos = cfg->fc_tos;
1627 	struct fib_alias *fa, *fa_to_delete;
1628 	struct list_head *fa_head;
1629 	struct leaf *l;
1630 	struct leaf_info *li;
1631 
1632 	if (plen > 32)
1633 		return -EINVAL;
1634 
1635 	key = ntohl(cfg->fc_dst);
1636 	mask = ntohl(inet_make_mask(plen));
1637 
1638 	if (key & ~mask)
1639 		return -EINVAL;
1640 
1641 	key = key & mask;
1642 	l = fib_find_node(t, key);
1643 
1644 	if (!l)
1645 		return -ESRCH;
1646 
1647 	li = find_leaf_info(l, plen);
1648 
1649 	if (!li)
1650 		return -ESRCH;
1651 
1652 	fa_head = &li->falh;
1653 	fa = fib_find_alias(fa_head, tos, 0);
1654 
1655 	if (!fa)
1656 		return -ESRCH;
1657 
1658 	pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1659 
1660 	fa_to_delete = NULL;
1661 	fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1662 	list_for_each_entry_continue(fa, fa_head, fa_list) {
1663 		struct fib_info *fi = fa->fa_info;
1664 
1665 		if (fa->fa_tos != tos)
1666 			break;
1667 
1668 		if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1669 		    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1670 		     fa->fa_info->fib_scope == cfg->fc_scope) &&
1671 		    (!cfg->fc_prefsrc ||
1672 		     fi->fib_prefsrc == cfg->fc_prefsrc) &&
1673 		    (!cfg->fc_protocol ||
1674 		     fi->fib_protocol == cfg->fc_protocol) &&
1675 		    fib_nh_match(cfg, fi) == 0) {
1676 			fa_to_delete = fa;
1677 			break;
1678 		}
1679 	}
1680 
1681 	if (!fa_to_delete)
1682 		return -ESRCH;
1683 
1684 	fa = fa_to_delete;
1685 	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
1686 		  &cfg->fc_nlinfo, 0);
1687 
1688 	list_del_rcu(&fa->fa_list);
1689 
1690 	if (!plen)
1691 		tb->tb_num_default--;
1692 
1693 	if (list_empty(fa_head)) {
1694 		hlist_del_rcu(&li->hlist);
1695 		free_leaf_info(li);
1696 	}
1697 
1698 	if (hlist_empty(&l->list))
1699 		trie_leaf_remove(t, l);
1700 
1701 	if (fa->fa_state & FA_S_ACCESSED)
1702 		rt_cache_flush(cfg->fc_nlinfo.nl_net);
1703 
1704 	fib_release_info(fa->fa_info);
1705 	alias_free_mem_rcu(fa);
1706 	return 0;
1707 }
1708 
1709 static int trie_flush_list(struct list_head *head)
1710 {
1711 	struct fib_alias *fa, *fa_node;
1712 	int found = 0;
1713 
1714 	list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1715 		struct fib_info *fi = fa->fa_info;
1716 
1717 		if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1718 			list_del_rcu(&fa->fa_list);
1719 			fib_release_info(fa->fa_info);
1720 			alias_free_mem_rcu(fa);
1721 			found++;
1722 		}
1723 	}
1724 	return found;
1725 }
1726 
1727 static int trie_flush_leaf(struct leaf *l)
1728 {
1729 	int found = 0;
1730 	struct hlist_head *lih = &l->list;
1731 	struct hlist_node *tmp;
1732 	struct leaf_info *li = NULL;
1733 
1734 	hlist_for_each_entry_safe(li, tmp, lih, hlist) {
1735 		found += trie_flush_list(&li->falh);
1736 
1737 		if (list_empty(&li->falh)) {
1738 			hlist_del_rcu(&li->hlist);
1739 			free_leaf_info(li);
1740 		}
1741 	}
1742 	return found;
1743 }
1744 
1745 /*
1746  * Scan for the next right leaf starting at node p->child[idx]
1747  * Since we have back pointer, no recursion necessary.
1748  */
1749 static struct leaf *leaf_walk_rcu(struct tnode *p, struct rt_trie_node *c)
1750 {
1751 	do {
1752 		t_key idx;
1753 
1754 		if (c)
1755 			idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
1756 		else
1757 			idx = 0;
1758 
1759 		while (idx < 1u << p->bits) {
1760 			c = tnode_get_child_rcu(p, idx++);
1761 			if (!c)
1762 				continue;
1763 
1764 			if (IS_LEAF(c)) {
1765 				prefetch(rcu_dereference_rtnl(p->child[idx]));
1766 				return (struct leaf *) c;
1767 			}
1768 
1769 			/* Rescan start scanning in new node */
1770 			p = (struct tnode *) c;
1771 			idx = 0;
1772 		}
1773 
1774 		/* Node empty, walk back up to parent */
1775 		c = (struct rt_trie_node *) p;
1776 	} while ((p = node_parent_rcu(c)) != NULL);
1777 
1778 	return NULL; /* Root of trie */
1779 }
1780 
1781 static struct leaf *trie_firstleaf(struct trie *t)
1782 {
1783 	struct tnode *n = (struct tnode *)rcu_dereference_rtnl(t->trie);
1784 
1785 	if (!n)
1786 		return NULL;
1787 
1788 	if (IS_LEAF(n))          /* trie is just a leaf */
1789 		return (struct leaf *) n;
1790 
1791 	return leaf_walk_rcu(n, NULL);
1792 }
1793 
1794 static struct leaf *trie_nextleaf(struct leaf *l)
1795 {
1796 	struct rt_trie_node *c = (struct rt_trie_node *) l;
1797 	struct tnode *p = node_parent_rcu(c);
1798 
1799 	if (!p)
1800 		return NULL;	/* trie with just one leaf */
1801 
1802 	return leaf_walk_rcu(p, c);
1803 }
1804 
1805 static struct leaf *trie_leafindex(struct trie *t, int index)
1806 {
1807 	struct leaf *l = trie_firstleaf(t);
1808 
1809 	while (l && index-- > 0)
1810 		l = trie_nextleaf(l);
1811 
1812 	return l;
1813 }
1814 
1815 
1816 /*
1817  * Caller must hold RTNL.
1818  */
1819 int fib_table_flush(struct fib_table *tb)
1820 {
1821 	struct trie *t = (struct trie *) tb->tb_data;
1822 	struct leaf *l, *ll = NULL;
1823 	int found = 0;
1824 
1825 	for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
1826 		found += trie_flush_leaf(l);
1827 
1828 		if (ll && hlist_empty(&ll->list))
1829 			trie_leaf_remove(t, ll);
1830 		ll = l;
1831 	}
1832 
1833 	if (ll && hlist_empty(&ll->list))
1834 		trie_leaf_remove(t, ll);
1835 
1836 	pr_debug("trie_flush found=%d\n", found);
1837 	return found;
1838 }
1839 
1840 void fib_free_table(struct fib_table *tb)
1841 {
1842 	kfree(tb);
1843 }
1844 
1845 static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
1846 			   struct fib_table *tb,
1847 			   struct sk_buff *skb, struct netlink_callback *cb)
1848 {
1849 	int i, s_i;
1850 	struct fib_alias *fa;
1851 	__be32 xkey = htonl(key);
1852 
1853 	s_i = cb->args[5];
1854 	i = 0;
1855 
1856 	/* rcu_read_lock is hold by caller */
1857 
1858 	list_for_each_entry_rcu(fa, fah, fa_list) {
1859 		if (i < s_i) {
1860 			i++;
1861 			continue;
1862 		}
1863 
1864 		if (fib_dump_info(skb, NETLINK_CB(cb->skb).portid,
1865 				  cb->nlh->nlmsg_seq,
1866 				  RTM_NEWROUTE,
1867 				  tb->tb_id,
1868 				  fa->fa_type,
1869 				  xkey,
1870 				  plen,
1871 				  fa->fa_tos,
1872 				  fa->fa_info, NLM_F_MULTI) < 0) {
1873 			cb->args[5] = i;
1874 			return -1;
1875 		}
1876 		i++;
1877 	}
1878 	cb->args[5] = i;
1879 	return skb->len;
1880 }
1881 
1882 static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
1883 			struct sk_buff *skb, struct netlink_callback *cb)
1884 {
1885 	struct leaf_info *li;
1886 	int i, s_i;
1887 
1888 	s_i = cb->args[4];
1889 	i = 0;
1890 
1891 	/* rcu_read_lock is hold by caller */
1892 	hlist_for_each_entry_rcu(li, &l->list, hlist) {
1893 		if (i < s_i) {
1894 			i++;
1895 			continue;
1896 		}
1897 
1898 		if (i > s_i)
1899 			cb->args[5] = 0;
1900 
1901 		if (list_empty(&li->falh))
1902 			continue;
1903 
1904 		if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
1905 			cb->args[4] = i;
1906 			return -1;
1907 		}
1908 		i++;
1909 	}
1910 
1911 	cb->args[4] = i;
1912 	return skb->len;
1913 }
1914 
1915 int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
1916 		   struct netlink_callback *cb)
1917 {
1918 	struct leaf *l;
1919 	struct trie *t = (struct trie *) tb->tb_data;
1920 	t_key key = cb->args[2];
1921 	int count = cb->args[3];
1922 
1923 	rcu_read_lock();
1924 	/* Dump starting at last key.
1925 	 * Note: 0.0.0.0/0 (ie default) is first key.
1926 	 */
1927 	if (count == 0)
1928 		l = trie_firstleaf(t);
1929 	else {
1930 		/* Normally, continue from last key, but if that is missing
1931 		 * fallback to using slow rescan
1932 		 */
1933 		l = fib_find_node(t, key);
1934 		if (!l)
1935 			l = trie_leafindex(t, count);
1936 	}
1937 
1938 	while (l) {
1939 		cb->args[2] = l->key;
1940 		if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
1941 			cb->args[3] = count;
1942 			rcu_read_unlock();
1943 			return -1;
1944 		}
1945 
1946 		++count;
1947 		l = trie_nextleaf(l);
1948 		memset(&cb->args[4], 0,
1949 		       sizeof(cb->args) - 4*sizeof(cb->args[0]));
1950 	}
1951 	cb->args[3] = count;
1952 	rcu_read_unlock();
1953 
1954 	return skb->len;
1955 }
1956 
1957 void __init fib_trie_init(void)
1958 {
1959 	fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1960 					  sizeof(struct fib_alias),
1961 					  0, SLAB_PANIC, NULL);
1962 
1963 	trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
1964 					   max(sizeof(struct leaf),
1965 					       sizeof(struct leaf_info)),
1966 					   0, SLAB_PANIC, NULL);
1967 }
1968 
1969 
1970 struct fib_table *fib_trie_table(u32 id)
1971 {
1972 	struct fib_table *tb;
1973 	struct trie *t;
1974 
1975 	tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
1976 		     GFP_KERNEL);
1977 	if (tb == NULL)
1978 		return NULL;
1979 
1980 	tb->tb_id = id;
1981 	tb->tb_default = -1;
1982 	tb->tb_num_default = 0;
1983 
1984 	t = (struct trie *) tb->tb_data;
1985 	memset(t, 0, sizeof(*t));
1986 
1987 	return tb;
1988 }
1989 
1990 #ifdef CONFIG_PROC_FS
1991 /* Depth first Trie walk iterator */
1992 struct fib_trie_iter {
1993 	struct seq_net_private p;
1994 	struct fib_table *tb;
1995 	struct tnode *tnode;
1996 	unsigned int index;
1997 	unsigned int depth;
1998 };
1999 
2000 static struct rt_trie_node *fib_trie_get_next(struct fib_trie_iter *iter)
2001 {
2002 	struct tnode *tn = iter->tnode;
2003 	unsigned int cindex = iter->index;
2004 	struct tnode *p;
2005 
2006 	/* A single entry routing table */
2007 	if (!tn)
2008 		return NULL;
2009 
2010 	pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
2011 		 iter->tnode, iter->index, iter->depth);
2012 rescan:
2013 	while (cindex < (1<<tn->bits)) {
2014 		struct rt_trie_node *n = tnode_get_child_rcu(tn, cindex);
2015 
2016 		if (n) {
2017 			if (IS_LEAF(n)) {
2018 				iter->tnode = tn;
2019 				iter->index = cindex + 1;
2020 			} else {
2021 				/* push down one level */
2022 				iter->tnode = (struct tnode *) n;
2023 				iter->index = 0;
2024 				++iter->depth;
2025 			}
2026 			return n;
2027 		}
2028 
2029 		++cindex;
2030 	}
2031 
2032 	/* Current node exhausted, pop back up */
2033 	p = node_parent_rcu((struct rt_trie_node *)tn);
2034 	if (p) {
2035 		cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
2036 		tn = p;
2037 		--iter->depth;
2038 		goto rescan;
2039 	}
2040 
2041 	/* got root? */
2042 	return NULL;
2043 }
2044 
2045 static struct rt_trie_node *fib_trie_get_first(struct fib_trie_iter *iter,
2046 				       struct trie *t)
2047 {
2048 	struct rt_trie_node *n;
2049 
2050 	if (!t)
2051 		return NULL;
2052 
2053 	n = rcu_dereference(t->trie);
2054 	if (!n)
2055 		return NULL;
2056 
2057 	if (IS_TNODE(n)) {
2058 		iter->tnode = (struct tnode *) n;
2059 		iter->index = 0;
2060 		iter->depth = 1;
2061 	} else {
2062 		iter->tnode = NULL;
2063 		iter->index = 0;
2064 		iter->depth = 0;
2065 	}
2066 
2067 	return n;
2068 }
2069 
2070 static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2071 {
2072 	struct rt_trie_node *n;
2073 	struct fib_trie_iter iter;
2074 
2075 	memset(s, 0, sizeof(*s));
2076 
2077 	rcu_read_lock();
2078 	for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
2079 		if (IS_LEAF(n)) {
2080 			struct leaf *l = (struct leaf *)n;
2081 			struct leaf_info *li;
2082 
2083 			s->leaves++;
2084 			s->totdepth += iter.depth;
2085 			if (iter.depth > s->maxdepth)
2086 				s->maxdepth = iter.depth;
2087 
2088 			hlist_for_each_entry_rcu(li, &l->list, hlist)
2089 				++s->prefixes;
2090 		} else {
2091 			const struct tnode *tn = (const struct tnode *) n;
2092 			int i;
2093 
2094 			s->tnodes++;
2095 			if (tn->bits < MAX_STAT_DEPTH)
2096 				s->nodesizes[tn->bits]++;
2097 
2098 			for (i = 0; i < (1<<tn->bits); i++)
2099 				if (!tn->child[i])
2100 					s->nullpointers++;
2101 		}
2102 	}
2103 	rcu_read_unlock();
2104 }
2105 
2106 /*
2107  *	This outputs /proc/net/fib_triestats
2108  */
2109 static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2110 {
2111 	unsigned int i, max, pointers, bytes, avdepth;
2112 
2113 	if (stat->leaves)
2114 		avdepth = stat->totdepth*100 / stat->leaves;
2115 	else
2116 		avdepth = 0;
2117 
2118 	seq_printf(seq, "\tAver depth:     %u.%02d\n",
2119 		   avdepth / 100, avdepth % 100);
2120 	seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);
2121 
2122 	seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
2123 	bytes = sizeof(struct leaf) * stat->leaves;
2124 
2125 	seq_printf(seq, "\tPrefixes:       %u\n", stat->prefixes);
2126 	bytes += sizeof(struct leaf_info) * stat->prefixes;
2127 
2128 	seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
2129 	bytes += sizeof(struct tnode) * stat->tnodes;
2130 
2131 	max = MAX_STAT_DEPTH;
2132 	while (max > 0 && stat->nodesizes[max-1] == 0)
2133 		max--;
2134 
2135 	pointers = 0;
2136 	for (i = 1; i <= max; i++)
2137 		if (stat->nodesizes[i] != 0) {
2138 			seq_printf(seq, "  %u: %u",  i, stat->nodesizes[i]);
2139 			pointers += (1<<i) * stat->nodesizes[i];
2140 		}
2141 	seq_putc(seq, '\n');
2142 	seq_printf(seq, "\tPointers: %u\n", pointers);
2143 
2144 	bytes += sizeof(struct rt_trie_node *) * pointers;
2145 	seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2146 	seq_printf(seq, "Total size: %u  kB\n", (bytes + 1023) / 1024);
2147 }
2148 
2149 #ifdef CONFIG_IP_FIB_TRIE_STATS
2150 static void trie_show_usage(struct seq_file *seq,
2151 			    const struct trie_use_stats *stats)
2152 {
2153 	seq_printf(seq, "\nCounters:\n---------\n");
2154 	seq_printf(seq, "gets = %u\n", stats->gets);
2155 	seq_printf(seq, "backtracks = %u\n", stats->backtrack);
2156 	seq_printf(seq, "semantic match passed = %u\n",
2157 		   stats->semantic_match_passed);
2158 	seq_printf(seq, "semantic match miss = %u\n",
2159 		   stats->semantic_match_miss);
2160 	seq_printf(seq, "null node hit= %u\n", stats->null_node_hit);
2161 	seq_printf(seq, "skipped node resize = %u\n\n",
2162 		   stats->resize_node_skipped);
2163 }
2164 #endif /*  CONFIG_IP_FIB_TRIE_STATS */
2165 
2166 static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
2167 {
2168 	if (tb->tb_id == RT_TABLE_LOCAL)
2169 		seq_puts(seq, "Local:\n");
2170 	else if (tb->tb_id == RT_TABLE_MAIN)
2171 		seq_puts(seq, "Main:\n");
2172 	else
2173 		seq_printf(seq, "Id %d:\n", tb->tb_id);
2174 }
2175 
2176 
2177 static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2178 {
2179 	struct net *net = (struct net *)seq->private;
2180 	unsigned int h;
2181 
2182 	seq_printf(seq,
2183 		   "Basic info: size of leaf:"
2184 		   " %Zd bytes, size of tnode: %Zd bytes.\n",
2185 		   sizeof(struct leaf), sizeof(struct tnode));
2186 
2187 	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2188 		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2189 		struct fib_table *tb;
2190 
2191 		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
2192 			struct trie *t = (struct trie *) tb->tb_data;
2193 			struct trie_stat stat;
2194 
2195 			if (!t)
2196 				continue;
2197 
2198 			fib_table_print(seq, tb);
2199 
2200 			trie_collect_stats(t, &stat);
2201 			trie_show_stats(seq, &stat);
2202 #ifdef CONFIG_IP_FIB_TRIE_STATS
2203 			trie_show_usage(seq, &t->stats);
2204 #endif
2205 		}
2206 	}
2207 
2208 	return 0;
2209 }
2210 
2211 static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2212 {
2213 	return single_open_net(inode, file, fib_triestat_seq_show);
2214 }
2215 
2216 static const struct file_operations fib_triestat_fops = {
2217 	.owner	= THIS_MODULE,
2218 	.open	= fib_triestat_seq_open,
2219 	.read	= seq_read,
2220 	.llseek	= seq_lseek,
2221 	.release = single_release_net,
2222 };
2223 
2224 static struct rt_trie_node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
2225 {
2226 	struct fib_trie_iter *iter = seq->private;
2227 	struct net *net = seq_file_net(seq);
2228 	loff_t idx = 0;
2229 	unsigned int h;
2230 
2231 	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2232 		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2233 		struct fib_table *tb;
2234 
2235 		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
2236 			struct rt_trie_node *n;
2237 
2238 			for (n = fib_trie_get_first(iter,
2239 						    (struct trie *) tb->tb_data);
2240 			     n; n = fib_trie_get_next(iter))
2241 				if (pos == idx++) {
2242 					iter->tb = tb;
2243 					return n;
2244 				}
2245 		}
2246 	}
2247 
2248 	return NULL;
2249 }
2250 
2251 static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2252 	__acquires(RCU)
2253 {
2254 	rcu_read_lock();
2255 	return fib_trie_get_idx(seq, *pos);
2256 }
2257 
2258 static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2259 {
2260 	struct fib_trie_iter *iter = seq->private;
2261 	struct net *net = seq_file_net(seq);
2262 	struct fib_table *tb = iter->tb;
2263 	struct hlist_node *tb_node;
2264 	unsigned int h;
2265 	struct rt_trie_node *n;
2266 
2267 	++*pos;
2268 	/* next node in same table */
2269 	n = fib_trie_get_next(iter);
2270 	if (n)
2271 		return n;
2272 
2273 	/* walk rest of this hash chain */
2274 	h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
2275 	while ((tb_node = rcu_dereference(hlist_next_rcu(&tb->tb_hlist)))) {
2276 		tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
2277 		n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2278 		if (n)
2279 			goto found;
2280 	}
2281 
2282 	/* new hash chain */
2283 	while (++h < FIB_TABLE_HASHSZ) {
2284 		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2285 		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
2286 			n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2287 			if (n)
2288 				goto found;
2289 		}
2290 	}
2291 	return NULL;
2292 
2293 found:
2294 	iter->tb = tb;
2295 	return n;
2296 }
2297 
2298 static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2299 	__releases(RCU)
2300 {
2301 	rcu_read_unlock();
2302 }
2303 
2304 static void seq_indent(struct seq_file *seq, int n)
2305 {
2306 	while (n-- > 0)
2307 		seq_puts(seq, "   ");
2308 }
2309 
2310 static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
2311 {
2312 	switch (s) {
2313 	case RT_SCOPE_UNIVERSE: return "universe";
2314 	case RT_SCOPE_SITE:	return "site";
2315 	case RT_SCOPE_LINK:	return "link";
2316 	case RT_SCOPE_HOST:	return "host";
2317 	case RT_SCOPE_NOWHERE:	return "nowhere";
2318 	default:
2319 		snprintf(buf, len, "scope=%d", s);
2320 		return buf;
2321 	}
2322 }
2323 
2324 static const char *const rtn_type_names[__RTN_MAX] = {
2325 	[RTN_UNSPEC] = "UNSPEC",
2326 	[RTN_UNICAST] = "UNICAST",
2327 	[RTN_LOCAL] = "LOCAL",
2328 	[RTN_BROADCAST] = "BROADCAST",
2329 	[RTN_ANYCAST] = "ANYCAST",
2330 	[RTN_MULTICAST] = "MULTICAST",
2331 	[RTN_BLACKHOLE] = "BLACKHOLE",
2332 	[RTN_UNREACHABLE] = "UNREACHABLE",
2333 	[RTN_PROHIBIT] = "PROHIBIT",
2334 	[RTN_THROW] = "THROW",
2335 	[RTN_NAT] = "NAT",
2336 	[RTN_XRESOLVE] = "XRESOLVE",
2337 };
2338 
2339 static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
2340 {
2341 	if (t < __RTN_MAX && rtn_type_names[t])
2342 		return rtn_type_names[t];
2343 	snprintf(buf, len, "type %u", t);
2344 	return buf;
2345 }
2346 
2347 /* Pretty print the trie */
2348 static int fib_trie_seq_show(struct seq_file *seq, void *v)
2349 {
2350 	const struct fib_trie_iter *iter = seq->private;
2351 	struct rt_trie_node *n = v;
2352 
2353 	if (!node_parent_rcu(n))
2354 		fib_table_print(seq, iter->tb);
2355 
2356 	if (IS_TNODE(n)) {
2357 		struct tnode *tn = (struct tnode *) n;
2358 		__be32 prf = htonl(mask_pfx(tn->key, tn->pos));
2359 
2360 		seq_indent(seq, iter->depth-1);
2361 		seq_printf(seq, "  +-- %pI4/%d %d %d %d\n",
2362 			   &prf, tn->pos, tn->bits, tn->full_children,
2363 			   tn->empty_children);
2364 
2365 	} else {
2366 		struct leaf *l = (struct leaf *) n;
2367 		struct leaf_info *li;
2368 		__be32 val = htonl(l->key);
2369 
2370 		seq_indent(seq, iter->depth);
2371 		seq_printf(seq, "  |-- %pI4\n", &val);
2372 
2373 		hlist_for_each_entry_rcu(li, &l->list, hlist) {
2374 			struct fib_alias *fa;
2375 
2376 			list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2377 				char buf1[32], buf2[32];
2378 
2379 				seq_indent(seq, iter->depth+1);
2380 				seq_printf(seq, "  /%d %s %s", li->plen,
2381 					   rtn_scope(buf1, sizeof(buf1),
2382 						     fa->fa_info->fib_scope),
2383 					   rtn_type(buf2, sizeof(buf2),
2384 						    fa->fa_type));
2385 				if (fa->fa_tos)
2386 					seq_printf(seq, " tos=%d", fa->fa_tos);
2387 				seq_putc(seq, '\n');
2388 			}
2389 		}
2390 	}
2391 
2392 	return 0;
2393 }
2394 
2395 static const struct seq_operations fib_trie_seq_ops = {
2396 	.start  = fib_trie_seq_start,
2397 	.next   = fib_trie_seq_next,
2398 	.stop   = fib_trie_seq_stop,
2399 	.show   = fib_trie_seq_show,
2400 };
2401 
2402 static int fib_trie_seq_open(struct inode *inode, struct file *file)
2403 {
2404 	return seq_open_net(inode, file, &fib_trie_seq_ops,
2405 			    sizeof(struct fib_trie_iter));
2406 }
2407 
2408 static const struct file_operations fib_trie_fops = {
2409 	.owner  = THIS_MODULE,
2410 	.open   = fib_trie_seq_open,
2411 	.read   = seq_read,
2412 	.llseek = seq_lseek,
2413 	.release = seq_release_net,
2414 };
2415 
2416 struct fib_route_iter {
2417 	struct seq_net_private p;
2418 	struct trie *main_trie;
2419 	loff_t	pos;
2420 	t_key	key;
2421 };
2422 
2423 static struct leaf *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
2424 {
2425 	struct leaf *l = NULL;
2426 	struct trie *t = iter->main_trie;
2427 
2428 	/* use cache location of last found key */
2429 	if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
2430 		pos -= iter->pos;
2431 	else {
2432 		iter->pos = 0;
2433 		l = trie_firstleaf(t);
2434 	}
2435 
2436 	while (l && pos-- > 0) {
2437 		iter->pos++;
2438 		l = trie_nextleaf(l);
2439 	}
2440 
2441 	if (l)
2442 		iter->key = pos;	/* remember it */
2443 	else
2444 		iter->pos = 0;		/* forget it */
2445 
2446 	return l;
2447 }
2448 
2449 static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
2450 	__acquires(RCU)
2451 {
2452 	struct fib_route_iter *iter = seq->private;
2453 	struct fib_table *tb;
2454 
2455 	rcu_read_lock();
2456 	tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
2457 	if (!tb)
2458 		return NULL;
2459 
2460 	iter->main_trie = (struct trie *) tb->tb_data;
2461 	if (*pos == 0)
2462 		return SEQ_START_TOKEN;
2463 	else
2464 		return fib_route_get_idx(iter, *pos - 1);
2465 }
2466 
2467 static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2468 {
2469 	struct fib_route_iter *iter = seq->private;
2470 	struct leaf *l = v;
2471 
2472 	++*pos;
2473 	if (v == SEQ_START_TOKEN) {
2474 		iter->pos = 0;
2475 		l = trie_firstleaf(iter->main_trie);
2476 	} else {
2477 		iter->pos++;
2478 		l = trie_nextleaf(l);
2479 	}
2480 
2481 	if (l)
2482 		iter->key = l->key;
2483 	else
2484 		iter->pos = 0;
2485 	return l;
2486 }
2487 
2488 static void fib_route_seq_stop(struct seq_file *seq, void *v)
2489 	__releases(RCU)
2490 {
2491 	rcu_read_unlock();
2492 }
2493 
2494 static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2495 {
2496 	unsigned int flags = 0;
2497 
2498 	if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
2499 		flags = RTF_REJECT;
2500 	if (fi && fi->fib_nh->nh_gw)
2501 		flags |= RTF_GATEWAY;
2502 	if (mask == htonl(0xFFFFFFFF))
2503 		flags |= RTF_HOST;
2504 	flags |= RTF_UP;
2505 	return flags;
2506 }
2507 
2508 /*
2509  *	This outputs /proc/net/route.
2510  *	The format of the file is not supposed to be changed
2511  *	and needs to be same as fib_hash output to avoid breaking
2512  *	legacy utilities
2513  */
2514 static int fib_route_seq_show(struct seq_file *seq, void *v)
2515 {
2516 	struct leaf *l = v;
2517 	struct leaf_info *li;
2518 
2519 	if (v == SEQ_START_TOKEN) {
2520 		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2521 			   "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2522 			   "\tWindow\tIRTT");
2523 		return 0;
2524 	}
2525 
2526 	hlist_for_each_entry_rcu(li, &l->list, hlist) {
2527 		struct fib_alias *fa;
2528 		__be32 mask, prefix;
2529 
2530 		mask = inet_make_mask(li->plen);
2531 		prefix = htonl(l->key);
2532 
2533 		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2534 			const struct fib_info *fi = fa->fa_info;
2535 			unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
2536 			int len;
2537 
2538 			if (fa->fa_type == RTN_BROADCAST
2539 			    || fa->fa_type == RTN_MULTICAST)
2540 				continue;
2541 
2542 			if (fi)
2543 				seq_printf(seq,
2544 					 "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
2545 					 "%d\t%08X\t%d\t%u\t%u%n",
2546 					 fi->fib_dev ? fi->fib_dev->name : "*",
2547 					 prefix,
2548 					 fi->fib_nh->nh_gw, flags, 0, 0,
2549 					 fi->fib_priority,
2550 					 mask,
2551 					 (fi->fib_advmss ?
2552 					  fi->fib_advmss + 40 : 0),
2553 					 fi->fib_window,
2554 					 fi->fib_rtt >> 3, &len);
2555 			else
2556 				seq_printf(seq,
2557 					 "*\t%08X\t%08X\t%04X\t%d\t%u\t"
2558 					 "%d\t%08X\t%d\t%u\t%u%n",
2559 					 prefix, 0, flags, 0, 0, 0,
2560 					 mask, 0, 0, 0, &len);
2561 
2562 			seq_printf(seq, "%*s\n", 127 - len, "");
2563 		}
2564 	}
2565 
2566 	return 0;
2567 }
2568 
2569 static const struct seq_operations fib_route_seq_ops = {
2570 	.start  = fib_route_seq_start,
2571 	.next   = fib_route_seq_next,
2572 	.stop   = fib_route_seq_stop,
2573 	.show   = fib_route_seq_show,
2574 };
2575 
2576 static int fib_route_seq_open(struct inode *inode, struct file *file)
2577 {
2578 	return seq_open_net(inode, file, &fib_route_seq_ops,
2579 			    sizeof(struct fib_route_iter));
2580 }
2581 
2582 static const struct file_operations fib_route_fops = {
2583 	.owner  = THIS_MODULE,
2584 	.open   = fib_route_seq_open,
2585 	.read   = seq_read,
2586 	.llseek = seq_lseek,
2587 	.release = seq_release_net,
2588 };
2589 
2590 int __net_init fib_proc_init(struct net *net)
2591 {
2592 	if (!proc_create("fib_trie", S_IRUGO, net->proc_net, &fib_trie_fops))
2593 		goto out1;
2594 
2595 	if (!proc_create("fib_triestat", S_IRUGO, net->proc_net,
2596 			 &fib_triestat_fops))
2597 		goto out2;
2598 
2599 	if (!proc_create("route", S_IRUGO, net->proc_net, &fib_route_fops))
2600 		goto out3;
2601 
2602 	return 0;
2603 
2604 out3:
2605 	remove_proc_entry("fib_triestat", net->proc_net);
2606 out2:
2607 	remove_proc_entry("fib_trie", net->proc_net);
2608 out1:
2609 	return -ENOMEM;
2610 }
2611 
2612 void __net_exit fib_proc_exit(struct net *net)
2613 {
2614 	remove_proc_entry("fib_trie", net->proc_net);
2615 	remove_proc_entry("fib_triestat", net->proc_net);
2616 	remove_proc_entry("route", net->proc_net);
2617 }
2618 
2619 #endif /* CONFIG_PROC_FS */
2620