xref: /openbmc/linux/net/ipv4/fib_trie.c (revision 5bd8e16d)
1 /*
2  *   This program is free software; you can redistribute it and/or
3  *   modify it under the terms of the GNU General Public License
4  *   as published by the Free Software Foundation; either version
5  *   2 of the License, or (at your option) any later version.
6  *
7  *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8  *     & Swedish University of Agricultural Sciences.
9  *
10  *   Jens Laas <jens.laas@data.slu.se> Swedish University of
11  *     Agricultural Sciences.
12  *
13  *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
14  *
15  * This work is based on the LPC-trie which is originally described in:
16  *
17  * An experimental study of compression methods for dynamic tries
18  * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19  * http://www.csc.kth.se/~snilsson/software/dyntrie2/
20  *
21  *
22  * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23  * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24  *
25  *
26  * Code from fib_hash has been reused which includes the following header:
27  *
28  *
29  * INET		An implementation of the TCP/IP protocol suite for the LINUX
30  *		operating system.  INET is implemented using the  BSD Socket
31  *		interface as the means of communication with the user level.
32  *
33  *		IPv4 FIB: lookup engine and maintenance routines.
34  *
35  *
36  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
37  *
38  *		This program is free software; you can redistribute it and/or
39  *		modify it under the terms of the GNU General Public License
40  *		as published by the Free Software Foundation; either version
41  *		2 of the License, or (at your option) any later version.
42  *
43  * Substantial contributions to this work comes from:
44  *
45  *		David S. Miller, <davem@davemloft.net>
46  *		Stephen Hemminger <shemminger@osdl.org>
47  *		Paul E. McKenney <paulmck@us.ibm.com>
48  *		Patrick McHardy <kaber@trash.net>
49  */
50 
51 #define VERSION "0.409"
52 
53 #include <asm/uaccess.h>
54 #include <linux/bitops.h>
55 #include <linux/types.h>
56 #include <linux/kernel.h>
57 #include <linux/mm.h>
58 #include <linux/string.h>
59 #include <linux/socket.h>
60 #include <linux/sockios.h>
61 #include <linux/errno.h>
62 #include <linux/in.h>
63 #include <linux/inet.h>
64 #include <linux/inetdevice.h>
65 #include <linux/netdevice.h>
66 #include <linux/if_arp.h>
67 #include <linux/proc_fs.h>
68 #include <linux/rcupdate.h>
69 #include <linux/skbuff.h>
70 #include <linux/netlink.h>
71 #include <linux/init.h>
72 #include <linux/list.h>
73 #include <linux/slab.h>
74 #include <linux/export.h>
75 #include <net/net_namespace.h>
76 #include <net/ip.h>
77 #include <net/protocol.h>
78 #include <net/route.h>
79 #include <net/tcp.h>
80 #include <net/sock.h>
81 #include <net/ip_fib.h>
82 #include "fib_lookup.h"
83 
84 #define MAX_STAT_DEPTH 32
85 
86 #define KEYLENGTH (8*sizeof(t_key))
87 
88 typedef unsigned int t_key;
89 
90 #define T_TNODE 0
91 #define T_LEAF  1
92 #define NODE_TYPE_MASK	0x1UL
93 #define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
94 
95 #define IS_TNODE(n) (!(n->parent & T_LEAF))
96 #define IS_LEAF(n) (n->parent & T_LEAF)
97 
98 struct rt_trie_node {
99 	unsigned long parent;
100 	t_key key;
101 };
102 
103 struct leaf {
104 	unsigned long parent;
105 	t_key key;
106 	struct hlist_head list;
107 	struct rcu_head rcu;
108 };
109 
110 struct leaf_info {
111 	struct hlist_node hlist;
112 	int plen;
113 	u32 mask_plen; /* ntohl(inet_make_mask(plen)) */
114 	struct list_head falh;
115 	struct rcu_head rcu;
116 };
117 
118 struct tnode {
119 	unsigned long parent;
120 	t_key key;
121 	unsigned char pos;		/* 2log(KEYLENGTH) bits needed */
122 	unsigned char bits;		/* 2log(KEYLENGTH) bits needed */
123 	unsigned int full_children;	/* KEYLENGTH bits needed */
124 	unsigned int empty_children;	/* KEYLENGTH bits needed */
125 	union {
126 		struct rcu_head rcu;
127 		struct tnode *tnode_free;
128 	};
129 	struct rt_trie_node __rcu *child[0];
130 };
131 
132 #ifdef CONFIG_IP_FIB_TRIE_STATS
133 struct trie_use_stats {
134 	unsigned int gets;
135 	unsigned int backtrack;
136 	unsigned int semantic_match_passed;
137 	unsigned int semantic_match_miss;
138 	unsigned int null_node_hit;
139 	unsigned int resize_node_skipped;
140 };
141 #endif
142 
143 struct trie_stat {
144 	unsigned int totdepth;
145 	unsigned int maxdepth;
146 	unsigned int tnodes;
147 	unsigned int leaves;
148 	unsigned int nullpointers;
149 	unsigned int prefixes;
150 	unsigned int nodesizes[MAX_STAT_DEPTH];
151 };
152 
153 struct trie {
154 	struct rt_trie_node __rcu *trie;
155 #ifdef CONFIG_IP_FIB_TRIE_STATS
156 	struct trie_use_stats stats;
157 #endif
158 };
159 
160 static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
161 				  int wasfull);
162 static struct rt_trie_node *resize(struct trie *t, struct tnode *tn);
163 static struct tnode *inflate(struct trie *t, struct tnode *tn);
164 static struct tnode *halve(struct trie *t, struct tnode *tn);
165 /* tnodes to free after resize(); protected by RTNL */
166 static struct tnode *tnode_free_head;
167 static size_t tnode_free_size;
168 
169 /*
170  * synchronize_rcu after call_rcu for that many pages; it should be especially
171  * useful before resizing the root node with PREEMPT_NONE configs; the value was
172  * obtained experimentally, aiming to avoid visible slowdown.
173  */
174 static const int sync_pages = 128;
175 
176 static struct kmem_cache *fn_alias_kmem __read_mostly;
177 static struct kmem_cache *trie_leaf_kmem __read_mostly;
178 
179 /*
180  * caller must hold RTNL
181  */
182 static inline struct tnode *node_parent(const struct rt_trie_node *node)
183 {
184 	unsigned long parent;
185 
186 	parent = rcu_dereference_index_check(node->parent, lockdep_rtnl_is_held());
187 
188 	return (struct tnode *)(parent & ~NODE_TYPE_MASK);
189 }
190 
191 /*
192  * caller must hold RCU read lock or RTNL
193  */
194 static inline struct tnode *node_parent_rcu(const struct rt_trie_node *node)
195 {
196 	unsigned long parent;
197 
198 	parent = rcu_dereference_index_check(node->parent, rcu_read_lock_held() ||
199 							   lockdep_rtnl_is_held());
200 
201 	return (struct tnode *)(parent & ~NODE_TYPE_MASK);
202 }
203 
204 /* Same as rcu_assign_pointer
205  * but that macro() assumes that value is a pointer.
206  */
207 static inline void node_set_parent(struct rt_trie_node *node, struct tnode *ptr)
208 {
209 	smp_wmb();
210 	node->parent = (unsigned long)ptr | NODE_TYPE(node);
211 }
212 
213 /*
214  * caller must hold RTNL
215  */
216 static inline struct rt_trie_node *tnode_get_child(const struct tnode *tn, unsigned int i)
217 {
218 	BUG_ON(i >= 1U << tn->bits);
219 
220 	return rtnl_dereference(tn->child[i]);
221 }
222 
223 /*
224  * caller must hold RCU read lock or RTNL
225  */
226 static inline struct rt_trie_node *tnode_get_child_rcu(const struct tnode *tn, unsigned int i)
227 {
228 	BUG_ON(i >= 1U << tn->bits);
229 
230 	return rcu_dereference_rtnl(tn->child[i]);
231 }
232 
233 static inline int tnode_child_length(const struct tnode *tn)
234 {
235 	return 1 << tn->bits;
236 }
237 
238 static inline t_key mask_pfx(t_key k, unsigned int l)
239 {
240 	return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
241 }
242 
243 static inline t_key tkey_extract_bits(t_key a, unsigned int offset, unsigned int bits)
244 {
245 	if (offset < KEYLENGTH)
246 		return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
247 	else
248 		return 0;
249 }
250 
251 static inline int tkey_equals(t_key a, t_key b)
252 {
253 	return a == b;
254 }
255 
256 static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
257 {
258 	if (bits == 0 || offset >= KEYLENGTH)
259 		return 1;
260 	bits = bits > KEYLENGTH ? KEYLENGTH : bits;
261 	return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
262 }
263 
264 static inline int tkey_mismatch(t_key a, int offset, t_key b)
265 {
266 	t_key diff = a ^ b;
267 	int i = offset;
268 
269 	if (!diff)
270 		return 0;
271 	while ((diff << i) >> (KEYLENGTH-1) == 0)
272 		i++;
273 	return i;
274 }
275 
276 /*
277   To understand this stuff, an understanding of keys and all their bits is
278   necessary. Every node in the trie has a key associated with it, but not
279   all of the bits in that key are significant.
280 
281   Consider a node 'n' and its parent 'tp'.
282 
283   If n is a leaf, every bit in its key is significant. Its presence is
284   necessitated by path compression, since during a tree traversal (when
285   searching for a leaf - unless we are doing an insertion) we will completely
286   ignore all skipped bits we encounter. Thus we need to verify, at the end of
287   a potentially successful search, that we have indeed been walking the
288   correct key path.
289 
290   Note that we can never "miss" the correct key in the tree if present by
291   following the wrong path. Path compression ensures that segments of the key
292   that are the same for all keys with a given prefix are skipped, but the
293   skipped part *is* identical for each node in the subtrie below the skipped
294   bit! trie_insert() in this implementation takes care of that - note the
295   call to tkey_sub_equals() in trie_insert().
296 
297   if n is an internal node - a 'tnode' here, the various parts of its key
298   have many different meanings.
299 
300   Example:
301   _________________________________________________________________
302   | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
303   -----------------------------------------------------------------
304     0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
305 
306   _________________________________________________________________
307   | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
308   -----------------------------------------------------------------
309    16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
310 
311   tp->pos = 7
312   tp->bits = 3
313   n->pos = 15
314   n->bits = 4
315 
316   First, let's just ignore the bits that come before the parent tp, that is
317   the bits from 0 to (tp->pos-1). They are *known* but at this point we do
318   not use them for anything.
319 
320   The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
321   index into the parent's child array. That is, they will be used to find
322   'n' among tp's children.
323 
324   The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
325   for the node n.
326 
327   All the bits we have seen so far are significant to the node n. The rest
328   of the bits are really not needed or indeed known in n->key.
329 
330   The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
331   n's child array, and will of course be different for each child.
332 
333 
334   The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
335   at this point.
336 
337 */
338 
339 static inline void check_tnode(const struct tnode *tn)
340 {
341 	WARN_ON(tn && tn->pos+tn->bits > 32);
342 }
343 
344 static const int halve_threshold = 25;
345 static const int inflate_threshold = 50;
346 static const int halve_threshold_root = 15;
347 static const int inflate_threshold_root = 30;
348 
349 static void __alias_free_mem(struct rcu_head *head)
350 {
351 	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
352 	kmem_cache_free(fn_alias_kmem, fa);
353 }
354 
355 static inline void alias_free_mem_rcu(struct fib_alias *fa)
356 {
357 	call_rcu(&fa->rcu, __alias_free_mem);
358 }
359 
360 static void __leaf_free_rcu(struct rcu_head *head)
361 {
362 	struct leaf *l = container_of(head, struct leaf, rcu);
363 	kmem_cache_free(trie_leaf_kmem, l);
364 }
365 
366 static inline void free_leaf(struct leaf *l)
367 {
368 	call_rcu(&l->rcu, __leaf_free_rcu);
369 }
370 
371 static inline void free_leaf_info(struct leaf_info *leaf)
372 {
373 	kfree_rcu(leaf, rcu);
374 }
375 
376 static struct tnode *tnode_alloc(size_t size)
377 {
378 	if (size <= PAGE_SIZE)
379 		return kzalloc(size, GFP_KERNEL);
380 	else
381 		return vzalloc(size);
382 }
383 
384 static void __tnode_free_rcu(struct rcu_head *head)
385 {
386 	struct tnode *tn = container_of(head, struct tnode, rcu);
387 	size_t size = sizeof(struct tnode) +
388 		      (sizeof(struct rt_trie_node *) << tn->bits);
389 
390 	if (size <= PAGE_SIZE)
391 		kfree(tn);
392 	else
393 		vfree(tn);
394 }
395 
396 static inline void tnode_free(struct tnode *tn)
397 {
398 	if (IS_LEAF(tn))
399 		free_leaf((struct leaf *) tn);
400 	else
401 		call_rcu(&tn->rcu, __tnode_free_rcu);
402 }
403 
404 static void tnode_free_safe(struct tnode *tn)
405 {
406 	BUG_ON(IS_LEAF(tn));
407 	tn->tnode_free = tnode_free_head;
408 	tnode_free_head = tn;
409 	tnode_free_size += sizeof(struct tnode) +
410 			   (sizeof(struct rt_trie_node *) << tn->bits);
411 }
412 
413 static void tnode_free_flush(void)
414 {
415 	struct tnode *tn;
416 
417 	while ((tn = tnode_free_head)) {
418 		tnode_free_head = tn->tnode_free;
419 		tn->tnode_free = NULL;
420 		tnode_free(tn);
421 	}
422 
423 	if (tnode_free_size >= PAGE_SIZE * sync_pages) {
424 		tnode_free_size = 0;
425 		synchronize_rcu();
426 	}
427 }
428 
429 static struct leaf *leaf_new(void)
430 {
431 	struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
432 	if (l) {
433 		l->parent = T_LEAF;
434 		INIT_HLIST_HEAD(&l->list);
435 	}
436 	return l;
437 }
438 
439 static struct leaf_info *leaf_info_new(int plen)
440 {
441 	struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
442 	if (li) {
443 		li->plen = plen;
444 		li->mask_plen = ntohl(inet_make_mask(plen));
445 		INIT_LIST_HEAD(&li->falh);
446 	}
447 	return li;
448 }
449 
450 static struct tnode *tnode_new(t_key key, int pos, int bits)
451 {
452 	size_t sz = sizeof(struct tnode) + (sizeof(struct rt_trie_node *) << bits);
453 	struct tnode *tn = tnode_alloc(sz);
454 
455 	if (tn) {
456 		tn->parent = T_TNODE;
457 		tn->pos = pos;
458 		tn->bits = bits;
459 		tn->key = key;
460 		tn->full_children = 0;
461 		tn->empty_children = 1<<bits;
462 	}
463 
464 	pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
465 		 sizeof(struct rt_trie_node *) << bits);
466 	return tn;
467 }
468 
469 /*
470  * Check whether a tnode 'n' is "full", i.e. it is an internal node
471  * and no bits are skipped. See discussion in dyntree paper p. 6
472  */
473 
474 static inline int tnode_full(const struct tnode *tn, const struct rt_trie_node *n)
475 {
476 	if (n == NULL || IS_LEAF(n))
477 		return 0;
478 
479 	return ((struct tnode *) n)->pos == tn->pos + tn->bits;
480 }
481 
482 static inline void put_child(struct tnode *tn, int i,
483 			     struct rt_trie_node *n)
484 {
485 	tnode_put_child_reorg(tn, i, n, -1);
486 }
487 
488  /*
489   * Add a child at position i overwriting the old value.
490   * Update the value of full_children and empty_children.
491   */
492 
493 static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
494 				  int wasfull)
495 {
496 	struct rt_trie_node *chi = rtnl_dereference(tn->child[i]);
497 	int isfull;
498 
499 	BUG_ON(i >= 1<<tn->bits);
500 
501 	/* update emptyChildren */
502 	if (n == NULL && chi != NULL)
503 		tn->empty_children++;
504 	else if (n != NULL && chi == NULL)
505 		tn->empty_children--;
506 
507 	/* update fullChildren */
508 	if (wasfull == -1)
509 		wasfull = tnode_full(tn, chi);
510 
511 	isfull = tnode_full(tn, n);
512 	if (wasfull && !isfull)
513 		tn->full_children--;
514 	else if (!wasfull && isfull)
515 		tn->full_children++;
516 
517 	if (n)
518 		node_set_parent(n, tn);
519 
520 	rcu_assign_pointer(tn->child[i], n);
521 }
522 
523 #define MAX_WORK 10
524 static struct rt_trie_node *resize(struct trie *t, struct tnode *tn)
525 {
526 	int i;
527 	struct tnode *old_tn;
528 	int inflate_threshold_use;
529 	int halve_threshold_use;
530 	int max_work;
531 
532 	if (!tn)
533 		return NULL;
534 
535 	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
536 		 tn, inflate_threshold, halve_threshold);
537 
538 	/* No children */
539 	if (tn->empty_children == tnode_child_length(tn)) {
540 		tnode_free_safe(tn);
541 		return NULL;
542 	}
543 	/* One child */
544 	if (tn->empty_children == tnode_child_length(tn) - 1)
545 		goto one_child;
546 	/*
547 	 * Double as long as the resulting node has a number of
548 	 * nonempty nodes that are above the threshold.
549 	 */
550 
551 	/*
552 	 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
553 	 * the Helsinki University of Technology and Matti Tikkanen of Nokia
554 	 * Telecommunications, page 6:
555 	 * "A node is doubled if the ratio of non-empty children to all
556 	 * children in the *doubled* node is at least 'high'."
557 	 *
558 	 * 'high' in this instance is the variable 'inflate_threshold'. It
559 	 * is expressed as a percentage, so we multiply it with
560 	 * tnode_child_length() and instead of multiplying by 2 (since the
561 	 * child array will be doubled by inflate()) and multiplying
562 	 * the left-hand side by 100 (to handle the percentage thing) we
563 	 * multiply the left-hand side by 50.
564 	 *
565 	 * The left-hand side may look a bit weird: tnode_child_length(tn)
566 	 * - tn->empty_children is of course the number of non-null children
567 	 * in the current node. tn->full_children is the number of "full"
568 	 * children, that is non-null tnodes with a skip value of 0.
569 	 * All of those will be doubled in the resulting inflated tnode, so
570 	 * we just count them one extra time here.
571 	 *
572 	 * A clearer way to write this would be:
573 	 *
574 	 * to_be_doubled = tn->full_children;
575 	 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
576 	 *     tn->full_children;
577 	 *
578 	 * new_child_length = tnode_child_length(tn) * 2;
579 	 *
580 	 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
581 	 *      new_child_length;
582 	 * if (new_fill_factor >= inflate_threshold)
583 	 *
584 	 * ...and so on, tho it would mess up the while () loop.
585 	 *
586 	 * anyway,
587 	 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
588 	 *      inflate_threshold
589 	 *
590 	 * avoid a division:
591 	 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
592 	 *      inflate_threshold * new_child_length
593 	 *
594 	 * expand not_to_be_doubled and to_be_doubled, and shorten:
595 	 * 100 * (tnode_child_length(tn) - tn->empty_children +
596 	 *    tn->full_children) >= inflate_threshold * new_child_length
597 	 *
598 	 * expand new_child_length:
599 	 * 100 * (tnode_child_length(tn) - tn->empty_children +
600 	 *    tn->full_children) >=
601 	 *      inflate_threshold * tnode_child_length(tn) * 2
602 	 *
603 	 * shorten again:
604 	 * 50 * (tn->full_children + tnode_child_length(tn) -
605 	 *    tn->empty_children) >= inflate_threshold *
606 	 *    tnode_child_length(tn)
607 	 *
608 	 */
609 
610 	check_tnode(tn);
611 
612 	/* Keep root node larger  */
613 
614 	if (!node_parent((struct rt_trie_node *)tn)) {
615 		inflate_threshold_use = inflate_threshold_root;
616 		halve_threshold_use = halve_threshold_root;
617 	} else {
618 		inflate_threshold_use = inflate_threshold;
619 		halve_threshold_use = halve_threshold;
620 	}
621 
622 	max_work = MAX_WORK;
623 	while ((tn->full_children > 0 &&  max_work-- &&
624 		50 * (tn->full_children + tnode_child_length(tn)
625 		      - tn->empty_children)
626 		>= inflate_threshold_use * tnode_child_length(tn))) {
627 
628 		old_tn = tn;
629 		tn = inflate(t, tn);
630 
631 		if (IS_ERR(tn)) {
632 			tn = old_tn;
633 #ifdef CONFIG_IP_FIB_TRIE_STATS
634 			t->stats.resize_node_skipped++;
635 #endif
636 			break;
637 		}
638 	}
639 
640 	check_tnode(tn);
641 
642 	/* Return if at least one inflate is run */
643 	if (max_work != MAX_WORK)
644 		return (struct rt_trie_node *) tn;
645 
646 	/*
647 	 * Halve as long as the number of empty children in this
648 	 * node is above threshold.
649 	 */
650 
651 	max_work = MAX_WORK;
652 	while (tn->bits > 1 &&  max_work-- &&
653 	       100 * (tnode_child_length(tn) - tn->empty_children) <
654 	       halve_threshold_use * tnode_child_length(tn)) {
655 
656 		old_tn = tn;
657 		tn = halve(t, tn);
658 		if (IS_ERR(tn)) {
659 			tn = old_tn;
660 #ifdef CONFIG_IP_FIB_TRIE_STATS
661 			t->stats.resize_node_skipped++;
662 #endif
663 			break;
664 		}
665 	}
666 
667 
668 	/* Only one child remains */
669 	if (tn->empty_children == tnode_child_length(tn) - 1) {
670 one_child:
671 		for (i = 0; i < tnode_child_length(tn); i++) {
672 			struct rt_trie_node *n;
673 
674 			n = rtnl_dereference(tn->child[i]);
675 			if (!n)
676 				continue;
677 
678 			/* compress one level */
679 
680 			node_set_parent(n, NULL);
681 			tnode_free_safe(tn);
682 			return n;
683 		}
684 	}
685 	return (struct rt_trie_node *) tn;
686 }
687 
688 
689 static void tnode_clean_free(struct tnode *tn)
690 {
691 	int i;
692 	struct tnode *tofree;
693 
694 	for (i = 0; i < tnode_child_length(tn); i++) {
695 		tofree = (struct tnode *)rtnl_dereference(tn->child[i]);
696 		if (tofree)
697 			tnode_free(tofree);
698 	}
699 	tnode_free(tn);
700 }
701 
702 static struct tnode *inflate(struct trie *t, struct tnode *tn)
703 {
704 	struct tnode *oldtnode = tn;
705 	int olen = tnode_child_length(tn);
706 	int i;
707 
708 	pr_debug("In inflate\n");
709 
710 	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
711 
712 	if (!tn)
713 		return ERR_PTR(-ENOMEM);
714 
715 	/*
716 	 * Preallocate and store tnodes before the actual work so we
717 	 * don't get into an inconsistent state if memory allocation
718 	 * fails. In case of failure we return the oldnode and  inflate
719 	 * of tnode is ignored.
720 	 */
721 
722 	for (i = 0; i < olen; i++) {
723 		struct tnode *inode;
724 
725 		inode = (struct tnode *) tnode_get_child(oldtnode, i);
726 		if (inode &&
727 		    IS_TNODE(inode) &&
728 		    inode->pos == oldtnode->pos + oldtnode->bits &&
729 		    inode->bits > 1) {
730 			struct tnode *left, *right;
731 			t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
732 
733 			left = tnode_new(inode->key&(~m), inode->pos + 1,
734 					 inode->bits - 1);
735 			if (!left)
736 				goto nomem;
737 
738 			right = tnode_new(inode->key|m, inode->pos + 1,
739 					  inode->bits - 1);
740 
741 			if (!right) {
742 				tnode_free(left);
743 				goto nomem;
744 			}
745 
746 			put_child(tn, 2*i, (struct rt_trie_node *) left);
747 			put_child(tn, 2*i+1, (struct rt_trie_node *) right);
748 		}
749 	}
750 
751 	for (i = 0; i < olen; i++) {
752 		struct tnode *inode;
753 		struct rt_trie_node *node = tnode_get_child(oldtnode, i);
754 		struct tnode *left, *right;
755 		int size, j;
756 
757 		/* An empty child */
758 		if (node == NULL)
759 			continue;
760 
761 		/* A leaf or an internal node with skipped bits */
762 
763 		if (IS_LEAF(node) || ((struct tnode *) node)->pos >
764 		   tn->pos + tn->bits - 1) {
765 			if (tkey_extract_bits(node->key,
766 					      oldtnode->pos + oldtnode->bits,
767 					      1) == 0)
768 				put_child(tn, 2*i, node);
769 			else
770 				put_child(tn, 2*i+1, node);
771 			continue;
772 		}
773 
774 		/* An internal node with two children */
775 		inode = (struct tnode *) node;
776 
777 		if (inode->bits == 1) {
778 			put_child(tn, 2*i, rtnl_dereference(inode->child[0]));
779 			put_child(tn, 2*i+1, rtnl_dereference(inode->child[1]));
780 
781 			tnode_free_safe(inode);
782 			continue;
783 		}
784 
785 		/* An internal node with more than two children */
786 
787 		/* We will replace this node 'inode' with two new
788 		 * ones, 'left' and 'right', each with half of the
789 		 * original children. The two new nodes will have
790 		 * a position one bit further down the key and this
791 		 * means that the "significant" part of their keys
792 		 * (see the discussion near the top of this file)
793 		 * will differ by one bit, which will be "0" in
794 		 * left's key and "1" in right's key. Since we are
795 		 * moving the key position by one step, the bit that
796 		 * we are moving away from - the bit at position
797 		 * (inode->pos) - is the one that will differ between
798 		 * left and right. So... we synthesize that bit in the
799 		 * two  new keys.
800 		 * The mask 'm' below will be a single "one" bit at
801 		 * the position (inode->pos)
802 		 */
803 
804 		/* Use the old key, but set the new significant
805 		 *   bit to zero.
806 		 */
807 
808 		left = (struct tnode *) tnode_get_child(tn, 2*i);
809 		put_child(tn, 2*i, NULL);
810 
811 		BUG_ON(!left);
812 
813 		right = (struct tnode *) tnode_get_child(tn, 2*i+1);
814 		put_child(tn, 2*i+1, NULL);
815 
816 		BUG_ON(!right);
817 
818 		size = tnode_child_length(left);
819 		for (j = 0; j < size; j++) {
820 			put_child(left, j, rtnl_dereference(inode->child[j]));
821 			put_child(right, j, rtnl_dereference(inode->child[j + size]));
822 		}
823 		put_child(tn, 2*i, resize(t, left));
824 		put_child(tn, 2*i+1, resize(t, right));
825 
826 		tnode_free_safe(inode);
827 	}
828 	tnode_free_safe(oldtnode);
829 	return tn;
830 nomem:
831 	tnode_clean_free(tn);
832 	return ERR_PTR(-ENOMEM);
833 }
834 
835 static struct tnode *halve(struct trie *t, struct tnode *tn)
836 {
837 	struct tnode *oldtnode = tn;
838 	struct rt_trie_node *left, *right;
839 	int i;
840 	int olen = tnode_child_length(tn);
841 
842 	pr_debug("In halve\n");
843 
844 	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
845 
846 	if (!tn)
847 		return ERR_PTR(-ENOMEM);
848 
849 	/*
850 	 * Preallocate and store tnodes before the actual work so we
851 	 * don't get into an inconsistent state if memory allocation
852 	 * fails. In case of failure we return the oldnode and halve
853 	 * of tnode is ignored.
854 	 */
855 
856 	for (i = 0; i < olen; i += 2) {
857 		left = tnode_get_child(oldtnode, i);
858 		right = tnode_get_child(oldtnode, i+1);
859 
860 		/* Two nonempty children */
861 		if (left && right) {
862 			struct tnode *newn;
863 
864 			newn = tnode_new(left->key, tn->pos + tn->bits, 1);
865 
866 			if (!newn)
867 				goto nomem;
868 
869 			put_child(tn, i/2, (struct rt_trie_node *)newn);
870 		}
871 
872 	}
873 
874 	for (i = 0; i < olen; i += 2) {
875 		struct tnode *newBinNode;
876 
877 		left = tnode_get_child(oldtnode, i);
878 		right = tnode_get_child(oldtnode, i+1);
879 
880 		/* At least one of the children is empty */
881 		if (left == NULL) {
882 			if (right == NULL)    /* Both are empty */
883 				continue;
884 			put_child(tn, i/2, right);
885 			continue;
886 		}
887 
888 		if (right == NULL) {
889 			put_child(tn, i/2, left);
890 			continue;
891 		}
892 
893 		/* Two nonempty children */
894 		newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
895 		put_child(tn, i/2, NULL);
896 		put_child(newBinNode, 0, left);
897 		put_child(newBinNode, 1, right);
898 		put_child(tn, i/2, resize(t, newBinNode));
899 	}
900 	tnode_free_safe(oldtnode);
901 	return tn;
902 nomem:
903 	tnode_clean_free(tn);
904 	return ERR_PTR(-ENOMEM);
905 }
906 
907 /* readside must use rcu_read_lock currently dump routines
908  via get_fa_head and dump */
909 
910 static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
911 {
912 	struct hlist_head *head = &l->list;
913 	struct leaf_info *li;
914 
915 	hlist_for_each_entry_rcu(li, head, hlist)
916 		if (li->plen == plen)
917 			return li;
918 
919 	return NULL;
920 }
921 
922 static inline struct list_head *get_fa_head(struct leaf *l, int plen)
923 {
924 	struct leaf_info *li = find_leaf_info(l, plen);
925 
926 	if (!li)
927 		return NULL;
928 
929 	return &li->falh;
930 }
931 
932 static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
933 {
934 	struct leaf_info *li = NULL, *last = NULL;
935 
936 	if (hlist_empty(head)) {
937 		hlist_add_head_rcu(&new->hlist, head);
938 	} else {
939 		hlist_for_each_entry(li, head, hlist) {
940 			if (new->plen > li->plen)
941 				break;
942 
943 			last = li;
944 		}
945 		if (last)
946 			hlist_add_after_rcu(&last->hlist, &new->hlist);
947 		else
948 			hlist_add_before_rcu(&new->hlist, &li->hlist);
949 	}
950 }
951 
952 /* rcu_read_lock needs to be hold by caller from readside */
953 
954 static struct leaf *
955 fib_find_node(struct trie *t, u32 key)
956 {
957 	int pos;
958 	struct tnode *tn;
959 	struct rt_trie_node *n;
960 
961 	pos = 0;
962 	n = rcu_dereference_rtnl(t->trie);
963 
964 	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
965 		tn = (struct tnode *) n;
966 
967 		check_tnode(tn);
968 
969 		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
970 			pos = tn->pos + tn->bits;
971 			n = tnode_get_child_rcu(tn,
972 						tkey_extract_bits(key,
973 								  tn->pos,
974 								  tn->bits));
975 		} else
976 			break;
977 	}
978 	/* Case we have found a leaf. Compare prefixes */
979 
980 	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
981 		return (struct leaf *)n;
982 
983 	return NULL;
984 }
985 
986 static void trie_rebalance(struct trie *t, struct tnode *tn)
987 {
988 	int wasfull;
989 	t_key cindex, key;
990 	struct tnode *tp;
991 
992 	key = tn->key;
993 
994 	while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) {
995 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
996 		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
997 		tn = (struct tnode *)resize(t, tn);
998 
999 		tnode_put_child_reorg(tp, cindex,
1000 				      (struct rt_trie_node *)tn, wasfull);
1001 
1002 		tp = node_parent((struct rt_trie_node *) tn);
1003 		if (!tp)
1004 			rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1005 
1006 		tnode_free_flush();
1007 		if (!tp)
1008 			break;
1009 		tn = tp;
1010 	}
1011 
1012 	/* Handle last (top) tnode */
1013 	if (IS_TNODE(tn))
1014 		tn = (struct tnode *)resize(t, tn);
1015 
1016 	rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1017 	tnode_free_flush();
1018 }
1019 
1020 /* only used from updater-side */
1021 
1022 static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1023 {
1024 	int pos, newpos;
1025 	struct tnode *tp = NULL, *tn = NULL;
1026 	struct rt_trie_node *n;
1027 	struct leaf *l;
1028 	int missbit;
1029 	struct list_head *fa_head = NULL;
1030 	struct leaf_info *li;
1031 	t_key cindex;
1032 
1033 	pos = 0;
1034 	n = rtnl_dereference(t->trie);
1035 
1036 	/* If we point to NULL, stop. Either the tree is empty and we should
1037 	 * just put a new leaf in if, or we have reached an empty child slot,
1038 	 * and we should just put our new leaf in that.
1039 	 * If we point to a T_TNODE, check if it matches our key. Note that
1040 	 * a T_TNODE might be skipping any number of bits - its 'pos' need
1041 	 * not be the parent's 'pos'+'bits'!
1042 	 *
1043 	 * If it does match the current key, get pos/bits from it, extract
1044 	 * the index from our key, push the T_TNODE and walk the tree.
1045 	 *
1046 	 * If it doesn't, we have to replace it with a new T_TNODE.
1047 	 *
1048 	 * If we point to a T_LEAF, it might or might not have the same key
1049 	 * as we do. If it does, just change the value, update the T_LEAF's
1050 	 * value, and return it.
1051 	 * If it doesn't, we need to replace it with a T_TNODE.
1052 	 */
1053 
1054 	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
1055 		tn = (struct tnode *) n;
1056 
1057 		check_tnode(tn);
1058 
1059 		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
1060 			tp = tn;
1061 			pos = tn->pos + tn->bits;
1062 			n = tnode_get_child(tn,
1063 					    tkey_extract_bits(key,
1064 							      tn->pos,
1065 							      tn->bits));
1066 
1067 			BUG_ON(n && node_parent(n) != tn);
1068 		} else
1069 			break;
1070 	}
1071 
1072 	/*
1073 	 * n  ----> NULL, LEAF or TNODE
1074 	 *
1075 	 * tp is n's (parent) ----> NULL or TNODE
1076 	 */
1077 
1078 	BUG_ON(tp && IS_LEAF(tp));
1079 
1080 	/* Case 1: n is a leaf. Compare prefixes */
1081 
1082 	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1083 		l = (struct leaf *) n;
1084 		li = leaf_info_new(plen);
1085 
1086 		if (!li)
1087 			return NULL;
1088 
1089 		fa_head = &li->falh;
1090 		insert_leaf_info(&l->list, li);
1091 		goto done;
1092 	}
1093 	l = leaf_new();
1094 
1095 	if (!l)
1096 		return NULL;
1097 
1098 	l->key = key;
1099 	li = leaf_info_new(plen);
1100 
1101 	if (!li) {
1102 		free_leaf(l);
1103 		return NULL;
1104 	}
1105 
1106 	fa_head = &li->falh;
1107 	insert_leaf_info(&l->list, li);
1108 
1109 	if (t->trie && n == NULL) {
1110 		/* Case 2: n is NULL, and will just insert a new leaf */
1111 
1112 		node_set_parent((struct rt_trie_node *)l, tp);
1113 
1114 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1115 		put_child(tp, cindex, (struct rt_trie_node *)l);
1116 	} else {
1117 		/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1118 		/*
1119 		 *  Add a new tnode here
1120 		 *  first tnode need some special handling
1121 		 */
1122 
1123 		if (tp)
1124 			pos = tp->pos+tp->bits;
1125 		else
1126 			pos = 0;
1127 
1128 		if (n) {
1129 			newpos = tkey_mismatch(key, pos, n->key);
1130 			tn = tnode_new(n->key, newpos, 1);
1131 		} else {
1132 			newpos = 0;
1133 			tn = tnode_new(key, newpos, 1); /* First tnode */
1134 		}
1135 
1136 		if (!tn) {
1137 			free_leaf_info(li);
1138 			free_leaf(l);
1139 			return NULL;
1140 		}
1141 
1142 		node_set_parent((struct rt_trie_node *)tn, tp);
1143 
1144 		missbit = tkey_extract_bits(key, newpos, 1);
1145 		put_child(tn, missbit, (struct rt_trie_node *)l);
1146 		put_child(tn, 1-missbit, n);
1147 
1148 		if (tp) {
1149 			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1150 			put_child(tp, cindex, (struct rt_trie_node *)tn);
1151 		} else {
1152 			rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1153 			tp = tn;
1154 		}
1155 	}
1156 
1157 	if (tp && tp->pos + tp->bits > 32)
1158 		pr_warn("fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1159 			tp, tp->pos, tp->bits, key, plen);
1160 
1161 	/* Rebalance the trie */
1162 
1163 	trie_rebalance(t, tp);
1164 done:
1165 	return fa_head;
1166 }
1167 
1168 /*
1169  * Caller must hold RTNL.
1170  */
1171 int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
1172 {
1173 	struct trie *t = (struct trie *) tb->tb_data;
1174 	struct fib_alias *fa, *new_fa;
1175 	struct list_head *fa_head = NULL;
1176 	struct fib_info *fi;
1177 	int plen = cfg->fc_dst_len;
1178 	u8 tos = cfg->fc_tos;
1179 	u32 key, mask;
1180 	int err;
1181 	struct leaf *l;
1182 
1183 	if (plen > 32)
1184 		return -EINVAL;
1185 
1186 	key = ntohl(cfg->fc_dst);
1187 
1188 	pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
1189 
1190 	mask = ntohl(inet_make_mask(plen));
1191 
1192 	if (key & ~mask)
1193 		return -EINVAL;
1194 
1195 	key = key & mask;
1196 
1197 	fi = fib_create_info(cfg);
1198 	if (IS_ERR(fi)) {
1199 		err = PTR_ERR(fi);
1200 		goto err;
1201 	}
1202 
1203 	l = fib_find_node(t, key);
1204 	fa = NULL;
1205 
1206 	if (l) {
1207 		fa_head = get_fa_head(l, plen);
1208 		fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1209 	}
1210 
1211 	/* Now fa, if non-NULL, points to the first fib alias
1212 	 * with the same keys [prefix,tos,priority], if such key already
1213 	 * exists or to the node before which we will insert new one.
1214 	 *
1215 	 * If fa is NULL, we will need to allocate a new one and
1216 	 * insert to the head of f.
1217 	 *
1218 	 * If f is NULL, no fib node matched the destination key
1219 	 * and we need to allocate a new one of those as well.
1220 	 */
1221 
1222 	if (fa && fa->fa_tos == tos &&
1223 	    fa->fa_info->fib_priority == fi->fib_priority) {
1224 		struct fib_alias *fa_first, *fa_match;
1225 
1226 		err = -EEXIST;
1227 		if (cfg->fc_nlflags & NLM_F_EXCL)
1228 			goto out;
1229 
1230 		/* We have 2 goals:
1231 		 * 1. Find exact match for type, scope, fib_info to avoid
1232 		 * duplicate routes
1233 		 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
1234 		 */
1235 		fa_match = NULL;
1236 		fa_first = fa;
1237 		fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1238 		list_for_each_entry_continue(fa, fa_head, fa_list) {
1239 			if (fa->fa_tos != tos)
1240 				break;
1241 			if (fa->fa_info->fib_priority != fi->fib_priority)
1242 				break;
1243 			if (fa->fa_type == cfg->fc_type &&
1244 			    fa->fa_info == fi) {
1245 				fa_match = fa;
1246 				break;
1247 			}
1248 		}
1249 
1250 		if (cfg->fc_nlflags & NLM_F_REPLACE) {
1251 			struct fib_info *fi_drop;
1252 			u8 state;
1253 
1254 			fa = fa_first;
1255 			if (fa_match) {
1256 				if (fa == fa_match)
1257 					err = 0;
1258 				goto out;
1259 			}
1260 			err = -ENOBUFS;
1261 			new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1262 			if (new_fa == NULL)
1263 				goto out;
1264 
1265 			fi_drop = fa->fa_info;
1266 			new_fa->fa_tos = fa->fa_tos;
1267 			new_fa->fa_info = fi;
1268 			new_fa->fa_type = cfg->fc_type;
1269 			state = fa->fa_state;
1270 			new_fa->fa_state = state & ~FA_S_ACCESSED;
1271 
1272 			list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1273 			alias_free_mem_rcu(fa);
1274 
1275 			fib_release_info(fi_drop);
1276 			if (state & FA_S_ACCESSED)
1277 				rt_cache_flush(cfg->fc_nlinfo.nl_net);
1278 			rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1279 				tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
1280 
1281 			goto succeeded;
1282 		}
1283 		/* Error if we find a perfect match which
1284 		 * uses the same scope, type, and nexthop
1285 		 * information.
1286 		 */
1287 		if (fa_match)
1288 			goto out;
1289 
1290 		if (!(cfg->fc_nlflags & NLM_F_APPEND))
1291 			fa = fa_first;
1292 	}
1293 	err = -ENOENT;
1294 	if (!(cfg->fc_nlflags & NLM_F_CREATE))
1295 		goto out;
1296 
1297 	err = -ENOBUFS;
1298 	new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1299 	if (new_fa == NULL)
1300 		goto out;
1301 
1302 	new_fa->fa_info = fi;
1303 	new_fa->fa_tos = tos;
1304 	new_fa->fa_type = cfg->fc_type;
1305 	new_fa->fa_state = 0;
1306 	/*
1307 	 * Insert new entry to the list.
1308 	 */
1309 
1310 	if (!fa_head) {
1311 		fa_head = fib_insert_node(t, key, plen);
1312 		if (unlikely(!fa_head)) {
1313 			err = -ENOMEM;
1314 			goto out_free_new_fa;
1315 		}
1316 	}
1317 
1318 	if (!plen)
1319 		tb->tb_num_default++;
1320 
1321 	list_add_tail_rcu(&new_fa->fa_list,
1322 			  (fa ? &fa->fa_list : fa_head));
1323 
1324 	rt_cache_flush(cfg->fc_nlinfo.nl_net);
1325 	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
1326 		  &cfg->fc_nlinfo, 0);
1327 succeeded:
1328 	return 0;
1329 
1330 out_free_new_fa:
1331 	kmem_cache_free(fn_alias_kmem, new_fa);
1332 out:
1333 	fib_release_info(fi);
1334 err:
1335 	return err;
1336 }
1337 
1338 /* should be called with rcu_read_lock */
1339 static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l,
1340 		      t_key key,  const struct flowi4 *flp,
1341 		      struct fib_result *res, int fib_flags)
1342 {
1343 	struct leaf_info *li;
1344 	struct hlist_head *hhead = &l->list;
1345 
1346 	hlist_for_each_entry_rcu(li, hhead, hlist) {
1347 		struct fib_alias *fa;
1348 
1349 		if (l->key != (key & li->mask_plen))
1350 			continue;
1351 
1352 		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
1353 			struct fib_info *fi = fa->fa_info;
1354 			int nhsel, err;
1355 
1356 			if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
1357 				continue;
1358 			if (fi->fib_dead)
1359 				continue;
1360 			if (fa->fa_info->fib_scope < flp->flowi4_scope)
1361 				continue;
1362 			fib_alias_accessed(fa);
1363 			err = fib_props[fa->fa_type].error;
1364 			if (err) {
1365 #ifdef CONFIG_IP_FIB_TRIE_STATS
1366 				t->stats.semantic_match_passed++;
1367 #endif
1368 				return err;
1369 			}
1370 			if (fi->fib_flags & RTNH_F_DEAD)
1371 				continue;
1372 			for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
1373 				const struct fib_nh *nh = &fi->fib_nh[nhsel];
1374 
1375 				if (nh->nh_flags & RTNH_F_DEAD)
1376 					continue;
1377 				if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
1378 					continue;
1379 
1380 #ifdef CONFIG_IP_FIB_TRIE_STATS
1381 				t->stats.semantic_match_passed++;
1382 #endif
1383 				res->prefixlen = li->plen;
1384 				res->nh_sel = nhsel;
1385 				res->type = fa->fa_type;
1386 				res->scope = fa->fa_info->fib_scope;
1387 				res->fi = fi;
1388 				res->table = tb;
1389 				res->fa_head = &li->falh;
1390 				if (!(fib_flags & FIB_LOOKUP_NOREF))
1391 					atomic_inc(&fi->fib_clntref);
1392 				return 0;
1393 			}
1394 		}
1395 
1396 #ifdef CONFIG_IP_FIB_TRIE_STATS
1397 		t->stats.semantic_match_miss++;
1398 #endif
1399 	}
1400 
1401 	return 1;
1402 }
1403 
1404 int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
1405 		     struct fib_result *res, int fib_flags)
1406 {
1407 	struct trie *t = (struct trie *) tb->tb_data;
1408 	int ret;
1409 	struct rt_trie_node *n;
1410 	struct tnode *pn;
1411 	unsigned int pos, bits;
1412 	t_key key = ntohl(flp->daddr);
1413 	unsigned int chopped_off;
1414 	t_key cindex = 0;
1415 	unsigned int current_prefix_length = KEYLENGTH;
1416 	struct tnode *cn;
1417 	t_key pref_mismatch;
1418 
1419 	rcu_read_lock();
1420 
1421 	n = rcu_dereference(t->trie);
1422 	if (!n)
1423 		goto failed;
1424 
1425 #ifdef CONFIG_IP_FIB_TRIE_STATS
1426 	t->stats.gets++;
1427 #endif
1428 
1429 	/* Just a leaf? */
1430 	if (IS_LEAF(n)) {
1431 		ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
1432 		goto found;
1433 	}
1434 
1435 	pn = (struct tnode *) n;
1436 	chopped_off = 0;
1437 
1438 	while (pn) {
1439 		pos = pn->pos;
1440 		bits = pn->bits;
1441 
1442 		if (!chopped_off)
1443 			cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
1444 						   pos, bits);
1445 
1446 		n = tnode_get_child_rcu(pn, cindex);
1447 
1448 		if (n == NULL) {
1449 #ifdef CONFIG_IP_FIB_TRIE_STATS
1450 			t->stats.null_node_hit++;
1451 #endif
1452 			goto backtrace;
1453 		}
1454 
1455 		if (IS_LEAF(n)) {
1456 			ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
1457 			if (ret > 0)
1458 				goto backtrace;
1459 			goto found;
1460 		}
1461 
1462 		cn = (struct tnode *)n;
1463 
1464 		/*
1465 		 * It's a tnode, and we can do some extra checks here if we
1466 		 * like, to avoid descending into a dead-end branch.
1467 		 * This tnode is in the parent's child array at index
1468 		 * key[p_pos..p_pos+p_bits] but potentially with some bits
1469 		 * chopped off, so in reality the index may be just a
1470 		 * subprefix, padded with zero at the end.
1471 		 * We can also take a look at any skipped bits in this
1472 		 * tnode - everything up to p_pos is supposed to be ok,
1473 		 * and the non-chopped bits of the index (se previous
1474 		 * paragraph) are also guaranteed ok, but the rest is
1475 		 * considered unknown.
1476 		 *
1477 		 * The skipped bits are key[pos+bits..cn->pos].
1478 		 */
1479 
1480 		/* If current_prefix_length < pos+bits, we are already doing
1481 		 * actual prefix  matching, which means everything from
1482 		 * pos+(bits-chopped_off) onward must be zero along some
1483 		 * branch of this subtree - otherwise there is *no* valid
1484 		 * prefix present. Here we can only check the skipped
1485 		 * bits. Remember, since we have already indexed into the
1486 		 * parent's child array, we know that the bits we chopped of
1487 		 * *are* zero.
1488 		 */
1489 
1490 		/* NOTA BENE: Checking only skipped bits
1491 		   for the new node here */
1492 
1493 		if (current_prefix_length < pos+bits) {
1494 			if (tkey_extract_bits(cn->key, current_prefix_length,
1495 						cn->pos - current_prefix_length)
1496 			    || !(cn->child[0]))
1497 				goto backtrace;
1498 		}
1499 
1500 		/*
1501 		 * If chopped_off=0, the index is fully validated and we
1502 		 * only need to look at the skipped bits for this, the new,
1503 		 * tnode. What we actually want to do is to find out if
1504 		 * these skipped bits match our key perfectly, or if we will
1505 		 * have to count on finding a matching prefix further down,
1506 		 * because if we do, we would like to have some way of
1507 		 * verifying the existence of such a prefix at this point.
1508 		 */
1509 
1510 		/* The only thing we can do at this point is to verify that
1511 		 * any such matching prefix can indeed be a prefix to our
1512 		 * key, and if the bits in the node we are inspecting that
1513 		 * do not match our key are not ZERO, this cannot be true.
1514 		 * Thus, find out where there is a mismatch (before cn->pos)
1515 		 * and verify that all the mismatching bits are zero in the
1516 		 * new tnode's key.
1517 		 */
1518 
1519 		/*
1520 		 * Note: We aren't very concerned about the piece of
1521 		 * the key that precede pn->pos+pn->bits, since these
1522 		 * have already been checked. The bits after cn->pos
1523 		 * aren't checked since these are by definition
1524 		 * "unknown" at this point. Thus, what we want to see
1525 		 * is if we are about to enter the "prefix matching"
1526 		 * state, and in that case verify that the skipped
1527 		 * bits that will prevail throughout this subtree are
1528 		 * zero, as they have to be if we are to find a
1529 		 * matching prefix.
1530 		 */
1531 
1532 		pref_mismatch = mask_pfx(cn->key ^ key, cn->pos);
1533 
1534 		/*
1535 		 * In short: If skipped bits in this node do not match
1536 		 * the search key, enter the "prefix matching"
1537 		 * state.directly.
1538 		 */
1539 		if (pref_mismatch) {
1540 			/* fls(x) = __fls(x) + 1 */
1541 			int mp = KEYLENGTH - __fls(pref_mismatch) - 1;
1542 
1543 			if (tkey_extract_bits(cn->key, mp, cn->pos - mp) != 0)
1544 				goto backtrace;
1545 
1546 			if (current_prefix_length >= cn->pos)
1547 				current_prefix_length = mp;
1548 		}
1549 
1550 		pn = (struct tnode *)n; /* Descend */
1551 		chopped_off = 0;
1552 		continue;
1553 
1554 backtrace:
1555 		chopped_off++;
1556 
1557 		/* As zero don't change the child key (cindex) */
1558 		while ((chopped_off <= pn->bits)
1559 		       && !(cindex & (1<<(chopped_off-1))))
1560 			chopped_off++;
1561 
1562 		/* Decrease current_... with bits chopped off */
1563 		if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1564 			current_prefix_length = pn->pos + pn->bits
1565 				- chopped_off;
1566 
1567 		/*
1568 		 * Either we do the actual chop off according or if we have
1569 		 * chopped off all bits in this tnode walk up to our parent.
1570 		 */
1571 
1572 		if (chopped_off <= pn->bits) {
1573 			cindex &= ~(1 << (chopped_off-1));
1574 		} else {
1575 			struct tnode *parent = node_parent_rcu((struct rt_trie_node *) pn);
1576 			if (!parent)
1577 				goto failed;
1578 
1579 			/* Get Child's index */
1580 			cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
1581 			pn = parent;
1582 			chopped_off = 0;
1583 
1584 #ifdef CONFIG_IP_FIB_TRIE_STATS
1585 			t->stats.backtrack++;
1586 #endif
1587 			goto backtrace;
1588 		}
1589 	}
1590 failed:
1591 	ret = 1;
1592 found:
1593 	rcu_read_unlock();
1594 	return ret;
1595 }
1596 EXPORT_SYMBOL_GPL(fib_table_lookup);
1597 
1598 /*
1599  * Remove the leaf and return parent.
1600  */
1601 static void trie_leaf_remove(struct trie *t, struct leaf *l)
1602 {
1603 	struct tnode *tp = node_parent((struct rt_trie_node *) l);
1604 
1605 	pr_debug("entering trie_leaf_remove(%p)\n", l);
1606 
1607 	if (tp) {
1608 		t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
1609 		put_child(tp, cindex, NULL);
1610 		trie_rebalance(t, tp);
1611 	} else
1612 		RCU_INIT_POINTER(t->trie, NULL);
1613 
1614 	free_leaf(l);
1615 }
1616 
1617 /*
1618  * Caller must hold RTNL.
1619  */
1620 int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
1621 {
1622 	struct trie *t = (struct trie *) tb->tb_data;
1623 	u32 key, mask;
1624 	int plen = cfg->fc_dst_len;
1625 	u8 tos = cfg->fc_tos;
1626 	struct fib_alias *fa, *fa_to_delete;
1627 	struct list_head *fa_head;
1628 	struct leaf *l;
1629 	struct leaf_info *li;
1630 
1631 	if (plen > 32)
1632 		return -EINVAL;
1633 
1634 	key = ntohl(cfg->fc_dst);
1635 	mask = ntohl(inet_make_mask(plen));
1636 
1637 	if (key & ~mask)
1638 		return -EINVAL;
1639 
1640 	key = key & mask;
1641 	l = fib_find_node(t, key);
1642 
1643 	if (!l)
1644 		return -ESRCH;
1645 
1646 	li = find_leaf_info(l, plen);
1647 
1648 	if (!li)
1649 		return -ESRCH;
1650 
1651 	fa_head = &li->falh;
1652 	fa = fib_find_alias(fa_head, tos, 0);
1653 
1654 	if (!fa)
1655 		return -ESRCH;
1656 
1657 	pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1658 
1659 	fa_to_delete = NULL;
1660 	fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1661 	list_for_each_entry_continue(fa, fa_head, fa_list) {
1662 		struct fib_info *fi = fa->fa_info;
1663 
1664 		if (fa->fa_tos != tos)
1665 			break;
1666 
1667 		if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1668 		    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1669 		     fa->fa_info->fib_scope == cfg->fc_scope) &&
1670 		    (!cfg->fc_prefsrc ||
1671 		     fi->fib_prefsrc == cfg->fc_prefsrc) &&
1672 		    (!cfg->fc_protocol ||
1673 		     fi->fib_protocol == cfg->fc_protocol) &&
1674 		    fib_nh_match(cfg, fi) == 0) {
1675 			fa_to_delete = fa;
1676 			break;
1677 		}
1678 	}
1679 
1680 	if (!fa_to_delete)
1681 		return -ESRCH;
1682 
1683 	fa = fa_to_delete;
1684 	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
1685 		  &cfg->fc_nlinfo, 0);
1686 
1687 	list_del_rcu(&fa->fa_list);
1688 
1689 	if (!plen)
1690 		tb->tb_num_default--;
1691 
1692 	if (list_empty(fa_head)) {
1693 		hlist_del_rcu(&li->hlist);
1694 		free_leaf_info(li);
1695 	}
1696 
1697 	if (hlist_empty(&l->list))
1698 		trie_leaf_remove(t, l);
1699 
1700 	if (fa->fa_state & FA_S_ACCESSED)
1701 		rt_cache_flush(cfg->fc_nlinfo.nl_net);
1702 
1703 	fib_release_info(fa->fa_info);
1704 	alias_free_mem_rcu(fa);
1705 	return 0;
1706 }
1707 
1708 static int trie_flush_list(struct list_head *head)
1709 {
1710 	struct fib_alias *fa, *fa_node;
1711 	int found = 0;
1712 
1713 	list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1714 		struct fib_info *fi = fa->fa_info;
1715 
1716 		if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1717 			list_del_rcu(&fa->fa_list);
1718 			fib_release_info(fa->fa_info);
1719 			alias_free_mem_rcu(fa);
1720 			found++;
1721 		}
1722 	}
1723 	return found;
1724 }
1725 
1726 static int trie_flush_leaf(struct leaf *l)
1727 {
1728 	int found = 0;
1729 	struct hlist_head *lih = &l->list;
1730 	struct hlist_node *tmp;
1731 	struct leaf_info *li = NULL;
1732 
1733 	hlist_for_each_entry_safe(li, tmp, lih, hlist) {
1734 		found += trie_flush_list(&li->falh);
1735 
1736 		if (list_empty(&li->falh)) {
1737 			hlist_del_rcu(&li->hlist);
1738 			free_leaf_info(li);
1739 		}
1740 	}
1741 	return found;
1742 }
1743 
1744 /*
1745  * Scan for the next right leaf starting at node p->child[idx]
1746  * Since we have back pointer, no recursion necessary.
1747  */
1748 static struct leaf *leaf_walk_rcu(struct tnode *p, struct rt_trie_node *c)
1749 {
1750 	do {
1751 		t_key idx;
1752 
1753 		if (c)
1754 			idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
1755 		else
1756 			idx = 0;
1757 
1758 		while (idx < 1u << p->bits) {
1759 			c = tnode_get_child_rcu(p, idx++);
1760 			if (!c)
1761 				continue;
1762 
1763 			if (IS_LEAF(c))
1764 				return (struct leaf *) c;
1765 
1766 			/* Rescan start scanning in new node */
1767 			p = (struct tnode *) c;
1768 			idx = 0;
1769 		}
1770 
1771 		/* Node empty, walk back up to parent */
1772 		c = (struct rt_trie_node *) p;
1773 	} while ((p = node_parent_rcu(c)) != NULL);
1774 
1775 	return NULL; /* Root of trie */
1776 }
1777 
1778 static struct leaf *trie_firstleaf(struct trie *t)
1779 {
1780 	struct tnode *n = (struct tnode *)rcu_dereference_rtnl(t->trie);
1781 
1782 	if (!n)
1783 		return NULL;
1784 
1785 	if (IS_LEAF(n))          /* trie is just a leaf */
1786 		return (struct leaf *) n;
1787 
1788 	return leaf_walk_rcu(n, NULL);
1789 }
1790 
1791 static struct leaf *trie_nextleaf(struct leaf *l)
1792 {
1793 	struct rt_trie_node *c = (struct rt_trie_node *) l;
1794 	struct tnode *p = node_parent_rcu(c);
1795 
1796 	if (!p)
1797 		return NULL;	/* trie with just one leaf */
1798 
1799 	return leaf_walk_rcu(p, c);
1800 }
1801 
1802 static struct leaf *trie_leafindex(struct trie *t, int index)
1803 {
1804 	struct leaf *l = trie_firstleaf(t);
1805 
1806 	while (l && index-- > 0)
1807 		l = trie_nextleaf(l);
1808 
1809 	return l;
1810 }
1811 
1812 
1813 /*
1814  * Caller must hold RTNL.
1815  */
1816 int fib_table_flush(struct fib_table *tb)
1817 {
1818 	struct trie *t = (struct trie *) tb->tb_data;
1819 	struct leaf *l, *ll = NULL;
1820 	int found = 0;
1821 
1822 	for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
1823 		found += trie_flush_leaf(l);
1824 
1825 		if (ll && hlist_empty(&ll->list))
1826 			trie_leaf_remove(t, ll);
1827 		ll = l;
1828 	}
1829 
1830 	if (ll && hlist_empty(&ll->list))
1831 		trie_leaf_remove(t, ll);
1832 
1833 	pr_debug("trie_flush found=%d\n", found);
1834 	return found;
1835 }
1836 
1837 void fib_free_table(struct fib_table *tb)
1838 {
1839 	kfree(tb);
1840 }
1841 
1842 static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
1843 			   struct fib_table *tb,
1844 			   struct sk_buff *skb, struct netlink_callback *cb)
1845 {
1846 	int i, s_i;
1847 	struct fib_alias *fa;
1848 	__be32 xkey = htonl(key);
1849 
1850 	s_i = cb->args[5];
1851 	i = 0;
1852 
1853 	/* rcu_read_lock is hold by caller */
1854 
1855 	list_for_each_entry_rcu(fa, fah, fa_list) {
1856 		if (i < s_i) {
1857 			i++;
1858 			continue;
1859 		}
1860 
1861 		if (fib_dump_info(skb, NETLINK_CB(cb->skb).portid,
1862 				  cb->nlh->nlmsg_seq,
1863 				  RTM_NEWROUTE,
1864 				  tb->tb_id,
1865 				  fa->fa_type,
1866 				  xkey,
1867 				  plen,
1868 				  fa->fa_tos,
1869 				  fa->fa_info, NLM_F_MULTI) < 0) {
1870 			cb->args[5] = i;
1871 			return -1;
1872 		}
1873 		i++;
1874 	}
1875 	cb->args[5] = i;
1876 	return skb->len;
1877 }
1878 
1879 static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
1880 			struct sk_buff *skb, struct netlink_callback *cb)
1881 {
1882 	struct leaf_info *li;
1883 	int i, s_i;
1884 
1885 	s_i = cb->args[4];
1886 	i = 0;
1887 
1888 	/* rcu_read_lock is hold by caller */
1889 	hlist_for_each_entry_rcu(li, &l->list, hlist) {
1890 		if (i < s_i) {
1891 			i++;
1892 			continue;
1893 		}
1894 
1895 		if (i > s_i)
1896 			cb->args[5] = 0;
1897 
1898 		if (list_empty(&li->falh))
1899 			continue;
1900 
1901 		if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
1902 			cb->args[4] = i;
1903 			return -1;
1904 		}
1905 		i++;
1906 	}
1907 
1908 	cb->args[4] = i;
1909 	return skb->len;
1910 }
1911 
1912 int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
1913 		   struct netlink_callback *cb)
1914 {
1915 	struct leaf *l;
1916 	struct trie *t = (struct trie *) tb->tb_data;
1917 	t_key key = cb->args[2];
1918 	int count = cb->args[3];
1919 
1920 	rcu_read_lock();
1921 	/* Dump starting at last key.
1922 	 * Note: 0.0.0.0/0 (ie default) is first key.
1923 	 */
1924 	if (count == 0)
1925 		l = trie_firstleaf(t);
1926 	else {
1927 		/* Normally, continue from last key, but if that is missing
1928 		 * fallback to using slow rescan
1929 		 */
1930 		l = fib_find_node(t, key);
1931 		if (!l)
1932 			l = trie_leafindex(t, count);
1933 	}
1934 
1935 	while (l) {
1936 		cb->args[2] = l->key;
1937 		if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
1938 			cb->args[3] = count;
1939 			rcu_read_unlock();
1940 			return -1;
1941 		}
1942 
1943 		++count;
1944 		l = trie_nextleaf(l);
1945 		memset(&cb->args[4], 0,
1946 		       sizeof(cb->args) - 4*sizeof(cb->args[0]));
1947 	}
1948 	cb->args[3] = count;
1949 	rcu_read_unlock();
1950 
1951 	return skb->len;
1952 }
1953 
1954 void __init fib_trie_init(void)
1955 {
1956 	fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1957 					  sizeof(struct fib_alias),
1958 					  0, SLAB_PANIC, NULL);
1959 
1960 	trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
1961 					   max(sizeof(struct leaf),
1962 					       sizeof(struct leaf_info)),
1963 					   0, SLAB_PANIC, NULL);
1964 }
1965 
1966 
1967 struct fib_table *fib_trie_table(u32 id)
1968 {
1969 	struct fib_table *tb;
1970 	struct trie *t;
1971 
1972 	tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
1973 		     GFP_KERNEL);
1974 	if (tb == NULL)
1975 		return NULL;
1976 
1977 	tb->tb_id = id;
1978 	tb->tb_default = -1;
1979 	tb->tb_num_default = 0;
1980 
1981 	t = (struct trie *) tb->tb_data;
1982 	memset(t, 0, sizeof(*t));
1983 
1984 	return tb;
1985 }
1986 
1987 #ifdef CONFIG_PROC_FS
1988 /* Depth first Trie walk iterator */
1989 struct fib_trie_iter {
1990 	struct seq_net_private p;
1991 	struct fib_table *tb;
1992 	struct tnode *tnode;
1993 	unsigned int index;
1994 	unsigned int depth;
1995 };
1996 
1997 static struct rt_trie_node *fib_trie_get_next(struct fib_trie_iter *iter)
1998 {
1999 	struct tnode *tn = iter->tnode;
2000 	unsigned int cindex = iter->index;
2001 	struct tnode *p;
2002 
2003 	/* A single entry routing table */
2004 	if (!tn)
2005 		return NULL;
2006 
2007 	pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
2008 		 iter->tnode, iter->index, iter->depth);
2009 rescan:
2010 	while (cindex < (1<<tn->bits)) {
2011 		struct rt_trie_node *n = tnode_get_child_rcu(tn, cindex);
2012 
2013 		if (n) {
2014 			if (IS_LEAF(n)) {
2015 				iter->tnode = tn;
2016 				iter->index = cindex + 1;
2017 			} else {
2018 				/* push down one level */
2019 				iter->tnode = (struct tnode *) n;
2020 				iter->index = 0;
2021 				++iter->depth;
2022 			}
2023 			return n;
2024 		}
2025 
2026 		++cindex;
2027 	}
2028 
2029 	/* Current node exhausted, pop back up */
2030 	p = node_parent_rcu((struct rt_trie_node *)tn);
2031 	if (p) {
2032 		cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
2033 		tn = p;
2034 		--iter->depth;
2035 		goto rescan;
2036 	}
2037 
2038 	/* got root? */
2039 	return NULL;
2040 }
2041 
2042 static struct rt_trie_node *fib_trie_get_first(struct fib_trie_iter *iter,
2043 				       struct trie *t)
2044 {
2045 	struct rt_trie_node *n;
2046 
2047 	if (!t)
2048 		return NULL;
2049 
2050 	n = rcu_dereference(t->trie);
2051 	if (!n)
2052 		return NULL;
2053 
2054 	if (IS_TNODE(n)) {
2055 		iter->tnode = (struct tnode *) n;
2056 		iter->index = 0;
2057 		iter->depth = 1;
2058 	} else {
2059 		iter->tnode = NULL;
2060 		iter->index = 0;
2061 		iter->depth = 0;
2062 	}
2063 
2064 	return n;
2065 }
2066 
2067 static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2068 {
2069 	struct rt_trie_node *n;
2070 	struct fib_trie_iter iter;
2071 
2072 	memset(s, 0, sizeof(*s));
2073 
2074 	rcu_read_lock();
2075 	for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
2076 		if (IS_LEAF(n)) {
2077 			struct leaf *l = (struct leaf *)n;
2078 			struct leaf_info *li;
2079 
2080 			s->leaves++;
2081 			s->totdepth += iter.depth;
2082 			if (iter.depth > s->maxdepth)
2083 				s->maxdepth = iter.depth;
2084 
2085 			hlist_for_each_entry_rcu(li, &l->list, hlist)
2086 				++s->prefixes;
2087 		} else {
2088 			const struct tnode *tn = (const struct tnode *) n;
2089 			int i;
2090 
2091 			s->tnodes++;
2092 			if (tn->bits < MAX_STAT_DEPTH)
2093 				s->nodesizes[tn->bits]++;
2094 
2095 			for (i = 0; i < (1<<tn->bits); i++)
2096 				if (!tn->child[i])
2097 					s->nullpointers++;
2098 		}
2099 	}
2100 	rcu_read_unlock();
2101 }
2102 
2103 /*
2104  *	This outputs /proc/net/fib_triestats
2105  */
2106 static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2107 {
2108 	unsigned int i, max, pointers, bytes, avdepth;
2109 
2110 	if (stat->leaves)
2111 		avdepth = stat->totdepth*100 / stat->leaves;
2112 	else
2113 		avdepth = 0;
2114 
2115 	seq_printf(seq, "\tAver depth:     %u.%02d\n",
2116 		   avdepth / 100, avdepth % 100);
2117 	seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);
2118 
2119 	seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
2120 	bytes = sizeof(struct leaf) * stat->leaves;
2121 
2122 	seq_printf(seq, "\tPrefixes:       %u\n", stat->prefixes);
2123 	bytes += sizeof(struct leaf_info) * stat->prefixes;
2124 
2125 	seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
2126 	bytes += sizeof(struct tnode) * stat->tnodes;
2127 
2128 	max = MAX_STAT_DEPTH;
2129 	while (max > 0 && stat->nodesizes[max-1] == 0)
2130 		max--;
2131 
2132 	pointers = 0;
2133 	for (i = 1; i < max; i++)
2134 		if (stat->nodesizes[i] != 0) {
2135 			seq_printf(seq, "  %u: %u",  i, stat->nodesizes[i]);
2136 			pointers += (1<<i) * stat->nodesizes[i];
2137 		}
2138 	seq_putc(seq, '\n');
2139 	seq_printf(seq, "\tPointers: %u\n", pointers);
2140 
2141 	bytes += sizeof(struct rt_trie_node *) * pointers;
2142 	seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2143 	seq_printf(seq, "Total size: %u  kB\n", (bytes + 1023) / 1024);
2144 }
2145 
2146 #ifdef CONFIG_IP_FIB_TRIE_STATS
2147 static void trie_show_usage(struct seq_file *seq,
2148 			    const struct trie_use_stats *stats)
2149 {
2150 	seq_printf(seq, "\nCounters:\n---------\n");
2151 	seq_printf(seq, "gets = %u\n", stats->gets);
2152 	seq_printf(seq, "backtracks = %u\n", stats->backtrack);
2153 	seq_printf(seq, "semantic match passed = %u\n",
2154 		   stats->semantic_match_passed);
2155 	seq_printf(seq, "semantic match miss = %u\n",
2156 		   stats->semantic_match_miss);
2157 	seq_printf(seq, "null node hit= %u\n", stats->null_node_hit);
2158 	seq_printf(seq, "skipped node resize = %u\n\n",
2159 		   stats->resize_node_skipped);
2160 }
2161 #endif /*  CONFIG_IP_FIB_TRIE_STATS */
2162 
2163 static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
2164 {
2165 	if (tb->tb_id == RT_TABLE_LOCAL)
2166 		seq_puts(seq, "Local:\n");
2167 	else if (tb->tb_id == RT_TABLE_MAIN)
2168 		seq_puts(seq, "Main:\n");
2169 	else
2170 		seq_printf(seq, "Id %d:\n", tb->tb_id);
2171 }
2172 
2173 
2174 static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2175 {
2176 	struct net *net = (struct net *)seq->private;
2177 	unsigned int h;
2178 
2179 	seq_printf(seq,
2180 		   "Basic info: size of leaf:"
2181 		   " %Zd bytes, size of tnode: %Zd bytes.\n",
2182 		   sizeof(struct leaf), sizeof(struct tnode));
2183 
2184 	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2185 		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2186 		struct fib_table *tb;
2187 
2188 		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
2189 			struct trie *t = (struct trie *) tb->tb_data;
2190 			struct trie_stat stat;
2191 
2192 			if (!t)
2193 				continue;
2194 
2195 			fib_table_print(seq, tb);
2196 
2197 			trie_collect_stats(t, &stat);
2198 			trie_show_stats(seq, &stat);
2199 #ifdef CONFIG_IP_FIB_TRIE_STATS
2200 			trie_show_usage(seq, &t->stats);
2201 #endif
2202 		}
2203 	}
2204 
2205 	return 0;
2206 }
2207 
2208 static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2209 {
2210 	return single_open_net(inode, file, fib_triestat_seq_show);
2211 }
2212 
2213 static const struct file_operations fib_triestat_fops = {
2214 	.owner	= THIS_MODULE,
2215 	.open	= fib_triestat_seq_open,
2216 	.read	= seq_read,
2217 	.llseek	= seq_lseek,
2218 	.release = single_release_net,
2219 };
2220 
2221 static struct rt_trie_node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
2222 {
2223 	struct fib_trie_iter *iter = seq->private;
2224 	struct net *net = seq_file_net(seq);
2225 	loff_t idx = 0;
2226 	unsigned int h;
2227 
2228 	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2229 		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2230 		struct fib_table *tb;
2231 
2232 		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
2233 			struct rt_trie_node *n;
2234 
2235 			for (n = fib_trie_get_first(iter,
2236 						    (struct trie *) tb->tb_data);
2237 			     n; n = fib_trie_get_next(iter))
2238 				if (pos == idx++) {
2239 					iter->tb = tb;
2240 					return n;
2241 				}
2242 		}
2243 	}
2244 
2245 	return NULL;
2246 }
2247 
2248 static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2249 	__acquires(RCU)
2250 {
2251 	rcu_read_lock();
2252 	return fib_trie_get_idx(seq, *pos);
2253 }
2254 
2255 static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2256 {
2257 	struct fib_trie_iter *iter = seq->private;
2258 	struct net *net = seq_file_net(seq);
2259 	struct fib_table *tb = iter->tb;
2260 	struct hlist_node *tb_node;
2261 	unsigned int h;
2262 	struct rt_trie_node *n;
2263 
2264 	++*pos;
2265 	/* next node in same table */
2266 	n = fib_trie_get_next(iter);
2267 	if (n)
2268 		return n;
2269 
2270 	/* walk rest of this hash chain */
2271 	h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
2272 	while ((tb_node = rcu_dereference(hlist_next_rcu(&tb->tb_hlist)))) {
2273 		tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
2274 		n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2275 		if (n)
2276 			goto found;
2277 	}
2278 
2279 	/* new hash chain */
2280 	while (++h < FIB_TABLE_HASHSZ) {
2281 		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2282 		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
2283 			n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2284 			if (n)
2285 				goto found;
2286 		}
2287 	}
2288 	return NULL;
2289 
2290 found:
2291 	iter->tb = tb;
2292 	return n;
2293 }
2294 
2295 static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2296 	__releases(RCU)
2297 {
2298 	rcu_read_unlock();
2299 }
2300 
2301 static void seq_indent(struct seq_file *seq, int n)
2302 {
2303 	while (n-- > 0)
2304 		seq_puts(seq, "   ");
2305 }
2306 
2307 static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
2308 {
2309 	switch (s) {
2310 	case RT_SCOPE_UNIVERSE: return "universe";
2311 	case RT_SCOPE_SITE:	return "site";
2312 	case RT_SCOPE_LINK:	return "link";
2313 	case RT_SCOPE_HOST:	return "host";
2314 	case RT_SCOPE_NOWHERE:	return "nowhere";
2315 	default:
2316 		snprintf(buf, len, "scope=%d", s);
2317 		return buf;
2318 	}
2319 }
2320 
2321 static const char *const rtn_type_names[__RTN_MAX] = {
2322 	[RTN_UNSPEC] = "UNSPEC",
2323 	[RTN_UNICAST] = "UNICAST",
2324 	[RTN_LOCAL] = "LOCAL",
2325 	[RTN_BROADCAST] = "BROADCAST",
2326 	[RTN_ANYCAST] = "ANYCAST",
2327 	[RTN_MULTICAST] = "MULTICAST",
2328 	[RTN_BLACKHOLE] = "BLACKHOLE",
2329 	[RTN_UNREACHABLE] = "UNREACHABLE",
2330 	[RTN_PROHIBIT] = "PROHIBIT",
2331 	[RTN_THROW] = "THROW",
2332 	[RTN_NAT] = "NAT",
2333 	[RTN_XRESOLVE] = "XRESOLVE",
2334 };
2335 
2336 static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
2337 {
2338 	if (t < __RTN_MAX && rtn_type_names[t])
2339 		return rtn_type_names[t];
2340 	snprintf(buf, len, "type %u", t);
2341 	return buf;
2342 }
2343 
2344 /* Pretty print the trie */
2345 static int fib_trie_seq_show(struct seq_file *seq, void *v)
2346 {
2347 	const struct fib_trie_iter *iter = seq->private;
2348 	struct rt_trie_node *n = v;
2349 
2350 	if (!node_parent_rcu(n))
2351 		fib_table_print(seq, iter->tb);
2352 
2353 	if (IS_TNODE(n)) {
2354 		struct tnode *tn = (struct tnode *) n;
2355 		__be32 prf = htonl(mask_pfx(tn->key, tn->pos));
2356 
2357 		seq_indent(seq, iter->depth-1);
2358 		seq_printf(seq, "  +-- %pI4/%d %d %d %d\n",
2359 			   &prf, tn->pos, tn->bits, tn->full_children,
2360 			   tn->empty_children);
2361 
2362 	} else {
2363 		struct leaf *l = (struct leaf *) n;
2364 		struct leaf_info *li;
2365 		__be32 val = htonl(l->key);
2366 
2367 		seq_indent(seq, iter->depth);
2368 		seq_printf(seq, "  |-- %pI4\n", &val);
2369 
2370 		hlist_for_each_entry_rcu(li, &l->list, hlist) {
2371 			struct fib_alias *fa;
2372 
2373 			list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2374 				char buf1[32], buf2[32];
2375 
2376 				seq_indent(seq, iter->depth+1);
2377 				seq_printf(seq, "  /%d %s %s", li->plen,
2378 					   rtn_scope(buf1, sizeof(buf1),
2379 						     fa->fa_info->fib_scope),
2380 					   rtn_type(buf2, sizeof(buf2),
2381 						    fa->fa_type));
2382 				if (fa->fa_tos)
2383 					seq_printf(seq, " tos=%d", fa->fa_tos);
2384 				seq_putc(seq, '\n');
2385 			}
2386 		}
2387 	}
2388 
2389 	return 0;
2390 }
2391 
2392 static const struct seq_operations fib_trie_seq_ops = {
2393 	.start  = fib_trie_seq_start,
2394 	.next   = fib_trie_seq_next,
2395 	.stop   = fib_trie_seq_stop,
2396 	.show   = fib_trie_seq_show,
2397 };
2398 
2399 static int fib_trie_seq_open(struct inode *inode, struct file *file)
2400 {
2401 	return seq_open_net(inode, file, &fib_trie_seq_ops,
2402 			    sizeof(struct fib_trie_iter));
2403 }
2404 
2405 static const struct file_operations fib_trie_fops = {
2406 	.owner  = THIS_MODULE,
2407 	.open   = fib_trie_seq_open,
2408 	.read   = seq_read,
2409 	.llseek = seq_lseek,
2410 	.release = seq_release_net,
2411 };
2412 
2413 struct fib_route_iter {
2414 	struct seq_net_private p;
2415 	struct trie *main_trie;
2416 	loff_t	pos;
2417 	t_key	key;
2418 };
2419 
2420 static struct leaf *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
2421 {
2422 	struct leaf *l = NULL;
2423 	struct trie *t = iter->main_trie;
2424 
2425 	/* use cache location of last found key */
2426 	if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
2427 		pos -= iter->pos;
2428 	else {
2429 		iter->pos = 0;
2430 		l = trie_firstleaf(t);
2431 	}
2432 
2433 	while (l && pos-- > 0) {
2434 		iter->pos++;
2435 		l = trie_nextleaf(l);
2436 	}
2437 
2438 	if (l)
2439 		iter->key = pos;	/* remember it */
2440 	else
2441 		iter->pos = 0;		/* forget it */
2442 
2443 	return l;
2444 }
2445 
2446 static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
2447 	__acquires(RCU)
2448 {
2449 	struct fib_route_iter *iter = seq->private;
2450 	struct fib_table *tb;
2451 
2452 	rcu_read_lock();
2453 	tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
2454 	if (!tb)
2455 		return NULL;
2456 
2457 	iter->main_trie = (struct trie *) tb->tb_data;
2458 	if (*pos == 0)
2459 		return SEQ_START_TOKEN;
2460 	else
2461 		return fib_route_get_idx(iter, *pos - 1);
2462 }
2463 
2464 static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2465 {
2466 	struct fib_route_iter *iter = seq->private;
2467 	struct leaf *l = v;
2468 
2469 	++*pos;
2470 	if (v == SEQ_START_TOKEN) {
2471 		iter->pos = 0;
2472 		l = trie_firstleaf(iter->main_trie);
2473 	} else {
2474 		iter->pos++;
2475 		l = trie_nextleaf(l);
2476 	}
2477 
2478 	if (l)
2479 		iter->key = l->key;
2480 	else
2481 		iter->pos = 0;
2482 	return l;
2483 }
2484 
2485 static void fib_route_seq_stop(struct seq_file *seq, void *v)
2486 	__releases(RCU)
2487 {
2488 	rcu_read_unlock();
2489 }
2490 
2491 static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2492 {
2493 	unsigned int flags = 0;
2494 
2495 	if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
2496 		flags = RTF_REJECT;
2497 	if (fi && fi->fib_nh->nh_gw)
2498 		flags |= RTF_GATEWAY;
2499 	if (mask == htonl(0xFFFFFFFF))
2500 		flags |= RTF_HOST;
2501 	flags |= RTF_UP;
2502 	return flags;
2503 }
2504 
2505 /*
2506  *	This outputs /proc/net/route.
2507  *	The format of the file is not supposed to be changed
2508  *	and needs to be same as fib_hash output to avoid breaking
2509  *	legacy utilities
2510  */
2511 static int fib_route_seq_show(struct seq_file *seq, void *v)
2512 {
2513 	struct leaf *l = v;
2514 	struct leaf_info *li;
2515 
2516 	if (v == SEQ_START_TOKEN) {
2517 		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2518 			   "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2519 			   "\tWindow\tIRTT");
2520 		return 0;
2521 	}
2522 
2523 	hlist_for_each_entry_rcu(li, &l->list, hlist) {
2524 		struct fib_alias *fa;
2525 		__be32 mask, prefix;
2526 
2527 		mask = inet_make_mask(li->plen);
2528 		prefix = htonl(l->key);
2529 
2530 		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2531 			const struct fib_info *fi = fa->fa_info;
2532 			unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
2533 			int len;
2534 
2535 			if (fa->fa_type == RTN_BROADCAST
2536 			    || fa->fa_type == RTN_MULTICAST)
2537 				continue;
2538 
2539 			if (fi)
2540 				seq_printf(seq,
2541 					 "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
2542 					 "%d\t%08X\t%d\t%u\t%u%n",
2543 					 fi->fib_dev ? fi->fib_dev->name : "*",
2544 					 prefix,
2545 					 fi->fib_nh->nh_gw, flags, 0, 0,
2546 					 fi->fib_priority,
2547 					 mask,
2548 					 (fi->fib_advmss ?
2549 					  fi->fib_advmss + 40 : 0),
2550 					 fi->fib_window,
2551 					 fi->fib_rtt >> 3, &len);
2552 			else
2553 				seq_printf(seq,
2554 					 "*\t%08X\t%08X\t%04X\t%d\t%u\t"
2555 					 "%d\t%08X\t%d\t%u\t%u%n",
2556 					 prefix, 0, flags, 0, 0, 0,
2557 					 mask, 0, 0, 0, &len);
2558 
2559 			seq_printf(seq, "%*s\n", 127 - len, "");
2560 		}
2561 	}
2562 
2563 	return 0;
2564 }
2565 
2566 static const struct seq_operations fib_route_seq_ops = {
2567 	.start  = fib_route_seq_start,
2568 	.next   = fib_route_seq_next,
2569 	.stop   = fib_route_seq_stop,
2570 	.show   = fib_route_seq_show,
2571 };
2572 
2573 static int fib_route_seq_open(struct inode *inode, struct file *file)
2574 {
2575 	return seq_open_net(inode, file, &fib_route_seq_ops,
2576 			    sizeof(struct fib_route_iter));
2577 }
2578 
2579 static const struct file_operations fib_route_fops = {
2580 	.owner  = THIS_MODULE,
2581 	.open   = fib_route_seq_open,
2582 	.read   = seq_read,
2583 	.llseek = seq_lseek,
2584 	.release = seq_release_net,
2585 };
2586 
2587 int __net_init fib_proc_init(struct net *net)
2588 {
2589 	if (!proc_create("fib_trie", S_IRUGO, net->proc_net, &fib_trie_fops))
2590 		goto out1;
2591 
2592 	if (!proc_create("fib_triestat", S_IRUGO, net->proc_net,
2593 			 &fib_triestat_fops))
2594 		goto out2;
2595 
2596 	if (!proc_create("route", S_IRUGO, net->proc_net, &fib_route_fops))
2597 		goto out3;
2598 
2599 	return 0;
2600 
2601 out3:
2602 	remove_proc_entry("fib_triestat", net->proc_net);
2603 out2:
2604 	remove_proc_entry("fib_trie", net->proc_net);
2605 out1:
2606 	return -ENOMEM;
2607 }
2608 
2609 void __net_exit fib_proc_exit(struct net *net)
2610 {
2611 	remove_proc_entry("fib_trie", net->proc_net);
2612 	remove_proc_entry("fib_triestat", net->proc_net);
2613 	remove_proc_entry("route", net->proc_net);
2614 }
2615 
2616 #endif /* CONFIG_PROC_FS */
2617