xref: /openbmc/linux/net/ipv4/fib_trie.c (revision 0b26ca68)
1  // SPDX-License-Identifier: GPL-2.0-or-later
2  /*
3   *
4   *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
5   *     & Swedish University of Agricultural Sciences.
6   *
7   *   Jens Laas <jens.laas@data.slu.se> Swedish University of
8   *     Agricultural Sciences.
9   *
10   *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
11   *
12   * This work is based on the LPC-trie which is originally described in:
13   *
14   * An experimental study of compression methods for dynamic tries
15   * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
16   * https://www.csc.kth.se/~snilsson/software/dyntrie2/
17   *
18   * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
19   * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
20   *
21   * Code from fib_hash has been reused which includes the following header:
22   *
23   * INET		An implementation of the TCP/IP protocol suite for the LINUX
24   *		operating system.  INET is implemented using the  BSD Socket
25   *		interface as the means of communication with the user level.
26   *
27   *		IPv4 FIB: lookup engine and maintenance routines.
28   *
29   * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
30   *
31   * Substantial contributions to this work comes from:
32   *
33   *		David S. Miller, <davem@davemloft.net>
34   *		Stephen Hemminger <shemminger@osdl.org>
35   *		Paul E. McKenney <paulmck@us.ibm.com>
36   *		Patrick McHardy <kaber@trash.net>
37   */
38  #include <linux/cache.h>
39  #include <linux/uaccess.h>
40  #include <linux/bitops.h>
41  #include <linux/types.h>
42  #include <linux/kernel.h>
43  #include <linux/mm.h>
44  #include <linux/string.h>
45  #include <linux/socket.h>
46  #include <linux/sockios.h>
47  #include <linux/errno.h>
48  #include <linux/in.h>
49  #include <linux/inet.h>
50  #include <linux/inetdevice.h>
51  #include <linux/netdevice.h>
52  #include <linux/if_arp.h>
53  #include <linux/proc_fs.h>
54  #include <linux/rcupdate.h>
55  #include <linux/skbuff.h>
56  #include <linux/netlink.h>
57  #include <linux/init.h>
58  #include <linux/list.h>
59  #include <linux/slab.h>
60  #include <linux/export.h>
61  #include <linux/vmalloc.h>
62  #include <linux/notifier.h>
63  #include <net/net_namespace.h>
64  #include <net/ip.h>
65  #include <net/protocol.h>
66  #include <net/route.h>
67  #include <net/tcp.h>
68  #include <net/sock.h>
69  #include <net/ip_fib.h>
70  #include <net/fib_notifier.h>
71  #include <trace/events/fib.h>
72  #include "fib_lookup.h"
73  
74  static int call_fib_entry_notifier(struct notifier_block *nb,
75  				   enum fib_event_type event_type, u32 dst,
76  				   int dst_len, struct fib_alias *fa,
77  				   struct netlink_ext_ack *extack)
78  {
79  	struct fib_entry_notifier_info info = {
80  		.info.extack = extack,
81  		.dst = dst,
82  		.dst_len = dst_len,
83  		.fi = fa->fa_info,
84  		.tos = fa->fa_tos,
85  		.type = fa->fa_type,
86  		.tb_id = fa->tb_id,
87  	};
88  	return call_fib4_notifier(nb, event_type, &info.info);
89  }
90  
91  static int call_fib_entry_notifiers(struct net *net,
92  				    enum fib_event_type event_type, u32 dst,
93  				    int dst_len, struct fib_alias *fa,
94  				    struct netlink_ext_ack *extack)
95  {
96  	struct fib_entry_notifier_info info = {
97  		.info.extack = extack,
98  		.dst = dst,
99  		.dst_len = dst_len,
100  		.fi = fa->fa_info,
101  		.tos = fa->fa_tos,
102  		.type = fa->fa_type,
103  		.tb_id = fa->tb_id,
104  	};
105  	return call_fib4_notifiers(net, event_type, &info.info);
106  }
107  
108  #define MAX_STAT_DEPTH 32
109  
110  #define KEYLENGTH	(8*sizeof(t_key))
111  #define KEY_MAX		((t_key)~0)
112  
113  typedef unsigned int t_key;
114  
115  #define IS_TRIE(n)	((n)->pos >= KEYLENGTH)
116  #define IS_TNODE(n)	((n)->bits)
117  #define IS_LEAF(n)	(!(n)->bits)
118  
119  struct key_vector {
120  	t_key key;
121  	unsigned char pos;		/* 2log(KEYLENGTH) bits needed */
122  	unsigned char bits;		/* 2log(KEYLENGTH) bits needed */
123  	unsigned char slen;
124  	union {
125  		/* This list pointer if valid if (pos | bits) == 0 (LEAF) */
126  		struct hlist_head leaf;
127  		/* This array is valid if (pos | bits) > 0 (TNODE) */
128  		struct key_vector __rcu *tnode[0];
129  	};
130  };
131  
132  struct tnode {
133  	struct rcu_head rcu;
134  	t_key empty_children;		/* KEYLENGTH bits needed */
135  	t_key full_children;		/* KEYLENGTH bits needed */
136  	struct key_vector __rcu *parent;
137  	struct key_vector kv[1];
138  #define tn_bits kv[0].bits
139  };
140  
141  #define TNODE_SIZE(n)	offsetof(struct tnode, kv[0].tnode[n])
142  #define LEAF_SIZE	TNODE_SIZE(1)
143  
144  #ifdef CONFIG_IP_FIB_TRIE_STATS
145  struct trie_use_stats {
146  	unsigned int gets;
147  	unsigned int backtrack;
148  	unsigned int semantic_match_passed;
149  	unsigned int semantic_match_miss;
150  	unsigned int null_node_hit;
151  	unsigned int resize_node_skipped;
152  };
153  #endif
154  
155  struct trie_stat {
156  	unsigned int totdepth;
157  	unsigned int maxdepth;
158  	unsigned int tnodes;
159  	unsigned int leaves;
160  	unsigned int nullpointers;
161  	unsigned int prefixes;
162  	unsigned int nodesizes[MAX_STAT_DEPTH];
163  };
164  
165  struct trie {
166  	struct key_vector kv[1];
167  #ifdef CONFIG_IP_FIB_TRIE_STATS
168  	struct trie_use_stats __percpu *stats;
169  #endif
170  };
171  
172  static struct key_vector *resize(struct trie *t, struct key_vector *tn);
173  static unsigned int tnode_free_size;
174  
175  /*
176   * synchronize_rcu after call_rcu for outstanding dirty memory; it should be
177   * especially useful before resizing the root node with PREEMPT_NONE configs;
178   * the value was obtained experimentally, aiming to avoid visible slowdown.
179   */
180  unsigned int sysctl_fib_sync_mem = 512 * 1024;
181  unsigned int sysctl_fib_sync_mem_min = 64 * 1024;
182  unsigned int sysctl_fib_sync_mem_max = 64 * 1024 * 1024;
183  
184  static struct kmem_cache *fn_alias_kmem __ro_after_init;
185  static struct kmem_cache *trie_leaf_kmem __ro_after_init;
186  
187  static inline struct tnode *tn_info(struct key_vector *kv)
188  {
189  	return container_of(kv, struct tnode, kv[0]);
190  }
191  
192  /* caller must hold RTNL */
193  #define node_parent(tn) rtnl_dereference(tn_info(tn)->parent)
194  #define get_child(tn, i) rtnl_dereference((tn)->tnode[i])
195  
196  /* caller must hold RCU read lock or RTNL */
197  #define node_parent_rcu(tn) rcu_dereference_rtnl(tn_info(tn)->parent)
198  #define get_child_rcu(tn, i) rcu_dereference_rtnl((tn)->tnode[i])
199  
200  /* wrapper for rcu_assign_pointer */
201  static inline void node_set_parent(struct key_vector *n, struct key_vector *tp)
202  {
203  	if (n)
204  		rcu_assign_pointer(tn_info(n)->parent, tp);
205  }
206  
207  #define NODE_INIT_PARENT(n, p) RCU_INIT_POINTER(tn_info(n)->parent, p)
208  
209  /* This provides us with the number of children in this node, in the case of a
210   * leaf this will return 0 meaning none of the children are accessible.
211   */
212  static inline unsigned long child_length(const struct key_vector *tn)
213  {
214  	return (1ul << tn->bits) & ~(1ul);
215  }
216  
217  #define get_cindex(key, kv) (((key) ^ (kv)->key) >> (kv)->pos)
218  
219  static inline unsigned long get_index(t_key key, struct key_vector *kv)
220  {
221  	unsigned long index = key ^ kv->key;
222  
223  	if ((BITS_PER_LONG <= KEYLENGTH) && (KEYLENGTH == kv->pos))
224  		return 0;
225  
226  	return index >> kv->pos;
227  }
228  
229  /* To understand this stuff, an understanding of keys and all their bits is
230   * necessary. Every node in the trie has a key associated with it, but not
231   * all of the bits in that key are significant.
232   *
233   * Consider a node 'n' and its parent 'tp'.
234   *
235   * If n is a leaf, every bit in its key is significant. Its presence is
236   * necessitated by path compression, since during a tree traversal (when
237   * searching for a leaf - unless we are doing an insertion) we will completely
238   * ignore all skipped bits we encounter. Thus we need to verify, at the end of
239   * a potentially successful search, that we have indeed been walking the
240   * correct key path.
241   *
242   * Note that we can never "miss" the correct key in the tree if present by
243   * following the wrong path. Path compression ensures that segments of the key
244   * that are the same for all keys with a given prefix are skipped, but the
245   * skipped part *is* identical for each node in the subtrie below the skipped
246   * bit! trie_insert() in this implementation takes care of that.
247   *
248   * if n is an internal node - a 'tnode' here, the various parts of its key
249   * have many different meanings.
250   *
251   * Example:
252   * _________________________________________________________________
253   * | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
254   * -----------------------------------------------------------------
255   *  31  30  29  28  27  26  25  24  23  22  21  20  19  18  17  16
256   *
257   * _________________________________________________________________
258   * | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
259   * -----------------------------------------------------------------
260   *  15  14  13  12  11  10   9   8   7   6   5   4   3   2   1   0
261   *
262   * tp->pos = 22
263   * tp->bits = 3
264   * n->pos = 13
265   * n->bits = 4
266   *
267   * First, let's just ignore the bits that come before the parent tp, that is
268   * the bits from (tp->pos + tp->bits) to 31. They are *known* but at this
269   * point we do not use them for anything.
270   *
271   * The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
272   * index into the parent's child array. That is, they will be used to find
273   * 'n' among tp's children.
274   *
275   * The bits from (n->pos + n->bits) to (tp->pos - 1) - "S" - are skipped bits
276   * for the node n.
277   *
278   * All the bits we have seen so far are significant to the node n. The rest
279   * of the bits are really not needed or indeed known in n->key.
280   *
281   * The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
282   * n's child array, and will of course be different for each child.
283   *
284   * The rest of the bits, from 0 to (n->pos -1) - "u" - are completely unknown
285   * at this point.
286   */
287  
288  static const int halve_threshold = 25;
289  static const int inflate_threshold = 50;
290  static const int halve_threshold_root = 15;
291  static const int inflate_threshold_root = 30;
292  
293  static void __alias_free_mem(struct rcu_head *head)
294  {
295  	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
296  	kmem_cache_free(fn_alias_kmem, fa);
297  }
298  
299  static inline void alias_free_mem_rcu(struct fib_alias *fa)
300  {
301  	call_rcu(&fa->rcu, __alias_free_mem);
302  }
303  
304  #define TNODE_VMALLOC_MAX \
305  	ilog2((SIZE_MAX - TNODE_SIZE(0)) / sizeof(struct key_vector *))
306  
307  static void __node_free_rcu(struct rcu_head *head)
308  {
309  	struct tnode *n = container_of(head, struct tnode, rcu);
310  
311  	if (!n->tn_bits)
312  		kmem_cache_free(trie_leaf_kmem, n);
313  	else
314  		kvfree(n);
315  }
316  
317  #define node_free(n) call_rcu(&tn_info(n)->rcu, __node_free_rcu)
318  
319  static struct tnode *tnode_alloc(int bits)
320  {
321  	size_t size;
322  
323  	/* verify bits is within bounds */
324  	if (bits > TNODE_VMALLOC_MAX)
325  		return NULL;
326  
327  	/* determine size and verify it is non-zero and didn't overflow */
328  	size = TNODE_SIZE(1ul << bits);
329  
330  	if (size <= PAGE_SIZE)
331  		return kzalloc(size, GFP_KERNEL);
332  	else
333  		return vzalloc(size);
334  }
335  
336  static inline void empty_child_inc(struct key_vector *n)
337  {
338  	tn_info(n)->empty_children++;
339  
340  	if (!tn_info(n)->empty_children)
341  		tn_info(n)->full_children++;
342  }
343  
344  static inline void empty_child_dec(struct key_vector *n)
345  {
346  	if (!tn_info(n)->empty_children)
347  		tn_info(n)->full_children--;
348  
349  	tn_info(n)->empty_children--;
350  }
351  
352  static struct key_vector *leaf_new(t_key key, struct fib_alias *fa)
353  {
354  	struct key_vector *l;
355  	struct tnode *kv;
356  
357  	kv = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
358  	if (!kv)
359  		return NULL;
360  
361  	/* initialize key vector */
362  	l = kv->kv;
363  	l->key = key;
364  	l->pos = 0;
365  	l->bits = 0;
366  	l->slen = fa->fa_slen;
367  
368  	/* link leaf to fib alias */
369  	INIT_HLIST_HEAD(&l->leaf);
370  	hlist_add_head(&fa->fa_list, &l->leaf);
371  
372  	return l;
373  }
374  
375  static struct key_vector *tnode_new(t_key key, int pos, int bits)
376  {
377  	unsigned int shift = pos + bits;
378  	struct key_vector *tn;
379  	struct tnode *tnode;
380  
381  	/* verify bits and pos their msb bits clear and values are valid */
382  	BUG_ON(!bits || (shift > KEYLENGTH));
383  
384  	tnode = tnode_alloc(bits);
385  	if (!tnode)
386  		return NULL;
387  
388  	pr_debug("AT %p s=%zu %zu\n", tnode, TNODE_SIZE(0),
389  		 sizeof(struct key_vector *) << bits);
390  
391  	if (bits == KEYLENGTH)
392  		tnode->full_children = 1;
393  	else
394  		tnode->empty_children = 1ul << bits;
395  
396  	tn = tnode->kv;
397  	tn->key = (shift < KEYLENGTH) ? (key >> shift) << shift : 0;
398  	tn->pos = pos;
399  	tn->bits = bits;
400  	tn->slen = pos;
401  
402  	return tn;
403  }
404  
405  /* Check whether a tnode 'n' is "full", i.e. it is an internal node
406   * and no bits are skipped. See discussion in dyntree paper p. 6
407   */
408  static inline int tnode_full(struct key_vector *tn, struct key_vector *n)
409  {
410  	return n && ((n->pos + n->bits) == tn->pos) && IS_TNODE(n);
411  }
412  
413  /* Add a child at position i overwriting the old value.
414   * Update the value of full_children and empty_children.
415   */
416  static void put_child(struct key_vector *tn, unsigned long i,
417  		      struct key_vector *n)
418  {
419  	struct key_vector *chi = get_child(tn, i);
420  	int isfull, wasfull;
421  
422  	BUG_ON(i >= child_length(tn));
423  
424  	/* update emptyChildren, overflow into fullChildren */
425  	if (!n && chi)
426  		empty_child_inc(tn);
427  	if (n && !chi)
428  		empty_child_dec(tn);
429  
430  	/* update fullChildren */
431  	wasfull = tnode_full(tn, chi);
432  	isfull = tnode_full(tn, n);
433  
434  	if (wasfull && !isfull)
435  		tn_info(tn)->full_children--;
436  	else if (!wasfull && isfull)
437  		tn_info(tn)->full_children++;
438  
439  	if (n && (tn->slen < n->slen))
440  		tn->slen = n->slen;
441  
442  	rcu_assign_pointer(tn->tnode[i], n);
443  }
444  
445  static void update_children(struct key_vector *tn)
446  {
447  	unsigned long i;
448  
449  	/* update all of the child parent pointers */
450  	for (i = child_length(tn); i;) {
451  		struct key_vector *inode = get_child(tn, --i);
452  
453  		if (!inode)
454  			continue;
455  
456  		/* Either update the children of a tnode that
457  		 * already belongs to us or update the child
458  		 * to point to ourselves.
459  		 */
460  		if (node_parent(inode) == tn)
461  			update_children(inode);
462  		else
463  			node_set_parent(inode, tn);
464  	}
465  }
466  
467  static inline void put_child_root(struct key_vector *tp, t_key key,
468  				  struct key_vector *n)
469  {
470  	if (IS_TRIE(tp))
471  		rcu_assign_pointer(tp->tnode[0], n);
472  	else
473  		put_child(tp, get_index(key, tp), n);
474  }
475  
476  static inline void tnode_free_init(struct key_vector *tn)
477  {
478  	tn_info(tn)->rcu.next = NULL;
479  }
480  
481  static inline void tnode_free_append(struct key_vector *tn,
482  				     struct key_vector *n)
483  {
484  	tn_info(n)->rcu.next = tn_info(tn)->rcu.next;
485  	tn_info(tn)->rcu.next = &tn_info(n)->rcu;
486  }
487  
488  static void tnode_free(struct key_vector *tn)
489  {
490  	struct callback_head *head = &tn_info(tn)->rcu;
491  
492  	while (head) {
493  		head = head->next;
494  		tnode_free_size += TNODE_SIZE(1ul << tn->bits);
495  		node_free(tn);
496  
497  		tn = container_of(head, struct tnode, rcu)->kv;
498  	}
499  
500  	if (tnode_free_size >= sysctl_fib_sync_mem) {
501  		tnode_free_size = 0;
502  		synchronize_rcu();
503  	}
504  }
505  
506  static struct key_vector *replace(struct trie *t,
507  				  struct key_vector *oldtnode,
508  				  struct key_vector *tn)
509  {
510  	struct key_vector *tp = node_parent(oldtnode);
511  	unsigned long i;
512  
513  	/* setup the parent pointer out of and back into this node */
514  	NODE_INIT_PARENT(tn, tp);
515  	put_child_root(tp, tn->key, tn);
516  
517  	/* update all of the child parent pointers */
518  	update_children(tn);
519  
520  	/* all pointers should be clean so we are done */
521  	tnode_free(oldtnode);
522  
523  	/* resize children now that oldtnode is freed */
524  	for (i = child_length(tn); i;) {
525  		struct key_vector *inode = get_child(tn, --i);
526  
527  		/* resize child node */
528  		if (tnode_full(tn, inode))
529  			tn = resize(t, inode);
530  	}
531  
532  	return tp;
533  }
534  
535  static struct key_vector *inflate(struct trie *t,
536  				  struct key_vector *oldtnode)
537  {
538  	struct key_vector *tn;
539  	unsigned long i;
540  	t_key m;
541  
542  	pr_debug("In inflate\n");
543  
544  	tn = tnode_new(oldtnode->key, oldtnode->pos - 1, oldtnode->bits + 1);
545  	if (!tn)
546  		goto notnode;
547  
548  	/* prepare oldtnode to be freed */
549  	tnode_free_init(oldtnode);
550  
551  	/* Assemble all of the pointers in our cluster, in this case that
552  	 * represents all of the pointers out of our allocated nodes that
553  	 * point to existing tnodes and the links between our allocated
554  	 * nodes.
555  	 */
556  	for (i = child_length(oldtnode), m = 1u << tn->pos; i;) {
557  		struct key_vector *inode = get_child(oldtnode, --i);
558  		struct key_vector *node0, *node1;
559  		unsigned long j, k;
560  
561  		/* An empty child */
562  		if (!inode)
563  			continue;
564  
565  		/* A leaf or an internal node with skipped bits */
566  		if (!tnode_full(oldtnode, inode)) {
567  			put_child(tn, get_index(inode->key, tn), inode);
568  			continue;
569  		}
570  
571  		/* drop the node in the old tnode free list */
572  		tnode_free_append(oldtnode, inode);
573  
574  		/* An internal node with two children */
575  		if (inode->bits == 1) {
576  			put_child(tn, 2 * i + 1, get_child(inode, 1));
577  			put_child(tn, 2 * i, get_child(inode, 0));
578  			continue;
579  		}
580  
581  		/* We will replace this node 'inode' with two new
582  		 * ones, 'node0' and 'node1', each with half of the
583  		 * original children. The two new nodes will have
584  		 * a position one bit further down the key and this
585  		 * means that the "significant" part of their keys
586  		 * (see the discussion near the top of this file)
587  		 * will differ by one bit, which will be "0" in
588  		 * node0's key and "1" in node1's key. Since we are
589  		 * moving the key position by one step, the bit that
590  		 * we are moving away from - the bit at position
591  		 * (tn->pos) - is the one that will differ between
592  		 * node0 and node1. So... we synthesize that bit in the
593  		 * two new keys.
594  		 */
595  		node1 = tnode_new(inode->key | m, inode->pos, inode->bits - 1);
596  		if (!node1)
597  			goto nomem;
598  		node0 = tnode_new(inode->key, inode->pos, inode->bits - 1);
599  
600  		tnode_free_append(tn, node1);
601  		if (!node0)
602  			goto nomem;
603  		tnode_free_append(tn, node0);
604  
605  		/* populate child pointers in new nodes */
606  		for (k = child_length(inode), j = k / 2; j;) {
607  			put_child(node1, --j, get_child(inode, --k));
608  			put_child(node0, j, get_child(inode, j));
609  			put_child(node1, --j, get_child(inode, --k));
610  			put_child(node0, j, get_child(inode, j));
611  		}
612  
613  		/* link new nodes to parent */
614  		NODE_INIT_PARENT(node1, tn);
615  		NODE_INIT_PARENT(node0, tn);
616  
617  		/* link parent to nodes */
618  		put_child(tn, 2 * i + 1, node1);
619  		put_child(tn, 2 * i, node0);
620  	}
621  
622  	/* setup the parent pointers into and out of this node */
623  	return replace(t, oldtnode, tn);
624  nomem:
625  	/* all pointers should be clean so we are done */
626  	tnode_free(tn);
627  notnode:
628  	return NULL;
629  }
630  
631  static struct key_vector *halve(struct trie *t,
632  				struct key_vector *oldtnode)
633  {
634  	struct key_vector *tn;
635  	unsigned long i;
636  
637  	pr_debug("In halve\n");
638  
639  	tn = tnode_new(oldtnode->key, oldtnode->pos + 1, oldtnode->bits - 1);
640  	if (!tn)
641  		goto notnode;
642  
643  	/* prepare oldtnode to be freed */
644  	tnode_free_init(oldtnode);
645  
646  	/* Assemble all of the pointers in our cluster, in this case that
647  	 * represents all of the pointers out of our allocated nodes that
648  	 * point to existing tnodes and the links between our allocated
649  	 * nodes.
650  	 */
651  	for (i = child_length(oldtnode); i;) {
652  		struct key_vector *node1 = get_child(oldtnode, --i);
653  		struct key_vector *node0 = get_child(oldtnode, --i);
654  		struct key_vector *inode;
655  
656  		/* At least one of the children is empty */
657  		if (!node1 || !node0) {
658  			put_child(tn, i / 2, node1 ? : node0);
659  			continue;
660  		}
661  
662  		/* Two nonempty children */
663  		inode = tnode_new(node0->key, oldtnode->pos, 1);
664  		if (!inode)
665  			goto nomem;
666  		tnode_free_append(tn, inode);
667  
668  		/* initialize pointers out of node */
669  		put_child(inode, 1, node1);
670  		put_child(inode, 0, node0);
671  		NODE_INIT_PARENT(inode, tn);
672  
673  		/* link parent to node */
674  		put_child(tn, i / 2, inode);
675  	}
676  
677  	/* setup the parent pointers into and out of this node */
678  	return replace(t, oldtnode, tn);
679  nomem:
680  	/* all pointers should be clean so we are done */
681  	tnode_free(tn);
682  notnode:
683  	return NULL;
684  }
685  
686  static struct key_vector *collapse(struct trie *t,
687  				   struct key_vector *oldtnode)
688  {
689  	struct key_vector *n, *tp;
690  	unsigned long i;
691  
692  	/* scan the tnode looking for that one child that might still exist */
693  	for (n = NULL, i = child_length(oldtnode); !n && i;)
694  		n = get_child(oldtnode, --i);
695  
696  	/* compress one level */
697  	tp = node_parent(oldtnode);
698  	put_child_root(tp, oldtnode->key, n);
699  	node_set_parent(n, tp);
700  
701  	/* drop dead node */
702  	node_free(oldtnode);
703  
704  	return tp;
705  }
706  
707  static unsigned char update_suffix(struct key_vector *tn)
708  {
709  	unsigned char slen = tn->pos;
710  	unsigned long stride, i;
711  	unsigned char slen_max;
712  
713  	/* only vector 0 can have a suffix length greater than or equal to
714  	 * tn->pos + tn->bits, the second highest node will have a suffix
715  	 * length at most of tn->pos + tn->bits - 1
716  	 */
717  	slen_max = min_t(unsigned char, tn->pos + tn->bits - 1, tn->slen);
718  
719  	/* search though the list of children looking for nodes that might
720  	 * have a suffix greater than the one we currently have.  This is
721  	 * why we start with a stride of 2 since a stride of 1 would
722  	 * represent the nodes with suffix length equal to tn->pos
723  	 */
724  	for (i = 0, stride = 0x2ul ; i < child_length(tn); i += stride) {
725  		struct key_vector *n = get_child(tn, i);
726  
727  		if (!n || (n->slen <= slen))
728  			continue;
729  
730  		/* update stride and slen based on new value */
731  		stride <<= (n->slen - slen);
732  		slen = n->slen;
733  		i &= ~(stride - 1);
734  
735  		/* stop searching if we have hit the maximum possible value */
736  		if (slen >= slen_max)
737  			break;
738  	}
739  
740  	tn->slen = slen;
741  
742  	return slen;
743  }
744  
745  /* From "Implementing a dynamic compressed trie" by Stefan Nilsson of
746   * the Helsinki University of Technology and Matti Tikkanen of Nokia
747   * Telecommunications, page 6:
748   * "A node is doubled if the ratio of non-empty children to all
749   * children in the *doubled* node is at least 'high'."
750   *
751   * 'high' in this instance is the variable 'inflate_threshold'. It
752   * is expressed as a percentage, so we multiply it with
753   * child_length() and instead of multiplying by 2 (since the
754   * child array will be doubled by inflate()) and multiplying
755   * the left-hand side by 100 (to handle the percentage thing) we
756   * multiply the left-hand side by 50.
757   *
758   * The left-hand side may look a bit weird: child_length(tn)
759   * - tn->empty_children is of course the number of non-null children
760   * in the current node. tn->full_children is the number of "full"
761   * children, that is non-null tnodes with a skip value of 0.
762   * All of those will be doubled in the resulting inflated tnode, so
763   * we just count them one extra time here.
764   *
765   * A clearer way to write this would be:
766   *
767   * to_be_doubled = tn->full_children;
768   * not_to_be_doubled = child_length(tn) - tn->empty_children -
769   *     tn->full_children;
770   *
771   * new_child_length = child_length(tn) * 2;
772   *
773   * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
774   *      new_child_length;
775   * if (new_fill_factor >= inflate_threshold)
776   *
777   * ...and so on, tho it would mess up the while () loop.
778   *
779   * anyway,
780   * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
781   *      inflate_threshold
782   *
783   * avoid a division:
784   * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
785   *      inflate_threshold * new_child_length
786   *
787   * expand not_to_be_doubled and to_be_doubled, and shorten:
788   * 100 * (child_length(tn) - tn->empty_children +
789   *    tn->full_children) >= inflate_threshold * new_child_length
790   *
791   * expand new_child_length:
792   * 100 * (child_length(tn) - tn->empty_children +
793   *    tn->full_children) >=
794   *      inflate_threshold * child_length(tn) * 2
795   *
796   * shorten again:
797   * 50 * (tn->full_children + child_length(tn) -
798   *    tn->empty_children) >= inflate_threshold *
799   *    child_length(tn)
800   *
801   */
802  static inline bool should_inflate(struct key_vector *tp, struct key_vector *tn)
803  {
804  	unsigned long used = child_length(tn);
805  	unsigned long threshold = used;
806  
807  	/* Keep root node larger */
808  	threshold *= IS_TRIE(tp) ? inflate_threshold_root : inflate_threshold;
809  	used -= tn_info(tn)->empty_children;
810  	used += tn_info(tn)->full_children;
811  
812  	/* if bits == KEYLENGTH then pos = 0, and will fail below */
813  
814  	return (used > 1) && tn->pos && ((50 * used) >= threshold);
815  }
816  
817  static inline bool should_halve(struct key_vector *tp, struct key_vector *tn)
818  {
819  	unsigned long used = child_length(tn);
820  	unsigned long threshold = used;
821  
822  	/* Keep root node larger */
823  	threshold *= IS_TRIE(tp) ? halve_threshold_root : halve_threshold;
824  	used -= tn_info(tn)->empty_children;
825  
826  	/* if bits == KEYLENGTH then used = 100% on wrap, and will fail below */
827  
828  	return (used > 1) && (tn->bits > 1) && ((100 * used) < threshold);
829  }
830  
831  static inline bool should_collapse(struct key_vector *tn)
832  {
833  	unsigned long used = child_length(tn);
834  
835  	used -= tn_info(tn)->empty_children;
836  
837  	/* account for bits == KEYLENGTH case */
838  	if ((tn->bits == KEYLENGTH) && tn_info(tn)->full_children)
839  		used -= KEY_MAX;
840  
841  	/* One child or none, time to drop us from the trie */
842  	return used < 2;
843  }
844  
845  #define MAX_WORK 10
846  static struct key_vector *resize(struct trie *t, struct key_vector *tn)
847  {
848  #ifdef CONFIG_IP_FIB_TRIE_STATS
849  	struct trie_use_stats __percpu *stats = t->stats;
850  #endif
851  	struct key_vector *tp = node_parent(tn);
852  	unsigned long cindex = get_index(tn->key, tp);
853  	int max_work = MAX_WORK;
854  
855  	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
856  		 tn, inflate_threshold, halve_threshold);
857  
858  	/* track the tnode via the pointer from the parent instead of
859  	 * doing it ourselves.  This way we can let RCU fully do its
860  	 * thing without us interfering
861  	 */
862  	BUG_ON(tn != get_child(tp, cindex));
863  
864  	/* Double as long as the resulting node has a number of
865  	 * nonempty nodes that are above the threshold.
866  	 */
867  	while (should_inflate(tp, tn) && max_work) {
868  		tp = inflate(t, tn);
869  		if (!tp) {
870  #ifdef CONFIG_IP_FIB_TRIE_STATS
871  			this_cpu_inc(stats->resize_node_skipped);
872  #endif
873  			break;
874  		}
875  
876  		max_work--;
877  		tn = get_child(tp, cindex);
878  	}
879  
880  	/* update parent in case inflate failed */
881  	tp = node_parent(tn);
882  
883  	/* Return if at least one inflate is run */
884  	if (max_work != MAX_WORK)
885  		return tp;
886  
887  	/* Halve as long as the number of empty children in this
888  	 * node is above threshold.
889  	 */
890  	while (should_halve(tp, tn) && max_work) {
891  		tp = halve(t, tn);
892  		if (!tp) {
893  #ifdef CONFIG_IP_FIB_TRIE_STATS
894  			this_cpu_inc(stats->resize_node_skipped);
895  #endif
896  			break;
897  		}
898  
899  		max_work--;
900  		tn = get_child(tp, cindex);
901  	}
902  
903  	/* Only one child remains */
904  	if (should_collapse(tn))
905  		return collapse(t, tn);
906  
907  	/* update parent in case halve failed */
908  	return node_parent(tn);
909  }
910  
911  static void node_pull_suffix(struct key_vector *tn, unsigned char slen)
912  {
913  	unsigned char node_slen = tn->slen;
914  
915  	while ((node_slen > tn->pos) && (node_slen > slen)) {
916  		slen = update_suffix(tn);
917  		if (node_slen == slen)
918  			break;
919  
920  		tn = node_parent(tn);
921  		node_slen = tn->slen;
922  	}
923  }
924  
925  static void node_push_suffix(struct key_vector *tn, unsigned char slen)
926  {
927  	while (tn->slen < slen) {
928  		tn->slen = slen;
929  		tn = node_parent(tn);
930  	}
931  }
932  
933  /* rcu_read_lock needs to be hold by caller from readside */
934  static struct key_vector *fib_find_node(struct trie *t,
935  					struct key_vector **tp, u32 key)
936  {
937  	struct key_vector *pn, *n = t->kv;
938  	unsigned long index = 0;
939  
940  	do {
941  		pn = n;
942  		n = get_child_rcu(n, index);
943  
944  		if (!n)
945  			break;
946  
947  		index = get_cindex(key, n);
948  
949  		/* This bit of code is a bit tricky but it combines multiple
950  		 * checks into a single check.  The prefix consists of the
951  		 * prefix plus zeros for the bits in the cindex. The index
952  		 * is the difference between the key and this value.  From
953  		 * this we can actually derive several pieces of data.
954  		 *   if (index >= (1ul << bits))
955  		 *     we have a mismatch in skip bits and failed
956  		 *   else
957  		 *     we know the value is cindex
958  		 *
959  		 * This check is safe even if bits == KEYLENGTH due to the
960  		 * fact that we can only allocate a node with 32 bits if a
961  		 * long is greater than 32 bits.
962  		 */
963  		if (index >= (1ul << n->bits)) {
964  			n = NULL;
965  			break;
966  		}
967  
968  		/* keep searching until we find a perfect match leaf or NULL */
969  	} while (IS_TNODE(n));
970  
971  	*tp = pn;
972  
973  	return n;
974  }
975  
976  /* Return the first fib alias matching TOS with
977   * priority less than or equal to PRIO.
978   * If 'find_first' is set, return the first matching
979   * fib alias, regardless of TOS and priority.
980   */
981  static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen,
982  					u8 tos, u32 prio, u32 tb_id,
983  					bool find_first)
984  {
985  	struct fib_alias *fa;
986  
987  	if (!fah)
988  		return NULL;
989  
990  	hlist_for_each_entry(fa, fah, fa_list) {
991  		if (fa->fa_slen < slen)
992  			continue;
993  		if (fa->fa_slen != slen)
994  			break;
995  		if (fa->tb_id > tb_id)
996  			continue;
997  		if (fa->tb_id != tb_id)
998  			break;
999  		if (find_first)
1000  			return fa;
1001  		if (fa->fa_tos > tos)
1002  			continue;
1003  		if (fa->fa_info->fib_priority >= prio || fa->fa_tos < tos)
1004  			return fa;
1005  	}
1006  
1007  	return NULL;
1008  }
1009  
1010  static struct fib_alias *
1011  fib_find_matching_alias(struct net *net, const struct fib_rt_info *fri)
1012  {
1013  	u8 slen = KEYLENGTH - fri->dst_len;
1014  	struct key_vector *l, *tp;
1015  	struct fib_table *tb;
1016  	struct fib_alias *fa;
1017  	struct trie *t;
1018  
1019  	tb = fib_get_table(net, fri->tb_id);
1020  	if (!tb)
1021  		return NULL;
1022  
1023  	t = (struct trie *)tb->tb_data;
1024  	l = fib_find_node(t, &tp, be32_to_cpu(fri->dst));
1025  	if (!l)
1026  		return NULL;
1027  
1028  	hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
1029  		if (fa->fa_slen == slen && fa->tb_id == fri->tb_id &&
1030  		    fa->fa_tos == fri->tos && fa->fa_info == fri->fi &&
1031  		    fa->fa_type == fri->type)
1032  			return fa;
1033  	}
1034  
1035  	return NULL;
1036  }
1037  
1038  void fib_alias_hw_flags_set(struct net *net, const struct fib_rt_info *fri)
1039  {
1040  	struct fib_alias *fa_match;
1041  
1042  	rcu_read_lock();
1043  
1044  	fa_match = fib_find_matching_alias(net, fri);
1045  	if (!fa_match)
1046  		goto out;
1047  
1048  	fa_match->offload = fri->offload;
1049  	fa_match->trap = fri->trap;
1050  
1051  out:
1052  	rcu_read_unlock();
1053  }
1054  EXPORT_SYMBOL_GPL(fib_alias_hw_flags_set);
1055  
1056  static void trie_rebalance(struct trie *t, struct key_vector *tn)
1057  {
1058  	while (!IS_TRIE(tn))
1059  		tn = resize(t, tn);
1060  }
1061  
1062  static int fib_insert_node(struct trie *t, struct key_vector *tp,
1063  			   struct fib_alias *new, t_key key)
1064  {
1065  	struct key_vector *n, *l;
1066  
1067  	l = leaf_new(key, new);
1068  	if (!l)
1069  		goto noleaf;
1070  
1071  	/* retrieve child from parent node */
1072  	n = get_child(tp, get_index(key, tp));
1073  
1074  	/* Case 2: n is a LEAF or a TNODE and the key doesn't match.
1075  	 *
1076  	 *  Add a new tnode here
1077  	 *  first tnode need some special handling
1078  	 *  leaves us in position for handling as case 3
1079  	 */
1080  	if (n) {
1081  		struct key_vector *tn;
1082  
1083  		tn = tnode_new(key, __fls(key ^ n->key), 1);
1084  		if (!tn)
1085  			goto notnode;
1086  
1087  		/* initialize routes out of node */
1088  		NODE_INIT_PARENT(tn, tp);
1089  		put_child(tn, get_index(key, tn) ^ 1, n);
1090  
1091  		/* start adding routes into the node */
1092  		put_child_root(tp, key, tn);
1093  		node_set_parent(n, tn);
1094  
1095  		/* parent now has a NULL spot where the leaf can go */
1096  		tp = tn;
1097  	}
1098  
1099  	/* Case 3: n is NULL, and will just insert a new leaf */
1100  	node_push_suffix(tp, new->fa_slen);
1101  	NODE_INIT_PARENT(l, tp);
1102  	put_child_root(tp, key, l);
1103  	trie_rebalance(t, tp);
1104  
1105  	return 0;
1106  notnode:
1107  	node_free(l);
1108  noleaf:
1109  	return -ENOMEM;
1110  }
1111  
1112  static int fib_insert_alias(struct trie *t, struct key_vector *tp,
1113  			    struct key_vector *l, struct fib_alias *new,
1114  			    struct fib_alias *fa, t_key key)
1115  {
1116  	if (!l)
1117  		return fib_insert_node(t, tp, new, key);
1118  
1119  	if (fa) {
1120  		hlist_add_before_rcu(&new->fa_list, &fa->fa_list);
1121  	} else {
1122  		struct fib_alias *last;
1123  
1124  		hlist_for_each_entry(last, &l->leaf, fa_list) {
1125  			if (new->fa_slen < last->fa_slen)
1126  				break;
1127  			if ((new->fa_slen == last->fa_slen) &&
1128  			    (new->tb_id > last->tb_id))
1129  				break;
1130  			fa = last;
1131  		}
1132  
1133  		if (fa)
1134  			hlist_add_behind_rcu(&new->fa_list, &fa->fa_list);
1135  		else
1136  			hlist_add_head_rcu(&new->fa_list, &l->leaf);
1137  	}
1138  
1139  	/* if we added to the tail node then we need to update slen */
1140  	if (l->slen < new->fa_slen) {
1141  		l->slen = new->fa_slen;
1142  		node_push_suffix(tp, new->fa_slen);
1143  	}
1144  
1145  	return 0;
1146  }
1147  
1148  static bool fib_valid_key_len(u32 key, u8 plen, struct netlink_ext_ack *extack)
1149  {
1150  	if (plen > KEYLENGTH) {
1151  		NL_SET_ERR_MSG(extack, "Invalid prefix length");
1152  		return false;
1153  	}
1154  
1155  	if ((plen < KEYLENGTH) && (key << plen)) {
1156  		NL_SET_ERR_MSG(extack,
1157  			       "Invalid prefix for given prefix length");
1158  		return false;
1159  	}
1160  
1161  	return true;
1162  }
1163  
1164  static void fib_remove_alias(struct trie *t, struct key_vector *tp,
1165  			     struct key_vector *l, struct fib_alias *old);
1166  
1167  /* Caller must hold RTNL. */
1168  int fib_table_insert(struct net *net, struct fib_table *tb,
1169  		     struct fib_config *cfg, struct netlink_ext_ack *extack)
1170  {
1171  	struct trie *t = (struct trie *)tb->tb_data;
1172  	struct fib_alias *fa, *new_fa;
1173  	struct key_vector *l, *tp;
1174  	u16 nlflags = NLM_F_EXCL;
1175  	struct fib_info *fi;
1176  	u8 plen = cfg->fc_dst_len;
1177  	u8 slen = KEYLENGTH - plen;
1178  	u8 tos = cfg->fc_tos;
1179  	u32 key;
1180  	int err;
1181  
1182  	key = ntohl(cfg->fc_dst);
1183  
1184  	if (!fib_valid_key_len(key, plen, extack))
1185  		return -EINVAL;
1186  
1187  	pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
1188  
1189  	fi = fib_create_info(cfg, extack);
1190  	if (IS_ERR(fi)) {
1191  		err = PTR_ERR(fi);
1192  		goto err;
1193  	}
1194  
1195  	l = fib_find_node(t, &tp, key);
1196  	fa = l ? fib_find_alias(&l->leaf, slen, tos, fi->fib_priority,
1197  				tb->tb_id, false) : NULL;
1198  
1199  	/* Now fa, if non-NULL, points to the first fib alias
1200  	 * with the same keys [prefix,tos,priority], if such key already
1201  	 * exists or to the node before which we will insert new one.
1202  	 *
1203  	 * If fa is NULL, we will need to allocate a new one and
1204  	 * insert to the tail of the section matching the suffix length
1205  	 * of the new alias.
1206  	 */
1207  
1208  	if (fa && fa->fa_tos == tos &&
1209  	    fa->fa_info->fib_priority == fi->fib_priority) {
1210  		struct fib_alias *fa_first, *fa_match;
1211  
1212  		err = -EEXIST;
1213  		if (cfg->fc_nlflags & NLM_F_EXCL)
1214  			goto out;
1215  
1216  		nlflags &= ~NLM_F_EXCL;
1217  
1218  		/* We have 2 goals:
1219  		 * 1. Find exact match for type, scope, fib_info to avoid
1220  		 * duplicate routes
1221  		 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
1222  		 */
1223  		fa_match = NULL;
1224  		fa_first = fa;
1225  		hlist_for_each_entry_from(fa, fa_list) {
1226  			if ((fa->fa_slen != slen) ||
1227  			    (fa->tb_id != tb->tb_id) ||
1228  			    (fa->fa_tos != tos))
1229  				break;
1230  			if (fa->fa_info->fib_priority != fi->fib_priority)
1231  				break;
1232  			if (fa->fa_type == cfg->fc_type &&
1233  			    fa->fa_info == fi) {
1234  				fa_match = fa;
1235  				break;
1236  			}
1237  		}
1238  
1239  		if (cfg->fc_nlflags & NLM_F_REPLACE) {
1240  			struct fib_info *fi_drop;
1241  			u8 state;
1242  
1243  			nlflags |= NLM_F_REPLACE;
1244  			fa = fa_first;
1245  			if (fa_match) {
1246  				if (fa == fa_match)
1247  					err = 0;
1248  				goto out;
1249  			}
1250  			err = -ENOBUFS;
1251  			new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1252  			if (!new_fa)
1253  				goto out;
1254  
1255  			fi_drop = fa->fa_info;
1256  			new_fa->fa_tos = fa->fa_tos;
1257  			new_fa->fa_info = fi;
1258  			new_fa->fa_type = cfg->fc_type;
1259  			state = fa->fa_state;
1260  			new_fa->fa_state = state & ~FA_S_ACCESSED;
1261  			new_fa->fa_slen = fa->fa_slen;
1262  			new_fa->tb_id = tb->tb_id;
1263  			new_fa->fa_default = -1;
1264  			new_fa->offload = 0;
1265  			new_fa->trap = 0;
1266  
1267  			hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1268  
1269  			if (fib_find_alias(&l->leaf, fa->fa_slen, 0, 0,
1270  					   tb->tb_id, true) == new_fa) {
1271  				enum fib_event_type fib_event;
1272  
1273  				fib_event = FIB_EVENT_ENTRY_REPLACE;
1274  				err = call_fib_entry_notifiers(net, fib_event,
1275  							       key, plen,
1276  							       new_fa, extack);
1277  				if (err) {
1278  					hlist_replace_rcu(&new_fa->fa_list,
1279  							  &fa->fa_list);
1280  					goto out_free_new_fa;
1281  				}
1282  			}
1283  
1284  			rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1285  				  tb->tb_id, &cfg->fc_nlinfo, nlflags);
1286  
1287  			alias_free_mem_rcu(fa);
1288  
1289  			fib_release_info(fi_drop);
1290  			if (state & FA_S_ACCESSED)
1291  				rt_cache_flush(cfg->fc_nlinfo.nl_net);
1292  
1293  			goto succeeded;
1294  		}
1295  		/* Error if we find a perfect match which
1296  		 * uses the same scope, type, and nexthop
1297  		 * information.
1298  		 */
1299  		if (fa_match)
1300  			goto out;
1301  
1302  		if (cfg->fc_nlflags & NLM_F_APPEND)
1303  			nlflags |= NLM_F_APPEND;
1304  		else
1305  			fa = fa_first;
1306  	}
1307  	err = -ENOENT;
1308  	if (!(cfg->fc_nlflags & NLM_F_CREATE))
1309  		goto out;
1310  
1311  	nlflags |= NLM_F_CREATE;
1312  	err = -ENOBUFS;
1313  	new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1314  	if (!new_fa)
1315  		goto out;
1316  
1317  	new_fa->fa_info = fi;
1318  	new_fa->fa_tos = tos;
1319  	new_fa->fa_type = cfg->fc_type;
1320  	new_fa->fa_state = 0;
1321  	new_fa->fa_slen = slen;
1322  	new_fa->tb_id = tb->tb_id;
1323  	new_fa->fa_default = -1;
1324  	new_fa->offload = 0;
1325  	new_fa->trap = 0;
1326  
1327  	/* Insert new entry to the list. */
1328  	err = fib_insert_alias(t, tp, l, new_fa, fa, key);
1329  	if (err)
1330  		goto out_free_new_fa;
1331  
1332  	/* The alias was already inserted, so the node must exist. */
1333  	l = l ? l : fib_find_node(t, &tp, key);
1334  	if (WARN_ON_ONCE(!l))
1335  		goto out_free_new_fa;
1336  
1337  	if (fib_find_alias(&l->leaf, new_fa->fa_slen, 0, 0, tb->tb_id, true) ==
1338  	    new_fa) {
1339  		enum fib_event_type fib_event;
1340  
1341  		fib_event = FIB_EVENT_ENTRY_REPLACE;
1342  		err = call_fib_entry_notifiers(net, fib_event, key, plen,
1343  					       new_fa, extack);
1344  		if (err)
1345  			goto out_remove_new_fa;
1346  	}
1347  
1348  	if (!plen)
1349  		tb->tb_num_default++;
1350  
1351  	rt_cache_flush(cfg->fc_nlinfo.nl_net);
1352  	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id,
1353  		  &cfg->fc_nlinfo, nlflags);
1354  succeeded:
1355  	return 0;
1356  
1357  out_remove_new_fa:
1358  	fib_remove_alias(t, tp, l, new_fa);
1359  out_free_new_fa:
1360  	kmem_cache_free(fn_alias_kmem, new_fa);
1361  out:
1362  	fib_release_info(fi);
1363  err:
1364  	return err;
1365  }
1366  
1367  static inline t_key prefix_mismatch(t_key key, struct key_vector *n)
1368  {
1369  	t_key prefix = n->key;
1370  
1371  	return (key ^ prefix) & (prefix | -prefix);
1372  }
1373  
1374  bool fib_lookup_good_nhc(const struct fib_nh_common *nhc, int fib_flags,
1375  			 const struct flowi4 *flp)
1376  {
1377  	if (nhc->nhc_flags & RTNH_F_DEAD)
1378  		return false;
1379  
1380  	if (ip_ignore_linkdown(nhc->nhc_dev) &&
1381  	    nhc->nhc_flags & RTNH_F_LINKDOWN &&
1382  	    !(fib_flags & FIB_LOOKUP_IGNORE_LINKSTATE))
1383  		return false;
1384  
1385  	if (!(flp->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF)) {
1386  		if (flp->flowi4_oif &&
1387  		    flp->flowi4_oif != nhc->nhc_oif)
1388  			return false;
1389  	}
1390  
1391  	return true;
1392  }
1393  
1394  /* should be called with rcu_read_lock */
1395  int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
1396  		     struct fib_result *res, int fib_flags)
1397  {
1398  	struct trie *t = (struct trie *) tb->tb_data;
1399  #ifdef CONFIG_IP_FIB_TRIE_STATS
1400  	struct trie_use_stats __percpu *stats = t->stats;
1401  #endif
1402  	const t_key key = ntohl(flp->daddr);
1403  	struct key_vector *n, *pn;
1404  	struct fib_alias *fa;
1405  	unsigned long index;
1406  	t_key cindex;
1407  
1408  	pn = t->kv;
1409  	cindex = 0;
1410  
1411  	n = get_child_rcu(pn, cindex);
1412  	if (!n) {
1413  		trace_fib_table_lookup(tb->tb_id, flp, NULL, -EAGAIN);
1414  		return -EAGAIN;
1415  	}
1416  
1417  #ifdef CONFIG_IP_FIB_TRIE_STATS
1418  	this_cpu_inc(stats->gets);
1419  #endif
1420  
1421  	/* Step 1: Travel to the longest prefix match in the trie */
1422  	for (;;) {
1423  		index = get_cindex(key, n);
1424  
1425  		/* This bit of code is a bit tricky but it combines multiple
1426  		 * checks into a single check.  The prefix consists of the
1427  		 * prefix plus zeros for the "bits" in the prefix. The index
1428  		 * is the difference between the key and this value.  From
1429  		 * this we can actually derive several pieces of data.
1430  		 *   if (index >= (1ul << bits))
1431  		 *     we have a mismatch in skip bits and failed
1432  		 *   else
1433  		 *     we know the value is cindex
1434  		 *
1435  		 * This check is safe even if bits == KEYLENGTH due to the
1436  		 * fact that we can only allocate a node with 32 bits if a
1437  		 * long is greater than 32 bits.
1438  		 */
1439  		if (index >= (1ul << n->bits))
1440  			break;
1441  
1442  		/* we have found a leaf. Prefixes have already been compared */
1443  		if (IS_LEAF(n))
1444  			goto found;
1445  
1446  		/* only record pn and cindex if we are going to be chopping
1447  		 * bits later.  Otherwise we are just wasting cycles.
1448  		 */
1449  		if (n->slen > n->pos) {
1450  			pn = n;
1451  			cindex = index;
1452  		}
1453  
1454  		n = get_child_rcu(n, index);
1455  		if (unlikely(!n))
1456  			goto backtrace;
1457  	}
1458  
1459  	/* Step 2: Sort out leaves and begin backtracing for longest prefix */
1460  	for (;;) {
1461  		/* record the pointer where our next node pointer is stored */
1462  		struct key_vector __rcu **cptr = n->tnode;
1463  
1464  		/* This test verifies that none of the bits that differ
1465  		 * between the key and the prefix exist in the region of
1466  		 * the lsb and higher in the prefix.
1467  		 */
1468  		if (unlikely(prefix_mismatch(key, n)) || (n->slen == n->pos))
1469  			goto backtrace;
1470  
1471  		/* exit out and process leaf */
1472  		if (unlikely(IS_LEAF(n)))
1473  			break;
1474  
1475  		/* Don't bother recording parent info.  Since we are in
1476  		 * prefix match mode we will have to come back to wherever
1477  		 * we started this traversal anyway
1478  		 */
1479  
1480  		while ((n = rcu_dereference(*cptr)) == NULL) {
1481  backtrace:
1482  #ifdef CONFIG_IP_FIB_TRIE_STATS
1483  			if (!n)
1484  				this_cpu_inc(stats->null_node_hit);
1485  #endif
1486  			/* If we are at cindex 0 there are no more bits for
1487  			 * us to strip at this level so we must ascend back
1488  			 * up one level to see if there are any more bits to
1489  			 * be stripped there.
1490  			 */
1491  			while (!cindex) {
1492  				t_key pkey = pn->key;
1493  
1494  				/* If we don't have a parent then there is
1495  				 * nothing for us to do as we do not have any
1496  				 * further nodes to parse.
1497  				 */
1498  				if (IS_TRIE(pn)) {
1499  					trace_fib_table_lookup(tb->tb_id, flp,
1500  							       NULL, -EAGAIN);
1501  					return -EAGAIN;
1502  				}
1503  #ifdef CONFIG_IP_FIB_TRIE_STATS
1504  				this_cpu_inc(stats->backtrack);
1505  #endif
1506  				/* Get Child's index */
1507  				pn = node_parent_rcu(pn);
1508  				cindex = get_index(pkey, pn);
1509  			}
1510  
1511  			/* strip the least significant bit from the cindex */
1512  			cindex &= cindex - 1;
1513  
1514  			/* grab pointer for next child node */
1515  			cptr = &pn->tnode[cindex];
1516  		}
1517  	}
1518  
1519  found:
1520  	/* this line carries forward the xor from earlier in the function */
1521  	index = key ^ n->key;
1522  
1523  	/* Step 3: Process the leaf, if that fails fall back to backtracing */
1524  	hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) {
1525  		struct fib_info *fi = fa->fa_info;
1526  		struct fib_nh_common *nhc;
1527  		int nhsel, err;
1528  
1529  		if ((BITS_PER_LONG > KEYLENGTH) || (fa->fa_slen < KEYLENGTH)) {
1530  			if (index >= (1ul << fa->fa_slen))
1531  				continue;
1532  		}
1533  		if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
1534  			continue;
1535  		if (fi->fib_dead)
1536  			continue;
1537  		if (fa->fa_info->fib_scope < flp->flowi4_scope)
1538  			continue;
1539  		fib_alias_accessed(fa);
1540  		err = fib_props[fa->fa_type].error;
1541  		if (unlikely(err < 0)) {
1542  out_reject:
1543  #ifdef CONFIG_IP_FIB_TRIE_STATS
1544  			this_cpu_inc(stats->semantic_match_passed);
1545  #endif
1546  			trace_fib_table_lookup(tb->tb_id, flp, NULL, err);
1547  			return err;
1548  		}
1549  		if (fi->fib_flags & RTNH_F_DEAD)
1550  			continue;
1551  
1552  		if (unlikely(fi->nh)) {
1553  			if (nexthop_is_blackhole(fi->nh)) {
1554  				err = fib_props[RTN_BLACKHOLE].error;
1555  				goto out_reject;
1556  			}
1557  
1558  			nhc = nexthop_get_nhc_lookup(fi->nh, fib_flags, flp,
1559  						     &nhsel);
1560  			if (nhc)
1561  				goto set_result;
1562  			goto miss;
1563  		}
1564  
1565  		for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
1566  			nhc = fib_info_nhc(fi, nhsel);
1567  
1568  			if (!fib_lookup_good_nhc(nhc, fib_flags, flp))
1569  				continue;
1570  set_result:
1571  			if (!(fib_flags & FIB_LOOKUP_NOREF))
1572  				refcount_inc(&fi->fib_clntref);
1573  
1574  			res->prefix = htonl(n->key);
1575  			res->prefixlen = KEYLENGTH - fa->fa_slen;
1576  			res->nh_sel = nhsel;
1577  			res->nhc = nhc;
1578  			res->type = fa->fa_type;
1579  			res->scope = fi->fib_scope;
1580  			res->fi = fi;
1581  			res->table = tb;
1582  			res->fa_head = &n->leaf;
1583  #ifdef CONFIG_IP_FIB_TRIE_STATS
1584  			this_cpu_inc(stats->semantic_match_passed);
1585  #endif
1586  			trace_fib_table_lookup(tb->tb_id, flp, nhc, err);
1587  
1588  			return err;
1589  		}
1590  	}
1591  miss:
1592  #ifdef CONFIG_IP_FIB_TRIE_STATS
1593  	this_cpu_inc(stats->semantic_match_miss);
1594  #endif
1595  	goto backtrace;
1596  }
1597  EXPORT_SYMBOL_GPL(fib_table_lookup);
1598  
1599  static void fib_remove_alias(struct trie *t, struct key_vector *tp,
1600  			     struct key_vector *l, struct fib_alias *old)
1601  {
1602  	/* record the location of the previous list_info entry */
1603  	struct hlist_node **pprev = old->fa_list.pprev;
1604  	struct fib_alias *fa = hlist_entry(pprev, typeof(*fa), fa_list.next);
1605  
1606  	/* remove the fib_alias from the list */
1607  	hlist_del_rcu(&old->fa_list);
1608  
1609  	/* if we emptied the list this leaf will be freed and we can sort
1610  	 * out parent suffix lengths as a part of trie_rebalance
1611  	 */
1612  	if (hlist_empty(&l->leaf)) {
1613  		if (tp->slen == l->slen)
1614  			node_pull_suffix(tp, tp->pos);
1615  		put_child_root(tp, l->key, NULL);
1616  		node_free(l);
1617  		trie_rebalance(t, tp);
1618  		return;
1619  	}
1620  
1621  	/* only access fa if it is pointing at the last valid hlist_node */
1622  	if (*pprev)
1623  		return;
1624  
1625  	/* update the trie with the latest suffix length */
1626  	l->slen = fa->fa_slen;
1627  	node_pull_suffix(tp, fa->fa_slen);
1628  }
1629  
1630  static void fib_notify_alias_delete(struct net *net, u32 key,
1631  				    struct hlist_head *fah,
1632  				    struct fib_alias *fa_to_delete,
1633  				    struct netlink_ext_ack *extack)
1634  {
1635  	struct fib_alias *fa_next, *fa_to_notify;
1636  	u32 tb_id = fa_to_delete->tb_id;
1637  	u8 slen = fa_to_delete->fa_slen;
1638  	enum fib_event_type fib_event;
1639  
1640  	/* Do not notify if we do not care about the route. */
1641  	if (fib_find_alias(fah, slen, 0, 0, tb_id, true) != fa_to_delete)
1642  		return;
1643  
1644  	/* Determine if the route should be replaced by the next route in the
1645  	 * list.
1646  	 */
1647  	fa_next = hlist_entry_safe(fa_to_delete->fa_list.next,
1648  				   struct fib_alias, fa_list);
1649  	if (fa_next && fa_next->fa_slen == slen && fa_next->tb_id == tb_id) {
1650  		fib_event = FIB_EVENT_ENTRY_REPLACE;
1651  		fa_to_notify = fa_next;
1652  	} else {
1653  		fib_event = FIB_EVENT_ENTRY_DEL;
1654  		fa_to_notify = fa_to_delete;
1655  	}
1656  	call_fib_entry_notifiers(net, fib_event, key, KEYLENGTH - slen,
1657  				 fa_to_notify, extack);
1658  }
1659  
1660  /* Caller must hold RTNL. */
1661  int fib_table_delete(struct net *net, struct fib_table *tb,
1662  		     struct fib_config *cfg, struct netlink_ext_ack *extack)
1663  {
1664  	struct trie *t = (struct trie *) tb->tb_data;
1665  	struct fib_alias *fa, *fa_to_delete;
1666  	struct key_vector *l, *tp;
1667  	u8 plen = cfg->fc_dst_len;
1668  	u8 slen = KEYLENGTH - plen;
1669  	u8 tos = cfg->fc_tos;
1670  	u32 key;
1671  
1672  	key = ntohl(cfg->fc_dst);
1673  
1674  	if (!fib_valid_key_len(key, plen, extack))
1675  		return -EINVAL;
1676  
1677  	l = fib_find_node(t, &tp, key);
1678  	if (!l)
1679  		return -ESRCH;
1680  
1681  	fa = fib_find_alias(&l->leaf, slen, tos, 0, tb->tb_id, false);
1682  	if (!fa)
1683  		return -ESRCH;
1684  
1685  	pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1686  
1687  	fa_to_delete = NULL;
1688  	hlist_for_each_entry_from(fa, fa_list) {
1689  		struct fib_info *fi = fa->fa_info;
1690  
1691  		if ((fa->fa_slen != slen) ||
1692  		    (fa->tb_id != tb->tb_id) ||
1693  		    (fa->fa_tos != tos))
1694  			break;
1695  
1696  		if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1697  		    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1698  		     fa->fa_info->fib_scope == cfg->fc_scope) &&
1699  		    (!cfg->fc_prefsrc ||
1700  		     fi->fib_prefsrc == cfg->fc_prefsrc) &&
1701  		    (!cfg->fc_protocol ||
1702  		     fi->fib_protocol == cfg->fc_protocol) &&
1703  		    fib_nh_match(net, cfg, fi, extack) == 0 &&
1704  		    fib_metrics_match(cfg, fi)) {
1705  			fa_to_delete = fa;
1706  			break;
1707  		}
1708  	}
1709  
1710  	if (!fa_to_delete)
1711  		return -ESRCH;
1712  
1713  	fib_notify_alias_delete(net, key, &l->leaf, fa_to_delete, extack);
1714  	rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id,
1715  		  &cfg->fc_nlinfo, 0);
1716  
1717  	if (!plen)
1718  		tb->tb_num_default--;
1719  
1720  	fib_remove_alias(t, tp, l, fa_to_delete);
1721  
1722  	if (fa_to_delete->fa_state & FA_S_ACCESSED)
1723  		rt_cache_flush(cfg->fc_nlinfo.nl_net);
1724  
1725  	fib_release_info(fa_to_delete->fa_info);
1726  	alias_free_mem_rcu(fa_to_delete);
1727  	return 0;
1728  }
1729  
1730  /* Scan for the next leaf starting at the provided key value */
1731  static struct key_vector *leaf_walk_rcu(struct key_vector **tn, t_key key)
1732  {
1733  	struct key_vector *pn, *n = *tn;
1734  	unsigned long cindex;
1735  
1736  	/* this loop is meant to try and find the key in the trie */
1737  	do {
1738  		/* record parent and next child index */
1739  		pn = n;
1740  		cindex = (key > pn->key) ? get_index(key, pn) : 0;
1741  
1742  		if (cindex >> pn->bits)
1743  			break;
1744  
1745  		/* descend into the next child */
1746  		n = get_child_rcu(pn, cindex++);
1747  		if (!n)
1748  			break;
1749  
1750  		/* guarantee forward progress on the keys */
1751  		if (IS_LEAF(n) && (n->key >= key))
1752  			goto found;
1753  	} while (IS_TNODE(n));
1754  
1755  	/* this loop will search for the next leaf with a greater key */
1756  	while (!IS_TRIE(pn)) {
1757  		/* if we exhausted the parent node we will need to climb */
1758  		if (cindex >= (1ul << pn->bits)) {
1759  			t_key pkey = pn->key;
1760  
1761  			pn = node_parent_rcu(pn);
1762  			cindex = get_index(pkey, pn) + 1;
1763  			continue;
1764  		}
1765  
1766  		/* grab the next available node */
1767  		n = get_child_rcu(pn, cindex++);
1768  		if (!n)
1769  			continue;
1770  
1771  		/* no need to compare keys since we bumped the index */
1772  		if (IS_LEAF(n))
1773  			goto found;
1774  
1775  		/* Rescan start scanning in new node */
1776  		pn = n;
1777  		cindex = 0;
1778  	}
1779  
1780  	*tn = pn;
1781  	return NULL; /* Root of trie */
1782  found:
1783  	/* if we are at the limit for keys just return NULL for the tnode */
1784  	*tn = pn;
1785  	return n;
1786  }
1787  
1788  static void fib_trie_free(struct fib_table *tb)
1789  {
1790  	struct trie *t = (struct trie *)tb->tb_data;
1791  	struct key_vector *pn = t->kv;
1792  	unsigned long cindex = 1;
1793  	struct hlist_node *tmp;
1794  	struct fib_alias *fa;
1795  
1796  	/* walk trie in reverse order and free everything */
1797  	for (;;) {
1798  		struct key_vector *n;
1799  
1800  		if (!(cindex--)) {
1801  			t_key pkey = pn->key;
1802  
1803  			if (IS_TRIE(pn))
1804  				break;
1805  
1806  			n = pn;
1807  			pn = node_parent(pn);
1808  
1809  			/* drop emptied tnode */
1810  			put_child_root(pn, n->key, NULL);
1811  			node_free(n);
1812  
1813  			cindex = get_index(pkey, pn);
1814  
1815  			continue;
1816  		}
1817  
1818  		/* grab the next available node */
1819  		n = get_child(pn, cindex);
1820  		if (!n)
1821  			continue;
1822  
1823  		if (IS_TNODE(n)) {
1824  			/* record pn and cindex for leaf walking */
1825  			pn = n;
1826  			cindex = 1ul << n->bits;
1827  
1828  			continue;
1829  		}
1830  
1831  		hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
1832  			hlist_del_rcu(&fa->fa_list);
1833  			alias_free_mem_rcu(fa);
1834  		}
1835  
1836  		put_child_root(pn, n->key, NULL);
1837  		node_free(n);
1838  	}
1839  
1840  #ifdef CONFIG_IP_FIB_TRIE_STATS
1841  	free_percpu(t->stats);
1842  #endif
1843  	kfree(tb);
1844  }
1845  
1846  struct fib_table *fib_trie_unmerge(struct fib_table *oldtb)
1847  {
1848  	struct trie *ot = (struct trie *)oldtb->tb_data;
1849  	struct key_vector *l, *tp = ot->kv;
1850  	struct fib_table *local_tb;
1851  	struct fib_alias *fa;
1852  	struct trie *lt;
1853  	t_key key = 0;
1854  
1855  	if (oldtb->tb_data == oldtb->__data)
1856  		return oldtb;
1857  
1858  	local_tb = fib_trie_table(RT_TABLE_LOCAL, NULL);
1859  	if (!local_tb)
1860  		return NULL;
1861  
1862  	lt = (struct trie *)local_tb->tb_data;
1863  
1864  	while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
1865  		struct key_vector *local_l = NULL, *local_tp;
1866  
1867  		hlist_for_each_entry(fa, &l->leaf, fa_list) {
1868  			struct fib_alias *new_fa;
1869  
1870  			if (local_tb->tb_id != fa->tb_id)
1871  				continue;
1872  
1873  			/* clone fa for new local table */
1874  			new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1875  			if (!new_fa)
1876  				goto out;
1877  
1878  			memcpy(new_fa, fa, sizeof(*fa));
1879  
1880  			/* insert clone into table */
1881  			if (!local_l)
1882  				local_l = fib_find_node(lt, &local_tp, l->key);
1883  
1884  			if (fib_insert_alias(lt, local_tp, local_l, new_fa,
1885  					     NULL, l->key)) {
1886  				kmem_cache_free(fn_alias_kmem, new_fa);
1887  				goto out;
1888  			}
1889  		}
1890  
1891  		/* stop loop if key wrapped back to 0 */
1892  		key = l->key + 1;
1893  		if (key < l->key)
1894  			break;
1895  	}
1896  
1897  	return local_tb;
1898  out:
1899  	fib_trie_free(local_tb);
1900  
1901  	return NULL;
1902  }
1903  
1904  /* Caller must hold RTNL */
1905  void fib_table_flush_external(struct fib_table *tb)
1906  {
1907  	struct trie *t = (struct trie *)tb->tb_data;
1908  	struct key_vector *pn = t->kv;
1909  	unsigned long cindex = 1;
1910  	struct hlist_node *tmp;
1911  	struct fib_alias *fa;
1912  
1913  	/* walk trie in reverse order */
1914  	for (;;) {
1915  		unsigned char slen = 0;
1916  		struct key_vector *n;
1917  
1918  		if (!(cindex--)) {
1919  			t_key pkey = pn->key;
1920  
1921  			/* cannot resize the trie vector */
1922  			if (IS_TRIE(pn))
1923  				break;
1924  
1925  			/* update the suffix to address pulled leaves */
1926  			if (pn->slen > pn->pos)
1927  				update_suffix(pn);
1928  
1929  			/* resize completed node */
1930  			pn = resize(t, pn);
1931  			cindex = get_index(pkey, pn);
1932  
1933  			continue;
1934  		}
1935  
1936  		/* grab the next available node */
1937  		n = get_child(pn, cindex);
1938  		if (!n)
1939  			continue;
1940  
1941  		if (IS_TNODE(n)) {
1942  			/* record pn and cindex for leaf walking */
1943  			pn = n;
1944  			cindex = 1ul << n->bits;
1945  
1946  			continue;
1947  		}
1948  
1949  		hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
1950  			/* if alias was cloned to local then we just
1951  			 * need to remove the local copy from main
1952  			 */
1953  			if (tb->tb_id != fa->tb_id) {
1954  				hlist_del_rcu(&fa->fa_list);
1955  				alias_free_mem_rcu(fa);
1956  				continue;
1957  			}
1958  
1959  			/* record local slen */
1960  			slen = fa->fa_slen;
1961  		}
1962  
1963  		/* update leaf slen */
1964  		n->slen = slen;
1965  
1966  		if (hlist_empty(&n->leaf)) {
1967  			put_child_root(pn, n->key, NULL);
1968  			node_free(n);
1969  		}
1970  	}
1971  }
1972  
1973  /* Caller must hold RTNL. */
1974  int fib_table_flush(struct net *net, struct fib_table *tb, bool flush_all)
1975  {
1976  	struct trie *t = (struct trie *)tb->tb_data;
1977  	struct key_vector *pn = t->kv;
1978  	unsigned long cindex = 1;
1979  	struct hlist_node *tmp;
1980  	struct fib_alias *fa;
1981  	int found = 0;
1982  
1983  	/* walk trie in reverse order */
1984  	for (;;) {
1985  		unsigned char slen = 0;
1986  		struct key_vector *n;
1987  
1988  		if (!(cindex--)) {
1989  			t_key pkey = pn->key;
1990  
1991  			/* cannot resize the trie vector */
1992  			if (IS_TRIE(pn))
1993  				break;
1994  
1995  			/* update the suffix to address pulled leaves */
1996  			if (pn->slen > pn->pos)
1997  				update_suffix(pn);
1998  
1999  			/* resize completed node */
2000  			pn = resize(t, pn);
2001  			cindex = get_index(pkey, pn);
2002  
2003  			continue;
2004  		}
2005  
2006  		/* grab the next available node */
2007  		n = get_child(pn, cindex);
2008  		if (!n)
2009  			continue;
2010  
2011  		if (IS_TNODE(n)) {
2012  			/* record pn and cindex for leaf walking */
2013  			pn = n;
2014  			cindex = 1ul << n->bits;
2015  
2016  			continue;
2017  		}
2018  
2019  		hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
2020  			struct fib_info *fi = fa->fa_info;
2021  
2022  			if (!fi || tb->tb_id != fa->tb_id ||
2023  			    (!(fi->fib_flags & RTNH_F_DEAD) &&
2024  			     !fib_props[fa->fa_type].error)) {
2025  				slen = fa->fa_slen;
2026  				continue;
2027  			}
2028  
2029  			/* Do not flush error routes if network namespace is
2030  			 * not being dismantled
2031  			 */
2032  			if (!flush_all && fib_props[fa->fa_type].error) {
2033  				slen = fa->fa_slen;
2034  				continue;
2035  			}
2036  
2037  			fib_notify_alias_delete(net, n->key, &n->leaf, fa,
2038  						NULL);
2039  			hlist_del_rcu(&fa->fa_list);
2040  			fib_release_info(fa->fa_info);
2041  			alias_free_mem_rcu(fa);
2042  			found++;
2043  		}
2044  
2045  		/* update leaf slen */
2046  		n->slen = slen;
2047  
2048  		if (hlist_empty(&n->leaf)) {
2049  			put_child_root(pn, n->key, NULL);
2050  			node_free(n);
2051  		}
2052  	}
2053  
2054  	pr_debug("trie_flush found=%d\n", found);
2055  	return found;
2056  }
2057  
2058  /* derived from fib_trie_free */
2059  static void __fib_info_notify_update(struct net *net, struct fib_table *tb,
2060  				     struct nl_info *info)
2061  {
2062  	struct trie *t = (struct trie *)tb->tb_data;
2063  	struct key_vector *pn = t->kv;
2064  	unsigned long cindex = 1;
2065  	struct fib_alias *fa;
2066  
2067  	for (;;) {
2068  		struct key_vector *n;
2069  
2070  		if (!(cindex--)) {
2071  			t_key pkey = pn->key;
2072  
2073  			if (IS_TRIE(pn))
2074  				break;
2075  
2076  			pn = node_parent(pn);
2077  			cindex = get_index(pkey, pn);
2078  			continue;
2079  		}
2080  
2081  		/* grab the next available node */
2082  		n = get_child(pn, cindex);
2083  		if (!n)
2084  			continue;
2085  
2086  		if (IS_TNODE(n)) {
2087  			/* record pn and cindex for leaf walking */
2088  			pn = n;
2089  			cindex = 1ul << n->bits;
2090  
2091  			continue;
2092  		}
2093  
2094  		hlist_for_each_entry(fa, &n->leaf, fa_list) {
2095  			struct fib_info *fi = fa->fa_info;
2096  
2097  			if (!fi || !fi->nh_updated || fa->tb_id != tb->tb_id)
2098  				continue;
2099  
2100  			rtmsg_fib(RTM_NEWROUTE, htonl(n->key), fa,
2101  				  KEYLENGTH - fa->fa_slen, tb->tb_id,
2102  				  info, NLM_F_REPLACE);
2103  		}
2104  	}
2105  }
2106  
2107  void fib_info_notify_update(struct net *net, struct nl_info *info)
2108  {
2109  	unsigned int h;
2110  
2111  	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2112  		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2113  		struct fib_table *tb;
2114  
2115  		hlist_for_each_entry_rcu(tb, head, tb_hlist,
2116  					 lockdep_rtnl_is_held())
2117  			__fib_info_notify_update(net, tb, info);
2118  	}
2119  }
2120  
2121  static int fib_leaf_notify(struct key_vector *l, struct fib_table *tb,
2122  			   struct notifier_block *nb,
2123  			   struct netlink_ext_ack *extack)
2124  {
2125  	struct fib_alias *fa;
2126  	int last_slen = -1;
2127  	int err;
2128  
2129  	hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
2130  		struct fib_info *fi = fa->fa_info;
2131  
2132  		if (!fi)
2133  			continue;
2134  
2135  		/* local and main table can share the same trie,
2136  		 * so don't notify twice for the same entry.
2137  		 */
2138  		if (tb->tb_id != fa->tb_id)
2139  			continue;
2140  
2141  		if (fa->fa_slen == last_slen)
2142  			continue;
2143  
2144  		last_slen = fa->fa_slen;
2145  		err = call_fib_entry_notifier(nb, FIB_EVENT_ENTRY_REPLACE,
2146  					      l->key, KEYLENGTH - fa->fa_slen,
2147  					      fa, extack);
2148  		if (err)
2149  			return err;
2150  	}
2151  	return 0;
2152  }
2153  
2154  static int fib_table_notify(struct fib_table *tb, struct notifier_block *nb,
2155  			    struct netlink_ext_ack *extack)
2156  {
2157  	struct trie *t = (struct trie *)tb->tb_data;
2158  	struct key_vector *l, *tp = t->kv;
2159  	t_key key = 0;
2160  	int err;
2161  
2162  	while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
2163  		err = fib_leaf_notify(l, tb, nb, extack);
2164  		if (err)
2165  			return err;
2166  
2167  		key = l->key + 1;
2168  		/* stop in case of wrap around */
2169  		if (key < l->key)
2170  			break;
2171  	}
2172  	return 0;
2173  }
2174  
2175  int fib_notify(struct net *net, struct notifier_block *nb,
2176  	       struct netlink_ext_ack *extack)
2177  {
2178  	unsigned int h;
2179  	int err;
2180  
2181  	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2182  		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2183  		struct fib_table *tb;
2184  
2185  		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
2186  			err = fib_table_notify(tb, nb, extack);
2187  			if (err)
2188  				return err;
2189  		}
2190  	}
2191  	return 0;
2192  }
2193  
2194  static void __trie_free_rcu(struct rcu_head *head)
2195  {
2196  	struct fib_table *tb = container_of(head, struct fib_table, rcu);
2197  #ifdef CONFIG_IP_FIB_TRIE_STATS
2198  	struct trie *t = (struct trie *)tb->tb_data;
2199  
2200  	if (tb->tb_data == tb->__data)
2201  		free_percpu(t->stats);
2202  #endif /* CONFIG_IP_FIB_TRIE_STATS */
2203  	kfree(tb);
2204  }
2205  
2206  void fib_free_table(struct fib_table *tb)
2207  {
2208  	call_rcu(&tb->rcu, __trie_free_rcu);
2209  }
2210  
2211  static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
2212  			     struct sk_buff *skb, struct netlink_callback *cb,
2213  			     struct fib_dump_filter *filter)
2214  {
2215  	unsigned int flags = NLM_F_MULTI;
2216  	__be32 xkey = htonl(l->key);
2217  	int i, s_i, i_fa, s_fa, err;
2218  	struct fib_alias *fa;
2219  
2220  	if (filter->filter_set ||
2221  	    !filter->dump_exceptions || !filter->dump_routes)
2222  		flags |= NLM_F_DUMP_FILTERED;
2223  
2224  	s_i = cb->args[4];
2225  	s_fa = cb->args[5];
2226  	i = 0;
2227  
2228  	/* rcu_read_lock is hold by caller */
2229  	hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
2230  		struct fib_info *fi = fa->fa_info;
2231  
2232  		if (i < s_i)
2233  			goto next;
2234  
2235  		i_fa = 0;
2236  
2237  		if (tb->tb_id != fa->tb_id)
2238  			goto next;
2239  
2240  		if (filter->filter_set) {
2241  			if (filter->rt_type && fa->fa_type != filter->rt_type)
2242  				goto next;
2243  
2244  			if ((filter->protocol &&
2245  			     fi->fib_protocol != filter->protocol))
2246  				goto next;
2247  
2248  			if (filter->dev &&
2249  			    !fib_info_nh_uses_dev(fi, filter->dev))
2250  				goto next;
2251  		}
2252  
2253  		if (filter->dump_routes) {
2254  			if (!s_fa) {
2255  				struct fib_rt_info fri;
2256  
2257  				fri.fi = fi;
2258  				fri.tb_id = tb->tb_id;
2259  				fri.dst = xkey;
2260  				fri.dst_len = KEYLENGTH - fa->fa_slen;
2261  				fri.tos = fa->fa_tos;
2262  				fri.type = fa->fa_type;
2263  				fri.offload = fa->offload;
2264  				fri.trap = fa->trap;
2265  				err = fib_dump_info(skb,
2266  						    NETLINK_CB(cb->skb).portid,
2267  						    cb->nlh->nlmsg_seq,
2268  						    RTM_NEWROUTE, &fri, flags);
2269  				if (err < 0)
2270  					goto stop;
2271  			}
2272  
2273  			i_fa++;
2274  		}
2275  
2276  		if (filter->dump_exceptions) {
2277  			err = fib_dump_info_fnhe(skb, cb, tb->tb_id, fi,
2278  						 &i_fa, s_fa, flags);
2279  			if (err < 0)
2280  				goto stop;
2281  		}
2282  
2283  next:
2284  		i++;
2285  	}
2286  
2287  	cb->args[4] = i;
2288  	return skb->len;
2289  
2290  stop:
2291  	cb->args[4] = i;
2292  	cb->args[5] = i_fa;
2293  	return err;
2294  }
2295  
2296  /* rcu_read_lock needs to be hold by caller from readside */
2297  int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
2298  		   struct netlink_callback *cb, struct fib_dump_filter *filter)
2299  {
2300  	struct trie *t = (struct trie *)tb->tb_data;
2301  	struct key_vector *l, *tp = t->kv;
2302  	/* Dump starting at last key.
2303  	 * Note: 0.0.0.0/0 (ie default) is first key.
2304  	 */
2305  	int count = cb->args[2];
2306  	t_key key = cb->args[3];
2307  
2308  	/* First time here, count and key are both always 0. Count > 0
2309  	 * and key == 0 means the dump has wrapped around and we are done.
2310  	 */
2311  	if (count && !key)
2312  		return skb->len;
2313  
2314  	while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
2315  		int err;
2316  
2317  		err = fn_trie_dump_leaf(l, tb, skb, cb, filter);
2318  		if (err < 0) {
2319  			cb->args[3] = key;
2320  			cb->args[2] = count;
2321  			return err;
2322  		}
2323  
2324  		++count;
2325  		key = l->key + 1;
2326  
2327  		memset(&cb->args[4], 0,
2328  		       sizeof(cb->args) - 4*sizeof(cb->args[0]));
2329  
2330  		/* stop loop if key wrapped back to 0 */
2331  		if (key < l->key)
2332  			break;
2333  	}
2334  
2335  	cb->args[3] = key;
2336  	cb->args[2] = count;
2337  
2338  	return skb->len;
2339  }
2340  
2341  void __init fib_trie_init(void)
2342  {
2343  	fn_alias_kmem = kmem_cache_create("ip_fib_alias",
2344  					  sizeof(struct fib_alias),
2345  					  0, SLAB_PANIC, NULL);
2346  
2347  	trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
2348  					   LEAF_SIZE,
2349  					   0, SLAB_PANIC, NULL);
2350  }
2351  
2352  struct fib_table *fib_trie_table(u32 id, struct fib_table *alias)
2353  {
2354  	struct fib_table *tb;
2355  	struct trie *t;
2356  	size_t sz = sizeof(*tb);
2357  
2358  	if (!alias)
2359  		sz += sizeof(struct trie);
2360  
2361  	tb = kzalloc(sz, GFP_KERNEL);
2362  	if (!tb)
2363  		return NULL;
2364  
2365  	tb->tb_id = id;
2366  	tb->tb_num_default = 0;
2367  	tb->tb_data = (alias ? alias->__data : tb->__data);
2368  
2369  	if (alias)
2370  		return tb;
2371  
2372  	t = (struct trie *) tb->tb_data;
2373  	t->kv[0].pos = KEYLENGTH;
2374  	t->kv[0].slen = KEYLENGTH;
2375  #ifdef CONFIG_IP_FIB_TRIE_STATS
2376  	t->stats = alloc_percpu(struct trie_use_stats);
2377  	if (!t->stats) {
2378  		kfree(tb);
2379  		tb = NULL;
2380  	}
2381  #endif
2382  
2383  	return tb;
2384  }
2385  
2386  #ifdef CONFIG_PROC_FS
2387  /* Depth first Trie walk iterator */
2388  struct fib_trie_iter {
2389  	struct seq_net_private p;
2390  	struct fib_table *tb;
2391  	struct key_vector *tnode;
2392  	unsigned int index;
2393  	unsigned int depth;
2394  };
2395  
2396  static struct key_vector *fib_trie_get_next(struct fib_trie_iter *iter)
2397  {
2398  	unsigned long cindex = iter->index;
2399  	struct key_vector *pn = iter->tnode;
2400  	t_key pkey;
2401  
2402  	pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
2403  		 iter->tnode, iter->index, iter->depth);
2404  
2405  	while (!IS_TRIE(pn)) {
2406  		while (cindex < child_length(pn)) {
2407  			struct key_vector *n = get_child_rcu(pn, cindex++);
2408  
2409  			if (!n)
2410  				continue;
2411  
2412  			if (IS_LEAF(n)) {
2413  				iter->tnode = pn;
2414  				iter->index = cindex;
2415  			} else {
2416  				/* push down one level */
2417  				iter->tnode = n;
2418  				iter->index = 0;
2419  				++iter->depth;
2420  			}
2421  
2422  			return n;
2423  		}
2424  
2425  		/* Current node exhausted, pop back up */
2426  		pkey = pn->key;
2427  		pn = node_parent_rcu(pn);
2428  		cindex = get_index(pkey, pn) + 1;
2429  		--iter->depth;
2430  	}
2431  
2432  	/* record root node so further searches know we are done */
2433  	iter->tnode = pn;
2434  	iter->index = 0;
2435  
2436  	return NULL;
2437  }
2438  
2439  static struct key_vector *fib_trie_get_first(struct fib_trie_iter *iter,
2440  					     struct trie *t)
2441  {
2442  	struct key_vector *n, *pn;
2443  
2444  	if (!t)
2445  		return NULL;
2446  
2447  	pn = t->kv;
2448  	n = rcu_dereference(pn->tnode[0]);
2449  	if (!n)
2450  		return NULL;
2451  
2452  	if (IS_TNODE(n)) {
2453  		iter->tnode = n;
2454  		iter->index = 0;
2455  		iter->depth = 1;
2456  	} else {
2457  		iter->tnode = pn;
2458  		iter->index = 0;
2459  		iter->depth = 0;
2460  	}
2461  
2462  	return n;
2463  }
2464  
2465  static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2466  {
2467  	struct key_vector *n;
2468  	struct fib_trie_iter iter;
2469  
2470  	memset(s, 0, sizeof(*s));
2471  
2472  	rcu_read_lock();
2473  	for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
2474  		if (IS_LEAF(n)) {
2475  			struct fib_alias *fa;
2476  
2477  			s->leaves++;
2478  			s->totdepth += iter.depth;
2479  			if (iter.depth > s->maxdepth)
2480  				s->maxdepth = iter.depth;
2481  
2482  			hlist_for_each_entry_rcu(fa, &n->leaf, fa_list)
2483  				++s->prefixes;
2484  		} else {
2485  			s->tnodes++;
2486  			if (n->bits < MAX_STAT_DEPTH)
2487  				s->nodesizes[n->bits]++;
2488  			s->nullpointers += tn_info(n)->empty_children;
2489  		}
2490  	}
2491  	rcu_read_unlock();
2492  }
2493  
2494  /*
2495   *	This outputs /proc/net/fib_triestats
2496   */
2497  static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2498  {
2499  	unsigned int i, max, pointers, bytes, avdepth;
2500  
2501  	if (stat->leaves)
2502  		avdepth = stat->totdepth*100 / stat->leaves;
2503  	else
2504  		avdepth = 0;
2505  
2506  	seq_printf(seq, "\tAver depth:     %u.%02d\n",
2507  		   avdepth / 100, avdepth % 100);
2508  	seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);
2509  
2510  	seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
2511  	bytes = LEAF_SIZE * stat->leaves;
2512  
2513  	seq_printf(seq, "\tPrefixes:       %u\n", stat->prefixes);
2514  	bytes += sizeof(struct fib_alias) * stat->prefixes;
2515  
2516  	seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
2517  	bytes += TNODE_SIZE(0) * stat->tnodes;
2518  
2519  	max = MAX_STAT_DEPTH;
2520  	while (max > 0 && stat->nodesizes[max-1] == 0)
2521  		max--;
2522  
2523  	pointers = 0;
2524  	for (i = 1; i < max; i++)
2525  		if (stat->nodesizes[i] != 0) {
2526  			seq_printf(seq, "  %u: %u",  i, stat->nodesizes[i]);
2527  			pointers += (1<<i) * stat->nodesizes[i];
2528  		}
2529  	seq_putc(seq, '\n');
2530  	seq_printf(seq, "\tPointers: %u\n", pointers);
2531  
2532  	bytes += sizeof(struct key_vector *) * pointers;
2533  	seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2534  	seq_printf(seq, "Total size: %u  kB\n", (bytes + 1023) / 1024);
2535  }
2536  
2537  #ifdef CONFIG_IP_FIB_TRIE_STATS
2538  static void trie_show_usage(struct seq_file *seq,
2539  			    const struct trie_use_stats __percpu *stats)
2540  {
2541  	struct trie_use_stats s = { 0 };
2542  	int cpu;
2543  
2544  	/* loop through all of the CPUs and gather up the stats */
2545  	for_each_possible_cpu(cpu) {
2546  		const struct trie_use_stats *pcpu = per_cpu_ptr(stats, cpu);
2547  
2548  		s.gets += pcpu->gets;
2549  		s.backtrack += pcpu->backtrack;
2550  		s.semantic_match_passed += pcpu->semantic_match_passed;
2551  		s.semantic_match_miss += pcpu->semantic_match_miss;
2552  		s.null_node_hit += pcpu->null_node_hit;
2553  		s.resize_node_skipped += pcpu->resize_node_skipped;
2554  	}
2555  
2556  	seq_printf(seq, "\nCounters:\n---------\n");
2557  	seq_printf(seq, "gets = %u\n", s.gets);
2558  	seq_printf(seq, "backtracks = %u\n", s.backtrack);
2559  	seq_printf(seq, "semantic match passed = %u\n",
2560  		   s.semantic_match_passed);
2561  	seq_printf(seq, "semantic match miss = %u\n", s.semantic_match_miss);
2562  	seq_printf(seq, "null node hit= %u\n", s.null_node_hit);
2563  	seq_printf(seq, "skipped node resize = %u\n\n", s.resize_node_skipped);
2564  }
2565  #endif /*  CONFIG_IP_FIB_TRIE_STATS */
2566  
2567  static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
2568  {
2569  	if (tb->tb_id == RT_TABLE_LOCAL)
2570  		seq_puts(seq, "Local:\n");
2571  	else if (tb->tb_id == RT_TABLE_MAIN)
2572  		seq_puts(seq, "Main:\n");
2573  	else
2574  		seq_printf(seq, "Id %d:\n", tb->tb_id);
2575  }
2576  
2577  
2578  static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2579  {
2580  	struct net *net = (struct net *)seq->private;
2581  	unsigned int h;
2582  
2583  	seq_printf(seq,
2584  		   "Basic info: size of leaf:"
2585  		   " %zd bytes, size of tnode: %zd bytes.\n",
2586  		   LEAF_SIZE, TNODE_SIZE(0));
2587  
2588  	rcu_read_lock();
2589  	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2590  		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2591  		struct fib_table *tb;
2592  
2593  		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
2594  			struct trie *t = (struct trie *) tb->tb_data;
2595  			struct trie_stat stat;
2596  
2597  			if (!t)
2598  				continue;
2599  
2600  			fib_table_print(seq, tb);
2601  
2602  			trie_collect_stats(t, &stat);
2603  			trie_show_stats(seq, &stat);
2604  #ifdef CONFIG_IP_FIB_TRIE_STATS
2605  			trie_show_usage(seq, t->stats);
2606  #endif
2607  		}
2608  		cond_resched_rcu();
2609  	}
2610  	rcu_read_unlock();
2611  
2612  	return 0;
2613  }
2614  
2615  static struct key_vector *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
2616  {
2617  	struct fib_trie_iter *iter = seq->private;
2618  	struct net *net = seq_file_net(seq);
2619  	loff_t idx = 0;
2620  	unsigned int h;
2621  
2622  	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2623  		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2624  		struct fib_table *tb;
2625  
2626  		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
2627  			struct key_vector *n;
2628  
2629  			for (n = fib_trie_get_first(iter,
2630  						    (struct trie *) tb->tb_data);
2631  			     n; n = fib_trie_get_next(iter))
2632  				if (pos == idx++) {
2633  					iter->tb = tb;
2634  					return n;
2635  				}
2636  		}
2637  	}
2638  
2639  	return NULL;
2640  }
2641  
2642  static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2643  	__acquires(RCU)
2644  {
2645  	rcu_read_lock();
2646  	return fib_trie_get_idx(seq, *pos);
2647  }
2648  
2649  static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2650  {
2651  	struct fib_trie_iter *iter = seq->private;
2652  	struct net *net = seq_file_net(seq);
2653  	struct fib_table *tb = iter->tb;
2654  	struct hlist_node *tb_node;
2655  	unsigned int h;
2656  	struct key_vector *n;
2657  
2658  	++*pos;
2659  	/* next node in same table */
2660  	n = fib_trie_get_next(iter);
2661  	if (n)
2662  		return n;
2663  
2664  	/* walk rest of this hash chain */
2665  	h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
2666  	while ((tb_node = rcu_dereference(hlist_next_rcu(&tb->tb_hlist)))) {
2667  		tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
2668  		n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2669  		if (n)
2670  			goto found;
2671  	}
2672  
2673  	/* new hash chain */
2674  	while (++h < FIB_TABLE_HASHSZ) {
2675  		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2676  		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
2677  			n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2678  			if (n)
2679  				goto found;
2680  		}
2681  	}
2682  	return NULL;
2683  
2684  found:
2685  	iter->tb = tb;
2686  	return n;
2687  }
2688  
2689  static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2690  	__releases(RCU)
2691  {
2692  	rcu_read_unlock();
2693  }
2694  
2695  static void seq_indent(struct seq_file *seq, int n)
2696  {
2697  	while (n-- > 0)
2698  		seq_puts(seq, "   ");
2699  }
2700  
2701  static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
2702  {
2703  	switch (s) {
2704  	case RT_SCOPE_UNIVERSE: return "universe";
2705  	case RT_SCOPE_SITE:	return "site";
2706  	case RT_SCOPE_LINK:	return "link";
2707  	case RT_SCOPE_HOST:	return "host";
2708  	case RT_SCOPE_NOWHERE:	return "nowhere";
2709  	default:
2710  		snprintf(buf, len, "scope=%d", s);
2711  		return buf;
2712  	}
2713  }
2714  
2715  static const char *const rtn_type_names[__RTN_MAX] = {
2716  	[RTN_UNSPEC] = "UNSPEC",
2717  	[RTN_UNICAST] = "UNICAST",
2718  	[RTN_LOCAL] = "LOCAL",
2719  	[RTN_BROADCAST] = "BROADCAST",
2720  	[RTN_ANYCAST] = "ANYCAST",
2721  	[RTN_MULTICAST] = "MULTICAST",
2722  	[RTN_BLACKHOLE] = "BLACKHOLE",
2723  	[RTN_UNREACHABLE] = "UNREACHABLE",
2724  	[RTN_PROHIBIT] = "PROHIBIT",
2725  	[RTN_THROW] = "THROW",
2726  	[RTN_NAT] = "NAT",
2727  	[RTN_XRESOLVE] = "XRESOLVE",
2728  };
2729  
2730  static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
2731  {
2732  	if (t < __RTN_MAX && rtn_type_names[t])
2733  		return rtn_type_names[t];
2734  	snprintf(buf, len, "type %u", t);
2735  	return buf;
2736  }
2737  
2738  /* Pretty print the trie */
2739  static int fib_trie_seq_show(struct seq_file *seq, void *v)
2740  {
2741  	const struct fib_trie_iter *iter = seq->private;
2742  	struct key_vector *n = v;
2743  
2744  	if (IS_TRIE(node_parent_rcu(n)))
2745  		fib_table_print(seq, iter->tb);
2746  
2747  	if (IS_TNODE(n)) {
2748  		__be32 prf = htonl(n->key);
2749  
2750  		seq_indent(seq, iter->depth-1);
2751  		seq_printf(seq, "  +-- %pI4/%zu %u %u %u\n",
2752  			   &prf, KEYLENGTH - n->pos - n->bits, n->bits,
2753  			   tn_info(n)->full_children,
2754  			   tn_info(n)->empty_children);
2755  	} else {
2756  		__be32 val = htonl(n->key);
2757  		struct fib_alias *fa;
2758  
2759  		seq_indent(seq, iter->depth);
2760  		seq_printf(seq, "  |-- %pI4\n", &val);
2761  
2762  		hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) {
2763  			char buf1[32], buf2[32];
2764  
2765  			seq_indent(seq, iter->depth + 1);
2766  			seq_printf(seq, "  /%zu %s %s",
2767  				   KEYLENGTH - fa->fa_slen,
2768  				   rtn_scope(buf1, sizeof(buf1),
2769  					     fa->fa_info->fib_scope),
2770  				   rtn_type(buf2, sizeof(buf2),
2771  					    fa->fa_type));
2772  			if (fa->fa_tos)
2773  				seq_printf(seq, " tos=%d", fa->fa_tos);
2774  			seq_putc(seq, '\n');
2775  		}
2776  	}
2777  
2778  	return 0;
2779  }
2780  
2781  static const struct seq_operations fib_trie_seq_ops = {
2782  	.start  = fib_trie_seq_start,
2783  	.next   = fib_trie_seq_next,
2784  	.stop   = fib_trie_seq_stop,
2785  	.show   = fib_trie_seq_show,
2786  };
2787  
2788  struct fib_route_iter {
2789  	struct seq_net_private p;
2790  	struct fib_table *main_tb;
2791  	struct key_vector *tnode;
2792  	loff_t	pos;
2793  	t_key	key;
2794  };
2795  
2796  static struct key_vector *fib_route_get_idx(struct fib_route_iter *iter,
2797  					    loff_t pos)
2798  {
2799  	struct key_vector *l, **tp = &iter->tnode;
2800  	t_key key;
2801  
2802  	/* use cached location of previously found key */
2803  	if (iter->pos > 0 && pos >= iter->pos) {
2804  		key = iter->key;
2805  	} else {
2806  		iter->pos = 1;
2807  		key = 0;
2808  	}
2809  
2810  	pos -= iter->pos;
2811  
2812  	while ((l = leaf_walk_rcu(tp, key)) && (pos-- > 0)) {
2813  		key = l->key + 1;
2814  		iter->pos++;
2815  		l = NULL;
2816  
2817  		/* handle unlikely case of a key wrap */
2818  		if (!key)
2819  			break;
2820  	}
2821  
2822  	if (l)
2823  		iter->key = l->key;	/* remember it */
2824  	else
2825  		iter->pos = 0;		/* forget it */
2826  
2827  	return l;
2828  }
2829  
2830  static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
2831  	__acquires(RCU)
2832  {
2833  	struct fib_route_iter *iter = seq->private;
2834  	struct fib_table *tb;
2835  	struct trie *t;
2836  
2837  	rcu_read_lock();
2838  
2839  	tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
2840  	if (!tb)
2841  		return NULL;
2842  
2843  	iter->main_tb = tb;
2844  	t = (struct trie *)tb->tb_data;
2845  	iter->tnode = t->kv;
2846  
2847  	if (*pos != 0)
2848  		return fib_route_get_idx(iter, *pos);
2849  
2850  	iter->pos = 0;
2851  	iter->key = KEY_MAX;
2852  
2853  	return SEQ_START_TOKEN;
2854  }
2855  
2856  static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2857  {
2858  	struct fib_route_iter *iter = seq->private;
2859  	struct key_vector *l = NULL;
2860  	t_key key = iter->key + 1;
2861  
2862  	++*pos;
2863  
2864  	/* only allow key of 0 for start of sequence */
2865  	if ((v == SEQ_START_TOKEN) || key)
2866  		l = leaf_walk_rcu(&iter->tnode, key);
2867  
2868  	if (l) {
2869  		iter->key = l->key;
2870  		iter->pos++;
2871  	} else {
2872  		iter->pos = 0;
2873  	}
2874  
2875  	return l;
2876  }
2877  
2878  static void fib_route_seq_stop(struct seq_file *seq, void *v)
2879  	__releases(RCU)
2880  {
2881  	rcu_read_unlock();
2882  }
2883  
2884  static unsigned int fib_flag_trans(int type, __be32 mask, struct fib_info *fi)
2885  {
2886  	unsigned int flags = 0;
2887  
2888  	if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
2889  		flags = RTF_REJECT;
2890  	if (fi) {
2891  		const struct fib_nh_common *nhc = fib_info_nhc(fi, 0);
2892  
2893  		if (nhc->nhc_gw.ipv4)
2894  			flags |= RTF_GATEWAY;
2895  	}
2896  	if (mask == htonl(0xFFFFFFFF))
2897  		flags |= RTF_HOST;
2898  	flags |= RTF_UP;
2899  	return flags;
2900  }
2901  
2902  /*
2903   *	This outputs /proc/net/route.
2904   *	The format of the file is not supposed to be changed
2905   *	and needs to be same as fib_hash output to avoid breaking
2906   *	legacy utilities
2907   */
2908  static int fib_route_seq_show(struct seq_file *seq, void *v)
2909  {
2910  	struct fib_route_iter *iter = seq->private;
2911  	struct fib_table *tb = iter->main_tb;
2912  	struct fib_alias *fa;
2913  	struct key_vector *l = v;
2914  	__be32 prefix;
2915  
2916  	if (v == SEQ_START_TOKEN) {
2917  		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2918  			   "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2919  			   "\tWindow\tIRTT");
2920  		return 0;
2921  	}
2922  
2923  	prefix = htonl(l->key);
2924  
2925  	hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
2926  		struct fib_info *fi = fa->fa_info;
2927  		__be32 mask = inet_make_mask(KEYLENGTH - fa->fa_slen);
2928  		unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
2929  
2930  		if ((fa->fa_type == RTN_BROADCAST) ||
2931  		    (fa->fa_type == RTN_MULTICAST))
2932  			continue;
2933  
2934  		if (fa->tb_id != tb->tb_id)
2935  			continue;
2936  
2937  		seq_setwidth(seq, 127);
2938  
2939  		if (fi) {
2940  			struct fib_nh_common *nhc = fib_info_nhc(fi, 0);
2941  			__be32 gw = 0;
2942  
2943  			if (nhc->nhc_gw_family == AF_INET)
2944  				gw = nhc->nhc_gw.ipv4;
2945  
2946  			seq_printf(seq,
2947  				   "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
2948  				   "%d\t%08X\t%d\t%u\t%u",
2949  				   nhc->nhc_dev ? nhc->nhc_dev->name : "*",
2950  				   prefix, gw, flags, 0, 0,
2951  				   fi->fib_priority,
2952  				   mask,
2953  				   (fi->fib_advmss ?
2954  				    fi->fib_advmss + 40 : 0),
2955  				   fi->fib_window,
2956  				   fi->fib_rtt >> 3);
2957  		} else {
2958  			seq_printf(seq,
2959  				   "*\t%08X\t%08X\t%04X\t%d\t%u\t"
2960  				   "%d\t%08X\t%d\t%u\t%u",
2961  				   prefix, 0, flags, 0, 0, 0,
2962  				   mask, 0, 0, 0);
2963  		}
2964  		seq_pad(seq, '\n');
2965  	}
2966  
2967  	return 0;
2968  }
2969  
2970  static const struct seq_operations fib_route_seq_ops = {
2971  	.start  = fib_route_seq_start,
2972  	.next   = fib_route_seq_next,
2973  	.stop   = fib_route_seq_stop,
2974  	.show   = fib_route_seq_show,
2975  };
2976  
2977  int __net_init fib_proc_init(struct net *net)
2978  {
2979  	if (!proc_create_net("fib_trie", 0444, net->proc_net, &fib_trie_seq_ops,
2980  			sizeof(struct fib_trie_iter)))
2981  		goto out1;
2982  
2983  	if (!proc_create_net_single("fib_triestat", 0444, net->proc_net,
2984  			fib_triestat_seq_show, NULL))
2985  		goto out2;
2986  
2987  	if (!proc_create_net("route", 0444, net->proc_net, &fib_route_seq_ops,
2988  			sizeof(struct fib_route_iter)))
2989  		goto out3;
2990  
2991  	return 0;
2992  
2993  out3:
2994  	remove_proc_entry("fib_triestat", net->proc_net);
2995  out2:
2996  	remove_proc_entry("fib_trie", net->proc_net);
2997  out1:
2998  	return -ENOMEM;
2999  }
3000  
3001  void __net_exit fib_proc_exit(struct net *net)
3002  {
3003  	remove_proc_entry("fib_trie", net->proc_net);
3004  	remove_proc_entry("fib_triestat", net->proc_net);
3005  	remove_proc_entry("route", net->proc_net);
3006  }
3007  
3008  #endif /* CONFIG_PROC_FS */
3009