xref: /openbmc/linux/net/ipv4/fib_trie.c (revision 64347f78)
119baf839SRobert Olsson /*
219baf839SRobert Olsson  *   This program is free software; you can redistribute it and/or
319baf839SRobert Olsson  *   modify it under the terms of the GNU General Public License
419baf839SRobert Olsson  *   as published by the Free Software Foundation; either version
519baf839SRobert Olsson  *   2 of the License, or (at your option) any later version.
619baf839SRobert Olsson  *
719baf839SRobert Olsson  *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
819baf839SRobert Olsson  *     & Swedish University of Agricultural Sciences.
919baf839SRobert Olsson  *
1019baf839SRobert Olsson  *   Jens Laas <jens.laas@data.slu.se> Swedish University of
1119baf839SRobert Olsson  *     Agricultural Sciences.
1219baf839SRobert Olsson  *
1319baf839SRobert Olsson  *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
1419baf839SRobert Olsson  *
1519baf839SRobert Olsson  * This work is based on the LPC-trie which is originally descibed in:
1619baf839SRobert Olsson  *
1719baf839SRobert Olsson  * An experimental study of compression methods for dynamic tries
1819baf839SRobert Olsson  * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
1919baf839SRobert Olsson  * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
2019baf839SRobert Olsson  *
2119baf839SRobert Olsson  *
2219baf839SRobert Olsson  * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
2319baf839SRobert Olsson  * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
2419baf839SRobert Olsson  *
2519baf839SRobert Olsson  * Version:	$Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $
2619baf839SRobert Olsson  *
2719baf839SRobert Olsson  *
2819baf839SRobert Olsson  * Code from fib_hash has been reused which includes the following header:
2919baf839SRobert Olsson  *
3019baf839SRobert Olsson  *
3119baf839SRobert Olsson  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3219baf839SRobert Olsson  *		operating system.  INET is implemented using the  BSD Socket
3319baf839SRobert Olsson  *		interface as the means of communication with the user level.
3419baf839SRobert Olsson  *
3519baf839SRobert Olsson  *		IPv4 FIB: lookup engine and maintenance routines.
3619baf839SRobert Olsson  *
3719baf839SRobert Olsson  *
3819baf839SRobert Olsson  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
3919baf839SRobert Olsson  *
4019baf839SRobert Olsson  *		This program is free software; you can redistribute it and/or
4119baf839SRobert Olsson  *		modify it under the terms of the GNU General Public License
4219baf839SRobert Olsson  *		as published by the Free Software Foundation; either version
4319baf839SRobert Olsson  *		2 of the License, or (at your option) any later version.
44fd966255SRobert Olsson  *
45fd966255SRobert Olsson  * Substantial contributions to this work comes from:
46fd966255SRobert Olsson  *
47fd966255SRobert Olsson  *		David S. Miller, <davem@davemloft.net>
48fd966255SRobert Olsson  *		Stephen Hemminger <shemminger@osdl.org>
49fd966255SRobert Olsson  *		Paul E. McKenney <paulmck@us.ibm.com>
50fd966255SRobert Olsson  *		Patrick McHardy <kaber@trash.net>
5119baf839SRobert Olsson  */
5219baf839SRobert Olsson 
5305eee48cSRobert Olsson #define VERSION "0.408"
5419baf839SRobert Olsson 
5519baf839SRobert Olsson #include <asm/uaccess.h>
5619baf839SRobert Olsson #include <asm/system.h>
571977f032SJiri Slaby #include <linux/bitops.h>
5819baf839SRobert Olsson #include <linux/types.h>
5919baf839SRobert Olsson #include <linux/kernel.h>
6019baf839SRobert Olsson #include <linux/mm.h>
6119baf839SRobert Olsson #include <linux/string.h>
6219baf839SRobert Olsson #include <linux/socket.h>
6319baf839SRobert Olsson #include <linux/sockios.h>
6419baf839SRobert Olsson #include <linux/errno.h>
6519baf839SRobert Olsson #include <linux/in.h>
6619baf839SRobert Olsson #include <linux/inet.h>
67cd8787abSStephen Hemminger #include <linux/inetdevice.h>
6819baf839SRobert Olsson #include <linux/netdevice.h>
6919baf839SRobert Olsson #include <linux/if_arp.h>
7019baf839SRobert Olsson #include <linux/proc_fs.h>
712373ce1cSRobert Olsson #include <linux/rcupdate.h>
7219baf839SRobert Olsson #include <linux/skbuff.h>
7319baf839SRobert Olsson #include <linux/netlink.h>
7419baf839SRobert Olsson #include <linux/init.h>
7519baf839SRobert Olsson #include <linux/list.h>
76457c4cbcSEric W. Biederman #include <net/net_namespace.h>
7719baf839SRobert Olsson #include <net/ip.h>
7819baf839SRobert Olsson #include <net/protocol.h>
7919baf839SRobert Olsson #include <net/route.h>
8019baf839SRobert Olsson #include <net/tcp.h>
8119baf839SRobert Olsson #include <net/sock.h>
8219baf839SRobert Olsson #include <net/ip_fib.h>
8319baf839SRobert Olsson #include "fib_lookup.h"
8419baf839SRobert Olsson 
8506ef921dSRobert Olsson #define MAX_STAT_DEPTH 32
8619baf839SRobert Olsson 
8719baf839SRobert Olsson #define KEYLENGTH (8*sizeof(t_key))
8819baf839SRobert Olsson 
8919baf839SRobert Olsson typedef unsigned int t_key;
9019baf839SRobert Olsson 
9119baf839SRobert Olsson #define T_TNODE 0
9219baf839SRobert Olsson #define T_LEAF  1
9319baf839SRobert Olsson #define NODE_TYPE_MASK	0x1UL
942373ce1cSRobert Olsson #define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
952373ce1cSRobert Olsson 
9691b9a277SOlof Johansson #define IS_TNODE(n) (!(n->parent & T_LEAF))
9791b9a277SOlof Johansson #define IS_LEAF(n) (n->parent & T_LEAF)
9819baf839SRobert Olsson 
9919baf839SRobert Olsson struct node {
10091b9a277SOlof Johansson 	unsigned long parent;
1018d965444SEric Dumazet 	t_key key;
10219baf839SRobert Olsson };
10319baf839SRobert Olsson 
10419baf839SRobert Olsson struct leaf {
10591b9a277SOlof Johansson 	unsigned long parent;
1068d965444SEric Dumazet 	t_key key;
10719baf839SRobert Olsson 	struct hlist_head list;
1082373ce1cSRobert Olsson 	struct rcu_head rcu;
10919baf839SRobert Olsson };
11019baf839SRobert Olsson 
11119baf839SRobert Olsson struct leaf_info {
11219baf839SRobert Olsson 	struct hlist_node hlist;
1132373ce1cSRobert Olsson 	struct rcu_head rcu;
11419baf839SRobert Olsson 	int plen;
11519baf839SRobert Olsson 	struct list_head falh;
11619baf839SRobert Olsson };
11719baf839SRobert Olsson 
11819baf839SRobert Olsson struct tnode {
11991b9a277SOlof Johansson 	unsigned long parent;
1208d965444SEric Dumazet 	t_key key;
121112d8cfcSEric Dumazet 	unsigned char pos;		/* 2log(KEYLENGTH) bits needed */
122112d8cfcSEric Dumazet 	unsigned char bits;		/* 2log(KEYLENGTH) bits needed */
1238d965444SEric Dumazet 	unsigned int full_children;	/* KEYLENGTH bits needed */
1248d965444SEric Dumazet 	unsigned int empty_children;	/* KEYLENGTH bits needed */
1252373ce1cSRobert Olsson 	struct rcu_head rcu;
12619baf839SRobert Olsson 	struct node *child[0];
12719baf839SRobert Olsson };
12819baf839SRobert Olsson 
12919baf839SRobert Olsson #ifdef CONFIG_IP_FIB_TRIE_STATS
13019baf839SRobert Olsson struct trie_use_stats {
13119baf839SRobert Olsson 	unsigned int gets;
13219baf839SRobert Olsson 	unsigned int backtrack;
13319baf839SRobert Olsson 	unsigned int semantic_match_passed;
13419baf839SRobert Olsson 	unsigned int semantic_match_miss;
13519baf839SRobert Olsson 	unsigned int null_node_hit;
1362f36895aSRobert Olsson 	unsigned int resize_node_skipped;
13719baf839SRobert Olsson };
13819baf839SRobert Olsson #endif
13919baf839SRobert Olsson 
14019baf839SRobert Olsson struct trie_stat {
14119baf839SRobert Olsson 	unsigned int totdepth;
14219baf839SRobert Olsson 	unsigned int maxdepth;
14319baf839SRobert Olsson 	unsigned int tnodes;
14419baf839SRobert Olsson 	unsigned int leaves;
14519baf839SRobert Olsson 	unsigned int nullpointers;
14693672292SStephen Hemminger 	unsigned int prefixes;
14706ef921dSRobert Olsson 	unsigned int nodesizes[MAX_STAT_DEPTH];
14819baf839SRobert Olsson };
14919baf839SRobert Olsson 
15019baf839SRobert Olsson struct trie {
15119baf839SRobert Olsson 	struct node *trie;
15219baf839SRobert Olsson #ifdef CONFIG_IP_FIB_TRIE_STATS
15319baf839SRobert Olsson 	struct trie_use_stats stats;
15419baf839SRobert Olsson #endif
15519baf839SRobert Olsson };
15619baf839SRobert Olsson 
15719baf839SRobert Olsson static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
158a07f5f50SStephen Hemminger static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
159a07f5f50SStephen Hemminger 				  int wasfull);
16019baf839SRobert Olsson static struct node *resize(struct trie *t, struct tnode *tn);
1612f80b3c8SRobert Olsson static struct tnode *inflate(struct trie *t, struct tnode *tn);
1622f80b3c8SRobert Olsson static struct tnode *halve(struct trie *t, struct tnode *tn);
16319baf839SRobert Olsson static void tnode_free(struct tnode *tn);
16419baf839SRobert Olsson 
165e18b890bSChristoph Lameter static struct kmem_cache *fn_alias_kmem __read_mostly;
166bc3c8c1eSStephen Hemminger static struct kmem_cache *trie_leaf_kmem __read_mostly;
16719baf839SRobert Olsson 
16806801916SStephen Hemminger static inline struct tnode *node_parent(struct node *node)
16906801916SStephen Hemminger {
170b59cfbf7SEric Dumazet 	return (struct tnode *)(node->parent & ~NODE_TYPE_MASK);
171b59cfbf7SEric Dumazet }
17206801916SStephen Hemminger 
173b59cfbf7SEric Dumazet static inline struct tnode *node_parent_rcu(struct node *node)
174b59cfbf7SEric Dumazet {
175b59cfbf7SEric Dumazet 	struct tnode *ret = node_parent(node);
176b59cfbf7SEric Dumazet 
17706801916SStephen Hemminger 	return rcu_dereference(ret);
17806801916SStephen Hemminger }
17906801916SStephen Hemminger 
18006801916SStephen Hemminger static inline void node_set_parent(struct node *node, struct tnode *ptr)
18106801916SStephen Hemminger {
18206801916SStephen Hemminger 	rcu_assign_pointer(node->parent,
18306801916SStephen Hemminger 			   (unsigned long)ptr | NODE_TYPE(node));
18406801916SStephen Hemminger }
1852373ce1cSRobert Olsson 
186b59cfbf7SEric Dumazet static inline struct node *tnode_get_child(struct tnode *tn, unsigned int i)
18719baf839SRobert Olsson {
188b59cfbf7SEric Dumazet 	BUG_ON(i >= 1U << tn->bits);
18919baf839SRobert Olsson 
190b59cfbf7SEric Dumazet 	return tn->child[i];
191b59cfbf7SEric Dumazet }
192b59cfbf7SEric Dumazet 
193b59cfbf7SEric Dumazet static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i)
194b59cfbf7SEric Dumazet {
195b59cfbf7SEric Dumazet 	struct node *ret = tnode_get_child(tn, i);
196b59cfbf7SEric Dumazet 
197b59cfbf7SEric Dumazet 	return rcu_dereference(ret);
19819baf839SRobert Olsson }
19919baf839SRobert Olsson 
200bb435b8dSStephen Hemmigner static inline int tnode_child_length(const struct tnode *tn)
20119baf839SRobert Olsson {
20219baf839SRobert Olsson 	return 1 << tn->bits;
20319baf839SRobert Olsson }
20419baf839SRobert Olsson 
205ab66b4a7SStephen Hemminger static inline t_key mask_pfx(t_key k, unsigned short l)
206ab66b4a7SStephen Hemminger {
207ab66b4a7SStephen Hemminger 	return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
208ab66b4a7SStephen Hemminger }
209ab66b4a7SStephen Hemminger 
21019baf839SRobert Olsson static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
21119baf839SRobert Olsson {
21219baf839SRobert Olsson 	if (offset < KEYLENGTH)
21319baf839SRobert Olsson 		return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
21419baf839SRobert Olsson 	else
21519baf839SRobert Olsson 		return 0;
21619baf839SRobert Olsson }
21719baf839SRobert Olsson 
21819baf839SRobert Olsson static inline int tkey_equals(t_key a, t_key b)
21919baf839SRobert Olsson {
22019baf839SRobert Olsson 	return a == b;
22119baf839SRobert Olsson }
22219baf839SRobert Olsson 
22319baf839SRobert Olsson static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
22419baf839SRobert Olsson {
22519baf839SRobert Olsson 	if (bits == 0 || offset >= KEYLENGTH)
22619baf839SRobert Olsson 		return 1;
22719baf839SRobert Olsson 	bits = bits > KEYLENGTH ? KEYLENGTH : bits;
22819baf839SRobert Olsson 	return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
22919baf839SRobert Olsson }
23019baf839SRobert Olsson 
23119baf839SRobert Olsson static inline int tkey_mismatch(t_key a, int offset, t_key b)
23219baf839SRobert Olsson {
23319baf839SRobert Olsson 	t_key diff = a ^ b;
23419baf839SRobert Olsson 	int i = offset;
23519baf839SRobert Olsson 
23619baf839SRobert Olsson 	if (!diff)
23719baf839SRobert Olsson 		return 0;
23819baf839SRobert Olsson 	while ((diff << i) >> (KEYLENGTH-1) == 0)
23919baf839SRobert Olsson 		i++;
24019baf839SRobert Olsson 	return i;
24119baf839SRobert Olsson }
24219baf839SRobert Olsson 
24319baf839SRobert Olsson /*
24419baf839SRobert Olsson   To understand this stuff, an understanding of keys and all their bits is
24519baf839SRobert Olsson   necessary. Every node in the trie has a key associated with it, but not
24619baf839SRobert Olsson   all of the bits in that key are significant.
24719baf839SRobert Olsson 
24819baf839SRobert Olsson   Consider a node 'n' and its parent 'tp'.
24919baf839SRobert Olsson 
25019baf839SRobert Olsson   If n is a leaf, every bit in its key is significant. Its presence is
251772cb712SRobert Olsson   necessitated by path compression, since during a tree traversal (when
25219baf839SRobert Olsson   searching for a leaf - unless we are doing an insertion) we will completely
25319baf839SRobert Olsson   ignore all skipped bits we encounter. Thus we need to verify, at the end of
25419baf839SRobert Olsson   a potentially successful search, that we have indeed been walking the
25519baf839SRobert Olsson   correct key path.
25619baf839SRobert Olsson 
25719baf839SRobert Olsson   Note that we can never "miss" the correct key in the tree if present by
25819baf839SRobert Olsson   following the wrong path. Path compression ensures that segments of the key
25919baf839SRobert Olsson   that are the same for all keys with a given prefix are skipped, but the
26019baf839SRobert Olsson   skipped part *is* identical for each node in the subtrie below the skipped
26119baf839SRobert Olsson   bit! trie_insert() in this implementation takes care of that - note the
26219baf839SRobert Olsson   call to tkey_sub_equals() in trie_insert().
26319baf839SRobert Olsson 
26419baf839SRobert Olsson   if n is an internal node - a 'tnode' here, the various parts of its key
26519baf839SRobert Olsson   have many different meanings.
26619baf839SRobert Olsson 
26719baf839SRobert Olsson   Example:
26819baf839SRobert Olsson   _________________________________________________________________
26919baf839SRobert Olsson   | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
27019baf839SRobert Olsson   -----------------------------------------------------------------
27119baf839SRobert Olsson     0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
27219baf839SRobert Olsson 
27319baf839SRobert Olsson   _________________________________________________________________
27419baf839SRobert Olsson   | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
27519baf839SRobert Olsson   -----------------------------------------------------------------
27619baf839SRobert Olsson    16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
27719baf839SRobert Olsson 
27819baf839SRobert Olsson   tp->pos = 7
27919baf839SRobert Olsson   tp->bits = 3
28019baf839SRobert Olsson   n->pos = 15
28119baf839SRobert Olsson   n->bits = 4
28219baf839SRobert Olsson 
28319baf839SRobert Olsson   First, let's just ignore the bits that come before the parent tp, that is
28419baf839SRobert Olsson   the bits from 0 to (tp->pos-1). They are *known* but at this point we do
28519baf839SRobert Olsson   not use them for anything.
28619baf839SRobert Olsson 
28719baf839SRobert Olsson   The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
28819baf839SRobert Olsson   index into the parent's child array. That is, they will be used to find
28919baf839SRobert Olsson   'n' among tp's children.
29019baf839SRobert Olsson 
29119baf839SRobert Olsson   The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
29219baf839SRobert Olsson   for the node n.
29319baf839SRobert Olsson 
29419baf839SRobert Olsson   All the bits we have seen so far are significant to the node n. The rest
29519baf839SRobert Olsson   of the bits are really not needed or indeed known in n->key.
29619baf839SRobert Olsson 
29719baf839SRobert Olsson   The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
29819baf839SRobert Olsson   n's child array, and will of course be different for each child.
29919baf839SRobert Olsson 
300c877efb2SStephen Hemminger 
30119baf839SRobert Olsson   The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
30219baf839SRobert Olsson   at this point.
30319baf839SRobert Olsson 
30419baf839SRobert Olsson */
30519baf839SRobert Olsson 
3060c7770c7SStephen Hemminger static inline void check_tnode(const struct tnode *tn)
30719baf839SRobert Olsson {
3080c7770c7SStephen Hemminger 	WARN_ON(tn && tn->pos+tn->bits > 32);
30919baf839SRobert Olsson }
31019baf839SRobert Olsson 
311f5026fabSDenis V. Lunev static const int halve_threshold = 25;
312f5026fabSDenis V. Lunev static const int inflate_threshold = 50;
313f5026fabSDenis V. Lunev static const int halve_threshold_root = 8;
314f5026fabSDenis V. Lunev static const int inflate_threshold_root = 15;
31519baf839SRobert Olsson 
3162373ce1cSRobert Olsson 
3172373ce1cSRobert Olsson static void __alias_free_mem(struct rcu_head *head)
3182373ce1cSRobert Olsson {
3192373ce1cSRobert Olsson 	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
3202373ce1cSRobert Olsson 	kmem_cache_free(fn_alias_kmem, fa);
3212373ce1cSRobert Olsson }
3222373ce1cSRobert Olsson 
3232373ce1cSRobert Olsson static inline void alias_free_mem_rcu(struct fib_alias *fa)
3242373ce1cSRobert Olsson {
3252373ce1cSRobert Olsson 	call_rcu(&fa->rcu, __alias_free_mem);
3262373ce1cSRobert Olsson }
3272373ce1cSRobert Olsson 
3282373ce1cSRobert Olsson static void __leaf_free_rcu(struct rcu_head *head)
3292373ce1cSRobert Olsson {
330bc3c8c1eSStephen Hemminger 	struct leaf *l = container_of(head, struct leaf, rcu);
331bc3c8c1eSStephen Hemminger 	kmem_cache_free(trie_leaf_kmem, l);
3322373ce1cSRobert Olsson }
3332373ce1cSRobert Olsson 
3342373ce1cSRobert Olsson static void __leaf_info_free_rcu(struct rcu_head *head)
3352373ce1cSRobert Olsson {
3362373ce1cSRobert Olsson 	kfree(container_of(head, struct leaf_info, rcu));
3372373ce1cSRobert Olsson }
3382373ce1cSRobert Olsson 
3392373ce1cSRobert Olsson static inline void free_leaf_info(struct leaf_info *leaf)
3402373ce1cSRobert Olsson {
3412373ce1cSRobert Olsson 	call_rcu(&leaf->rcu, __leaf_info_free_rcu);
3422373ce1cSRobert Olsson }
3432373ce1cSRobert Olsson 
3448d965444SEric Dumazet static struct tnode *tnode_alloc(size_t size)
3452373ce1cSRobert Olsson {
3462373ce1cSRobert Olsson 	struct page *pages;
3472373ce1cSRobert Olsson 
3482373ce1cSRobert Olsson 	if (size <= PAGE_SIZE)
3498d965444SEric Dumazet 		return kzalloc(size, GFP_KERNEL);
3502373ce1cSRobert Olsson 
3512373ce1cSRobert Olsson 	pages = alloc_pages(GFP_KERNEL|__GFP_ZERO, get_order(size));
3522373ce1cSRobert Olsson 	if (!pages)
3532373ce1cSRobert Olsson 		return NULL;
3542373ce1cSRobert Olsson 
3552373ce1cSRobert Olsson 	return page_address(pages);
3562373ce1cSRobert Olsson }
3572373ce1cSRobert Olsson 
3582373ce1cSRobert Olsson static void __tnode_free_rcu(struct rcu_head *head)
3592373ce1cSRobert Olsson {
3602373ce1cSRobert Olsson 	struct tnode *tn = container_of(head, struct tnode, rcu);
3618d965444SEric Dumazet 	size_t size = sizeof(struct tnode) +
3628d965444SEric Dumazet 		      (sizeof(struct node *) << tn->bits);
3632373ce1cSRobert Olsson 
3642373ce1cSRobert Olsson 	if (size <= PAGE_SIZE)
3652373ce1cSRobert Olsson 		kfree(tn);
3662373ce1cSRobert Olsson 	else
3672373ce1cSRobert Olsson 		free_pages((unsigned long)tn, get_order(size));
3682373ce1cSRobert Olsson }
3692373ce1cSRobert Olsson 
3702373ce1cSRobert Olsson static inline void tnode_free(struct tnode *tn)
3712373ce1cSRobert Olsson {
372550e29bcSRobert Olsson 	if (IS_LEAF(tn)) {
373550e29bcSRobert Olsson 		struct leaf *l = (struct leaf *) tn;
374550e29bcSRobert Olsson 		call_rcu_bh(&l->rcu, __leaf_free_rcu);
375132adf54SStephen Hemminger 	} else
3762373ce1cSRobert Olsson 		call_rcu(&tn->rcu, __tnode_free_rcu);
3772373ce1cSRobert Olsson }
3782373ce1cSRobert Olsson 
37919baf839SRobert Olsson static struct leaf *leaf_new(void)
38019baf839SRobert Olsson {
381bc3c8c1eSStephen Hemminger 	struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
38219baf839SRobert Olsson 	if (l) {
3832373ce1cSRobert Olsson 		l->parent = T_LEAF;
38419baf839SRobert Olsson 		INIT_HLIST_HEAD(&l->list);
38519baf839SRobert Olsson 	}
38619baf839SRobert Olsson 	return l;
38719baf839SRobert Olsson }
38819baf839SRobert Olsson 
38919baf839SRobert Olsson static struct leaf_info *leaf_info_new(int plen)
39019baf839SRobert Olsson {
39119baf839SRobert Olsson 	struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
3922373ce1cSRobert Olsson 	if (li) {
39319baf839SRobert Olsson 		li->plen = plen;
39419baf839SRobert Olsson 		INIT_LIST_HEAD(&li->falh);
3952373ce1cSRobert Olsson 	}
39619baf839SRobert Olsson 	return li;
39719baf839SRobert Olsson }
39819baf839SRobert Olsson 
39919baf839SRobert Olsson static struct tnode *tnode_new(t_key key, int pos, int bits)
40019baf839SRobert Olsson {
4018d965444SEric Dumazet 	size_t sz = sizeof(struct tnode) + (sizeof(struct node *) << bits);
402f0e36f8cSPatrick McHardy 	struct tnode *tn = tnode_alloc(sz);
40319baf839SRobert Olsson 
40419baf839SRobert Olsson 	if (tn) {
4052373ce1cSRobert Olsson 		tn->parent = T_TNODE;
40619baf839SRobert Olsson 		tn->pos = pos;
40719baf839SRobert Olsson 		tn->bits = bits;
40819baf839SRobert Olsson 		tn->key = key;
40919baf839SRobert Olsson 		tn->full_children = 0;
41019baf839SRobert Olsson 		tn->empty_children = 1<<bits;
41119baf839SRobert Olsson 	}
412c877efb2SStephen Hemminger 
4138d965444SEric Dumazet 	pr_debug("AT %p s=%u %lu\n", tn, (unsigned int) sizeof(struct tnode),
4148d965444SEric Dumazet 		 (unsigned long) (sizeof(struct node) << bits));
41519baf839SRobert Olsson 	return tn;
41619baf839SRobert Olsson }
41719baf839SRobert Olsson 
41819baf839SRobert Olsson /*
41919baf839SRobert Olsson  * Check whether a tnode 'n' is "full", i.e. it is an internal node
42019baf839SRobert Olsson  * and no bits are skipped. See discussion in dyntree paper p. 6
42119baf839SRobert Olsson  */
42219baf839SRobert Olsson 
423bb435b8dSStephen Hemmigner static inline int tnode_full(const struct tnode *tn, const struct node *n)
42419baf839SRobert Olsson {
42519baf839SRobert Olsson 	if (n == NULL || IS_LEAF(n))
42619baf839SRobert Olsson 		return 0;
42719baf839SRobert Olsson 
42819baf839SRobert Olsson 	return ((struct tnode *) n)->pos == tn->pos + tn->bits;
42919baf839SRobert Olsson }
43019baf839SRobert Olsson 
431a07f5f50SStephen Hemminger static inline void put_child(struct trie *t, struct tnode *tn, int i,
432a07f5f50SStephen Hemminger 			     struct node *n)
43319baf839SRobert Olsson {
43419baf839SRobert Olsson 	tnode_put_child_reorg(tn, i, n, -1);
43519baf839SRobert Olsson }
43619baf839SRobert Olsson 
43719baf839SRobert Olsson  /*
43819baf839SRobert Olsson   * Add a child at position i overwriting the old value.
43919baf839SRobert Olsson   * Update the value of full_children and empty_children.
44019baf839SRobert Olsson   */
44119baf839SRobert Olsson 
442a07f5f50SStephen Hemminger static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
443a07f5f50SStephen Hemminger 				  int wasfull)
44419baf839SRobert Olsson {
4452373ce1cSRobert Olsson 	struct node *chi = tn->child[i];
44619baf839SRobert Olsson 	int isfull;
44719baf839SRobert Olsson 
4480c7770c7SStephen Hemminger 	BUG_ON(i >= 1<<tn->bits);
4490c7770c7SStephen Hemminger 
45019baf839SRobert Olsson 
45119baf839SRobert Olsson 	/* update emptyChildren */
45219baf839SRobert Olsson 	if (n == NULL && chi != NULL)
45319baf839SRobert Olsson 		tn->empty_children++;
45419baf839SRobert Olsson 	else if (n != NULL && chi == NULL)
45519baf839SRobert Olsson 		tn->empty_children--;
45619baf839SRobert Olsson 
45719baf839SRobert Olsson 	/* update fullChildren */
45819baf839SRobert Olsson 	if (wasfull == -1)
45919baf839SRobert Olsson 		wasfull = tnode_full(tn, chi);
46019baf839SRobert Olsson 
46119baf839SRobert Olsson 	isfull = tnode_full(tn, n);
46219baf839SRobert Olsson 	if (wasfull && !isfull)
46319baf839SRobert Olsson 		tn->full_children--;
46419baf839SRobert Olsson 	else if (!wasfull && isfull)
46519baf839SRobert Olsson 		tn->full_children++;
46691b9a277SOlof Johansson 
46719baf839SRobert Olsson 	if (n)
46806801916SStephen Hemminger 		node_set_parent(n, tn);
46919baf839SRobert Olsson 
4702373ce1cSRobert Olsson 	rcu_assign_pointer(tn->child[i], n);
47119baf839SRobert Olsson }
47219baf839SRobert Olsson 
47319baf839SRobert Olsson static struct node *resize(struct trie *t, struct tnode *tn)
47419baf839SRobert Olsson {
47519baf839SRobert Olsson 	int i;
4762f36895aSRobert Olsson 	int err = 0;
4772f80b3c8SRobert Olsson 	struct tnode *old_tn;
478e6308be8SRobert Olsson 	int inflate_threshold_use;
479e6308be8SRobert Olsson 	int halve_threshold_use;
48005eee48cSRobert Olsson 	int max_resize;
48119baf839SRobert Olsson 
48219baf839SRobert Olsson 	if (!tn)
48319baf839SRobert Olsson 		return NULL;
48419baf839SRobert Olsson 
4850c7770c7SStephen Hemminger 	pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
48619baf839SRobert Olsson 		 tn, inflate_threshold, halve_threshold);
48719baf839SRobert Olsson 
48819baf839SRobert Olsson 	/* No children */
48919baf839SRobert Olsson 	if (tn->empty_children == tnode_child_length(tn)) {
49019baf839SRobert Olsson 		tnode_free(tn);
49119baf839SRobert Olsson 		return NULL;
49219baf839SRobert Olsson 	}
49319baf839SRobert Olsson 	/* One child */
49419baf839SRobert Olsson 	if (tn->empty_children == tnode_child_length(tn) - 1)
49519baf839SRobert Olsson 		for (i = 0; i < tnode_child_length(tn); i++) {
49691b9a277SOlof Johansson 			struct node *n;
49719baf839SRobert Olsson 
49891b9a277SOlof Johansson 			n = tn->child[i];
4992373ce1cSRobert Olsson 			if (!n)
50091b9a277SOlof Johansson 				continue;
50119baf839SRobert Olsson 
50219baf839SRobert Olsson 			/* compress one level */
50306801916SStephen Hemminger 			node_set_parent(n, NULL);
50419baf839SRobert Olsson 			tnode_free(tn);
50519baf839SRobert Olsson 			return n;
50619baf839SRobert Olsson 		}
50719baf839SRobert Olsson 	/*
50819baf839SRobert Olsson 	 * Double as long as the resulting node has a number of
50919baf839SRobert Olsson 	 * nonempty nodes that are above the threshold.
51019baf839SRobert Olsson 	 */
51119baf839SRobert Olsson 
51219baf839SRobert Olsson 	/*
51319baf839SRobert Olsson 	 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
51419baf839SRobert Olsson 	 * the Helsinki University of Technology and Matti Tikkanen of Nokia
51519baf839SRobert Olsson 	 * Telecommunications, page 6:
51619baf839SRobert Olsson 	 * "A node is doubled if the ratio of non-empty children to all
51719baf839SRobert Olsson 	 * children in the *doubled* node is at least 'high'."
51819baf839SRobert Olsson 	 *
51919baf839SRobert Olsson 	 * 'high' in this instance is the variable 'inflate_threshold'. It
52019baf839SRobert Olsson 	 * is expressed as a percentage, so we multiply it with
52119baf839SRobert Olsson 	 * tnode_child_length() and instead of multiplying by 2 (since the
52219baf839SRobert Olsson 	 * child array will be doubled by inflate()) and multiplying
52319baf839SRobert Olsson 	 * the left-hand side by 100 (to handle the percentage thing) we
52419baf839SRobert Olsson 	 * multiply the left-hand side by 50.
52519baf839SRobert Olsson 	 *
52619baf839SRobert Olsson 	 * The left-hand side may look a bit weird: tnode_child_length(tn)
52719baf839SRobert Olsson 	 * - tn->empty_children is of course the number of non-null children
52819baf839SRobert Olsson 	 * in the current node. tn->full_children is the number of "full"
52919baf839SRobert Olsson 	 * children, that is non-null tnodes with a skip value of 0.
53019baf839SRobert Olsson 	 * All of those will be doubled in the resulting inflated tnode, so
53119baf839SRobert Olsson 	 * we just count them one extra time here.
53219baf839SRobert Olsson 	 *
53319baf839SRobert Olsson 	 * A clearer way to write this would be:
53419baf839SRobert Olsson 	 *
53519baf839SRobert Olsson 	 * to_be_doubled = tn->full_children;
53619baf839SRobert Olsson 	 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
53719baf839SRobert Olsson 	 *     tn->full_children;
53819baf839SRobert Olsson 	 *
53919baf839SRobert Olsson 	 * new_child_length = tnode_child_length(tn) * 2;
54019baf839SRobert Olsson 	 *
54119baf839SRobert Olsson 	 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
54219baf839SRobert Olsson 	 *      new_child_length;
54319baf839SRobert Olsson 	 * if (new_fill_factor >= inflate_threshold)
54419baf839SRobert Olsson 	 *
54519baf839SRobert Olsson 	 * ...and so on, tho it would mess up the while () loop.
54619baf839SRobert Olsson 	 *
54719baf839SRobert Olsson 	 * anyway,
54819baf839SRobert Olsson 	 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
54919baf839SRobert Olsson 	 *      inflate_threshold
55019baf839SRobert Olsson 	 *
55119baf839SRobert Olsson 	 * avoid a division:
55219baf839SRobert Olsson 	 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
55319baf839SRobert Olsson 	 *      inflate_threshold * new_child_length
55419baf839SRobert Olsson 	 *
55519baf839SRobert Olsson 	 * expand not_to_be_doubled and to_be_doubled, and shorten:
55619baf839SRobert Olsson 	 * 100 * (tnode_child_length(tn) - tn->empty_children +
55719baf839SRobert Olsson 	 *    tn->full_children) >= inflate_threshold * new_child_length
55819baf839SRobert Olsson 	 *
55919baf839SRobert Olsson 	 * expand new_child_length:
56019baf839SRobert Olsson 	 * 100 * (tnode_child_length(tn) - tn->empty_children +
56119baf839SRobert Olsson 	 *    tn->full_children) >=
56219baf839SRobert Olsson 	 *      inflate_threshold * tnode_child_length(tn) * 2
56319baf839SRobert Olsson 	 *
56419baf839SRobert Olsson 	 * shorten again:
56519baf839SRobert Olsson 	 * 50 * (tn->full_children + tnode_child_length(tn) -
56619baf839SRobert Olsson 	 *    tn->empty_children) >= inflate_threshold *
56719baf839SRobert Olsson 	 *    tnode_child_length(tn)
56819baf839SRobert Olsson 	 *
56919baf839SRobert Olsson 	 */
57019baf839SRobert Olsson 
57119baf839SRobert Olsson 	check_tnode(tn);
57219baf839SRobert Olsson 
573e6308be8SRobert Olsson 	/* Keep root node larger  */
574e6308be8SRobert Olsson 
575e6308be8SRobert Olsson 	if (!tn->parent)
576e6308be8SRobert Olsson 		inflate_threshold_use = inflate_threshold_root;
577e6308be8SRobert Olsson 	else
578e6308be8SRobert Olsson 		inflate_threshold_use = inflate_threshold;
579e6308be8SRobert Olsson 
5802f36895aSRobert Olsson 	err = 0;
58105eee48cSRobert Olsson 	max_resize = 10;
58205eee48cSRobert Olsson 	while ((tn->full_children > 0 &&  max_resize-- &&
583a07f5f50SStephen Hemminger 		50 * (tn->full_children + tnode_child_length(tn)
584a07f5f50SStephen Hemminger 		      - tn->empty_children)
585a07f5f50SStephen Hemminger 		>= inflate_threshold_use * tnode_child_length(tn))) {
58619baf839SRobert Olsson 
5872f80b3c8SRobert Olsson 		old_tn = tn;
5882f80b3c8SRobert Olsson 		tn = inflate(t, tn);
589a07f5f50SStephen Hemminger 
5902f80b3c8SRobert Olsson 		if (IS_ERR(tn)) {
5912f80b3c8SRobert Olsson 			tn = old_tn;
5922f36895aSRobert Olsson #ifdef CONFIG_IP_FIB_TRIE_STATS
5932f36895aSRobert Olsson 			t->stats.resize_node_skipped++;
5942f36895aSRobert Olsson #endif
5952f36895aSRobert Olsson 			break;
5962f36895aSRobert Olsson 		}
59719baf839SRobert Olsson 	}
59819baf839SRobert Olsson 
59905eee48cSRobert Olsson 	if (max_resize < 0) {
60005eee48cSRobert Olsson 		if (!tn->parent)
601a07f5f50SStephen Hemminger 			pr_warning("Fix inflate_threshold_root."
602a07f5f50SStephen Hemminger 				   " Now=%d size=%d bits\n",
60305eee48cSRobert Olsson 				   inflate_threshold_root, tn->bits);
60405eee48cSRobert Olsson 		else
605a07f5f50SStephen Hemminger 			pr_warning("Fix inflate_threshold."
606a07f5f50SStephen Hemminger 				   " Now=%d size=%d bits\n",
60705eee48cSRobert Olsson 				   inflate_threshold, tn->bits);
60805eee48cSRobert Olsson 	}
60905eee48cSRobert Olsson 
61019baf839SRobert Olsson 	check_tnode(tn);
61119baf839SRobert Olsson 
61219baf839SRobert Olsson 	/*
61319baf839SRobert Olsson 	 * Halve as long as the number of empty children in this
61419baf839SRobert Olsson 	 * node is above threshold.
61519baf839SRobert Olsson 	 */
6162f36895aSRobert Olsson 
617e6308be8SRobert Olsson 
618e6308be8SRobert Olsson 	/* Keep root node larger  */
619e6308be8SRobert Olsson 
620e6308be8SRobert Olsson 	if (!tn->parent)
621e6308be8SRobert Olsson 		halve_threshold_use = halve_threshold_root;
622e6308be8SRobert Olsson 	else
623e6308be8SRobert Olsson 		halve_threshold_use = halve_threshold;
624e6308be8SRobert Olsson 
6252f36895aSRobert Olsson 	err = 0;
62605eee48cSRobert Olsson 	max_resize = 10;
62705eee48cSRobert Olsson 	while (tn->bits > 1 &&  max_resize-- &&
62819baf839SRobert Olsson 	       100 * (tnode_child_length(tn) - tn->empty_children) <
629e6308be8SRobert Olsson 	       halve_threshold_use * tnode_child_length(tn)) {
63019baf839SRobert Olsson 
6312f80b3c8SRobert Olsson 		old_tn = tn;
6322f80b3c8SRobert Olsson 		tn = halve(t, tn);
6332f80b3c8SRobert Olsson 		if (IS_ERR(tn)) {
6342f80b3c8SRobert Olsson 			tn = old_tn;
6352f36895aSRobert Olsson #ifdef CONFIG_IP_FIB_TRIE_STATS
6362f36895aSRobert Olsson 			t->stats.resize_node_skipped++;
6372f36895aSRobert Olsson #endif
6382f36895aSRobert Olsson 			break;
6392f36895aSRobert Olsson 		}
6402f36895aSRobert Olsson 	}
6412f36895aSRobert Olsson 
64205eee48cSRobert Olsson 	if (max_resize < 0) {
64305eee48cSRobert Olsson 		if (!tn->parent)
644a07f5f50SStephen Hemminger 			pr_warning("Fix halve_threshold_root."
645a07f5f50SStephen Hemminger 				   " Now=%d size=%d bits\n",
64605eee48cSRobert Olsson 				   halve_threshold_root, tn->bits);
64705eee48cSRobert Olsson 		else
648a07f5f50SStephen Hemminger 			pr_warning("Fix halve_threshold."
649a07f5f50SStephen Hemminger 				   " Now=%d size=%d bits\n",
65005eee48cSRobert Olsson 				   halve_threshold, tn->bits);
65105eee48cSRobert Olsson 	}
65219baf839SRobert Olsson 
65319baf839SRobert Olsson 	/* Only one child remains */
65419baf839SRobert Olsson 	if (tn->empty_children == tnode_child_length(tn) - 1)
65519baf839SRobert Olsson 		for (i = 0; i < tnode_child_length(tn); i++) {
65691b9a277SOlof Johansson 			struct node *n;
65719baf839SRobert Olsson 
65891b9a277SOlof Johansson 			n = tn->child[i];
6592373ce1cSRobert Olsson 			if (!n)
66091b9a277SOlof Johansson 				continue;
66191b9a277SOlof Johansson 
66291b9a277SOlof Johansson 			/* compress one level */
66391b9a277SOlof Johansson 
66406801916SStephen Hemminger 			node_set_parent(n, NULL);
66519baf839SRobert Olsson 			tnode_free(tn);
66619baf839SRobert Olsson 			return n;
66719baf839SRobert Olsson 		}
66819baf839SRobert Olsson 
66919baf839SRobert Olsson 	return (struct node *) tn;
67019baf839SRobert Olsson }
67119baf839SRobert Olsson 
6722f80b3c8SRobert Olsson static struct tnode *inflate(struct trie *t, struct tnode *tn)
67319baf839SRobert Olsson {
67419baf839SRobert Olsson 	struct tnode *oldtnode = tn;
67519baf839SRobert Olsson 	int olen = tnode_child_length(tn);
67619baf839SRobert Olsson 	int i;
67719baf839SRobert Olsson 
6780c7770c7SStephen Hemminger 	pr_debug("In inflate\n");
67919baf839SRobert Olsson 
68019baf839SRobert Olsson 	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
68119baf839SRobert Olsson 
6822f80b3c8SRobert Olsson 	if (!tn)
6832f80b3c8SRobert Olsson 		return ERR_PTR(-ENOMEM);
6842f36895aSRobert Olsson 
6852f36895aSRobert Olsson 	/*
6862f36895aSRobert Olsson 	 * Preallocate and store tnodes before the actual work so we
6872f36895aSRobert Olsson 	 * don't get into an inconsistent state if memory allocation
6882f36895aSRobert Olsson 	 * fails. In case of failure we return the oldnode and  inflate
6892f36895aSRobert Olsson 	 * of tnode is ignored.
6902f36895aSRobert Olsson 	 */
6912f36895aSRobert Olsson 
6922f36895aSRobert Olsson 	for (i = 0; i < olen; i++) {
693a07f5f50SStephen Hemminger 		struct tnode *inode;
6942f36895aSRobert Olsson 
695a07f5f50SStephen Hemminger 		inode = (struct tnode *) tnode_get_child(oldtnode, i);
6962f36895aSRobert Olsson 		if (inode &&
6972f36895aSRobert Olsson 		    IS_TNODE(inode) &&
6982f36895aSRobert Olsson 		    inode->pos == oldtnode->pos + oldtnode->bits &&
6992f36895aSRobert Olsson 		    inode->bits > 1) {
7002f36895aSRobert Olsson 			struct tnode *left, *right;
701ab66b4a7SStephen Hemminger 			t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
7022f36895aSRobert Olsson 
7032f36895aSRobert Olsson 			left = tnode_new(inode->key&(~m), inode->pos + 1,
7042f36895aSRobert Olsson 					 inode->bits - 1);
7052f80b3c8SRobert Olsson 			if (!left)
7062f80b3c8SRobert Olsson 				goto nomem;
7072f36895aSRobert Olsson 
7082f36895aSRobert Olsson 			right = tnode_new(inode->key|m, inode->pos + 1,
7092f36895aSRobert Olsson 					  inode->bits - 1);
7102f36895aSRobert Olsson 
7112f36895aSRobert Olsson 			if (!right) {
7122f80b3c8SRobert Olsson 				tnode_free(left);
7132f80b3c8SRobert Olsson 				goto nomem;
7142f36895aSRobert Olsson 			}
7152f36895aSRobert Olsson 
7162f36895aSRobert Olsson 			put_child(t, tn, 2*i, (struct node *) left);
7172f36895aSRobert Olsson 			put_child(t, tn, 2*i+1, (struct node *) right);
7182f36895aSRobert Olsson 		}
7192f36895aSRobert Olsson 	}
7202f36895aSRobert Olsson 
72119baf839SRobert Olsson 	for (i = 0; i < olen; i++) {
722c95aaf9aSStephen Hemminger 		struct tnode *inode;
72319baf839SRobert Olsson 		struct node *node = tnode_get_child(oldtnode, i);
72491b9a277SOlof Johansson 		struct tnode *left, *right;
72591b9a277SOlof Johansson 		int size, j;
72619baf839SRobert Olsson 
72719baf839SRobert Olsson 		/* An empty child */
72819baf839SRobert Olsson 		if (node == NULL)
72919baf839SRobert Olsson 			continue;
73019baf839SRobert Olsson 
73119baf839SRobert Olsson 		/* A leaf or an internal node with skipped bits */
73219baf839SRobert Olsson 
73319baf839SRobert Olsson 		if (IS_LEAF(node) || ((struct tnode *) node)->pos >
73419baf839SRobert Olsson 		   tn->pos + tn->bits - 1) {
735a07f5f50SStephen Hemminger 			if (tkey_extract_bits(node->key,
736a07f5f50SStephen Hemminger 					      oldtnode->pos + oldtnode->bits,
73719baf839SRobert Olsson 					      1) == 0)
73819baf839SRobert Olsson 				put_child(t, tn, 2*i, node);
73919baf839SRobert Olsson 			else
74019baf839SRobert Olsson 				put_child(t, tn, 2*i+1, node);
74119baf839SRobert Olsson 			continue;
74219baf839SRobert Olsson 		}
74319baf839SRobert Olsson 
74419baf839SRobert Olsson 		/* An internal node with two children */
74519baf839SRobert Olsson 		inode = (struct tnode *) node;
74619baf839SRobert Olsson 
74719baf839SRobert Olsson 		if (inode->bits == 1) {
74819baf839SRobert Olsson 			put_child(t, tn, 2*i, inode->child[0]);
74919baf839SRobert Olsson 			put_child(t, tn, 2*i+1, inode->child[1]);
75019baf839SRobert Olsson 
75119baf839SRobert Olsson 			tnode_free(inode);
75291b9a277SOlof Johansson 			continue;
75319baf839SRobert Olsson 		}
75419baf839SRobert Olsson 
75519baf839SRobert Olsson 		/* An internal node with more than two children */
75619baf839SRobert Olsson 
75719baf839SRobert Olsson 		/* We will replace this node 'inode' with two new
75819baf839SRobert Olsson 		 * ones, 'left' and 'right', each with half of the
75919baf839SRobert Olsson 		 * original children. The two new nodes will have
76019baf839SRobert Olsson 		 * a position one bit further down the key and this
76119baf839SRobert Olsson 		 * means that the "significant" part of their keys
76219baf839SRobert Olsson 		 * (see the discussion near the top of this file)
76319baf839SRobert Olsson 		 * will differ by one bit, which will be "0" in
76419baf839SRobert Olsson 		 * left's key and "1" in right's key. Since we are
76519baf839SRobert Olsson 		 * moving the key position by one step, the bit that
76619baf839SRobert Olsson 		 * we are moving away from - the bit at position
76719baf839SRobert Olsson 		 * (inode->pos) - is the one that will differ between
76819baf839SRobert Olsson 		 * left and right. So... we synthesize that bit in the
76919baf839SRobert Olsson 		 * two  new keys.
77019baf839SRobert Olsson 		 * The mask 'm' below will be a single "one" bit at
77119baf839SRobert Olsson 		 * the position (inode->pos)
77219baf839SRobert Olsson 		 */
77319baf839SRobert Olsson 
77419baf839SRobert Olsson 		/* Use the old key, but set the new significant
77519baf839SRobert Olsson 		 *   bit to zero.
77619baf839SRobert Olsson 		 */
7772f36895aSRobert Olsson 
7782f36895aSRobert Olsson 		left = (struct tnode *) tnode_get_child(tn, 2*i);
7792f36895aSRobert Olsson 		put_child(t, tn, 2*i, NULL);
78019baf839SRobert Olsson 
78191b9a277SOlof Johansson 		BUG_ON(!left);
78219baf839SRobert Olsson 
7832f36895aSRobert Olsson 		right = (struct tnode *) tnode_get_child(tn, 2*i+1);
7842f36895aSRobert Olsson 		put_child(t, tn, 2*i+1, NULL);
78519baf839SRobert Olsson 
78691b9a277SOlof Johansson 		BUG_ON(!right);
78719baf839SRobert Olsson 
78819baf839SRobert Olsson 		size = tnode_child_length(left);
78919baf839SRobert Olsson 		for (j = 0; j < size; j++) {
79019baf839SRobert Olsson 			put_child(t, left, j, inode->child[j]);
79119baf839SRobert Olsson 			put_child(t, right, j, inode->child[j + size]);
79219baf839SRobert Olsson 		}
79319baf839SRobert Olsson 		put_child(t, tn, 2*i, resize(t, left));
79419baf839SRobert Olsson 		put_child(t, tn, 2*i+1, resize(t, right));
79519baf839SRobert Olsson 
79619baf839SRobert Olsson 		tnode_free(inode);
79719baf839SRobert Olsson 	}
79819baf839SRobert Olsson 	tnode_free(oldtnode);
79919baf839SRobert Olsson 	return tn;
8002f80b3c8SRobert Olsson nomem:
8012f80b3c8SRobert Olsson 	{
8022f80b3c8SRobert Olsson 		int size = tnode_child_length(tn);
8032f80b3c8SRobert Olsson 		int j;
8042f80b3c8SRobert Olsson 
8052f80b3c8SRobert Olsson 		for (j = 0; j < size; j++)
8062f80b3c8SRobert Olsson 			if (tn->child[j])
8072f80b3c8SRobert Olsson 				tnode_free((struct tnode *)tn->child[j]);
8082f80b3c8SRobert Olsson 
8092f80b3c8SRobert Olsson 		tnode_free(tn);
8102f80b3c8SRobert Olsson 
8112f80b3c8SRobert Olsson 		return ERR_PTR(-ENOMEM);
8122f80b3c8SRobert Olsson 	}
81319baf839SRobert Olsson }
81419baf839SRobert Olsson 
8152f80b3c8SRobert Olsson static struct tnode *halve(struct trie *t, struct tnode *tn)
81619baf839SRobert Olsson {
81719baf839SRobert Olsson 	struct tnode *oldtnode = tn;
81819baf839SRobert Olsson 	struct node *left, *right;
81919baf839SRobert Olsson 	int i;
82019baf839SRobert Olsson 	int olen = tnode_child_length(tn);
82119baf839SRobert Olsson 
8220c7770c7SStephen Hemminger 	pr_debug("In halve\n");
82319baf839SRobert Olsson 
82419baf839SRobert Olsson 	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
82519baf839SRobert Olsson 
8262f80b3c8SRobert Olsson 	if (!tn)
8272f80b3c8SRobert Olsson 		return ERR_PTR(-ENOMEM);
8282f36895aSRobert Olsson 
8292f36895aSRobert Olsson 	/*
8302f36895aSRobert Olsson 	 * Preallocate and store tnodes before the actual work so we
8312f36895aSRobert Olsson 	 * don't get into an inconsistent state if memory allocation
8322f36895aSRobert Olsson 	 * fails. In case of failure we return the oldnode and halve
8332f36895aSRobert Olsson 	 * of tnode is ignored.
8342f36895aSRobert Olsson 	 */
8352f36895aSRobert Olsson 
8362f36895aSRobert Olsson 	for (i = 0; i < olen; i += 2) {
8372f36895aSRobert Olsson 		left = tnode_get_child(oldtnode, i);
8382f36895aSRobert Olsson 		right = tnode_get_child(oldtnode, i+1);
8392f36895aSRobert Olsson 
8402f36895aSRobert Olsson 		/* Two nonempty children */
8412f36895aSRobert Olsson 		if (left && right) {
8422f80b3c8SRobert Olsson 			struct tnode *newn;
8432f36895aSRobert Olsson 
8442f80b3c8SRobert Olsson 			newn = tnode_new(left->key, tn->pos + tn->bits, 1);
8452f80b3c8SRobert Olsson 
8462f80b3c8SRobert Olsson 			if (!newn)
8472f80b3c8SRobert Olsson 				goto nomem;
8482f80b3c8SRobert Olsson 
8492f80b3c8SRobert Olsson 			put_child(t, tn, i/2, (struct node *)newn);
8502f36895aSRobert Olsson 		}
8512f36895aSRobert Olsson 
8522f36895aSRobert Olsson 	}
85319baf839SRobert Olsson 
85419baf839SRobert Olsson 	for (i = 0; i < olen; i += 2) {
85591b9a277SOlof Johansson 		struct tnode *newBinNode;
85691b9a277SOlof Johansson 
85719baf839SRobert Olsson 		left = tnode_get_child(oldtnode, i);
85819baf839SRobert Olsson 		right = tnode_get_child(oldtnode, i+1);
85919baf839SRobert Olsson 
86019baf839SRobert Olsson 		/* At least one of the children is empty */
86119baf839SRobert Olsson 		if (left == NULL) {
86219baf839SRobert Olsson 			if (right == NULL)    /* Both are empty */
86319baf839SRobert Olsson 				continue;
86419baf839SRobert Olsson 			put_child(t, tn, i/2, right);
86591b9a277SOlof Johansson 			continue;
86691b9a277SOlof Johansson 		}
86791b9a277SOlof Johansson 
86891b9a277SOlof Johansson 		if (right == NULL) {
86919baf839SRobert Olsson 			put_child(t, tn, i/2, left);
87091b9a277SOlof Johansson 			continue;
87191b9a277SOlof Johansson 		}
87219baf839SRobert Olsson 
87319baf839SRobert Olsson 		/* Two nonempty children */
87491b9a277SOlof Johansson 		newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
8752f36895aSRobert Olsson 		put_child(t, tn, i/2, NULL);
87619baf839SRobert Olsson 		put_child(t, newBinNode, 0, left);
87719baf839SRobert Olsson 		put_child(t, newBinNode, 1, right);
87819baf839SRobert Olsson 		put_child(t, tn, i/2, resize(t, newBinNode));
87919baf839SRobert Olsson 	}
88019baf839SRobert Olsson 	tnode_free(oldtnode);
88119baf839SRobert Olsson 	return tn;
8822f80b3c8SRobert Olsson nomem:
8832f80b3c8SRobert Olsson 	{
8842f80b3c8SRobert Olsson 		int size = tnode_child_length(tn);
8852f80b3c8SRobert Olsson 		int j;
8862f80b3c8SRobert Olsson 
8872f80b3c8SRobert Olsson 		for (j = 0; j < size; j++)
8882f80b3c8SRobert Olsson 			if (tn->child[j])
8892f80b3c8SRobert Olsson 				tnode_free((struct tnode *)tn->child[j]);
8902f80b3c8SRobert Olsson 
8912f80b3c8SRobert Olsson 		tnode_free(tn);
8922f80b3c8SRobert Olsson 
8932f80b3c8SRobert Olsson 		return ERR_PTR(-ENOMEM);
8942f80b3c8SRobert Olsson 	}
89519baf839SRobert Olsson }
89619baf839SRobert Olsson 
897772cb712SRobert Olsson /* readside must use rcu_read_lock currently dump routines
8982373ce1cSRobert Olsson  via get_fa_head and dump */
8992373ce1cSRobert Olsson 
900772cb712SRobert Olsson static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
90119baf839SRobert Olsson {
902772cb712SRobert Olsson 	struct hlist_head *head = &l->list;
90319baf839SRobert Olsson 	struct hlist_node *node;
90419baf839SRobert Olsson 	struct leaf_info *li;
90519baf839SRobert Olsson 
9062373ce1cSRobert Olsson 	hlist_for_each_entry_rcu(li, node, head, hlist)
90719baf839SRobert Olsson 		if (li->plen == plen)
90819baf839SRobert Olsson 			return li;
90991b9a277SOlof Johansson 
91019baf839SRobert Olsson 	return NULL;
91119baf839SRobert Olsson }
91219baf839SRobert Olsson 
91319baf839SRobert Olsson static inline struct list_head *get_fa_head(struct leaf *l, int plen)
91419baf839SRobert Olsson {
915772cb712SRobert Olsson 	struct leaf_info *li = find_leaf_info(l, plen);
91619baf839SRobert Olsson 
91791b9a277SOlof Johansson 	if (!li)
91891b9a277SOlof Johansson 		return NULL;
91919baf839SRobert Olsson 
92091b9a277SOlof Johansson 	return &li->falh;
92119baf839SRobert Olsson }
92219baf839SRobert Olsson 
92319baf839SRobert Olsson static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
92419baf839SRobert Olsson {
92519baf839SRobert Olsson 	struct leaf_info *li = NULL, *last = NULL;
92691b9a277SOlof Johansson 	struct hlist_node *node;
92719baf839SRobert Olsson 
92891b9a277SOlof Johansson 	if (hlist_empty(head)) {
9292373ce1cSRobert Olsson 		hlist_add_head_rcu(&new->hlist, head);
93091b9a277SOlof Johansson 	} else {
93191b9a277SOlof Johansson 		hlist_for_each_entry(li, node, head, hlist) {
93219baf839SRobert Olsson 			if (new->plen > li->plen)
93319baf839SRobert Olsson 				break;
93419baf839SRobert Olsson 
93519baf839SRobert Olsson 			last = li;
93619baf839SRobert Olsson 		}
93719baf839SRobert Olsson 		if (last)
9382373ce1cSRobert Olsson 			hlist_add_after_rcu(&last->hlist, &new->hlist);
93919baf839SRobert Olsson 		else
9402373ce1cSRobert Olsson 			hlist_add_before_rcu(&new->hlist, &li->hlist);
94119baf839SRobert Olsson 	}
94219baf839SRobert Olsson }
94319baf839SRobert Olsson 
9442373ce1cSRobert Olsson /* rcu_read_lock needs to be hold by caller from readside */
9452373ce1cSRobert Olsson 
94619baf839SRobert Olsson static struct leaf *
94719baf839SRobert Olsson fib_find_node(struct trie *t, u32 key)
94819baf839SRobert Olsson {
94919baf839SRobert Olsson 	int pos;
95019baf839SRobert Olsson 	struct tnode *tn;
95119baf839SRobert Olsson 	struct node *n;
95219baf839SRobert Olsson 
95319baf839SRobert Olsson 	pos = 0;
9542373ce1cSRobert Olsson 	n = rcu_dereference(t->trie);
95519baf839SRobert Olsson 
95619baf839SRobert Olsson 	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
95719baf839SRobert Olsson 		tn = (struct tnode *) n;
95819baf839SRobert Olsson 
95919baf839SRobert Olsson 		check_tnode(tn);
96019baf839SRobert Olsson 
96119baf839SRobert Olsson 		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
96219baf839SRobert Olsson 			pos = tn->pos + tn->bits;
963a07f5f50SStephen Hemminger 			n = tnode_get_child_rcu(tn,
964a07f5f50SStephen Hemminger 						tkey_extract_bits(key,
965a07f5f50SStephen Hemminger 								  tn->pos,
966a07f5f50SStephen Hemminger 								  tn->bits));
96791b9a277SOlof Johansson 		} else
96819baf839SRobert Olsson 			break;
96919baf839SRobert Olsson 	}
97019baf839SRobert Olsson 	/* Case we have found a leaf. Compare prefixes */
97119baf839SRobert Olsson 
97291b9a277SOlof Johansson 	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
97391b9a277SOlof Johansson 		return (struct leaf *)n;
97491b9a277SOlof Johansson 
97519baf839SRobert Olsson 	return NULL;
97619baf839SRobert Olsson }
97719baf839SRobert Olsson 
97819baf839SRobert Olsson static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
97919baf839SRobert Olsson {
98019baf839SRobert Olsson 	int wasfull;
98106801916SStephen Hemminger 	t_key cindex, key = tn->key;
98206801916SStephen Hemminger 	struct tnode *tp;
98319baf839SRobert Olsson 
98406801916SStephen Hemminger 	while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) {
98519baf839SRobert Olsson 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
98619baf839SRobert Olsson 		wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
98719baf839SRobert Olsson 		tn = (struct tnode *) resize(t, (struct tnode *)tn);
988a07f5f50SStephen Hemminger 
989a07f5f50SStephen Hemminger 		tnode_put_child_reorg((struct tnode *)tp, cindex,
990a07f5f50SStephen Hemminger 				      (struct node *)tn, wasfull);
99119baf839SRobert Olsson 
99206801916SStephen Hemminger 		tp = node_parent((struct node *) tn);
99306801916SStephen Hemminger 		if (!tp)
99419baf839SRobert Olsson 			break;
99506801916SStephen Hemminger 		tn = tp;
99619baf839SRobert Olsson 	}
99706801916SStephen Hemminger 
99819baf839SRobert Olsson 	/* Handle last (top) tnode */
99919baf839SRobert Olsson 	if (IS_TNODE(tn))
100019baf839SRobert Olsson 		tn = (struct tnode *)resize(t, (struct tnode *)tn);
100119baf839SRobert Olsson 
100219baf839SRobert Olsson 	return (struct node *)tn;
100319baf839SRobert Olsson }
100419baf839SRobert Olsson 
10052373ce1cSRobert Olsson /* only used from updater-side */
10062373ce1cSRobert Olsson 
1007fea86ad8SStephen Hemminger static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
100819baf839SRobert Olsson {
100919baf839SRobert Olsson 	int pos, newpos;
101019baf839SRobert Olsson 	struct tnode *tp = NULL, *tn = NULL;
101119baf839SRobert Olsson 	struct node *n;
101219baf839SRobert Olsson 	struct leaf *l;
101319baf839SRobert Olsson 	int missbit;
101419baf839SRobert Olsson 	struct list_head *fa_head = NULL;
101519baf839SRobert Olsson 	struct leaf_info *li;
101619baf839SRobert Olsson 	t_key cindex;
101719baf839SRobert Olsson 
101819baf839SRobert Olsson 	pos = 0;
101919baf839SRobert Olsson 	n = t->trie;
102019baf839SRobert Olsson 
102119baf839SRobert Olsson 	/* If we point to NULL, stop. Either the tree is empty and we should
102219baf839SRobert Olsson 	 * just put a new leaf in if, or we have reached an empty child slot,
102319baf839SRobert Olsson 	 * and we should just put our new leaf in that.
102419baf839SRobert Olsson 	 * If we point to a T_TNODE, check if it matches our key. Note that
102519baf839SRobert Olsson 	 * a T_TNODE might be skipping any number of bits - its 'pos' need
102619baf839SRobert Olsson 	 * not be the parent's 'pos'+'bits'!
102719baf839SRobert Olsson 	 *
102819baf839SRobert Olsson 	 * If it does match the current key, get pos/bits from it, extract
102919baf839SRobert Olsson 	 * the index from our key, push the T_TNODE and walk the tree.
103019baf839SRobert Olsson 	 *
103119baf839SRobert Olsson 	 * If it doesn't, we have to replace it with a new T_TNODE.
103219baf839SRobert Olsson 	 *
103319baf839SRobert Olsson 	 * If we point to a T_LEAF, it might or might not have the same key
103419baf839SRobert Olsson 	 * as we do. If it does, just change the value, update the T_LEAF's
103519baf839SRobert Olsson 	 * value, and return it.
103619baf839SRobert Olsson 	 * If it doesn't, we need to replace it with a T_TNODE.
103719baf839SRobert Olsson 	 */
103819baf839SRobert Olsson 
103919baf839SRobert Olsson 	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
104019baf839SRobert Olsson 		tn = (struct tnode *) n;
104119baf839SRobert Olsson 
104219baf839SRobert Olsson 		check_tnode(tn);
104319baf839SRobert Olsson 
104419baf839SRobert Olsson 		if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
104519baf839SRobert Olsson 			tp = tn;
104619baf839SRobert Olsson 			pos = tn->pos + tn->bits;
1047a07f5f50SStephen Hemminger 			n = tnode_get_child(tn,
1048a07f5f50SStephen Hemminger 					    tkey_extract_bits(key,
1049a07f5f50SStephen Hemminger 							      tn->pos,
1050a07f5f50SStephen Hemminger 							      tn->bits));
105119baf839SRobert Olsson 
105206801916SStephen Hemminger 			BUG_ON(n && node_parent(n) != tn);
105391b9a277SOlof Johansson 		} else
105419baf839SRobert Olsson 			break;
105519baf839SRobert Olsson 	}
105619baf839SRobert Olsson 
105719baf839SRobert Olsson 	/*
105819baf839SRobert Olsson 	 * n  ----> NULL, LEAF or TNODE
105919baf839SRobert Olsson 	 *
106019baf839SRobert Olsson 	 * tp is n's (parent) ----> NULL or TNODE
106119baf839SRobert Olsson 	 */
106219baf839SRobert Olsson 
106391b9a277SOlof Johansson 	BUG_ON(tp && IS_LEAF(tp));
106419baf839SRobert Olsson 
106519baf839SRobert Olsson 	/* Case 1: n is a leaf. Compare prefixes */
106619baf839SRobert Olsson 
106719baf839SRobert Olsson 	if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1068c95aaf9aSStephen Hemminger 		l = (struct leaf *) n;
106919baf839SRobert Olsson 		li = leaf_info_new(plen);
107019baf839SRobert Olsson 
1071fea86ad8SStephen Hemminger 		if (!li)
1072fea86ad8SStephen Hemminger 			return NULL;
107319baf839SRobert Olsson 
107419baf839SRobert Olsson 		fa_head = &li->falh;
107519baf839SRobert Olsson 		insert_leaf_info(&l->list, li);
107619baf839SRobert Olsson 		goto done;
107719baf839SRobert Olsson 	}
107819baf839SRobert Olsson 	l = leaf_new();
107919baf839SRobert Olsson 
1080fea86ad8SStephen Hemminger 	if (!l)
1081fea86ad8SStephen Hemminger 		return NULL;
108219baf839SRobert Olsson 
108319baf839SRobert Olsson 	l->key = key;
108419baf839SRobert Olsson 	li = leaf_info_new(plen);
108519baf839SRobert Olsson 
1086f835e471SRobert Olsson 	if (!li) {
1087f835e471SRobert Olsson 		tnode_free((struct tnode *) l);
1088fea86ad8SStephen Hemminger 		return NULL;
1089f835e471SRobert Olsson 	}
109019baf839SRobert Olsson 
109119baf839SRobert Olsson 	fa_head = &li->falh;
109219baf839SRobert Olsson 	insert_leaf_info(&l->list, li);
109319baf839SRobert Olsson 
109419baf839SRobert Olsson 	if (t->trie && n == NULL) {
109591b9a277SOlof Johansson 		/* Case 2: n is NULL, and will just insert a new leaf */
109619baf839SRobert Olsson 
109706801916SStephen Hemminger 		node_set_parent((struct node *)l, tp);
109819baf839SRobert Olsson 
109919baf839SRobert Olsson 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
110019baf839SRobert Olsson 		put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
110191b9a277SOlof Johansson 	} else {
110219baf839SRobert Olsson 		/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
110319baf839SRobert Olsson 		/*
110419baf839SRobert Olsson 		 *  Add a new tnode here
110519baf839SRobert Olsson 		 *  first tnode need some special handling
110619baf839SRobert Olsson 		 */
110719baf839SRobert Olsson 
110819baf839SRobert Olsson 		if (tp)
110919baf839SRobert Olsson 			pos = tp->pos+tp->bits;
111019baf839SRobert Olsson 		else
111119baf839SRobert Olsson 			pos = 0;
111291b9a277SOlof Johansson 
111319baf839SRobert Olsson 		if (n) {
111419baf839SRobert Olsson 			newpos = tkey_mismatch(key, pos, n->key);
111519baf839SRobert Olsson 			tn = tnode_new(n->key, newpos, 1);
111691b9a277SOlof Johansson 		} else {
111719baf839SRobert Olsson 			newpos = 0;
111819baf839SRobert Olsson 			tn = tnode_new(key, newpos, 1); /* First tnode */
111919baf839SRobert Olsson 		}
1120f835e471SRobert Olsson 
1121f835e471SRobert Olsson 		if (!tn) {
1122f835e471SRobert Olsson 			free_leaf_info(li);
1123f835e471SRobert Olsson 			tnode_free((struct tnode *) l);
1124fea86ad8SStephen Hemminger 			return NULL;
1125f835e471SRobert Olsson 		}
112619baf839SRobert Olsson 
112706801916SStephen Hemminger 		node_set_parent((struct node *)tn, tp);
112819baf839SRobert Olsson 
112919baf839SRobert Olsson 		missbit = tkey_extract_bits(key, newpos, 1);
113019baf839SRobert Olsson 		put_child(t, tn, missbit, (struct node *)l);
113119baf839SRobert Olsson 		put_child(t, tn, 1-missbit, n);
113219baf839SRobert Olsson 
113319baf839SRobert Olsson 		if (tp) {
113419baf839SRobert Olsson 			cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1135a07f5f50SStephen Hemminger 			put_child(t, (struct tnode *)tp, cindex,
1136a07f5f50SStephen Hemminger 				  (struct node *)tn);
113791b9a277SOlof Johansson 		} else {
1138a07f5f50SStephen Hemminger 			rcu_assign_pointer(t->trie, (struct node *)tn);
113919baf839SRobert Olsson 			tp = tn;
114019baf839SRobert Olsson 		}
114119baf839SRobert Olsson 	}
114291b9a277SOlof Johansson 
114391b9a277SOlof Johansson 	if (tp && tp->pos + tp->bits > 32)
1144a07f5f50SStephen Hemminger 		pr_warning("fib_trie"
1145a07f5f50SStephen Hemminger 			   " tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
114619baf839SRobert Olsson 			   tp, tp->pos, tp->bits, key, plen);
114791b9a277SOlof Johansson 
114819baf839SRobert Olsson 	/* Rebalance the trie */
11492373ce1cSRobert Olsson 
11502373ce1cSRobert Olsson 	rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
1151f835e471SRobert Olsson done:
115219baf839SRobert Olsson 	return fa_head;
115319baf839SRobert Olsson }
115419baf839SRobert Olsson 
1155d562f1f8SRobert Olsson /*
1156d562f1f8SRobert Olsson  * Caller must hold RTNL.
1157d562f1f8SRobert Olsson  */
11584e902c57SThomas Graf static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg)
115919baf839SRobert Olsson {
116019baf839SRobert Olsson 	struct trie *t = (struct trie *) tb->tb_data;
116119baf839SRobert Olsson 	struct fib_alias *fa, *new_fa;
116219baf839SRobert Olsson 	struct list_head *fa_head = NULL;
116319baf839SRobert Olsson 	struct fib_info *fi;
11644e902c57SThomas Graf 	int plen = cfg->fc_dst_len;
11654e902c57SThomas Graf 	u8 tos = cfg->fc_tos;
116619baf839SRobert Olsson 	u32 key, mask;
116719baf839SRobert Olsson 	int err;
116819baf839SRobert Olsson 	struct leaf *l;
116919baf839SRobert Olsson 
117019baf839SRobert Olsson 	if (plen > 32)
117119baf839SRobert Olsson 		return -EINVAL;
117219baf839SRobert Olsson 
11734e902c57SThomas Graf 	key = ntohl(cfg->fc_dst);
117419baf839SRobert Olsson 
11752dfe55b4SPatrick McHardy 	pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
117619baf839SRobert Olsson 
117719baf839SRobert Olsson 	mask = ntohl(inet_make_mask(plen));
117819baf839SRobert Olsson 
117919baf839SRobert Olsson 	if (key & ~mask)
118019baf839SRobert Olsson 		return -EINVAL;
118119baf839SRobert Olsson 
118219baf839SRobert Olsson 	key = key & mask;
118319baf839SRobert Olsson 
11844e902c57SThomas Graf 	fi = fib_create_info(cfg);
11854e902c57SThomas Graf 	if (IS_ERR(fi)) {
11864e902c57SThomas Graf 		err = PTR_ERR(fi);
118719baf839SRobert Olsson 		goto err;
11884e902c57SThomas Graf 	}
118919baf839SRobert Olsson 
119019baf839SRobert Olsson 	l = fib_find_node(t, key);
119119baf839SRobert Olsson 	fa = NULL;
119219baf839SRobert Olsson 
119319baf839SRobert Olsson 	if (l) {
119419baf839SRobert Olsson 		fa_head = get_fa_head(l, plen);
119519baf839SRobert Olsson 		fa = fib_find_alias(fa_head, tos, fi->fib_priority);
119619baf839SRobert Olsson 	}
119719baf839SRobert Olsson 
119819baf839SRobert Olsson 	/* Now fa, if non-NULL, points to the first fib alias
119919baf839SRobert Olsson 	 * with the same keys [prefix,tos,priority], if such key already
120019baf839SRobert Olsson 	 * exists or to the node before which we will insert new one.
120119baf839SRobert Olsson 	 *
120219baf839SRobert Olsson 	 * If fa is NULL, we will need to allocate a new one and
120319baf839SRobert Olsson 	 * insert to the head of f.
120419baf839SRobert Olsson 	 *
120519baf839SRobert Olsson 	 * If f is NULL, no fib node matched the destination key
120619baf839SRobert Olsson 	 * and we need to allocate a new one of those as well.
120719baf839SRobert Olsson 	 */
120819baf839SRobert Olsson 
120991b9a277SOlof Johansson 	if (fa && fa->fa_info->fib_priority == fi->fib_priority) {
121019baf839SRobert Olsson 		struct fib_alias *fa_orig;
121119baf839SRobert Olsson 
121219baf839SRobert Olsson 		err = -EEXIST;
12134e902c57SThomas Graf 		if (cfg->fc_nlflags & NLM_F_EXCL)
121419baf839SRobert Olsson 			goto out;
121519baf839SRobert Olsson 
12164e902c57SThomas Graf 		if (cfg->fc_nlflags & NLM_F_REPLACE) {
121719baf839SRobert Olsson 			struct fib_info *fi_drop;
121819baf839SRobert Olsson 			u8 state;
121919baf839SRobert Olsson 
12206725033fSJoonwoo Park 			if (fi->fib_treeref > 1)
12216725033fSJoonwoo Park 				goto out;
12226725033fSJoonwoo Park 
12232373ce1cSRobert Olsson 			err = -ENOBUFS;
1224e94b1766SChristoph Lameter 			new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
12252373ce1cSRobert Olsson 			if (new_fa == NULL)
12262373ce1cSRobert Olsson 				goto out;
122719baf839SRobert Olsson 
122819baf839SRobert Olsson 			fi_drop = fa->fa_info;
12292373ce1cSRobert Olsson 			new_fa->fa_tos = fa->fa_tos;
12302373ce1cSRobert Olsson 			new_fa->fa_info = fi;
12314e902c57SThomas Graf 			new_fa->fa_type = cfg->fc_type;
12324e902c57SThomas Graf 			new_fa->fa_scope = cfg->fc_scope;
123319baf839SRobert Olsson 			state = fa->fa_state;
12342373ce1cSRobert Olsson 			new_fa->fa_state &= ~FA_S_ACCESSED;
123519baf839SRobert Olsson 
12362373ce1cSRobert Olsson 			list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
12372373ce1cSRobert Olsson 			alias_free_mem_rcu(fa);
123819baf839SRobert Olsson 
123919baf839SRobert Olsson 			fib_release_info(fi_drop);
124019baf839SRobert Olsson 			if (state & FA_S_ACCESSED)
124119baf839SRobert Olsson 				rt_cache_flush(-1);
1242b8f55831SMilan Kocian 			rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1243b8f55831SMilan Kocian 				tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
124419baf839SRobert Olsson 
124519baf839SRobert Olsson 			goto succeeded;
124619baf839SRobert Olsson 		}
124719baf839SRobert Olsson 		/* Error if we find a perfect match which
124819baf839SRobert Olsson 		 * uses the same scope, type, and nexthop
124919baf839SRobert Olsson 		 * information.
125019baf839SRobert Olsson 		 */
125119baf839SRobert Olsson 		fa_orig = fa;
125219baf839SRobert Olsson 		list_for_each_entry(fa, fa_orig->fa_list.prev, fa_list) {
125319baf839SRobert Olsson 			if (fa->fa_tos != tos)
125419baf839SRobert Olsson 				break;
125519baf839SRobert Olsson 			if (fa->fa_info->fib_priority != fi->fib_priority)
125619baf839SRobert Olsson 				break;
12574e902c57SThomas Graf 			if (fa->fa_type == cfg->fc_type &&
12584e902c57SThomas Graf 			    fa->fa_scope == cfg->fc_scope &&
1259a07f5f50SStephen Hemminger 			    fa->fa_info == fi)
126019baf839SRobert Olsson 				goto out;
126119baf839SRobert Olsson 		}
1262a07f5f50SStephen Hemminger 
12634e902c57SThomas Graf 		if (!(cfg->fc_nlflags & NLM_F_APPEND))
126419baf839SRobert Olsson 			fa = fa_orig;
126519baf839SRobert Olsson 	}
126619baf839SRobert Olsson 	err = -ENOENT;
12674e902c57SThomas Graf 	if (!(cfg->fc_nlflags & NLM_F_CREATE))
126819baf839SRobert Olsson 		goto out;
126919baf839SRobert Olsson 
127019baf839SRobert Olsson 	err = -ENOBUFS;
1271e94b1766SChristoph Lameter 	new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
127219baf839SRobert Olsson 	if (new_fa == NULL)
127319baf839SRobert Olsson 		goto out;
127419baf839SRobert Olsson 
127519baf839SRobert Olsson 	new_fa->fa_info = fi;
127619baf839SRobert Olsson 	new_fa->fa_tos = tos;
12774e902c57SThomas Graf 	new_fa->fa_type = cfg->fc_type;
12784e902c57SThomas Graf 	new_fa->fa_scope = cfg->fc_scope;
127919baf839SRobert Olsson 	new_fa->fa_state = 0;
128019baf839SRobert Olsson 	/*
128119baf839SRobert Olsson 	 * Insert new entry to the list.
128219baf839SRobert Olsson 	 */
128319baf839SRobert Olsson 
1284f835e471SRobert Olsson 	if (!fa_head) {
1285fea86ad8SStephen Hemminger 		fa_head = fib_insert_node(t, key, plen);
1286fea86ad8SStephen Hemminger 		if (unlikely(!fa_head)) {
1287fea86ad8SStephen Hemminger 			err = -ENOMEM;
1288f835e471SRobert Olsson 			goto out_free_new_fa;
1289f835e471SRobert Olsson 		}
1290fea86ad8SStephen Hemminger 	}
129119baf839SRobert Olsson 
12922373ce1cSRobert Olsson 	list_add_tail_rcu(&new_fa->fa_list,
12932373ce1cSRobert Olsson 			  (fa ? &fa->fa_list : fa_head));
129419baf839SRobert Olsson 
129519baf839SRobert Olsson 	rt_cache_flush(-1);
12964e902c57SThomas Graf 	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
1297b8f55831SMilan Kocian 		  &cfg->fc_nlinfo, 0);
129819baf839SRobert Olsson succeeded:
129919baf839SRobert Olsson 	return 0;
1300f835e471SRobert Olsson 
1301f835e471SRobert Olsson out_free_new_fa:
1302f835e471SRobert Olsson 	kmem_cache_free(fn_alias_kmem, new_fa);
130319baf839SRobert Olsson out:
130419baf839SRobert Olsson 	fib_release_info(fi);
130591b9a277SOlof Johansson err:
130619baf839SRobert Olsson 	return err;
130719baf839SRobert Olsson }
130819baf839SRobert Olsson 
13092373ce1cSRobert Olsson 
1310772cb712SRobert Olsson /* should be called with rcu_read_lock */
1311a07f5f50SStephen Hemminger static int check_leaf(struct trie *t, struct leaf *l,
1312a07f5f50SStephen Hemminger 		      t_key key,  const struct flowi *flp,
131306c74270SPatrick McHardy 		      struct fib_result *res)
131419baf839SRobert Olsson {
131519baf839SRobert Olsson 	struct leaf_info *li;
131619baf839SRobert Olsson 	struct hlist_head *hhead = &l->list;
131719baf839SRobert Olsson 	struct hlist_node *node;
131819baf839SRobert Olsson 
13192373ce1cSRobert Olsson 	hlist_for_each_entry_rcu(li, node, hhead, hlist) {
1320a07f5f50SStephen Hemminger 		int err;
1321a07f5f50SStephen Hemminger 		int plen = li->plen;
1322a07f5f50SStephen Hemminger 		__be32 mask = inet_make_mask(plen);
1323a07f5f50SStephen Hemminger 
1324888454c5SAl Viro 		if (l->key != (key & ntohl(mask)))
132519baf839SRobert Olsson 			continue;
132619baf839SRobert Olsson 
1327a07f5f50SStephen Hemminger 		err = fib_semantic_match(&li->falh, flp, res,
1328a07f5f50SStephen Hemminger 					 htonl(l->key), mask, plen);
1329a07f5f50SStephen Hemminger 
133019baf839SRobert Olsson #ifdef CONFIG_IP_FIB_TRIE_STATS
1331a07f5f50SStephen Hemminger 		if (err <= 0)
133219baf839SRobert Olsson 			t->stats.semantic_match_passed++;
1333a07f5f50SStephen Hemminger 		else
133419baf839SRobert Olsson 			t->stats.semantic_match_miss++;
133519baf839SRobert Olsson #endif
1336a07f5f50SStephen Hemminger 		if (err <= 0)
1337a07f5f50SStephen Hemminger 			return plen;
133819baf839SRobert Olsson 	}
133919baf839SRobert Olsson 
1340a07f5f50SStephen Hemminger 	return -1;
1341a07f5f50SStephen Hemminger }
1342a07f5f50SStephen Hemminger 
1343a07f5f50SStephen Hemminger static int fn_trie_lookup(struct fib_table *tb, const struct flowi *flp,
1344a07f5f50SStephen Hemminger 			  struct fib_result *res)
134519baf839SRobert Olsson {
134619baf839SRobert Olsson 	struct trie *t = (struct trie *) tb->tb_data;
134719baf839SRobert Olsson 	int plen, ret = 0;
134819baf839SRobert Olsson 	struct node *n;
134919baf839SRobert Olsson 	struct tnode *pn;
135019baf839SRobert Olsson 	int pos, bits;
135119baf839SRobert Olsson 	t_key key = ntohl(flp->fl4_dst);
135219baf839SRobert Olsson 	int chopped_off;
135319baf839SRobert Olsson 	t_key cindex = 0;
135419baf839SRobert Olsson 	int current_prefix_length = KEYLENGTH;
135591b9a277SOlof Johansson 	struct tnode *cn;
135691b9a277SOlof Johansson 	t_key node_prefix, key_prefix, pref_mismatch;
135791b9a277SOlof Johansson 	int mp;
135891b9a277SOlof Johansson 
13592373ce1cSRobert Olsson 	rcu_read_lock();
136019baf839SRobert Olsson 
13612373ce1cSRobert Olsson 	n = rcu_dereference(t->trie);
136219baf839SRobert Olsson 	if (!n)
136319baf839SRobert Olsson 		goto failed;
136419baf839SRobert Olsson 
136519baf839SRobert Olsson #ifdef CONFIG_IP_FIB_TRIE_STATS
136619baf839SRobert Olsson 	t->stats.gets++;
136719baf839SRobert Olsson #endif
136819baf839SRobert Olsson 
136919baf839SRobert Olsson 	/* Just a leaf? */
137019baf839SRobert Olsson 	if (IS_LEAF(n)) {
1371a07f5f50SStephen Hemminger 		plen = check_leaf(t, (struct leaf *)n, key, flp, res);
1372a07f5f50SStephen Hemminger 		if (plen < 0)
137319baf839SRobert Olsson 			goto failed;
1374a07f5f50SStephen Hemminger 		ret = 0;
1375a07f5f50SStephen Hemminger 		goto found;
137619baf839SRobert Olsson 	}
1377a07f5f50SStephen Hemminger 
137819baf839SRobert Olsson 	pn = (struct tnode *) n;
137919baf839SRobert Olsson 	chopped_off = 0;
138019baf839SRobert Olsson 
138119baf839SRobert Olsson 	while (pn) {
138219baf839SRobert Olsson 		pos = pn->pos;
138319baf839SRobert Olsson 		bits = pn->bits;
138419baf839SRobert Olsson 
138519baf839SRobert Olsson 		if (!chopped_off)
1386ab66b4a7SStephen Hemminger 			cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
1387ab66b4a7SStephen Hemminger 						   pos, bits);
138819baf839SRobert Olsson 
138919baf839SRobert Olsson 		n = tnode_get_child(pn, cindex);
139019baf839SRobert Olsson 
139119baf839SRobert Olsson 		if (n == NULL) {
139219baf839SRobert Olsson #ifdef CONFIG_IP_FIB_TRIE_STATS
139319baf839SRobert Olsson 			t->stats.null_node_hit++;
139419baf839SRobert Olsson #endif
139519baf839SRobert Olsson 			goto backtrace;
139619baf839SRobert Olsson 		}
139719baf839SRobert Olsson 
139891b9a277SOlof Johansson 		if (IS_LEAF(n)) {
1399a07f5f50SStephen Hemminger 			plen = check_leaf(t, (struct leaf *)n, key, flp, res);
1400a07f5f50SStephen Hemminger 			if (plen < 0)
140191b9a277SOlof Johansson 				goto backtrace;
1402a07f5f50SStephen Hemminger 
1403a07f5f50SStephen Hemminger 			ret = 0;
1404a07f5f50SStephen Hemminger 			goto found;
140591b9a277SOlof Johansson 		}
140691b9a277SOlof Johansson 
140791b9a277SOlof Johansson 		cn = (struct tnode *)n;
140819baf839SRobert Olsson 
140919baf839SRobert Olsson 		/*
141019baf839SRobert Olsson 		 * It's a tnode, and we can do some extra checks here if we
141119baf839SRobert Olsson 		 * like, to avoid descending into a dead-end branch.
141219baf839SRobert Olsson 		 * This tnode is in the parent's child array at index
141319baf839SRobert Olsson 		 * key[p_pos..p_pos+p_bits] but potentially with some bits
141419baf839SRobert Olsson 		 * chopped off, so in reality the index may be just a
141519baf839SRobert Olsson 		 * subprefix, padded with zero at the end.
141619baf839SRobert Olsson 		 * We can also take a look at any skipped bits in this
141719baf839SRobert Olsson 		 * tnode - everything up to p_pos is supposed to be ok,
141819baf839SRobert Olsson 		 * and the non-chopped bits of the index (se previous
141919baf839SRobert Olsson 		 * paragraph) are also guaranteed ok, but the rest is
142019baf839SRobert Olsson 		 * considered unknown.
142119baf839SRobert Olsson 		 *
142219baf839SRobert Olsson 		 * The skipped bits are key[pos+bits..cn->pos].
142319baf839SRobert Olsson 		 */
142419baf839SRobert Olsson 
142519baf839SRobert Olsson 		/* If current_prefix_length < pos+bits, we are already doing
142619baf839SRobert Olsson 		 * actual prefix  matching, which means everything from
142719baf839SRobert Olsson 		 * pos+(bits-chopped_off) onward must be zero along some
142819baf839SRobert Olsson 		 * branch of this subtree - otherwise there is *no* valid
142919baf839SRobert Olsson 		 * prefix present. Here we can only check the skipped
143019baf839SRobert Olsson 		 * bits. Remember, since we have already indexed into the
143119baf839SRobert Olsson 		 * parent's child array, we know that the bits we chopped of
143219baf839SRobert Olsson 		 * *are* zero.
143319baf839SRobert Olsson 		 */
143419baf839SRobert Olsson 
1435a07f5f50SStephen Hemminger 		/* NOTA BENE: Checking only skipped bits
1436a07f5f50SStephen Hemminger 		   for the new node here */
143719baf839SRobert Olsson 
143819baf839SRobert Olsson 		if (current_prefix_length < pos+bits) {
143919baf839SRobert Olsson 			if (tkey_extract_bits(cn->key, current_prefix_length,
1440a07f5f50SStephen Hemminger 						cn->pos - current_prefix_length)
1441a07f5f50SStephen Hemminger 			    || !(cn->child[0]))
144219baf839SRobert Olsson 				goto backtrace;
144319baf839SRobert Olsson 		}
144419baf839SRobert Olsson 
144519baf839SRobert Olsson 		/*
144619baf839SRobert Olsson 		 * If chopped_off=0, the index is fully validated and we
144719baf839SRobert Olsson 		 * only need to look at the skipped bits for this, the new,
144819baf839SRobert Olsson 		 * tnode. What we actually want to do is to find out if
144919baf839SRobert Olsson 		 * these skipped bits match our key perfectly, or if we will
145019baf839SRobert Olsson 		 * have to count on finding a matching prefix further down,
145119baf839SRobert Olsson 		 * because if we do, we would like to have some way of
145219baf839SRobert Olsson 		 * verifying the existence of such a prefix at this point.
145319baf839SRobert Olsson 		 */
145419baf839SRobert Olsson 
145519baf839SRobert Olsson 		/* The only thing we can do at this point is to verify that
145619baf839SRobert Olsson 		 * any such matching prefix can indeed be a prefix to our
145719baf839SRobert Olsson 		 * key, and if the bits in the node we are inspecting that
145819baf839SRobert Olsson 		 * do not match our key are not ZERO, this cannot be true.
145919baf839SRobert Olsson 		 * Thus, find out where there is a mismatch (before cn->pos)
146019baf839SRobert Olsson 		 * and verify that all the mismatching bits are zero in the
146119baf839SRobert Olsson 		 * new tnode's key.
146219baf839SRobert Olsson 		 */
146319baf839SRobert Olsson 
1464a07f5f50SStephen Hemminger 		/*
1465a07f5f50SStephen Hemminger 		 * Note: We aren't very concerned about the piece of
1466a07f5f50SStephen Hemminger 		 * the key that precede pn->pos+pn->bits, since these
1467a07f5f50SStephen Hemminger 		 * have already been checked. The bits after cn->pos
1468a07f5f50SStephen Hemminger 		 * aren't checked since these are by definition
1469a07f5f50SStephen Hemminger 		 * "unknown" at this point. Thus, what we want to see
1470a07f5f50SStephen Hemminger 		 * is if we are about to enter the "prefix matching"
1471a07f5f50SStephen Hemminger 		 * state, and in that case verify that the skipped
1472a07f5f50SStephen Hemminger 		 * bits that will prevail throughout this subtree are
1473a07f5f50SStephen Hemminger 		 * zero, as they have to be if we are to find a
1474a07f5f50SStephen Hemminger 		 * matching prefix.
147519baf839SRobert Olsson 		 */
147619baf839SRobert Olsson 
1477ab66b4a7SStephen Hemminger 		node_prefix = mask_pfx(cn->key, cn->pos);
1478ab66b4a7SStephen Hemminger 		key_prefix = mask_pfx(key, cn->pos);
147919baf839SRobert Olsson 		pref_mismatch = key_prefix^node_prefix;
148019baf839SRobert Olsson 		mp = 0;
148119baf839SRobert Olsson 
1482a07f5f50SStephen Hemminger 		/*
1483a07f5f50SStephen Hemminger 		 * In short: If skipped bits in this node do not match
1484a07f5f50SStephen Hemminger 		 * the search key, enter the "prefix matching"
1485a07f5f50SStephen Hemminger 		 * state.directly.
148619baf839SRobert Olsson 		 */
148719baf839SRobert Olsson 		if (pref_mismatch) {
148819baf839SRobert Olsson 			while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
148919baf839SRobert Olsson 				mp++;
149019baf839SRobert Olsson 				pref_mismatch = pref_mismatch << 1;
149119baf839SRobert Olsson 			}
149219baf839SRobert Olsson 			key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
149319baf839SRobert Olsson 
149419baf839SRobert Olsson 			if (key_prefix != 0)
149519baf839SRobert Olsson 				goto backtrace;
149619baf839SRobert Olsson 
149719baf839SRobert Olsson 			if (current_prefix_length >= cn->pos)
149819baf839SRobert Olsson 				current_prefix_length = mp;
149919baf839SRobert Olsson 		}
1500a07f5f50SStephen Hemminger 
150119baf839SRobert Olsson 		pn = (struct tnode *)n; /* Descend */
150219baf839SRobert Olsson 		chopped_off = 0;
150319baf839SRobert Olsson 		continue;
150491b9a277SOlof Johansson 
150519baf839SRobert Olsson backtrace:
150619baf839SRobert Olsson 		chopped_off++;
150719baf839SRobert Olsson 
150819baf839SRobert Olsson 		/* As zero don't change the child key (cindex) */
1509a07f5f50SStephen Hemminger 		while ((chopped_off <= pn->bits)
1510a07f5f50SStephen Hemminger 		       && !(cindex & (1<<(chopped_off-1))))
151119baf839SRobert Olsson 			chopped_off++;
151219baf839SRobert Olsson 
151319baf839SRobert Olsson 		/* Decrease current_... with bits chopped off */
151419baf839SRobert Olsson 		if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1515a07f5f50SStephen Hemminger 			current_prefix_length = pn->pos + pn->bits
1516a07f5f50SStephen Hemminger 				- chopped_off;
151719baf839SRobert Olsson 
151819baf839SRobert Olsson 		/*
151919baf839SRobert Olsson 		 * Either we do the actual chop off according or if we have
152019baf839SRobert Olsson 		 * chopped off all bits in this tnode walk up to our parent.
152119baf839SRobert Olsson 		 */
152219baf839SRobert Olsson 
152391b9a277SOlof Johansson 		if (chopped_off <= pn->bits) {
152419baf839SRobert Olsson 			cindex &= ~(1 << (chopped_off-1));
152591b9a277SOlof Johansson 		} else {
152606801916SStephen Hemminger 			struct tnode *parent = node_parent((struct node *) pn);
152706801916SStephen Hemminger 			if (!parent)
152819baf839SRobert Olsson 				goto failed;
152919baf839SRobert Olsson 
153019baf839SRobert Olsson 			/* Get Child's index */
153106801916SStephen Hemminger 			cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
153206801916SStephen Hemminger 			pn = parent;
153319baf839SRobert Olsson 			chopped_off = 0;
153419baf839SRobert Olsson 
153519baf839SRobert Olsson #ifdef CONFIG_IP_FIB_TRIE_STATS
153619baf839SRobert Olsson 			t->stats.backtrack++;
153719baf839SRobert Olsson #endif
153819baf839SRobert Olsson 			goto backtrace;
153919baf839SRobert Olsson 		}
154019baf839SRobert Olsson 	}
154119baf839SRobert Olsson failed:
154219baf839SRobert Olsson 	ret = 1;
154319baf839SRobert Olsson found:
15442373ce1cSRobert Olsson 	rcu_read_unlock();
154519baf839SRobert Olsson 	return ret;
154619baf839SRobert Olsson }
154719baf839SRobert Olsson 
15482373ce1cSRobert Olsson /* only called from updater side */
154919baf839SRobert Olsson static int trie_leaf_remove(struct trie *t, t_key key)
155019baf839SRobert Olsson {
155119baf839SRobert Olsson 	t_key cindex;
155219baf839SRobert Olsson 	struct tnode *tp = NULL;
155319baf839SRobert Olsson 	struct node *n = t->trie;
155419baf839SRobert Olsson 	struct leaf *l;
155519baf839SRobert Olsson 
15560c7770c7SStephen Hemminger 	pr_debug("entering trie_leaf_remove(%p)\n", n);
155719baf839SRobert Olsson 
155819baf839SRobert Olsson 	/* Note that in the case skipped bits, those bits are *not* checked!
155919baf839SRobert Olsson 	 * When we finish this, we will have NULL or a T_LEAF, and the
156019baf839SRobert Olsson 	 * T_LEAF may or may not match our key.
156119baf839SRobert Olsson 	 */
156219baf839SRobert Olsson 
156319baf839SRobert Olsson 	while (n != NULL && IS_TNODE(n)) {
156419baf839SRobert Olsson 		struct tnode *tn = (struct tnode *) n;
156519baf839SRobert Olsson 		check_tnode(tn);
1566a07f5f50SStephen Hemminger 		n = tnode_get_child(tn, tkey_extract_bits(key,
1567a07f5f50SStephen Hemminger 							  tn->pos, tn->bits));
156819baf839SRobert Olsson 
156906801916SStephen Hemminger 		BUG_ON(n && node_parent(n) != tn);
157019baf839SRobert Olsson 	}
157119baf839SRobert Olsson 	l = (struct leaf *) n;
157219baf839SRobert Olsson 
157319baf839SRobert Olsson 	if (!n || !tkey_equals(l->key, key))
157419baf839SRobert Olsson 		return 0;
157519baf839SRobert Olsson 
157619baf839SRobert Olsson 	/*
157719baf839SRobert Olsson 	 * Key found.
157819baf839SRobert Olsson 	 * Remove the leaf and rebalance the tree
157919baf839SRobert Olsson 	 */
158006801916SStephen Hemminger 	tp = node_parent(n);
158119baf839SRobert Olsson 	tnode_free((struct tnode *) n);
158219baf839SRobert Olsson 
158319baf839SRobert Olsson 	if (tp) {
158419baf839SRobert Olsson 		cindex = tkey_extract_bits(key, tp->pos, tp->bits);
158519baf839SRobert Olsson 		put_child(t, (struct tnode *)tp, cindex, NULL);
15862373ce1cSRobert Olsson 		rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
158791b9a277SOlof Johansson 	} else
15882373ce1cSRobert Olsson 		rcu_assign_pointer(t->trie, NULL);
158919baf839SRobert Olsson 
159019baf839SRobert Olsson 	return 1;
159119baf839SRobert Olsson }
159219baf839SRobert Olsson 
1593d562f1f8SRobert Olsson /*
1594d562f1f8SRobert Olsson  * Caller must hold RTNL.
1595d562f1f8SRobert Olsson  */
15964e902c57SThomas Graf static int fn_trie_delete(struct fib_table *tb, struct fib_config *cfg)
159719baf839SRobert Olsson {
159819baf839SRobert Olsson 	struct trie *t = (struct trie *) tb->tb_data;
159919baf839SRobert Olsson 	u32 key, mask;
16004e902c57SThomas Graf 	int plen = cfg->fc_dst_len;
16014e902c57SThomas Graf 	u8 tos = cfg->fc_tos;
160219baf839SRobert Olsson 	struct fib_alias *fa, *fa_to_delete;
160319baf839SRobert Olsson 	struct list_head *fa_head;
160419baf839SRobert Olsson 	struct leaf *l;
160591b9a277SOlof Johansson 	struct leaf_info *li;
160691b9a277SOlof Johansson 
160719baf839SRobert Olsson 	if (plen > 32)
160819baf839SRobert Olsson 		return -EINVAL;
160919baf839SRobert Olsson 
16104e902c57SThomas Graf 	key = ntohl(cfg->fc_dst);
161119baf839SRobert Olsson 	mask = ntohl(inet_make_mask(plen));
161219baf839SRobert Olsson 
161319baf839SRobert Olsson 	if (key & ~mask)
161419baf839SRobert Olsson 		return -EINVAL;
161519baf839SRobert Olsson 
161619baf839SRobert Olsson 	key = key & mask;
161719baf839SRobert Olsson 	l = fib_find_node(t, key);
161819baf839SRobert Olsson 
161919baf839SRobert Olsson 	if (!l)
162019baf839SRobert Olsson 		return -ESRCH;
162119baf839SRobert Olsson 
162219baf839SRobert Olsson 	fa_head = get_fa_head(l, plen);
162319baf839SRobert Olsson 	fa = fib_find_alias(fa_head, tos, 0);
162419baf839SRobert Olsson 
162519baf839SRobert Olsson 	if (!fa)
162619baf839SRobert Olsson 		return -ESRCH;
162719baf839SRobert Olsson 
16280c7770c7SStephen Hemminger 	pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
162919baf839SRobert Olsson 
163019baf839SRobert Olsson 	fa_to_delete = NULL;
163119baf839SRobert Olsson 	fa_head = fa->fa_list.prev;
16322373ce1cSRobert Olsson 
163319baf839SRobert Olsson 	list_for_each_entry(fa, fa_head, fa_list) {
163419baf839SRobert Olsson 		struct fib_info *fi = fa->fa_info;
163519baf839SRobert Olsson 
163619baf839SRobert Olsson 		if (fa->fa_tos != tos)
163719baf839SRobert Olsson 			break;
163819baf839SRobert Olsson 
16394e902c57SThomas Graf 		if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
16404e902c57SThomas Graf 		    (cfg->fc_scope == RT_SCOPE_NOWHERE ||
16414e902c57SThomas Graf 		     fa->fa_scope == cfg->fc_scope) &&
16424e902c57SThomas Graf 		    (!cfg->fc_protocol ||
16434e902c57SThomas Graf 		     fi->fib_protocol == cfg->fc_protocol) &&
16444e902c57SThomas Graf 		    fib_nh_match(cfg, fi) == 0) {
164519baf839SRobert Olsson 			fa_to_delete = fa;
164619baf839SRobert Olsson 			break;
164719baf839SRobert Olsson 		}
164819baf839SRobert Olsson 	}
164919baf839SRobert Olsson 
165091b9a277SOlof Johansson 	if (!fa_to_delete)
165191b9a277SOlof Johansson 		return -ESRCH;
165219baf839SRobert Olsson 
165319baf839SRobert Olsson 	fa = fa_to_delete;
16544e902c57SThomas Graf 	rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
1655b8f55831SMilan Kocian 		  &cfg->fc_nlinfo, 0);
165619baf839SRobert Olsson 
165719baf839SRobert Olsson 	l = fib_find_node(t, key);
1658772cb712SRobert Olsson 	li = find_leaf_info(l, plen);
165919baf839SRobert Olsson 
16602373ce1cSRobert Olsson 	list_del_rcu(&fa->fa_list);
166119baf839SRobert Olsson 
166219baf839SRobert Olsson 	if (list_empty(fa_head)) {
16632373ce1cSRobert Olsson 		hlist_del_rcu(&li->hlist);
166419baf839SRobert Olsson 		free_leaf_info(li);
16652373ce1cSRobert Olsson 	}
166619baf839SRobert Olsson 
166719baf839SRobert Olsson 	if (hlist_empty(&l->list))
166819baf839SRobert Olsson 		trie_leaf_remove(t, key);
166919baf839SRobert Olsson 
167019baf839SRobert Olsson 	if (fa->fa_state & FA_S_ACCESSED)
167119baf839SRobert Olsson 		rt_cache_flush(-1);
167219baf839SRobert Olsson 
16732373ce1cSRobert Olsson 	fib_release_info(fa->fa_info);
16742373ce1cSRobert Olsson 	alias_free_mem_rcu(fa);
167519baf839SRobert Olsson 	return 0;
167619baf839SRobert Olsson }
167719baf839SRobert Olsson 
167819baf839SRobert Olsson static int trie_flush_list(struct trie *t, struct list_head *head)
167919baf839SRobert Olsson {
168019baf839SRobert Olsson 	struct fib_alias *fa, *fa_node;
168119baf839SRobert Olsson 	int found = 0;
168219baf839SRobert Olsson 
168319baf839SRobert Olsson 	list_for_each_entry_safe(fa, fa_node, head, fa_list) {
168419baf839SRobert Olsson 		struct fib_info *fi = fa->fa_info;
168519baf839SRobert Olsson 
168619baf839SRobert Olsson 		if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
16872373ce1cSRobert Olsson 			list_del_rcu(&fa->fa_list);
16882373ce1cSRobert Olsson 			fib_release_info(fa->fa_info);
16892373ce1cSRobert Olsson 			alias_free_mem_rcu(fa);
169019baf839SRobert Olsson 			found++;
169119baf839SRobert Olsson 		}
169219baf839SRobert Olsson 	}
169319baf839SRobert Olsson 	return found;
169419baf839SRobert Olsson }
169519baf839SRobert Olsson 
169619baf839SRobert Olsson static int trie_flush_leaf(struct trie *t, struct leaf *l)
169719baf839SRobert Olsson {
169819baf839SRobert Olsson 	int found = 0;
169919baf839SRobert Olsson 	struct hlist_head *lih = &l->list;
170019baf839SRobert Olsson 	struct hlist_node *node, *tmp;
170119baf839SRobert Olsson 	struct leaf_info *li = NULL;
170219baf839SRobert Olsson 
170319baf839SRobert Olsson 	hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
170419baf839SRobert Olsson 		found += trie_flush_list(t, &li->falh);
170519baf839SRobert Olsson 
170619baf839SRobert Olsson 		if (list_empty(&li->falh)) {
17072373ce1cSRobert Olsson 			hlist_del_rcu(&li->hlist);
170819baf839SRobert Olsson 			free_leaf_info(li);
170919baf839SRobert Olsson 		}
171019baf839SRobert Olsson 	}
171119baf839SRobert Olsson 	return found;
171219baf839SRobert Olsson }
171319baf839SRobert Olsson 
17142373ce1cSRobert Olsson /* rcu_read_lock needs to be hold by caller from readside */
17152373ce1cSRobert Olsson 
171619baf839SRobert Olsson static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
171719baf839SRobert Olsson {
171819baf839SRobert Olsson 	struct node *c = (struct node *) thisleaf;
171919baf839SRobert Olsson 	struct tnode *p;
172019baf839SRobert Olsson 	int idx;
17212373ce1cSRobert Olsson 	struct node *trie = rcu_dereference(t->trie);
172219baf839SRobert Olsson 
172319baf839SRobert Olsson 	if (c == NULL) {
17242373ce1cSRobert Olsson 		if (trie == NULL)
172519baf839SRobert Olsson 			return NULL;
172619baf839SRobert Olsson 
17272373ce1cSRobert Olsson 		if (IS_LEAF(trie))          /* trie w. just a leaf */
17282373ce1cSRobert Olsson 			return (struct leaf *) trie;
172919baf839SRobert Olsson 
17302373ce1cSRobert Olsson 		p = (struct tnode *)trie;  /* Start */
173191b9a277SOlof Johansson 	} else
1732b59cfbf7SEric Dumazet 		p = node_parent_rcu(c);
1733c877efb2SStephen Hemminger 
173419baf839SRobert Olsson 	while (p) {
173519baf839SRobert Olsson 		int pos, last;
173619baf839SRobert Olsson 
173719baf839SRobert Olsson 		/*  Find the next child of the parent */
173819baf839SRobert Olsson 		if (c)
173919baf839SRobert Olsson 			pos = 1 + tkey_extract_bits(c->key, p->pos, p->bits);
174019baf839SRobert Olsson 		else
174119baf839SRobert Olsson 			pos = 0;
174219baf839SRobert Olsson 
174319baf839SRobert Olsson 		last = 1 << p->bits;
174419baf839SRobert Olsson 		for (idx = pos; idx < last ; idx++) {
17452373ce1cSRobert Olsson 			c = rcu_dereference(p->child[idx]);
17462373ce1cSRobert Olsson 
17472373ce1cSRobert Olsson 			if (!c)
174891b9a277SOlof Johansson 				continue;
174919baf839SRobert Olsson 
175019baf839SRobert Olsson 			/* Decend if tnode */
17512373ce1cSRobert Olsson 			while (IS_TNODE(c)) {
17522373ce1cSRobert Olsson 				p = (struct tnode *) c;
175319baf839SRobert Olsson 				idx = 0;
175419baf839SRobert Olsson 
175519baf839SRobert Olsson 				/* Rightmost non-NULL branch */
175619baf839SRobert Olsson 				if (p && IS_TNODE(p))
17572373ce1cSRobert Olsson 					while (!(c = rcu_dereference(p->child[idx]))
17582373ce1cSRobert Olsson 					       && idx < (1<<p->bits)) idx++;
175919baf839SRobert Olsson 
176019baf839SRobert Olsson 				/* Done with this tnode? */
17612373ce1cSRobert Olsson 				if (idx >= (1 << p->bits) || !c)
176219baf839SRobert Olsson 					goto up;
176319baf839SRobert Olsson 			}
17642373ce1cSRobert Olsson 			return (struct leaf *) c;
176519baf839SRobert Olsson 		}
176619baf839SRobert Olsson up:
176719baf839SRobert Olsson 		/* No more children go up one step  */
176819baf839SRobert Olsson 		c = (struct node *) p;
1769b59cfbf7SEric Dumazet 		p = node_parent_rcu(c);
177019baf839SRobert Olsson 	}
177119baf839SRobert Olsson 	return NULL; /* Ready. Root of trie */
177219baf839SRobert Olsson }
177319baf839SRobert Olsson 
1774d562f1f8SRobert Olsson /*
1775d562f1f8SRobert Olsson  * Caller must hold RTNL.
1776d562f1f8SRobert Olsson  */
177719baf839SRobert Olsson static int fn_trie_flush(struct fib_table *tb)
177819baf839SRobert Olsson {
177919baf839SRobert Olsson 	struct trie *t = (struct trie *) tb->tb_data;
178019baf839SRobert Olsson 	struct leaf *ll = NULL, *l = NULL;
178119baf839SRobert Olsson 	int found = 0, h;
178219baf839SRobert Olsson 
178319baf839SRobert Olsson 	for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
178419baf839SRobert Olsson 		found += trie_flush_leaf(t, l);
178519baf839SRobert Olsson 
178619baf839SRobert Olsson 		if (ll && hlist_empty(&ll->list))
178719baf839SRobert Olsson 			trie_leaf_remove(t, ll->key);
178819baf839SRobert Olsson 		ll = l;
178919baf839SRobert Olsson 	}
179019baf839SRobert Olsson 
179119baf839SRobert Olsson 	if (ll && hlist_empty(&ll->list))
179219baf839SRobert Olsson 		trie_leaf_remove(t, ll->key);
179319baf839SRobert Olsson 
17940c7770c7SStephen Hemminger 	pr_debug("trie_flush found=%d\n", found);
179519baf839SRobert Olsson 	return found;
179619baf839SRobert Olsson }
179719baf839SRobert Olsson 
1798a07f5f50SStephen Hemminger static void fn_trie_select_default(struct fib_table *tb,
1799a07f5f50SStephen Hemminger 				   const struct flowi *flp,
1800a07f5f50SStephen Hemminger 				   struct fib_result *res)
180119baf839SRobert Olsson {
180219baf839SRobert Olsson 	struct trie *t = (struct trie *) tb->tb_data;
180319baf839SRobert Olsson 	int order, last_idx;
180419baf839SRobert Olsson 	struct fib_info *fi = NULL;
180519baf839SRobert Olsson 	struct fib_info *last_resort;
180619baf839SRobert Olsson 	struct fib_alias *fa = NULL;
180719baf839SRobert Olsson 	struct list_head *fa_head;
180819baf839SRobert Olsson 	struct leaf *l;
180919baf839SRobert Olsson 
181019baf839SRobert Olsson 	last_idx = -1;
181119baf839SRobert Olsson 	last_resort = NULL;
181219baf839SRobert Olsson 	order = -1;
181319baf839SRobert Olsson 
18142373ce1cSRobert Olsson 	rcu_read_lock();
181519baf839SRobert Olsson 
181619baf839SRobert Olsson 	l = fib_find_node(t, 0);
181719baf839SRobert Olsson 	if (!l)
181819baf839SRobert Olsson 		goto out;
181919baf839SRobert Olsson 
182019baf839SRobert Olsson 	fa_head = get_fa_head(l, 0);
182119baf839SRobert Olsson 	if (!fa_head)
182219baf839SRobert Olsson 		goto out;
182319baf839SRobert Olsson 
182419baf839SRobert Olsson 	if (list_empty(fa_head))
182519baf839SRobert Olsson 		goto out;
182619baf839SRobert Olsson 
18272373ce1cSRobert Olsson 	list_for_each_entry_rcu(fa, fa_head, fa_list) {
182819baf839SRobert Olsson 		struct fib_info *next_fi = fa->fa_info;
182919baf839SRobert Olsson 
183019baf839SRobert Olsson 		if (fa->fa_scope != res->scope ||
183119baf839SRobert Olsson 		    fa->fa_type != RTN_UNICAST)
183219baf839SRobert Olsson 			continue;
183319baf839SRobert Olsson 
183419baf839SRobert Olsson 		if (next_fi->fib_priority > res->fi->fib_priority)
183519baf839SRobert Olsson 			break;
183619baf839SRobert Olsson 		if (!next_fi->fib_nh[0].nh_gw ||
183719baf839SRobert Olsson 		    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
183819baf839SRobert Olsson 			continue;
183919baf839SRobert Olsson 		fa->fa_state |= FA_S_ACCESSED;
184019baf839SRobert Olsson 
184119baf839SRobert Olsson 		if (fi == NULL) {
184219baf839SRobert Olsson 			if (next_fi != res->fi)
184319baf839SRobert Olsson 				break;
184419baf839SRobert Olsson 		} else if (!fib_detect_death(fi, order, &last_resort,
1845971b893eSDenis V. Lunev 					     &last_idx, tb->tb_default)) {
1846a2bbe682SDenis V. Lunev 			fib_result_assign(res, fi);
1847971b893eSDenis V. Lunev 			tb->tb_default = order;
184819baf839SRobert Olsson 			goto out;
184919baf839SRobert Olsson 		}
185019baf839SRobert Olsson 		fi = next_fi;
185119baf839SRobert Olsson 		order++;
185219baf839SRobert Olsson 	}
185319baf839SRobert Olsson 	if (order <= 0 || fi == NULL) {
1854971b893eSDenis V. Lunev 		tb->tb_default = -1;
185519baf839SRobert Olsson 		goto out;
185619baf839SRobert Olsson 	}
185719baf839SRobert Olsson 
1858971b893eSDenis V. Lunev 	if (!fib_detect_death(fi, order, &last_resort, &last_idx,
1859971b893eSDenis V. Lunev 				tb->tb_default)) {
1860a2bbe682SDenis V. Lunev 		fib_result_assign(res, fi);
1861971b893eSDenis V. Lunev 		tb->tb_default = order;
186219baf839SRobert Olsson 		goto out;
186319baf839SRobert Olsson 	}
1864a2bbe682SDenis V. Lunev 	if (last_idx >= 0)
1865a2bbe682SDenis V. Lunev 		fib_result_assign(res, last_resort);
1866971b893eSDenis V. Lunev 	tb->tb_default = last_idx;
1867971b893eSDenis V. Lunev out:
18682373ce1cSRobert Olsson 	rcu_read_unlock();
186919baf839SRobert Olsson }
187019baf839SRobert Olsson 
1871a07f5f50SStephen Hemminger static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
1872a07f5f50SStephen Hemminger 			   struct fib_table *tb,
187319baf839SRobert Olsson 			   struct sk_buff *skb, struct netlink_callback *cb)
187419baf839SRobert Olsson {
187519baf839SRobert Olsson 	int i, s_i;
187619baf839SRobert Olsson 	struct fib_alias *fa;
187719baf839SRobert Olsson 
187832ab5f80SAl Viro 	__be32 xkey = htonl(key);
187919baf839SRobert Olsson 
18801af5a8c4SPatrick McHardy 	s_i = cb->args[4];
188119baf839SRobert Olsson 	i = 0;
188219baf839SRobert Olsson 
18832373ce1cSRobert Olsson 	/* rcu_read_lock is hold by caller */
18842373ce1cSRobert Olsson 
18852373ce1cSRobert Olsson 	list_for_each_entry_rcu(fa, fah, fa_list) {
188619baf839SRobert Olsson 		if (i < s_i) {
188719baf839SRobert Olsson 			i++;
188819baf839SRobert Olsson 			continue;
188919baf839SRobert Olsson 		}
189078c6671aSStephen Hemminger 		BUG_ON(!fa->fa_info);
189119baf839SRobert Olsson 
189219baf839SRobert Olsson 		if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
189319baf839SRobert Olsson 				  cb->nlh->nlmsg_seq,
189419baf839SRobert Olsson 				  RTM_NEWROUTE,
189519baf839SRobert Olsson 				  tb->tb_id,
189619baf839SRobert Olsson 				  fa->fa_type,
189719baf839SRobert Olsson 				  fa->fa_scope,
1898be403ea1SThomas Graf 				  xkey,
189919baf839SRobert Olsson 				  plen,
190019baf839SRobert Olsson 				  fa->fa_tos,
190164347f78SStephen Hemminger 				  fa->fa_info, NLM_F_MULTI) < 0) {
19021af5a8c4SPatrick McHardy 			cb->args[4] = i;
190319baf839SRobert Olsson 			return -1;
190419baf839SRobert Olsson 		}
190519baf839SRobert Olsson 		i++;
190619baf839SRobert Olsson 	}
19071af5a8c4SPatrick McHardy 	cb->args[4] = i;
190819baf839SRobert Olsson 	return skb->len;
190919baf839SRobert Olsson }
191019baf839SRobert Olsson 
1911a07f5f50SStephen Hemminger static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb,
1912a07f5f50SStephen Hemminger 			     struct sk_buff *skb, struct netlink_callback *cb)
191319baf839SRobert Olsson {
191419baf839SRobert Olsson 	int h, s_h;
191519baf839SRobert Olsson 	struct list_head *fa_head;
191619baf839SRobert Olsson 	struct leaf *l = NULL;
191791b9a277SOlof Johansson 
19181af5a8c4SPatrick McHardy 	s_h = cb->args[3];
191919baf839SRobert Olsson 
192019baf839SRobert Olsson 	for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
192119baf839SRobert Olsson 		if (h < s_h)
192219baf839SRobert Olsson 			continue;
192319baf839SRobert Olsson 		if (h > s_h)
19241af5a8c4SPatrick McHardy 			memset(&cb->args[4], 0,
19251af5a8c4SPatrick McHardy 			       sizeof(cb->args) - 4*sizeof(cb->args[0]));
192619baf839SRobert Olsson 
192719baf839SRobert Olsson 		fa_head = get_fa_head(l, plen);
192819baf839SRobert Olsson 
192919baf839SRobert Olsson 		if (!fa_head)
193019baf839SRobert Olsson 			continue;
193119baf839SRobert Olsson 
193219baf839SRobert Olsson 		if (list_empty(fa_head))
193319baf839SRobert Olsson 			continue;
193419baf839SRobert Olsson 
193519baf839SRobert Olsson 		if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb) < 0) {
19361af5a8c4SPatrick McHardy 			cb->args[3] = h;
193719baf839SRobert Olsson 			return -1;
193819baf839SRobert Olsson 		}
193919baf839SRobert Olsson 	}
19401af5a8c4SPatrick McHardy 	cb->args[3] = h;
194119baf839SRobert Olsson 	return skb->len;
194219baf839SRobert Olsson }
194319baf839SRobert Olsson 
1944a07f5f50SStephen Hemminger static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb,
1945a07f5f50SStephen Hemminger 			struct netlink_callback *cb)
194619baf839SRobert Olsson {
194719baf839SRobert Olsson 	int m, s_m;
194819baf839SRobert Olsson 	struct trie *t = (struct trie *) tb->tb_data;
194919baf839SRobert Olsson 
19501af5a8c4SPatrick McHardy 	s_m = cb->args[2];
195119baf839SRobert Olsson 
19522373ce1cSRobert Olsson 	rcu_read_lock();
195319baf839SRobert Olsson 	for (m = 0; m <= 32; m++) {
195419baf839SRobert Olsson 		if (m < s_m)
195519baf839SRobert Olsson 			continue;
195619baf839SRobert Olsson 		if (m > s_m)
19571af5a8c4SPatrick McHardy 			memset(&cb->args[3], 0,
19581af5a8c4SPatrick McHardy 				sizeof(cb->args) - 3*sizeof(cb->args[0]));
195919baf839SRobert Olsson 
196019baf839SRobert Olsson 		if (fn_trie_dump_plen(t, 32-m, tb, skb, cb) < 0) {
19611af5a8c4SPatrick McHardy 			cb->args[2] = m;
196219baf839SRobert Olsson 			goto out;
196319baf839SRobert Olsson 		}
196419baf839SRobert Olsson 	}
19652373ce1cSRobert Olsson 	rcu_read_unlock();
19661af5a8c4SPatrick McHardy 	cb->args[2] = m;
196719baf839SRobert Olsson 	return skb->len;
196819baf839SRobert Olsson out:
19692373ce1cSRobert Olsson 	rcu_read_unlock();
197019baf839SRobert Olsson 	return -1;
197119baf839SRobert Olsson }
197219baf839SRobert Olsson 
19737f9b8052SStephen Hemminger void __init fib_hash_init(void)
19747f9b8052SStephen Hemminger {
1975a07f5f50SStephen Hemminger 	fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1976a07f5f50SStephen Hemminger 					  sizeof(struct fib_alias),
1977bc3c8c1eSStephen Hemminger 					  0, SLAB_PANIC, NULL);
1978bc3c8c1eSStephen Hemminger 
1979bc3c8c1eSStephen Hemminger 	trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
1980bc3c8c1eSStephen Hemminger 					   max(sizeof(struct leaf),
1981bc3c8c1eSStephen Hemminger 					       sizeof(struct leaf_info)),
1982bc3c8c1eSStephen Hemminger 					   0, SLAB_PANIC, NULL);
19837f9b8052SStephen Hemminger }
198419baf839SRobert Olsson 
19857f9b8052SStephen Hemminger 
19867f9b8052SStephen Hemminger /* Fix more generic FIB names for init later */
19877f9b8052SStephen Hemminger struct fib_table *fib_hash_table(u32 id)
198819baf839SRobert Olsson {
198919baf839SRobert Olsson 	struct fib_table *tb;
199019baf839SRobert Olsson 	struct trie *t;
199119baf839SRobert Olsson 
199219baf839SRobert Olsson 	tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
199319baf839SRobert Olsson 		     GFP_KERNEL);
199419baf839SRobert Olsson 	if (tb == NULL)
199519baf839SRobert Olsson 		return NULL;
199619baf839SRobert Olsson 
199719baf839SRobert Olsson 	tb->tb_id = id;
1998971b893eSDenis V. Lunev 	tb->tb_default = -1;
199919baf839SRobert Olsson 	tb->tb_lookup = fn_trie_lookup;
200019baf839SRobert Olsson 	tb->tb_insert = fn_trie_insert;
200119baf839SRobert Olsson 	tb->tb_delete = fn_trie_delete;
200219baf839SRobert Olsson 	tb->tb_flush = fn_trie_flush;
200319baf839SRobert Olsson 	tb->tb_select_default = fn_trie_select_default;
200419baf839SRobert Olsson 	tb->tb_dump = fn_trie_dump;
200519baf839SRobert Olsson 
200619baf839SRobert Olsson 	t = (struct trie *) tb->tb_data;
2007c28a1cf4SStephen Hemminger 	memset(t, 0, sizeof(*t));
200819baf839SRobert Olsson 
200919baf839SRobert Olsson 	if (id == RT_TABLE_LOCAL)
2010a07f5f50SStephen Hemminger 		pr_info("IPv4 FIB: Using LC-trie version %s\n", VERSION);
201119baf839SRobert Olsson 
201219baf839SRobert Olsson 	return tb;
201319baf839SRobert Olsson }
201419baf839SRobert Olsson 
2015cb7b593cSStephen Hemminger #ifdef CONFIG_PROC_FS
2016cb7b593cSStephen Hemminger /* Depth first Trie walk iterator */
2017cb7b593cSStephen Hemminger struct fib_trie_iter {
20181c340b2fSDenis V. Lunev 	struct seq_net_private p;
2019877a9bffSEric W. Biederman 	struct trie *trie_local, *trie_main;
2020cb7b593cSStephen Hemminger 	struct tnode *tnode;
2021cb7b593cSStephen Hemminger 	struct trie *trie;
2022cb7b593cSStephen Hemminger 	unsigned index;
2023cb7b593cSStephen Hemminger 	unsigned depth;
2024cb7b593cSStephen Hemminger };
202519baf839SRobert Olsson 
2026cb7b593cSStephen Hemminger static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
202719baf839SRobert Olsson {
2028cb7b593cSStephen Hemminger 	struct tnode *tn = iter->tnode;
2029cb7b593cSStephen Hemminger 	unsigned cindex = iter->index;
2030cb7b593cSStephen Hemminger 	struct tnode *p;
203119baf839SRobert Olsson 
20326640e697SEric W. Biederman 	/* A single entry routing table */
20336640e697SEric W. Biederman 	if (!tn)
20346640e697SEric W. Biederman 		return NULL;
20356640e697SEric W. Biederman 
2036cb7b593cSStephen Hemminger 	pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
2037cb7b593cSStephen Hemminger 		 iter->tnode, iter->index, iter->depth);
2038cb7b593cSStephen Hemminger rescan:
2039cb7b593cSStephen Hemminger 	while (cindex < (1<<tn->bits)) {
2040b59cfbf7SEric Dumazet 		struct node *n = tnode_get_child_rcu(tn, cindex);
204119baf839SRobert Olsson 
2042cb7b593cSStephen Hemminger 		if (n) {
204319baf839SRobert Olsson 			if (IS_LEAF(n)) {
2044cb7b593cSStephen Hemminger 				iter->tnode = tn;
2045cb7b593cSStephen Hemminger 				iter->index = cindex + 1;
204691b9a277SOlof Johansson 			} else {
2047cb7b593cSStephen Hemminger 				/* push down one level */
2048cb7b593cSStephen Hemminger 				iter->tnode = (struct tnode *) n;
2049cb7b593cSStephen Hemminger 				iter->index = 0;
2050cb7b593cSStephen Hemminger 				++iter->depth;
205119baf839SRobert Olsson 			}
2052cb7b593cSStephen Hemminger 			return n;
205319baf839SRobert Olsson 		}
205419baf839SRobert Olsson 
2055cb7b593cSStephen Hemminger 		++cindex;
2056cb7b593cSStephen Hemminger 	}
2057cb7b593cSStephen Hemminger 
2058cb7b593cSStephen Hemminger 	/* Current node exhausted, pop back up */
2059b59cfbf7SEric Dumazet 	p = node_parent_rcu((struct node *)tn);
2060cb7b593cSStephen Hemminger 	if (p) {
2061cb7b593cSStephen Hemminger 		cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
2062cb7b593cSStephen Hemminger 		tn = p;
2063cb7b593cSStephen Hemminger 		--iter->depth;
2064cb7b593cSStephen Hemminger 		goto rescan;
2065cb7b593cSStephen Hemminger 	}
2066cb7b593cSStephen Hemminger 
2067cb7b593cSStephen Hemminger 	/* got root? */
2068cb7b593cSStephen Hemminger 	return NULL;
2069cb7b593cSStephen Hemminger }
2070cb7b593cSStephen Hemminger 
2071cb7b593cSStephen Hemminger static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
2072cb7b593cSStephen Hemminger 				       struct trie *t)
2073cb7b593cSStephen Hemminger {
20745ddf0eb2SRobert Olsson 	struct node *n ;
20755ddf0eb2SRobert Olsson 
20765ddf0eb2SRobert Olsson 	if (!t)
20775ddf0eb2SRobert Olsson 		return NULL;
20785ddf0eb2SRobert Olsson 
20795ddf0eb2SRobert Olsson 	n = rcu_dereference(t->trie);
20805ddf0eb2SRobert Olsson 
20815ddf0eb2SRobert Olsson 	if (!iter)
20825ddf0eb2SRobert Olsson 		return NULL;
2083cb7b593cSStephen Hemminger 
20846640e697SEric W. Biederman 	if (n) {
20856640e697SEric W. Biederman 		if (IS_TNODE(n)) {
2086cb7b593cSStephen Hemminger 			iter->tnode = (struct tnode *) n;
2087cb7b593cSStephen Hemminger 			iter->trie = t;
2088cb7b593cSStephen Hemminger 			iter->index = 0;
20891d25cd6cSRobert Olsson 			iter->depth = 1;
20906640e697SEric W. Biederman 		} else {
20916640e697SEric W. Biederman 			iter->tnode = NULL;
20926640e697SEric W. Biederman 			iter->trie  = t;
20936640e697SEric W. Biederman 			iter->index = 0;
20946640e697SEric W. Biederman 			iter->depth = 0;
20956640e697SEric W. Biederman 		}
2096cb7b593cSStephen Hemminger 		return n;
2097cb7b593cSStephen Hemminger 	}
2098cb7b593cSStephen Hemminger 	return NULL;
2099cb7b593cSStephen Hemminger }
2100cb7b593cSStephen Hemminger 
2101cb7b593cSStephen Hemminger static void trie_collect_stats(struct trie *t, struct trie_stat *s)
210219baf839SRobert Olsson {
21032373ce1cSRobert Olsson 	struct node *n;
2104cb7b593cSStephen Hemminger 	struct fib_trie_iter iter;
2105cb7b593cSStephen Hemminger 
2106cb7b593cSStephen Hemminger 	memset(s, 0, sizeof(*s));
210719baf839SRobert Olsson 
21082373ce1cSRobert Olsson 	rcu_read_lock();
2109cb7b593cSStephen Hemminger 	for (n = fib_trie_get_first(&iter, t); n;
2110cb7b593cSStephen Hemminger 	     n = fib_trie_get_next(&iter)) {
2111cb7b593cSStephen Hemminger 		if (IS_LEAF(n)) {
211293672292SStephen Hemminger 			struct leaf *l = (struct leaf *)n;
211393672292SStephen Hemminger 			struct leaf_info *li;
211493672292SStephen Hemminger 			struct hlist_node *tmp;
211593672292SStephen Hemminger 
211619baf839SRobert Olsson 			s->leaves++;
2117cb7b593cSStephen Hemminger 			s->totdepth += iter.depth;
2118cb7b593cSStephen Hemminger 			if (iter.depth > s->maxdepth)
2119cb7b593cSStephen Hemminger 				s->maxdepth = iter.depth;
212093672292SStephen Hemminger 
212193672292SStephen Hemminger 			hlist_for_each_entry_rcu(li, tmp, &l->list, hlist)
212293672292SStephen Hemminger 				++s->prefixes;
212391b9a277SOlof Johansson 		} else {
2124cb7b593cSStephen Hemminger 			const struct tnode *tn = (const struct tnode *) n;
2125cb7b593cSStephen Hemminger 			int i;
212619baf839SRobert Olsson 
212719baf839SRobert Olsson 			s->tnodes++;
212806ef921dSRobert Olsson 			if (tn->bits < MAX_STAT_DEPTH)
212919baf839SRobert Olsson 				s->nodesizes[tn->bits]++;
213006ef921dSRobert Olsson 
2131cb7b593cSStephen Hemminger 			for (i = 0; i < (1<<tn->bits); i++)
2132cb7b593cSStephen Hemminger 				if (!tn->child[i])
213319baf839SRobert Olsson 					s->nullpointers++;
213419baf839SRobert Olsson 		}
213519baf839SRobert Olsson 	}
21362373ce1cSRobert Olsson 	rcu_read_unlock();
213719baf839SRobert Olsson }
213819baf839SRobert Olsson 
213919baf839SRobert Olsson /*
214019baf839SRobert Olsson  *	This outputs /proc/net/fib_triestats
214119baf839SRobert Olsson  */
2142cb7b593cSStephen Hemminger static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
214319baf839SRobert Olsson {
2144cb7b593cSStephen Hemminger 	unsigned i, max, pointers, bytes, avdepth;
214519baf839SRobert Olsson 
214619baf839SRobert Olsson 	if (stat->leaves)
214719baf839SRobert Olsson 		avdepth = stat->totdepth*100 / stat->leaves;
214819baf839SRobert Olsson 	else
214919baf839SRobert Olsson 		avdepth = 0;
215019baf839SRobert Olsson 
2151a07f5f50SStephen Hemminger 	seq_printf(seq, "\tAver depth:     %u.%02d\n",
2152a07f5f50SStephen Hemminger 		   avdepth / 100, avdepth % 100);
2153cb7b593cSStephen Hemminger 	seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);
2154cb7b593cSStephen Hemminger 
2155cb7b593cSStephen Hemminger 	seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
2156cb7b593cSStephen Hemminger 	bytes = sizeof(struct leaf) * stat->leaves;
215793672292SStephen Hemminger 
215893672292SStephen Hemminger 	seq_printf(seq, "\tPrefixes:       %u\n", stat->prefixes);
215993672292SStephen Hemminger 	bytes += sizeof(struct leaf_info) * stat->prefixes;
216093672292SStephen Hemminger 
2161187b5188SStephen Hemminger 	seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
216219baf839SRobert Olsson 	bytes += sizeof(struct tnode) * stat->tnodes;
216319baf839SRobert Olsson 
216406ef921dSRobert Olsson 	max = MAX_STAT_DEPTH;
216506ef921dSRobert Olsson 	while (max > 0 && stat->nodesizes[max-1] == 0)
216619baf839SRobert Olsson 		max--;
216719baf839SRobert Olsson 
2168cb7b593cSStephen Hemminger 	pointers = 0;
216919baf839SRobert Olsson 	for (i = 1; i <= max; i++)
217019baf839SRobert Olsson 		if (stat->nodesizes[i] != 0) {
2171187b5188SStephen Hemminger 			seq_printf(seq, "  %u: %u",  i, stat->nodesizes[i]);
217219baf839SRobert Olsson 			pointers += (1<<i) * stat->nodesizes[i];
217319baf839SRobert Olsson 		}
2174cb7b593cSStephen Hemminger 	seq_putc(seq, '\n');
2175187b5188SStephen Hemminger 	seq_printf(seq, "\tPointers: %u\n", pointers);
2176cb7b593cSStephen Hemminger 
217719baf839SRobert Olsson 	bytes += sizeof(struct node *) * pointers;
2178187b5188SStephen Hemminger 	seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2179187b5188SStephen Hemminger 	seq_printf(seq, "Total size: %u  kB\n", (bytes + 1023) / 1024);
218066a2f7fdSStephen Hemminger }
218119baf839SRobert Olsson 
218219baf839SRobert Olsson #ifdef CONFIG_IP_FIB_TRIE_STATS
218366a2f7fdSStephen Hemminger static void trie_show_usage(struct seq_file *seq,
218466a2f7fdSStephen Hemminger 			    const struct trie_use_stats *stats)
218566a2f7fdSStephen Hemminger {
218666a2f7fdSStephen Hemminger 	seq_printf(seq, "\nCounters:\n---------\n");
218766a2f7fdSStephen Hemminger 	seq_printf(seq, "gets = %u\n", stats->gets);
218866a2f7fdSStephen Hemminger 	seq_printf(seq, "backtracks = %u\n", stats->backtrack);
2189a07f5f50SStephen Hemminger 	seq_printf(seq, "semantic match passed = %u\n",
2190a07f5f50SStephen Hemminger 		   stats->semantic_match_passed);
2191a07f5f50SStephen Hemminger 	seq_printf(seq, "semantic match miss = %u\n",
2192a07f5f50SStephen Hemminger 		   stats->semantic_match_miss);
219366a2f7fdSStephen Hemminger 	seq_printf(seq, "null node hit= %u\n", stats->null_node_hit);
2194a07f5f50SStephen Hemminger 	seq_printf(seq, "skipped node resize = %u\n\n",
2195a07f5f50SStephen Hemminger 		   stats->resize_node_skipped);
219619baf839SRobert Olsson }
219766a2f7fdSStephen Hemminger #endif /*  CONFIG_IP_FIB_TRIE_STATS */
219866a2f7fdSStephen Hemminger 
2199a07f5f50SStephen Hemminger static void fib_trie_show(struct seq_file *seq, const char *name,
2200a07f5f50SStephen Hemminger 			  struct trie *trie)
2201d717a9a6SStephen Hemminger {
2202d717a9a6SStephen Hemminger 	struct trie_stat stat;
2203d717a9a6SStephen Hemminger 
2204d717a9a6SStephen Hemminger 	trie_collect_stats(trie, &stat);
220593672292SStephen Hemminger 	seq_printf(seq, "%s:\n", name);
2206d717a9a6SStephen Hemminger 	trie_show_stats(seq, &stat);
2207d717a9a6SStephen Hemminger #ifdef CONFIG_IP_FIB_TRIE_STATS
2208d717a9a6SStephen Hemminger 	trie_show_usage(seq, &trie->stats);
2209d717a9a6SStephen Hemminger #endif
2210d717a9a6SStephen Hemminger }
221119baf839SRobert Olsson 
221219baf839SRobert Olsson static int fib_triestat_seq_show(struct seq_file *seq, void *v)
221319baf839SRobert Olsson {
22141c340b2fSDenis V. Lunev 	struct net *net = (struct net *)seq->private;
2215877a9bffSEric W. Biederman 	struct fib_table *tb;
2216877a9bffSEric W. Biederman 
2217d717a9a6SStephen Hemminger 	seq_printf(seq,
2218a07f5f50SStephen Hemminger 		   "Basic info: size of leaf:"
2219a07f5f50SStephen Hemminger 		   " %Zd bytes, size of tnode: %Zd bytes.\n",
222019baf839SRobert Olsson 		   sizeof(struct leaf), sizeof(struct tnode));
222119baf839SRobert Olsson 
2222d717a9a6SStephen Hemminger 	tb = fib_get_table(net, RT_TABLE_LOCAL);
2223d717a9a6SStephen Hemminger 	if (tb)
2224d717a9a6SStephen Hemminger 		fib_trie_show(seq, "Local", (struct trie *) tb->tb_data);
2225cb7b593cSStephen Hemminger 
2226d717a9a6SStephen Hemminger 	tb = fib_get_table(net, RT_TABLE_MAIN);
2227d717a9a6SStephen Hemminger 	if (tb)
2228d717a9a6SStephen Hemminger 		fib_trie_show(seq, "Main", (struct trie *) tb->tb_data);
2229cb7b593cSStephen Hemminger 
223019baf839SRobert Olsson 	return 0;
223119baf839SRobert Olsson }
223219baf839SRobert Olsson 
223319baf839SRobert Olsson static int fib_triestat_seq_open(struct inode *inode, struct file *file)
223419baf839SRobert Olsson {
22351c340b2fSDenis V. Lunev 	int err;
22361c340b2fSDenis V. Lunev 	struct net *net;
22371c340b2fSDenis V. Lunev 
22381c340b2fSDenis V. Lunev 	net = get_proc_net(inode);
22391c340b2fSDenis V. Lunev 	if (net == NULL)
22401c340b2fSDenis V. Lunev 		return -ENXIO;
22411c340b2fSDenis V. Lunev 	err = single_open(file, fib_triestat_seq_show, net);
22421c340b2fSDenis V. Lunev 	if (err < 0) {
22431c340b2fSDenis V. Lunev 		put_net(net);
22441c340b2fSDenis V. Lunev 		return err;
22451c340b2fSDenis V. Lunev 	}
22461c340b2fSDenis V. Lunev 	return 0;
22471c340b2fSDenis V. Lunev }
22481c340b2fSDenis V. Lunev 
22491c340b2fSDenis V. Lunev static int fib_triestat_seq_release(struct inode *ino, struct file *f)
22501c340b2fSDenis V. Lunev {
22511c340b2fSDenis V. Lunev 	struct seq_file *seq = f->private_data;
22521c340b2fSDenis V. Lunev 	put_net(seq->private);
22531c340b2fSDenis V. Lunev 	return single_release(ino, f);
225419baf839SRobert Olsson }
225519baf839SRobert Olsson 
22569a32144eSArjan van de Ven static const struct file_operations fib_triestat_fops = {
225719baf839SRobert Olsson 	.owner	= THIS_MODULE,
225819baf839SRobert Olsson 	.open	= fib_triestat_seq_open,
225919baf839SRobert Olsson 	.read	= seq_read,
226019baf839SRobert Olsson 	.llseek	= seq_lseek,
22611c340b2fSDenis V. Lunev 	.release = fib_triestat_seq_release,
226219baf839SRobert Olsson };
226319baf839SRobert Olsson 
2264cb7b593cSStephen Hemminger static struct node *fib_trie_get_idx(struct fib_trie_iter *iter,
2265cb7b593cSStephen Hemminger 				      loff_t pos)
226619baf839SRobert Olsson {
2267cb7b593cSStephen Hemminger 	loff_t idx = 0;
2268cb7b593cSStephen Hemminger 	struct node *n;
2269cb7b593cSStephen Hemminger 
2270877a9bffSEric W. Biederman 	for (n = fib_trie_get_first(iter, iter->trie_local);
2271cb7b593cSStephen Hemminger 	     n; ++idx, n = fib_trie_get_next(iter)) {
2272cb7b593cSStephen Hemminger 		if (pos == idx)
2273cb7b593cSStephen Hemminger 			return n;
227419baf839SRobert Olsson 	}
227519baf839SRobert Olsson 
2276877a9bffSEric W. Biederman 	for (n = fib_trie_get_first(iter, iter->trie_main);
2277cb7b593cSStephen Hemminger 	     n; ++idx, n = fib_trie_get_next(iter)) {
2278cb7b593cSStephen Hemminger 		if (pos == idx)
2279cb7b593cSStephen Hemminger 			return n;
228019baf839SRobert Olsson 	}
228119baf839SRobert Olsson 	return NULL;
228219baf839SRobert Olsson }
228319baf839SRobert Olsson 
228419baf839SRobert Olsson static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2285c95aaf9aSStephen Hemminger 	__acquires(RCU)
228619baf839SRobert Olsson {
2287877a9bffSEric W. Biederman 	struct fib_trie_iter *iter = seq->private;
2288877a9bffSEric W. Biederman 	struct fib_table *tb;
2289877a9bffSEric W. Biederman 
2290877a9bffSEric W. Biederman 	if (!iter->trie_local) {
22911c340b2fSDenis V. Lunev 		tb = fib_get_table(iter->p.net, RT_TABLE_LOCAL);
2292877a9bffSEric W. Biederman 		if (tb)
2293877a9bffSEric W. Biederman 			iter->trie_local = (struct trie *) tb->tb_data;
2294877a9bffSEric W. Biederman 	}
2295877a9bffSEric W. Biederman 	if (!iter->trie_main) {
22961c340b2fSDenis V. Lunev 		tb = fib_get_table(iter->p.net, RT_TABLE_MAIN);
2297877a9bffSEric W. Biederman 		if (tb)
2298877a9bffSEric W. Biederman 			iter->trie_main = (struct trie *) tb->tb_data;
2299877a9bffSEric W. Biederman 	}
2300cb7b593cSStephen Hemminger 	rcu_read_lock();
2301cb7b593cSStephen Hemminger 	if (*pos == 0)
230291b9a277SOlof Johansson 		return SEQ_START_TOKEN;
2303877a9bffSEric W. Biederman 	return fib_trie_get_idx(iter, *pos - 1);
230419baf839SRobert Olsson }
230519baf839SRobert Olsson 
230619baf839SRobert Olsson static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
230719baf839SRobert Olsson {
2308cb7b593cSStephen Hemminger 	struct fib_trie_iter *iter = seq->private;
2309cb7b593cSStephen Hemminger 	void *l = v;
2310cb7b593cSStephen Hemminger 
231119baf839SRobert Olsson 	++*pos;
231291b9a277SOlof Johansson 	if (v == SEQ_START_TOKEN)
2313cb7b593cSStephen Hemminger 		return fib_trie_get_idx(iter, 0);
231491b9a277SOlof Johansson 
2315cb7b593cSStephen Hemminger 	v = fib_trie_get_next(iter);
2316cb7b593cSStephen Hemminger 	BUG_ON(v == l);
2317cb7b593cSStephen Hemminger 	if (v)
2318cb7b593cSStephen Hemminger 		return v;
2319cb7b593cSStephen Hemminger 
2320cb7b593cSStephen Hemminger 	/* continue scan in next trie */
2321877a9bffSEric W. Biederman 	if (iter->trie == iter->trie_local)
2322877a9bffSEric W. Biederman 		return fib_trie_get_first(iter, iter->trie_main);
2323cb7b593cSStephen Hemminger 
2324cb7b593cSStephen Hemminger 	return NULL;
232519baf839SRobert Olsson }
232619baf839SRobert Olsson 
232719baf839SRobert Olsson static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2328c95aaf9aSStephen Hemminger 	__releases(RCU)
232919baf839SRobert Olsson {
2330cb7b593cSStephen Hemminger 	rcu_read_unlock();
233119baf839SRobert Olsson }
233219baf839SRobert Olsson 
2333cb7b593cSStephen Hemminger static void seq_indent(struct seq_file *seq, int n)
2334cb7b593cSStephen Hemminger {
2335cb7b593cSStephen Hemminger 	while (n-- > 0) seq_puts(seq, "   ");
2336cb7b593cSStephen Hemminger }
233719baf839SRobert Olsson 
233828d36e37SEric Dumazet static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
2339cb7b593cSStephen Hemminger {
2340cb7b593cSStephen Hemminger 	switch (s) {
2341cb7b593cSStephen Hemminger 	case RT_SCOPE_UNIVERSE: return "universe";
2342cb7b593cSStephen Hemminger 	case RT_SCOPE_SITE:	return "site";
2343cb7b593cSStephen Hemminger 	case RT_SCOPE_LINK:	return "link";
2344cb7b593cSStephen Hemminger 	case RT_SCOPE_HOST:	return "host";
2345cb7b593cSStephen Hemminger 	case RT_SCOPE_NOWHERE:	return "nowhere";
2346cb7b593cSStephen Hemminger 	default:
234728d36e37SEric Dumazet 		snprintf(buf, len, "scope=%d", s);
2348cb7b593cSStephen Hemminger 		return buf;
2349cb7b593cSStephen Hemminger 	}
2350cb7b593cSStephen Hemminger }
2351cb7b593cSStephen Hemminger 
2352cb7b593cSStephen Hemminger static const char *rtn_type_names[__RTN_MAX] = {
2353cb7b593cSStephen Hemminger 	[RTN_UNSPEC] = "UNSPEC",
2354cb7b593cSStephen Hemminger 	[RTN_UNICAST] = "UNICAST",
2355cb7b593cSStephen Hemminger 	[RTN_LOCAL] = "LOCAL",
2356cb7b593cSStephen Hemminger 	[RTN_BROADCAST] = "BROADCAST",
2357cb7b593cSStephen Hemminger 	[RTN_ANYCAST] = "ANYCAST",
2358cb7b593cSStephen Hemminger 	[RTN_MULTICAST] = "MULTICAST",
2359cb7b593cSStephen Hemminger 	[RTN_BLACKHOLE] = "BLACKHOLE",
2360cb7b593cSStephen Hemminger 	[RTN_UNREACHABLE] = "UNREACHABLE",
2361cb7b593cSStephen Hemminger 	[RTN_PROHIBIT] = "PROHIBIT",
2362cb7b593cSStephen Hemminger 	[RTN_THROW] = "THROW",
2363cb7b593cSStephen Hemminger 	[RTN_NAT] = "NAT",
2364cb7b593cSStephen Hemminger 	[RTN_XRESOLVE] = "XRESOLVE",
2365cb7b593cSStephen Hemminger };
2366cb7b593cSStephen Hemminger 
236728d36e37SEric Dumazet static inline const char *rtn_type(char *buf, size_t len, unsigned t)
2368cb7b593cSStephen Hemminger {
2369cb7b593cSStephen Hemminger 	if (t < __RTN_MAX && rtn_type_names[t])
2370cb7b593cSStephen Hemminger 		return rtn_type_names[t];
237128d36e37SEric Dumazet 	snprintf(buf, len, "type %u", t);
2372cb7b593cSStephen Hemminger 	return buf;
2373cb7b593cSStephen Hemminger }
2374cb7b593cSStephen Hemminger 
2375cb7b593cSStephen Hemminger /* Pretty print the trie */
237619baf839SRobert Olsson static int fib_trie_seq_show(struct seq_file *seq, void *v)
237719baf839SRobert Olsson {
2378cb7b593cSStephen Hemminger 	const struct fib_trie_iter *iter = seq->private;
2379cb7b593cSStephen Hemminger 	struct node *n = v;
238019baf839SRobert Olsson 
2381cb7b593cSStephen Hemminger 	if (v == SEQ_START_TOKEN)
2382cb7b593cSStephen Hemminger 		return 0;
238319baf839SRobert Olsson 
2384b59cfbf7SEric Dumazet 	if (!node_parent_rcu(n)) {
2385877a9bffSEric W. Biederman 		if (iter->trie == iter->trie_local)
2386cb7b593cSStephen Hemminger 			seq_puts(seq, "<local>:\n");
2387cb7b593cSStephen Hemminger 		else
2388cb7b593cSStephen Hemminger 			seq_puts(seq, "<main>:\n");
2389cb7b593cSStephen Hemminger 	}
2390095b8501SRobert Olsson 
2391095b8501SRobert Olsson 	if (IS_TNODE(n)) {
2392095b8501SRobert Olsson 		struct tnode *tn = (struct tnode *) n;
2393ab66b4a7SStephen Hemminger 		__be32 prf = htonl(mask_pfx(tn->key, tn->pos));
2394095b8501SRobert Olsson 
23951d25cd6cSRobert Olsson 		seq_indent(seq, iter->depth-1);
23961d25cd6cSRobert Olsson 		seq_printf(seq, "  +-- %d.%d.%d.%d/%d %d %d %d\n",
23971d25cd6cSRobert Olsson 			   NIPQUAD(prf), tn->pos, tn->bits, tn->full_children,
23981d25cd6cSRobert Olsson 			   tn->empty_children);
23991d25cd6cSRobert Olsson 
2400cb7b593cSStephen Hemminger 	} else {
2401cb7b593cSStephen Hemminger 		struct leaf *l = (struct leaf *) n;
24021328042eSStephen Hemminger 		struct leaf_info *li;
24031328042eSStephen Hemminger 		struct hlist_node *node;
24041328042eSStephen Hemminger 
240532ab5f80SAl Viro 		__be32 val = htonl(l->key);
2406cb7b593cSStephen Hemminger 
2407cb7b593cSStephen Hemminger 		seq_indent(seq, iter->depth);
2408cb7b593cSStephen Hemminger 		seq_printf(seq, "  |-- %d.%d.%d.%d\n", NIPQUAD(val));
240928d36e37SEric Dumazet 
24101328042eSStephen Hemminger 		hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
2411cb7b593cSStephen Hemminger 			struct fib_alias *fa;
241228d36e37SEric Dumazet 
2413cb7b593cSStephen Hemminger 			list_for_each_entry_rcu(fa, &li->falh, fa_list) {
241428d36e37SEric Dumazet 				char buf1[32], buf2[32];
241528d36e37SEric Dumazet 
2416cb7b593cSStephen Hemminger 				seq_indent(seq, iter->depth+1);
24171328042eSStephen Hemminger 				seq_printf(seq, "  /%d %s %s", li->plen,
241828d36e37SEric Dumazet 					   rtn_scope(buf1, sizeof(buf1),
241928d36e37SEric Dumazet 						     fa->fa_scope),
242028d36e37SEric Dumazet 					   rtn_type(buf2, sizeof(buf2),
242128d36e37SEric Dumazet 						    fa->fa_type));
2422cb7b593cSStephen Hemminger 				if (fa->fa_tos)
2423cb7b593cSStephen Hemminger 					seq_printf(seq, "tos =%d\n",
2424cb7b593cSStephen Hemminger 						   fa->fa_tos);
2425cb7b593cSStephen Hemminger 				seq_putc(seq, '\n');
2426cb7b593cSStephen Hemminger 			}
2427cb7b593cSStephen Hemminger 		}
2428cb7b593cSStephen Hemminger 	}
242919baf839SRobert Olsson 
243019baf839SRobert Olsson 	return 0;
243119baf839SRobert Olsson }
243219baf839SRobert Olsson 
2433f690808eSStephen Hemminger static const struct seq_operations fib_trie_seq_ops = {
243419baf839SRobert Olsson 	.start  = fib_trie_seq_start,
243519baf839SRobert Olsson 	.next   = fib_trie_seq_next,
243619baf839SRobert Olsson 	.stop   = fib_trie_seq_stop,
243719baf839SRobert Olsson 	.show   = fib_trie_seq_show,
243819baf839SRobert Olsson };
243919baf839SRobert Olsson 
244019baf839SRobert Olsson static int fib_trie_seq_open(struct inode *inode, struct file *file)
244119baf839SRobert Olsson {
24421c340b2fSDenis V. Lunev 	return seq_open_net(inode, file, &fib_trie_seq_ops,
2443cf7732e4SPavel Emelyanov 			    sizeof(struct fib_trie_iter));
244419baf839SRobert Olsson }
244519baf839SRobert Olsson 
24469a32144eSArjan van de Ven static const struct file_operations fib_trie_fops = {
244719baf839SRobert Olsson 	.owner  = THIS_MODULE,
244819baf839SRobert Olsson 	.open   = fib_trie_seq_open,
244919baf839SRobert Olsson 	.read   = seq_read,
245019baf839SRobert Olsson 	.llseek = seq_lseek,
24511c340b2fSDenis V. Lunev 	.release = seq_release_net,
245219baf839SRobert Olsson };
245319baf839SRobert Olsson 
245432ab5f80SAl Viro static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2455cb7b593cSStephen Hemminger {
2456cb7b593cSStephen Hemminger 	static unsigned type2flags[RTN_MAX + 1] = {
2457cb7b593cSStephen Hemminger 		[7] = RTF_REJECT, [8] = RTF_REJECT,
2458cb7b593cSStephen Hemminger 	};
2459cb7b593cSStephen Hemminger 	unsigned flags = type2flags[type];
2460cb7b593cSStephen Hemminger 
2461cb7b593cSStephen Hemminger 	if (fi && fi->fib_nh->nh_gw)
2462cb7b593cSStephen Hemminger 		flags |= RTF_GATEWAY;
246332ab5f80SAl Viro 	if (mask == htonl(0xFFFFFFFF))
2464cb7b593cSStephen Hemminger 		flags |= RTF_HOST;
2465cb7b593cSStephen Hemminger 	flags |= RTF_UP;
2466cb7b593cSStephen Hemminger 	return flags;
2467cb7b593cSStephen Hemminger }
2468cb7b593cSStephen Hemminger 
2469cb7b593cSStephen Hemminger /*
2470cb7b593cSStephen Hemminger  *	This outputs /proc/net/route.
2471cb7b593cSStephen Hemminger  *	The format of the file is not supposed to be changed
2472cb7b593cSStephen Hemminger  * 	and needs to be same as fib_hash output to avoid breaking
2473cb7b593cSStephen Hemminger  *	legacy utilities
2474cb7b593cSStephen Hemminger  */
2475cb7b593cSStephen Hemminger static int fib_route_seq_show(struct seq_file *seq, void *v)
2476cb7b593cSStephen Hemminger {
2477c9e53cbeSPatrick McHardy 	const struct fib_trie_iter *iter = seq->private;
2478cb7b593cSStephen Hemminger 	struct leaf *l = v;
24791328042eSStephen Hemminger 	struct leaf_info *li;
24801328042eSStephen Hemminger 	struct hlist_node *node;
2481cb7b593cSStephen Hemminger 
2482cb7b593cSStephen Hemminger 	if (v == SEQ_START_TOKEN) {
2483cb7b593cSStephen Hemminger 		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2484cb7b593cSStephen Hemminger 			   "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2485cb7b593cSStephen Hemminger 			   "\tWindow\tIRTT");
2486cb7b593cSStephen Hemminger 		return 0;
2487cb7b593cSStephen Hemminger 	}
2488cb7b593cSStephen Hemminger 
2489877a9bffSEric W. Biederman 	if (iter->trie == iter->trie_local)
2490c9e53cbeSPatrick McHardy 		return 0;
2491a07f5f50SStephen Hemminger 
2492cb7b593cSStephen Hemminger 	if (IS_TNODE(l))
2493cb7b593cSStephen Hemminger 		return 0;
2494cb7b593cSStephen Hemminger 
24951328042eSStephen Hemminger 	hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
2496cb7b593cSStephen Hemminger 		struct fib_alias *fa;
249732ab5f80SAl Viro 		__be32 mask, prefix;
2498cb7b593cSStephen Hemminger 
2499cb7b593cSStephen Hemminger 		if (!li)
2500cb7b593cSStephen Hemminger 			continue;
2501cb7b593cSStephen Hemminger 
2502cb7b593cSStephen Hemminger 		mask = inet_make_mask(li->plen);
2503cb7b593cSStephen Hemminger 		prefix = htonl(l->key);
2504cb7b593cSStephen Hemminger 
2505cb7b593cSStephen Hemminger 		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
25061371e37dSHerbert Xu 			const struct fib_info *fi = fa->fa_info;
2507cb7b593cSStephen Hemminger 			unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
25081328042eSStephen Hemminger 			char bf[128];
2509cb7b593cSStephen Hemminger 
2510cb7b593cSStephen Hemminger 			if (fa->fa_type == RTN_BROADCAST
2511cb7b593cSStephen Hemminger 			    || fa->fa_type == RTN_MULTICAST)
2512cb7b593cSStephen Hemminger 				continue;
2513cb7b593cSStephen Hemminger 
2514cb7b593cSStephen Hemminger 			if (fi)
2515cb7b593cSStephen Hemminger 				snprintf(bf, sizeof(bf),
2516cb7b593cSStephen Hemminger 					 "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
2517cb7b593cSStephen Hemminger 					 fi->fib_dev ? fi->fib_dev->name : "*",
2518cb7b593cSStephen Hemminger 					 prefix,
2519cb7b593cSStephen Hemminger 					 fi->fib_nh->nh_gw, flags, 0, 0,
2520cb7b593cSStephen Hemminger 					 fi->fib_priority,
2521cb7b593cSStephen Hemminger 					 mask,
2522a07f5f50SStephen Hemminger 					 (fi->fib_advmss ?
2523a07f5f50SStephen Hemminger 					  fi->fib_advmss + 40 : 0),
2524cb7b593cSStephen Hemminger 					 fi->fib_window,
2525cb7b593cSStephen Hemminger 					 fi->fib_rtt >> 3);
2526cb7b593cSStephen Hemminger 			else
2527cb7b593cSStephen Hemminger 				snprintf(bf, sizeof(bf),
2528cb7b593cSStephen Hemminger 					 "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
2529cb7b593cSStephen Hemminger 					 prefix, 0, flags, 0, 0, 0,
2530cb7b593cSStephen Hemminger 					 mask, 0, 0, 0);
2531cb7b593cSStephen Hemminger 
2532cb7b593cSStephen Hemminger 			seq_printf(seq, "%-127s\n", bf);
2533cb7b593cSStephen Hemminger 		}
2534cb7b593cSStephen Hemminger 	}
2535cb7b593cSStephen Hemminger 
2536cb7b593cSStephen Hemminger 	return 0;
2537cb7b593cSStephen Hemminger }
2538cb7b593cSStephen Hemminger 
2539f690808eSStephen Hemminger static const struct seq_operations fib_route_seq_ops = {
2540cb7b593cSStephen Hemminger 	.start  = fib_trie_seq_start,
2541cb7b593cSStephen Hemminger 	.next   = fib_trie_seq_next,
2542cb7b593cSStephen Hemminger 	.stop   = fib_trie_seq_stop,
2543cb7b593cSStephen Hemminger 	.show   = fib_route_seq_show,
2544cb7b593cSStephen Hemminger };
2545cb7b593cSStephen Hemminger 
2546cb7b593cSStephen Hemminger static int fib_route_seq_open(struct inode *inode, struct file *file)
2547cb7b593cSStephen Hemminger {
25481c340b2fSDenis V. Lunev 	return seq_open_net(inode, file, &fib_route_seq_ops,
2549cf7732e4SPavel Emelyanov 			    sizeof(struct fib_trie_iter));
2550cb7b593cSStephen Hemminger }
2551cb7b593cSStephen Hemminger 
25529a32144eSArjan van de Ven static const struct file_operations fib_route_fops = {
2553cb7b593cSStephen Hemminger 	.owner  = THIS_MODULE,
2554cb7b593cSStephen Hemminger 	.open   = fib_route_seq_open,
2555cb7b593cSStephen Hemminger 	.read   = seq_read,
2556cb7b593cSStephen Hemminger 	.llseek = seq_lseek,
25571c340b2fSDenis V. Lunev 	.release = seq_release_net,
2558cb7b593cSStephen Hemminger };
2559cb7b593cSStephen Hemminger 
256061a02653SDenis V. Lunev int __net_init fib_proc_init(struct net *net)
256119baf839SRobert Olsson {
256261a02653SDenis V. Lunev 	if (!proc_net_fops_create(net, "fib_trie", S_IRUGO, &fib_trie_fops))
2563cb7b593cSStephen Hemminger 		goto out1;
2564cb7b593cSStephen Hemminger 
256561a02653SDenis V. Lunev 	if (!proc_net_fops_create(net, "fib_triestat", S_IRUGO,
256661a02653SDenis V. Lunev 				  &fib_triestat_fops))
2567cb7b593cSStephen Hemminger 		goto out2;
2568cb7b593cSStephen Hemminger 
256961a02653SDenis V. Lunev 	if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_route_fops))
2570cb7b593cSStephen Hemminger 		goto out3;
2571cb7b593cSStephen Hemminger 
257219baf839SRobert Olsson 	return 0;
2573cb7b593cSStephen Hemminger 
2574cb7b593cSStephen Hemminger out3:
257561a02653SDenis V. Lunev 	proc_net_remove(net, "fib_triestat");
2576cb7b593cSStephen Hemminger out2:
257761a02653SDenis V. Lunev 	proc_net_remove(net, "fib_trie");
2578cb7b593cSStephen Hemminger out1:
2579cb7b593cSStephen Hemminger 	return -ENOMEM;
258019baf839SRobert Olsson }
258119baf839SRobert Olsson 
258261a02653SDenis V. Lunev void __net_exit fib_proc_exit(struct net *net)
258319baf839SRobert Olsson {
258461a02653SDenis V. Lunev 	proc_net_remove(net, "fib_trie");
258561a02653SDenis V. Lunev 	proc_net_remove(net, "fib_triestat");
258661a02653SDenis V. Lunev 	proc_net_remove(net, "route");
258719baf839SRobert Olsson }
258819baf839SRobert Olsson 
258919baf839SRobert Olsson #endif /* CONFIG_PROC_FS */
2590