xref: /openbmc/linux/net/ipv6/ip6_fib.c (revision 96de2506)
1 /*
2  *	Linux INET6 implementation
3  *	Forwarding Information Database
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  *
13  *	Changes:
14  *	Yuji SEKIYA @USAGI:	Support default route on router node;
15  *				remove ip6_null_entry from the top of
16  *				routing table.
17  *	Ville Nuorvala:		Fixed routing subtrees.
18  */
19 
20 #define pr_fmt(fmt) "IPv6: " fmt
21 
22 #include <linux/errno.h>
23 #include <linux/types.h>
24 #include <linux/net.h>
25 #include <linux/route.h>
26 #include <linux/netdevice.h>
27 #include <linux/in6.h>
28 #include <linux/init.h>
29 #include <linux/list.h>
30 #include <linux/slab.h>
31 
32 #include <net/ip.h>
33 #include <net/ipv6.h>
34 #include <net/ndisc.h>
35 #include <net/addrconf.h>
36 #include <net/lwtunnel.h>
37 #include <net/fib_notifier.h>
38 
39 #include <net/ip6_fib.h>
40 #include <net/ip6_route.h>
41 
/* Slab cache for struct fib6_node allocations (see node_alloc()). */
42 static struct kmem_cache *fib6_node_kmem __read_mostly;
43 
/* State for a tree-cleaning walk: embeds a fib6_walker and carries the
 * per-entry callback, its opaque argument and the new serial number to
 * stamp on visited nodes.
 */
44 struct fib6_cleaner {
45 	struct fib6_walker w;
46 	struct net *net;
47 	int (*func)(struct fib6_info *, void *arg);
48 	int sernum;
49 	void *arg;
50 };
51 
/* Initial walker state: with source-routed subtrees a walk must start
 * in the subtree state, otherwise it starts by descending left.
 */
52 #ifdef CONFIG_IPV6_SUBTREES
53 #define FWS_INIT FWS_S
54 #else
55 #define FWS_INIT FWS_L
56 #endif
57 
/* Forward declarations — definitions appear later in this file. */
58 static struct fib6_info *fib6_find_prefix(struct net *net,
59 					 struct fib6_table *table,
60 					 struct fib6_node *fn);
61 static struct fib6_node *fib6_repair_tree(struct net *net,
62 					  struct fib6_table *table,
63 					  struct fib6_node *fn);
64 static int fib6_walk(struct net *net, struct fib6_walker *w);
65 static int fib6_walk_continue(struct fib6_walker *w);
66 
67 /*
68  *	A routing update causes an increase of the serial number on the
69  *	affected subtree. This allows for cached routes to be asynchronously
70  *	tested when modifications are made to the destination cache as a
71  *	result of redirects, path MTU changes, etc.
72  */
73 
74 static void fib6_gc_timer_cb(struct timer_list *t);
75 
/* Iterate over all registered walkers of @net; caller must hold
 * fib6_walker_lock.
 */
76 #define FOR_WALKERS(net, w) \
77 	list_for_each_entry(w, &(net)->ipv6.fib6_walkers, lh)
78 
/* Register @w on the per-netns walker list so concurrent tree
 * modifications (node removal/repair) can fix up its position.
 */
79 static void fib6_walker_link(struct net *net, struct fib6_walker *w)
80 {
81 	write_lock_bh(&net->ipv6.fib6_walker_lock);
82 	list_add(&w->lh, &net->ipv6.fib6_walkers);
83 	write_unlock_bh(&net->ipv6.fib6_walker_lock);
84 }
85 
/* Remove @w from the per-netns walker list; counterpart of
 * fib6_walker_link().
 */
86 static void fib6_walker_unlink(struct net *net, struct fib6_walker *w)
87 {
88 	write_lock_bh(&net->ipv6.fib6_walker_lock);
89 	list_del(&w->lh);
90 	write_unlock_bh(&net->ipv6.fib6_walker_lock);
91 }
92 
93 static int fib6_new_sernum(struct net *net)
94 {
95 	int new, old;
96 
97 	do {
98 		old = atomic_read(&net->ipv6.fib6_sernum);
99 		new = old < INT_MAX ? old + 1 : 1;
100 	} while (atomic_cmpxchg(&net->ipv6.fib6_sernum,
101 				old, new) != old);
102 	return new;
103 }
104 
105 enum {
106 	FIB6_NO_SERNUM_CHANGE = 0, /* sernum 0 means "unchanged"; real sernums start at 1 */
107 };
108 
/* Bump the serial number of the node currently holding @f6i so cached
 * dst entries derived from it get revalidated.  Caller must hold the
 * table's tb6_lock (enforced by the lockdep annotation below).
 */
109 void fib6_update_sernum(struct net *net, struct fib6_info *f6i)
110 {
111 	struct fib6_node *fn;
112 
113 	fn = rcu_dereference_protected(f6i->fib6_node,
114 			lockdep_is_held(&f6i->fib6_table->tb6_lock));
115 	if (fn)
116 		fn->fn_sernum = fib6_new_sernum(net);
117 }
118 
119 /*
120  *	Auxiliary address test functions for the radix tree.
121  *
122  *	These assume a 32bit processor (although it will work on
123  *	64bit processors)
124  */
125 
126 /*
127  *	test bit
128  */
129 #if defined(__LITTLE_ENDIAN)
130 # define BITOP_BE32_SWIZZLE	(0x1F & ~7)
131 #else
132 # define BITOP_BE32_SWIZZLE	0
133 #endif
134 
/* Return non-zero iff bit @fn_bit of the address at @token is set,
 * counting bits MSB-first across the big-endian 128-bit address.
 * The result is a __be32 mask (the matched bit), not 0/1 — callers
 * only test it for truth.
 */
135 static __be32 addr_bit_set(const void *token, int fn_bit)
136 {
137 	const __be32 *addr = token;
138 	/*
139 	 * Here,
140 	 *	1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f)
141 	 * is optimized version of
142 	 *	htonl(1 << ((~fn_bit)&0x1F))
143 	 * See include/asm-generic/bitops/le.h.
144 	 */
145 	return (__force __be32)(1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f)) &
146 	       addr[fn_bit >> 5];
147 }
148 
149 struct fib6_info *fib6_info_alloc(gfp_t gfp_flags)
150 {
151 	struct fib6_info *f6i;
152 
153 	f6i = kzalloc(sizeof(*f6i), gfp_flags);
154 	if (!f6i)
155 		return NULL;
156 
157 	f6i->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags);
158 	if (!f6i->rt6i_pcpu) {
159 		kfree(f6i);
160 		return NULL;
161 	}
162 
163 	INIT_LIST_HEAD(&f6i->fib6_siblings);
164 	atomic_inc(&f6i->fib6_ref);
165 
166 	return f6i;
167 }
168 
/* Final teardown of a fib6_info, run from RCU callback context once no
 * readers can reach it: frees the exception bucket, drops the per-cpu
 * cached routes, and releases lwtunnel state, device and metrics refs.
 */
169 void fib6_info_destroy_rcu(struct rcu_head *head)
170 {
171 	struct fib6_info *f6i = container_of(head, struct fib6_info, rcu);
172 	struct rt6_exception_bucket *bucket;
173 
	/* entry must already be unlinked from its tree node */
174 	WARN_ON(f6i->fib6_node);
175 
	/* "1" here: we are the sole owner now, no lock needed */
176 	bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket, 1);
177 	if (bucket) {
178 		f6i->rt6i_exception_bucket = NULL;
179 		kfree(bucket);
180 	}
181 
182 	if (f6i->rt6i_pcpu) {
183 		int cpu;
184 
185 		for_each_possible_cpu(cpu) {
186 			struct rt6_info **ppcpu_rt;
187 			struct rt6_info *pcpu_rt;
188 
189 			ppcpu_rt = per_cpu_ptr(f6i->rt6i_pcpu, cpu);
190 			pcpu_rt = *ppcpu_rt;
191 			if (pcpu_rt) {
192 				dst_dev_put(&pcpu_rt->dst);
193 				dst_release(&pcpu_rt->dst);
194 				*ppcpu_rt = NULL;
195 			}
196 		}
197 	}
198 
199 	lwtstate_put(f6i->fib6_nh.nh_lwtstate);
200 
201 	if (f6i->fib6_nh.nh_dev)
202 		dev_put(f6i->fib6_nh.nh_dev);
203 
204 	ip_fib_metrics_put(f6i->fib6_metrics);
205 
206 	kfree(f6i);
207 }
208 EXPORT_SYMBOL_GPL(fib6_info_destroy_rcu);
209 
210 static struct fib6_node *node_alloc(struct net *net)
211 {
212 	struct fib6_node *fn;
213 
214 	fn = kmem_cache_zalloc(fib6_node_kmem, GFP_ATOMIC);
215 	if (fn)
216 		net->ipv6.rt6_stats->fib_nodes++;
217 
218 	return fn;
219 }
220 
/* Free a node that was never published to readers (e.g. on an aborted
 * insert) — no RCU grace period required.
 */
221 static void node_free_immediate(struct net *net, struct fib6_node *fn)
222 {
223 	kmem_cache_free(fib6_node_kmem, fn);
224 	net->ipv6.rt6_stats->fib_nodes--;
225 }
226 
/* RCU callback: actually return the node's memory to the slab cache. */
227 static void node_free_rcu(struct rcu_head *head)
228 {
229 	struct fib6_node *fn = container_of(head, struct fib6_node, rcu);
230 
231 	kmem_cache_free(fib6_node_kmem, fn);
232 }
233 
/* Free a node that may still be visible to RCU readers: defer the slab
 * free past a grace period, but update the stats immediately.
 */
234 static void node_free(struct net *net, struct fib6_node *fn)
235 {
236 	call_rcu(&fn->rcu, node_free_rcu);
237 	net->ipv6.rt6_stats->fib_nodes--;
238 }
239 
/* Release a whole routing table: drop its inetpeer cache, then free. */
240 static void fib6_free_table(struct fib6_table *table)
241 {
242 	inetpeer_invalidate_tree(&table->tb6_peers);
243 	kfree(table);
244 }
245 
/* Publish table @tb in the per-netns hash of FIB tables, keyed by
 * table id.  Tables are never unlinked once published.
 */
246 static void fib6_link_table(struct net *net, struct fib6_table *tb)
247 {
248 	unsigned int h;
249 
250 	/*
251 	 * Initialize table lock at a single place to give lockdep a key,
252 	 * tables aren't visible prior to being linked to the list.
253 	 */
254 	spin_lock_init(&tb->tb6_lock);
255 	h = tb->tb6_id & (FIB6_TABLE_HASHSZ - 1);
256 
257 	/*
258 	 * No protection necessary, this is the only list mutatation
259 	 * operation, tables never disappear once they exist.
260 	 */
261 	hlist_add_head_rcu(&tb->tb6_hlist, &net->ipv6.fib_table_hash[h]);
262 }
263 
264 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
265 
/* Allocate and initialize a new routing table with the given id.  The
 * root node starts out pointing at the null entry and is marked as a
 * top-level root.  Returns NULL on allocation failure; the table is not
 * yet linked (see fib6_link_table()).
 */
266 static struct fib6_table *fib6_alloc_table(struct net *net, u32 id)
267 {
268 	struct fib6_table *table;
269 
270 	table = kzalloc(sizeof(*table), GFP_ATOMIC);
271 	if (table) {
272 		table->tb6_id = id;
273 		rcu_assign_pointer(table->tb6_root.leaf,
274 				   net->ipv6.fib6_null_entry);
275 		table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
276 		inet_peer_base_init(&table->tb6_peers);
277 	}
278 
279 	return table;
280 }
281 
282 struct fib6_table *fib6_new_table(struct net *net, u32 id)
283 {
284 	struct fib6_table *tb;
285 
286 	if (id == 0)
287 		id = RT6_TABLE_MAIN;
288 	tb = fib6_get_table(net, id);
289 	if (tb)
290 		return tb;
291 
292 	tb = fib6_alloc_table(net, id);
293 	if (tb)
294 		fib6_link_table(net, tb);
295 
296 	return tb;
297 }
298 EXPORT_SYMBOL_GPL(fib6_new_table);
299 
/* Look up an existing routing table by id under RCU; id 0 aliases the
 * main table.  Returns the table or NULL if it does not exist.  No
 * reference is taken — tables never disappear once linked.
 */
300 struct fib6_table *fib6_get_table(struct net *net, u32 id)
301 {
302 	struct fib6_table *tb;
303 	struct hlist_head *head;
304 	unsigned int h;
305 
306 	if (id == 0)
307 		id = RT6_TABLE_MAIN;
308 	h = id & (FIB6_TABLE_HASHSZ - 1);
309 	rcu_read_lock();
310 	head = &net->ipv6.fib_table_hash[h];
311 	hlist_for_each_entry_rcu(tb, head, tb6_hlist) {
312 		if (tb->tb6_id == id) {
313 			rcu_read_unlock();
314 			return tb;
315 		}
316 	}
317 	rcu_read_unlock();
318 
319 	return NULL;
320 }
321 EXPORT_SYMBOL_GPL(fib6_get_table);
322 
/* Netns init (multiple-tables build): publish the preallocated main
 * and local tables.
 */
323 static void __net_init fib6_tables_init(struct net *net)
324 {
325 	fib6_link_table(net, net->ipv6.fib6_main_tbl);
326 	fib6_link_table(net, net->ipv6.fib6_local_tbl);
327 }
328 #else
329 
330 struct fib6_table *fib6_new_table(struct net *net, u32 id)
331 {
332 	return fib6_get_table(net, id);
333 }
334 
335 struct fib6_table *fib6_get_table(struct net *net, u32 id)
336 {
337 	  return net->ipv6.fib6_main_tbl;
338 }
339 
/* Single-table policy lookup: consult the main table via @lookup and,
 * if it reports "try next table" (-EAGAIN), fall back to the null
 * entry.  Returns a dst with a reference held either way.
 */
340 struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
341 				   const struct sk_buff *skb,
342 				   int flags, pol_lookup_t lookup)
343 {
344 	struct rt6_info *rt;
345 
346 	rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, skb, flags);
347 	if (rt->dst.error == -EAGAIN) {
348 		ip6_rt_put(rt);
349 		rt = net->ipv6.ip6_null_entry;
350 		dst_hold(&rt->dst);
351 	}
352 
353 	return &rt->dst;
354 }
355 
356 /* called with rcu lock held; no reference taken on fib6_info */
357 struct fib6_info *fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
358 			      int flags)
359 {
	/* single-table build: always resolve against the main table */
360 	return fib6_table_lookup(net, net->ipv6.fib6_main_tbl, oif, fl6, flags);
361 }
362 
/* Netns init (single-table build): publish the main table only. */
363 static void __net_init fib6_tables_init(struct net *net)
364 {
365 	fib6_link_table(net, net->ipv6.fib6_main_tbl);
366 }
367 
368 #endif
369 
370 unsigned int fib6_tables_seq_read(struct net *net)
371 {
372 	unsigned int h, fib_seq = 0;
373 
374 	rcu_read_lock();
375 	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
376 		struct hlist_head *head = &net->ipv6.fib_table_hash[h];
377 		struct fib6_table *tb;
378 
379 		hlist_for_each_entry_rcu(tb, head, tb6_hlist)
380 			fib_seq += tb->fib_seq;
381 	}
382 	rcu_read_unlock();
383 
384 	return fib_seq;
385 }
386 
/* Invoke a single notifier block for one route entry (used when a new
 * listener registers and replays the table contents).
 */
387 static int call_fib6_entry_notifier(struct notifier_block *nb, struct net *net,
388 				    enum fib_event_type event_type,
389 				    struct fib6_info *rt)
390 {
391 	struct fib6_entry_notifier_info info = {
392 		.rt = rt,
393 	};
394 
395 	return call_fib6_notifier(nb, net, event_type, &info.info);
396 }
397 
/* Notify the whole fib notifier chain about an add/replace/del event on
 * @rt, bumping the owning table's sequence counter so dumps can detect
 * the change.  Returns the (possibly negative) notifier chain result.
 */
398 static int call_fib6_entry_notifiers(struct net *net,
399 				     enum fib_event_type event_type,
400 				     struct fib6_info *rt,
401 				     struct netlink_ext_ack *extack)
402 {
403 	struct fib6_entry_notifier_info info = {
404 		.info.extack = extack,
405 		.rt = rt,
406 	};
407 
408 	rt->fib6_table->fib_seq++;
409 	return call_fib6_notifiers(net, event_type, &info.info);
410 }
411 
/* Context passed through the walker when replaying routes to a newly
 * registered notifier.
 */
412 struct fib6_dump_arg {
413 	struct net *net;
414 	struct notifier_block *nb;
415 };
416 
/* Replay one route as an ENTRY_ADD event; the null entry is internal
 * bookkeeping and is never reported.
 */
417 static void fib6_rt_dump(struct fib6_info *rt, struct fib6_dump_arg *arg)
418 {
419 	if (rt == arg->net->ipv6.fib6_null_entry)
420 		return;
421 	call_fib6_entry_notifier(arg->nb, arg->net, FIB_EVENT_ENTRY_ADD, rt);
422 }
423 
/* Walker callback: replay every route on the current leaf, then clear
 * w->leaf so the walk continues (0 == keep walking).
 */
424 static int fib6_node_dump(struct fib6_walker *w)
425 {
426 	struct fib6_info *rt;
427 
428 	for_each_fib6_walker_rt(w)
429 		fib6_rt_dump(rt, w->args);
430 	w->leaf = NULL;
431 	return 0;
432 }
433 
/* Walk one table from its root with tb6_lock held, firing @w's callback
 * on every leaf.
 */
434 static void fib6_table_dump(struct net *net, struct fib6_table *tb,
435 			    struct fib6_walker *w)
436 {
437 	w->root = &tb->tb6_root;
438 	spin_lock_bh(&tb->tb6_lock);
439 	fib6_walk(net, w);
440 	spin_unlock_bh(&tb->tb6_lock);
441 }
442 
443 /* Called with rcu_read_lock() */
444 int fib6_tables_dump(struct net *net, struct notifier_block *nb)
445 {
446 	struct fib6_dump_arg arg;
447 	struct fib6_walker *w;
448 	unsigned int h;
449 
450 	w = kzalloc(sizeof(*w), GFP_ATOMIC);
451 	if (!w)
452 		return -ENOMEM;
453 
454 	w->func = fib6_node_dump;
455 	arg.net = net;
456 	arg.nb = nb;
457 	w->args = &arg;
458 
459 	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
460 		struct hlist_head *head = &net->ipv6.fib_table_hash[h];
461 		struct fib6_table *tb;
462 
463 		hlist_for_each_entry_rcu(tb, head, tb6_hlist)
464 			fib6_table_dump(net, tb, w);
465 	}
466 
467 	kfree(w);
468 
469 	return 0;
470 }
471 
/* Walker callback for netlink route dumps.  Emits every route on the
 * current leaf via rt6_dump_route(); returns 1 (suspend the walk,
 * remembering the route in w->leaf) when the skb fills up, 0 to keep
 * walking.
 */
472 static int fib6_dump_node(struct fib6_walker *w)
473 {
474 	int res;
475 	struct fib6_info *rt;
476 
477 	for_each_fib6_walker_rt(w) {
478 		res = rt6_dump_route(rt, w->args);
479 		if (res < 0) {
480 			/* Frame is full, suspend walking */
481 			w->leaf = rt;
482 			return 1;
483 		}
484 
485 		/* Multipath routes are dumped in one route with the
486 		 * RTA_MULTIPATH attribute. Jump 'rt' to point to the
487 		 * last sibling of this route (no need to dump the
488 		 * sibling routes again)
489 		 */
490 		if (rt->fib6_nsiblings)
491 			rt = list_last_entry(&rt->fib6_siblings,
492 					     struct fib6_info,
493 					     fib6_siblings);
494 	}
495 	w->leaf = NULL;
496 	return 0;
497 }
498 
/* Tear down dump state stashed in the netlink callback: unlink the
 * walker if it is still registered (args[4]), free it, restore the
 * original ->done callback (args[3]) and mark the dump finished
 * (args[1] = 3, past any valid hash bucket index).
 */
499 static void fib6_dump_end(struct netlink_callback *cb)
500 {
501 	struct net *net = sock_net(cb->skb->sk);
502 	struct fib6_walker *w = (void *)cb->args[2];
503 
504 	if (w) {
505 		if (cb->args[4]) {
506 			cb->args[4] = 0;
507 			fib6_walker_unlink(net, w);
508 		}
509 		cb->args[2] = 0;
510 		kfree(w);
511 	}
512 	cb->done = (void *)cb->args[3];
513 	cb->args[1] = 3;
514 }
515 
/* Netlink ->done hook: clean up, then chain to the saved original. */
516 static int fib6_dump_done(struct netlink_callback *cb)
517 {
518 	fib6_dump_end(cb);
519 	return cb->done ? cb->done(cb) : 0;
520 }
521 
/* Dump one table into the netlink skb, resuming a suspended walk if
 * cb->args[4] says one is in flight.  cb->args[5] caches the root
 * sernum so a tree change between invocations restarts the walk from
 * the root while skipping already-dumped entries (w->skip).
 * Returns >0 if suspended (skb full), <=0 when this table is done.
 */
522 static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb,
523 			   struct netlink_callback *cb)
524 {
525 	struct net *net = sock_net(skb->sk);
526 	struct fib6_walker *w;
527 	int res;
528 
529 	w = (void *)cb->args[2];
530 	w->root = &table->tb6_root;
531 
532 	if (cb->args[4] == 0) {
533 		w->count = 0;
534 		w->skip = 0;
535 
536 		spin_lock_bh(&table->tb6_lock);
537 		res = fib6_walk(net, w);
538 		spin_unlock_bh(&table->tb6_lock);
539 		if (res > 0) {
540 			cb->args[4] = 1;
541 			cb->args[5] = w->root->fn_sernum;
542 		}
543 	} else {
544 		if (cb->args[5] != w->root->fn_sernum) {
545 			/* Begin at the root if the tree changed */
546 			cb->args[5] = w->root->fn_sernum;
547 			w->state = FWS_INIT;
548 			w->node = w->root;
549 			w->skip = w->count;
550 		} else
551 			w->skip = 0;
552 
553 		spin_lock_bh(&table->tb6_lock);
554 		res = fib6_walk_continue(w);
555 		spin_unlock_bh(&table->tb6_lock);
556 		if (res <= 0) {
557 			fib6_walker_unlink(net, w);
558 			cb->args[4] = 0;
559 		}
560 	}
561 
562 	return res;
563 }
564 
/* RTM_GETROUTE dump entry point.  cb->args[] layout:
 *   [0] hash bucket to resume from     [1] table index within bucket
 *   [2] walker pointer                 [3] saved original cb->done
 *   [4] walk-in-progress flag          [5] root sernum snapshot
 * Allocates the walker lazily on the first call and hooks
 * fib6_dump_done() so state is freed even on an aborted dump.
 */
565 static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
566 {
567 	const struct nlmsghdr *nlh = cb->nlh;
568 	struct net *net = sock_net(skb->sk);
569 	unsigned int h, s_h;
570 	unsigned int e = 0, s_e;
571 	struct rt6_rtnl_dump_arg arg;
572 	struct fib6_walker *w;
573 	struct fib6_table *tb;
574 	struct hlist_head *head;
575 	int res = 0;
576 
577 	if (cb->strict_check) {
578 		int err = ip_valid_fib_dump_req(nlh, cb->extack);
579 
580 		if (err < 0)
581 			return err;
582 	}
583 
584 	s_h = cb->args[0];
585 	s_e = cb->args[1];
586 
587 	w = (void *)cb->args[2];
588 	if (!w) {
589 		/* New dump:
590 		 *
591 		 * 1. hook callback destructor.
592 		 */
593 		cb->args[3] = (long)cb->done;
594 		cb->done = fib6_dump_done;
595 
596 		/*
597 		 * 2. allocate and initialize walker.
598 		 */
599 		w = kzalloc(sizeof(*w), GFP_ATOMIC);
600 		if (!w)
601 			return -ENOMEM;
602 		w->func = fib6_dump_node;
603 		cb->args[2] = (long)w;
604 	}
605 
606 	arg.skb = skb;
607 	arg.cb = cb;
608 	arg.net = net;
609 	w->args = &arg;
610 
611 	rcu_read_lock();
612 	for (h = s_h; h < FIB6_TABLE_HASHSZ; h++, s_e = 0) {
613 		e = 0;
614 		head = &net->ipv6.fib_table_hash[h];
615 		hlist_for_each_entry_rcu(tb, head, tb6_hlist) {
616 			if (e < s_e)
617 				goto next;
618 			res = fib6_dump_table(tb, skb, cb);
619 			if (res != 0)
620 				goto out;
621 next:
622 			e++;
623 		}
624 	}
625 out:
626 	rcu_read_unlock();
627 	cb->args[1] = e;
628 	cb->args[0] = h;
629 
630 	res = res < 0 ? res : skb->len;
631 	if (res <= 0)
632 		fib6_dump_end(cb);
633 	return res;
634 }
635 
/* Set one routing metric (RTAX_* index, 1-based) on @f6i, detaching it
 * from the shared read-only default metrics block first by allocating a
 * private copy.  Silently does nothing if @f6i is NULL or the private
 * block cannot be allocated.
 *
 * NOTE(review): no locking is visible here; presumably callers hold
 * tb6_lock or own the not-yet-inserted entry — confirm at call sites.
 */
636 void fib6_metric_set(struct fib6_info *f6i, int metric, u32 val)
637 {
638 	if (!f6i)
639 		return;
640 
641 	if (f6i->fib6_metrics == &dst_default_metrics) {
642 		struct dst_metrics *p = kzalloc(sizeof(*p), GFP_ATOMIC);
643 
644 		if (!p)
645 			return;
646 
647 		refcount_set(&p->refcnt, 1);
648 		f6i->fib6_metrics = p;
649 	}
650 
	/* metric is 1-based (RTAX_*), the array is 0-based */
651 	f6i->fib6_metrics->metrics[metric - 1] = val;
652 }
653 
654 /*
655  *	Routing Table
656  *
657  *	return the appropriate node for a routing tree "add" operation
658  *	by either creating and inserting or by returning an existing
659  *	node.
660  */
661 
/* Descend the radix tree from @root and return the node for prefix
 * @addr/@plen, creating nodes as needed.  @offset locates the rt6key
 * (dst or src) inside fib6_info, so the same code serves both the main
 * tree and source subtrees.  Honors NLM_F_CREATE/NLM_F_REPLACE via
 * @allow_create/@replace_required.  Returns the node or an ERR_PTR
 * (-ENOENT if a required match is missing, -ENOMEM on allocation
 * failure).  Caller must hold table->tb6_lock.
 */
662 static struct fib6_node *fib6_add_1(struct net *net,
663 				    struct fib6_table *table,
664 				    struct fib6_node *root,
665 				    struct in6_addr *addr, int plen,
666 				    int offset, int allow_create,
667 				    int replace_required,
668 				    struct netlink_ext_ack *extack)
669 {
670 	struct fib6_node *fn, *in, *ln;
671 	struct fib6_node *pn = NULL;
672 	struct rt6key *key;
673 	int	bit;
674 	__be32	dir = 0;
675 
676 	RT6_TRACE("fib6_add_1\n");
677 
678 	/* insert node in tree */
679 
680 	fn = root;
681 
682 	do {
683 		struct fib6_info *leaf = rcu_dereference_protected(fn->leaf,
684 					    lockdep_is_held(&table->tb6_lock));
685 		key = (struct rt6key *)((u8 *)leaf + offset);
686 
687 		/*
688 		 *	Prefix match
689 		 */
690 		if (plen < fn->fn_bit ||
691 		    !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) {
692 			if (!allow_create) {
693 				if (replace_required) {
694 					NL_SET_ERR_MSG(extack,
695 						       "Can not replace route - no match found");
696 					pr_warn("Can't replace route, no match found\n");
697 					return ERR_PTR(-ENOENT);
698 				}
699 				pr_warn("NLM_F_CREATE should be set when creating new route\n");
700 			}
701 			goto insert_above;
702 		}
703 
704 		/*
705 		 *	Exact match ?
706 		 */
707 
708 		if (plen == fn->fn_bit) {
709 			/* clean up an intermediate node */
710 			if (!(fn->fn_flags & RTN_RTINFO)) {
711 				RCU_INIT_POINTER(fn->leaf, NULL);
712 				fib6_info_release(leaf);
713 			/* remove null_entry in the root node */
714 			} else if (fn->fn_flags & RTN_TL_ROOT &&
715 				   rcu_access_pointer(fn->leaf) ==
716 				   net->ipv6.fib6_null_entry) {
717 				RCU_INIT_POINTER(fn->leaf, NULL);
718 			}
719 
720 			return fn;
721 		}
722 
723 		/*
724 		 *	We have more bits to go
725 		 */
726 
727 		/* Try to walk down on tree. */
728 		dir = addr_bit_set(addr, fn->fn_bit);
729 		pn = fn;
730 		fn = dir ?
731 		     rcu_dereference_protected(fn->right,
732 					lockdep_is_held(&table->tb6_lock)) :
733 		     rcu_dereference_protected(fn->left,
734 					lockdep_is_held(&table->tb6_lock));
735 	} while (fn);
736 
737 	if (!allow_create) {
738 		/* We should not create new node because
739 		 * NLM_F_REPLACE was specified without NLM_F_CREATE
740 		 * I assume it is safe to require NLM_F_CREATE when
741 		 * REPLACE flag is used! Later we may want to remove the
742 		 * check for replace_required, because according
743 		 * to netlink specification, NLM_F_CREATE
744 		 * MUST be specified if new route is created.
745 		 * That would keep IPv6 consistent with IPv4
746 		 */
747 		if (replace_required) {
748 			NL_SET_ERR_MSG(extack,
749 				       "Can not replace route - no match found");
750 			pr_warn("Can't replace route, no match found\n");
751 			return ERR_PTR(-ENOENT);
752 		}
753 		pr_warn("NLM_F_CREATE should be set when creating new route\n");
754 	}
755 	/*
756 	 *	We walked to the bottom of tree.
757 	 *	Create new leaf node without children.
758 	 */
759 
760 	ln = node_alloc(net);
761 
762 	if (!ln)
763 		return ERR_PTR(-ENOMEM);
764 	ln->fn_bit = plen;
765 	RCU_INIT_POINTER(ln->parent, pn);
766 
	/* publish the new leaf: rcu_assign_pointer orders the init above */
767 	if (dir)
768 		rcu_assign_pointer(pn->right, ln);
769 	else
770 		rcu_assign_pointer(pn->left, ln);
771 
772 	return ln;
773 
774 
775 insert_above:
776 	/*
777 	 * split since we don't have a common prefix anymore or
778 	 * we have a less significant route.
779 	 * we've to insert an intermediate node on the list
780 	 * this new node will point to the one we need to create
781 	 * and the current
782 	 */
783 
784 	pn = rcu_dereference_protected(fn->parent,
785 				       lockdep_is_held(&table->tb6_lock));
786 
787 	/* find 1st bit in difference between the 2 addrs.
788 
789 	   See comment in __ipv6_addr_diff: bit may be an invalid value,
790 	   but if it is >= plen, the value is ignored in any case.
791 	 */
792 
793 	bit = __ipv6_addr_diff(addr, &key->addr, sizeof(*addr));
794 
795 	/*
796 	 *		(intermediate)[in]
797 	 *	          /	   \
798 	 *	(new leaf node)[ln] (old node)[fn]
799 	 */
800 	if (plen > bit) {
801 		in = node_alloc(net);
802 		ln = node_alloc(net);
803 
804 		if (!in || !ln) {
805 			if (in)
806 				node_free_immediate(net, in);
807 			if (ln)
808 				node_free_immediate(net, ln);
809 			return ERR_PTR(-ENOMEM);
810 		}
811 
812 		/*
813 		 * new intermediate node.
814 		 * RTN_RTINFO will
815 		 * be off since that an address that chooses one of
816 		 * the branches would not match less specific routes
817 		 * in the other branch
818 		 */
819 
820 		in->fn_bit = bit;
821 
822 		RCU_INIT_POINTER(in->parent, pn);
823 		in->leaf = fn->leaf;
		/* intermediate node borrows fn's leaf; take a reference */
824 		atomic_inc(&rcu_dereference_protected(in->leaf,
825 				lockdep_is_held(&table->tb6_lock))->fib6_ref);
826 
827 		/* update parent pointer */
828 		if (dir)
829 			rcu_assign_pointer(pn->right, in);
830 		else
831 			rcu_assign_pointer(pn->left, in);
832 
833 		ln->fn_bit = plen;
834 
835 		RCU_INIT_POINTER(ln->parent, in);
836 		rcu_assign_pointer(fn->parent, in);
837 
838 		if (addr_bit_set(addr, bit)) {
839 			rcu_assign_pointer(in->right, ln);
840 			rcu_assign_pointer(in->left, fn);
841 		} else {
842 			rcu_assign_pointer(in->left, ln);
843 			rcu_assign_pointer(in->right, fn);
844 		}
845 	} else { /* plen <= bit */
846 
847 		/*
848 		 *		(new leaf node)[ln]
849 		 *	          /	   \
850 		 *	     (old node)[fn] NULL
851 		 */
852 
853 		ln = node_alloc(net);
854 
855 		if (!ln)
856 			return ERR_PTR(-ENOMEM);
857 
858 		ln->fn_bit = plen;
859 
860 		RCU_INIT_POINTER(ln->parent, pn);
861 
862 		if (addr_bit_set(&key->addr, plen))
863 			RCU_INIT_POINTER(ln->right, fn);
864 		else
865 			RCU_INIT_POINTER(ln->left, fn);
866 
867 		rcu_assign_pointer(fn->parent, ln);
868 
869 		if (dir)
870 			rcu_assign_pointer(pn->right, ln);
871 		else
872 			rcu_assign_pointer(pn->left, ln);
873 	}
874 	return ln;
875 }
876 
/* Sever the back-pointers from every per-cpu cached rt6_info to @f6i
 * and drop the references they held.  Caller must hold the table's
 * tb6_lock (enforced by the lockdep annotation).
 */
877 static void fib6_drop_pcpu_from(struct fib6_info *f6i,
878 				const struct fib6_table *table)
879 {
880 	int cpu;
881 
882 	/* release the reference to this fib entry from
883 	 * all of its cached pcpu routes
884 	 */
885 	for_each_possible_cpu(cpu) {
886 		struct rt6_info **ppcpu_rt;
887 		struct rt6_info *pcpu_rt;
888 
889 		ppcpu_rt = per_cpu_ptr(f6i->rt6i_pcpu, cpu);
890 		pcpu_rt = *ppcpu_rt;
891 		if (pcpu_rt) {
892 			struct fib6_info *from;
893 
894 			from = rcu_dereference_protected(pcpu_rt->from,
895 					     lockdep_is_held(&table->tb6_lock));
896 			rcu_assign_pointer(pcpu_rt->from, NULL);
897 			fib6_info_release(from);
898 		}
899 	}
900 }
901 
/* Disconnect @rt from the tree on removal.  If other references remain,
 * replace any intermediate-node "dummy leaf" pointers to @rt on the
 * path up to the root with a still-live prefix route, and detach the
 * per-cpu cached routes.  Caller must hold table->tb6_lock.
 */
902 static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn,
903 			  struct net *net)
904 {
905 	struct fib6_table *table = rt->fib6_table;
906 
907 	if (atomic_read(&rt->fib6_ref) != 1) {
908 		/* This route is used as dummy address holder in some split
909 		 * nodes. It is not leaked, but it still holds other resources,
910 		 * which must be released in time. So, scan ascendant nodes
911 		 * and replace dummy references to this route with references
912 		 * to still alive ones.
913 		 */
914 		while (fn) {
915 			struct fib6_info *leaf = rcu_dereference_protected(fn->leaf,
916 					    lockdep_is_held(&table->tb6_lock));
917 			struct fib6_info *new_leaf;
918 			if (!(fn->fn_flags & RTN_RTINFO) && leaf == rt) {
919 				new_leaf = fib6_find_prefix(net, table, fn);
920 				atomic_inc(&new_leaf->fib6_ref);
921 
922 				rcu_assign_pointer(fn->leaf, new_leaf);
923 				fib6_info_release(rt);
924 			}
925 			fn = rcu_dereference_protected(fn->parent,
926 				    lockdep_is_held(&table->tb6_lock));
927 		}
928 
929 		if (rt->rt6i_pcpu)
930 			fib6_drop_pcpu_from(rt, table);
931 	}
932 }
933 
934 /*
935  *	Insert routing information in a node.
936  */
937 
/* Insert @rt into node @fn's leaf list, which is kept sorted by metric.
 * Handles the netlink semantics: EXCL duplicates (-EEXIST), expiry
 * refresh of identical-nexthop routes, ECMP sibling linking, and
 * NLM_F_REPLACE (including replacing a whole ECMP group).  Fires fib
 * notifiers and rtnetlink notifications.  Returns 0 or a negative
 * errno.  Caller must hold the table's tb6_lock.
 */
938 static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
939 			    struct nl_info *info,
940 			    struct netlink_ext_ack *extack)
941 {
942 	struct fib6_info *leaf = rcu_dereference_protected(fn->leaf,
943 				    lockdep_is_held(&rt->fib6_table->tb6_lock));
944 	struct fib6_info *iter = NULL;
945 	struct fib6_info __rcu **ins;
946 	struct fib6_info __rcu **fallback_ins = NULL;
947 	int replace = (info->nlh &&
948 		       (info->nlh->nlmsg_flags & NLM_F_REPLACE));
949 	int add = (!info->nlh ||
950 		   (info->nlh->nlmsg_flags & NLM_F_CREATE));
951 	int found = 0;
952 	bool rt_can_ecmp = rt6_qualify_for_ecmp(rt);
953 	u16 nlflags = NLM_F_EXCL;
954 	int err;
955 
956 	if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_APPEND))
957 		nlflags |= NLM_F_APPEND;
958 
959 	ins = &fn->leaf;
960 
	/* scan the metric-sorted list for a duplicate / insertion point */
961 	for (iter = leaf; iter;
962 	     iter = rcu_dereference_protected(iter->fib6_next,
963 				lockdep_is_held(&rt->fib6_table->tb6_lock))) {
964 		/*
965 		 *	Search for duplicates
966 		 */
967 
968 		if (iter->fib6_metric == rt->fib6_metric) {
969 			/*
970 			 *	Same priority level
971 			 */
972 			if (info->nlh &&
973 			    (info->nlh->nlmsg_flags & NLM_F_EXCL))
974 				return -EEXIST;
975 
976 			nlflags &= ~NLM_F_EXCL;
977 			if (replace) {
978 				if (rt_can_ecmp == rt6_qualify_for_ecmp(iter)) {
979 					found++;
980 					break;
981 				}
982 				if (rt_can_ecmp)
983 					fallback_ins = fallback_ins ?: ins;
984 				goto next_iter;
985 			}
986 
987 			if (rt6_duplicate_nexthop(iter, rt)) {
988 				if (rt->fib6_nsiblings)
989 					rt->fib6_nsiblings = 0;
990 				if (!(iter->fib6_flags & RTF_EXPIRES))
991 					return -EEXIST;
				/* duplicate: just refresh expiry/MTU on the
				 * existing entry, keep returning -EEXIST
				 */
992 				if (!(rt->fib6_flags & RTF_EXPIRES))
993 					fib6_clean_expires(iter);
994 				else
995 					fib6_set_expires(iter, rt->expires);
996 
997 				if (rt->fib6_pmtu)
998 					fib6_metric_set(iter, RTAX_MTU,
999 							rt->fib6_pmtu);
1000 				return -EEXIST;
1001 			}
1002 			/* If we have the same destination and the same metric,
1003 			 * but not the same gateway, then the route we try to
1004 			 * add is sibling to this route, increment our counter
1005 			 * of siblings, and later we will add our route to the
1006 			 * list.
1007 			 * Only static routes (which don't have flag
1008 			 * RTF_EXPIRES) are used for ECMPv6.
1009 			 *
1010 			 * To avoid long list, we only had siblings if the
1011 			 * route have a gateway.
1012 			 */
1013 			if (rt_can_ecmp &&
1014 			    rt6_qualify_for_ecmp(iter))
1015 				rt->fib6_nsiblings++;
1016 		}
1017 
1018 		if (iter->fib6_metric > rt->fib6_metric)
1019 			break;
1020 
1021 next_iter:
1022 		ins = &iter->fib6_next;
1023 	}
1024 
1025 	if (fallback_ins && !found) {
1026 		/* No ECMP-able route found, replace first non-ECMP one */
1027 		ins = fallback_ins;
1028 		iter = rcu_dereference_protected(*ins,
1029 				    lockdep_is_held(&rt->fib6_table->tb6_lock));
1030 		found++;
1031 	}
1032 
1033 	/* Reset round-robin state, if necessary */
1034 	if (ins == &fn->leaf)
1035 		fn->rr_ptr = NULL;
1036 
1037 	/* Link this route to others same route. */
1038 	if (rt->fib6_nsiblings) {
1039 		unsigned int fib6_nsiblings;
1040 		struct fib6_info *sibling, *temp_sibling;
1041 
1042 		/* Find the first route that have the same metric */
1043 		sibling = leaf;
1044 		while (sibling) {
1045 			if (sibling->fib6_metric == rt->fib6_metric &&
1046 			    rt6_qualify_for_ecmp(sibling)) {
1047 				list_add_tail(&rt->fib6_siblings,
1048 					      &sibling->fib6_siblings);
1049 				break;
1050 			}
1051 			sibling = rcu_dereference_protected(sibling->fib6_next,
1052 				    lockdep_is_held(&rt->fib6_table->tb6_lock));
1053 		}
1054 		/* For each sibling in the list, increment the counter of
1055 		 * siblings. BUG() if counters does not match, list of siblings
1056 		 * is broken!
1057 		 */
1058 		fib6_nsiblings = 0;
1059 		list_for_each_entry_safe(sibling, temp_sibling,
1060 					 &rt->fib6_siblings, fib6_siblings) {
1061 			sibling->fib6_nsiblings++;
1062 			BUG_ON(sibling->fib6_nsiblings != rt->fib6_nsiblings);
1063 			fib6_nsiblings++;
1064 		}
1065 		BUG_ON(fib6_nsiblings != rt->fib6_nsiblings);
1066 		rt6_multipath_rebalance(temp_sibling);
1067 	}
1068 
1069 	/*
1070 	 *	insert node
1071 	 */
1072 	if (!replace) {
1073 		if (!add)
1074 			pr_warn("NLM_F_CREATE should be set when creating new route\n");
1075 
1076 add:
1077 		nlflags |= NLM_F_CREATE;
1078 
1079 		err = call_fib6_entry_notifiers(info->nl_net,
1080 						FIB_EVENT_ENTRY_ADD,
1081 						rt, extack);
1082 		if (err)
1083 			return err;
1084 
		/* publish: fib6_next first, then *ins, so readers always
		 * see a consistent list
		 */
1085 		rcu_assign_pointer(rt->fib6_next, iter);
1086 		atomic_inc(&rt->fib6_ref);
1087 		rcu_assign_pointer(rt->fib6_node, fn);
1088 		rcu_assign_pointer(*ins, rt);
1089 		if (!info->skip_notify)
1090 			inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
1091 		info->nl_net->ipv6.rt6_stats->fib_rt_entries++;
1092 
1093 		if (!(fn->fn_flags & RTN_RTINFO)) {
1094 			info->nl_net->ipv6.rt6_stats->fib_route_nodes++;
1095 			fn->fn_flags |= RTN_RTINFO;
1096 		}
1097 
1098 	} else {
1099 		int nsiblings;
1100 
1101 		if (!found) {
1102 			if (add)
1103 				goto add;
1104 			pr_warn("NLM_F_REPLACE set, but no existing node found!\n");
1105 			return -ENOENT;
1106 		}
1107 
1108 		err = call_fib6_entry_notifiers(info->nl_net,
1109 						FIB_EVENT_ENTRY_REPLACE,
1110 						rt, extack);
1111 		if (err)
1112 			return err;
1113 
1114 		atomic_inc(&rt->fib6_ref);
1115 		rcu_assign_pointer(rt->fib6_node, fn);
1116 		rt->fib6_next = iter->fib6_next;
1117 		rcu_assign_pointer(*ins, rt);
1118 		if (!info->skip_notify)
1119 			inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE);
1120 		if (!(fn->fn_flags & RTN_RTINFO)) {
1121 			info->nl_net->ipv6.rt6_stats->fib_route_nodes++;
1122 			fn->fn_flags |= RTN_RTINFO;
1123 		}
1124 		nsiblings = iter->fib6_nsiblings;
1125 		iter->fib6_node = NULL;
1126 		fib6_purge_rt(iter, fn, info->nl_net);
1127 		if (rcu_access_pointer(fn->rr_ptr) == iter)
1128 			fn->rr_ptr = NULL;
1129 		fib6_info_release(iter);
1130 
1131 		if (nsiblings) {
1132 			/* Replacing an ECMP route, remove all siblings */
1133 			ins = &rt->fib6_next;
1134 			iter = rcu_dereference_protected(*ins,
1135 				    lockdep_is_held(&rt->fib6_table->tb6_lock));
1136 			while (iter) {
1137 				if (iter->fib6_metric > rt->fib6_metric)
1138 					break;
1139 				if (rt6_qualify_for_ecmp(iter)) {
1140 					*ins = iter->fib6_next;
1141 					iter->fib6_node = NULL;
1142 					fib6_purge_rt(iter, fn, info->nl_net);
1143 					if (rcu_access_pointer(fn->rr_ptr) == iter)
1144 						fn->rr_ptr = NULL;
1145 					fib6_info_release(iter);
1146 					nsiblings--;
1147 					info->nl_net->ipv6.rt6_stats->fib_rt_entries--;
1148 				} else {
1149 					ins = &iter->fib6_next;
1150 				}
1151 				iter = rcu_dereference_protected(*ins,
1152 					lockdep_is_held(&rt->fib6_table->tb6_lock));
1153 			}
1154 			WARN_ON(nsiblings != 0);
1155 		}
1156 	}
1157 
1158 	return 0;
1159 }
1160 
1161 static void fib6_start_gc(struct net *net, struct fib6_info *rt)
1162 {
1163 	if (!timer_pending(&net->ipv6.ip6_fib_timer) &&
1164 	    (rt->fib6_flags & RTF_EXPIRES))
1165 		mod_timer(&net->ipv6.ip6_fib_timer,
1166 			  jiffies + net->ipv6.sysctl.ip6_rt_gc_interval);
1167 }
1168 
1169 void fib6_force_start_gc(struct net *net)
1170 {
1171 	if (!timer_pending(&net->ipv6.ip6_fib_timer))
1172 		mod_timer(&net->ipv6.ip6_fib_timer,
1173 			  jiffies + net->ipv6.sysctl.ip6_rt_gc_interval);
1174 }
1175 
/* Stamp @sernum on rt's fib6_node and every ancestor up to the table
 * root, invalidating dst cache cookies taken against any node on that
 * path.  Caller must hold rt->fib6_table->tb6_lock.
 */
static void __fib6_update_sernum_upto_root(struct fib6_info *rt,
					   int sernum)
{
	struct fib6_node *fn = rcu_dereference_protected(rt->fib6_node,
				lockdep_is_held(&rt->fib6_table->tb6_lock));

	/* paired with smp_rmb() in rt6_get_cookie_safe() */
	smp_wmb();
	while (fn) {
		fn->fn_sernum = sernum;
		fn = rcu_dereference_protected(fn->parent,
				lockdep_is_held(&rt->fib6_table->tb6_lock));
	}
}
1190 
/* Allocate a fresh serial number for @net and propagate it from rt's
 * node up to the table root.  Needs tb6_lock held (see
 * __fib6_update_sernum_upto_root()).
 */
void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt)
{
	__fib6_update_sernum_upto_root(rt, fib6_new_sernum(net));
}
1195 
1196 /*
1197  *	Add routing information to the routing tree.
1198  *	<destination addr>/<source addr>
1199  *	with source addr info in sub-trees
1200  *	Need to own table->tb6_lock
1201  */
1202 
1203 int fib6_add(struct fib6_node *root, struct fib6_info *rt,
1204 	     struct nl_info *info, struct netlink_ext_ack *extack)
1205 {
1206 	struct fib6_table *table = rt->fib6_table;
1207 	struct fib6_node *fn, *pn = NULL;
1208 	int err = -ENOMEM;
1209 	int allow_create = 1;
1210 	int replace_required = 0;
1211 	int sernum = fib6_new_sernum(info->nl_net);
1212 
1213 	if (info->nlh) {
1214 		if (!(info->nlh->nlmsg_flags & NLM_F_CREATE))
1215 			allow_create = 0;
1216 		if (info->nlh->nlmsg_flags & NLM_F_REPLACE)
1217 			replace_required = 1;
1218 	}
1219 	if (!allow_create && !replace_required)
1220 		pr_warn("RTM_NEWROUTE with no NLM_F_CREATE or NLM_F_REPLACE\n");
1221 
1222 	fn = fib6_add_1(info->nl_net, table, root,
1223 			&rt->fib6_dst.addr, rt->fib6_dst.plen,
1224 			offsetof(struct fib6_info, fib6_dst), allow_create,
1225 			replace_required, extack);
1226 	if (IS_ERR(fn)) {
1227 		err = PTR_ERR(fn);
1228 		fn = NULL;
1229 		goto out;
1230 	}
1231 
1232 	pn = fn;
1233 
1234 #ifdef CONFIG_IPV6_SUBTREES
1235 	if (rt->fib6_src.plen) {
1236 		struct fib6_node *sn;
1237 
1238 		if (!rcu_access_pointer(fn->subtree)) {
1239 			struct fib6_node *sfn;
1240 
1241 			/*
1242 			 * Create subtree.
1243 			 *
1244 			 *		fn[main tree]
1245 			 *		|
1246 			 *		sfn[subtree root]
1247 			 *		   \
1248 			 *		    sn[new leaf node]
1249 			 */
1250 
1251 			/* Create subtree root node */
1252 			sfn = node_alloc(info->nl_net);
1253 			if (!sfn)
1254 				goto failure;
1255 
1256 			atomic_inc(&info->nl_net->ipv6.fib6_null_entry->fib6_ref);
1257 			rcu_assign_pointer(sfn->leaf,
1258 					   info->nl_net->ipv6.fib6_null_entry);
1259 			sfn->fn_flags = RTN_ROOT;
1260 
1261 			/* Now add the first leaf node to new subtree */
1262 
1263 			sn = fib6_add_1(info->nl_net, table, sfn,
1264 					&rt->fib6_src.addr, rt->fib6_src.plen,
1265 					offsetof(struct fib6_info, fib6_src),
1266 					allow_create, replace_required, extack);
1267 
1268 			if (IS_ERR(sn)) {
1269 				/* If it is failed, discard just allocated
1270 				   root, and then (in failure) stale node
1271 				   in main tree.
1272 				 */
1273 				node_free_immediate(info->nl_net, sfn);
1274 				err = PTR_ERR(sn);
1275 				goto failure;
1276 			}
1277 
1278 			/* Now link new subtree to main tree */
1279 			rcu_assign_pointer(sfn->parent, fn);
1280 			rcu_assign_pointer(fn->subtree, sfn);
1281 		} else {
1282 			sn = fib6_add_1(info->nl_net, table, FIB6_SUBTREE(fn),
1283 					&rt->fib6_src.addr, rt->fib6_src.plen,
1284 					offsetof(struct fib6_info, fib6_src),
1285 					allow_create, replace_required, extack);
1286 
1287 			if (IS_ERR(sn)) {
1288 				err = PTR_ERR(sn);
1289 				goto failure;
1290 			}
1291 		}
1292 
1293 		if (!rcu_access_pointer(fn->leaf)) {
1294 			if (fn->fn_flags & RTN_TL_ROOT) {
1295 				/* put back null_entry for root node */
1296 				rcu_assign_pointer(fn->leaf,
1297 					    info->nl_net->ipv6.fib6_null_entry);
1298 			} else {
1299 				atomic_inc(&rt->fib6_ref);
1300 				rcu_assign_pointer(fn->leaf, rt);
1301 			}
1302 		}
1303 		fn = sn;
1304 	}
1305 #endif
1306 
1307 	err = fib6_add_rt2node(fn, rt, info, extack);
1308 	if (!err) {
1309 		__fib6_update_sernum_upto_root(rt, sernum);
1310 		fib6_start_gc(info->nl_net, rt);
1311 	}
1312 
1313 out:
1314 	if (err) {
1315 #ifdef CONFIG_IPV6_SUBTREES
1316 		/*
1317 		 * If fib6_add_1 has cleared the old leaf pointer in the
1318 		 * super-tree leaf node we have to find a new one for it.
1319 		 */
1320 		if (pn != fn) {
1321 			struct fib6_info *pn_leaf =
1322 				rcu_dereference_protected(pn->leaf,
1323 				    lockdep_is_held(&table->tb6_lock));
1324 			if (pn_leaf == rt) {
1325 				pn_leaf = NULL;
1326 				RCU_INIT_POINTER(pn->leaf, NULL);
1327 				fib6_info_release(rt);
1328 			}
1329 			if (!pn_leaf && !(pn->fn_flags & RTN_RTINFO)) {
1330 				pn_leaf = fib6_find_prefix(info->nl_net, table,
1331 							   pn);
1332 #if RT6_DEBUG >= 2
1333 				if (!pn_leaf) {
1334 					WARN_ON(!pn_leaf);
1335 					pn_leaf =
1336 					    info->nl_net->ipv6.fib6_null_entry;
1337 				}
1338 #endif
1339 				fib6_info_hold(pn_leaf);
1340 				rcu_assign_pointer(pn->leaf, pn_leaf);
1341 			}
1342 		}
1343 #endif
1344 		goto failure;
1345 	}
1346 	return err;
1347 
1348 failure:
1349 	/* fn->leaf could be NULL and fib6_repair_tree() needs to be called if:
1350 	 * 1. fn is an intermediate node and we failed to add the new
1351 	 * route to it in both subtree creation failure and fib6_add_rt2node()
1352 	 * failure case.
1353 	 * 2. fn is the root node in the table and we fail to add the first
1354 	 * default route to it.
1355 	 */
1356 	if (fn &&
1357 	    (!(fn->fn_flags & (RTN_RTINFO|RTN_ROOT)) ||
1358 	     (fn->fn_flags & RTN_TL_ROOT &&
1359 	      !rcu_access_pointer(fn->leaf))))
1360 		fib6_repair_tree(info->nl_net, table, fn);
1361 	return err;
1362 }
1363 
1364 /*
1365  *	Routing tree lookup
1366  *
1367  */
1368 
/* One lookup key per tree level (dst, then src with subtrees); an
 * entry with offset == 0 terminates the array.
 */
struct lookup_args {
	int			offset;		/* key offset on fib6_info */
	const struct in6_addr	*addr;		/* search key			*/
};
1373 
/* Longest-prefix-match lookup: descend the radix tree as far as the
 * address bits allow, then walk back toward the root until a node's
 * leaf prefix matches args->addr.  Recurses into subtrees with the
 * next args entry for source-based routing.  Runs under RCU; returns
 * NULL when nothing matches up to (and including) the root.
 */
static struct fib6_node *fib6_node_lookup_1(struct fib6_node *root,
					    struct lookup_args *args)
{
	struct fib6_node *fn;
	__be32 dir;

	/* offset == 0 marks the sentinel entry of the args array */
	if (unlikely(args->offset == 0))
		return NULL;

	/*
	 *	Descend on a tree
	 */

	fn = root;

	for (;;) {
		struct fib6_node *next;

		dir = addr_bit_set(args->addr, fn->fn_bit);

		next = dir ? rcu_dereference(fn->right) :
			     rcu_dereference(fn->left);

		if (next) {
			fn = next;
			continue;
		}
		break;
	}

	/* Backtrack toward the root looking for a matching prefix. */
	while (fn) {
		struct fib6_node *subtree = FIB6_SUBTREE(fn);

		if (subtree || fn->fn_flags & RTN_RTINFO) {
			struct fib6_info *leaf = rcu_dereference(fn->leaf);
			struct rt6key *key;

			/* node is being deleted; keep backtracking */
			if (!leaf)
				goto backtrack;

			key = (struct rt6key *) ((u8 *)leaf + args->offset);

			if (ipv6_prefix_equal(&key->addr, args->addr, key->plen)) {
#ifdef CONFIG_IPV6_SUBTREES
				if (subtree) {
					struct fib6_node *sfn;
					sfn = fib6_node_lookup_1(subtree,
								 args + 1);
					if (!sfn)
						goto backtrack;
					fn = sfn;
				}
#endif
				if (fn->fn_flags & RTN_RTINFO)
					return fn;
			}
		}
backtrack:
		if (fn->fn_flags & RTN_ROOT)
			break;

		fn = rcu_dereference(fn->parent);
	}

	return NULL;
}
1440 
/* Longest-prefix-match lookup of @daddr (and, with subtrees enabled,
 * @saddr) under @root.  Falls back to @root itself when nothing
 * matches, so callers always get a usable node.
 * Called with rcu_read_lock() held.
 */
struct fib6_node *fib6_node_lookup(struct fib6_node *root,
				   const struct in6_addr *daddr,
				   const struct in6_addr *saddr)
{
	struct fib6_node *fn;
	struct lookup_args args[] = {
		{
			.offset = offsetof(struct fib6_info, fib6_dst),
			.addr = daddr,
		},
#ifdef CONFIG_IPV6_SUBTREES
		{
			.offset = offsetof(struct fib6_info, fib6_src),
			.addr = saddr,
		},
#endif
		{
			.offset = 0,	/* sentinel */
		}
	};

	/* With no daddr, start at the src (or sentinel) entry. */
	fn = fib6_node_lookup_1(root, daddr ? args : args + 1);
	if (!fn || fn->fn_flags & RTN_TL_ROOT)
		fn = root;

	return fn;
}
1470 
1471 /*
1472  *	Get node with specified destination prefix (and source prefix,
1473  *	if subtrees are used)
1474  *	exact_match == true means we try to find fn with exact match of
1475  *	the passed in prefix addr
1476  *	exact_match == false means we try to find fn with longest prefix
1477  *	match of the passed in prefix addr. This is useful for finding fn
1478  *	for cached route as it will be stored in the exception table under
1479  *	the node with longest prefix length.
1480  */
1481 
1482 
/* Walk down from @root following @addr's bits.  Return the node whose
 * bit position equals @plen and whose leaf prefix matches, or — when
 * !exact_match — the deepest shorter-prefix node seen on the way
 * (tracked in @prev).  Runs under RCU.
 */
static struct fib6_node *fib6_locate_1(struct fib6_node *root,
				       const struct in6_addr *addr,
				       int plen, int offset,
				       bool exact_match)
{
	struct fib6_node *fn, *prev = NULL;

	for (fn = root; fn ; ) {
		struct fib6_info *leaf = rcu_dereference(fn->leaf);
		struct rt6key *key;

		/* This node is being deleted */
		if (!leaf) {
			if (plen <= fn->fn_bit)
				goto out;
			else
				goto next;
		}

		key = (struct rt6key *)((u8 *)leaf + offset);

		/*
		 *	Prefix match
		 */
		if (plen < fn->fn_bit ||
		    !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit))
			goto out;

		if (plen == fn->fn_bit)
			return fn;

		/* Remember the longest matching shorter prefix so far. */
		prev = fn;

next:
		/*
		 *	We have more bits to go
		 */
		if (addr_bit_set(addr, fn->fn_bit))
			fn = rcu_dereference(fn->right);
		else
			fn = rcu_dereference(fn->left);
	}
out:
	if (exact_match)
		return NULL;
	else
		return prev;
}
1531 
/* Locate the node holding the given destination (and, with subtrees,
 * source) prefix — exact or longest match per @exact_match, see the
 * comment above fib6_locate_1().  Only nodes carrying routes
 * (RTN_RTINFO) are returned; otherwise NULL.
 */
struct fib6_node *fib6_locate(struct fib6_node *root,
			      const struct in6_addr *daddr, int dst_len,
			      const struct in6_addr *saddr, int src_len,
			      bool exact_match)
{
	struct fib6_node *fn;

	fn = fib6_locate_1(root, daddr, dst_len,
			   offsetof(struct fib6_info, fib6_dst),
			   exact_match);

#ifdef CONFIG_IPV6_SUBTREES
	if (src_len) {
		WARN_ON(saddr == NULL);
		if (fn) {
			struct fib6_node *subtree = FIB6_SUBTREE(fn);

			if (subtree) {
				fn = fib6_locate_1(subtree, saddr, src_len,
					   offsetof(struct fib6_info, fib6_src),
					   exact_match);
			}
		}
	}
#endif

	if (fn && fn->fn_flags & RTN_RTINFO)
		return fn;

	return NULL;
}
1563 
1564 
1565 /*
1566  *	Deletion
1567  *
1568  */
1569 
/* Pick a replacement leaf for intermediate node @fn: the leaf of its
 * left child if present, else its right child's; with neither, drop
 * into the node's subtree and retry.  Root nodes get null_entry.
 * May return NULL.  Needs table->tb6_lock.
 */
static struct fib6_info *fib6_find_prefix(struct net *net,
					 struct fib6_table *table,
					 struct fib6_node *fn)
{
	struct fib6_node *child_left, *child_right;

	if (fn->fn_flags & RTN_ROOT)
		return net->ipv6.fib6_null_entry;

	while (fn) {
		child_left = rcu_dereference_protected(fn->left,
				    lockdep_is_held(&table->tb6_lock));
		child_right = rcu_dereference_protected(fn->right,
				    lockdep_is_held(&table->tb6_lock));
		if (child_left)
			return rcu_dereference_protected(child_left->leaf,
					lockdep_is_held(&table->tb6_lock));
		if (child_right)
			return rcu_dereference_protected(child_right->leaf,
					lockdep_is_held(&table->tb6_lock));

		fn = FIB6_SUBTREE(fn);
	}
	return NULL;
}
1595 
1596 /*
1597  *	Called to trim the tree of intermediate nodes when possible. "fn"
1598  *	is the node we want to try and remove.
1599  *	Need to own table->tb6_lock
1600  */
1601 
1602 static struct fib6_node *fib6_repair_tree(struct net *net,
1603 					  struct fib6_table *table,
1604 					  struct fib6_node *fn)
1605 {
1606 	int children;
1607 	int nstate;
1608 	struct fib6_node *child;
1609 	struct fib6_walker *w;
1610 	int iter = 0;
1611 
1612 	/* Set fn->leaf to null_entry for root node. */
1613 	if (fn->fn_flags & RTN_TL_ROOT) {
1614 		rcu_assign_pointer(fn->leaf, net->ipv6.fib6_null_entry);
1615 		return fn;
1616 	}
1617 
1618 	for (;;) {
1619 		struct fib6_node *fn_r = rcu_dereference_protected(fn->right,
1620 					    lockdep_is_held(&table->tb6_lock));
1621 		struct fib6_node *fn_l = rcu_dereference_protected(fn->left,
1622 					    lockdep_is_held(&table->tb6_lock));
1623 		struct fib6_node *pn = rcu_dereference_protected(fn->parent,
1624 					    lockdep_is_held(&table->tb6_lock));
1625 		struct fib6_node *pn_r = rcu_dereference_protected(pn->right,
1626 					    lockdep_is_held(&table->tb6_lock));
1627 		struct fib6_node *pn_l = rcu_dereference_protected(pn->left,
1628 					    lockdep_is_held(&table->tb6_lock));
1629 		struct fib6_info *fn_leaf = rcu_dereference_protected(fn->leaf,
1630 					    lockdep_is_held(&table->tb6_lock));
1631 		struct fib6_info *pn_leaf = rcu_dereference_protected(pn->leaf,
1632 					    lockdep_is_held(&table->tb6_lock));
1633 		struct fib6_info *new_fn_leaf;
1634 
1635 		RT6_TRACE("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter);
1636 		iter++;
1637 
1638 		WARN_ON(fn->fn_flags & RTN_RTINFO);
1639 		WARN_ON(fn->fn_flags & RTN_TL_ROOT);
1640 		WARN_ON(fn_leaf);
1641 
1642 		children = 0;
1643 		child = NULL;
1644 		if (fn_r)
1645 			child = fn_r, children |= 1;
1646 		if (fn_l)
1647 			child = fn_l, children |= 2;
1648 
1649 		if (children == 3 || FIB6_SUBTREE(fn)
1650 #ifdef CONFIG_IPV6_SUBTREES
1651 		    /* Subtree root (i.e. fn) may have one child */
1652 		    || (children && fn->fn_flags & RTN_ROOT)
1653 #endif
1654 		    ) {
1655 			new_fn_leaf = fib6_find_prefix(net, table, fn);
1656 #if RT6_DEBUG >= 2
1657 			if (!new_fn_leaf) {
1658 				WARN_ON(!new_fn_leaf);
1659 				new_fn_leaf = net->ipv6.fib6_null_entry;
1660 			}
1661 #endif
1662 			fib6_info_hold(new_fn_leaf);
1663 			rcu_assign_pointer(fn->leaf, new_fn_leaf);
1664 			return pn;
1665 		}
1666 
1667 #ifdef CONFIG_IPV6_SUBTREES
1668 		if (FIB6_SUBTREE(pn) == fn) {
1669 			WARN_ON(!(fn->fn_flags & RTN_ROOT));
1670 			RCU_INIT_POINTER(pn->subtree, NULL);
1671 			nstate = FWS_L;
1672 		} else {
1673 			WARN_ON(fn->fn_flags & RTN_ROOT);
1674 #endif
1675 			if (pn_r == fn)
1676 				rcu_assign_pointer(pn->right, child);
1677 			else if (pn_l == fn)
1678 				rcu_assign_pointer(pn->left, child);
1679 #if RT6_DEBUG >= 2
1680 			else
1681 				WARN_ON(1);
1682 #endif
1683 			if (child)
1684 				rcu_assign_pointer(child->parent, pn);
1685 			nstate = FWS_R;
1686 #ifdef CONFIG_IPV6_SUBTREES
1687 		}
1688 #endif
1689 
1690 		read_lock(&net->ipv6.fib6_walker_lock);
1691 		FOR_WALKERS(net, w) {
1692 			if (!child) {
1693 				if (w->node == fn) {
1694 					RT6_TRACE("W %p adjusted by delnode 1, s=%d/%d\n", w, w->state, nstate);
1695 					w->node = pn;
1696 					w->state = nstate;
1697 				}
1698 			} else {
1699 				if (w->node == fn) {
1700 					w->node = child;
1701 					if (children&2) {
1702 						RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state);
1703 						w->state = w->state >= FWS_R ? FWS_U : FWS_INIT;
1704 					} else {
1705 						RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state);
1706 						w->state = w->state >= FWS_C ? FWS_U : FWS_INIT;
1707 					}
1708 				}
1709 			}
1710 		}
1711 		read_unlock(&net->ipv6.fib6_walker_lock);
1712 
1713 		node_free(net, fn);
1714 		if (pn->fn_flags & RTN_RTINFO || FIB6_SUBTREE(pn))
1715 			return pn;
1716 
1717 		RCU_INIT_POINTER(pn->leaf, NULL);
1718 		fib6_info_release(pn_leaf);
1719 		fn = pn;
1720 	}
1721 }
1722 
/* Unlink and release the route at *rtp from node @fn: flush its cached
 * exceptions, detach it from any multipath group, repoint walkers that
 * were standing on it, trim the node if it became empty, and send the
 * RTM_DELROUTE notification.  Needs table->tb6_lock.
 */
static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
			   struct fib6_info __rcu **rtp, struct nl_info *info)
{
	struct fib6_walker *w;
	struct fib6_info *rt = rcu_dereference_protected(*rtp,
				    lockdep_is_held(&table->tb6_lock));
	struct net *net = info->nl_net;

	RT6_TRACE("fib6_del_route\n");

	/* Unlink it */
	*rtp = rt->fib6_next;
	rt->fib6_node = NULL;
	net->ipv6.rt6_stats->fib_rt_entries--;
	net->ipv6.rt6_stats->fib_discarded_routes++;

	/* Flush all cached dst in exception table */
	rt6_flush_exceptions(rt);

	/* Reset round-robin state, if necessary */
	if (rcu_access_pointer(fn->rr_ptr) == rt)
		fn->rr_ptr = NULL;

	/* Remove this entry from other siblings */
	if (rt->fib6_nsiblings) {
		struct fib6_info *sibling, *next_sibling;

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings, fib6_siblings)
			sibling->fib6_nsiblings--;
		rt->fib6_nsiblings = 0;
		list_del_init(&rt->fib6_siblings);
		rt6_multipath_rebalance(next_sibling);
	}

	/* Adjust walkers */
	read_lock(&net->ipv6.fib6_walker_lock);
	FOR_WALKERS(net, w) {
		if (w->state == FWS_C && w->leaf == rt) {
			RT6_TRACE("walker %p adjusted by delroute\n", w);
			w->leaf = rcu_dereference_protected(rt->fib6_next,
					    lockdep_is_held(&table->tb6_lock));
			if (!w->leaf)
				w->state = FWS_U;
		}
	}
	read_unlock(&net->ipv6.fib6_walker_lock);

	/* If it was last route, call fib6_repair_tree() to:
	 * 1. For root node, put back null_entry as how the table was created.
	 * 2. For other nodes, expunge its radix tree node.
	 */
	if (!rcu_access_pointer(fn->leaf)) {
		if (!(fn->fn_flags & RTN_TL_ROOT)) {
			fn->fn_flags &= ~RTN_RTINFO;
			net->ipv6.rt6_stats->fib_route_nodes--;
		}
		fn = fib6_repair_tree(net, table, fn);
	}

	fib6_purge_rt(rt, fn, net);

	call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt, NULL);
	if (!info->skip_notify)
		inet6_rt_notify(RTM_DELROUTE, rt, info, 0);
	fib6_info_release(rt);
}
1790 
/* Delete route @rt from its table.  Scans the leaf list of rt's node
 * for the matching entry and hands it to fib6_del_route(); returns
 * -ENOENT if rt is not linked or is the null_entry.
 * Need to own table->tb6_lock.
 */
int fib6_del(struct fib6_info *rt, struct nl_info *info)
{
	struct fib6_node *fn = rcu_dereference_protected(rt->fib6_node,
				    lockdep_is_held(&rt->fib6_table->tb6_lock));
	struct fib6_table *table = rt->fib6_table;
	struct net *net = info->nl_net;
	struct fib6_info __rcu **rtp;
	struct fib6_info __rcu **rtp_next;

	if (!fn || rt == net->ipv6.fib6_null_entry)
		return -ENOENT;

	WARN_ON(!(fn->fn_flags & RTN_RTINFO));

	/*
	 *	Walk the leaf entries looking for ourself
	 */

	for (rtp = &fn->leaf; *rtp; rtp = rtp_next) {
		struct fib6_info *cur = rcu_dereference_protected(*rtp,
					lockdep_is_held(&table->tb6_lock));
		if (rt == cur) {
			fib6_del_route(table, fn, rtp, info);
			return 0;
		}
		rtp_next = &cur->fib6_next;
	}
	return -ENOENT;
}
1821 
1822 /*
1823  *	Tree traversal function.
1824  *
1825  *	Certainly, it is not interrupt safe.
1826  *	However, it is internally reenterable wrt itself and fib6_add/fib6_del.
1827  *	It means, that we can modify tree during walking
1828  *	and use this function for garbage collection, clone pruning,
1829  *	cleaning tree when a device goes down etc. etc.
1830  *
1831  *	It guarantees that every node will be traversed,
1832  *	and that it will be traversed only once.
1833  *
1834  *	Callback function w->func may return:
1835  *	0 -> continue walking.
1836  *	positive value -> walking is suspended (used by tree dumps,
1837  *	and probably by gc, if it will be split to several slices)
1838  *	negative value -> terminate walking.
1839  *
1840  *	The function itself returns:
1841  *	0   -> walk is complete.
1842  *	>0  -> walk is incomplete (i.e. suspended)
1843  *	<0  -> walk is terminated by an error.
1844  *
1845  *	This function is called with tb6_lock held.
1846  */
1847 
/* Resume the walker's depth-first traversal from its saved (node,
 * state) position.  States: FWS_S subtree, FWS_L left child, FWS_R
 * right child, FWS_C call func on this node's routes, FWS_U move up
 * to the parent.  See the block comment above for return values.
 */
static int fib6_walk_continue(struct fib6_walker *w)
{
	struct fib6_node *fn, *pn, *left, *right;

	/* w->root should always be table->tb6_root */
	WARN_ON_ONCE(!(w->root->fn_flags & RTN_TL_ROOT));

	for (;;) {
		fn = w->node;
		if (!fn)
			return 0;

		switch (w->state) {
#ifdef CONFIG_IPV6_SUBTREES
		case FWS_S:
			if (FIB6_SUBTREE(fn)) {
				w->node = FIB6_SUBTREE(fn);
				continue;
			}
			w->state = FWS_L;
#endif
			/* fall through */
		case FWS_L:
			left = rcu_dereference_protected(fn->left, 1);
			if (left) {
				w->node = left;
				w->state = FWS_INIT;
				continue;
			}
			w->state = FWS_R;
			/* fall through */
		case FWS_R:
			right = rcu_dereference_protected(fn->right, 1);
			if (right) {
				w->node = right;
				w->state = FWS_INIT;
				continue;
			}
			w->state = FWS_C;
			w->leaf = rcu_dereference_protected(fn->leaf, 1);
			/* fall through */
		case FWS_C:
			if (w->leaf && fn->fn_flags & RTN_RTINFO) {
				int err;

				/* w->skip supports resumable dumps */
				if (w->skip) {
					w->skip--;
					goto skip;
				}

				err = w->func(w);
				if (err)
					return err;

				w->count++;
				continue;
			}
skip:
			w->state = FWS_U;
			/* fall through */
		case FWS_U:
			if (fn == w->root)
				return 0;
			pn = rcu_dereference_protected(fn->parent, 1);
			left = rcu_dereference_protected(pn->left, 1);
			right = rcu_dereference_protected(pn->right, 1);
			w->node = pn;
#ifdef CONFIG_IPV6_SUBTREES
			if (FIB6_SUBTREE(pn) == fn) {
				WARN_ON(!(fn->fn_flags & RTN_ROOT));
				w->state = FWS_L;
				continue;
			}
#endif
			/* Came up from the left? Visit the right next;
			 * from the right? Process the parent itself.
			 */
			if (left == fn) {
				w->state = FWS_R;
				continue;
			}
			if (right == fn) {
				w->state = FWS_C;
				w->leaf = rcu_dereference_protected(w->node->leaf, 1);
				continue;
			}
#if RT6_DEBUG >= 2
			WARN_ON(1);
#endif
		}
	}
}
1937 
1938 static int fib6_walk(struct net *net, struct fib6_walker *w)
1939 {
1940 	int res;
1941 
1942 	w->state = FWS_INIT;
1943 	w->node = w->root;
1944 
1945 	fib6_walker_link(net, w);
1946 	res = fib6_walk_continue(w);
1947 	if (res <= 0)
1948 		fib6_walker_unlink(net, w);
1949 	return res;
1950 }
1951 
/* Walker callback for fib6_clean_tree(): update the node's serial
 * number if requested, then run the cleaner's func over each route,
 * deleting routes for which it returns -1 and skipping the remainder
 * of a multipath group on -2.
 */
static int fib6_clean_node(struct fib6_walker *w)
{
	int res;
	struct fib6_info *rt;
	struct fib6_cleaner *c = container_of(w, struct fib6_cleaner, w);
	struct nl_info info = {
		.nl_net = c->net,
	};

	if (c->sernum != FIB6_NO_SERNUM_CHANGE &&
	    w->node->fn_sernum != c->sernum)
		w->node->fn_sernum = c->sernum;

	/* No func means this is a sernum-only pass (fib6_flush_trees) */
	if (!c->func) {
		WARN_ON_ONCE(c->sernum == FIB6_NO_SERNUM_CHANGE);
		w->leaf = NULL;
		return 0;
	}

	for_each_fib6_walker_rt(w) {
		res = c->func(rt, c->arg);
		if (res == -1) {
			w->leaf = rt;
			res = fib6_del(rt, &info);
			if (res) {
#if RT6_DEBUG >= 2
				pr_debug("%s: del failed: rt=%p@%p err=%d\n",
					 __func__, rt,
					 rcu_access_pointer(rt->fib6_node),
					 res);
#endif
				continue;
			}
			return 0;
		} else if (res == -2) {
			/* Skip to the last sibling of this multipath group */
			if (WARN_ON(!rt->fib6_nsiblings))
				continue;
			rt = list_last_entry(&rt->fib6_siblings,
					     struct fib6_info, fib6_siblings);
			continue;
		}
		WARN_ON(res != 0);
	}
	w->leaf = rt;
	return 0;
}
1997 }
1998 
1999 /*
2000  *	Convenient frontend to tree walker.
2001  *
2002  *	func is called on each route.
2003  *		It may return -2 -> skip multipath route.
2004  *			      -1 -> delete this route.
2005  *		              0  -> continue walking
2006  */
2007 
2008 static void fib6_clean_tree(struct net *net, struct fib6_node *root,
2009 			    int (*func)(struct fib6_info *, void *arg),
2010 			    int sernum, void *arg)
2011 {
2012 	struct fib6_cleaner c;
2013 
2014 	c.w.root = root;
2015 	c.w.func = fib6_clean_node;
2016 	c.w.count = 0;
2017 	c.w.skip = 0;
2018 	c.func = func;
2019 	c.sernum = sernum;
2020 	c.arg = arg;
2021 	c.net = net;
2022 
2023 	fib6_walk(net, &c.w);
2024 }
2025 
2026 static void __fib6_clean_all(struct net *net,
2027 			     int (*func)(struct fib6_info *, void *),
2028 			     int sernum, void *arg)
2029 {
2030 	struct fib6_table *table;
2031 	struct hlist_head *head;
2032 	unsigned int h;
2033 
2034 	rcu_read_lock();
2035 	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
2036 		head = &net->ipv6.fib_table_hash[h];
2037 		hlist_for_each_entry_rcu(table, head, tb6_hlist) {
2038 			spin_lock_bh(&table->tb6_lock);
2039 			fib6_clean_tree(net, &table->tb6_root,
2040 					func, sernum, arg);
2041 			spin_unlock_bh(&table->tb6_lock);
2042 		}
2043 	}
2044 	rcu_read_unlock();
2045 }
2046 
/* Run @func over every route in every table of @net without touching
 * node serial numbers (see fib6_clean_node() for func's contract).
 */
void fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *),
		    void *arg)
{
	__fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg);
}
2052 
2053 static void fib6_flush_trees(struct net *net)
2054 {
2055 	int new_sernum = fib6_new_sernum(net);
2056 
2057 	__fib6_clean_all(net, NULL, new_sernum, NULL);
2058 }
2059 
2060 /*
2061  *	Garbage collection
2062  */
2063 
2064 static int fib6_age(struct fib6_info *rt, void *arg)
2065 {
2066 	struct fib6_gc_args *gc_args = arg;
2067 	unsigned long now = jiffies;
2068 
2069 	/*
2070 	 *	check addrconf expiration here.
2071 	 *	Routes are expired even if they are in use.
2072 	 */
2073 
2074 	if (rt->fib6_flags & RTF_EXPIRES && rt->expires) {
2075 		if (time_after(now, rt->expires)) {
2076 			RT6_TRACE("expiring %p\n", rt);
2077 			return -1;
2078 		}
2079 		gc_args->more++;
2080 	}
2081 
2082 	/*	Also age clones in the exception table.
2083 	 *	Note, that clones are aged out
2084 	 *	only if they are not in use now.
2085 	 */
2086 	rt6_age_exceptions(rt, gc_args, now);
2087 
2088 	return 0;
2089 }
2090 
/* Run one garbage-collection pass over all tables of @net.  A non-zero
 * @expires overrides the sysctl gc interval for the follow-up timer.
 * With @force the gc lock is waited on; otherwise a concurrent run
 * just reschedules us one second out.  The timer is re-armed only if
 * unexpired RTF_EXPIRES routes remain (gc_args.more).
 */
void fib6_run_gc(unsigned long expires, struct net *net, bool force)
{
	struct fib6_gc_args gc_args;
	unsigned long now;

	if (force) {
		spin_lock_bh(&net->ipv6.fib6_gc_lock);
	} else if (!spin_trylock_bh(&net->ipv6.fib6_gc_lock)) {
		mod_timer(&net->ipv6.ip6_fib_timer, jiffies + HZ);
		return;
	}
	gc_args.timeout = expires ? (int)expires :
			  net->ipv6.sysctl.ip6_rt_gc_interval;
	gc_args.more = 0;

	fib6_clean_all(net, fib6_age, &gc_args);
	now = jiffies;
	net->ipv6.ip6_rt_last_gc = now;

	if (gc_args.more)
		mod_timer(&net->ipv6.ip6_fib_timer,
			  round_jiffies(now
					+ net->ipv6.sysctl.ip6_rt_gc_interval));
	else
		del_timer(&net->ipv6.ip6_fib_timer);
	spin_unlock_bh(&net->ipv6.fib6_gc_lock);
}
2118 
/* ip6_fib_timer expiry: recover the owning netns and force a GC run. */
static void fib6_gc_timer_cb(struct timer_list *t)
{
	struct net *arg = from_timer(arg, t, ipv6.ip6_fib_timer);

	fib6_run_gc(0, arg, true);
}
2125 
/* Per-netns FIB setup: notifier chain, GC lock/timer, stats, the table
 * hash, and the main (plus optional local) table with null_entry as
 * the root leaf.  On any allocation failure everything allocated so
 * far is unwound via the goto chain and -ENOMEM is returned.
 */
static int __net_init fib6_net_init(struct net *net)
{
	size_t size = sizeof(struct hlist_head) * FIB6_TABLE_HASHSZ;
	int err;

	err = fib6_notifier_init(net);
	if (err)
		return err;

	spin_lock_init(&net->ipv6.fib6_gc_lock);
	rwlock_init(&net->ipv6.fib6_walker_lock);
	INIT_LIST_HEAD(&net->ipv6.fib6_walkers);
	timer_setup(&net->ipv6.ip6_fib_timer, fib6_gc_timer_cb, 0);

	net->ipv6.rt6_stats = kzalloc(sizeof(*net->ipv6.rt6_stats), GFP_KERNEL);
	if (!net->ipv6.rt6_stats)
		goto out_timer;

	/* Avoid false sharing : Use at least a full cache line */
	size = max_t(size_t, size, L1_CACHE_BYTES);

	net->ipv6.fib_table_hash = kzalloc(size, GFP_KERNEL);
	if (!net->ipv6.fib_table_hash)
		goto out_rt6_stats;

	net->ipv6.fib6_main_tbl = kzalloc(sizeof(*net->ipv6.fib6_main_tbl),
					  GFP_KERNEL);
	if (!net->ipv6.fib6_main_tbl)
		goto out_fib_table_hash;

	net->ipv6.fib6_main_tbl->tb6_id = RT6_TABLE_MAIN;
	rcu_assign_pointer(net->ipv6.fib6_main_tbl->tb6_root.leaf,
			   net->ipv6.fib6_null_entry);
	net->ipv6.fib6_main_tbl->tb6_root.fn_flags =
		RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
	inet_peer_base_init(&net->ipv6.fib6_main_tbl->tb6_peers);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_local_tbl = kzalloc(sizeof(*net->ipv6.fib6_local_tbl),
					   GFP_KERNEL);
	if (!net->ipv6.fib6_local_tbl)
		goto out_fib6_main_tbl;
	net->ipv6.fib6_local_tbl->tb6_id = RT6_TABLE_LOCAL;
	rcu_assign_pointer(net->ipv6.fib6_local_tbl->tb6_root.leaf,
			   net->ipv6.fib6_null_entry);
	net->ipv6.fib6_local_tbl->tb6_root.fn_flags =
		RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
	inet_peer_base_init(&net->ipv6.fib6_local_tbl->tb6_peers);
#endif
	fib6_tables_init(net);

	return 0;

	/* Error unwinding: each label frees what was allocated before it. */
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_fib6_main_tbl:
	kfree(net->ipv6.fib6_main_tbl);
#endif
out_fib_table_hash:
	kfree(net->ipv6.fib_table_hash);
out_rt6_stats:
	kfree(net->ipv6.rt6_stats);
out_timer:
	fib6_notifier_exit(net);
	return -ENOMEM;
}
2191 
2192 static void fib6_net_exit(struct net *net)
2193 {
2194 	unsigned int i;
2195 
2196 	del_timer_sync(&net->ipv6.ip6_fib_timer);
2197 
2198 	for (i = 0; i < FIB6_TABLE_HASHSZ; i++) {
2199 		struct hlist_head *head = &net->ipv6.fib_table_hash[i];
2200 		struct hlist_node *tmp;
2201 		struct fib6_table *tb;
2202 
2203 		hlist_for_each_entry_safe(tb, tmp, head, tb6_hlist) {
2204 			hlist_del(&tb->tb6_hlist);
2205 			fib6_free_table(tb);
2206 		}
2207 	}
2208 
2209 	kfree(net->ipv6.fib_table_hash);
2210 	kfree(net->ipv6.rt6_stats);
2211 	fib6_notifier_exit(net);
2212 }
2213 
/* Per-network-namespace setup/teardown hooks for the IPv6 FIB. */
static struct pernet_operations fib6_net_ops = {
	.init = fib6_net_init,
	.exit = fib6_net_exit,
};
2218 
/* Module init: create the fib6_node slab cache, register the pernet
 * ops and the RTM_GETROUTE dump handler, and install the tree-flush
 * hook.  Each failure path unwinds earlier registrations.
 */
int __init fib6_init(void)
{
	int ret = -ENOMEM;

	fib6_node_kmem = kmem_cache_create("fib6_nodes",
					   sizeof(struct fib6_node),
					   0, SLAB_HWCACHE_ALIGN,
					   NULL);
	if (!fib6_node_kmem)
		goto out;

	ret = register_pernet_subsys(&fib6_net_ops);
	if (ret)
		goto out_kmem_cache_create;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE, NULL,
				   inet6_dump_fib, 0);
	if (ret)
		goto out_unregister_subsys;

	__fib6_flush_trees = fib6_flush_trees;
out:
	return ret;

out_unregister_subsys:
	unregister_pernet_subsys(&fib6_net_ops);
out_kmem_cache_create:
	kmem_cache_destroy(fib6_node_kmem);
	goto out;
}
2249 
/* Module teardown: unregister pernet ops (freeing all tables/nodes)
 * before destroying the node slab cache they were allocated from.
 */
void fib6_gc_cleanup(void)
{
	unregister_pernet_subsys(&fib6_net_ops);
	kmem_cache_destroy(fib6_node_kmem);
}
2255 
2256 #ifdef CONFIG_PROC_FS
/* Emit one /proc/net/ipv6_route line for route @v: dst prefix, src
 * prefix (zeros without subtrees), gateway (zeros if not RTF_GATEWAY),
 * metric, refcount, a zero placeholder, flags, and device name.
 */
static int ipv6_route_seq_show(struct seq_file *seq, void *v)
{
	struct fib6_info *rt = v;
	struct ipv6_route_iter *iter = seq->private;
	const struct net_device *dev;

	seq_printf(seq, "%pi6 %02x ", &rt->fib6_dst.addr, rt->fib6_dst.plen);

#ifdef CONFIG_IPV6_SUBTREES
	seq_printf(seq, "%pi6 %02x ", &rt->fib6_src.addr, rt->fib6_src.plen);
#else
	seq_puts(seq, "00000000000000000000000000000000 00 ");
#endif
	if (rt->fib6_flags & RTF_GATEWAY)
		seq_printf(seq, "%pi6", &rt->fib6_nh.nh_gw);
	else
		seq_puts(seq, "00000000000000000000000000000000");

	dev = rt->fib6_nh.nh_dev;
	seq_printf(seq, " %08x %08x %08x %08x %8s\n",
		   rt->fib6_metric, atomic_read(&rt->fib6_ref), 0,
		   rt->fib6_flags, dev ? dev->name : "");
	/* Clearing w.leaf lets the walker advance past this node. */
	iter->w.leaf = NULL;
	return 0;
}
2282 
/* Walker callback for the /proc iterator: consume iter->skip leaf-list
 * entries before yielding.  Return 1 to suspend the walk with w->leaf
 * positioned on the entry to show, 0 to keep walking to the next node.
 */
static int ipv6_route_yield(struct fib6_walker *w)
{
	struct ipv6_route_iter *iter = w->args;

	if (!iter->skip)
		return 1;

	do {
		iter->w.leaf = rcu_dereference_protected(
				iter->w.leaf->fib6_next,
				lockdep_is_held(&iter->tbl->tb6_lock));
		iter->skip--;
		if (!iter->skip && iter->w.leaf)
			return 1;
	} while (iter->w.leaf);

	return 0;
}
2301 
2302 static void ipv6_route_seq_setup_walk(struct ipv6_route_iter *iter,
2303 				      struct net *net)
2304 {
2305 	memset(&iter->w, 0, sizeof(iter->w));
2306 	iter->w.func = ipv6_route_yield;
2307 	iter->w.root = &iter->tbl->tb6_root;
2308 	iter->w.state = FWS_INIT;
2309 	iter->w.node = iter->w.root;
2310 	iter->w.args = iter;
2311 	iter->sernum = iter->w.root->fn_sernum;
2312 	INIT_LIST_HEAD(&iter->w.lh);
2313 	fib6_walker_link(net, &iter->w);
2314 }
2315 
2316 static struct fib6_table *ipv6_route_seq_next_table(struct fib6_table *tbl,
2317 						    struct net *net)
2318 {
2319 	unsigned int h;
2320 	struct hlist_node *node;
2321 
2322 	if (tbl) {
2323 		h = (tbl->tb6_id & (FIB6_TABLE_HASHSZ - 1)) + 1;
2324 		node = rcu_dereference_bh(hlist_next_rcu(&tbl->tb6_hlist));
2325 	} else {
2326 		h = 0;
2327 		node = NULL;
2328 	}
2329 
2330 	while (!node && h < FIB6_TABLE_HASHSZ) {
2331 		node = rcu_dereference_bh(
2332 			hlist_first_rcu(&net->ipv6.fib_table_hash[h++]));
2333 	}
2334 	return hlist_entry_safe(node, struct fib6_table, tb6_hlist);
2335 }
2336 
2337 static void ipv6_route_check_sernum(struct ipv6_route_iter *iter)
2338 {
2339 	if (iter->sernum != iter->w.root->fn_sernum) {
2340 		iter->sernum = iter->w.root->fn_sernum;
2341 		iter->w.state = FWS_INIT;
2342 		iter->w.node = iter->w.root;
2343 		WARN_ON(iter->w.skip);
2344 		iter->w.skip = iter->w.count;
2345 	}
2346 }
2347 
/*
 * seq_file ->next() handler: return the next fib6_info to dump.
 *
 * Fast path: follow the current leaf's ->fib6_next sibling chain.
 * When that chain is exhausted (or on the very first call, v == NULL),
 * resume the suspended tree walk under the table lock; a positive
 * return from fib6_walk_continue() means a new leaf was yielded.
 * When the current table's walk completes (or errors), unlink the
 * walker; on normal completion move to the next table in the hash and
 * restart the walk there.
 */
static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int r;
	struct fib6_info *n;
	struct net *net = seq_file_net(seq);
	struct ipv6_route_iter *iter = seq->private;

	if (!v)
		goto iter_table;

	n = rcu_dereference_bh(((struct fib6_info *)v)->fib6_next);
	if (n) {
		++*pos;
		return n;
	}

iter_table:
	/* Rewind the walker first if the tree changed under us. */
	ipv6_route_check_sernum(iter);
	spin_lock_bh(&iter->tbl->tb6_lock);
	r = fib6_walk_continue(&iter->w);
	spin_unlock_bh(&iter->tbl->tb6_lock);
	if (r > 0) {
		/* Don't advance *pos for the initial element (v == NULL). */
		if (v)
			++*pos;
		return iter->w.leaf;
	} else if (r < 0) {
		/* Walk error: stop the iteration entirely. */
		fib6_walker_unlink(net, &iter->w);
		return NULL;
	}
	fib6_walker_unlink(net, &iter->w);

	iter->tbl = ipv6_route_seq_next_table(iter->tbl, net);
	if (!iter->tbl)
		return NULL;

	ipv6_route_seq_setup_walk(iter, net);
	goto iter_table;
}
2386 
2387 static void *ipv6_route_seq_start(struct seq_file *seq, loff_t *pos)
2388 	__acquires(RCU_BH)
2389 {
2390 	struct net *net = seq_file_net(seq);
2391 	struct ipv6_route_iter *iter = seq->private;
2392 
2393 	rcu_read_lock_bh();
2394 	iter->tbl = ipv6_route_seq_next_table(NULL, net);
2395 	iter->skip = *pos;
2396 
2397 	if (iter->tbl) {
2398 		ipv6_route_seq_setup_walk(iter, net);
2399 		return ipv6_route_seq_next(seq, NULL, pos);
2400 	} else {
2401 		return NULL;
2402 	}
2403 }
2404 
2405 static bool ipv6_route_iter_active(struct ipv6_route_iter *iter)
2406 {
2407 	struct fib6_walker *w = &iter->w;
2408 	return w->node && !(w->state == FWS_U && w->node == w->root);
2409 }
2410 
2411 static void ipv6_route_seq_stop(struct seq_file *seq, void *v)
2412 	__releases(RCU_BH)
2413 {
2414 	struct net *net = seq_file_net(seq);
2415 	struct ipv6_route_iter *iter = seq->private;
2416 
2417 	if (ipv6_route_iter_active(iter))
2418 		fib6_walker_unlink(net, &iter->w);
2419 
2420 	rcu_read_unlock_bh();
2421 }
2422 
/*
 * seq_file operations for dumping the IPv6 routing tables via procfs
 * (exported; wired up by the proc setup code elsewhere).
 */
const struct seq_operations ipv6_route_seq_ops = {
	.start	= ipv6_route_seq_start,
	.next	= ipv6_route_seq_next,
	.stop	= ipv6_route_seq_stop,
	.show	= ipv6_route_seq_show
};
2429 #endif /* CONFIG_PROC_FS */
2430