xref: /openbmc/linux/net/ipv6/route.c (revision 1d053da910947afccec96d90892c0f5488c7a9cf)
1 /*
2  *	Linux INET6 implementation
3  *	FIB front-end.
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13 
14 /*	Changes:
15  *
16  *	YOSHIFUJI Hideaki @USAGI
17  *		reworked default router selection.
18  *		- respect outgoing interface
19  *		- select from (probably) reachable routers (i.e.
20  *		routers in REACHABLE, STALE, DELAY or PROBE states).
21  *		- always select the same router if it is (probably)
22  *		reachable.  otherwise, round-robin the list.
23  *	Ville Nuorvala
24  *		Fixed routing subtrees.
25  */
26 
27 #define pr_fmt(fmt) "IPv6: " fmt
28 
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <trace/events/fib6.h>
67 
68 #include <linux/uaccess.h>
69 
70 #ifdef CONFIG_SYSCTL
71 #include <linux/sysctl.h>
72 #endif
73 
/*
 * Outcome of the next-hop neighbour check used while scoring routes.
 * Negative values are failures: rt6_score_route() returns them as-is
 * instead of a device/preference score.
 */
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,		/* find_match() skips the route */
	RT6_NUD_FAIL_PROBE = -2,	/* neighbour entry is in NUD_FAILED state */
	RT6_NUD_FAIL_DO_RR = -1,	/* find_match() scores it 0 and asks for round-robin */
	RT6_NUD_SUCCEED = 1
};
80 
81 static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
82 static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
83 static unsigned int	 ip6_mtu(const struct dst_entry *dst);
84 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
85 static void		ip6_dst_destroy(struct dst_entry *);
86 static void		ip6_dst_ifdown(struct dst_entry *,
87 				       struct net_device *dev, int how);
88 static int		 ip6_dst_gc(struct dst_ops *ops);
89 
90 static int		ip6_pkt_discard(struct sk_buff *skb);
91 static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
92 static int		ip6_pkt_prohibit(struct sk_buff *skb);
93 static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
94 static void		ip6_link_failure(struct sk_buff *skb);
95 static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
96 					   struct sk_buff *skb, u32 mtu);
97 static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
98 					struct sk_buff *skb);
99 static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
100 static size_t rt6_nlmsg_size(struct fib6_info *rt);
101 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
102 			 struct fib6_info *rt, struct dst_entry *dst,
103 			 struct in6_addr *dest, struct in6_addr *src,
104 			 int iif, int type, u32 portid, u32 seq,
105 			 unsigned int flags);
106 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
107 					   struct in6_addr *daddr,
108 					   struct in6_addr *saddr);
109 
110 #ifdef CONFIG_IPV6_ROUTE_INFO
111 static struct fib6_info *rt6_add_route_info(struct net *net,
112 					   const struct in6_addr *prefix, int prefixlen,
113 					   const struct in6_addr *gwaddr,
114 					   struct net_device *dev,
115 					   unsigned int pref);
116 static struct fib6_info *rt6_get_route_info(struct net *net,
117 					   const struct in6_addr *prefix, int prefixlen,
118 					   const struct in6_addr *gwaddr,
119 					   struct net_device *dev);
120 #endif
121 
/* A lock-protected list of rt6_info entries; one instance exists per CPU. */
struct uncached_list {
	spinlock_t		lock;	/* taken around every list update in this file */
	struct list_head	head;	/* entries are linked through rt6_info::rt6i_uncached */
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
128 
129 void rt6_uncached_list_add(struct rt6_info *rt)
130 {
131 	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
132 
133 	rt->rt6i_uncached_list = ul;
134 
135 	spin_lock_bh(&ul->lock);
136 	list_add_tail(&rt->rt6i_uncached, &ul->head);
137 	spin_unlock_bh(&ul->lock);
138 }
139 
140 void rt6_uncached_list_del(struct rt6_info *rt)
141 {
142 	if (!list_empty(&rt->rt6i_uncached)) {
143 		struct uncached_list *ul = rt->rt6i_uncached_list;
144 		struct net *net = dev_net(rt->dst.dev);
145 
146 		spin_lock_bh(&ul->lock);
147 		list_del(&rt->rt6i_uncached);
148 		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
149 		spin_unlock_bh(&ul->lock);
150 	}
151 }
152 
153 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
154 {
155 	struct net_device *loopback_dev = net->loopback_dev;
156 	int cpu;
157 
158 	if (dev == loopback_dev)
159 		return;
160 
161 	for_each_possible_cpu(cpu) {
162 		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
163 		struct rt6_info *rt;
164 
165 		spin_lock_bh(&ul->lock);
166 		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
167 			struct inet6_dev *rt_idev = rt->rt6i_idev;
168 			struct net_device *rt_dev = rt->dst.dev;
169 
170 			if (rt_idev->dev == dev) {
171 				rt->rt6i_idev = in6_dev_get(loopback_dev);
172 				in6_dev_put(rt_idev);
173 			}
174 
175 			if (rt_dev == dev) {
176 				rt->dst.dev = loopback_dev;
177 				dev_hold(rt->dst.dev);
178 				dev_put(rt_dev);
179 			}
180 		}
181 		spin_unlock_bh(&ul->lock);
182 	}
183 }
184 
185 static inline const void *choose_neigh_daddr(const struct in6_addr *p,
186 					     struct sk_buff *skb,
187 					     const void *daddr)
188 {
189 	if (!ipv6_addr_any(p))
190 		return (const void *) p;
191 	else if (skb)
192 		return &ipv6_hdr(skb)->daddr;
193 	return daddr;
194 }
195 
196 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
197 				   struct net_device *dev,
198 				   struct sk_buff *skb,
199 				   const void *daddr)
200 {
201 	struct neighbour *n;
202 
203 	daddr = choose_neigh_daddr(gw, skb, daddr);
204 	n = __ipv6_neigh_lookup(dev, daddr);
205 	if (n)
206 		return n;
207 	return neigh_create(&nd_tbl, daddr, dev);
208 }
209 
210 static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
211 					      struct sk_buff *skb,
212 					      const void *daddr)
213 {
214 	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);
215 
216 	return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
217 }
218 
219 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
220 {
221 	struct net_device *dev = dst->dev;
222 	struct rt6_info *rt = (struct rt6_info *)dst;
223 
224 	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
225 	if (!daddr)
226 		return;
227 	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
228 		return;
229 	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
230 		return;
231 	__ipv6_confirm_neigh(dev, daddr);
232 }
233 
/*
 * dst_ops callbacks for regular IPv6 routes.
 * NOTE(review): named a template; presumably copied into each
 * namespace's net->ipv6.ip6_dst_ops (the copy is not visible here).
 */
static struct dst_ops ip6_dst_ops_template = {
	.family			=	AF_INET6,
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
	.default_advmss		=	ip6_default_advmss,
	.mtu			=	ip6_mtu,
	.cow_metrics		=	dst_cow_metrics_generic,
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
	.redirect		=	rt6_do_redirect,
	.local_out		=	__ip6_local_out,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
	.confirm_neigh		=	ip6_confirm_neigh,
};
252 
253 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
254 {
255 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
256 
257 	return mtu ? : dst->dev->mtu;
258 }
259 
/* update_pmtu callback for blackhole dsts: intentionally does nothing. */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}
264 
/* redirect callback for blackhole dsts: intentionally does nothing. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}
269 
/*
 * dst_ops for blackhole dsts.  Compared with ip6_dst_ops_template the
 * PMTU-update and redirect callbacks are no-ops and the MTU comes from
 * ip6_blackhole_mtu().
 */
static struct dst_ops ip6_dst_blackhole_ops = {
	.family			=	AF_INET6,
	.destroy		=	ip6_dst_destroy,
	.check			=	ip6_dst_check,
	.mtu			=	ip6_blackhole_mtu,
	.default_advmss		=	ip6_default_advmss,
	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
	.redirect		=	ip6_rt_blackhole_redirect,
	.cow_metrics		=	dst_cow_metrics_generic,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
};
281 
/* All-zero metrics array; the hop-limit slot is spelled out explicitly. */
static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};
285 
/*
 * Initial contents for a "no route" fib6_info: a reject entry of type
 * RTN_UNREACHABLE with the largest possible metric, using the shared
 * dst_default_metrics.
 * NOTE(review): presumably the source for net->ipv6.fib6_null_entry;
 * the copy is not visible here.
 */
static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol  = RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= ATOMIC_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};
294 
/*
 * Initial contents for a "network unreachable" dst: packets are handed
 * to ip6_pkt_discard()/ip6_pkt_discard_out() and dst.error is
 * -ENETUNREACH.
 * NOTE(review): presumably the source for net->ipv6.ip6_null_entry;
 * the copy is not visible here.
 */
static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};
306 
307 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
308 
/*
 * Initial contents for a "prohibit" dst: packets are handed to
 * ip6_pkt_prohibit()/ip6_pkt_prohibit_out() and dst.error is -EACCES.
 * Only built with CONFIG_IPV6_MULTIPLE_TABLES.
 */
static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};
320 
/*
 * Initial contents for a "blackhole" dst: packets are handed to
 * dst_discard()/dst_discard_out() and dst.error is -EINVAL.
 * Only built with CONFIG_IPV6_MULTIPLE_TABLES.
 */
static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};
332 
333 #endif
334 
335 static void rt6_info_init(struct rt6_info *rt)
336 {
337 	struct dst_entry *dst = &rt->dst;
338 
339 	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
340 	INIT_LIST_HEAD(&rt->rt6i_uncached);
341 }
342 
343 /* allocate dst with ip6_dst_ops */
344 struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
345 			       int flags)
346 {
347 	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
348 					1, DST_OBSOLETE_FORCE_CHK, flags);
349 
350 	if (rt) {
351 		rt6_info_init(rt);
352 		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
353 	}
354 
355 	return rt;
356 }
357 EXPORT_SYMBOL(ip6_dst_alloc);
358 
359 static void ip6_dst_destroy(struct dst_entry *dst)
360 {
361 	struct rt6_info *rt = (struct rt6_info *)dst;
362 	struct fib6_info *from;
363 	struct inet6_dev *idev;
364 
365 	dst_destroy_metrics_generic(dst);
366 	rt6_uncached_list_del(rt);
367 
368 	idev = rt->rt6i_idev;
369 	if (idev) {
370 		rt->rt6i_idev = NULL;
371 		in6_dev_put(idev);
372 	}
373 
374 	rcu_read_lock();
375 	from = rcu_dereference(rt->from);
376 	rcu_assign_pointer(rt->from, NULL);
377 	fib6_info_release(from);
378 	rcu_read_unlock();
379 }
380 
381 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
382 			   int how)
383 {
384 	struct rt6_info *rt = (struct rt6_info *)dst;
385 	struct inet6_dev *idev = rt->rt6i_idev;
386 	struct net_device *loopback_dev =
387 		dev_net(dev)->loopback_dev;
388 
389 	if (idev && idev->dev != loopback_dev) {
390 		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
391 		if (loopback_idev) {
392 			rt->rt6i_idev = loopback_idev;
393 			in6_dev_put(idev);
394 		}
395 	}
396 }
397 
398 static bool __rt6_check_expired(const struct rt6_info *rt)
399 {
400 	if (rt->rt6i_flags & RTF_EXPIRES)
401 		return time_after(jiffies, rt->dst.expires);
402 	else
403 		return false;
404 }
405 
406 static bool rt6_check_expired(const struct rt6_info *rt)
407 {
408 	struct fib6_info *from;
409 
410 	from = rcu_dereference(rt->from);
411 
412 	if (rt->rt6i_flags & RTF_EXPIRES) {
413 		if (time_after(jiffies, rt->dst.expires))
414 			return true;
415 	} else if (from) {
416 		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
417 			fib6_check_expired(from);
418 	}
419 	return false;
420 }
421 
422 struct fib6_info *fib6_multipath_select(const struct net *net,
423 					struct fib6_info *match,
424 					struct flowi6 *fl6, int oif,
425 					const struct sk_buff *skb,
426 					int strict)
427 {
428 	struct fib6_info *sibling, *next_sibling;
429 
430 	/* We might have already computed the hash for ICMPv6 errors. In such
431 	 * case it will always be non-zero. Otherwise now is the time to do it.
432 	 */
433 	if (!fl6->mp_hash)
434 		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
435 
436 	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
437 		return match;
438 
439 	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
440 				 fib6_siblings) {
441 		int nh_upper_bound;
442 
443 		nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
444 		if (fl6->mp_hash > nh_upper_bound)
445 			continue;
446 		if (rt6_score_route(sibling, oif, strict) < 0)
447 			break;
448 		match = sibling;
449 		break;
450 	}
451 
452 	return match;
453 }
454 
455 /*
456  *	Route lookup. rcu_read_lock() should be held.
457  */
458 
459 static inline struct fib6_info *rt6_device_match(struct net *net,
460 						 struct fib6_info *rt,
461 						    const struct in6_addr *saddr,
462 						    int oif,
463 						    int flags)
464 {
465 	struct fib6_info *sprt;
466 
467 	if (!oif && ipv6_addr_any(saddr) &&
468 	    !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
469 		return rt;
470 
471 	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
472 		const struct net_device *dev = sprt->fib6_nh.nh_dev;
473 
474 		if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
475 			continue;
476 
477 		if (oif) {
478 			if (dev->ifindex == oif)
479 				return sprt;
480 		} else {
481 			if (ipv6_chk_addr(net, saddr, dev,
482 					  flags & RT6_LOOKUP_F_IFACE))
483 				return sprt;
484 		}
485 	}
486 
487 	if (oif && flags & RT6_LOOKUP_F_IFACE)
488 		return net->ipv6.fib6_null_entry;
489 
490 	return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
491 }
492 
493 #ifdef CONFIG_IPV6_ROUTER_PREF
/* Deferred router-probe request queued by rt6_probe() and run by rt6_probe_deferred(). */
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;		/* address to send the neighbour solicitation for */
	struct net_device *dev;		/* held by rt6_probe(), released by the worker */
};
499 
500 static void rt6_probe_deferred(struct work_struct *w)
501 {
502 	struct in6_addr mcaddr;
503 	struct __rt6_probe_work *work =
504 		container_of(w, struct __rt6_probe_work, work);
505 
506 	addrconf_addr_solict_mult(&work->target, &mcaddr);
507 	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
508 	dev_put(work->dev);
509 	kfree(work);
510 }
511 
512 static void rt6_probe(struct fib6_info *rt)
513 {
514 	struct __rt6_probe_work *work;
515 	const struct in6_addr *nh_gw;
516 	struct neighbour *neigh;
517 	struct net_device *dev;
518 
519 	/*
520 	 * Okay, this does not seem to be appropriate
521 	 * for now, however, we need to check if it
522 	 * is really so; aka Router Reachability Probing.
523 	 *
524 	 * Router Reachability Probe MUST be rate-limited
525 	 * to no more than one per minute.
526 	 */
527 	if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
528 		return;
529 
530 	nh_gw = &rt->fib6_nh.nh_gw;
531 	dev = rt->fib6_nh.nh_dev;
532 	rcu_read_lock_bh();
533 	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
534 	if (neigh) {
535 		struct inet6_dev *idev;
536 
537 		if (neigh->nud_state & NUD_VALID)
538 			goto out;
539 
540 		idev = __in6_dev_get(dev);
541 		work = NULL;
542 		write_lock(&neigh->lock);
543 		if (!(neigh->nud_state & NUD_VALID) &&
544 		    time_after(jiffies,
545 			       neigh->updated + idev->cnf.rtr_probe_interval)) {
546 			work = kmalloc(sizeof(*work), GFP_ATOMIC);
547 			if (work)
548 				__neigh_set_probe_once(neigh);
549 		}
550 		write_unlock(&neigh->lock);
551 	} else {
552 		work = kmalloc(sizeof(*work), GFP_ATOMIC);
553 	}
554 
555 	if (work) {
556 		INIT_WORK(&work->work, rt6_probe_deferred);
557 		work->target = *nh_gw;
558 		dev_hold(dev);
559 		work->dev = dev;
560 		schedule_work(&work->work);
561 	}
562 
563 out:
564 	rcu_read_unlock_bh();
565 }
566 #else
/* Without CONFIG_IPV6_ROUTER_PREF router probing is compiled out. */
static inline void rt6_probe(struct fib6_info *rt)
{
}
570 #endif
571 
572 /*
573  * Default Router Selection (RFC 2461 6.3.6)
574  */
575 static inline int rt6_check_dev(struct fib6_info *rt, int oif)
576 {
577 	const struct net_device *dev = rt->fib6_nh.nh_dev;
578 
579 	if (!oif || dev->ifindex == oif)
580 		return 2;
581 	return 0;
582 }
583 
584 static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
585 {
586 	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
587 	struct neighbour *neigh;
588 
589 	if (rt->fib6_flags & RTF_NONEXTHOP ||
590 	    !(rt->fib6_flags & RTF_GATEWAY))
591 		return RT6_NUD_SUCCEED;
592 
593 	rcu_read_lock_bh();
594 	neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
595 					  &rt->fib6_nh.nh_gw);
596 	if (neigh) {
597 		read_lock(&neigh->lock);
598 		if (neigh->nud_state & NUD_VALID)
599 			ret = RT6_NUD_SUCCEED;
600 #ifdef CONFIG_IPV6_ROUTER_PREF
601 		else if (!(neigh->nud_state & NUD_FAILED))
602 			ret = RT6_NUD_SUCCEED;
603 		else
604 			ret = RT6_NUD_FAIL_PROBE;
605 #endif
606 		read_unlock(&neigh->lock);
607 	} else {
608 		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
609 		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
610 	}
611 	rcu_read_unlock_bh();
612 
613 	return ret;
614 }
615 
616 static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
617 {
618 	int m;
619 
620 	m = rt6_check_dev(rt, oif);
621 	if (!m && (strict & RT6_LOOKUP_F_IFACE))
622 		return RT6_NUD_FAIL_HARD;
623 #ifdef CONFIG_IPV6_ROUTER_PREF
624 	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
625 #endif
626 	if (strict & RT6_LOOKUP_F_REACHABLE) {
627 		int n = rt6_check_neigh(rt);
628 		if (n < 0)
629 			return n;
630 	}
631 	return m;
632 }
633 
634 /* called with rc_read_lock held */
635 static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
636 {
637 	const struct net_device *dev = fib6_info_nh_dev(f6i);
638 	bool rc = false;
639 
640 	if (dev) {
641 		const struct inet6_dev *idev = __in6_dev_get(dev);
642 
643 		rc = !!idev->cnf.ignore_routes_with_linkdown;
644 	}
645 
646 	return rc;
647 }
648 
649 static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
650 				   int *mpri, struct fib6_info *match,
651 				   bool *do_rr)
652 {
653 	int m;
654 	bool match_do_rr = false;
655 
656 	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
657 		goto out;
658 
659 	if (fib6_ignore_linkdown(rt) &&
660 	    rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
661 	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
662 		goto out;
663 
664 	if (fib6_check_expired(rt))
665 		goto out;
666 
667 	m = rt6_score_route(rt, oif, strict);
668 	if (m == RT6_NUD_FAIL_DO_RR) {
669 		match_do_rr = true;
670 		m = 0; /* lowest valid score */
671 	} else if (m == RT6_NUD_FAIL_HARD) {
672 		goto out;
673 	}
674 
675 	if (strict & RT6_LOOKUP_F_REACHABLE)
676 		rt6_probe(rt);
677 
678 	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
679 	if (m > *mpri) {
680 		*do_rr = match_do_rr;
681 		*mpri = m;
682 		match = rt;
683 	}
684 out:
685 	return match;
686 }
687 
688 static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
689 				     struct fib6_info *leaf,
690 				     struct fib6_info *rr_head,
691 				     u32 metric, int oif, int strict,
692 				     bool *do_rr)
693 {
694 	struct fib6_info *rt, *match, *cont;
695 	int mpri = -1;
696 
697 	match = NULL;
698 	cont = NULL;
699 	for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
700 		if (rt->fib6_metric != metric) {
701 			cont = rt;
702 			break;
703 		}
704 
705 		match = find_match(rt, oif, strict, &mpri, match, do_rr);
706 	}
707 
708 	for (rt = leaf; rt && rt != rr_head;
709 	     rt = rcu_dereference(rt->fib6_next)) {
710 		if (rt->fib6_metric != metric) {
711 			cont = rt;
712 			break;
713 		}
714 
715 		match = find_match(rt, oif, strict, &mpri, match, do_rr);
716 	}
717 
718 	if (match || !cont)
719 		return match;
720 
721 	for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
722 		match = find_match(rt, oif, strict, &mpri, match, do_rr);
723 
724 	return match;
725 }
726 
727 static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
728 				   int oif, int strict)
729 {
730 	struct fib6_info *leaf = rcu_dereference(fn->leaf);
731 	struct fib6_info *match, *rt0;
732 	bool do_rr = false;
733 	int key_plen;
734 
735 	if (!leaf || leaf == net->ipv6.fib6_null_entry)
736 		return net->ipv6.fib6_null_entry;
737 
738 	rt0 = rcu_dereference(fn->rr_ptr);
739 	if (!rt0)
740 		rt0 = leaf;
741 
742 	/* Double check to make sure fn is not an intermediate node
743 	 * and fn->leaf does not points to its child's leaf
744 	 * (This might happen if all routes under fn are deleted from
745 	 * the tree and fib6_repair_tree() is called on the node.)
746 	 */
747 	key_plen = rt0->fib6_dst.plen;
748 #ifdef CONFIG_IPV6_SUBTREES
749 	if (rt0->fib6_src.plen)
750 		key_plen = rt0->fib6_src.plen;
751 #endif
752 	if (fn->fn_bit != key_plen)
753 		return net->ipv6.fib6_null_entry;
754 
755 	match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
756 			     &do_rr);
757 
758 	if (do_rr) {
759 		struct fib6_info *next = rcu_dereference(rt0->fib6_next);
760 
761 		/* no entries matched; do round-robin */
762 		if (!next || next->fib6_metric != rt0->fib6_metric)
763 			next = leaf;
764 
765 		if (next != rt0) {
766 			spin_lock_bh(&leaf->fib6_table->tb6_lock);
767 			/* make sure next is not being deleted from the tree */
768 			if (next->fib6_node)
769 				rcu_assign_pointer(fn->rr_ptr, next);
770 			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
771 		}
772 	}
773 
774 	return match ? match : net->ipv6.fib6_null_entry;
775 }
776 
777 static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
778 {
779 	return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
780 }
781 
782 #ifdef CONFIG_IPV6_ROUTE_INFO
783 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
784 		  const struct in6_addr *gwaddr)
785 {
786 	struct net *net = dev_net(dev);
787 	struct route_info *rinfo = (struct route_info *) opt;
788 	struct in6_addr prefix_buf, *prefix;
789 	unsigned int pref;
790 	unsigned long lifetime;
791 	struct fib6_info *rt;
792 
793 	if (len < sizeof(struct route_info)) {
794 		return -EINVAL;
795 	}
796 
797 	/* Sanity check for prefix_len and length */
798 	if (rinfo->length > 3) {
799 		return -EINVAL;
800 	} else if (rinfo->prefix_len > 128) {
801 		return -EINVAL;
802 	} else if (rinfo->prefix_len > 64) {
803 		if (rinfo->length < 2) {
804 			return -EINVAL;
805 		}
806 	} else if (rinfo->prefix_len > 0) {
807 		if (rinfo->length < 1) {
808 			return -EINVAL;
809 		}
810 	}
811 
812 	pref = rinfo->route_pref;
813 	if (pref == ICMPV6_ROUTER_PREF_INVALID)
814 		return -EINVAL;
815 
816 	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
817 
818 	if (rinfo->length == 3)
819 		prefix = (struct in6_addr *)rinfo->prefix;
820 	else {
821 		/* this function is safe */
822 		ipv6_addr_prefix(&prefix_buf,
823 				 (struct in6_addr *)rinfo->prefix,
824 				 rinfo->prefix_len);
825 		prefix = &prefix_buf;
826 	}
827 
828 	if (rinfo->prefix_len == 0)
829 		rt = rt6_get_dflt_router(net, gwaddr, dev);
830 	else
831 		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
832 					gwaddr, dev);
833 
834 	if (rt && !lifetime) {
835 		ip6_del_rt(net, rt);
836 		rt = NULL;
837 	}
838 
839 	if (!rt && lifetime)
840 		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
841 					dev, pref);
842 	else if (rt)
843 		rt->fib6_flags = RTF_ROUTEINFO |
844 				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
845 
846 	if (rt) {
847 		if (!addrconf_finite_timeout(lifetime))
848 			fib6_clean_expires(rt);
849 		else
850 			fib6_set_expires(rt, jiffies + HZ * lifetime);
851 
852 		fib6_info_release(rt);
853 	}
854 	return 0;
855 }
856 #endif
857 
858 /*
859  *	Misc support functions
860  */
861 
862 /* called with rcu_lock held */
863 static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
864 {
865 	struct net_device *dev = rt->fib6_nh.nh_dev;
866 
867 	if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
868 		/* for copies of local routes, dst->dev needs to be the
869 		 * device if it is a master device, the master device if
870 		 * device is enslaved, and the loopback as the default
871 		 */
872 		if (netif_is_l3_slave(dev) &&
873 		    !rt6_need_strict(&rt->fib6_dst.addr))
874 			dev = l3mdev_master_dev_rcu(dev);
875 		else if (!netif_is_l3_master(dev))
876 			dev = dev_net(dev)->loopback_dev;
877 		/* last case is netif_is_l3_master(dev) is true in which
878 		 * case we want dev returned to be dev
879 		 */
880 	}
881 
882 	return dev;
883 }
884 
/*
 * dst error for each route type, indexed by RTN_* value.  Types that
 * deliver or forward packets map to 0; the others map to a negative
 * errno.  Read through ip6_rt_type_to_error().
 */
static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]	= 0,
	[RTN_UNICAST]	= 0,
	[RTN_LOCAL]	= 0,
	[RTN_BROADCAST]	= 0,
	[RTN_ANYCAST]	= 0,
	[RTN_MULTICAST]	= 0,
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_NAT]	= -EINVAL,
	[RTN_XRESOLVE]	= -EINVAL,
};
899 
900 static int ip6_rt_type_to_error(u8 fib6_type)
901 {
902 	return fib6_prop[fib6_type];
903 }
904 
905 static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
906 {
907 	unsigned short flags = 0;
908 
909 	if (rt->dst_nocount)
910 		flags |= DST_NOCOUNT;
911 	if (rt->dst_nopolicy)
912 		flags |= DST_NOPOLICY;
913 	if (rt->dst_host)
914 		flags |= DST_HOST;
915 
916 	return flags;
917 }
918 
919 static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
920 {
921 	rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);
922 
923 	switch (ort->fib6_type) {
924 	case RTN_BLACKHOLE:
925 		rt->dst.output = dst_discard_out;
926 		rt->dst.input = dst_discard;
927 		break;
928 	case RTN_PROHIBIT:
929 		rt->dst.output = ip6_pkt_prohibit_out;
930 		rt->dst.input = ip6_pkt_prohibit;
931 		break;
932 	case RTN_THROW:
933 	case RTN_UNREACHABLE:
934 	default:
935 		rt->dst.output = ip6_pkt_discard_out;
936 		rt->dst.input = ip6_pkt_discard;
937 		break;
938 	}
939 }
940 
941 static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
942 {
943 	rt->dst.flags |= fib6_info_dst_flags(ort);
944 
945 	if (ort->fib6_flags & RTF_REJECT) {
946 		ip6_rt_init_dst_reject(rt, ort);
947 		return;
948 	}
949 
950 	rt->dst.error = 0;
951 	rt->dst.output = ip6_output;
952 
953 	if (ort->fib6_type == RTN_LOCAL) {
954 		rt->dst.input = ip6_input;
955 	} else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
956 		rt->dst.input = ip6_mc_input;
957 	} else {
958 		rt->dst.input = ip6_forward;
959 	}
960 
961 	if (ort->fib6_nh.nh_lwtstate) {
962 		rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
963 		lwtunnel_set_redirect(&rt->dst);
964 	}
965 
966 	rt->dst.lastuse = jiffies;
967 }
968 
969 static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
970 {
971 	rt->rt6i_flags &= ~RTF_EXPIRES;
972 	fib6_info_hold(from);
973 	rcu_assign_pointer(rt->from, from);
974 	dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true);
975 	if (from->fib6_metrics != &dst_default_metrics) {
976 		rt->dst._metrics |= DST_METRICS_REFCOUNTED;
977 		refcount_inc(&from->fib6_metrics->refcnt);
978 	}
979 }
980 
981 static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
982 {
983 	struct net_device *dev = fib6_info_nh_dev(ort);
984 
985 	ip6_rt_init_dst(rt, ort);
986 
987 	rt->rt6i_dst = ort->fib6_dst;
988 	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
989 	rt->rt6i_gateway = ort->fib6_nh.nh_gw;
990 	rt->rt6i_flags = ort->fib6_flags;
991 	rt6_set_from(rt, ort);
992 #ifdef CONFIG_IPV6_SUBTREES
993 	rt->rt6i_src = ort->fib6_src;
994 #endif
995 	rt->rt6i_prefsrc = ort->fib6_prefsrc;
996 	rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
997 }
998 
999 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
1000 					struct in6_addr *saddr)
1001 {
1002 	struct fib6_node *pn, *sn;
1003 	while (1) {
1004 		if (fn->fn_flags & RTN_TL_ROOT)
1005 			return NULL;
1006 		pn = rcu_dereference(fn->parent);
1007 		sn = FIB6_SUBTREE(pn);
1008 		if (sn && sn != fn)
1009 			fn = fib6_node_lookup(sn, NULL, saddr);
1010 		else
1011 			fn = pn;
1012 		if (fn->fn_flags & RTN_RTINFO)
1013 			return fn;
1014 	}
1015 }
1016 
1017 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
1018 			  bool null_fallback)
1019 {
1020 	struct rt6_info *rt = *prt;
1021 
1022 	if (dst_hold_safe(&rt->dst))
1023 		return true;
1024 	if (null_fallback) {
1025 		rt = net->ipv6.ip6_null_entry;
1026 		dst_hold(&rt->dst);
1027 	} else {
1028 		rt = NULL;
1029 	}
1030 	*prt = rt;
1031 	return false;
1032 }
1033 
1034 /* called with rcu_lock held */
1035 static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
1036 {
1037 	unsigned short flags = fib6_info_dst_flags(rt);
1038 	struct net_device *dev = rt->fib6_nh.nh_dev;
1039 	struct rt6_info *nrt;
1040 
1041 	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
1042 	if (nrt)
1043 		ip6_rt_copy_init(nrt, rt);
1044 
1045 	return nrt;
1046 }
1047 
1048 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
1049 					     struct fib6_table *table,
1050 					     struct flowi6 *fl6,
1051 					     const struct sk_buff *skb,
1052 					     int flags)
1053 {
1054 	struct fib6_info *f6i;
1055 	struct fib6_node *fn;
1056 	struct rt6_info *rt;
1057 
1058 	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1059 		flags &= ~RT6_LOOKUP_F_IFACE;
1060 
1061 	rcu_read_lock();
1062 	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1063 restart:
1064 	f6i = rcu_dereference(fn->leaf);
1065 	if (!f6i) {
1066 		f6i = net->ipv6.fib6_null_entry;
1067 	} else {
1068 		f6i = rt6_device_match(net, f6i, &fl6->saddr,
1069 				      fl6->flowi6_oif, flags);
1070 		if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
1071 			f6i = fib6_multipath_select(net, f6i, fl6,
1072 						    fl6->flowi6_oif, skb,
1073 						    flags);
1074 	}
1075 	if (f6i == net->ipv6.fib6_null_entry) {
1076 		fn = fib6_backtrack(fn, &fl6->saddr);
1077 		if (fn)
1078 			goto restart;
1079 	}
1080 
1081 	/* Search through exception table */
1082 	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
1083 	if (rt) {
1084 		if (ip6_hold_safe(net, &rt, true))
1085 			dst_use_noref(&rt->dst, jiffies);
1086 	} else if (f6i == net->ipv6.fib6_null_entry) {
1087 		rt = net->ipv6.ip6_null_entry;
1088 		dst_hold(&rt->dst);
1089 	} else {
1090 		rt = ip6_create_rt_rcu(f6i);
1091 		if (!rt) {
1092 			rt = net->ipv6.ip6_null_entry;
1093 			dst_hold(&rt->dst);
1094 		}
1095 	}
1096 
1097 	rcu_read_unlock();
1098 
1099 	trace_fib6_table_lookup(net, rt, table, fl6);
1100 
1101 	return rt;
1102 }
1103 
1104 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
1105 				   const struct sk_buff *skb, int flags)
1106 {
1107 	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
1108 }
1109 EXPORT_SYMBOL_GPL(ip6_route_lookup);
1110 
1111 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
1112 			    const struct in6_addr *saddr, int oif,
1113 			    const struct sk_buff *skb, int strict)
1114 {
1115 	struct flowi6 fl6 = {
1116 		.flowi6_oif = oif,
1117 		.daddr = *daddr,
1118 	};
1119 	struct dst_entry *dst;
1120 	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
1121 
1122 	if (saddr) {
1123 		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
1124 		flags |= RT6_LOOKUP_F_HAS_SADDR;
1125 	}
1126 
1127 	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
1128 	if (dst->error == 0)
1129 		return (struct rt6_info *) dst;
1130 
1131 	dst_release(dst);
1132 
1133 	return NULL;
1134 }
1135 EXPORT_SYMBOL(rt6_lookup);
1136 
1137 /* ip6_ins_rt is called with FREE table->tb6_lock.
1138  * It takes new route entry, the addition fails by any reason the
1139  * route is released.
1140  * Caller must hold dst before calling it.
1141  */
1142 
1143 static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
1144 			struct netlink_ext_ack *extack)
1145 {
1146 	int err;
1147 	struct fib6_table *table;
1148 
1149 	table = rt->fib6_table;
1150 	spin_lock_bh(&table->tb6_lock);
1151 	err = fib6_add(&table->tb6_root, rt, info, extack);
1152 	spin_unlock_bh(&table->tb6_lock);
1153 
1154 	return err;
1155 }
1156 
1157 int ip6_ins_rt(struct net *net, struct fib6_info *rt)
1158 {
1159 	struct nl_info info = {	.nl_net = net, };
1160 
1161 	return __ip6_ins_rt(rt, &info, NULL);
1162 }
1163 
1164 static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
1165 					   const struct in6_addr *daddr,
1166 					   const struct in6_addr *saddr)
1167 {
1168 	struct net_device *dev;
1169 	struct rt6_info *rt;
1170 
1171 	/*
1172 	 *	Clone the route.
1173 	 */
1174 
1175 	dev = ip6_rt_get_dev_rcu(ort);
1176 	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
1177 	if (!rt)
1178 		return NULL;
1179 
1180 	ip6_rt_copy_init(rt, ort);
1181 	rt->rt6i_flags |= RTF_CACHE;
1182 	rt->dst.flags |= DST_HOST;
1183 	rt->rt6i_dst.addr = *daddr;
1184 	rt->rt6i_dst.plen = 128;
1185 
1186 	if (!rt6_is_gw_or_nonexthop(ort)) {
1187 		if (ort->fib6_dst.plen != 128 &&
1188 		    ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
1189 			rt->rt6i_flags |= RTF_ANYCAST;
1190 #ifdef CONFIG_IPV6_SUBTREES
1191 		if (rt->rt6i_src.plen && saddr) {
1192 			rt->rt6i_src.addr = *saddr;
1193 			rt->rt6i_src.plen = 128;
1194 		}
1195 #endif
1196 	}
1197 
1198 	return rt;
1199 }
1200 
1201 static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
1202 {
1203 	unsigned short flags = fib6_info_dst_flags(rt);
1204 	struct net_device *dev;
1205 	struct rt6_info *pcpu_rt;
1206 
1207 	rcu_read_lock();
1208 	dev = ip6_rt_get_dev_rcu(rt);
1209 	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
1210 	rcu_read_unlock();
1211 	if (!pcpu_rt)
1212 		return NULL;
1213 	ip6_rt_copy_init(pcpu_rt, rt);
1214 	pcpu_rt->rt6i_flags |= RTF_PCPU;
1215 	return pcpu_rt;
1216 }
1217 
1218 /* It should be called with rcu_read_lock() acquired */
1219 static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
1220 {
1221 	struct rt6_info *pcpu_rt, **p;
1222 
1223 	p = this_cpu_ptr(rt->rt6i_pcpu);
1224 	pcpu_rt = *p;
1225 
1226 	if (pcpu_rt)
1227 		ip6_hold_safe(NULL, &pcpu_rt, false);
1228 
1229 	return pcpu_rt;
1230 }
1231 
1232 static struct rt6_info *rt6_make_pcpu_route(struct net *net,
1233 					    struct fib6_info *rt)
1234 {
1235 	struct rt6_info *pcpu_rt, *prev, **p;
1236 
1237 	pcpu_rt = ip6_rt_pcpu_alloc(rt);
1238 	if (!pcpu_rt) {
1239 		dst_hold(&net->ipv6.ip6_null_entry->dst);
1240 		return net->ipv6.ip6_null_entry;
1241 	}
1242 
1243 	dst_hold(&pcpu_rt->dst);
1244 	p = this_cpu_ptr(rt->rt6i_pcpu);
1245 	prev = cmpxchg(p, NULL, pcpu_rt);
1246 	BUG_ON(prev);
1247 
1248 	return pcpu_rt;
1249 }
1250 
1251 /* exception hash table implementation
1252  */
1253 static DEFINE_SPINLOCK(rt6_exception_lock);
1254 
1255 /* Remove rt6_ex from hash table and free the memory
1256  * Caller must hold rt6_exception_lock
1257  */
1258 static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1259 				 struct rt6_exception *rt6_ex)
1260 {
1261 	struct net *net;
1262 
1263 	if (!bucket || !rt6_ex)
1264 		return;
1265 
1266 	net = dev_net(rt6_ex->rt6i->dst.dev);
1267 	hlist_del_rcu(&rt6_ex->hlist);
1268 	dst_release(&rt6_ex->rt6i->dst);
1269 	kfree_rcu(rt6_ex, rcu);
1270 	WARN_ON_ONCE(!bucket->depth);
1271 	bucket->depth--;
1272 	net->ipv6.rt6_stats->fib_rt_cache--;
1273 }
1274 
1275 /* Remove oldest rt6_ex in bucket and free the memory
1276  * Caller must hold rt6_exception_lock
1277  */
1278 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1279 {
1280 	struct rt6_exception *rt6_ex, *oldest = NULL;
1281 
1282 	if (!bucket)
1283 		return;
1284 
1285 	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1286 		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1287 			oldest = rt6_ex;
1288 	}
1289 	rt6_remove_exception(bucket, oldest);
1290 }
1291 
1292 static u32 rt6_exception_hash(const struct in6_addr *dst,
1293 			      const struct in6_addr *src)
1294 {
1295 	static u32 seed __read_mostly;
1296 	u32 val;
1297 
1298 	net_get_random_once(&seed, sizeof(seed));
1299 	val = jhash(dst, sizeof(*dst), seed);
1300 
1301 #ifdef CONFIG_IPV6_SUBTREES
1302 	if (src)
1303 		val = jhash(src, sizeof(*src), val);
1304 #endif
1305 	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1306 }
1307 
1308 /* Helper function to find the cached rt in the hash table
1309  * and update bucket pointer to point to the bucket for this
1310  * (daddr, saddr) pair
1311  * Caller must hold rt6_exception_lock
1312  */
1313 static struct rt6_exception *
1314 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1315 			      const struct in6_addr *daddr,
1316 			      const struct in6_addr *saddr)
1317 {
1318 	struct rt6_exception *rt6_ex;
1319 	u32 hval;
1320 
1321 	if (!(*bucket) || !daddr)
1322 		return NULL;
1323 
1324 	hval = rt6_exception_hash(daddr, saddr);
1325 	*bucket += hval;
1326 
1327 	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1328 		struct rt6_info *rt6 = rt6_ex->rt6i;
1329 		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1330 
1331 #ifdef CONFIG_IPV6_SUBTREES
1332 		if (matched && saddr)
1333 			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1334 #endif
1335 		if (matched)
1336 			return rt6_ex;
1337 	}
1338 	return NULL;
1339 }
1340 
1341 /* Helper function to find the cached rt in the hash table
1342  * and update bucket pointer to point to the bucket for this
1343  * (daddr, saddr) pair
1344  * Caller must hold rcu_read_lock()
1345  */
1346 static struct rt6_exception *
1347 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1348 			 const struct in6_addr *daddr,
1349 			 const struct in6_addr *saddr)
1350 {
1351 	struct rt6_exception *rt6_ex;
1352 	u32 hval;
1353 
1354 	WARN_ON_ONCE(!rcu_read_lock_held());
1355 
1356 	if (!(*bucket) || !daddr)
1357 		return NULL;
1358 
1359 	hval = rt6_exception_hash(daddr, saddr);
1360 	*bucket += hval;
1361 
1362 	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1363 		struct rt6_info *rt6 = rt6_ex->rt6i;
1364 		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1365 
1366 #ifdef CONFIG_IPV6_SUBTREES
1367 		if (matched && saddr)
1368 			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1369 #endif
1370 		if (matched)
1371 			return rt6_ex;
1372 	}
1373 	return NULL;
1374 }
1375 
1376 static unsigned int fib6_mtu(const struct fib6_info *rt)
1377 {
1378 	unsigned int mtu;
1379 
1380 	if (rt->fib6_pmtu) {
1381 		mtu = rt->fib6_pmtu;
1382 	} else {
1383 		struct net_device *dev = fib6_info_nh_dev(rt);
1384 		struct inet6_dev *idev;
1385 
1386 		rcu_read_lock();
1387 		idev = __in6_dev_get(dev);
1388 		mtu = idev->cnf.mtu6;
1389 		rcu_read_unlock();
1390 	}
1391 
1392 	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1393 
1394 	return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
1395 }
1396 
1397 static int rt6_insert_exception(struct rt6_info *nrt,
1398 				struct fib6_info *ort)
1399 {
1400 	struct net *net = dev_net(nrt->dst.dev);
1401 	struct rt6_exception_bucket *bucket;
1402 	struct in6_addr *src_key = NULL;
1403 	struct rt6_exception *rt6_ex;
1404 	int err = 0;
1405 
1406 	spin_lock_bh(&rt6_exception_lock);
1407 
1408 	if (ort->exception_bucket_flushed) {
1409 		err = -EINVAL;
1410 		goto out;
1411 	}
1412 
1413 	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
1414 					lockdep_is_held(&rt6_exception_lock));
1415 	if (!bucket) {
1416 		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
1417 				 GFP_ATOMIC);
1418 		if (!bucket) {
1419 			err = -ENOMEM;
1420 			goto out;
1421 		}
1422 		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
1423 	}
1424 
1425 #ifdef CONFIG_IPV6_SUBTREES
1426 	/* rt6i_src.plen != 0 indicates ort is in subtree
1427 	 * and exception table is indexed by a hash of
1428 	 * both rt6i_dst and rt6i_src.
1429 	 * Otherwise, the exception table is indexed by
1430 	 * a hash of only rt6i_dst.
1431 	 */
1432 	if (ort->fib6_src.plen)
1433 		src_key = &nrt->rt6i_src.addr;
1434 #endif
1435 
1436 	/* Update rt6i_prefsrc as it could be changed
1437 	 * in rt6_remove_prefsrc()
1438 	 */
1439 	nrt->rt6i_prefsrc = ort->fib6_prefsrc;
1440 	/* rt6_mtu_change() might lower mtu on ort.
1441 	 * Only insert this exception route if its mtu
1442 	 * is less than ort's mtu value.
1443 	 */
1444 	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
1445 		err = -EINVAL;
1446 		goto out;
1447 	}
1448 
1449 	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
1450 					       src_key);
1451 	if (rt6_ex)
1452 		rt6_remove_exception(bucket, rt6_ex);
1453 
1454 	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
1455 	if (!rt6_ex) {
1456 		err = -ENOMEM;
1457 		goto out;
1458 	}
1459 	rt6_ex->rt6i = nrt;
1460 	rt6_ex->stamp = jiffies;
1461 	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
1462 	bucket->depth++;
1463 	net->ipv6.rt6_stats->fib_rt_cache++;
1464 
1465 	if (bucket->depth > FIB6_MAX_DEPTH)
1466 		rt6_exception_remove_oldest(bucket);
1467 
1468 out:
1469 	spin_unlock_bh(&rt6_exception_lock);
1470 
1471 	/* Update fn->fn_sernum to invalidate all cached dst */
1472 	if (!err) {
1473 		spin_lock_bh(&ort->fib6_table->tb6_lock);
1474 		fib6_update_sernum(net, ort);
1475 		spin_unlock_bh(&ort->fib6_table->tb6_lock);
1476 		fib6_force_start_gc(net);
1477 	}
1478 
1479 	return err;
1480 }
1481 
1482 void rt6_flush_exceptions(struct fib6_info *rt)
1483 {
1484 	struct rt6_exception_bucket *bucket;
1485 	struct rt6_exception *rt6_ex;
1486 	struct hlist_node *tmp;
1487 	int i;
1488 
1489 	spin_lock_bh(&rt6_exception_lock);
1490 	/* Prevent rt6_insert_exception() to recreate the bucket list */
1491 	rt->exception_bucket_flushed = 1;
1492 
1493 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1494 				    lockdep_is_held(&rt6_exception_lock));
1495 	if (!bucket)
1496 		goto out;
1497 
1498 	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1499 		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1500 			rt6_remove_exception(bucket, rt6_ex);
1501 		WARN_ON_ONCE(bucket->depth);
1502 		bucket++;
1503 	}
1504 
1505 out:
1506 	spin_unlock_bh(&rt6_exception_lock);
1507 }
1508 
1509 /* Find cached rt in the hash table inside passed in rt
1510  * Caller has to hold rcu_read_lock()
1511  */
1512 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
1513 					   struct in6_addr *daddr,
1514 					   struct in6_addr *saddr)
1515 {
1516 	struct rt6_exception_bucket *bucket;
1517 	struct in6_addr *src_key = NULL;
1518 	struct rt6_exception *rt6_ex;
1519 	struct rt6_info *res = NULL;
1520 
1521 	bucket = rcu_dereference(rt->rt6i_exception_bucket);
1522 
1523 #ifdef CONFIG_IPV6_SUBTREES
1524 	/* rt6i_src.plen != 0 indicates rt is in subtree
1525 	 * and exception table is indexed by a hash of
1526 	 * both rt6i_dst and rt6i_src.
1527 	 * Otherwise, the exception table is indexed by
1528 	 * a hash of only rt6i_dst.
1529 	 */
1530 	if (rt->fib6_src.plen)
1531 		src_key = saddr;
1532 #endif
1533 	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
1534 
1535 	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1536 		res = rt6_ex->rt6i;
1537 
1538 	return res;
1539 }
1540 
1541 /* Remove the passed in cached rt from the hash table that contains it */
1542 static int rt6_remove_exception_rt(struct rt6_info *rt)
1543 {
1544 	struct rt6_exception_bucket *bucket;
1545 	struct in6_addr *src_key = NULL;
1546 	struct rt6_exception *rt6_ex;
1547 	struct fib6_info *from;
1548 	int err;
1549 
1550 	from = rcu_dereference(rt->from);
1551 	if (!from ||
1552 	    !(rt->rt6i_flags & RTF_CACHE))
1553 		return -EINVAL;
1554 
1555 	if (!rcu_access_pointer(from->rt6i_exception_bucket))
1556 		return -ENOENT;
1557 
1558 	spin_lock_bh(&rt6_exception_lock);
1559 	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1560 				    lockdep_is_held(&rt6_exception_lock));
1561 #ifdef CONFIG_IPV6_SUBTREES
1562 	/* rt6i_src.plen != 0 indicates 'from' is in subtree
1563 	 * and exception table is indexed by a hash of
1564 	 * both rt6i_dst and rt6i_src.
1565 	 * Otherwise, the exception table is indexed by
1566 	 * a hash of only rt6i_dst.
1567 	 */
1568 	if (from->fib6_src.plen)
1569 		src_key = &rt->rt6i_src.addr;
1570 #endif
1571 	rt6_ex = __rt6_find_exception_spinlock(&bucket,
1572 					       &rt->rt6i_dst.addr,
1573 					       src_key);
1574 	if (rt6_ex) {
1575 		rt6_remove_exception(bucket, rt6_ex);
1576 		err = 0;
1577 	} else {
1578 		err = -ENOENT;
1579 	}
1580 
1581 	spin_unlock_bh(&rt6_exception_lock);
1582 	return err;
1583 }
1584 
1585 /* Find rt6_ex which contains the passed in rt cache and
1586  * refresh its stamp
1587  */
1588 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1589 {
1590 	struct rt6_exception_bucket *bucket;
1591 	struct fib6_info *from = rt->from;
1592 	struct in6_addr *src_key = NULL;
1593 	struct rt6_exception *rt6_ex;
1594 
1595 	if (!from ||
1596 	    !(rt->rt6i_flags & RTF_CACHE))
1597 		return;
1598 
1599 	rcu_read_lock();
1600 	bucket = rcu_dereference(from->rt6i_exception_bucket);
1601 
1602 #ifdef CONFIG_IPV6_SUBTREES
1603 	/* rt6i_src.plen != 0 indicates 'from' is in subtree
1604 	 * and exception table is indexed by a hash of
1605 	 * both rt6i_dst and rt6i_src.
1606 	 * Otherwise, the exception table is indexed by
1607 	 * a hash of only rt6i_dst.
1608 	 */
1609 	if (from->fib6_src.plen)
1610 		src_key = &rt->rt6i_src.addr;
1611 #endif
1612 	rt6_ex = __rt6_find_exception_rcu(&bucket,
1613 					  &rt->rt6i_dst.addr,
1614 					  src_key);
1615 	if (rt6_ex)
1616 		rt6_ex->stamp = jiffies;
1617 
1618 	rcu_read_unlock();
1619 }
1620 
1621 static void rt6_exceptions_remove_prefsrc(struct fib6_info *rt)
1622 {
1623 	struct rt6_exception_bucket *bucket;
1624 	struct rt6_exception *rt6_ex;
1625 	int i;
1626 
1627 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1628 					lockdep_is_held(&rt6_exception_lock));
1629 
1630 	if (bucket) {
1631 		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1632 			hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1633 				rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
1634 			}
1635 			bucket++;
1636 		}
1637 	}
1638 }
1639 
1640 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1641 					 struct rt6_info *rt, int mtu)
1642 {
1643 	/* If the new MTU is lower than the route PMTU, this new MTU will be the
1644 	 * lowest MTU in the path: always allow updating the route PMTU to
1645 	 * reflect PMTU decreases.
1646 	 *
1647 	 * If the new MTU is higher, and the route PMTU is equal to the local
1648 	 * MTU, this means the old MTU is the lowest in the path, so allow
1649 	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
1650 	 * handle this.
1651 	 */
1652 
1653 	if (dst_mtu(&rt->dst) >= mtu)
1654 		return true;
1655 
1656 	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
1657 		return true;
1658 
1659 	return false;
1660 }
1661 
1662 static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
1663 				       struct fib6_info *rt, int mtu)
1664 {
1665 	struct rt6_exception_bucket *bucket;
1666 	struct rt6_exception *rt6_ex;
1667 	int i;
1668 
1669 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1670 					lockdep_is_held(&rt6_exception_lock));
1671 
1672 	if (!bucket)
1673 		return;
1674 
1675 	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1676 		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1677 			struct rt6_info *entry = rt6_ex->rt6i;
1678 
1679 			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
1680 			 * route), the metrics of its rt->from have already
1681 			 * been updated.
1682 			 */
1683 			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
1684 			    rt6_mtu_change_route_allowed(idev, entry, mtu))
1685 				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
1686 		}
1687 		bucket++;
1688 	}
1689 }
1690 
1691 #define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)
1692 
1693 static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
1694 					struct in6_addr *gateway)
1695 {
1696 	struct rt6_exception_bucket *bucket;
1697 	struct rt6_exception *rt6_ex;
1698 	struct hlist_node *tmp;
1699 	int i;
1700 
1701 	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1702 		return;
1703 
1704 	spin_lock_bh(&rt6_exception_lock);
1705 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1706 				     lockdep_is_held(&rt6_exception_lock));
1707 
1708 	if (bucket) {
1709 		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1710 			hlist_for_each_entry_safe(rt6_ex, tmp,
1711 						  &bucket->chain, hlist) {
1712 				struct rt6_info *entry = rt6_ex->rt6i;
1713 
1714 				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
1715 				    RTF_CACHE_GATEWAY &&
1716 				    ipv6_addr_equal(gateway,
1717 						    &entry->rt6i_gateway)) {
1718 					rt6_remove_exception(bucket, rt6_ex);
1719 				}
1720 			}
1721 			bucket++;
1722 		}
1723 	}
1724 
1725 	spin_unlock_bh(&rt6_exception_lock);
1726 }
1727 
1728 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
1729 				      struct rt6_exception *rt6_ex,
1730 				      struct fib6_gc_args *gc_args,
1731 				      unsigned long now)
1732 {
1733 	struct rt6_info *rt = rt6_ex->rt6i;
1734 
1735 	/* we are pruning and obsoleting aged-out and non gateway exceptions
1736 	 * even if others have still references to them, so that on next
1737 	 * dst_check() such references can be dropped.
1738 	 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
1739 	 * expired, independently from their aging, as per RFC 8201 section 4
1740 	 */
1741 	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
1742 		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
1743 			RT6_TRACE("aging clone %p\n", rt);
1744 			rt6_remove_exception(bucket, rt6_ex);
1745 			return;
1746 		}
1747 	} else if (time_after(jiffies, rt->dst.expires)) {
1748 		RT6_TRACE("purging expired route %p\n", rt);
1749 		rt6_remove_exception(bucket, rt6_ex);
1750 		return;
1751 	}
1752 
1753 	if (rt->rt6i_flags & RTF_GATEWAY) {
1754 		struct neighbour *neigh;
1755 		__u8 neigh_flags = 0;
1756 
1757 		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
1758 		if (neigh)
1759 			neigh_flags = neigh->flags;
1760 
1761 		if (!(neigh_flags & NTF_ROUTER)) {
1762 			RT6_TRACE("purging route %p via non-router but gateway\n",
1763 				  rt);
1764 			rt6_remove_exception(bucket, rt6_ex);
1765 			return;
1766 		}
1767 	}
1768 
1769 	gc_args->more++;
1770 }
1771 
1772 void rt6_age_exceptions(struct fib6_info *rt,
1773 			struct fib6_gc_args *gc_args,
1774 			unsigned long now)
1775 {
1776 	struct rt6_exception_bucket *bucket;
1777 	struct rt6_exception *rt6_ex;
1778 	struct hlist_node *tmp;
1779 	int i;
1780 
1781 	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1782 		return;
1783 
1784 	rcu_read_lock_bh();
1785 	spin_lock(&rt6_exception_lock);
1786 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1787 				    lockdep_is_held(&rt6_exception_lock));
1788 
1789 	if (bucket) {
1790 		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1791 			hlist_for_each_entry_safe(rt6_ex, tmp,
1792 						  &bucket->chain, hlist) {
1793 				rt6_age_examine_exception(bucket, rt6_ex,
1794 							  gc_args, now);
1795 			}
1796 			bucket++;
1797 		}
1798 	}
1799 	spin_unlock(&rt6_exception_lock);
1800 	rcu_read_unlock_bh();
1801 }
1802 
1803 /* must be called with rcu lock held */
1804 struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
1805 				    int oif, struct flowi6 *fl6, int strict)
1806 {
1807 	struct fib6_node *fn, *saved_fn;
1808 	struct fib6_info *f6i;
1809 
1810 	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1811 	saved_fn = fn;
1812 
1813 	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1814 		oif = 0;
1815 
1816 redo_rt6_select:
1817 	f6i = rt6_select(net, fn, oif, strict);
1818 	if (f6i == net->ipv6.fib6_null_entry) {
1819 		fn = fib6_backtrack(fn, &fl6->saddr);
1820 		if (fn)
1821 			goto redo_rt6_select;
1822 		else if (strict & RT6_LOOKUP_F_REACHABLE) {
1823 			/* also consider unreachable route */
1824 			strict &= ~RT6_LOOKUP_F_REACHABLE;
1825 			fn = saved_fn;
1826 			goto redo_rt6_select;
1827 		}
1828 	}
1829 
1830 	return f6i;
1831 }
1832 
1833 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1834 			       int oif, struct flowi6 *fl6,
1835 			       const struct sk_buff *skb, int flags)
1836 {
1837 	struct fib6_info *f6i;
1838 	struct rt6_info *rt;
1839 	int strict = 0;
1840 
1841 	strict |= flags & RT6_LOOKUP_F_IFACE;
1842 	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1843 	if (net->ipv6.devconf_all->forwarding == 0)
1844 		strict |= RT6_LOOKUP_F_REACHABLE;
1845 
1846 	rcu_read_lock();
1847 
1848 	f6i = fib6_table_lookup(net, table, oif, fl6, strict);
1849 	if (f6i->fib6_nsiblings)
1850 		f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);
1851 
1852 	if (f6i == net->ipv6.fib6_null_entry) {
1853 		rt = net->ipv6.ip6_null_entry;
1854 		rcu_read_unlock();
1855 		dst_hold(&rt->dst);
1856 		trace_fib6_table_lookup(net, rt, table, fl6);
1857 		return rt;
1858 	}
1859 
1860 	/*Search through exception table */
1861 	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
1862 	if (rt) {
1863 		if (ip6_hold_safe(net, &rt, true))
1864 			dst_use_noref(&rt->dst, jiffies);
1865 
1866 		rcu_read_unlock();
1867 		trace_fib6_table_lookup(net, rt, table, fl6);
1868 		return rt;
1869 	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1870 			    !(f6i->fib6_flags & RTF_GATEWAY))) {
1871 		/* Create a RTF_CACHE clone which will not be
1872 		 * owned by the fib6 tree.  It is for the special case where
1873 		 * the daddr in the skb during the neighbor look-up is different
1874 		 * from the fl6->daddr used to look-up route here.
1875 		 */
1876 		struct rt6_info *uncached_rt;
1877 
1878 		uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);
1879 
1880 		rcu_read_unlock();
1881 
1882 		if (uncached_rt) {
1883 			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1884 			 * No need for another dst_hold()
1885 			 */
1886 			rt6_uncached_list_add(uncached_rt);
1887 			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
1888 		} else {
1889 			uncached_rt = net->ipv6.ip6_null_entry;
1890 			dst_hold(&uncached_rt->dst);
1891 		}
1892 
1893 		trace_fib6_table_lookup(net, uncached_rt, table, fl6);
1894 		return uncached_rt;
1895 
1896 	} else {
1897 		/* Get a percpu copy */
1898 
1899 		struct rt6_info *pcpu_rt;
1900 
1901 		local_bh_disable();
1902 		pcpu_rt = rt6_get_pcpu_route(f6i);
1903 
1904 		if (!pcpu_rt)
1905 			pcpu_rt = rt6_make_pcpu_route(net, f6i);
1906 
1907 		local_bh_enable();
1908 		rcu_read_unlock();
1909 		trace_fib6_table_lookup(net, pcpu_rt, table, fl6);
1910 		return pcpu_rt;
1911 	}
1912 }
1913 EXPORT_SYMBOL_GPL(ip6_pol_route);
1914 
1915 static struct rt6_info *ip6_pol_route_input(struct net *net,
1916 					    struct fib6_table *table,
1917 					    struct flowi6 *fl6,
1918 					    const struct sk_buff *skb,
1919 					    int flags)
1920 {
1921 	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
1922 }
1923 
1924 struct dst_entry *ip6_route_input_lookup(struct net *net,
1925 					 struct net_device *dev,
1926 					 struct flowi6 *fl6,
1927 					 const struct sk_buff *skb,
1928 					 int flags)
1929 {
1930 	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1931 		flags |= RT6_LOOKUP_F_IFACE;
1932 
1933 	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
1934 }
1935 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1936 
1937 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1938 				  struct flow_keys *keys,
1939 				  struct flow_keys *flkeys)
1940 {
1941 	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1942 	const struct ipv6hdr *key_iph = outer_iph;
1943 	struct flow_keys *_flkeys = flkeys;
1944 	const struct ipv6hdr *inner_iph;
1945 	const struct icmp6hdr *icmph;
1946 	struct ipv6hdr _inner_iph;
1947 	struct icmp6hdr _icmph;
1948 
1949 	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1950 		goto out;
1951 
1952 	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
1953 				   sizeof(_icmph), &_icmph);
1954 	if (!icmph)
1955 		goto out;
1956 
1957 	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1958 	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1959 	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1960 	    icmph->icmp6_type != ICMPV6_PARAMPROB)
1961 		goto out;
1962 
1963 	inner_iph = skb_header_pointer(skb,
1964 				       skb_transport_offset(skb) + sizeof(*icmph),
1965 				       sizeof(_inner_iph), &_inner_iph);
1966 	if (!inner_iph)
1967 		goto out;
1968 
1969 	key_iph = inner_iph;
1970 	_flkeys = NULL;
1971 out:
1972 	if (_flkeys) {
1973 		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
1974 		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
1975 		keys->tags.flow_label = _flkeys->tags.flow_label;
1976 		keys->basic.ip_proto = _flkeys->basic.ip_proto;
1977 	} else {
1978 		keys->addrs.v6addrs.src = key_iph->saddr;
1979 		keys->addrs.v6addrs.dst = key_iph->daddr;
1980 		keys->tags.flow_label = ip6_flowinfo(key_iph);
1981 		keys->basic.ip_proto = key_iph->nexthdr;
1982 	}
1983 }
1984 
1985 /* if skb is set it will be used and fl6 can be NULL */
1986 u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
1987 		       const struct sk_buff *skb, struct flow_keys *flkeys)
1988 {
1989 	struct flow_keys hash_keys;
1990 	u32 mhash;
1991 
1992 	switch (ip6_multipath_hash_policy(net)) {
1993 	case 0:
1994 		memset(&hash_keys, 0, sizeof(hash_keys));
1995 		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1996 		if (skb) {
1997 			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
1998 		} else {
1999 			hash_keys.addrs.v6addrs.src = fl6->saddr;
2000 			hash_keys.addrs.v6addrs.dst = fl6->daddr;
2001 			hash_keys.tags.flow_label = (__force u32)fl6->flowlabel;
2002 			hash_keys.basic.ip_proto = fl6->flowi6_proto;
2003 		}
2004 		break;
2005 	case 1:
2006 		if (skb) {
2007 			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
2008 			struct flow_keys keys;
2009 
2010 			/* short-circuit if we already have L4 hash present */
2011 			if (skb->l4_hash)
2012 				return skb_get_hash_raw(skb) >> 1;
2013 
2014 			memset(&hash_keys, 0, sizeof(hash_keys));
2015 
2016                         if (!flkeys) {
2017 				skb_flow_dissect_flow_keys(skb, &keys, flag);
2018 				flkeys = &keys;
2019 			}
2020 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2021 			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
2022 			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
2023 			hash_keys.ports.src = flkeys->ports.src;
2024 			hash_keys.ports.dst = flkeys->ports.dst;
2025 			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2026 		} else {
2027 			memset(&hash_keys, 0, sizeof(hash_keys));
2028 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2029 			hash_keys.addrs.v6addrs.src = fl6->saddr;
2030 			hash_keys.addrs.v6addrs.dst = fl6->daddr;
2031 			hash_keys.ports.src = fl6->fl6_sport;
2032 			hash_keys.ports.dst = fl6->fl6_dport;
2033 			hash_keys.basic.ip_proto = fl6->flowi6_proto;
2034 		}
2035 		break;
2036 	}
2037 	mhash = flow_hash_from_keys(&hash_keys);
2038 
2039 	return mhash >> 1;
2040 }
2041 
2042 void ip6_route_input(struct sk_buff *skb)
2043 {
2044 	const struct ipv6hdr *iph = ipv6_hdr(skb);
2045 	struct net *net = dev_net(skb->dev);
2046 	int flags = RT6_LOOKUP_F_HAS_SADDR;
2047 	struct ip_tunnel_info *tun_info;
2048 	struct flowi6 fl6 = {
2049 		.flowi6_iif = skb->dev->ifindex,
2050 		.daddr = iph->daddr,
2051 		.saddr = iph->saddr,
2052 		.flowlabel = ip6_flowinfo(iph),
2053 		.flowi6_mark = skb->mark,
2054 		.flowi6_proto = iph->nexthdr,
2055 	};
2056 	struct flow_keys *flkeys = NULL, _flkeys;
2057 
2058 	tun_info = skb_tunnel_info(skb);
2059 	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2060 		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
2061 
2062 	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
2063 		flkeys = &_flkeys;
2064 
2065 	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
2066 		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
2067 	skb_dst_drop(skb);
2068 	skb_dst_set(skb,
2069 		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
2070 }
2071 
2072 static struct rt6_info *ip6_pol_route_output(struct net *net,
2073 					     struct fib6_table *table,
2074 					     struct flowi6 *fl6,
2075 					     const struct sk_buff *skb,
2076 					     int flags)
2077 {
2078 	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
2079 }
2080 
2081 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
2082 					 struct flowi6 *fl6, int flags)
2083 {
2084 	bool any_src;
2085 
2086 	if (rt6_need_strict(&fl6->daddr)) {
2087 		struct dst_entry *dst;
2088 
2089 		dst = l3mdev_link_scope_lookup(net, fl6);
2090 		if (dst)
2091 			return dst;
2092 	}
2093 
2094 	fl6->flowi6_iif = LOOPBACK_IFINDEX;
2095 
2096 	any_src = ipv6_addr_any(&fl6->saddr);
2097 	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
2098 	    (fl6->flowi6_oif && any_src))
2099 		flags |= RT6_LOOKUP_F_IFACE;
2100 
2101 	if (!any_src)
2102 		flags |= RT6_LOOKUP_F_HAS_SADDR;
2103 	else if (sk)
2104 		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
2105 
2106 	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
2107 }
2108 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
2109 
2110 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2111 {
2112 	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
2113 	struct net_device *loopback_dev = net->loopback_dev;
2114 	struct dst_entry *new = NULL;
2115 
2116 	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
2117 		       DST_OBSOLETE_DEAD, 0);
2118 	if (rt) {
2119 		rt6_info_init(rt);
2120 		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
2121 
2122 		new = &rt->dst;
2123 		new->__use = 1;
2124 		new->input = dst_discard;
2125 		new->output = dst_discard_out;
2126 
2127 		dst_copy_metrics(new, &ort->dst);
2128 
2129 		rt->rt6i_idev = in6_dev_get(loopback_dev);
2130 		rt->rt6i_gateway = ort->rt6i_gateway;
2131 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
2132 
2133 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
2134 #ifdef CONFIG_IPV6_SUBTREES
2135 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
2136 #endif
2137 	}
2138 
2139 	dst_release(dst_orig);
2140 	return new ? new : ERR_PTR(-ENOMEM);
2141 }
2142 
2143 /*
2144  *	Destination cache support functions
2145  */
2146 
2147 static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2148 {
2149 	u32 rt_cookie = 0;
2150 
2151 	if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2152 		return false;
2153 
2154 	if (fib6_check_expired(f6i))
2155 		return false;
2156 
2157 	return true;
2158 }
2159 
2160 static struct dst_entry *rt6_check(struct rt6_info *rt,
2161 				   struct fib6_info *from,
2162 				   u32 cookie)
2163 {
2164 	u32 rt_cookie = 0;
2165 
2166 	if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
2167 	    rt_cookie != cookie)
2168 		return NULL;
2169 
2170 	if (rt6_check_expired(rt))
2171 		return NULL;
2172 
2173 	return &rt->dst;
2174 }
2175 
2176 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2177 					    struct fib6_info *from,
2178 					    u32 cookie)
2179 {
2180 	if (!__rt6_check_expired(rt) &&
2181 	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2182 	    fib6_check(from, cookie))
2183 		return &rt->dst;
2184 	else
2185 		return NULL;
2186 }
2187 
2188 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
2189 {
2190 	struct dst_entry *dst_ret;
2191 	struct fib6_info *from;
2192 	struct rt6_info *rt;
2193 
2194 	rt = container_of(dst, struct rt6_info, dst);
2195 
2196 	rcu_read_lock();
2197 
2198 	/* All IPV6 dsts are created with ->obsolete set to the value
2199 	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
2200 	 * into this function always.
2201 	 */
2202 
2203 	from = rcu_dereference(rt->from);
2204 
2205 	if (from && (rt->rt6i_flags & RTF_PCPU ||
2206 	    unlikely(!list_empty(&rt->rt6i_uncached))))
2207 		dst_ret = rt6_dst_from_check(rt, from, cookie);
2208 	else
2209 		dst_ret = rt6_check(rt, from, cookie);
2210 
2211 	rcu_read_unlock();
2212 
2213 	return dst_ret;
2214 }
2215 
2216 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
2217 {
2218 	struct rt6_info *rt = (struct rt6_info *) dst;
2219 
2220 	if (rt) {
2221 		if (rt->rt6i_flags & RTF_CACHE) {
2222 			rcu_read_lock();
2223 			if (rt6_check_expired(rt)) {
2224 				rt6_remove_exception_rt(rt);
2225 				dst = NULL;
2226 			}
2227 			rcu_read_unlock();
2228 		} else {
2229 			dst_release(dst);
2230 			dst = NULL;
2231 		}
2232 	}
2233 	return dst;
2234 }
2235 
2236 static void ip6_link_failure(struct sk_buff *skb)
2237 {
2238 	struct rt6_info *rt;
2239 
2240 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2241 
2242 	rt = (struct rt6_info *) skb_dst(skb);
2243 	if (rt) {
2244 		rcu_read_lock();
2245 		if (rt->rt6i_flags & RTF_CACHE) {
2246 			if (dst_hold_safe(&rt->dst))
2247 				rt6_remove_exception_rt(rt);
2248 		} else {
2249 			struct fib6_info *from;
2250 			struct fib6_node *fn;
2251 
2252 			from = rcu_dereference(rt->from);
2253 			if (from) {
2254 				fn = rcu_dereference(from->fib6_node);
2255 				if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2256 					fn->fn_sernum = -1;
2257 			}
2258 		}
2259 		rcu_read_unlock();
2260 	}
2261 }
2262 
2263 static void rt6_update_expires(struct rt6_info *rt0, int timeout)
2264 {
2265 	if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
2266 		struct fib6_info *from;
2267 
2268 		rcu_read_lock();
2269 		from = rcu_dereference(rt0->from);
2270 		if (from)
2271 			rt0->dst.expires = from->expires;
2272 		rcu_read_unlock();
2273 	}
2274 
2275 	dst_set_expires(&rt0->dst, timeout);
2276 	rt0->rt6i_flags |= RTF_EXPIRES;
2277 }
2278 
2279 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2280 {
2281 	struct net *net = dev_net(rt->dst.dev);
2282 
2283 	dst_metric_set(&rt->dst, RTAX_MTU, mtu);
2284 	rt->rt6i_flags |= RTF_MODIFIED;
2285 	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2286 }
2287 
2288 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2289 {
2290 	bool from_set;
2291 
2292 	rcu_read_lock();
2293 	from_set = !!rcu_dereference(rt->from);
2294 	rcu_read_unlock();
2295 
2296 	return !(rt->rt6i_flags & RTF_CACHE) &&
2297 		(rt->rt6i_flags & RTF_PCPU || from_set);
2298 }
2299 
2300 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2301 				 const struct ipv6hdr *iph, u32 mtu)
2302 {
2303 	const struct in6_addr *daddr, *saddr;
2304 	struct rt6_info *rt6 = (struct rt6_info *)dst;
2305 
2306 	if (rt6->rt6i_flags & RTF_LOCAL)
2307 		return;
2308 
2309 	if (dst_metric_locked(dst, RTAX_MTU))
2310 		return;
2311 
2312 	if (iph) {
2313 		daddr = &iph->daddr;
2314 		saddr = &iph->saddr;
2315 	} else if (sk) {
2316 		daddr = &sk->sk_v6_daddr;
2317 		saddr = &inet6_sk(sk)->saddr;
2318 	} else {
2319 		daddr = NULL;
2320 		saddr = NULL;
2321 	}
2322 	dst_confirm_neigh(dst, daddr);
2323 	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2324 	if (mtu >= dst_mtu(dst))
2325 		return;
2326 
2327 	if (!rt6_cache_allowed_for_pmtu(rt6)) {
2328 		rt6_do_update_pmtu(rt6, mtu);
2329 		/* update rt6_ex->stamp for cache */
2330 		if (rt6->rt6i_flags & RTF_CACHE)
2331 			rt6_update_exception_stamp_rt(rt6);
2332 	} else if (daddr) {
2333 		struct fib6_info *from;
2334 		struct rt6_info *nrt6;
2335 
2336 		rcu_read_lock();
2337 		from = rcu_dereference(rt6->from);
2338 		nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
2339 		if (nrt6) {
2340 			rt6_do_update_pmtu(nrt6, mtu);
2341 			if (rt6_insert_exception(nrt6, from))
2342 				dst_release_immediate(&nrt6->dst);
2343 		}
2344 		rcu_read_unlock();
2345 	}
2346 }
2347 
2348 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2349 			       struct sk_buff *skb, u32 mtu)
2350 {
2351 	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2352 }
2353 
2354 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2355 		     int oif, u32 mark, kuid_t uid)
2356 {
2357 	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2358 	struct dst_entry *dst;
2359 	struct flowi6 fl6;
2360 
2361 	memset(&fl6, 0, sizeof(fl6));
2362 	fl6.flowi6_oif = oif;
2363 	fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
2364 	fl6.daddr = iph->daddr;
2365 	fl6.saddr = iph->saddr;
2366 	fl6.flowlabel = ip6_flowinfo(iph);
2367 	fl6.flowi6_uid = uid;
2368 
2369 	dst = ip6_route_output(net, NULL, &fl6);
2370 	if (!dst->error)
2371 		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2372 	dst_release(dst);
2373 }
2374 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2375 
2376 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2377 {
2378 	struct dst_entry *dst;
2379 
2380 	ip6_update_pmtu(skb, sock_net(sk), mtu,
2381 			sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);
2382 
2383 	dst = __sk_dst_get(sk);
2384 	if (!dst || !dst->obsolete ||
2385 	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2386 		return;
2387 
2388 	bh_lock_sock(sk);
2389 	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2390 		ip6_datagram_dst_update(sk, false);
2391 	bh_unlock_sock(sk);
2392 }
2393 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2394 
2395 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
2396 			   const struct flowi6 *fl6)
2397 {
2398 #ifdef CONFIG_IPV6_SUBTREES
2399 	struct ipv6_pinfo *np = inet6_sk(sk);
2400 #endif
2401 
2402 	ip6_dst_store(sk, dst,
2403 		      ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
2404 		      &sk->sk_v6_daddr : NULL,
2405 #ifdef CONFIG_IPV6_SUBTREES
2406 		      ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
2407 		      &np->saddr :
2408 #endif
2409 		      NULL);
2410 }
2411 
2412 /* Handle redirects */
2413 struct ip6rd_flowi {
2414 	struct flowi6 fl6;
2415 	struct in6_addr gateway;
2416 };
2417 
2418 static struct rt6_info *__ip6_route_redirect(struct net *net,
2419 					     struct fib6_table *table,
2420 					     struct flowi6 *fl6,
2421 					     const struct sk_buff *skb,
2422 					     int flags)
2423 {
2424 	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2425 	struct rt6_info *ret = NULL, *rt_cache;
2426 	struct fib6_info *rt;
2427 	struct fib6_node *fn;
2428 
2429 	/* Get the "current" route for this destination and
2430 	 * check if the redirect has come from appropriate router.
2431 	 *
2432 	 * RFC 4861 specifies that redirects should only be
2433 	 * accepted if they come from the nexthop to the target.
2434 	 * Due to the way the routes are chosen, this notion
2435 	 * is a bit fuzzy and one might need to check all possible
2436 	 * routes.
2437 	 */
2438 
2439 	rcu_read_lock();
2440 	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2441 restart:
2442 	for_each_fib6_node_rt_rcu(fn) {
2443 		if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
2444 			continue;
2445 		if (fib6_check_expired(rt))
2446 			continue;
2447 		if (rt->fib6_flags & RTF_REJECT)
2448 			break;
2449 		if (!(rt->fib6_flags & RTF_GATEWAY))
2450 			continue;
2451 		if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
2452 			continue;
2453 		/* rt_cache's gateway might be different from its 'parent'
2454 		 * in the case of an ip redirect.
2455 		 * So we keep searching in the exception table if the gateway
2456 		 * is different.
2457 		 */
2458 		if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) {
2459 			rt_cache = rt6_find_cached_rt(rt,
2460 						      &fl6->daddr,
2461 						      &fl6->saddr);
2462 			if (rt_cache &&
2463 			    ipv6_addr_equal(&rdfl->gateway,
2464 					    &rt_cache->rt6i_gateway)) {
2465 				ret = rt_cache;
2466 				break;
2467 			}
2468 			continue;
2469 		}
2470 		break;
2471 	}
2472 
2473 	if (!rt)
2474 		rt = net->ipv6.fib6_null_entry;
2475 	else if (rt->fib6_flags & RTF_REJECT) {
2476 		ret = net->ipv6.ip6_null_entry;
2477 		goto out;
2478 	}
2479 
2480 	if (rt == net->ipv6.fib6_null_entry) {
2481 		fn = fib6_backtrack(fn, &fl6->saddr);
2482 		if (fn)
2483 			goto restart;
2484 	}
2485 
2486 out:
2487 	if (ret)
2488 		dst_hold(&ret->dst);
2489 	else
2490 		ret = ip6_create_rt_rcu(rt);
2491 
2492 	rcu_read_unlock();
2493 
2494 	trace_fib6_table_lookup(net, ret, table, fl6);
2495 	return ret;
2496 };
2497 
2498 static struct dst_entry *ip6_route_redirect(struct net *net,
2499 					    const struct flowi6 *fl6,
2500 					    const struct sk_buff *skb,
2501 					    const struct in6_addr *gateway)
2502 {
2503 	int flags = RT6_LOOKUP_F_HAS_SADDR;
2504 	struct ip6rd_flowi rdfl;
2505 
2506 	rdfl.fl6 = *fl6;
2507 	rdfl.gateway = *gateway;
2508 
2509 	return fib6_rule_lookup(net, &rdfl.fl6, skb,
2510 				flags, __ip6_route_redirect);
2511 }
2512 
2513 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2514 		  kuid_t uid)
2515 {
2516 	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2517 	struct dst_entry *dst;
2518 	struct flowi6 fl6;
2519 
2520 	memset(&fl6, 0, sizeof(fl6));
2521 	fl6.flowi6_iif = LOOPBACK_IFINDEX;
2522 	fl6.flowi6_oif = oif;
2523 	fl6.flowi6_mark = mark;
2524 	fl6.daddr = iph->daddr;
2525 	fl6.saddr = iph->saddr;
2526 	fl6.flowlabel = ip6_flowinfo(iph);
2527 	fl6.flowi6_uid = uid;
2528 
2529 	dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2530 	rt6_do_redirect(dst, NULL, skb);
2531 	dst_release(dst);
2532 }
2533 EXPORT_SYMBOL_GPL(ip6_redirect);
2534 
2535 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2536 			    u32 mark)
2537 {
2538 	const struct ipv6hdr *iph = ipv6_hdr(skb);
2539 	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2540 	struct dst_entry *dst;
2541 	struct flowi6 fl6;
2542 
2543 	memset(&fl6, 0, sizeof(fl6));
2544 	fl6.flowi6_iif = LOOPBACK_IFINDEX;
2545 	fl6.flowi6_oif = oif;
2546 	fl6.flowi6_mark = mark;
2547 	fl6.daddr = msg->dest;
2548 	fl6.saddr = iph->daddr;
2549 	fl6.flowi6_uid = sock_net_uid(net, NULL);
2550 
2551 	dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2552 	rt6_do_redirect(dst, NULL, skb);
2553 	dst_release(dst);
2554 }
2555 
2556 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2557 {
2558 	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2559 		     sk->sk_uid);
2560 }
2561 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2562 
2563 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2564 {
2565 	struct net_device *dev = dst->dev;
2566 	unsigned int mtu = dst_mtu(dst);
2567 	struct net *net = dev_net(dev);
2568 
2569 	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2570 
2571 	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2572 		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2573 
2574 	/*
2575 	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2576 	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2577 	 * IPV6_MAXPLEN is also valid and means: "any MSS,
2578 	 * rely only on pmtu discovery"
2579 	 */
2580 	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2581 		mtu = IPV6_MAXPLEN;
2582 	return mtu;
2583 }
2584 
2585 static unsigned int ip6_mtu(const struct dst_entry *dst)
2586 {
2587 	struct inet6_dev *idev;
2588 	unsigned int mtu;
2589 
2590 	mtu = dst_metric_raw(dst, RTAX_MTU);
2591 	if (mtu)
2592 		goto out;
2593 
2594 	mtu = IPV6_MIN_MTU;
2595 
2596 	rcu_read_lock();
2597 	idev = __in6_dev_get(dst->dev);
2598 	if (idev)
2599 		mtu = idev->cnf.mtu6;
2600 	rcu_read_unlock();
2601 
2602 out:
2603 	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2604 
2605 	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2606 }
2607 
2608 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2609 				  struct flowi6 *fl6)
2610 {
2611 	struct dst_entry *dst;
2612 	struct rt6_info *rt;
2613 	struct inet6_dev *idev = in6_dev_get(dev);
2614 	struct net *net = dev_net(dev);
2615 
2616 	if (unlikely(!idev))
2617 		return ERR_PTR(-ENODEV);
2618 
2619 	rt = ip6_dst_alloc(net, dev, 0);
2620 	if (unlikely(!rt)) {
2621 		in6_dev_put(idev);
2622 		dst = ERR_PTR(-ENOMEM);
2623 		goto out;
2624 	}
2625 
2626 	rt->dst.flags |= DST_HOST;
2627 	rt->dst.input = ip6_input;
2628 	rt->dst.output  = ip6_output;
2629 	rt->rt6i_gateway  = fl6->daddr;
2630 	rt->rt6i_dst.addr = fl6->daddr;
2631 	rt->rt6i_dst.plen = 128;
2632 	rt->rt6i_idev     = idev;
2633 	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2634 
2635 	/* Add this dst into uncached_list so that rt6_disable_ip() can
2636 	 * do proper release of the net_device
2637 	 */
2638 	rt6_uncached_list_add(rt);
2639 	atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2640 
2641 	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2642 
2643 out:
2644 	return dst;
2645 }
2646 
2647 static int ip6_dst_gc(struct dst_ops *ops)
2648 {
2649 	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2650 	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2651 	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2652 	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2653 	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2654 	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2655 	int entries;
2656 
2657 	entries = dst_entries_get_fast(ops);
2658 	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2659 	    entries <= rt_max_size)
2660 		goto out;
2661 
2662 	net->ipv6.ip6_rt_gc_expire++;
2663 	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2664 	entries = dst_entries_get_slow(ops);
2665 	if (entries < ops->gc_thresh)
2666 		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2667 out:
2668 	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2669 	return entries > rt_max_size;
2670 }
2671 
2672 static int ip6_convert_metrics(struct net *net, struct fib6_info *rt,
2673 			       struct fib6_config *cfg)
2674 {
2675 	struct dst_metrics *p;
2676 
2677 	if (!cfg->fc_mx)
2678 		return 0;
2679 
2680 	p = kzalloc(sizeof(*rt->fib6_metrics), GFP_KERNEL);
2681 	if (unlikely(!p))
2682 		return -ENOMEM;
2683 
2684 	refcount_set(&p->refcnt, 1);
2685 	rt->fib6_metrics = p;
2686 
2687 	return ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len, p->metrics);
2688 }
2689 
2690 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2691 					    struct fib6_config *cfg,
2692 					    const struct in6_addr *gw_addr,
2693 					    u32 tbid, int flags)
2694 {
2695 	struct flowi6 fl6 = {
2696 		.flowi6_oif = cfg->fc_ifindex,
2697 		.daddr = *gw_addr,
2698 		.saddr = cfg->fc_prefsrc,
2699 	};
2700 	struct fib6_table *table;
2701 	struct rt6_info *rt;
2702 
2703 	table = fib6_get_table(net, tbid);
2704 	if (!table)
2705 		return NULL;
2706 
2707 	if (!ipv6_addr_any(&cfg->fc_prefsrc))
2708 		flags |= RT6_LOOKUP_F_HAS_SADDR;
2709 
2710 	flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2711 	rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2712 
2713 	/* if table lookup failed, fall back to full lookup */
2714 	if (rt == net->ipv6.ip6_null_entry) {
2715 		ip6_rt_put(rt);
2716 		rt = NULL;
2717 	}
2718 
2719 	return rt;
2720 }
2721 
2722 static int ip6_route_check_nh_onlink(struct net *net,
2723 				     struct fib6_config *cfg,
2724 				     const struct net_device *dev,
2725 				     struct netlink_ext_ack *extack)
2726 {
2727 	u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2728 	const struct in6_addr *gw_addr = &cfg->fc_gateway;
2729 	u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2730 	struct rt6_info *grt;
2731 	int err;
2732 
2733 	err = 0;
2734 	grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2735 	if (grt) {
2736 		if (!grt->dst.error &&
2737 		    (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2738 			NL_SET_ERR_MSG(extack,
2739 				       "Nexthop has invalid gateway or device mismatch");
2740 			err = -EINVAL;
2741 		}
2742 
2743 		ip6_rt_put(grt);
2744 	}
2745 
2746 	return err;
2747 }
2748 
2749 static int ip6_route_check_nh(struct net *net,
2750 			      struct fib6_config *cfg,
2751 			      struct net_device **_dev,
2752 			      struct inet6_dev **idev)
2753 {
2754 	const struct in6_addr *gw_addr = &cfg->fc_gateway;
2755 	struct net_device *dev = _dev ? *_dev : NULL;
2756 	struct rt6_info *grt = NULL;
2757 	int err = -EHOSTUNREACH;
2758 
2759 	if (cfg->fc_table) {
2760 		int flags = RT6_LOOKUP_F_IFACE;
2761 
2762 		grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2763 					  cfg->fc_table, flags);
2764 		if (grt) {
2765 			if (grt->rt6i_flags & RTF_GATEWAY ||
2766 			    (dev && dev != grt->dst.dev)) {
2767 				ip6_rt_put(grt);
2768 				grt = NULL;
2769 			}
2770 		}
2771 	}
2772 
2773 	if (!grt)
2774 		grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
2775 
2776 	if (!grt)
2777 		goto out;
2778 
2779 	if (dev) {
2780 		if (dev != grt->dst.dev) {
2781 			ip6_rt_put(grt);
2782 			goto out;
2783 		}
2784 	} else {
2785 		*_dev = dev = grt->dst.dev;
2786 		*idev = grt->rt6i_idev;
2787 		dev_hold(dev);
2788 		in6_dev_hold(grt->rt6i_idev);
2789 	}
2790 
2791 	if (!(grt->rt6i_flags & RTF_GATEWAY))
2792 		err = 0;
2793 
2794 	ip6_rt_put(grt);
2795 
2796 out:
2797 	return err;
2798 }
2799 
2800 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
2801 			   struct net_device **_dev, struct inet6_dev **idev,
2802 			   struct netlink_ext_ack *extack)
2803 {
2804 	const struct in6_addr *gw_addr = &cfg->fc_gateway;
2805 	int gwa_type = ipv6_addr_type(gw_addr);
2806 	bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
2807 	const struct net_device *dev = *_dev;
2808 	bool need_addr_check = !dev;
2809 	int err = -EINVAL;
2810 
2811 	/* if gw_addr is local we will fail to detect this in case
2812 	 * address is still TENTATIVE (DAD in progress). rt6_lookup()
2813 	 * will return already-added prefix route via interface that
2814 	 * prefix route was assigned to, which might be non-loopback.
2815 	 */
2816 	if (dev &&
2817 	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2818 		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2819 		goto out;
2820 	}
2821 
2822 	if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
2823 		/* IPv6 strictly inhibits using not link-local
2824 		 * addresses as nexthop address.
2825 		 * Otherwise, router will not able to send redirects.
2826 		 * It is very good, but in some (rare!) circumstances
2827 		 * (SIT, PtP, NBMA NOARP links) it is handy to allow
2828 		 * some exceptions. --ANK
2829 		 * We allow IPv4-mapped nexthops to support RFC4798-type
2830 		 * addressing
2831 		 */
2832 		if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
2833 			NL_SET_ERR_MSG(extack, "Invalid gateway address");
2834 			goto out;
2835 		}
2836 
2837 		if (cfg->fc_flags & RTNH_F_ONLINK)
2838 			err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
2839 		else
2840 			err = ip6_route_check_nh(net, cfg, _dev, idev);
2841 
2842 		if (err)
2843 			goto out;
2844 	}
2845 
2846 	/* reload in case device was changed */
2847 	dev = *_dev;
2848 
2849 	err = -EINVAL;
2850 	if (!dev) {
2851 		NL_SET_ERR_MSG(extack, "Egress device not specified");
2852 		goto out;
2853 	} else if (dev->flags & IFF_LOOPBACK) {
2854 		NL_SET_ERR_MSG(extack,
2855 			       "Egress device can not be loopback device for this route");
2856 		goto out;
2857 	}
2858 
2859 	/* if we did not check gw_addr above, do so now that the
2860 	 * egress device has been resolved.
2861 	 */
2862 	if (need_addr_check &&
2863 	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2864 		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2865 		goto out;
2866 	}
2867 
2868 	err = 0;
2869 out:
2870 	return err;
2871 }
2872 
2873 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
2874 					      gfp_t gfp_flags,
2875 					      struct netlink_ext_ack *extack)
2876 {
2877 	struct net *net = cfg->fc_nlinfo.nl_net;
2878 	struct fib6_info *rt = NULL;
2879 	struct net_device *dev = NULL;
2880 	struct inet6_dev *idev = NULL;
2881 	struct fib6_table *table;
2882 	int addr_type;
2883 	int err = -EINVAL;
2884 
2885 	/* RTF_PCPU is an internal flag; can not be set by userspace */
2886 	if (cfg->fc_flags & RTF_PCPU) {
2887 		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2888 		goto out;
2889 	}
2890 
2891 	/* RTF_CACHE is an internal flag; can not be set by userspace */
2892 	if (cfg->fc_flags & RTF_CACHE) {
2893 		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
2894 		goto out;
2895 	}
2896 
2897 	if (cfg->fc_type > RTN_MAX) {
2898 		NL_SET_ERR_MSG(extack, "Invalid route type");
2899 		goto out;
2900 	}
2901 
2902 	if (cfg->fc_dst_len > 128) {
2903 		NL_SET_ERR_MSG(extack, "Invalid prefix length");
2904 		goto out;
2905 	}
2906 	if (cfg->fc_src_len > 128) {
2907 		NL_SET_ERR_MSG(extack, "Invalid source address length");
2908 		goto out;
2909 	}
2910 #ifndef CONFIG_IPV6_SUBTREES
2911 	if (cfg->fc_src_len) {
2912 		NL_SET_ERR_MSG(extack,
2913 			       "Specifying source address requires IPV6_SUBTREES to be enabled");
2914 		goto out;
2915 	}
2916 #endif
2917 	if (cfg->fc_ifindex) {
2918 		err = -ENODEV;
2919 		dev = dev_get_by_index(net, cfg->fc_ifindex);
2920 		if (!dev)
2921 			goto out;
2922 		idev = in6_dev_get(dev);
2923 		if (!idev)
2924 			goto out;
2925 	}
2926 
2927 	if (cfg->fc_metric == 0)
2928 		cfg->fc_metric = IP6_RT_PRIO_USER;
2929 
2930 	if (cfg->fc_flags & RTNH_F_ONLINK) {
2931 		if (!dev) {
2932 			NL_SET_ERR_MSG(extack,
2933 				       "Nexthop device required for onlink");
2934 			err = -ENODEV;
2935 			goto out;
2936 		}
2937 
2938 		if (!(dev->flags & IFF_UP)) {
2939 			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
2940 			err = -ENETDOWN;
2941 			goto out;
2942 		}
2943 	}
2944 
2945 	err = -ENOBUFS;
2946 	if (cfg->fc_nlinfo.nlh &&
2947 	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
2948 		table = fib6_get_table(net, cfg->fc_table);
2949 		if (!table) {
2950 			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
2951 			table = fib6_new_table(net, cfg->fc_table);
2952 		}
2953 	} else {
2954 		table = fib6_new_table(net, cfg->fc_table);
2955 	}
2956 
2957 	if (!table)
2958 		goto out;
2959 
2960 	err = -ENOMEM;
2961 	rt = fib6_info_alloc(gfp_flags);
2962 	if (!rt)
2963 		goto out;
2964 
2965 	if (cfg->fc_flags & RTF_ADDRCONF)
2966 		rt->dst_nocount = true;
2967 
2968 	err = ip6_convert_metrics(net, rt, cfg);
2969 	if (err < 0)
2970 		goto out;
2971 
2972 	if (cfg->fc_flags & RTF_EXPIRES)
2973 		fib6_set_expires(rt, jiffies +
2974 				clock_t_to_jiffies(cfg->fc_expires));
2975 	else
2976 		fib6_clean_expires(rt);
2977 
2978 	if (cfg->fc_protocol == RTPROT_UNSPEC)
2979 		cfg->fc_protocol = RTPROT_BOOT;
2980 	rt->fib6_protocol = cfg->fc_protocol;
2981 
2982 	addr_type = ipv6_addr_type(&cfg->fc_dst);
2983 
2984 	if (cfg->fc_encap) {
2985 		struct lwtunnel_state *lwtstate;
2986 
2987 		err = lwtunnel_build_state(cfg->fc_encap_type,
2988 					   cfg->fc_encap, AF_INET6, cfg,
2989 					   &lwtstate, extack);
2990 		if (err)
2991 			goto out;
2992 		rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
2993 	}
2994 
2995 	ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
2996 	rt->fib6_dst.plen = cfg->fc_dst_len;
2997 	if (rt->fib6_dst.plen == 128)
2998 		rt->dst_host = true;
2999 
3000 #ifdef CONFIG_IPV6_SUBTREES
3001 	ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
3002 	rt->fib6_src.plen = cfg->fc_src_len;
3003 #endif
3004 
3005 	rt->fib6_metric = cfg->fc_metric;
3006 	rt->fib6_nh.nh_weight = 1;
3007 
3008 	rt->fib6_type = cfg->fc_type;
3009 
3010 	/* We cannot add true routes via loopback here,
3011 	   they would result in kernel looping; promote them to reject routes
3012 	 */
3013 	if ((cfg->fc_flags & RTF_REJECT) ||
3014 	    (dev && (dev->flags & IFF_LOOPBACK) &&
3015 	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
3016 	     !(cfg->fc_flags & RTF_LOCAL))) {
3017 		/* hold loopback dev/idev if we haven't done so. */
3018 		if (dev != net->loopback_dev) {
3019 			if (dev) {
3020 				dev_put(dev);
3021 				in6_dev_put(idev);
3022 			}
3023 			dev = net->loopback_dev;
3024 			dev_hold(dev);
3025 			idev = in6_dev_get(dev);
3026 			if (!idev) {
3027 				err = -ENODEV;
3028 				goto out;
3029 			}
3030 		}
3031 		rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP;
3032 		goto install_route;
3033 	}
3034 
3035 	if (cfg->fc_flags & RTF_GATEWAY) {
3036 		err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
3037 		if (err)
3038 			goto out;
3039 
3040 		rt->fib6_nh.nh_gw = cfg->fc_gateway;
3041 	}
3042 
3043 	err = -ENODEV;
3044 	if (!dev)
3045 		goto out;
3046 
3047 	if (idev->cnf.disable_ipv6) {
3048 		NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
3049 		err = -EACCES;
3050 		goto out;
3051 	}
3052 
3053 	if (!(dev->flags & IFF_UP)) {
3054 		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3055 		err = -ENETDOWN;
3056 		goto out;
3057 	}
3058 
3059 	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
3060 		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
3061 			NL_SET_ERR_MSG(extack, "Invalid source address");
3062 			err = -EINVAL;
3063 			goto out;
3064 		}
3065 		rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
3066 		rt->fib6_prefsrc.plen = 128;
3067 	} else
3068 		rt->fib6_prefsrc.plen = 0;
3069 
3070 	rt->fib6_flags = cfg->fc_flags;
3071 
3072 install_route:
3073 	if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
3074 	    !netif_carrier_ok(dev))
3075 		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
3076 	rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
3077 	rt->fib6_nh.nh_dev = dev;
3078 	rt->fib6_table = table;
3079 
3080 	cfg->fc_nlinfo.nl_net = dev_net(dev);
3081 
3082 	if (idev)
3083 		in6_dev_put(idev);
3084 
3085 	return rt;
3086 out:
3087 	if (dev)
3088 		dev_put(dev);
3089 	if (idev)
3090 		in6_dev_put(idev);
3091 
3092 	fib6_info_release(rt);
3093 	return ERR_PTR(err);
3094 }
3095 
3096 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3097 		  struct netlink_ext_ack *extack)
3098 {
3099 	struct fib6_info *rt;
3100 	int err;
3101 
3102 	rt = ip6_route_info_create(cfg, gfp_flags, extack);
3103 	if (IS_ERR(rt))
3104 		return PTR_ERR(rt);
3105 
3106 	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3107 	fib6_info_release(rt);
3108 
3109 	return err;
3110 }
3111 
3112 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3113 {
3114 	struct net *net = info->nl_net;
3115 	struct fib6_table *table;
3116 	int err;
3117 
3118 	if (rt == net->ipv6.fib6_null_entry) {
3119 		err = -ENOENT;
3120 		goto out;
3121 	}
3122 
3123 	table = rt->fib6_table;
3124 	spin_lock_bh(&table->tb6_lock);
3125 	err = fib6_del(rt, info);
3126 	spin_unlock_bh(&table->tb6_lock);
3127 
3128 out:
3129 	fib6_info_release(rt);
3130 	return err;
3131 }
3132 
3133 int ip6_del_rt(struct net *net, struct fib6_info *rt)
3134 {
3135 	struct nl_info info = { .nl_net = net };
3136 
3137 	return __ip6_del_rt(rt, &info);
3138 }
3139 
3140 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
3141 {
3142 	struct nl_info *info = &cfg->fc_nlinfo;
3143 	struct net *net = info->nl_net;
3144 	struct sk_buff *skb = NULL;
3145 	struct fib6_table *table;
3146 	int err = -ENOENT;
3147 
3148 	if (rt == net->ipv6.fib6_null_entry)
3149 		goto out_put;
3150 	table = rt->fib6_table;
3151 	spin_lock_bh(&table->tb6_lock);
3152 
3153 	if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
3154 		struct fib6_info *sibling, *next_sibling;
3155 
3156 		/* prefer to send a single notification with all hops */
3157 		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3158 		if (skb) {
3159 			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3160 
3161 			if (rt6_fill_node(net, skb, rt, NULL,
3162 					  NULL, NULL, 0, RTM_DELROUTE,
3163 					  info->portid, seq, 0) < 0) {
3164 				kfree_skb(skb);
3165 				skb = NULL;
3166 			} else
3167 				info->skip_notify = 1;
3168 		}
3169 
3170 		list_for_each_entry_safe(sibling, next_sibling,
3171 					 &rt->fib6_siblings,
3172 					 fib6_siblings) {
3173 			err = fib6_del(sibling, info);
3174 			if (err)
3175 				goto out_unlock;
3176 		}
3177 	}
3178 
3179 	err = fib6_del(rt, info);
3180 out_unlock:
3181 	spin_unlock_bh(&table->tb6_lock);
3182 out_put:
3183 	fib6_info_release(rt);
3184 
3185 	if (skb) {
3186 		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3187 			    info->nlh, gfp_any());
3188 	}
3189 	return err;
3190 }
3191 
3192 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3193 {
3194 	int rc = -ESRCH;
3195 
3196 	if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3197 		goto out;
3198 
3199 	if (cfg->fc_flags & RTF_GATEWAY &&
3200 	    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3201 		goto out;
3202 	if (dst_hold_safe(&rt->dst))
3203 		rc = rt6_remove_exception_rt(rt);
3204 out:
3205 	return rc;
3206 }
3207 
3208 static int ip6_route_del(struct fib6_config *cfg,
3209 			 struct netlink_ext_ack *extack)
3210 {
3211 	struct rt6_info *rt_cache;
3212 	struct fib6_table *table;
3213 	struct fib6_info *rt;
3214 	struct fib6_node *fn;
3215 	int err = -ESRCH;
3216 
3217 	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3218 	if (!table) {
3219 		NL_SET_ERR_MSG(extack, "FIB table does not exist");
3220 		return err;
3221 	}
3222 
3223 	rcu_read_lock();
3224 
3225 	fn = fib6_locate(&table->tb6_root,
3226 			 &cfg->fc_dst, cfg->fc_dst_len,
3227 			 &cfg->fc_src, cfg->fc_src_len,
3228 			 !(cfg->fc_flags & RTF_CACHE));
3229 
3230 	if (fn) {
3231 		for_each_fib6_node_rt_rcu(fn) {
3232 			if (cfg->fc_flags & RTF_CACHE) {
3233 				int rc;
3234 
3235 				rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
3236 							      &cfg->fc_src);
3237 				if (rt_cache) {
3238 					rc = ip6_del_cached_rt(rt_cache, cfg);
3239 					if (rc != -ESRCH)
3240 						return rc;
3241 				}
3242 				continue;
3243 			}
3244 			if (cfg->fc_ifindex &&
3245 			    (!rt->fib6_nh.nh_dev ||
3246 			     rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex))
3247 				continue;
3248 			if (cfg->fc_flags & RTF_GATEWAY &&
3249 			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
3250 				continue;
3251 			if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
3252 				continue;
3253 			if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
3254 				continue;
3255 			fib6_info_hold(rt);
3256 			rcu_read_unlock();
3257 
3258 			/* if gateway was specified only delete the one hop */
3259 			if (cfg->fc_flags & RTF_GATEWAY)
3260 				return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3261 
3262 			return __ip6_del_rt_siblings(rt, cfg);
3263 		}
3264 	}
3265 	rcu_read_unlock();
3266 
3267 	return err;
3268 }
3269 
3270 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3271 {
3272 	struct netevent_redirect netevent;
3273 	struct rt6_info *rt, *nrt = NULL;
3274 	struct ndisc_options ndopts;
3275 	struct inet6_dev *in6_dev;
3276 	struct neighbour *neigh;
3277 	struct fib6_info *from;
3278 	struct rd_msg *msg;
3279 	int optlen, on_link;
3280 	u8 *lladdr;
3281 
3282 	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3283 	optlen -= sizeof(*msg);
3284 
3285 	if (optlen < 0) {
3286 		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3287 		return;
3288 	}
3289 
3290 	msg = (struct rd_msg *)icmp6_hdr(skb);
3291 
3292 	if (ipv6_addr_is_multicast(&msg->dest)) {
3293 		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3294 		return;
3295 	}
3296 
3297 	on_link = 0;
3298 	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3299 		on_link = 1;
3300 	} else if (ipv6_addr_type(&msg->target) !=
3301 		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3302 		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3303 		return;
3304 	}
3305 
3306 	in6_dev = __in6_dev_get(skb->dev);
3307 	if (!in6_dev)
3308 		return;
3309 	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3310 		return;
3311 
3312 	/* RFC2461 8.1:
3313 	 *	The IP source address of the Redirect MUST be the same as the current
3314 	 *	first-hop router for the specified ICMP Destination Address.
3315 	 */
3316 
3317 	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3318 		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3319 		return;
3320 	}
3321 
3322 	lladdr = NULL;
3323 	if (ndopts.nd_opts_tgt_lladdr) {
3324 		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3325 					     skb->dev);
3326 		if (!lladdr) {
3327 			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3328 			return;
3329 		}
3330 	}
3331 
3332 	rt = (struct rt6_info *) dst;
3333 	if (rt->rt6i_flags & RTF_REJECT) {
3334 		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3335 		return;
3336 	}
3337 
3338 	/* Redirect received -> path was valid.
3339 	 * Look, redirects are sent only in response to data packets,
3340 	 * so that this nexthop apparently is reachable. --ANK
3341 	 */
3342 	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3343 
3344 	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3345 	if (!neigh)
3346 		return;
3347 
3348 	/*
3349 	 *	We have finally decided to accept it.
3350 	 */
3351 
3352 	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3353 		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
3354 		     NEIGH_UPDATE_F_OVERRIDE|
3355 		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3356 				     NEIGH_UPDATE_F_ISROUTER)),
3357 		     NDISC_REDIRECT, &ndopts);
3358 
3359 	rcu_read_lock();
3360 	from = rcu_dereference(rt->from);
3361 	fib6_info_hold(from);
3362 	rcu_read_unlock();
3363 
3364 	nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
3365 	if (!nrt)
3366 		goto out;
3367 
3368 	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3369 	if (on_link)
3370 		nrt->rt6i_flags &= ~RTF_GATEWAY;
3371 
3372 	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3373 
3374 	/* No need to remove rt from the exception table if rt is
3375 	 * a cached route because rt6_insert_exception() will
3376 	 * takes care of it
3377 	 */
3378 	if (rt6_insert_exception(nrt, from)) {
3379 		dst_release_immediate(&nrt->dst);
3380 		goto out;
3381 	}
3382 
3383 	netevent.old = &rt->dst;
3384 	netevent.new = &nrt->dst;
3385 	netevent.daddr = &msg->dest;
3386 	netevent.neigh = neigh;
3387 	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3388 
3389 out:
3390 	fib6_info_release(from);
3391 	neigh_release(neigh);
3392 }
3393 
3394 #ifdef CONFIG_IPV6_ROUTE_INFO
3395 static struct fib6_info *rt6_get_route_info(struct net *net,
3396 					   const struct in6_addr *prefix, int prefixlen,
3397 					   const struct in6_addr *gwaddr,
3398 					   struct net_device *dev)
3399 {
3400 	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3401 	int ifindex = dev->ifindex;
3402 	struct fib6_node *fn;
3403 	struct fib6_info *rt = NULL;
3404 	struct fib6_table *table;
3405 
3406 	table = fib6_get_table(net, tb_id);
3407 	if (!table)
3408 		return NULL;
3409 
3410 	rcu_read_lock();
3411 	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3412 	if (!fn)
3413 		goto out;
3414 
3415 	for_each_fib6_node_rt_rcu(fn) {
3416 		if (rt->fib6_nh.nh_dev->ifindex != ifindex)
3417 			continue;
3418 		if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3419 			continue;
3420 		if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
3421 			continue;
3422 		fib6_info_hold(rt);
3423 		break;
3424 	}
3425 out:
3426 	rcu_read_unlock();
3427 	return rt;
3428 }
3429 
3430 static struct fib6_info *rt6_add_route_info(struct net *net,
3431 					   const struct in6_addr *prefix, int prefixlen,
3432 					   const struct in6_addr *gwaddr,
3433 					   struct net_device *dev,
3434 					   unsigned int pref)
3435 {
3436 	struct fib6_config cfg = {
3437 		.fc_metric	= IP6_RT_PRIO_USER,
3438 		.fc_ifindex	= dev->ifindex,
3439 		.fc_dst_len	= prefixlen,
3440 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3441 				  RTF_UP | RTF_PREF(pref),
3442 		.fc_protocol = RTPROT_RA,
3443 		.fc_type = RTN_UNICAST,
3444 		.fc_nlinfo.portid = 0,
3445 		.fc_nlinfo.nlh = NULL,
3446 		.fc_nlinfo.nl_net = net,
3447 	};
3448 
3449 	cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3450 	cfg.fc_dst = *prefix;
3451 	cfg.fc_gateway = *gwaddr;
3452 
3453 	/* We should treat it as a default route if prefix length is 0. */
3454 	if (!prefixlen)
3455 		cfg.fc_flags |= RTF_DEFAULT;
3456 
3457 	ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3458 
3459 	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3460 }
3461 #endif
3462 
3463 struct fib6_info *rt6_get_dflt_router(struct net *net,
3464 				     const struct in6_addr *addr,
3465 				     struct net_device *dev)
3466 {
3467 	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3468 	struct fib6_info *rt;
3469 	struct fib6_table *table;
3470 
3471 	table = fib6_get_table(net, tb_id);
3472 	if (!table)
3473 		return NULL;
3474 
3475 	rcu_read_lock();
3476 	for_each_fib6_node_rt_rcu(&table->tb6_root) {
3477 		if (dev == rt->fib6_nh.nh_dev &&
3478 		    ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3479 		    ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
3480 			break;
3481 	}
3482 	if (rt)
3483 		fib6_info_hold(rt);
3484 	rcu_read_unlock();
3485 	return rt;
3486 }
3487 
3488 struct fib6_info *rt6_add_dflt_router(struct net *net,
3489 				     const struct in6_addr *gwaddr,
3490 				     struct net_device *dev,
3491 				     unsigned int pref)
3492 {
3493 	struct fib6_config cfg = {
3494 		.fc_table	= l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3495 		.fc_metric	= IP6_RT_PRIO_USER,
3496 		.fc_ifindex	= dev->ifindex,
3497 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3498 				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3499 		.fc_protocol = RTPROT_RA,
3500 		.fc_type = RTN_UNICAST,
3501 		.fc_nlinfo.portid = 0,
3502 		.fc_nlinfo.nlh = NULL,
3503 		.fc_nlinfo.nl_net = net,
3504 	};
3505 
3506 	cfg.fc_gateway = *gwaddr;
3507 
3508 	if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3509 		struct fib6_table *table;
3510 
3511 		table = fib6_get_table(dev_net(dev), cfg.fc_table);
3512 		if (table)
3513 			table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3514 	}
3515 
3516 	return rt6_get_dflt_router(net, gwaddr, dev);
3517 }
3518 
3519 static void __rt6_purge_dflt_routers(struct net *net,
3520 				     struct fib6_table *table)
3521 {
3522 	struct fib6_info *rt;
3523 
3524 restart:
3525 	rcu_read_lock();
3526 	for_each_fib6_node_rt_rcu(&table->tb6_root) {
3527 		struct net_device *dev = fib6_info_nh_dev(rt);
3528 		struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;
3529 
3530 		if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3531 		    (!idev || idev->cnf.accept_ra != 2)) {
3532 			fib6_info_hold(rt);
3533 			rcu_read_unlock();
3534 			ip6_del_rt(net, rt);
3535 			goto restart;
3536 		}
3537 	}
3538 	rcu_read_unlock();
3539 
3540 	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3541 }
3542 
3543 void rt6_purge_dflt_routers(struct net *net)
3544 {
3545 	struct fib6_table *table;
3546 	struct hlist_head *head;
3547 	unsigned int h;
3548 
3549 	rcu_read_lock();
3550 
3551 	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3552 		head = &net->ipv6.fib_table_hash[h];
3553 		hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3554 			if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3555 				__rt6_purge_dflt_routers(net, table);
3556 		}
3557 	}
3558 
3559 	rcu_read_unlock();
3560 }
3561 
3562 static void rtmsg_to_fib6_config(struct net *net,
3563 				 struct in6_rtmsg *rtmsg,
3564 				 struct fib6_config *cfg)
3565 {
3566 	memset(cfg, 0, sizeof(*cfg));
3567 
3568 	cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3569 			 : RT6_TABLE_MAIN;
3570 	cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3571 	cfg->fc_metric = rtmsg->rtmsg_metric;
3572 	cfg->fc_expires = rtmsg->rtmsg_info;
3573 	cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3574 	cfg->fc_src_len = rtmsg->rtmsg_src_len;
3575 	cfg->fc_flags = rtmsg->rtmsg_flags;
3576 	cfg->fc_type = rtmsg->rtmsg_type;
3577 
3578 	cfg->fc_nlinfo.nl_net = net;
3579 
3580 	cfg->fc_dst = rtmsg->rtmsg_dst;
3581 	cfg->fc_src = rtmsg->rtmsg_src;
3582 	cfg->fc_gateway = rtmsg->rtmsg_gateway;
3583 }
3584 
3585 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3586 {
3587 	struct fib6_config cfg;
3588 	struct in6_rtmsg rtmsg;
3589 	int err;
3590 
3591 	switch (cmd) {
3592 	case SIOCADDRT:		/* Add a route */
3593 	case SIOCDELRT:		/* Delete a route */
3594 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3595 			return -EPERM;
3596 		err = copy_from_user(&rtmsg, arg,
3597 				     sizeof(struct in6_rtmsg));
3598 		if (err)
3599 			return -EFAULT;
3600 
3601 		rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3602 
3603 		rtnl_lock();
3604 		switch (cmd) {
3605 		case SIOCADDRT:
3606 			err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3607 			break;
3608 		case SIOCDELRT:
3609 			err = ip6_route_del(&cfg, NULL);
3610 			break;
3611 		default:
3612 			err = -EINVAL;
3613 		}
3614 		rtnl_unlock();
3615 
3616 		return err;
3617 	}
3618 
3619 	return -EINVAL;
3620 }
3621 
3622 /*
3623  *	Drop the packet on the floor
3624  */
3625 
3626 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3627 {
3628 	int type;
3629 	struct dst_entry *dst = skb_dst(skb);
3630 	switch (ipstats_mib_noroutes) {
3631 	case IPSTATS_MIB_INNOROUTES:
3632 		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3633 		if (type == IPV6_ADDR_ANY) {
3634 			IP6_INC_STATS(dev_net(dst->dev),
3635 				      __in6_dev_get_safely(skb->dev),
3636 				      IPSTATS_MIB_INADDRERRORS);
3637 			break;
3638 		}
3639 		/* FALLTHROUGH */
3640 	case IPSTATS_MIB_OUTNOROUTES:
3641 		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3642 			      ipstats_mib_noroutes);
3643 		break;
3644 	}
3645 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3646 	kfree_skb(skb);
3647 	return 0;
3648 }
3649 
3650 static int ip6_pkt_discard(struct sk_buff *skb)
3651 {
3652 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3653 }
3654 
3655 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3656 {
3657 	skb->dev = skb_dst(skb)->dev;
3658 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3659 }
3660 
3661 static int ip6_pkt_prohibit(struct sk_buff *skb)
3662 {
3663 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3664 }
3665 
3666 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3667 {
3668 	skb->dev = skb_dst(skb)->dev;
3669 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3670 }
3671 
3672 /*
3673  *	Allocate a dst for local (unicast / anycast) address.
3674  */
3675 
3676 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3677 				     struct inet6_dev *idev,
3678 				     const struct in6_addr *addr,
3679 				     bool anycast, gfp_t gfp_flags)
3680 {
3681 	u32 tb_id;
3682 	struct net_device *dev = idev->dev;
3683 	struct fib6_info *f6i;
3684 
3685 	f6i = fib6_info_alloc(gfp_flags);
3686 	if (!f6i)
3687 		return ERR_PTR(-ENOMEM);
3688 
3689 	f6i->dst_nocount = true;
3690 	f6i->dst_host = true;
3691 	f6i->fib6_protocol = RTPROT_KERNEL;
3692 	f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP;
3693 	if (anycast) {
3694 		f6i->fib6_type = RTN_ANYCAST;
3695 		f6i->fib6_flags |= RTF_ANYCAST;
3696 	} else {
3697 		f6i->fib6_type = RTN_LOCAL;
3698 		f6i->fib6_flags |= RTF_LOCAL;
3699 	}
3700 
3701 	f6i->fib6_nh.nh_gw = *addr;
3702 	dev_hold(dev);
3703 	f6i->fib6_nh.nh_dev = dev;
3704 	f6i->fib6_dst.addr = *addr;
3705 	f6i->fib6_dst.plen = 128;
3706 	tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3707 	f6i->fib6_table = fib6_get_table(net, tb_id);
3708 
3709 	return f6i;
3710 }
3711 
/* remove deleted ip from prefsrc entries
 *
 * Argument bundle handed to fib6_remove_prefsrc() through
 * fib6_clean_all().
 */
struct arg_dev_net_ip {
	struct net_device *dev;		/* device to match; NULL matches any */
	struct net *net;		/* namespace owning fib6_null_entry */
	struct in6_addr *addr;		/* address compared with fib6_prefsrc */
};
3718 
3719 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3720 {
3721 	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3722 	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3723 	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3724 
3725 	if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
3726 	    rt != net->ipv6.fib6_null_entry &&
3727 	    ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3728 		spin_lock_bh(&rt6_exception_lock);
3729 		/* remove prefsrc entry */
3730 		rt->fib6_prefsrc.plen = 0;
3731 		/* need to update cache as well */
3732 		rt6_exceptions_remove_prefsrc(rt);
3733 		spin_unlock_bh(&rt6_exception_lock);
3734 	}
3735 	return 0;
3736 }
3737 
3738 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3739 {
3740 	struct net *net = dev_net(ifp->idev->dev);
3741 	struct arg_dev_net_ip adni = {
3742 		.dev = ifp->idev->dev,
3743 		.net = net,
3744 		.addr = &ifp->addr,
3745 	};
3746 	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3747 }
3748 
3749 #define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3750 
3751 /* Remove routers and update dst entries when gateway turn into host. */
3752 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3753 {
3754 	struct in6_addr *gateway = (struct in6_addr *)arg;
3755 
3756 	if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3757 	    ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
3758 		return -1;
3759 	}
3760 
3761 	/* Further clean up cached routes in exception table.
3762 	 * This is needed because cached route may have a different
3763 	 * gateway than its 'parent' in the case of an ip redirect.
3764 	 */
3765 	rt6_exceptions_clean_tohost(rt, gateway);
3766 
3767 	return 0;
3768 }
3769 
3770 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3771 {
3772 	fib6_clean_all(net, fib6_clean_tohost, gateway);
3773 }
3774 
/*
 * Argument passed to the fib6_ifup() / fib6_ifdown() table walkers.
 * Only one member of the union is meaningful for a given walk.
 */
struct arg_netdev_event {
	const struct net_device *dev;	/* device the event is about */
	union {
		unsigned int nh_flags;	/* fib6_ifup(): nexthop flags to clear */
		unsigned long event;	/* fib6_ifdown(): NETDEV_* event */
	};
};
3782 
3783 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
3784 {
3785 	struct fib6_info *iter;
3786 	struct fib6_node *fn;
3787 
3788 	fn = rcu_dereference_protected(rt->fib6_node,
3789 			lockdep_is_held(&rt->fib6_table->tb6_lock));
3790 	iter = rcu_dereference_protected(fn->leaf,
3791 			lockdep_is_held(&rt->fib6_table->tb6_lock));
3792 	while (iter) {
3793 		if (iter->fib6_metric == rt->fib6_metric &&
3794 		    rt6_qualify_for_ecmp(iter))
3795 			return iter;
3796 		iter = rcu_dereference_protected(iter->fib6_next,
3797 				lockdep_is_held(&rt->fib6_table->tb6_lock));
3798 	}
3799 
3800 	return NULL;
3801 }
3802 
3803 static bool rt6_is_dead(const struct fib6_info *rt)
3804 {
3805 	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
3806 	    (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
3807 	     fib6_ignore_linkdown(rt)))
3808 		return true;
3809 
3810 	return false;
3811 }
3812 
3813 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3814 {
3815 	struct fib6_info *iter;
3816 	int total = 0;
3817 
3818 	if (!rt6_is_dead(rt))
3819 		total += rt->fib6_nh.nh_weight;
3820 
3821 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3822 		if (!rt6_is_dead(iter))
3823 			total += iter->fib6_nh.nh_weight;
3824 	}
3825 
3826 	return total;
3827 }
3828 
3829 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
3830 {
3831 	int upper_bound = -1;
3832 
3833 	if (!rt6_is_dead(rt)) {
3834 		*weight += rt->fib6_nh.nh_weight;
3835 		upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3836 						    total) - 1;
3837 	}
3838 	atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound);
3839 }
3840 
3841 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3842 {
3843 	struct fib6_info *iter;
3844 	int weight = 0;
3845 
3846 	rt6_upper_bound_set(rt, &weight, total);
3847 
3848 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3849 		rt6_upper_bound_set(iter, &weight, total);
3850 }
3851 
3852 void rt6_multipath_rebalance(struct fib6_info *rt)
3853 {
3854 	struct fib6_info *first;
3855 	int total;
3856 
3857 	/* In case the entire multipath route was marked for flushing,
3858 	 * then there is no need to rebalance upon the removal of every
3859 	 * sibling route.
3860 	 */
3861 	if (!rt->fib6_nsiblings || rt->should_flush)
3862 		return;
3863 
3864 	/* During lookup routes are evaluated in order, so we need to
3865 	 * make sure upper bounds are assigned from the first sibling
3866 	 * onwards.
3867 	 */
3868 	first = rt6_multipath_first_sibling(rt);
3869 	if (WARN_ON_ONCE(!first))
3870 		return;
3871 
3872 	total = rt6_multipath_total_weight(first);
3873 	rt6_multipath_upper_bound_set(first, total);
3874 }
3875 
3876 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
3877 {
3878 	const struct arg_netdev_event *arg = p_arg;
3879 	struct net *net = dev_net(arg->dev);
3880 
3881 	if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
3882 		rt->fib6_nh.nh_flags &= ~arg->nh_flags;
3883 		fib6_update_sernum_upto_root(net, rt);
3884 		rt6_multipath_rebalance(rt);
3885 	}
3886 
3887 	return 0;
3888 }
3889 
3890 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
3891 {
3892 	struct arg_netdev_event arg = {
3893 		.dev = dev,
3894 		{
3895 			.nh_flags = nh_flags,
3896 		},
3897 	};
3898 
3899 	if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
3900 		arg.nh_flags |= RTNH_F_LINKDOWN;
3901 
3902 	fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
3903 }
3904 
3905 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
3906 				   const struct net_device *dev)
3907 {
3908 	struct fib6_info *iter;
3909 
3910 	if (rt->fib6_nh.nh_dev == dev)
3911 		return true;
3912 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3913 		if (iter->fib6_nh.nh_dev == dev)
3914 			return true;
3915 
3916 	return false;
3917 }
3918 
3919 static void rt6_multipath_flush(struct fib6_info *rt)
3920 {
3921 	struct fib6_info *iter;
3922 
3923 	rt->should_flush = 1;
3924 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3925 		iter->should_flush = 1;
3926 }
3927 
3928 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
3929 					     const struct net_device *down_dev)
3930 {
3931 	struct fib6_info *iter;
3932 	unsigned int dead = 0;
3933 
3934 	if (rt->fib6_nh.nh_dev == down_dev ||
3935 	    rt->fib6_nh.nh_flags & RTNH_F_DEAD)
3936 		dead++;
3937 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3938 		if (iter->fib6_nh.nh_dev == down_dev ||
3939 		    iter->fib6_nh.nh_flags & RTNH_F_DEAD)
3940 			dead++;
3941 
3942 	return dead;
3943 }
3944 
3945 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
3946 				       const struct net_device *dev,
3947 				       unsigned int nh_flags)
3948 {
3949 	struct fib6_info *iter;
3950 
3951 	if (rt->fib6_nh.nh_dev == dev)
3952 		rt->fib6_nh.nh_flags |= nh_flags;
3953 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3954 		if (iter->fib6_nh.nh_dev == dev)
3955 			iter->fib6_nh.nh_flags |= nh_flags;
3956 }
3957 
3958 /* called with write lock held for table with rt */
3959 static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
3960 {
3961 	const struct arg_netdev_event *arg = p_arg;
3962 	const struct net_device *dev = arg->dev;
3963 	struct net *net = dev_net(dev);
3964 
3965 	if (rt == net->ipv6.fib6_null_entry)
3966 		return 0;
3967 
3968 	switch (arg->event) {
3969 	case NETDEV_UNREGISTER:
3970 		return rt->fib6_nh.nh_dev == dev ? -1 : 0;
3971 	case NETDEV_DOWN:
3972 		if (rt->should_flush)
3973 			return -1;
3974 		if (!rt->fib6_nsiblings)
3975 			return rt->fib6_nh.nh_dev == dev ? -1 : 0;
3976 		if (rt6_multipath_uses_dev(rt, dev)) {
3977 			unsigned int count;
3978 
3979 			count = rt6_multipath_dead_count(rt, dev);
3980 			if (rt->fib6_nsiblings + 1 == count) {
3981 				rt6_multipath_flush(rt);
3982 				return -1;
3983 			}
3984 			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
3985 						   RTNH_F_LINKDOWN);
3986 			fib6_update_sernum(net, rt);
3987 			rt6_multipath_rebalance(rt);
3988 		}
3989 		return -2;
3990 	case NETDEV_CHANGE:
3991 		if (rt->fib6_nh.nh_dev != dev ||
3992 		    rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
3993 			break;
3994 		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
3995 		rt6_multipath_rebalance(rt);
3996 		break;
3997 	}
3998 
3999 	return 0;
4000 }
4001 
4002 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
4003 {
4004 	struct arg_netdev_event arg = {
4005 		.dev = dev,
4006 		{
4007 			.event = event,
4008 		},
4009 	};
4010 
4011 	fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
4012 }
4013 
4014 void rt6_disable_ip(struct net_device *dev, unsigned long event)
4015 {
4016 	rt6_sync_down_dev(dev, event);
4017 	rt6_uncached_list_flush_dev(dev_net(dev), dev);
4018 	neigh_ifdown(&nd_tbl, dev);
4019 }
4020 
/* Argument handed to rt6_mtu_change_route() through fib6_clean_all(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;		/* device whose MTU changed */
	unsigned int mtu;		/* the new MTU */
};
4025 
4026 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
4027 {
4028 	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
4029 	struct inet6_dev *idev;
4030 
4031 	/* In IPv6 pmtu discovery is not optional,
4032 	   so that RTAX_MTU lock cannot disable it.
4033 	   We still use this lock to block changes
4034 	   caused by addrconf/ndisc.
4035 	*/
4036 
4037 	idev = __in6_dev_get(arg->dev);
4038 	if (!idev)
4039 		return 0;
4040 
4041 	/* For administrative MTU increase, there is no way to discover
4042 	   IPv6 PMTU increase, so PMTU increase should be updated here.
4043 	   Since RFC 1981 doesn't include administrative MTU increase
4044 	   update PMTU increase is a MUST. (i.e. jumbo frame)
4045 	 */
4046 	if (rt->fib6_nh.nh_dev == arg->dev &&
4047 	    !fib6_metric_locked(rt, RTAX_MTU)) {
4048 		u32 mtu = rt->fib6_pmtu;
4049 
4050 		if (mtu >= arg->mtu ||
4051 		    (mtu < arg->mtu && mtu == idev->cnf.mtu6))
4052 			fib6_metric_set(rt, RTAX_MTU, arg->mtu);
4053 
4054 		spin_lock_bh(&rt6_exception_lock);
4055 		rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
4056 		spin_unlock_bh(&rt6_exception_lock);
4057 	}
4058 	return 0;
4059 }
4060 
4061 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4062 {
4063 	struct rt6_mtu_change_arg arg = {
4064 		.dev = dev,
4065 		.mtu = mtu,
4066 	};
4067 
4068 	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
4069 }
4070 
4071 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
4072 	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
4073 	[RTA_PREFSRC]		= { .len = sizeof(struct in6_addr) },
4074 	[RTA_OIF]               = { .type = NLA_U32 },
4075 	[RTA_IIF]		= { .type = NLA_U32 },
4076 	[RTA_PRIORITY]          = { .type = NLA_U32 },
4077 	[RTA_METRICS]           = { .type = NLA_NESTED },
4078 	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
4079 	[RTA_PREF]              = { .type = NLA_U8 },
4080 	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
4081 	[RTA_ENCAP]		= { .type = NLA_NESTED },
4082 	[RTA_EXPIRES]		= { .type = NLA_U32 },
4083 	[RTA_UID]		= { .type = NLA_U32 },
4084 	[RTA_MARK]		= { .type = NLA_U32 },
4085 	[RTA_TABLE]		= { .type = NLA_U32 },
4086 };
4087 
4088 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
4089 			      struct fib6_config *cfg,
4090 			      struct netlink_ext_ack *extack)
4091 {
4092 	struct rtmsg *rtm;
4093 	struct nlattr *tb[RTA_MAX+1];
4094 	unsigned int pref;
4095 	int err;
4096 
4097 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4098 			  NULL);
4099 	if (err < 0)
4100 		goto errout;
4101 
4102 	err = -EINVAL;
4103 	rtm = nlmsg_data(nlh);
4104 	memset(cfg, 0, sizeof(*cfg));
4105 
4106 	cfg->fc_table = rtm->rtm_table;
4107 	cfg->fc_dst_len = rtm->rtm_dst_len;
4108 	cfg->fc_src_len = rtm->rtm_src_len;
4109 	cfg->fc_flags = RTF_UP;
4110 	cfg->fc_protocol = rtm->rtm_protocol;
4111 	cfg->fc_type = rtm->rtm_type;
4112 
4113 	if (rtm->rtm_type == RTN_UNREACHABLE ||
4114 	    rtm->rtm_type == RTN_BLACKHOLE ||
4115 	    rtm->rtm_type == RTN_PROHIBIT ||
4116 	    rtm->rtm_type == RTN_THROW)
4117 		cfg->fc_flags |= RTF_REJECT;
4118 
4119 	if (rtm->rtm_type == RTN_LOCAL)
4120 		cfg->fc_flags |= RTF_LOCAL;
4121 
4122 	if (rtm->rtm_flags & RTM_F_CLONED)
4123 		cfg->fc_flags |= RTF_CACHE;
4124 
4125 	cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4126 
4127 	cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
4128 	cfg->fc_nlinfo.nlh = nlh;
4129 	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
4130 
4131 	if (tb[RTA_GATEWAY]) {
4132 		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4133 		cfg->fc_flags |= RTF_GATEWAY;
4134 	}
4135 
4136 	if (tb[RTA_DST]) {
4137 		int plen = (rtm->rtm_dst_len + 7) >> 3;
4138 
4139 		if (nla_len(tb[RTA_DST]) < plen)
4140 			goto errout;
4141 
4142 		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
4143 	}
4144 
4145 	if (tb[RTA_SRC]) {
4146 		int plen = (rtm->rtm_src_len + 7) >> 3;
4147 
4148 		if (nla_len(tb[RTA_SRC]) < plen)
4149 			goto errout;
4150 
4151 		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
4152 	}
4153 
4154 	if (tb[RTA_PREFSRC])
4155 		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4156 
4157 	if (tb[RTA_OIF])
4158 		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4159 
4160 	if (tb[RTA_PRIORITY])
4161 		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4162 
4163 	if (tb[RTA_METRICS]) {
4164 		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4165 		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
4166 	}
4167 
4168 	if (tb[RTA_TABLE])
4169 		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4170 
4171 	if (tb[RTA_MULTIPATH]) {
4172 		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4173 		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4174 
4175 		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4176 						     cfg->fc_mp_len, extack);
4177 		if (err < 0)
4178 			goto errout;
4179 	}
4180 
4181 	if (tb[RTA_PREF]) {
4182 		pref = nla_get_u8(tb[RTA_PREF]);
4183 		if (pref != ICMPV6_ROUTER_PREF_LOW &&
4184 		    pref != ICMPV6_ROUTER_PREF_HIGH)
4185 			pref = ICMPV6_ROUTER_PREF_MEDIUM;
4186 		cfg->fc_flags |= RTF_PREF(pref);
4187 	}
4188 
4189 	if (tb[RTA_ENCAP])
4190 		cfg->fc_encap = tb[RTA_ENCAP];
4191 
4192 	if (tb[RTA_ENCAP_TYPE]) {
4193 		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4194 
4195 		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4196 		if (err < 0)
4197 			goto errout;
4198 	}
4199 
4200 	if (tb[RTA_EXPIRES]) {
4201 		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4202 
4203 		if (addrconf_finite_timeout(timeout)) {
4204 			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4205 			cfg->fc_flags |= RTF_EXPIRES;
4206 		}
4207 	}
4208 
4209 	err = 0;
4210 errout:
4211 	return err;
4212 }
4213 
4214 struct rt6_nh {
4215 	struct fib6_info *fib6_info;
4216 	struct fib6_config r_cfg;
4217 	struct list_head next;
4218 };
4219 
4220 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
4221 {
4222 	struct rt6_nh *nh;
4223 
4224 	list_for_each_entry(nh, rt6_nh_list, next) {
4225 		pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
4226 		        &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
4227 		        nh->r_cfg.fc_ifindex);
4228 	}
4229 }
4230 
4231 static int ip6_route_info_append(struct net *net,
4232 				 struct list_head *rt6_nh_list,
4233 				 struct fib6_info *rt,
4234 				 struct fib6_config *r_cfg)
4235 {
4236 	struct rt6_nh *nh;
4237 	int err = -EEXIST;
4238 
4239 	list_for_each_entry(nh, rt6_nh_list, next) {
4240 		/* check if fib6_info already exists */
4241 		if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4242 			return err;
4243 	}
4244 
4245 	nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4246 	if (!nh)
4247 		return -ENOMEM;
4248 	nh->fib6_info = rt;
4249 	err = ip6_convert_metrics(net, rt, r_cfg);
4250 	if (err) {
4251 		kfree(nh);
4252 		return err;
4253 	}
4254 	memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4255 	list_add_tail(&nh->next, rt6_nh_list);
4256 
4257 	return 0;
4258 }
4259 
4260 static void ip6_route_mpath_notify(struct fib6_info *rt,
4261 				   struct fib6_info *rt_last,
4262 				   struct nl_info *info,
4263 				   __u16 nlflags)
4264 {
4265 	/* if this is an APPEND route, then rt points to the first route
4266 	 * inserted and rt_last points to last route inserted. Userspace
4267 	 * wants a consistent dump of the route which starts at the first
4268 	 * nexthop. Since sibling routes are always added at the end of
4269 	 * the list, find the first sibling of the last route appended
4270 	 */
4271 	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4272 		rt = list_first_entry(&rt_last->fib6_siblings,
4273 				      struct fib6_info,
4274 				      fib6_siblings);
4275 	}
4276 
4277 	if (rt)
4278 		inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4279 }
4280 
4281 static int ip6_route_multipath_add(struct fib6_config *cfg,
4282 				   struct netlink_ext_ack *extack)
4283 {
4284 	struct fib6_info *rt_notif = NULL, *rt_last = NULL;
4285 	struct nl_info *info = &cfg->fc_nlinfo;
4286 	struct fib6_config r_cfg;
4287 	struct rtnexthop *rtnh;
4288 	struct fib6_info *rt;
4289 	struct rt6_nh *err_nh;
4290 	struct rt6_nh *nh, *nh_safe;
4291 	__u16 nlflags;
4292 	int remaining;
4293 	int attrlen;
4294 	int err = 1;
4295 	int nhn = 0;
4296 	int replace = (cfg->fc_nlinfo.nlh &&
4297 		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4298 	LIST_HEAD(rt6_nh_list);
4299 
4300 	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4301 	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4302 		nlflags |= NLM_F_APPEND;
4303 
4304 	remaining = cfg->fc_mp_len;
4305 	rtnh = (struct rtnexthop *)cfg->fc_mp;
4306 
4307 	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
4308 	 * fib6_info structs per nexthop
4309 	 */
4310 	while (rtnh_ok(rtnh, remaining)) {
4311 		memcpy(&r_cfg, cfg, sizeof(*cfg));
4312 		if (rtnh->rtnh_ifindex)
4313 			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4314 
4315 		attrlen = rtnh_attrlen(rtnh);
4316 		if (attrlen > 0) {
4317 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4318 
4319 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4320 			if (nla) {
4321 				r_cfg.fc_gateway = nla_get_in6_addr(nla);
4322 				r_cfg.fc_flags |= RTF_GATEWAY;
4323 			}
4324 			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4325 			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4326 			if (nla)
4327 				r_cfg.fc_encap_type = nla_get_u16(nla);
4328 		}
4329 
4330 		r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4331 		rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
4332 		if (IS_ERR(rt)) {
4333 			err = PTR_ERR(rt);
4334 			rt = NULL;
4335 			goto cleanup;
4336 		}
4337 
4338 		rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;
4339 
4340 		err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
4341 					    rt, &r_cfg);
4342 		if (err) {
4343 			fib6_info_release(rt);
4344 			goto cleanup;
4345 		}
4346 
4347 		rtnh = rtnh_next(rtnh, &remaining);
4348 	}
4349 
4350 	/* for add and replace send one notification with all nexthops.
4351 	 * Skip the notification in fib6_add_rt2node and send one with
4352 	 * the full route when done
4353 	 */
4354 	info->skip_notify = 1;
4355 
4356 	err_nh = NULL;
4357 	list_for_each_entry(nh, &rt6_nh_list, next) {
4358 		rt_last = nh->fib6_info;
4359 		err = __ip6_ins_rt(nh->fib6_info, info, extack);
4360 		fib6_info_release(nh->fib6_info);
4361 
4362 		/* save reference to first route for notification */
4363 		if (!rt_notif && !err)
4364 			rt_notif = nh->fib6_info;
4365 
4366 		/* nh->fib6_info is used or freed at this point, reset to NULL*/
4367 		nh->fib6_info = NULL;
4368 		if (err) {
4369 			if (replace && nhn)
4370 				ip6_print_replace_route_err(&rt6_nh_list);
4371 			err_nh = nh;
4372 			goto add_errout;
4373 		}
4374 
4375 		/* Because each route is added like a single route we remove
4376 		 * these flags after the first nexthop: if there is a collision,
4377 		 * we have already failed to add the first nexthop:
4378 		 * fib6_add_rt2node() has rejected it; when replacing, old
4379 		 * nexthops have been replaced by first new, the rest should
4380 		 * be added to it.
4381 		 */
4382 		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4383 						     NLM_F_REPLACE);
4384 		nhn++;
4385 	}
4386 
4387 	/* success ... tell user about new route */
4388 	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4389 	goto cleanup;
4390 
4391 add_errout:
4392 	/* send notification for routes that were added so that
4393 	 * the delete notifications sent by ip6_route_del are
4394 	 * coherent
4395 	 */
4396 	if (rt_notif)
4397 		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4398 
4399 	/* Delete routes that were already added */
4400 	list_for_each_entry(nh, &rt6_nh_list, next) {
4401 		if (err_nh == nh)
4402 			break;
4403 		ip6_route_del(&nh->r_cfg, extack);
4404 	}
4405 
4406 cleanup:
4407 	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4408 		if (nh->fib6_info)
4409 			fib6_info_release(nh->fib6_info);
4410 		list_del(&nh->next);
4411 		kfree(nh);
4412 	}
4413 
4414 	return err;
4415 }
4416 
4417 static int ip6_route_multipath_del(struct fib6_config *cfg,
4418 				   struct netlink_ext_ack *extack)
4419 {
4420 	struct fib6_config r_cfg;
4421 	struct rtnexthop *rtnh;
4422 	int remaining;
4423 	int attrlen;
4424 	int err = 1, last_err = 0;
4425 
4426 	remaining = cfg->fc_mp_len;
4427 	rtnh = (struct rtnexthop *)cfg->fc_mp;
4428 
4429 	/* Parse a Multipath Entry */
4430 	while (rtnh_ok(rtnh, remaining)) {
4431 		memcpy(&r_cfg, cfg, sizeof(*cfg));
4432 		if (rtnh->rtnh_ifindex)
4433 			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4434 
4435 		attrlen = rtnh_attrlen(rtnh);
4436 		if (attrlen > 0) {
4437 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4438 
4439 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4440 			if (nla) {
4441 				nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4442 				r_cfg.fc_flags |= RTF_GATEWAY;
4443 			}
4444 		}
4445 		err = ip6_route_del(&r_cfg, extack);
4446 		if (err)
4447 			last_err = err;
4448 
4449 		rtnh = rtnh_next(rtnh, &remaining);
4450 	}
4451 
4452 	return last_err;
4453 }
4454 
4455 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4456 			      struct netlink_ext_ack *extack)
4457 {
4458 	struct fib6_config cfg;
4459 	int err;
4460 
4461 	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4462 	if (err < 0)
4463 		return err;
4464 
4465 	if (cfg.fc_mp)
4466 		return ip6_route_multipath_del(&cfg, extack);
4467 	else {
4468 		cfg.fc_delete_all_nh = 1;
4469 		return ip6_route_del(&cfg, extack);
4470 	}
4471 }
4472 
4473 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4474 			      struct netlink_ext_ack *extack)
4475 {
4476 	struct fib6_config cfg;
4477 	int err;
4478 
4479 	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4480 	if (err < 0)
4481 		return err;
4482 
4483 	if (cfg.fc_mp)
4484 		return ip6_route_multipath_add(&cfg, extack);
4485 	else
4486 		return ip6_route_add(&cfg, GFP_KERNEL, extack);
4487 }
4488 
4489 static size_t rt6_nlmsg_size(struct fib6_info *rt)
4490 {
4491 	int nexthop_len = 0;
4492 
4493 	if (rt->fib6_nsiblings) {
4494 		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
4495 			    + NLA_ALIGN(sizeof(struct rtnexthop))
4496 			    + nla_total_size(16) /* RTA_GATEWAY */
4497 			    + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);
4498 
4499 		nexthop_len *= rt->fib6_nsiblings;
4500 	}
4501 
4502 	return NLMSG_ALIGN(sizeof(struct rtmsg))
4503 	       + nla_total_size(16) /* RTA_SRC */
4504 	       + nla_total_size(16) /* RTA_DST */
4505 	       + nla_total_size(16) /* RTA_GATEWAY */
4506 	       + nla_total_size(16) /* RTA_PREFSRC */
4507 	       + nla_total_size(4) /* RTA_TABLE */
4508 	       + nla_total_size(4) /* RTA_IIF */
4509 	       + nla_total_size(4) /* RTA_OIF */
4510 	       + nla_total_size(4) /* RTA_PRIORITY */
4511 	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4512 	       + nla_total_size(sizeof(struct rta_cacheinfo))
4513 	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4514 	       + nla_total_size(1) /* RTA_PREF */
4515 	       + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
4516 	       + nexthop_len;
4517 }
4518 
4519 static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
4520 			    unsigned int *flags, bool skip_oif)
4521 {
4522 	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
4523 		*flags |= RTNH_F_DEAD;
4524 
4525 	if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
4526 		*flags |= RTNH_F_LINKDOWN;
4527 
4528 		rcu_read_lock();
4529 		if (fib6_ignore_linkdown(rt))
4530 			*flags |= RTNH_F_DEAD;
4531 		rcu_read_unlock();
4532 	}
4533 
4534 	if (rt->fib6_flags & RTF_GATEWAY) {
4535 		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
4536 			goto nla_put_failure;
4537 	}
4538 
4539 	*flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
4540 	if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
4541 		*flags |= RTNH_F_OFFLOAD;
4542 
4543 	/* not needed for multipath encoding b/c it has a rtnexthop struct */
4544 	if (!skip_oif && rt->fib6_nh.nh_dev &&
4545 	    nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
4546 		goto nla_put_failure;
4547 
4548 	if (rt->fib6_nh.nh_lwtstate &&
4549 	    lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
4550 		goto nla_put_failure;
4551 
4552 	return 0;
4553 
4554 nla_put_failure:
4555 	return -EMSGSIZE;
4556 }
4557 
4558 /* add multipath next hop */
4559 static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
4560 {
4561 	const struct net_device *dev = rt->fib6_nh.nh_dev;
4562 	struct rtnexthop *rtnh;
4563 	unsigned int flags = 0;
4564 
4565 	rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4566 	if (!rtnh)
4567 		goto nla_put_failure;
4568 
4569 	rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
4570 	rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;
4571 
4572 	if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4573 		goto nla_put_failure;
4574 
4575 	rtnh->rtnh_flags = flags;
4576 
4577 	/* length of rtnetlink header + attributes */
4578 	rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
4579 
4580 	return 0;
4581 
4582 nla_put_failure:
4583 	return -EMSGSIZE;
4584 }
4585 
/*
 * Build one route message describing @rt in @skb.
 *
 * @net:    namespace, passed on to the helpers that need it
 * @skb:    message buffer being filled
 * @rt:     FIB entry to describe; it is dereferenced unconditionally
 * @dst:    optional dst entry; when set its metrics, expiry and error are
 *          reported instead of the ones stored in @rt
 * @dest:   optional destination address; when set it is reported as
 *          RTA_DST with a prefix length of 128
 * @src:    optional source address, handled like @dest (only with
 *          CONFIG_IPV6_SUBTREES)
 * @iif:    input interface index, 0 if none
 * @type:   netlink message type
 * @portid: netlink port id for the message header
 * @seq:    netlink sequence number for the message header
 * @flags:  netlink message flags
 *
 * Returns 0 on success.  Returns -EMSGSIZE when the header or an attribute
 * could not be added; in the attribute case the partially built message is
 * cancelled first.
 */
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags)
{
	struct rtmsg *rtm;
	struct nlmsghdr *nlh;
	long expires = 0;
	u32 *pmetrics;
	u32 table;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt->fib6_dst.plen;
	rtm->rtm_src_len = rt->fib6_src.plen;
	rtm->rtm_tos = 0;
	if (rt->fib6_table)
		table = rt->fib6_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	/* the table id goes both into the header and into RTA_TABLE */
	rtm->rtm_table = table;
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;

	rtm->rtm_type = rt->fib6_type;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->fib6_protocol;

	if (rt->fib6_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	/* an explicit @dest overrides the route's own prefix */
	if (dest) {
		if (nla_put_in6_addr(skb, RTA_DST, dest))
			goto nla_put_failure;
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		if (nla_put_in6_addr(skb, RTA_DST, &rt->fib6_dst.addr))
			goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		if (nla_put_in6_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put_in6_addr(skb, RTA_SRC, &rt->fib6_src.addr))
		goto nla_put_failure;
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		if (ipv6_addr_is_multicast(&rt->fib6_dst.addr)) {
			int err = ip6mr_get_route(net, skb, rtm, portid);

			/* NOTE: on 0 the function returns right here, without
			 * reaching nlmsg_end() below; a positive value falls
			 * through to the remaining attributes.
			 */
			if (err == 0)
				return 0;
			if (err < 0)
				goto nla_put_failure;
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
	} else if (dest) {
		/* no input interface: report the source address selected
		 * for @dest, if the lookup succeeds
		 */
		struct in6_addr saddr_buf;
		if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	/* a preferred source configured on the route is reported as well */
	if (rt->fib6_prefsrc.plen) {
		struct in6_addr saddr_buf;
		saddr_buf = rt->fib6_prefsrc.addr;
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
	if (rtnetlink_put_metrics(skb, pmetrics) < 0)
		goto nla_put_failure;

	if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
		goto nla_put_failure;

	/* For multipath routes, walk the siblings list and add
	 * each as a nexthop within RTA_MULTIPATH.
	 */
	if (rt->fib6_nsiblings) {
		struct fib6_info *sibling, *next_sibling;
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		/* @rt itself is the first nexthop of the nest */
		if (rt6_add_nexthop(skb, rt) < 0)
			goto nla_put_failure;

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings, fib6_siblings) {
			if (rt6_add_nexthop(skb, sibling) < 0)
				goto nla_put_failure;
		}

		nla_nest_end(skb, mp);
	} else {
		/* single nexthop: its flags land directly in rtm_flags */
		if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
			goto nla_put_failure;
	}

	/* expiry is reported relative to the current jiffies value */
	if (rt->fib6_flags & RTF_EXPIRES) {
		expires = dst ? dst->expires : rt->expires;
		expires -= jiffies;
	}

	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
		goto nla_put_failure;

	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->fib6_flags)))
		goto nla_put_failure;


	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	/* drop everything added since nlmsg_put() */
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
4718 
4719 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4720 {
4721 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4722 	struct net *net = arg->net;
4723 
4724 	if (rt == net->ipv6.fib6_null_entry)
4725 		return 0;
4726 
4727 	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4728 		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4729 
4730 		/* user wants prefix routes only */
4731 		if (rtm->rtm_flags & RTM_F_PREFIX &&
4732 		    !(rt->fib6_flags & RTF_PREFIX_RT)) {
4733 			/* success since this is not a prefix route */
4734 			return 1;
4735 		}
4736 	}
4737 
4738 	return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4739 			     RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4740 			     arg->cb->nlh->nlmsg_seq, NLM_F_MULTI);
4741 }
4742 
4743 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4744 			      struct netlink_ext_ack *extack)
4745 {
4746 	struct net *net = sock_net(in_skb->sk);
4747 	struct nlattr *tb[RTA_MAX+1];
4748 	int err, iif = 0, oif = 0;
4749 	struct fib6_info *from;
4750 	struct dst_entry *dst;
4751 	struct rt6_info *rt;
4752 	struct sk_buff *skb;
4753 	struct rtmsg *rtm;
4754 	struct flowi6 fl6;
4755 	bool fibmatch;
4756 
4757 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4758 			  extack);
4759 	if (err < 0)
4760 		goto errout;
4761 
4762 	err = -EINVAL;
4763 	memset(&fl6, 0, sizeof(fl6));
4764 	rtm = nlmsg_data(nlh);
4765 	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4766 	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4767 
4768 	if (tb[RTA_SRC]) {
4769 		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4770 			goto errout;
4771 
4772 		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4773 	}
4774 
4775 	if (tb[RTA_DST]) {
4776 		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4777 			goto errout;
4778 
4779 		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4780 	}
4781 
4782 	if (tb[RTA_IIF])
4783 		iif = nla_get_u32(tb[RTA_IIF]);
4784 
4785 	if (tb[RTA_OIF])
4786 		oif = nla_get_u32(tb[RTA_OIF]);
4787 
4788 	if (tb[RTA_MARK])
4789 		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4790 
4791 	if (tb[RTA_UID])
4792 		fl6.flowi6_uid = make_kuid(current_user_ns(),
4793 					   nla_get_u32(tb[RTA_UID]));
4794 	else
4795 		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4796 
4797 	if (iif) {
4798 		struct net_device *dev;
4799 		int flags = 0;
4800 
4801 		rcu_read_lock();
4802 
4803 		dev = dev_get_by_index_rcu(net, iif);
4804 		if (!dev) {
4805 			rcu_read_unlock();
4806 			err = -ENODEV;
4807 			goto errout;
4808 		}
4809 
4810 		fl6.flowi6_iif = iif;
4811 
4812 		if (!ipv6_addr_any(&fl6.saddr))
4813 			flags |= RT6_LOOKUP_F_HAS_SADDR;
4814 
4815 		dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
4816 
4817 		rcu_read_unlock();
4818 	} else {
4819 		fl6.flowi6_oif = oif;
4820 
4821 		dst = ip6_route_output(net, NULL, &fl6);
4822 	}
4823 
4824 
4825 	rt = container_of(dst, struct rt6_info, dst);
4826 	if (rt->dst.error) {
4827 		err = rt->dst.error;
4828 		ip6_rt_put(rt);
4829 		goto errout;
4830 	}
4831 
4832 	if (rt == net->ipv6.ip6_null_entry) {
4833 		err = rt->dst.error;
4834 		ip6_rt_put(rt);
4835 		goto errout;
4836 	}
4837 
4838 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
4839 	if (!skb) {
4840 		ip6_rt_put(rt);
4841 		err = -ENOBUFS;
4842 		goto errout;
4843 	}
4844 
4845 	skb_dst_set(skb, &rt->dst);
4846 
4847 	rcu_read_lock();
4848 	from = rcu_dereference(rt->from);
4849 
4850 	if (fibmatch)
4851 		err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
4852 				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4853 				    nlh->nlmsg_seq, 0);
4854 	else
4855 		err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
4856 				    &fl6.saddr, iif, RTM_NEWROUTE,
4857 				    NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
4858 				    0);
4859 	rcu_read_unlock();
4860 
4861 	if (err < 0) {
4862 		kfree_skb(skb);
4863 		goto errout;
4864 	}
4865 
4866 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
4867 errout:
4868 	return err;
4869 }
4870 
4871 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
4872 		     unsigned int nlm_flags)
4873 {
4874 	struct sk_buff *skb;
4875 	struct net *net = info->nl_net;
4876 	u32 seq;
4877 	int err;
4878 
4879 	err = -ENOBUFS;
4880 	seq = info->nlh ? info->nlh->nlmsg_seq : 0;
4881 
4882 	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4883 	if (!skb)
4884 		goto errout;
4885 
4886 	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
4887 			    event, info->portid, seq, nlm_flags);
4888 	if (err < 0) {
4889 		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4890 		WARN_ON(err == -EMSGSIZE);
4891 		kfree_skb(skb);
4892 		goto errout;
4893 	}
4894 	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4895 		    info->nlh, gfp_any());
4896 	return;
4897 errout:
4898 	if (err < 0)
4899 		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
4900 }
4901 
4902 static int ip6_route_dev_notify(struct notifier_block *this,
4903 				unsigned long event, void *ptr)
4904 {
4905 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4906 	struct net *net = dev_net(dev);
4907 
4908 	if (!(dev->flags & IFF_LOOPBACK))
4909 		return NOTIFY_OK;
4910 
4911 	if (event == NETDEV_REGISTER) {
4912 		net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
4913 		net->ipv6.ip6_null_entry->dst.dev = dev;
4914 		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
4915 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4916 		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
4917 		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
4918 		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
4919 		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
4920 #endif
4921 	 } else if (event == NETDEV_UNREGISTER &&
4922 		    dev->reg_state != NETREG_UNREGISTERED) {
4923 		/* NETDEV_UNREGISTER could be fired for multiple times by
4924 		 * netdev_wait_allrefs(). Make sure we only call this once.
4925 		 */
4926 		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
4927 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4928 		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
4929 		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
4930 #endif
4931 	}
4932 
4933 	return NOTIFY_OK;
4934 }
4935 
4936 /*
4937  *	/proc
4938  */
4939 
4940 #ifdef CONFIG_PROC_FS
4941 
/* File operations for the "ipv6_route" proc entry (seq_file based). */
static const struct file_operations ipv6_route_proc_fops = {
	.open		= ipv6_route_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};
4948 
4949 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
4950 {
4951 	struct net *net = (struct net *)seq->private;
4952 	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
4953 		   net->ipv6.rt6_stats->fib_nodes,
4954 		   net->ipv6.rt6_stats->fib_route_nodes,
4955 		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
4956 		   net->ipv6.rt6_stats->fib_rt_entries,
4957 		   net->ipv6.rt6_stats->fib_rt_cache,
4958 		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
4959 		   net->ipv6.rt6_stats->fib_discarded_routes);
4960 
4961 	return 0;
4962 }
4963 
/* open() for "rt6_stats": single-record seq_file shown by rt6_stats_seq_show. */
static int rt6_stats_seq_open(struct inode *inode, struct file *file)
{
	return single_open_net(inode, file, rt6_stats_seq_show);
}
4968 
/* File operations for the "rt6_stats" proc entry. */
static const struct file_operations rt6_stats_seq_fops = {
	.open	 = rt6_stats_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = single_release_net,
};
4975 #endif	/* CONFIG_PROC_FS */
4976 
4977 #ifdef CONFIG_SYSCTL
4978 
4979 static
4980 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
4981 			      void __user *buffer, size_t *lenp, loff_t *ppos)
4982 {
4983 	struct net *net;
4984 	int delay;
4985 	if (!write)
4986 		return -EINVAL;
4987 
4988 	net = (struct net *)ctl->extra1;
4989 	delay = net->ipv6.sysctl.flush_delay;
4990 	proc_dointvec(ctl, write, buffer, lenp, ppos);
4991 	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
4992 	return 0;
4993 }
4994 
/*
 * Template for the per-namespace IPv6 route sysctl table.
 *
 * ipv6_route_sysctl_init() duplicates this array and re-points the .data
 * members by array index, so the order of the entries must stay in sync
 * with that function.
 */
struct ctl_table ipv6_route_table_template[] = {
	{
		/* write-only; the handler rejects reads with -EINVAL */
		.procname	=	"flush",
		.data		=	&init_net.ipv6.sysctl.flush_delay,
		.maxlen		=	sizeof(int),
		.mode		=	0200,
		.proc_handler	=	ipv6_sysctl_rtcache_flush
	},
	{
		.procname	=	"gc_thresh",
		.data		=	&ip6_dst_ops_template.gc_thresh,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"max_size",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_min_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_timeout",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_elasticity",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"mtu_expires",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"min_adv_mss",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		/* same variable as "gc_min_interval", different handler */
		.procname	=	"gc_min_interval_ms",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_ms_jiffies,
	},
	{ }
};
5068 
5069 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
5070 {
5071 	struct ctl_table *table;
5072 
5073 	table = kmemdup(ipv6_route_table_template,
5074 			sizeof(ipv6_route_table_template),
5075 			GFP_KERNEL);
5076 
5077 	if (table) {
5078 		table[0].data = &net->ipv6.sysctl.flush_delay;
5079 		table[0].extra1 = net;
5080 		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
5081 		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
5082 		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5083 		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
5084 		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
5085 		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
5086 		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
5087 		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
5088 		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5089 
5090 		/* Don't export sysctls to unprivileged users */
5091 		if (net->user_ns != &init_user_ns)
5092 			table[0].procname = NULL;
5093 	}
5094 
5095 	return table;
5096 }
5097 #endif
5098 
5099 static int __net_init ip6_route_net_init(struct net *net)
5100 {
5101 	int ret = -ENOMEM;
5102 
5103 	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
5104 	       sizeof(net->ipv6.ip6_dst_ops));
5105 
5106 	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
5107 		goto out_ip6_dst_ops;
5108 
5109 	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
5110 					    sizeof(*net->ipv6.fib6_null_entry),
5111 					    GFP_KERNEL);
5112 	if (!net->ipv6.fib6_null_entry)
5113 		goto out_ip6_dst_entries;
5114 
5115 	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
5116 					   sizeof(*net->ipv6.ip6_null_entry),
5117 					   GFP_KERNEL);
5118 	if (!net->ipv6.ip6_null_entry)
5119 		goto out_fib6_null_entry;
5120 	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5121 	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
5122 			 ip6_template_metrics, true);
5123 
5124 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5125 	net->ipv6.fib6_has_custom_rules = false;
5126 	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
5127 					       sizeof(*net->ipv6.ip6_prohibit_entry),
5128 					       GFP_KERNEL);
5129 	if (!net->ipv6.ip6_prohibit_entry)
5130 		goto out_ip6_null_entry;
5131 	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5132 	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
5133 			 ip6_template_metrics, true);
5134 
5135 	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
5136 					       sizeof(*net->ipv6.ip6_blk_hole_entry),
5137 					       GFP_KERNEL);
5138 	if (!net->ipv6.ip6_blk_hole_entry)
5139 		goto out_ip6_prohibit_entry;
5140 	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5141 	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
5142 			 ip6_template_metrics, true);
5143 #endif
5144 
5145 	net->ipv6.sysctl.flush_delay = 0;
5146 	net->ipv6.sysctl.ip6_rt_max_size = 4096;
5147 	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
5148 	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
5149 	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
5150 	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
5151 	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
5152 	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
5153 
5154 	net->ipv6.ip6_rt_gc_expire = 30*HZ;
5155 
5156 	ret = 0;
5157 out:
5158 	return ret;
5159 
5160 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5161 out_ip6_prohibit_entry:
5162 	kfree(net->ipv6.ip6_prohibit_entry);
5163 out_ip6_null_entry:
5164 	kfree(net->ipv6.ip6_null_entry);
5165 #endif
5166 out_fib6_null_entry:
5167 	kfree(net->ipv6.fib6_null_entry);
5168 out_ip6_dst_entries:
5169 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5170 out_ip6_dst_ops:
5171 	goto out;
5172 }
5173 
/*
 * Per-namespace teardown: free the special route entries of @net and
 * destroy its dst entry accounting.
 */
static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.fib6_null_entry);
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}
5184 
5185 static int __net_init ip6_route_net_init_late(struct net *net)
5186 {
5187 #ifdef CONFIG_PROC_FS
5188 	proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
5189 	proc_create("rt6_stats", 0444, net->proc_net, &rt6_stats_seq_fops);
5190 #endif
5191 	return 0;
5192 }
5193 
/* Late per-namespace teardown: remove the proc entries created at init. */
static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}
5201 
/* Per-namespace init/exit hooks for the routing core. */
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};
5206 
5207 static int __net_init ipv6_inetpeer_init(struct net *net)
5208 {
5209 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5210 
5211 	if (!bp)
5212 		return -ENOMEM;
5213 	inet_peer_base_init(bp);
5214 	net->ipv6.peers = bp;
5215 	return 0;
5216 }
5217 
/*
 * Release the inet_peer base of @net.  The namespace pointer is cleared
 * before the tree is invalidated and the base is freed.
 */
static void __net_exit ipv6_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv6.peers;

	net->ipv6.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}
5226 
/* Per-namespace init/exit hooks for the IPv6 inet_peer base. */
static struct pernet_operations ipv6_inetpeer_ops = {
	.init	=	ipv6_inetpeer_init,
	.exit	=	ipv6_inetpeer_exit,
};
5231 
/* Per-namespace hooks that create and remove the route proc entries. */
static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};
5236 
/*
 * Netdevice notifier for the special route entries; its priority is set
 * relative to ADDRCONF_NOTIFY_PRIORITY.
 */
static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};
5241 
5242 void __init ip6_route_init_special_entries(void)
5243 {
5244 	/* Registering of the loopback is done before this portion of code,
5245 	 * the loopback reference in rt6_info will not be taken, do it
5246 	 * manually for init_net */
5247 	init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
5248 	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
5249 	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5250   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5251 	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
5252 	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5253 	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
5254 	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5255   #endif
5256 }
5257 
/*
 * Initialise the IPv6 routing subsystem: dst cache, per-namespace
 * operations, FIB, xfrm, policy rules, the rtnetlink route handlers and
 * the netdevice notifier.
 *
 * Returns 0 on success.  On failure the steps completed so far are undone
 * through the label ladder at the end of the function and the error of the
 * failing step is returned.
 */
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	/* the blackhole ops share the slab cache created above */
	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
				   inet6_rtm_newroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
				   inet6_rtm_delroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
				   inet6_rtm_getroute, NULL,
				   RTNL_FLAG_DOIT_UNLOCKED);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	/* set up the per-cpu uncached route lists */
	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

	/* Error unwinding: each label undoes the step named after it and
	 * then falls through to the labels of the earlier steps.
	 */
out_register_late_subsys:
	rtnl_unregister_all(PF_INET6);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}
5349 
/*
 * Tear down the IPv6 routing subsystem: unregister the netdevice notifier
 * and the per-namespace operations, clean up rules, xfrm and FIB garbage
 * collection, and destroy the dst accounting and slab cache.
 */
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}
5362