xref: /openbmc/linux/net/ipv6/route.c (revision d2ba09c1)
1 /*
2  *	Linux INET6 implementation
3  *	FIB front-end.
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13 
14 /*	Changes:
15  *
16  *	YOSHIFUJI Hideaki @USAGI
17  *		reworked default router selection.
18  *		- respect outgoing interface
19  *		- select from (probably) reachable routers (i.e.
20  *		routers in REACHABLE, STALE, DELAY or PROBE states).
21  *		- always select the same router if it is (probably)
22  *		reachable.  otherwise, round-robin the list.
23  *	Ville Nuorvala
24  *		Fixed routing subtrees.
25  */
26 
27 #define pr_fmt(fmt) "IPv6: " fmt
28 
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <trace/events/fib6.h>
67 
68 #include <linux/uaccess.h>
69 
70 #ifdef CONFIG_SYSCTL
71 #include <linux/sysctl.h>
72 #endif
73 
/*
 * Result of the neighbour reachability check used while scoring routes.
 * Every failure code is negative; rt6_score_route() passes a negative
 * result straight back to its caller.
 */
enum rt6_nud_state {
	RT6_NUD_SUCCEED		=  1,
	/* find_match() keeps the route at score 0 and asks for round-robin */
	RT6_NUD_FAIL_DO_RR	= -1,
	RT6_NUD_FAIL_PROBE	= -2,
	/* find_match() drops the route from consideration */
	RT6_NUD_FAIL_HARD	= -3,
};
80 
81 static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
82 static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
83 static unsigned int	 ip6_mtu(const struct dst_entry *dst);
84 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
85 static void		ip6_dst_destroy(struct dst_entry *);
86 static void		ip6_dst_ifdown(struct dst_entry *,
87 				       struct net_device *dev, int how);
88 static int		 ip6_dst_gc(struct dst_ops *ops);
89 
90 static int		ip6_pkt_discard(struct sk_buff *skb);
91 static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
92 static int		ip6_pkt_prohibit(struct sk_buff *skb);
93 static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
94 static void		ip6_link_failure(struct sk_buff *skb);
95 static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
96 					   struct sk_buff *skb, u32 mtu);
97 static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
98 					struct sk_buff *skb);
99 static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
100 static size_t rt6_nlmsg_size(struct fib6_info *rt);
101 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
102 			 struct fib6_info *rt, struct dst_entry *dst,
103 			 struct in6_addr *dest, struct in6_addr *src,
104 			 int iif, int type, u32 portid, u32 seq,
105 			 unsigned int flags);
106 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
107 					   struct in6_addr *daddr,
108 					   struct in6_addr *saddr);
109 
110 #ifdef CONFIG_IPV6_ROUTE_INFO
111 static struct fib6_info *rt6_add_route_info(struct net *net,
112 					   const struct in6_addr *prefix, int prefixlen,
113 					   const struct in6_addr *gwaddr,
114 					   struct net_device *dev,
115 					   unsigned int pref);
116 static struct fib6_info *rt6_get_route_info(struct net *net,
117 					   const struct in6_addr *prefix, int prefixlen,
118 					   const struct in6_addr *gwaddr,
119 					   struct net_device *dev);
120 #endif
121 
/*
 * A list of rt6_info entries chained through their rt6i_uncached member.
 * The add/del/flush helpers below take @lock with spin_lock_bh() around
 * every modification or walk of @head.
 */
struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

/* One uncached list per CPU. */
static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
128 
129 void rt6_uncached_list_add(struct rt6_info *rt)
130 {
131 	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
132 
133 	rt->rt6i_uncached_list = ul;
134 
135 	spin_lock_bh(&ul->lock);
136 	list_add_tail(&rt->rt6i_uncached, &ul->head);
137 	spin_unlock_bh(&ul->lock);
138 }
139 
140 void rt6_uncached_list_del(struct rt6_info *rt)
141 {
142 	if (!list_empty(&rt->rt6i_uncached)) {
143 		struct uncached_list *ul = rt->rt6i_uncached_list;
144 		struct net *net = dev_net(rt->dst.dev);
145 
146 		spin_lock_bh(&ul->lock);
147 		list_del(&rt->rt6i_uncached);
148 		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
149 		spin_unlock_bh(&ul->lock);
150 	}
151 }
152 
153 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
154 {
155 	struct net_device *loopback_dev = net->loopback_dev;
156 	int cpu;
157 
158 	if (dev == loopback_dev)
159 		return;
160 
161 	for_each_possible_cpu(cpu) {
162 		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
163 		struct rt6_info *rt;
164 
165 		spin_lock_bh(&ul->lock);
166 		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
167 			struct inet6_dev *rt_idev = rt->rt6i_idev;
168 			struct net_device *rt_dev = rt->dst.dev;
169 
170 			if (rt_idev->dev == dev) {
171 				rt->rt6i_idev = in6_dev_get(loopback_dev);
172 				in6_dev_put(rt_idev);
173 			}
174 
175 			if (rt_dev == dev) {
176 				rt->dst.dev = loopback_dev;
177 				dev_hold(rt->dst.dev);
178 				dev_put(rt_dev);
179 			}
180 		}
181 		spin_unlock_bh(&ul->lock);
182 	}
183 }
184 
185 static inline const void *choose_neigh_daddr(const struct in6_addr *p,
186 					     struct sk_buff *skb,
187 					     const void *daddr)
188 {
189 	if (!ipv6_addr_any(p))
190 		return (const void *) p;
191 	else if (skb)
192 		return &ipv6_hdr(skb)->daddr;
193 	return daddr;
194 }
195 
196 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
197 				   struct net_device *dev,
198 				   struct sk_buff *skb,
199 				   const void *daddr)
200 {
201 	struct neighbour *n;
202 
203 	daddr = choose_neigh_daddr(gw, skb, daddr);
204 	n = __ipv6_neigh_lookup(dev, daddr);
205 	if (n)
206 		return n;
207 	return neigh_create(&nd_tbl, daddr, dev);
208 }
209 
210 static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
211 					      struct sk_buff *skb,
212 					      const void *daddr)
213 {
214 	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);
215 
216 	return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
217 }
218 
219 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
220 {
221 	struct net_device *dev = dst->dev;
222 	struct rt6_info *rt = (struct rt6_info *)dst;
223 
224 	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
225 	if (!daddr)
226 		return;
227 	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
228 		return;
229 	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
230 		return;
231 	__ipv6_confirm_neigh(dev, daddr);
232 }
233 
234 static struct dst_ops ip6_dst_ops_template = {
235 	.family			=	AF_INET6,
236 	.gc			=	ip6_dst_gc,
237 	.gc_thresh		=	1024,
238 	.check			=	ip6_dst_check,
239 	.default_advmss		=	ip6_default_advmss,
240 	.mtu			=	ip6_mtu,
241 	.cow_metrics		=	dst_cow_metrics_generic,
242 	.destroy		=	ip6_dst_destroy,
243 	.ifdown			=	ip6_dst_ifdown,
244 	.negative_advice	=	ip6_negative_advice,
245 	.link_failure		=	ip6_link_failure,
246 	.update_pmtu		=	ip6_rt_update_pmtu,
247 	.redirect		=	rt6_do_redirect,
248 	.local_out		=	__ip6_local_out,
249 	.neigh_lookup		=	ip6_dst_neigh_lookup,
250 	.confirm_neigh		=	ip6_confirm_neigh,
251 };
252 
253 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
254 {
255 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
256 
257 	return mtu ? : dst->dev->mtu;
258 }
259 
/* update_pmtu hook for blackhole dsts: intentionally does nothing. */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}
264 
/* redirect hook for blackhole dsts: intentionally does nothing. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}
269 
270 static struct dst_ops ip6_dst_blackhole_ops = {
271 	.family			=	AF_INET6,
272 	.destroy		=	ip6_dst_destroy,
273 	.check			=	ip6_dst_check,
274 	.mtu			=	ip6_blackhole_mtu,
275 	.default_advmss		=	ip6_default_advmss,
276 	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
277 	.redirect		=	ip6_rt_blackhole_redirect,
278 	.cow_metrics		=	dst_cow_metrics_generic,
279 	.neigh_lookup		=	ip6_dst_neigh_lookup,
280 };
281 
/*
 * Metrics array with every entry zero; the hop limit slot is spelled out
 * explicitly.  Metric RTAX_x is stored at index RTAX_x - 1.
 */
static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};
285 
286 static const struct fib6_info fib6_null_entry_template = {
287 	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
288 	.fib6_protocol  = RTPROT_KERNEL,
289 	.fib6_metric	= ~(u32)0,
290 	.fib6_ref	= ATOMIC_INIT(1),
291 	.fib6_type	= RTN_UNREACHABLE,
292 	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
293 };
294 
295 static const struct rt6_info ip6_null_entry_template = {
296 	.dst = {
297 		.__refcnt	= ATOMIC_INIT(1),
298 		.__use		= 1,
299 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
300 		.error		= -ENETUNREACH,
301 		.input		= ip6_pkt_discard,
302 		.output		= ip6_pkt_discard_out,
303 	},
304 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
305 };
306 
307 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
308 
309 static const struct rt6_info ip6_prohibit_entry_template = {
310 	.dst = {
311 		.__refcnt	= ATOMIC_INIT(1),
312 		.__use		= 1,
313 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
314 		.error		= -EACCES,
315 		.input		= ip6_pkt_prohibit,
316 		.output		= ip6_pkt_prohibit_out,
317 	},
318 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
319 };
320 
321 static const struct rt6_info ip6_blk_hole_entry_template = {
322 	.dst = {
323 		.__refcnt	= ATOMIC_INIT(1),
324 		.__use		= 1,
325 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
326 		.error		= -EINVAL,
327 		.input		= dst_discard,
328 		.output		= dst_discard_out,
329 	},
330 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
331 };
332 
333 #endif
334 
335 static void rt6_info_init(struct rt6_info *rt)
336 {
337 	struct dst_entry *dst = &rt->dst;
338 
339 	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
340 	INIT_LIST_HEAD(&rt->rt6i_uncached);
341 }
342 
343 /* allocate dst with ip6_dst_ops */
344 struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
345 			       int flags)
346 {
347 	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
348 					1, DST_OBSOLETE_FORCE_CHK, flags);
349 
350 	if (rt) {
351 		rt6_info_init(rt);
352 		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
353 	}
354 
355 	return rt;
356 }
357 EXPORT_SYMBOL(ip6_dst_alloc);
358 
359 static void ip6_dst_destroy(struct dst_entry *dst)
360 {
361 	struct rt6_info *rt = (struct rt6_info *)dst;
362 	struct fib6_info *from;
363 	struct inet6_dev *idev;
364 
365 	dst_destroy_metrics_generic(dst);
366 	rt6_uncached_list_del(rt);
367 
368 	idev = rt->rt6i_idev;
369 	if (idev) {
370 		rt->rt6i_idev = NULL;
371 		in6_dev_put(idev);
372 	}
373 
374 	rcu_read_lock();
375 	from = rcu_dereference(rt->from);
376 	rcu_assign_pointer(rt->from, NULL);
377 	fib6_info_release(from);
378 	rcu_read_unlock();
379 }
380 
381 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
382 			   int how)
383 {
384 	struct rt6_info *rt = (struct rt6_info *)dst;
385 	struct inet6_dev *idev = rt->rt6i_idev;
386 	struct net_device *loopback_dev =
387 		dev_net(dev)->loopback_dev;
388 
389 	if (idev && idev->dev != loopback_dev) {
390 		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
391 		if (loopback_idev) {
392 			rt->rt6i_idev = loopback_idev;
393 			in6_dev_put(idev);
394 		}
395 	}
396 }
397 
398 static bool __rt6_check_expired(const struct rt6_info *rt)
399 {
400 	if (rt->rt6i_flags & RTF_EXPIRES)
401 		return time_after(jiffies, rt->dst.expires);
402 	else
403 		return false;
404 }
405 
406 static bool rt6_check_expired(const struct rt6_info *rt)
407 {
408 	struct fib6_info *from;
409 
410 	from = rcu_dereference(rt->from);
411 
412 	if (rt->rt6i_flags & RTF_EXPIRES) {
413 		if (time_after(jiffies, rt->dst.expires))
414 			return true;
415 	} else if (from) {
416 		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
417 			fib6_check_expired(from);
418 	}
419 	return false;
420 }
421 
422 struct fib6_info *fib6_multipath_select(const struct net *net,
423 					struct fib6_info *match,
424 					struct flowi6 *fl6, int oif,
425 					const struct sk_buff *skb,
426 					int strict)
427 {
428 	struct fib6_info *sibling, *next_sibling;
429 
430 	/* We might have already computed the hash for ICMPv6 errors. In such
431 	 * case it will always be non-zero. Otherwise now is the time to do it.
432 	 */
433 	if (!fl6->mp_hash)
434 		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
435 
436 	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
437 		return match;
438 
439 	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
440 				 fib6_siblings) {
441 		int nh_upper_bound;
442 
443 		nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
444 		if (fl6->mp_hash > nh_upper_bound)
445 			continue;
446 		if (rt6_score_route(sibling, oif, strict) < 0)
447 			break;
448 		match = sibling;
449 		break;
450 	}
451 
452 	return match;
453 }
454 
455 /*
456  *	Route lookup. rcu_read_lock() should be held.
457  */
458 
459 static inline struct fib6_info *rt6_device_match(struct net *net,
460 						 struct fib6_info *rt,
461 						    const struct in6_addr *saddr,
462 						    int oif,
463 						    int flags)
464 {
465 	struct fib6_info *sprt;
466 
467 	if (!oif && ipv6_addr_any(saddr) &&
468 	    !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
469 		return rt;
470 
471 	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
472 		const struct net_device *dev = sprt->fib6_nh.nh_dev;
473 
474 		if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
475 			continue;
476 
477 		if (oif) {
478 			if (dev->ifindex == oif)
479 				return sprt;
480 		} else {
481 			if (ipv6_chk_addr(net, saddr, dev,
482 					  flags & RT6_LOOKUP_F_IFACE))
483 				return sprt;
484 		}
485 	}
486 
487 	if (oif && flags & RT6_LOOKUP_F_IFACE)
488 		return net->ipv6.fib6_null_entry;
489 
490 	return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
491 }
492 
493 #ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Deferred router probe request, allocated by rt6_probe() and freed by
 * rt6_probe_deferred().
 */
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;		/* address the solicitation is sent for */
	struct net_device *dev;		/* held with dev_hold() until the work has run */
};
499 
500 static void rt6_probe_deferred(struct work_struct *w)
501 {
502 	struct in6_addr mcaddr;
503 	struct __rt6_probe_work *work =
504 		container_of(w, struct __rt6_probe_work, work);
505 
506 	addrconf_addr_solict_mult(&work->target, &mcaddr);
507 	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
508 	dev_put(work->dev);
509 	kfree(work);
510 }
511 
512 static void rt6_probe(struct fib6_info *rt)
513 {
514 	struct __rt6_probe_work *work;
515 	const struct in6_addr *nh_gw;
516 	struct neighbour *neigh;
517 	struct net_device *dev;
518 
519 	/*
520 	 * Okay, this does not seem to be appropriate
521 	 * for now, however, we need to check if it
522 	 * is really so; aka Router Reachability Probing.
523 	 *
524 	 * Router Reachability Probe MUST be rate-limited
525 	 * to no more than one per minute.
526 	 */
527 	if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
528 		return;
529 
530 	nh_gw = &rt->fib6_nh.nh_gw;
531 	dev = rt->fib6_nh.nh_dev;
532 	rcu_read_lock_bh();
533 	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
534 	if (neigh) {
535 		struct inet6_dev *idev;
536 
537 		if (neigh->nud_state & NUD_VALID)
538 			goto out;
539 
540 		idev = __in6_dev_get(dev);
541 		work = NULL;
542 		write_lock(&neigh->lock);
543 		if (!(neigh->nud_state & NUD_VALID) &&
544 		    time_after(jiffies,
545 			       neigh->updated + idev->cnf.rtr_probe_interval)) {
546 			work = kmalloc(sizeof(*work), GFP_ATOMIC);
547 			if (work)
548 				__neigh_set_probe_once(neigh);
549 		}
550 		write_unlock(&neigh->lock);
551 	} else {
552 		work = kmalloc(sizeof(*work), GFP_ATOMIC);
553 	}
554 
555 	if (work) {
556 		INIT_WORK(&work->work, rt6_probe_deferred);
557 		work->target = *nh_gw;
558 		dev_hold(dev);
559 		work->dev = dev;
560 		schedule_work(&work->work);
561 	}
562 
563 out:
564 	rcu_read_unlock_bh();
565 }
566 #else
/* Without CONFIG_IPV6_ROUTER_PREF router probing is compiled out. */
static inline void rt6_probe(struct fib6_info *rt)
{
}
570 #endif
571 
572 /*
573  * Default Router Selection (RFC 2461 6.3.6)
574  */
575 static inline int rt6_check_dev(struct fib6_info *rt, int oif)
576 {
577 	const struct net_device *dev = rt->fib6_nh.nh_dev;
578 
579 	if (!oif || dev->ifindex == oif)
580 		return 2;
581 	return 0;
582 }
583 
584 static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
585 {
586 	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
587 	struct neighbour *neigh;
588 
589 	if (rt->fib6_flags & RTF_NONEXTHOP ||
590 	    !(rt->fib6_flags & RTF_GATEWAY))
591 		return RT6_NUD_SUCCEED;
592 
593 	rcu_read_lock_bh();
594 	neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
595 					  &rt->fib6_nh.nh_gw);
596 	if (neigh) {
597 		read_lock(&neigh->lock);
598 		if (neigh->nud_state & NUD_VALID)
599 			ret = RT6_NUD_SUCCEED;
600 #ifdef CONFIG_IPV6_ROUTER_PREF
601 		else if (!(neigh->nud_state & NUD_FAILED))
602 			ret = RT6_NUD_SUCCEED;
603 		else
604 			ret = RT6_NUD_FAIL_PROBE;
605 #endif
606 		read_unlock(&neigh->lock);
607 	} else {
608 		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
609 		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
610 	}
611 	rcu_read_unlock_bh();
612 
613 	return ret;
614 }
615 
616 static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
617 {
618 	int m;
619 
620 	m = rt6_check_dev(rt, oif);
621 	if (!m && (strict & RT6_LOOKUP_F_IFACE))
622 		return RT6_NUD_FAIL_HARD;
623 #ifdef CONFIG_IPV6_ROUTER_PREF
624 	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
625 #endif
626 	if (strict & RT6_LOOKUP_F_REACHABLE) {
627 		int n = rt6_check_neigh(rt);
628 		if (n < 0)
629 			return n;
630 	}
631 	return m;
632 }
633 
634 /* called with rc_read_lock held */
635 static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
636 {
637 	const struct net_device *dev = fib6_info_nh_dev(f6i);
638 	bool rc = false;
639 
640 	if (dev) {
641 		const struct inet6_dev *idev = __in6_dev_get(dev);
642 
643 		rc = !!idev->cnf.ignore_routes_with_linkdown;
644 	}
645 
646 	return rc;
647 }
648 
649 static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
650 				   int *mpri, struct fib6_info *match,
651 				   bool *do_rr)
652 {
653 	int m;
654 	bool match_do_rr = false;
655 
656 	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
657 		goto out;
658 
659 	if (fib6_ignore_linkdown(rt) &&
660 	    rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
661 	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
662 		goto out;
663 
664 	if (fib6_check_expired(rt))
665 		goto out;
666 
667 	m = rt6_score_route(rt, oif, strict);
668 	if (m == RT6_NUD_FAIL_DO_RR) {
669 		match_do_rr = true;
670 		m = 0; /* lowest valid score */
671 	} else if (m == RT6_NUD_FAIL_HARD) {
672 		goto out;
673 	}
674 
675 	if (strict & RT6_LOOKUP_F_REACHABLE)
676 		rt6_probe(rt);
677 
678 	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
679 	if (m > *mpri) {
680 		*do_rr = match_do_rr;
681 		*mpri = m;
682 		match = rt;
683 	}
684 out:
685 	return match;
686 }
687 
688 static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
689 				     struct fib6_info *leaf,
690 				     struct fib6_info *rr_head,
691 				     u32 metric, int oif, int strict,
692 				     bool *do_rr)
693 {
694 	struct fib6_info *rt, *match, *cont;
695 	int mpri = -1;
696 
697 	match = NULL;
698 	cont = NULL;
699 	for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
700 		if (rt->fib6_metric != metric) {
701 			cont = rt;
702 			break;
703 		}
704 
705 		match = find_match(rt, oif, strict, &mpri, match, do_rr);
706 	}
707 
708 	for (rt = leaf; rt && rt != rr_head;
709 	     rt = rcu_dereference(rt->fib6_next)) {
710 		if (rt->fib6_metric != metric) {
711 			cont = rt;
712 			break;
713 		}
714 
715 		match = find_match(rt, oif, strict, &mpri, match, do_rr);
716 	}
717 
718 	if (match || !cont)
719 		return match;
720 
721 	for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
722 		match = find_match(rt, oif, strict, &mpri, match, do_rr);
723 
724 	return match;
725 }
726 
/*
 * Select the route to use from node @fn.
 *
 * Returns the null entry when the node has no usable leaf, when the node's
 * bit position does not match the prefix length of its first route, or
 * when find_rr_leaf() finds nothing.  When find_rr_leaf() requests
 * round-robin, fn->rr_ptr is advanced to the next route of the same metric
 * (wrapping to the leaf) under the table lock.
 */
static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
				   int oif, int strict)
{
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *match, *rt0;
	bool do_rr = false;
	int key_plen;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		return net->ipv6.fib6_null_entry;

	/* Start scoring at the round-robin pointer, or at the leaf if unset. */
	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not point to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.fib6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct fib6_info *next = rcu_dereference(rt0->fib6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->fib6_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->fib6_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
		}
	}

	return match ? match : net->ipv6.fib6_null_entry;
}
776 
777 static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
778 {
779 	return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
780 }
781 
782 #ifdef CONFIG_IPV6_ROUTE_INFO
783 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
784 		  const struct in6_addr *gwaddr)
785 {
786 	struct net *net = dev_net(dev);
787 	struct route_info *rinfo = (struct route_info *) opt;
788 	struct in6_addr prefix_buf, *prefix;
789 	unsigned int pref;
790 	unsigned long lifetime;
791 	struct fib6_info *rt;
792 
793 	if (len < sizeof(struct route_info)) {
794 		return -EINVAL;
795 	}
796 
797 	/* Sanity check for prefix_len and length */
798 	if (rinfo->length > 3) {
799 		return -EINVAL;
800 	} else if (rinfo->prefix_len > 128) {
801 		return -EINVAL;
802 	} else if (rinfo->prefix_len > 64) {
803 		if (rinfo->length < 2) {
804 			return -EINVAL;
805 		}
806 	} else if (rinfo->prefix_len > 0) {
807 		if (rinfo->length < 1) {
808 			return -EINVAL;
809 		}
810 	}
811 
812 	pref = rinfo->route_pref;
813 	if (pref == ICMPV6_ROUTER_PREF_INVALID)
814 		return -EINVAL;
815 
816 	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
817 
818 	if (rinfo->length == 3)
819 		prefix = (struct in6_addr *)rinfo->prefix;
820 	else {
821 		/* this function is safe */
822 		ipv6_addr_prefix(&prefix_buf,
823 				 (struct in6_addr *)rinfo->prefix,
824 				 rinfo->prefix_len);
825 		prefix = &prefix_buf;
826 	}
827 
828 	if (rinfo->prefix_len == 0)
829 		rt = rt6_get_dflt_router(net, gwaddr, dev);
830 	else
831 		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
832 					gwaddr, dev);
833 
834 	if (rt && !lifetime) {
835 		ip6_del_rt(net, rt);
836 		rt = NULL;
837 	}
838 
839 	if (!rt && lifetime)
840 		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
841 					dev, pref);
842 	else if (rt)
843 		rt->fib6_flags = RTF_ROUTEINFO |
844 				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
845 
846 	if (rt) {
847 		if (!addrconf_finite_timeout(lifetime))
848 			fib6_clean_expires(rt);
849 		else
850 			fib6_set_expires(rt, jiffies + HZ * lifetime);
851 
852 		fib6_info_release(rt);
853 	}
854 	return 0;
855 }
856 #endif
857 
858 /*
859  *	Misc support functions
860  */
861 
862 /* called with rcu_lock held */
863 static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
864 {
865 	struct net_device *dev = rt->fib6_nh.nh_dev;
866 
867 	if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
868 		/* for copies of local routes, dst->dev needs to be the
869 		 * device if it is a master device, the master device if
870 		 * device is enslaved, and the loopback as the default
871 		 */
872 		if (netif_is_l3_slave(dev) &&
873 		    !rt6_need_strict(&rt->fib6_dst.addr))
874 			dev = l3mdev_master_dev_rcu(dev);
875 		else if (!netif_is_l3_master(dev))
876 			dev = dev_net(dev)->loopback_dev;
877 		/* last case is netif_is_l3_master(dev) is true in which
878 		 * case we want dev returned to be dev
879 		 */
880 	}
881 
882 	return dev;
883 }
884 
885 static const int fib6_prop[RTN_MAX + 1] = {
886 	[RTN_UNSPEC]	= 0,
887 	[RTN_UNICAST]	= 0,
888 	[RTN_LOCAL]	= 0,
889 	[RTN_BROADCAST]	= 0,
890 	[RTN_ANYCAST]	= 0,
891 	[RTN_MULTICAST]	= 0,
892 	[RTN_BLACKHOLE]	= -EINVAL,
893 	[RTN_UNREACHABLE] = -EHOSTUNREACH,
894 	[RTN_PROHIBIT]	= -EACCES,
895 	[RTN_THROW]	= -EAGAIN,
896 	[RTN_NAT]	= -EINVAL,
897 	[RTN_XRESOLVE]	= -EINVAL,
898 };
899 
900 static int ip6_rt_type_to_error(u8 fib6_type)
901 {
902 	return fib6_prop[fib6_type];
903 }
904 
905 static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
906 {
907 	unsigned short flags = 0;
908 
909 	if (rt->dst_nocount)
910 		flags |= DST_NOCOUNT;
911 	if (rt->dst_nopolicy)
912 		flags |= DST_NOPOLICY;
913 	if (rt->dst_host)
914 		flags |= DST_HOST;
915 
916 	return flags;
917 }
918 
919 static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
920 {
921 	rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);
922 
923 	switch (ort->fib6_type) {
924 	case RTN_BLACKHOLE:
925 		rt->dst.output = dst_discard_out;
926 		rt->dst.input = dst_discard;
927 		break;
928 	case RTN_PROHIBIT:
929 		rt->dst.output = ip6_pkt_prohibit_out;
930 		rt->dst.input = ip6_pkt_prohibit;
931 		break;
932 	case RTN_THROW:
933 	case RTN_UNREACHABLE:
934 	default:
935 		rt->dst.output = ip6_pkt_discard_out;
936 		rt->dst.input = ip6_pkt_discard;
937 		break;
938 	}
939 }
940 
941 static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
942 {
943 	rt->dst.flags |= fib6_info_dst_flags(ort);
944 
945 	if (ort->fib6_flags & RTF_REJECT) {
946 		ip6_rt_init_dst_reject(rt, ort);
947 		return;
948 	}
949 
950 	rt->dst.error = 0;
951 	rt->dst.output = ip6_output;
952 
953 	if (ort->fib6_type == RTN_LOCAL) {
954 		rt->dst.input = ip6_input;
955 	} else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
956 		rt->dst.input = ip6_mc_input;
957 	} else {
958 		rt->dst.input = ip6_forward;
959 	}
960 
961 	if (ort->fib6_nh.nh_lwtstate) {
962 		rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
963 		lwtunnel_set_redirect(&rt->dst);
964 	}
965 
966 	rt->dst.lastuse = jiffies;
967 }
968 
969 static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
970 {
971 	rt->rt6i_flags &= ~RTF_EXPIRES;
972 	fib6_info_hold(from);
973 	rcu_assign_pointer(rt->from, from);
974 	dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true);
975 	if (from->fib6_metrics != &dst_default_metrics) {
976 		rt->dst._metrics |= DST_METRICS_REFCOUNTED;
977 		refcount_inc(&from->fib6_metrics->refcnt);
978 	}
979 }
980 
981 static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
982 {
983 	struct net_device *dev = fib6_info_nh_dev(ort);
984 
985 	ip6_rt_init_dst(rt, ort);
986 
987 	rt->rt6i_dst = ort->fib6_dst;
988 	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
989 	rt->rt6i_gateway = ort->fib6_nh.nh_gw;
990 	rt->rt6i_flags = ort->fib6_flags;
991 	rt6_set_from(rt, ort);
992 #ifdef CONFIG_IPV6_SUBTREES
993 	rt->rt6i_src = ort->fib6_src;
994 #endif
995 	rt->rt6i_prefsrc = ort->fib6_prefsrc;
996 	rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
997 }
998 
999 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
1000 					struct in6_addr *saddr)
1001 {
1002 	struct fib6_node *pn, *sn;
1003 	while (1) {
1004 		if (fn->fn_flags & RTN_TL_ROOT)
1005 			return NULL;
1006 		pn = rcu_dereference(fn->parent);
1007 		sn = FIB6_SUBTREE(pn);
1008 		if (sn && sn != fn)
1009 			fn = fib6_node_lookup(sn, NULL, saddr);
1010 		else
1011 			fn = pn;
1012 		if (fn->fn_flags & RTN_RTINFO)
1013 			return fn;
1014 	}
1015 }
1016 
1017 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
1018 			  bool null_fallback)
1019 {
1020 	struct rt6_info *rt = *prt;
1021 
1022 	if (dst_hold_safe(&rt->dst))
1023 		return true;
1024 	if (null_fallback) {
1025 		rt = net->ipv6.ip6_null_entry;
1026 		dst_hold(&rt->dst);
1027 	} else {
1028 		rt = NULL;
1029 	}
1030 	*prt = rt;
1031 	return false;
1032 }
1033 
1034 /* called with rcu_lock held */
1035 static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
1036 {
1037 	unsigned short flags = fib6_info_dst_flags(rt);
1038 	struct net_device *dev = rt->fib6_nh.nh_dev;
1039 	struct rt6_info *nrt;
1040 
1041 	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
1042 	if (nrt)
1043 		ip6_rt_copy_init(nrt, rt);
1044 
1045 	return nrt;
1046 }
1047 
/*
 * Look up the route for @fl6 in @table and return a held rt6_info.
 *
 * The result is never NULL: when no route matches, or a dst cannot be
 * referenced or allocated, the held null entry of @net is returned.
 * The whole lookup runs under rcu_read_lock().
 */
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct fib6_info *f6i;
	struct fib6_node *fn;
	struct rt6_info *rt;

	/* The flow asked not to enforce the nexthop output interface. */
	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	f6i = rcu_dereference(fn->leaf);
	if (!f6i) {
		f6i = net->ipv6.fib6_null_entry;
	} else {
		f6i = rt6_device_match(net, f6i, &fl6->saddr,
				      fl6->flowi6_oif, flags);
		/* Multipath selection only when no output interface is given. */
		if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
			f6i = fib6_multipath_select(net, f6i, fl6,
						    fl6->flowi6_oif, skb,
						    flags);
	}
	/* Nothing usable in this node: retry further up the tree. */
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		/* On failure ip6_hold_safe() substitutes the held null entry. */
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);
	} else if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = ip6_create_rt_rcu(f6i);
		if (!rt) {
			rt = net->ipv6.ip6_null_entry;
			dst_hold(&rt->dst);
		}
	}

	rcu_read_unlock();

	return rt;
}
1103 
1104 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
1105 				   const struct sk_buff *skb, int flags)
1106 {
1107 	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
1108 }
1109 EXPORT_SYMBOL_GPL(ip6_route_lookup);
1110 
1111 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
1112 			    const struct in6_addr *saddr, int oif,
1113 			    const struct sk_buff *skb, int strict)
1114 {
1115 	struct flowi6 fl6 = {
1116 		.flowi6_oif = oif,
1117 		.daddr = *daddr,
1118 	};
1119 	struct dst_entry *dst;
1120 	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
1121 
1122 	if (saddr) {
1123 		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
1124 		flags |= RT6_LOOKUP_F_HAS_SADDR;
1125 	}
1126 
1127 	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
1128 	if (dst->error == 0)
1129 		return (struct rt6_info *) dst;
1130 
1131 	dst_release(dst);
1132 
1133 	return NULL;
1134 }
1135 EXPORT_SYMBOL(rt6_lookup);
1136 
1137 /* ip6_ins_rt is called with FREE table->tb6_lock.
1138  * It takes new route entry, the addition fails by any reason the
1139  * route is released.
1140  * Caller must hold dst before calling it.
1141  */
1142 
1143 static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
1144 			struct netlink_ext_ack *extack)
1145 {
1146 	int err;
1147 	struct fib6_table *table;
1148 
1149 	table = rt->fib6_table;
1150 	spin_lock_bh(&table->tb6_lock);
1151 	err = fib6_add(&table->tb6_root, rt, info, extack);
1152 	spin_unlock_bh(&table->tb6_lock);
1153 
1154 	return err;
1155 }
1156 
1157 int ip6_ins_rt(struct net *net, struct fib6_info *rt)
1158 {
1159 	struct nl_info info = {	.nl_net = net, };
1160 
1161 	return __ip6_ins_rt(rt, &info, NULL);
1162 }
1163 
1164 static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
1165 					   const struct in6_addr *daddr,
1166 					   const struct in6_addr *saddr)
1167 {
1168 	struct net_device *dev;
1169 	struct rt6_info *rt;
1170 
1171 	/*
1172 	 *	Clone the route.
1173 	 */
1174 
1175 	dev = ip6_rt_get_dev_rcu(ort);
1176 	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
1177 	if (!rt)
1178 		return NULL;
1179 
1180 	ip6_rt_copy_init(rt, ort);
1181 	rt->rt6i_flags |= RTF_CACHE;
1182 	rt->dst.flags |= DST_HOST;
1183 	rt->rt6i_dst.addr = *daddr;
1184 	rt->rt6i_dst.plen = 128;
1185 
1186 	if (!rt6_is_gw_or_nonexthop(ort)) {
1187 		if (ort->fib6_dst.plen != 128 &&
1188 		    ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
1189 			rt->rt6i_flags |= RTF_ANYCAST;
1190 #ifdef CONFIG_IPV6_SUBTREES
1191 		if (rt->rt6i_src.plen && saddr) {
1192 			rt->rt6i_src.addr = *saddr;
1193 			rt->rt6i_src.plen = 128;
1194 		}
1195 #endif
1196 	}
1197 
1198 	return rt;
1199 }
1200 
1201 static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
1202 {
1203 	unsigned short flags = fib6_info_dst_flags(rt);
1204 	struct net_device *dev;
1205 	struct rt6_info *pcpu_rt;
1206 
1207 	rcu_read_lock();
1208 	dev = ip6_rt_get_dev_rcu(rt);
1209 	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
1210 	rcu_read_unlock();
1211 	if (!pcpu_rt)
1212 		return NULL;
1213 	ip6_rt_copy_init(pcpu_rt, rt);
1214 	pcpu_rt->rt6i_flags |= RTF_PCPU;
1215 	return pcpu_rt;
1216 }
1217 
1218 /* It should be called with rcu_read_lock() acquired */
1219 static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
1220 {
1221 	struct rt6_info *pcpu_rt, **p;
1222 
1223 	p = this_cpu_ptr(rt->rt6i_pcpu);
1224 	pcpu_rt = *p;
1225 
1226 	if (pcpu_rt)
1227 		ip6_hold_safe(NULL, &pcpu_rt, false);
1228 
1229 	return pcpu_rt;
1230 }
1231 
1232 static struct rt6_info *rt6_make_pcpu_route(struct net *net,
1233 					    struct fib6_info *rt)
1234 {
1235 	struct rt6_info *pcpu_rt, *prev, **p;
1236 
1237 	pcpu_rt = ip6_rt_pcpu_alloc(rt);
1238 	if (!pcpu_rt) {
1239 		dst_hold(&net->ipv6.ip6_null_entry->dst);
1240 		return net->ipv6.ip6_null_entry;
1241 	}
1242 
1243 	dst_hold(&pcpu_rt->dst);
1244 	p = this_cpu_ptr(rt->rt6i_pcpu);
1245 	prev = cmpxchg(p, NULL, pcpu_rt);
1246 	BUG_ON(prev);
1247 
1248 	return pcpu_rt;
1249 }
1250 
1251 /* exception hash table implementation
1252  */
1253 static DEFINE_SPINLOCK(rt6_exception_lock);
1254 
1255 /* Remove rt6_ex from hash table and free the memory
1256  * Caller must hold rt6_exception_lock
1257  */
1258 static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1259 				 struct rt6_exception *rt6_ex)
1260 {
1261 	struct net *net;
1262 
1263 	if (!bucket || !rt6_ex)
1264 		return;
1265 
1266 	net = dev_net(rt6_ex->rt6i->dst.dev);
1267 	hlist_del_rcu(&rt6_ex->hlist);
1268 	dst_release(&rt6_ex->rt6i->dst);
1269 	kfree_rcu(rt6_ex, rcu);
1270 	WARN_ON_ONCE(!bucket->depth);
1271 	bucket->depth--;
1272 	net->ipv6.rt6_stats->fib_rt_cache--;
1273 }
1274 
1275 /* Remove oldest rt6_ex in bucket and free the memory
1276  * Caller must hold rt6_exception_lock
1277  */
1278 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1279 {
1280 	struct rt6_exception *rt6_ex, *oldest = NULL;
1281 
1282 	if (!bucket)
1283 		return;
1284 
1285 	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1286 		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1287 			oldest = rt6_ex;
1288 	}
1289 	rt6_remove_exception(bucket, oldest);
1290 }
1291 
1292 static u32 rt6_exception_hash(const struct in6_addr *dst,
1293 			      const struct in6_addr *src)
1294 {
1295 	static u32 seed __read_mostly;
1296 	u32 val;
1297 
1298 	net_get_random_once(&seed, sizeof(seed));
1299 	val = jhash(dst, sizeof(*dst), seed);
1300 
1301 #ifdef CONFIG_IPV6_SUBTREES
1302 	if (src)
1303 		val = jhash(src, sizeof(*src), val);
1304 #endif
1305 	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1306 }
1307 
1308 /* Helper function to find the cached rt in the hash table
1309  * and update bucket pointer to point to the bucket for this
1310  * (daddr, saddr) pair
1311  * Caller must hold rt6_exception_lock
1312  */
1313 static struct rt6_exception *
1314 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1315 			      const struct in6_addr *daddr,
1316 			      const struct in6_addr *saddr)
1317 {
1318 	struct rt6_exception *rt6_ex;
1319 	u32 hval;
1320 
1321 	if (!(*bucket) || !daddr)
1322 		return NULL;
1323 
1324 	hval = rt6_exception_hash(daddr, saddr);
1325 	*bucket += hval;
1326 
1327 	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1328 		struct rt6_info *rt6 = rt6_ex->rt6i;
1329 		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1330 
1331 #ifdef CONFIG_IPV6_SUBTREES
1332 		if (matched && saddr)
1333 			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1334 #endif
1335 		if (matched)
1336 			return rt6_ex;
1337 	}
1338 	return NULL;
1339 }
1340 
1341 /* Helper function to find the cached rt in the hash table
1342  * and update bucket pointer to point to the bucket for this
1343  * (daddr, saddr) pair
1344  * Caller must hold rcu_read_lock()
1345  */
1346 static struct rt6_exception *
1347 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1348 			 const struct in6_addr *daddr,
1349 			 const struct in6_addr *saddr)
1350 {
1351 	struct rt6_exception *rt6_ex;
1352 	u32 hval;
1353 
1354 	WARN_ON_ONCE(!rcu_read_lock_held());
1355 
1356 	if (!(*bucket) || !daddr)
1357 		return NULL;
1358 
1359 	hval = rt6_exception_hash(daddr, saddr);
1360 	*bucket += hval;
1361 
1362 	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1363 		struct rt6_info *rt6 = rt6_ex->rt6i;
1364 		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1365 
1366 #ifdef CONFIG_IPV6_SUBTREES
1367 		if (matched && saddr)
1368 			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1369 #endif
1370 		if (matched)
1371 			return rt6_ex;
1372 	}
1373 	return NULL;
1374 }
1375 
1376 static unsigned int fib6_mtu(const struct fib6_info *rt)
1377 {
1378 	unsigned int mtu;
1379 
1380 	if (rt->fib6_pmtu) {
1381 		mtu = rt->fib6_pmtu;
1382 	} else {
1383 		struct net_device *dev = fib6_info_nh_dev(rt);
1384 		struct inet6_dev *idev;
1385 
1386 		rcu_read_lock();
1387 		idev = __in6_dev_get(dev);
1388 		mtu = idev->cnf.mtu6;
1389 		rcu_read_unlock();
1390 	}
1391 
1392 	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1393 
1394 	return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
1395 }
1396 
1397 static int rt6_insert_exception(struct rt6_info *nrt,
1398 				struct fib6_info *ort)
1399 {
1400 	struct net *net = dev_net(nrt->dst.dev);
1401 	struct rt6_exception_bucket *bucket;
1402 	struct in6_addr *src_key = NULL;
1403 	struct rt6_exception *rt6_ex;
1404 	int err = 0;
1405 
1406 	spin_lock_bh(&rt6_exception_lock);
1407 
1408 	if (ort->exception_bucket_flushed) {
1409 		err = -EINVAL;
1410 		goto out;
1411 	}
1412 
1413 	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
1414 					lockdep_is_held(&rt6_exception_lock));
1415 	if (!bucket) {
1416 		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
1417 				 GFP_ATOMIC);
1418 		if (!bucket) {
1419 			err = -ENOMEM;
1420 			goto out;
1421 		}
1422 		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
1423 	}
1424 
1425 #ifdef CONFIG_IPV6_SUBTREES
1426 	/* rt6i_src.plen != 0 indicates ort is in subtree
1427 	 * and exception table is indexed by a hash of
1428 	 * both rt6i_dst and rt6i_src.
1429 	 * Otherwise, the exception table is indexed by
1430 	 * a hash of only rt6i_dst.
1431 	 */
1432 	if (ort->fib6_src.plen)
1433 		src_key = &nrt->rt6i_src.addr;
1434 #endif
1435 
1436 	/* Update rt6i_prefsrc as it could be changed
1437 	 * in rt6_remove_prefsrc()
1438 	 */
1439 	nrt->rt6i_prefsrc = ort->fib6_prefsrc;
1440 	/* rt6_mtu_change() might lower mtu on ort.
1441 	 * Only insert this exception route if its mtu
1442 	 * is less than ort's mtu value.
1443 	 */
1444 	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
1445 		err = -EINVAL;
1446 		goto out;
1447 	}
1448 
1449 	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
1450 					       src_key);
1451 	if (rt6_ex)
1452 		rt6_remove_exception(bucket, rt6_ex);
1453 
1454 	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
1455 	if (!rt6_ex) {
1456 		err = -ENOMEM;
1457 		goto out;
1458 	}
1459 	rt6_ex->rt6i = nrt;
1460 	rt6_ex->stamp = jiffies;
1461 	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
1462 	bucket->depth++;
1463 	net->ipv6.rt6_stats->fib_rt_cache++;
1464 
1465 	if (bucket->depth > FIB6_MAX_DEPTH)
1466 		rt6_exception_remove_oldest(bucket);
1467 
1468 out:
1469 	spin_unlock_bh(&rt6_exception_lock);
1470 
1471 	/* Update fn->fn_sernum to invalidate all cached dst */
1472 	if (!err) {
1473 		spin_lock_bh(&ort->fib6_table->tb6_lock);
1474 		fib6_update_sernum(net, ort);
1475 		spin_unlock_bh(&ort->fib6_table->tb6_lock);
1476 		fib6_force_start_gc(net);
1477 	}
1478 
1479 	return err;
1480 }
1481 
1482 void rt6_flush_exceptions(struct fib6_info *rt)
1483 {
1484 	struct rt6_exception_bucket *bucket;
1485 	struct rt6_exception *rt6_ex;
1486 	struct hlist_node *tmp;
1487 	int i;
1488 
1489 	spin_lock_bh(&rt6_exception_lock);
1490 	/* Prevent rt6_insert_exception() to recreate the bucket list */
1491 	rt->exception_bucket_flushed = 1;
1492 
1493 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1494 				    lockdep_is_held(&rt6_exception_lock));
1495 	if (!bucket)
1496 		goto out;
1497 
1498 	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1499 		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1500 			rt6_remove_exception(bucket, rt6_ex);
1501 		WARN_ON_ONCE(bucket->depth);
1502 		bucket++;
1503 	}
1504 
1505 out:
1506 	spin_unlock_bh(&rt6_exception_lock);
1507 }
1508 
1509 /* Find cached rt in the hash table inside passed in rt
1510  * Caller has to hold rcu_read_lock()
1511  */
1512 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
1513 					   struct in6_addr *daddr,
1514 					   struct in6_addr *saddr)
1515 {
1516 	struct rt6_exception_bucket *bucket;
1517 	struct in6_addr *src_key = NULL;
1518 	struct rt6_exception *rt6_ex;
1519 	struct rt6_info *res = NULL;
1520 
1521 	bucket = rcu_dereference(rt->rt6i_exception_bucket);
1522 
1523 #ifdef CONFIG_IPV6_SUBTREES
1524 	/* rt6i_src.plen != 0 indicates rt is in subtree
1525 	 * and exception table is indexed by a hash of
1526 	 * both rt6i_dst and rt6i_src.
1527 	 * Otherwise, the exception table is indexed by
1528 	 * a hash of only rt6i_dst.
1529 	 */
1530 	if (rt->fib6_src.plen)
1531 		src_key = saddr;
1532 #endif
1533 	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
1534 
1535 	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1536 		res = rt6_ex->rt6i;
1537 
1538 	return res;
1539 }
1540 
1541 /* Remove the passed in cached rt from the hash table that contains it */
1542 static int rt6_remove_exception_rt(struct rt6_info *rt)
1543 {
1544 	struct rt6_exception_bucket *bucket;
1545 	struct in6_addr *src_key = NULL;
1546 	struct rt6_exception *rt6_ex;
1547 	struct fib6_info *from;
1548 	int err;
1549 
1550 	from = rcu_dereference(rt->from);
1551 	if (!from ||
1552 	    !(rt->rt6i_flags & RTF_CACHE))
1553 		return -EINVAL;
1554 
1555 	if (!rcu_access_pointer(from->rt6i_exception_bucket))
1556 		return -ENOENT;
1557 
1558 	spin_lock_bh(&rt6_exception_lock);
1559 	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1560 				    lockdep_is_held(&rt6_exception_lock));
1561 #ifdef CONFIG_IPV6_SUBTREES
1562 	/* rt6i_src.plen != 0 indicates 'from' is in subtree
1563 	 * and exception table is indexed by a hash of
1564 	 * both rt6i_dst and rt6i_src.
1565 	 * Otherwise, the exception table is indexed by
1566 	 * a hash of only rt6i_dst.
1567 	 */
1568 	if (from->fib6_src.plen)
1569 		src_key = &rt->rt6i_src.addr;
1570 #endif
1571 	rt6_ex = __rt6_find_exception_spinlock(&bucket,
1572 					       &rt->rt6i_dst.addr,
1573 					       src_key);
1574 	if (rt6_ex) {
1575 		rt6_remove_exception(bucket, rt6_ex);
1576 		err = 0;
1577 	} else {
1578 		err = -ENOENT;
1579 	}
1580 
1581 	spin_unlock_bh(&rt6_exception_lock);
1582 	return err;
1583 }
1584 
1585 /* Find rt6_ex which contains the passed in rt cache and
1586  * refresh its stamp
1587  */
1588 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1589 {
1590 	struct rt6_exception_bucket *bucket;
1591 	struct fib6_info *from = rt->from;
1592 	struct in6_addr *src_key = NULL;
1593 	struct rt6_exception *rt6_ex;
1594 
1595 	if (!from ||
1596 	    !(rt->rt6i_flags & RTF_CACHE))
1597 		return;
1598 
1599 	rcu_read_lock();
1600 	bucket = rcu_dereference(from->rt6i_exception_bucket);
1601 
1602 #ifdef CONFIG_IPV6_SUBTREES
1603 	/* rt6i_src.plen != 0 indicates 'from' is in subtree
1604 	 * and exception table is indexed by a hash of
1605 	 * both rt6i_dst and rt6i_src.
1606 	 * Otherwise, the exception table is indexed by
1607 	 * a hash of only rt6i_dst.
1608 	 */
1609 	if (from->fib6_src.plen)
1610 		src_key = &rt->rt6i_src.addr;
1611 #endif
1612 	rt6_ex = __rt6_find_exception_rcu(&bucket,
1613 					  &rt->rt6i_dst.addr,
1614 					  src_key);
1615 	if (rt6_ex)
1616 		rt6_ex->stamp = jiffies;
1617 
1618 	rcu_read_unlock();
1619 }
1620 
1621 static void rt6_exceptions_remove_prefsrc(struct fib6_info *rt)
1622 {
1623 	struct rt6_exception_bucket *bucket;
1624 	struct rt6_exception *rt6_ex;
1625 	int i;
1626 
1627 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1628 					lockdep_is_held(&rt6_exception_lock));
1629 
1630 	if (bucket) {
1631 		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1632 			hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1633 				rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
1634 			}
1635 			bucket++;
1636 		}
1637 	}
1638 }
1639 
1640 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1641 					 struct rt6_info *rt, int mtu)
1642 {
1643 	/* If the new MTU is lower than the route PMTU, this new MTU will be the
1644 	 * lowest MTU in the path: always allow updating the route PMTU to
1645 	 * reflect PMTU decreases.
1646 	 *
1647 	 * If the new MTU is higher, and the route PMTU is equal to the local
1648 	 * MTU, this means the old MTU is the lowest in the path, so allow
1649 	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
1650 	 * handle this.
1651 	 */
1652 
1653 	if (dst_mtu(&rt->dst) >= mtu)
1654 		return true;
1655 
1656 	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
1657 		return true;
1658 
1659 	return false;
1660 }
1661 
1662 static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
1663 				       struct fib6_info *rt, int mtu)
1664 {
1665 	struct rt6_exception_bucket *bucket;
1666 	struct rt6_exception *rt6_ex;
1667 	int i;
1668 
1669 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1670 					lockdep_is_held(&rt6_exception_lock));
1671 
1672 	if (!bucket)
1673 		return;
1674 
1675 	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1676 		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1677 			struct rt6_info *entry = rt6_ex->rt6i;
1678 
1679 			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
1680 			 * route), the metrics of its rt->from have already
1681 			 * been updated.
1682 			 */
1683 			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
1684 			    rt6_mtu_change_route_allowed(idev, entry, mtu))
1685 				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
1686 		}
1687 		bucket++;
1688 	}
1689 }
1690 
1691 #define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)
1692 
1693 static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
1694 					struct in6_addr *gateway)
1695 {
1696 	struct rt6_exception_bucket *bucket;
1697 	struct rt6_exception *rt6_ex;
1698 	struct hlist_node *tmp;
1699 	int i;
1700 
1701 	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1702 		return;
1703 
1704 	spin_lock_bh(&rt6_exception_lock);
1705 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1706 				     lockdep_is_held(&rt6_exception_lock));
1707 
1708 	if (bucket) {
1709 		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1710 			hlist_for_each_entry_safe(rt6_ex, tmp,
1711 						  &bucket->chain, hlist) {
1712 				struct rt6_info *entry = rt6_ex->rt6i;
1713 
1714 				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
1715 				    RTF_CACHE_GATEWAY &&
1716 				    ipv6_addr_equal(gateway,
1717 						    &entry->rt6i_gateway)) {
1718 					rt6_remove_exception(bucket, rt6_ex);
1719 				}
1720 			}
1721 			bucket++;
1722 		}
1723 	}
1724 
1725 	spin_unlock_bh(&rt6_exception_lock);
1726 }
1727 
1728 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
1729 				      struct rt6_exception *rt6_ex,
1730 				      struct fib6_gc_args *gc_args,
1731 				      unsigned long now)
1732 {
1733 	struct rt6_info *rt = rt6_ex->rt6i;
1734 
1735 	/* we are pruning and obsoleting aged-out and non gateway exceptions
1736 	 * even if others have still references to them, so that on next
1737 	 * dst_check() such references can be dropped.
1738 	 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
1739 	 * expired, independently from their aging, as per RFC 8201 section 4
1740 	 */
1741 	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
1742 		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
1743 			RT6_TRACE("aging clone %p\n", rt);
1744 			rt6_remove_exception(bucket, rt6_ex);
1745 			return;
1746 		}
1747 	} else if (time_after(jiffies, rt->dst.expires)) {
1748 		RT6_TRACE("purging expired route %p\n", rt);
1749 		rt6_remove_exception(bucket, rt6_ex);
1750 		return;
1751 	}
1752 
1753 	if (rt->rt6i_flags & RTF_GATEWAY) {
1754 		struct neighbour *neigh;
1755 		__u8 neigh_flags = 0;
1756 
1757 		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
1758 		if (neigh)
1759 			neigh_flags = neigh->flags;
1760 
1761 		if (!(neigh_flags & NTF_ROUTER)) {
1762 			RT6_TRACE("purging route %p via non-router but gateway\n",
1763 				  rt);
1764 			rt6_remove_exception(bucket, rt6_ex);
1765 			return;
1766 		}
1767 	}
1768 
1769 	gc_args->more++;
1770 }
1771 
1772 void rt6_age_exceptions(struct fib6_info *rt,
1773 			struct fib6_gc_args *gc_args,
1774 			unsigned long now)
1775 {
1776 	struct rt6_exception_bucket *bucket;
1777 	struct rt6_exception *rt6_ex;
1778 	struct hlist_node *tmp;
1779 	int i;
1780 
1781 	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1782 		return;
1783 
1784 	rcu_read_lock_bh();
1785 	spin_lock(&rt6_exception_lock);
1786 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1787 				    lockdep_is_held(&rt6_exception_lock));
1788 
1789 	if (bucket) {
1790 		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1791 			hlist_for_each_entry_safe(rt6_ex, tmp,
1792 						  &bucket->chain, hlist) {
1793 				rt6_age_examine_exception(bucket, rt6_ex,
1794 							  gc_args, now);
1795 			}
1796 			bucket++;
1797 		}
1798 	}
1799 	spin_unlock(&rt6_exception_lock);
1800 	rcu_read_unlock_bh();
1801 }
1802 
1803 /* must be called with rcu lock held */
1804 struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
1805 				    int oif, struct flowi6 *fl6, int strict)
1806 {
1807 	struct fib6_node *fn, *saved_fn;
1808 	struct fib6_info *f6i;
1809 
1810 	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1811 	saved_fn = fn;
1812 
1813 	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1814 		oif = 0;
1815 
1816 redo_rt6_select:
1817 	f6i = rt6_select(net, fn, oif, strict);
1818 	if (f6i == net->ipv6.fib6_null_entry) {
1819 		fn = fib6_backtrack(fn, &fl6->saddr);
1820 		if (fn)
1821 			goto redo_rt6_select;
1822 		else if (strict & RT6_LOOKUP_F_REACHABLE) {
1823 			/* also consider unreachable route */
1824 			strict &= ~RT6_LOOKUP_F_REACHABLE;
1825 			fn = saved_fn;
1826 			goto redo_rt6_select;
1827 		}
1828 	}
1829 
1830 	trace_fib6_table_lookup(net, f6i, table, fl6);
1831 
1832 	return f6i;
1833 }
1834 
1835 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1836 			       int oif, struct flowi6 *fl6,
1837 			       const struct sk_buff *skb, int flags)
1838 {
1839 	struct fib6_info *f6i;
1840 	struct rt6_info *rt;
1841 	int strict = 0;
1842 
1843 	strict |= flags & RT6_LOOKUP_F_IFACE;
1844 	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1845 	if (net->ipv6.devconf_all->forwarding == 0)
1846 		strict |= RT6_LOOKUP_F_REACHABLE;
1847 
1848 	rcu_read_lock();
1849 
1850 	f6i = fib6_table_lookup(net, table, oif, fl6, strict);
1851 	if (f6i->fib6_nsiblings)
1852 		f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);
1853 
1854 	if (f6i == net->ipv6.fib6_null_entry) {
1855 		rt = net->ipv6.ip6_null_entry;
1856 		rcu_read_unlock();
1857 		dst_hold(&rt->dst);
1858 		return rt;
1859 	}
1860 
1861 	/*Search through exception table */
1862 	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
1863 	if (rt) {
1864 		if (ip6_hold_safe(net, &rt, true))
1865 			dst_use_noref(&rt->dst, jiffies);
1866 
1867 		rcu_read_unlock();
1868 		return rt;
1869 	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1870 			    !(f6i->fib6_flags & RTF_GATEWAY))) {
1871 		/* Create a RTF_CACHE clone which will not be
1872 		 * owned by the fib6 tree.  It is for the special case where
1873 		 * the daddr in the skb during the neighbor look-up is different
1874 		 * from the fl6->daddr used to look-up route here.
1875 		 */
1876 		struct rt6_info *uncached_rt;
1877 
1878 		uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);
1879 
1880 		rcu_read_unlock();
1881 
1882 		if (uncached_rt) {
1883 			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1884 			 * No need for another dst_hold()
1885 			 */
1886 			rt6_uncached_list_add(uncached_rt);
1887 			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
1888 		} else {
1889 			uncached_rt = net->ipv6.ip6_null_entry;
1890 			dst_hold(&uncached_rt->dst);
1891 		}
1892 
1893 		return uncached_rt;
1894 	} else {
1895 		/* Get a percpu copy */
1896 
1897 		struct rt6_info *pcpu_rt;
1898 
1899 		local_bh_disable();
1900 		pcpu_rt = rt6_get_pcpu_route(f6i);
1901 
1902 		if (!pcpu_rt)
1903 			pcpu_rt = rt6_make_pcpu_route(net, f6i);
1904 
1905 		local_bh_enable();
1906 		rcu_read_unlock();
1907 
1908 		return pcpu_rt;
1909 	}
1910 }
1911 EXPORT_SYMBOL_GPL(ip6_pol_route);
1912 
1913 static struct rt6_info *ip6_pol_route_input(struct net *net,
1914 					    struct fib6_table *table,
1915 					    struct flowi6 *fl6,
1916 					    const struct sk_buff *skb,
1917 					    int flags)
1918 {
1919 	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
1920 }
1921 
1922 struct dst_entry *ip6_route_input_lookup(struct net *net,
1923 					 struct net_device *dev,
1924 					 struct flowi6 *fl6,
1925 					 const struct sk_buff *skb,
1926 					 int flags)
1927 {
1928 	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1929 		flags |= RT6_LOOKUP_F_IFACE;
1930 
1931 	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
1932 }
1933 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1934 
1935 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1936 				  struct flow_keys *keys,
1937 				  struct flow_keys *flkeys)
1938 {
1939 	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1940 	const struct ipv6hdr *key_iph = outer_iph;
1941 	struct flow_keys *_flkeys = flkeys;
1942 	const struct ipv6hdr *inner_iph;
1943 	const struct icmp6hdr *icmph;
1944 	struct ipv6hdr _inner_iph;
1945 	struct icmp6hdr _icmph;
1946 
1947 	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1948 		goto out;
1949 
1950 	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
1951 				   sizeof(_icmph), &_icmph);
1952 	if (!icmph)
1953 		goto out;
1954 
1955 	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1956 	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1957 	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1958 	    icmph->icmp6_type != ICMPV6_PARAMPROB)
1959 		goto out;
1960 
1961 	inner_iph = skb_header_pointer(skb,
1962 				       skb_transport_offset(skb) + sizeof(*icmph),
1963 				       sizeof(_inner_iph), &_inner_iph);
1964 	if (!inner_iph)
1965 		goto out;
1966 
1967 	key_iph = inner_iph;
1968 	_flkeys = NULL;
1969 out:
1970 	if (_flkeys) {
1971 		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
1972 		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
1973 		keys->tags.flow_label = _flkeys->tags.flow_label;
1974 		keys->basic.ip_proto = _flkeys->basic.ip_proto;
1975 	} else {
1976 		keys->addrs.v6addrs.src = key_iph->saddr;
1977 		keys->addrs.v6addrs.dst = key_iph->daddr;
1978 		keys->tags.flow_label = ip6_flowinfo(key_iph);
1979 		keys->basic.ip_proto = key_iph->nexthdr;
1980 	}
1981 }
1982 
1983 /* if skb is set it will be used and fl6 can be NULL */
1984 u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
1985 		       const struct sk_buff *skb, struct flow_keys *flkeys)
1986 {
1987 	struct flow_keys hash_keys;
1988 	u32 mhash;
1989 
1990 	switch (ip6_multipath_hash_policy(net)) {
1991 	case 0:
1992 		memset(&hash_keys, 0, sizeof(hash_keys));
1993 		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1994 		if (skb) {
1995 			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
1996 		} else {
1997 			hash_keys.addrs.v6addrs.src = fl6->saddr;
1998 			hash_keys.addrs.v6addrs.dst = fl6->daddr;
1999 			hash_keys.tags.flow_label = (__force u32)fl6->flowlabel;
2000 			hash_keys.basic.ip_proto = fl6->flowi6_proto;
2001 		}
2002 		break;
2003 	case 1:
2004 		if (skb) {
2005 			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
2006 			struct flow_keys keys;
2007 
2008 			/* short-circuit if we already have L4 hash present */
2009 			if (skb->l4_hash)
2010 				return skb_get_hash_raw(skb) >> 1;
2011 
2012 			memset(&hash_keys, 0, sizeof(hash_keys));
2013 
2014                         if (!flkeys) {
2015 				skb_flow_dissect_flow_keys(skb, &keys, flag);
2016 				flkeys = &keys;
2017 			}
2018 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2019 			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
2020 			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
2021 			hash_keys.ports.src = flkeys->ports.src;
2022 			hash_keys.ports.dst = flkeys->ports.dst;
2023 			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2024 		} else {
2025 			memset(&hash_keys, 0, sizeof(hash_keys));
2026 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2027 			hash_keys.addrs.v6addrs.src = fl6->saddr;
2028 			hash_keys.addrs.v6addrs.dst = fl6->daddr;
2029 			hash_keys.ports.src = fl6->fl6_sport;
2030 			hash_keys.ports.dst = fl6->fl6_dport;
2031 			hash_keys.basic.ip_proto = fl6->flowi6_proto;
2032 		}
2033 		break;
2034 	}
2035 	mhash = flow_hash_from_keys(&hash_keys);
2036 
2037 	return mhash >> 1;
2038 }
2039 
2040 void ip6_route_input(struct sk_buff *skb)
2041 {
2042 	const struct ipv6hdr *iph = ipv6_hdr(skb);
2043 	struct net *net = dev_net(skb->dev);
2044 	int flags = RT6_LOOKUP_F_HAS_SADDR;
2045 	struct ip_tunnel_info *tun_info;
2046 	struct flowi6 fl6 = {
2047 		.flowi6_iif = skb->dev->ifindex,
2048 		.daddr = iph->daddr,
2049 		.saddr = iph->saddr,
2050 		.flowlabel = ip6_flowinfo(iph),
2051 		.flowi6_mark = skb->mark,
2052 		.flowi6_proto = iph->nexthdr,
2053 	};
2054 	struct flow_keys *flkeys = NULL, _flkeys;
2055 
2056 	tun_info = skb_tunnel_info(skb);
2057 	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2058 		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
2059 
2060 	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
2061 		flkeys = &_flkeys;
2062 
2063 	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
2064 		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
2065 	skb_dst_drop(skb);
2066 	skb_dst_set(skb,
2067 		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
2068 }
2069 
2070 static struct rt6_info *ip6_pol_route_output(struct net *net,
2071 					     struct fib6_table *table,
2072 					     struct flowi6 *fl6,
2073 					     const struct sk_buff *skb,
2074 					     int flags)
2075 {
2076 	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
2077 }
2078 
2079 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
2080 					 struct flowi6 *fl6, int flags)
2081 {
2082 	bool any_src;
2083 
2084 	if (rt6_need_strict(&fl6->daddr)) {
2085 		struct dst_entry *dst;
2086 
2087 		dst = l3mdev_link_scope_lookup(net, fl6);
2088 		if (dst)
2089 			return dst;
2090 	}
2091 
2092 	fl6->flowi6_iif = LOOPBACK_IFINDEX;
2093 
2094 	any_src = ipv6_addr_any(&fl6->saddr);
2095 	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
2096 	    (fl6->flowi6_oif && any_src))
2097 		flags |= RT6_LOOKUP_F_IFACE;
2098 
2099 	if (!any_src)
2100 		flags |= RT6_LOOKUP_F_HAS_SADDR;
2101 	else if (sk)
2102 		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
2103 
2104 	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
2105 }
2106 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
2107 
2108 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2109 {
2110 	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
2111 	struct net_device *loopback_dev = net->loopback_dev;
2112 	struct dst_entry *new = NULL;
2113 
2114 	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
2115 		       DST_OBSOLETE_DEAD, 0);
2116 	if (rt) {
2117 		rt6_info_init(rt);
2118 		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
2119 
2120 		new = &rt->dst;
2121 		new->__use = 1;
2122 		new->input = dst_discard;
2123 		new->output = dst_discard_out;
2124 
2125 		dst_copy_metrics(new, &ort->dst);
2126 
2127 		rt->rt6i_idev = in6_dev_get(loopback_dev);
2128 		rt->rt6i_gateway = ort->rt6i_gateway;
2129 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
2130 
2131 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
2132 #ifdef CONFIG_IPV6_SUBTREES
2133 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
2134 #endif
2135 	}
2136 
2137 	dst_release(dst_orig);
2138 	return new ? new : ERR_PTR(-ENOMEM);
2139 }
2140 
2141 /*
2142  *	Destination cache support functions
2143  */
2144 
2145 static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2146 {
2147 	u32 rt_cookie = 0;
2148 
2149 	if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2150 		return false;
2151 
2152 	if (fib6_check_expired(f6i))
2153 		return false;
2154 
2155 	return true;
2156 }
2157 
2158 static struct dst_entry *rt6_check(struct rt6_info *rt,
2159 				   struct fib6_info *from,
2160 				   u32 cookie)
2161 {
2162 	u32 rt_cookie = 0;
2163 
2164 	if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
2165 	    rt_cookie != cookie)
2166 		return NULL;
2167 
2168 	if (rt6_check_expired(rt))
2169 		return NULL;
2170 
2171 	return &rt->dst;
2172 }
2173 
2174 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2175 					    struct fib6_info *from,
2176 					    u32 cookie)
2177 {
2178 	if (!__rt6_check_expired(rt) &&
2179 	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2180 	    fib6_check(from, cookie))
2181 		return &rt->dst;
2182 	else
2183 		return NULL;
2184 }
2185 
2186 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
2187 {
2188 	struct dst_entry *dst_ret;
2189 	struct fib6_info *from;
2190 	struct rt6_info *rt;
2191 
2192 	rt = container_of(dst, struct rt6_info, dst);
2193 
2194 	rcu_read_lock();
2195 
2196 	/* All IPV6 dsts are created with ->obsolete set to the value
2197 	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
2198 	 * into this function always.
2199 	 */
2200 
2201 	from = rcu_dereference(rt->from);
2202 
2203 	if (from && (rt->rt6i_flags & RTF_PCPU ||
2204 	    unlikely(!list_empty(&rt->rt6i_uncached))))
2205 		dst_ret = rt6_dst_from_check(rt, from, cookie);
2206 	else
2207 		dst_ret = rt6_check(rt, from, cookie);
2208 
2209 	rcu_read_unlock();
2210 
2211 	return dst_ret;
2212 }
2213 
2214 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
2215 {
2216 	struct rt6_info *rt = (struct rt6_info *) dst;
2217 
2218 	if (rt) {
2219 		if (rt->rt6i_flags & RTF_CACHE) {
2220 			rcu_read_lock();
2221 			if (rt6_check_expired(rt)) {
2222 				rt6_remove_exception_rt(rt);
2223 				dst = NULL;
2224 			}
2225 			rcu_read_unlock();
2226 		} else {
2227 			dst_release(dst);
2228 			dst = NULL;
2229 		}
2230 	}
2231 	return dst;
2232 }
2233 
2234 static void ip6_link_failure(struct sk_buff *skb)
2235 {
2236 	struct rt6_info *rt;
2237 
2238 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2239 
2240 	rt = (struct rt6_info *) skb_dst(skb);
2241 	if (rt) {
2242 		rcu_read_lock();
2243 		if (rt->rt6i_flags & RTF_CACHE) {
2244 			if (dst_hold_safe(&rt->dst))
2245 				rt6_remove_exception_rt(rt);
2246 		} else {
2247 			struct fib6_info *from;
2248 			struct fib6_node *fn;
2249 
2250 			from = rcu_dereference(rt->from);
2251 			if (from) {
2252 				fn = rcu_dereference(from->fib6_node);
2253 				if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2254 					fn->fn_sernum = -1;
2255 			}
2256 		}
2257 		rcu_read_unlock();
2258 	}
2259 }
2260 
2261 static void rt6_update_expires(struct rt6_info *rt0, int timeout)
2262 {
2263 	if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
2264 		struct fib6_info *from;
2265 
2266 		rcu_read_lock();
2267 		from = rcu_dereference(rt0->from);
2268 		if (from)
2269 			rt0->dst.expires = from->expires;
2270 		rcu_read_unlock();
2271 	}
2272 
2273 	dst_set_expires(&rt0->dst, timeout);
2274 	rt0->rt6i_flags |= RTF_EXPIRES;
2275 }
2276 
2277 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2278 {
2279 	struct net *net = dev_net(rt->dst.dev);
2280 
2281 	dst_metric_set(&rt->dst, RTAX_MTU, mtu);
2282 	rt->rt6i_flags |= RTF_MODIFIED;
2283 	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2284 }
2285 
2286 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2287 {
2288 	bool from_set;
2289 
2290 	rcu_read_lock();
2291 	from_set = !!rcu_dereference(rt->from);
2292 	rcu_read_unlock();
2293 
2294 	return !(rt->rt6i_flags & RTF_CACHE) &&
2295 		(rt->rt6i_flags & RTF_PCPU || from_set);
2296 }
2297 
2298 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2299 				 const struct ipv6hdr *iph, u32 mtu)
2300 {
2301 	const struct in6_addr *daddr, *saddr;
2302 	struct rt6_info *rt6 = (struct rt6_info *)dst;
2303 
2304 	if (rt6->rt6i_flags & RTF_LOCAL)
2305 		return;
2306 
2307 	if (dst_metric_locked(dst, RTAX_MTU))
2308 		return;
2309 
2310 	if (iph) {
2311 		daddr = &iph->daddr;
2312 		saddr = &iph->saddr;
2313 	} else if (sk) {
2314 		daddr = &sk->sk_v6_daddr;
2315 		saddr = &inet6_sk(sk)->saddr;
2316 	} else {
2317 		daddr = NULL;
2318 		saddr = NULL;
2319 	}
2320 	dst_confirm_neigh(dst, daddr);
2321 	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2322 	if (mtu >= dst_mtu(dst))
2323 		return;
2324 
2325 	if (!rt6_cache_allowed_for_pmtu(rt6)) {
2326 		rt6_do_update_pmtu(rt6, mtu);
2327 		/* update rt6_ex->stamp for cache */
2328 		if (rt6->rt6i_flags & RTF_CACHE)
2329 			rt6_update_exception_stamp_rt(rt6);
2330 	} else if (daddr) {
2331 		struct fib6_info *from;
2332 		struct rt6_info *nrt6;
2333 
2334 		rcu_read_lock();
2335 		from = rcu_dereference(rt6->from);
2336 		nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
2337 		if (nrt6) {
2338 			rt6_do_update_pmtu(nrt6, mtu);
2339 			if (rt6_insert_exception(nrt6, from))
2340 				dst_release_immediate(&nrt6->dst);
2341 		}
2342 		rcu_read_unlock();
2343 	}
2344 }
2345 
2346 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2347 			       struct sk_buff *skb, u32 mtu)
2348 {
2349 	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2350 }
2351 
2352 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2353 		     int oif, u32 mark, kuid_t uid)
2354 {
2355 	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2356 	struct dst_entry *dst;
2357 	struct flowi6 fl6;
2358 
2359 	memset(&fl6, 0, sizeof(fl6));
2360 	fl6.flowi6_oif = oif;
2361 	fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
2362 	fl6.daddr = iph->daddr;
2363 	fl6.saddr = iph->saddr;
2364 	fl6.flowlabel = ip6_flowinfo(iph);
2365 	fl6.flowi6_uid = uid;
2366 
2367 	dst = ip6_route_output(net, NULL, &fl6);
2368 	if (!dst->error)
2369 		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2370 	dst_release(dst);
2371 }
2372 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2373 
2374 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2375 {
2376 	struct dst_entry *dst;
2377 
2378 	ip6_update_pmtu(skb, sock_net(sk), mtu,
2379 			sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);
2380 
2381 	dst = __sk_dst_get(sk);
2382 	if (!dst || !dst->obsolete ||
2383 	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2384 		return;
2385 
2386 	bh_lock_sock(sk);
2387 	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2388 		ip6_datagram_dst_update(sk, false);
2389 	bh_unlock_sock(sk);
2390 }
2391 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2392 
2393 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
2394 			   const struct flowi6 *fl6)
2395 {
2396 #ifdef CONFIG_IPV6_SUBTREES
2397 	struct ipv6_pinfo *np = inet6_sk(sk);
2398 #endif
2399 
2400 	ip6_dst_store(sk, dst,
2401 		      ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
2402 		      &sk->sk_v6_daddr : NULL,
2403 #ifdef CONFIG_IPV6_SUBTREES
2404 		      ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
2405 		      &np->saddr :
2406 #endif
2407 		      NULL);
2408 }
2409 
/* Handle redirects */

/* Flow key used for redirect lookups: a regular flowi6 followed by the
 * gateway address the lookup has to match (callers pass the source
 * address of the redirect message).
 *
 * fl6 must stay the first member: __ip6_route_redirect() receives a
 * pointer to it and casts that pointer back to struct ip6rd_flowi.
 */
struct ip6rd_flowi {
	struct flowi6 fl6;
	struct in6_addr gateway;
};
2415 
2416 static struct rt6_info *__ip6_route_redirect(struct net *net,
2417 					     struct fib6_table *table,
2418 					     struct flowi6 *fl6,
2419 					     const struct sk_buff *skb,
2420 					     int flags)
2421 {
2422 	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2423 	struct rt6_info *ret = NULL, *rt_cache;
2424 	struct fib6_info *rt;
2425 	struct fib6_node *fn;
2426 
2427 	/* Get the "current" route for this destination and
2428 	 * check if the redirect has come from appropriate router.
2429 	 *
2430 	 * RFC 4861 specifies that redirects should only be
2431 	 * accepted if they come from the nexthop to the target.
2432 	 * Due to the way the routes are chosen, this notion
2433 	 * is a bit fuzzy and one might need to check all possible
2434 	 * routes.
2435 	 */
2436 
2437 	rcu_read_lock();
2438 	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2439 restart:
2440 	for_each_fib6_node_rt_rcu(fn) {
2441 		if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
2442 			continue;
2443 		if (fib6_check_expired(rt))
2444 			continue;
2445 		if (rt->fib6_flags & RTF_REJECT)
2446 			break;
2447 		if (!(rt->fib6_flags & RTF_GATEWAY))
2448 			continue;
2449 		if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
2450 			continue;
2451 		/* rt_cache's gateway might be different from its 'parent'
2452 		 * in the case of an ip redirect.
2453 		 * So we keep searching in the exception table if the gateway
2454 		 * is different.
2455 		 */
2456 		if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) {
2457 			rt_cache = rt6_find_cached_rt(rt,
2458 						      &fl6->daddr,
2459 						      &fl6->saddr);
2460 			if (rt_cache &&
2461 			    ipv6_addr_equal(&rdfl->gateway,
2462 					    &rt_cache->rt6i_gateway)) {
2463 				ret = rt_cache;
2464 				break;
2465 			}
2466 			continue;
2467 		}
2468 		break;
2469 	}
2470 
2471 	if (!rt)
2472 		rt = net->ipv6.fib6_null_entry;
2473 	else if (rt->fib6_flags & RTF_REJECT) {
2474 		ret = net->ipv6.ip6_null_entry;
2475 		goto out;
2476 	}
2477 
2478 	if (rt == net->ipv6.fib6_null_entry) {
2479 		fn = fib6_backtrack(fn, &fl6->saddr);
2480 		if (fn)
2481 			goto restart;
2482 	}
2483 
2484 out:
2485 	if (ret)
2486 		dst_hold(&ret->dst);
2487 	else
2488 		ret = ip6_create_rt_rcu(rt);
2489 
2490 	rcu_read_unlock();
2491 
2492 	trace_fib6_table_lookup(net, rt, table, fl6);
2493 	return ret;
2494 };
2495 
2496 static struct dst_entry *ip6_route_redirect(struct net *net,
2497 					    const struct flowi6 *fl6,
2498 					    const struct sk_buff *skb,
2499 					    const struct in6_addr *gateway)
2500 {
2501 	int flags = RT6_LOOKUP_F_HAS_SADDR;
2502 	struct ip6rd_flowi rdfl;
2503 
2504 	rdfl.fl6 = *fl6;
2505 	rdfl.gateway = *gateway;
2506 
2507 	return fib6_rule_lookup(net, &rdfl.fl6, skb,
2508 				flags, __ip6_route_redirect);
2509 }
2510 
2511 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2512 		  kuid_t uid)
2513 {
2514 	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2515 	struct dst_entry *dst;
2516 	struct flowi6 fl6;
2517 
2518 	memset(&fl6, 0, sizeof(fl6));
2519 	fl6.flowi6_iif = LOOPBACK_IFINDEX;
2520 	fl6.flowi6_oif = oif;
2521 	fl6.flowi6_mark = mark;
2522 	fl6.daddr = iph->daddr;
2523 	fl6.saddr = iph->saddr;
2524 	fl6.flowlabel = ip6_flowinfo(iph);
2525 	fl6.flowi6_uid = uid;
2526 
2527 	dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2528 	rt6_do_redirect(dst, NULL, skb);
2529 	dst_release(dst);
2530 }
2531 EXPORT_SYMBOL_GPL(ip6_redirect);
2532 
2533 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2534 			    u32 mark)
2535 {
2536 	const struct ipv6hdr *iph = ipv6_hdr(skb);
2537 	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2538 	struct dst_entry *dst;
2539 	struct flowi6 fl6;
2540 
2541 	memset(&fl6, 0, sizeof(fl6));
2542 	fl6.flowi6_iif = LOOPBACK_IFINDEX;
2543 	fl6.flowi6_oif = oif;
2544 	fl6.flowi6_mark = mark;
2545 	fl6.daddr = msg->dest;
2546 	fl6.saddr = iph->daddr;
2547 	fl6.flowi6_uid = sock_net_uid(net, NULL);
2548 
2549 	dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2550 	rt6_do_redirect(dst, NULL, skb);
2551 	dst_release(dst);
2552 }
2553 
2554 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2555 {
2556 	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2557 		     sk->sk_uid);
2558 }
2559 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2560 
2561 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2562 {
2563 	struct net_device *dev = dst->dev;
2564 	unsigned int mtu = dst_mtu(dst);
2565 	struct net *net = dev_net(dev);
2566 
2567 	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2568 
2569 	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2570 		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2571 
2572 	/*
2573 	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2574 	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2575 	 * IPV6_MAXPLEN is also valid and means: "any MSS,
2576 	 * rely only on pmtu discovery"
2577 	 */
2578 	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2579 		mtu = IPV6_MAXPLEN;
2580 	return mtu;
2581 }
2582 
2583 static unsigned int ip6_mtu(const struct dst_entry *dst)
2584 {
2585 	struct inet6_dev *idev;
2586 	unsigned int mtu;
2587 
2588 	mtu = dst_metric_raw(dst, RTAX_MTU);
2589 	if (mtu)
2590 		goto out;
2591 
2592 	mtu = IPV6_MIN_MTU;
2593 
2594 	rcu_read_lock();
2595 	idev = __in6_dev_get(dst->dev);
2596 	if (idev)
2597 		mtu = idev->cnf.mtu6;
2598 	rcu_read_unlock();
2599 
2600 out:
2601 	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2602 
2603 	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2604 }
2605 
2606 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2607 				  struct flowi6 *fl6)
2608 {
2609 	struct dst_entry *dst;
2610 	struct rt6_info *rt;
2611 	struct inet6_dev *idev = in6_dev_get(dev);
2612 	struct net *net = dev_net(dev);
2613 
2614 	if (unlikely(!idev))
2615 		return ERR_PTR(-ENODEV);
2616 
2617 	rt = ip6_dst_alloc(net, dev, 0);
2618 	if (unlikely(!rt)) {
2619 		in6_dev_put(idev);
2620 		dst = ERR_PTR(-ENOMEM);
2621 		goto out;
2622 	}
2623 
2624 	rt->dst.flags |= DST_HOST;
2625 	rt->dst.input = ip6_input;
2626 	rt->dst.output  = ip6_output;
2627 	rt->rt6i_gateway  = fl6->daddr;
2628 	rt->rt6i_dst.addr = fl6->daddr;
2629 	rt->rt6i_dst.plen = 128;
2630 	rt->rt6i_idev     = idev;
2631 	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2632 
2633 	/* Add this dst into uncached_list so that rt6_disable_ip() can
2634 	 * do proper release of the net_device
2635 	 */
2636 	rt6_uncached_list_add(rt);
2637 	atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2638 
2639 	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2640 
2641 out:
2642 	return dst;
2643 }
2644 
2645 static int ip6_dst_gc(struct dst_ops *ops)
2646 {
2647 	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2648 	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2649 	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2650 	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2651 	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2652 	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2653 	int entries;
2654 
2655 	entries = dst_entries_get_fast(ops);
2656 	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2657 	    entries <= rt_max_size)
2658 		goto out;
2659 
2660 	net->ipv6.ip6_rt_gc_expire++;
2661 	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2662 	entries = dst_entries_get_slow(ops);
2663 	if (entries < ops->gc_thresh)
2664 		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2665 out:
2666 	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2667 	return entries > rt_max_size;
2668 }
2669 
2670 static int ip6_convert_metrics(struct net *net, struct fib6_info *rt,
2671 			       struct fib6_config *cfg)
2672 {
2673 	struct dst_metrics *p;
2674 
2675 	if (!cfg->fc_mx)
2676 		return 0;
2677 
2678 	p = kzalloc(sizeof(*rt->fib6_metrics), GFP_KERNEL);
2679 	if (unlikely(!p))
2680 		return -ENOMEM;
2681 
2682 	refcount_set(&p->refcnt, 1);
2683 	rt->fib6_metrics = p;
2684 
2685 	return ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len, p->metrics);
2686 }
2687 
2688 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2689 					    struct fib6_config *cfg,
2690 					    const struct in6_addr *gw_addr,
2691 					    u32 tbid, int flags)
2692 {
2693 	struct flowi6 fl6 = {
2694 		.flowi6_oif = cfg->fc_ifindex,
2695 		.daddr = *gw_addr,
2696 		.saddr = cfg->fc_prefsrc,
2697 	};
2698 	struct fib6_table *table;
2699 	struct rt6_info *rt;
2700 
2701 	table = fib6_get_table(net, tbid);
2702 	if (!table)
2703 		return NULL;
2704 
2705 	if (!ipv6_addr_any(&cfg->fc_prefsrc))
2706 		flags |= RT6_LOOKUP_F_HAS_SADDR;
2707 
2708 	flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2709 	rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2710 
2711 	/* if table lookup failed, fall back to full lookup */
2712 	if (rt == net->ipv6.ip6_null_entry) {
2713 		ip6_rt_put(rt);
2714 		rt = NULL;
2715 	}
2716 
2717 	return rt;
2718 }
2719 
2720 static int ip6_route_check_nh_onlink(struct net *net,
2721 				     struct fib6_config *cfg,
2722 				     const struct net_device *dev,
2723 				     struct netlink_ext_ack *extack)
2724 {
2725 	u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2726 	const struct in6_addr *gw_addr = &cfg->fc_gateway;
2727 	u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2728 	struct rt6_info *grt;
2729 	int err;
2730 
2731 	err = 0;
2732 	grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2733 	if (grt) {
2734 		if (!grt->dst.error &&
2735 		    (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2736 			NL_SET_ERR_MSG(extack,
2737 				       "Nexthop has invalid gateway or device mismatch");
2738 			err = -EINVAL;
2739 		}
2740 
2741 		ip6_rt_put(grt);
2742 	}
2743 
2744 	return err;
2745 }
2746 
2747 static int ip6_route_check_nh(struct net *net,
2748 			      struct fib6_config *cfg,
2749 			      struct net_device **_dev,
2750 			      struct inet6_dev **idev)
2751 {
2752 	const struct in6_addr *gw_addr = &cfg->fc_gateway;
2753 	struct net_device *dev = _dev ? *_dev : NULL;
2754 	struct rt6_info *grt = NULL;
2755 	int err = -EHOSTUNREACH;
2756 
2757 	if (cfg->fc_table) {
2758 		int flags = RT6_LOOKUP_F_IFACE;
2759 
2760 		grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2761 					  cfg->fc_table, flags);
2762 		if (grt) {
2763 			if (grt->rt6i_flags & RTF_GATEWAY ||
2764 			    (dev && dev != grt->dst.dev)) {
2765 				ip6_rt_put(grt);
2766 				grt = NULL;
2767 			}
2768 		}
2769 	}
2770 
2771 	if (!grt)
2772 		grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
2773 
2774 	if (!grt)
2775 		goto out;
2776 
2777 	if (dev) {
2778 		if (dev != grt->dst.dev) {
2779 			ip6_rt_put(grt);
2780 			goto out;
2781 		}
2782 	} else {
2783 		*_dev = dev = grt->dst.dev;
2784 		*idev = grt->rt6i_idev;
2785 		dev_hold(dev);
2786 		in6_dev_hold(grt->rt6i_idev);
2787 	}
2788 
2789 	if (!(grt->rt6i_flags & RTF_GATEWAY))
2790 		err = 0;
2791 
2792 	ip6_rt_put(grt);
2793 
2794 out:
2795 	return err;
2796 }
2797 
2798 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
2799 			   struct net_device **_dev, struct inet6_dev **idev,
2800 			   struct netlink_ext_ack *extack)
2801 {
2802 	const struct in6_addr *gw_addr = &cfg->fc_gateway;
2803 	int gwa_type = ipv6_addr_type(gw_addr);
2804 	bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
2805 	const struct net_device *dev = *_dev;
2806 	bool need_addr_check = !dev;
2807 	int err = -EINVAL;
2808 
2809 	/* if gw_addr is local we will fail to detect this in case
2810 	 * address is still TENTATIVE (DAD in progress). rt6_lookup()
2811 	 * will return already-added prefix route via interface that
2812 	 * prefix route was assigned to, which might be non-loopback.
2813 	 */
2814 	if (dev &&
2815 	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2816 		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2817 		goto out;
2818 	}
2819 
2820 	if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
2821 		/* IPv6 strictly inhibits using not link-local
2822 		 * addresses as nexthop address.
2823 		 * Otherwise, router will not able to send redirects.
2824 		 * It is very good, but in some (rare!) circumstances
2825 		 * (SIT, PtP, NBMA NOARP links) it is handy to allow
2826 		 * some exceptions. --ANK
2827 		 * We allow IPv4-mapped nexthops to support RFC4798-type
2828 		 * addressing
2829 		 */
2830 		if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
2831 			NL_SET_ERR_MSG(extack, "Invalid gateway address");
2832 			goto out;
2833 		}
2834 
2835 		if (cfg->fc_flags & RTNH_F_ONLINK)
2836 			err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
2837 		else
2838 			err = ip6_route_check_nh(net, cfg, _dev, idev);
2839 
2840 		if (err)
2841 			goto out;
2842 	}
2843 
2844 	/* reload in case device was changed */
2845 	dev = *_dev;
2846 
2847 	err = -EINVAL;
2848 	if (!dev) {
2849 		NL_SET_ERR_MSG(extack, "Egress device not specified");
2850 		goto out;
2851 	} else if (dev->flags & IFF_LOOPBACK) {
2852 		NL_SET_ERR_MSG(extack,
2853 			       "Egress device can not be loopback device for this route");
2854 		goto out;
2855 	}
2856 
2857 	/* if we did not check gw_addr above, do so now that the
2858 	 * egress device has been resolved.
2859 	 */
2860 	if (need_addr_check &&
2861 	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2862 		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2863 		goto out;
2864 	}
2865 
2866 	err = 0;
2867 out:
2868 	return err;
2869 }
2870 
2871 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
2872 					      gfp_t gfp_flags,
2873 					      struct netlink_ext_ack *extack)
2874 {
2875 	struct net *net = cfg->fc_nlinfo.nl_net;
2876 	struct fib6_info *rt = NULL;
2877 	struct net_device *dev = NULL;
2878 	struct inet6_dev *idev = NULL;
2879 	struct fib6_table *table;
2880 	int addr_type;
2881 	int err = -EINVAL;
2882 
2883 	/* RTF_PCPU is an internal flag; can not be set by userspace */
2884 	if (cfg->fc_flags & RTF_PCPU) {
2885 		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2886 		goto out;
2887 	}
2888 
2889 	/* RTF_CACHE is an internal flag; can not be set by userspace */
2890 	if (cfg->fc_flags & RTF_CACHE) {
2891 		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
2892 		goto out;
2893 	}
2894 
2895 	if (cfg->fc_type > RTN_MAX) {
2896 		NL_SET_ERR_MSG(extack, "Invalid route type");
2897 		goto out;
2898 	}
2899 
2900 	if (cfg->fc_dst_len > 128) {
2901 		NL_SET_ERR_MSG(extack, "Invalid prefix length");
2902 		goto out;
2903 	}
2904 	if (cfg->fc_src_len > 128) {
2905 		NL_SET_ERR_MSG(extack, "Invalid source address length");
2906 		goto out;
2907 	}
2908 #ifndef CONFIG_IPV6_SUBTREES
2909 	if (cfg->fc_src_len) {
2910 		NL_SET_ERR_MSG(extack,
2911 			       "Specifying source address requires IPV6_SUBTREES to be enabled");
2912 		goto out;
2913 	}
2914 #endif
2915 	if (cfg->fc_ifindex) {
2916 		err = -ENODEV;
2917 		dev = dev_get_by_index(net, cfg->fc_ifindex);
2918 		if (!dev)
2919 			goto out;
2920 		idev = in6_dev_get(dev);
2921 		if (!idev)
2922 			goto out;
2923 	}
2924 
2925 	if (cfg->fc_metric == 0)
2926 		cfg->fc_metric = IP6_RT_PRIO_USER;
2927 
2928 	if (cfg->fc_flags & RTNH_F_ONLINK) {
2929 		if (!dev) {
2930 			NL_SET_ERR_MSG(extack,
2931 				       "Nexthop device required for onlink");
2932 			err = -ENODEV;
2933 			goto out;
2934 		}
2935 
2936 		if (!(dev->flags & IFF_UP)) {
2937 			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
2938 			err = -ENETDOWN;
2939 			goto out;
2940 		}
2941 	}
2942 
2943 	err = -ENOBUFS;
2944 	if (cfg->fc_nlinfo.nlh &&
2945 	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
2946 		table = fib6_get_table(net, cfg->fc_table);
2947 		if (!table) {
2948 			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
2949 			table = fib6_new_table(net, cfg->fc_table);
2950 		}
2951 	} else {
2952 		table = fib6_new_table(net, cfg->fc_table);
2953 	}
2954 
2955 	if (!table)
2956 		goto out;
2957 
2958 	err = -ENOMEM;
2959 	rt = fib6_info_alloc(gfp_flags);
2960 	if (!rt)
2961 		goto out;
2962 
2963 	if (cfg->fc_flags & RTF_ADDRCONF)
2964 		rt->dst_nocount = true;
2965 
2966 	err = ip6_convert_metrics(net, rt, cfg);
2967 	if (err < 0)
2968 		goto out;
2969 
2970 	if (cfg->fc_flags & RTF_EXPIRES)
2971 		fib6_set_expires(rt, jiffies +
2972 				clock_t_to_jiffies(cfg->fc_expires));
2973 	else
2974 		fib6_clean_expires(rt);
2975 
2976 	if (cfg->fc_protocol == RTPROT_UNSPEC)
2977 		cfg->fc_protocol = RTPROT_BOOT;
2978 	rt->fib6_protocol = cfg->fc_protocol;
2979 
2980 	addr_type = ipv6_addr_type(&cfg->fc_dst);
2981 
2982 	if (cfg->fc_encap) {
2983 		struct lwtunnel_state *lwtstate;
2984 
2985 		err = lwtunnel_build_state(cfg->fc_encap_type,
2986 					   cfg->fc_encap, AF_INET6, cfg,
2987 					   &lwtstate, extack);
2988 		if (err)
2989 			goto out;
2990 		rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
2991 	}
2992 
2993 	ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
2994 	rt->fib6_dst.plen = cfg->fc_dst_len;
2995 	if (rt->fib6_dst.plen == 128)
2996 		rt->dst_host = true;
2997 
2998 #ifdef CONFIG_IPV6_SUBTREES
2999 	ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
3000 	rt->fib6_src.plen = cfg->fc_src_len;
3001 #endif
3002 
3003 	rt->fib6_metric = cfg->fc_metric;
3004 	rt->fib6_nh.nh_weight = 1;
3005 
3006 	rt->fib6_type = cfg->fc_type;
3007 
3008 	/* We cannot add true routes via loopback here,
3009 	   they would result in kernel looping; promote them to reject routes
3010 	 */
3011 	if ((cfg->fc_flags & RTF_REJECT) ||
3012 	    (dev && (dev->flags & IFF_LOOPBACK) &&
3013 	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
3014 	     !(cfg->fc_flags & RTF_LOCAL))) {
3015 		/* hold loopback dev/idev if we haven't done so. */
3016 		if (dev != net->loopback_dev) {
3017 			if (dev) {
3018 				dev_put(dev);
3019 				in6_dev_put(idev);
3020 			}
3021 			dev = net->loopback_dev;
3022 			dev_hold(dev);
3023 			idev = in6_dev_get(dev);
3024 			if (!idev) {
3025 				err = -ENODEV;
3026 				goto out;
3027 			}
3028 		}
3029 		rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP;
3030 		goto install_route;
3031 	}
3032 
3033 	if (cfg->fc_flags & RTF_GATEWAY) {
3034 		err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
3035 		if (err)
3036 			goto out;
3037 
3038 		rt->fib6_nh.nh_gw = cfg->fc_gateway;
3039 	}
3040 
3041 	err = -ENODEV;
3042 	if (!dev)
3043 		goto out;
3044 
3045 	if (idev->cnf.disable_ipv6) {
3046 		NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
3047 		err = -EACCES;
3048 		goto out;
3049 	}
3050 
3051 	if (!(dev->flags & IFF_UP)) {
3052 		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3053 		err = -ENETDOWN;
3054 		goto out;
3055 	}
3056 
3057 	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
3058 		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
3059 			NL_SET_ERR_MSG(extack, "Invalid source address");
3060 			err = -EINVAL;
3061 			goto out;
3062 		}
3063 		rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
3064 		rt->fib6_prefsrc.plen = 128;
3065 	} else
3066 		rt->fib6_prefsrc.plen = 0;
3067 
3068 	rt->fib6_flags = cfg->fc_flags;
3069 
3070 install_route:
3071 	if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
3072 	    !netif_carrier_ok(dev))
3073 		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
3074 	rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
3075 	rt->fib6_nh.nh_dev = dev;
3076 	rt->fib6_table = table;
3077 
3078 	cfg->fc_nlinfo.nl_net = dev_net(dev);
3079 
3080 	if (idev)
3081 		in6_dev_put(idev);
3082 
3083 	return rt;
3084 out:
3085 	if (dev)
3086 		dev_put(dev);
3087 	if (idev)
3088 		in6_dev_put(idev);
3089 
3090 	fib6_info_release(rt);
3091 	return ERR_PTR(err);
3092 }
3093 
3094 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3095 		  struct netlink_ext_ack *extack)
3096 {
3097 	struct fib6_info *rt;
3098 	int err;
3099 
3100 	rt = ip6_route_info_create(cfg, gfp_flags, extack);
3101 	if (IS_ERR(rt))
3102 		return PTR_ERR(rt);
3103 
3104 	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3105 	fib6_info_release(rt);
3106 
3107 	return err;
3108 }
3109 
3110 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3111 {
3112 	struct net *net = info->nl_net;
3113 	struct fib6_table *table;
3114 	int err;
3115 
3116 	if (rt == net->ipv6.fib6_null_entry) {
3117 		err = -ENOENT;
3118 		goto out;
3119 	}
3120 
3121 	table = rt->fib6_table;
3122 	spin_lock_bh(&table->tb6_lock);
3123 	err = fib6_del(rt, info);
3124 	spin_unlock_bh(&table->tb6_lock);
3125 
3126 out:
3127 	fib6_info_release(rt);
3128 	return err;
3129 }
3130 
3131 int ip6_del_rt(struct net *net, struct fib6_info *rt)
3132 {
3133 	struct nl_info info = { .nl_net = net };
3134 
3135 	return __ip6_del_rt(rt, &info);
3136 }
3137 
3138 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
3139 {
3140 	struct nl_info *info = &cfg->fc_nlinfo;
3141 	struct net *net = info->nl_net;
3142 	struct sk_buff *skb = NULL;
3143 	struct fib6_table *table;
3144 	int err = -ENOENT;
3145 
3146 	if (rt == net->ipv6.fib6_null_entry)
3147 		goto out_put;
3148 	table = rt->fib6_table;
3149 	spin_lock_bh(&table->tb6_lock);
3150 
3151 	if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
3152 		struct fib6_info *sibling, *next_sibling;
3153 
3154 		/* prefer to send a single notification with all hops */
3155 		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3156 		if (skb) {
3157 			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3158 
3159 			if (rt6_fill_node(net, skb, rt, NULL,
3160 					  NULL, NULL, 0, RTM_DELROUTE,
3161 					  info->portid, seq, 0) < 0) {
3162 				kfree_skb(skb);
3163 				skb = NULL;
3164 			} else
3165 				info->skip_notify = 1;
3166 		}
3167 
3168 		list_for_each_entry_safe(sibling, next_sibling,
3169 					 &rt->fib6_siblings,
3170 					 fib6_siblings) {
3171 			err = fib6_del(sibling, info);
3172 			if (err)
3173 				goto out_unlock;
3174 		}
3175 	}
3176 
3177 	err = fib6_del(rt, info);
3178 out_unlock:
3179 	spin_unlock_bh(&table->tb6_lock);
3180 out_put:
3181 	fib6_info_release(rt);
3182 
3183 	if (skb) {
3184 		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3185 			    info->nlh, gfp_any());
3186 	}
3187 	return err;
3188 }
3189 
3190 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3191 {
3192 	int rc = -ESRCH;
3193 
3194 	if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3195 		goto out;
3196 
3197 	if (cfg->fc_flags & RTF_GATEWAY &&
3198 	    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3199 		goto out;
3200 	if (dst_hold_safe(&rt->dst))
3201 		rc = rt6_remove_exception_rt(rt);
3202 out:
3203 	return rc;
3204 }
3205 
3206 static int ip6_route_del(struct fib6_config *cfg,
3207 			 struct netlink_ext_ack *extack)
3208 {
3209 	struct rt6_info *rt_cache;
3210 	struct fib6_table *table;
3211 	struct fib6_info *rt;
3212 	struct fib6_node *fn;
3213 	int err = -ESRCH;
3214 
3215 	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3216 	if (!table) {
3217 		NL_SET_ERR_MSG(extack, "FIB table does not exist");
3218 		return err;
3219 	}
3220 
3221 	rcu_read_lock();
3222 
3223 	fn = fib6_locate(&table->tb6_root,
3224 			 &cfg->fc_dst, cfg->fc_dst_len,
3225 			 &cfg->fc_src, cfg->fc_src_len,
3226 			 !(cfg->fc_flags & RTF_CACHE));
3227 
3228 	if (fn) {
3229 		for_each_fib6_node_rt_rcu(fn) {
3230 			if (cfg->fc_flags & RTF_CACHE) {
3231 				int rc;
3232 
3233 				rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
3234 							      &cfg->fc_src);
3235 				if (rt_cache) {
3236 					rc = ip6_del_cached_rt(rt_cache, cfg);
3237 					if (rc != -ESRCH) {
3238 						rcu_read_unlock();
3239 						return rc;
3240 					}
3241 				}
3242 				continue;
3243 			}
3244 			if (cfg->fc_ifindex &&
3245 			    (!rt->fib6_nh.nh_dev ||
3246 			     rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex))
3247 				continue;
3248 			if (cfg->fc_flags & RTF_GATEWAY &&
3249 			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
3250 				continue;
3251 			if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
3252 				continue;
3253 			if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
3254 				continue;
3255 			fib6_info_hold(rt);
3256 			rcu_read_unlock();
3257 
3258 			/* if gateway was specified only delete the one hop */
3259 			if (cfg->fc_flags & RTF_GATEWAY)
3260 				return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3261 
3262 			return __ip6_del_rt_siblings(rt, cfg);
3263 		}
3264 	}
3265 	rcu_read_unlock();
3266 
3267 	return err;
3268 }
3269 
3270 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3271 {
3272 	struct netevent_redirect netevent;
3273 	struct rt6_info *rt, *nrt = NULL;
3274 	struct ndisc_options ndopts;
3275 	struct inet6_dev *in6_dev;
3276 	struct neighbour *neigh;
3277 	struct fib6_info *from;
3278 	struct rd_msg *msg;
3279 	int optlen, on_link;
3280 	u8 *lladdr;
3281 
3282 	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3283 	optlen -= sizeof(*msg);
3284 
3285 	if (optlen < 0) {
3286 		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3287 		return;
3288 	}
3289 
3290 	msg = (struct rd_msg *)icmp6_hdr(skb);
3291 
3292 	if (ipv6_addr_is_multicast(&msg->dest)) {
3293 		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3294 		return;
3295 	}
3296 
3297 	on_link = 0;
3298 	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3299 		on_link = 1;
3300 	} else if (ipv6_addr_type(&msg->target) !=
3301 		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3302 		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3303 		return;
3304 	}
3305 
3306 	in6_dev = __in6_dev_get(skb->dev);
3307 	if (!in6_dev)
3308 		return;
3309 	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3310 		return;
3311 
3312 	/* RFC2461 8.1:
3313 	 *	The IP source address of the Redirect MUST be the same as the current
3314 	 *	first-hop router for the specified ICMP Destination Address.
3315 	 */
3316 
3317 	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3318 		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3319 		return;
3320 	}
3321 
3322 	lladdr = NULL;
3323 	if (ndopts.nd_opts_tgt_lladdr) {
3324 		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3325 					     skb->dev);
3326 		if (!lladdr) {
3327 			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3328 			return;
3329 		}
3330 	}
3331 
3332 	rt = (struct rt6_info *) dst;
3333 	if (rt->rt6i_flags & RTF_REJECT) {
3334 		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3335 		return;
3336 	}
3337 
3338 	/* Redirect received -> path was valid.
3339 	 * Look, redirects are sent only in response to data packets,
3340 	 * so that this nexthop apparently is reachable. --ANK
3341 	 */
3342 	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3343 
3344 	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3345 	if (!neigh)
3346 		return;
3347 
3348 	/*
3349 	 *	We have finally decided to accept it.
3350 	 */
3351 
3352 	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3353 		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
3354 		     NEIGH_UPDATE_F_OVERRIDE|
3355 		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3356 				     NEIGH_UPDATE_F_ISROUTER)),
3357 		     NDISC_REDIRECT, &ndopts);
3358 
3359 	rcu_read_lock();
3360 	from = rcu_dereference(rt->from);
3361 	fib6_info_hold(from);
3362 	rcu_read_unlock();
3363 
3364 	nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
3365 	if (!nrt)
3366 		goto out;
3367 
3368 	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3369 	if (on_link)
3370 		nrt->rt6i_flags &= ~RTF_GATEWAY;
3371 
3372 	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3373 
3374 	/* No need to remove rt from the exception table if rt is
3375 	 * a cached route because rt6_insert_exception() will
3376 	 * takes care of it
3377 	 */
3378 	if (rt6_insert_exception(nrt, from)) {
3379 		dst_release_immediate(&nrt->dst);
3380 		goto out;
3381 	}
3382 
3383 	netevent.old = &rt->dst;
3384 	netevent.new = &nrt->dst;
3385 	netevent.daddr = &msg->dest;
3386 	netevent.neigh = neigh;
3387 	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3388 
3389 out:
3390 	fib6_info_release(from);
3391 	neigh_release(neigh);
3392 }
3393 
3394 #ifdef CONFIG_IPV6_ROUTE_INFO
3395 static struct fib6_info *rt6_get_route_info(struct net *net,
3396 					   const struct in6_addr *prefix, int prefixlen,
3397 					   const struct in6_addr *gwaddr,
3398 					   struct net_device *dev)
3399 {
3400 	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3401 	int ifindex = dev->ifindex;
3402 	struct fib6_node *fn;
3403 	struct fib6_info *rt = NULL;
3404 	struct fib6_table *table;
3405 
3406 	table = fib6_get_table(net, tb_id);
3407 	if (!table)
3408 		return NULL;
3409 
3410 	rcu_read_lock();
3411 	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3412 	if (!fn)
3413 		goto out;
3414 
3415 	for_each_fib6_node_rt_rcu(fn) {
3416 		if (rt->fib6_nh.nh_dev->ifindex != ifindex)
3417 			continue;
3418 		if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3419 			continue;
3420 		if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
3421 			continue;
3422 		fib6_info_hold(rt);
3423 		break;
3424 	}
3425 out:
3426 	rcu_read_unlock();
3427 	return rt;
3428 }
3429 
3430 static struct fib6_info *rt6_add_route_info(struct net *net,
3431 					   const struct in6_addr *prefix, int prefixlen,
3432 					   const struct in6_addr *gwaddr,
3433 					   struct net_device *dev,
3434 					   unsigned int pref)
3435 {
3436 	struct fib6_config cfg = {
3437 		.fc_metric	= IP6_RT_PRIO_USER,
3438 		.fc_ifindex	= dev->ifindex,
3439 		.fc_dst_len	= prefixlen,
3440 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3441 				  RTF_UP | RTF_PREF(pref),
3442 		.fc_protocol = RTPROT_RA,
3443 		.fc_type = RTN_UNICAST,
3444 		.fc_nlinfo.portid = 0,
3445 		.fc_nlinfo.nlh = NULL,
3446 		.fc_nlinfo.nl_net = net,
3447 	};
3448 
3449 	cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3450 	cfg.fc_dst = *prefix;
3451 	cfg.fc_gateway = *gwaddr;
3452 
3453 	/* We should treat it as a default route if prefix length is 0. */
3454 	if (!prefixlen)
3455 		cfg.fc_flags |= RTF_DEFAULT;
3456 
3457 	ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3458 
3459 	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3460 }
3461 #endif
3462 
3463 struct fib6_info *rt6_get_dflt_router(struct net *net,
3464 				     const struct in6_addr *addr,
3465 				     struct net_device *dev)
3466 {
3467 	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3468 	struct fib6_info *rt;
3469 	struct fib6_table *table;
3470 
3471 	table = fib6_get_table(net, tb_id);
3472 	if (!table)
3473 		return NULL;
3474 
3475 	rcu_read_lock();
3476 	for_each_fib6_node_rt_rcu(&table->tb6_root) {
3477 		if (dev == rt->fib6_nh.nh_dev &&
3478 		    ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3479 		    ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
3480 			break;
3481 	}
3482 	if (rt)
3483 		fib6_info_hold(rt);
3484 	rcu_read_unlock();
3485 	return rt;
3486 }
3487 
3488 struct fib6_info *rt6_add_dflt_router(struct net *net,
3489 				     const struct in6_addr *gwaddr,
3490 				     struct net_device *dev,
3491 				     unsigned int pref)
3492 {
3493 	struct fib6_config cfg = {
3494 		.fc_table	= l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3495 		.fc_metric	= IP6_RT_PRIO_USER,
3496 		.fc_ifindex	= dev->ifindex,
3497 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3498 				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3499 		.fc_protocol = RTPROT_RA,
3500 		.fc_type = RTN_UNICAST,
3501 		.fc_nlinfo.portid = 0,
3502 		.fc_nlinfo.nlh = NULL,
3503 		.fc_nlinfo.nl_net = net,
3504 	};
3505 
3506 	cfg.fc_gateway = *gwaddr;
3507 
3508 	if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3509 		struct fib6_table *table;
3510 
3511 		table = fib6_get_table(dev_net(dev), cfg.fc_table);
3512 		if (table)
3513 			table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3514 	}
3515 
3516 	return rt6_get_dflt_router(net, gwaddr, dev);
3517 }
3518 
3519 static void __rt6_purge_dflt_routers(struct net *net,
3520 				     struct fib6_table *table)
3521 {
3522 	struct fib6_info *rt;
3523 
3524 restart:
3525 	rcu_read_lock();
3526 	for_each_fib6_node_rt_rcu(&table->tb6_root) {
3527 		struct net_device *dev = fib6_info_nh_dev(rt);
3528 		struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;
3529 
3530 		if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3531 		    (!idev || idev->cnf.accept_ra != 2)) {
3532 			fib6_info_hold(rt);
3533 			rcu_read_unlock();
3534 			ip6_del_rt(net, rt);
3535 			goto restart;
3536 		}
3537 	}
3538 	rcu_read_unlock();
3539 
3540 	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3541 }
3542 
3543 void rt6_purge_dflt_routers(struct net *net)
3544 {
3545 	struct fib6_table *table;
3546 	struct hlist_head *head;
3547 	unsigned int h;
3548 
3549 	rcu_read_lock();
3550 
3551 	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3552 		head = &net->ipv6.fib_table_hash[h];
3553 		hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3554 			if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3555 				__rt6_purge_dflt_routers(net, table);
3556 		}
3557 	}
3558 
3559 	rcu_read_unlock();
3560 }
3561 
3562 static void rtmsg_to_fib6_config(struct net *net,
3563 				 struct in6_rtmsg *rtmsg,
3564 				 struct fib6_config *cfg)
3565 {
3566 	memset(cfg, 0, sizeof(*cfg));
3567 
3568 	cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3569 			 : RT6_TABLE_MAIN;
3570 	cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3571 	cfg->fc_metric = rtmsg->rtmsg_metric;
3572 	cfg->fc_expires = rtmsg->rtmsg_info;
3573 	cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3574 	cfg->fc_src_len = rtmsg->rtmsg_src_len;
3575 	cfg->fc_flags = rtmsg->rtmsg_flags;
3576 	cfg->fc_type = rtmsg->rtmsg_type;
3577 
3578 	cfg->fc_nlinfo.nl_net = net;
3579 
3580 	cfg->fc_dst = rtmsg->rtmsg_dst;
3581 	cfg->fc_src = rtmsg->rtmsg_src;
3582 	cfg->fc_gateway = rtmsg->rtmsg_gateway;
3583 }
3584 
3585 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3586 {
3587 	struct fib6_config cfg;
3588 	struct in6_rtmsg rtmsg;
3589 	int err;
3590 
3591 	switch (cmd) {
3592 	case SIOCADDRT:		/* Add a route */
3593 	case SIOCDELRT:		/* Delete a route */
3594 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3595 			return -EPERM;
3596 		err = copy_from_user(&rtmsg, arg,
3597 				     sizeof(struct in6_rtmsg));
3598 		if (err)
3599 			return -EFAULT;
3600 
3601 		rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3602 
3603 		rtnl_lock();
3604 		switch (cmd) {
3605 		case SIOCADDRT:
3606 			err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3607 			break;
3608 		case SIOCDELRT:
3609 			err = ip6_route_del(&cfg, NULL);
3610 			break;
3611 		default:
3612 			err = -EINVAL;
3613 		}
3614 		rtnl_unlock();
3615 
3616 		return err;
3617 	}
3618 
3619 	return -EINVAL;
3620 }
3621 
3622 /*
3623  *	Drop the packet on the floor
3624  */
3625 
3626 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3627 {
3628 	int type;
3629 	struct dst_entry *dst = skb_dst(skb);
3630 	switch (ipstats_mib_noroutes) {
3631 	case IPSTATS_MIB_INNOROUTES:
3632 		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3633 		if (type == IPV6_ADDR_ANY) {
3634 			IP6_INC_STATS(dev_net(dst->dev),
3635 				      __in6_dev_get_safely(skb->dev),
3636 				      IPSTATS_MIB_INADDRERRORS);
3637 			break;
3638 		}
3639 		/* FALLTHROUGH */
3640 	case IPSTATS_MIB_OUTNOROUTES:
3641 		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3642 			      ipstats_mib_noroutes);
3643 		break;
3644 	}
3645 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3646 	kfree_skb(skb);
3647 	return 0;
3648 }
3649 
3650 static int ip6_pkt_discard(struct sk_buff *skb)
3651 {
3652 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3653 }
3654 
3655 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3656 {
3657 	skb->dev = skb_dst(skb)->dev;
3658 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3659 }
3660 
3661 static int ip6_pkt_prohibit(struct sk_buff *skb)
3662 {
3663 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3664 }
3665 
3666 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3667 {
3668 	skb->dev = skb_dst(skb)->dev;
3669 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3670 }
3671 
3672 /*
3673  *	Allocate a dst for local (unicast / anycast) address.
3674  */
3675 
3676 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3677 				     struct inet6_dev *idev,
3678 				     const struct in6_addr *addr,
3679 				     bool anycast, gfp_t gfp_flags)
3680 {
3681 	u32 tb_id;
3682 	struct net_device *dev = idev->dev;
3683 	struct fib6_info *f6i;
3684 
3685 	f6i = fib6_info_alloc(gfp_flags);
3686 	if (!f6i)
3687 		return ERR_PTR(-ENOMEM);
3688 
3689 	f6i->dst_nocount = true;
3690 	f6i->dst_host = true;
3691 	f6i->fib6_protocol = RTPROT_KERNEL;
3692 	f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP;
3693 	if (anycast) {
3694 		f6i->fib6_type = RTN_ANYCAST;
3695 		f6i->fib6_flags |= RTF_ANYCAST;
3696 	} else {
3697 		f6i->fib6_type = RTN_LOCAL;
3698 		f6i->fib6_flags |= RTF_LOCAL;
3699 	}
3700 
3701 	f6i->fib6_nh.nh_gw = *addr;
3702 	dev_hold(dev);
3703 	f6i->fib6_nh.nh_dev = dev;
3704 	f6i->fib6_dst.addr = *addr;
3705 	f6i->fib6_dst.plen = 128;
3706 	tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3707 	f6i->fib6_table = fib6_get_table(net, tb_id);
3708 
3709 	return f6i;
3710 }
3711 
/* remove deleted ip from prefsrc entries */
/* Argument bundle handed to fib6_remove_prefsrc() through fib6_clean_all(). */
struct arg_dev_net_ip {
	struct net_device *dev;	/* restrict to routes on this device; NULL matches any */
	struct net *net;	/* namespace, used to recognise its fib6_null_entry */
	struct in6_addr *addr;	/* address compared against each route's prefsrc */
};
3718 
3719 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3720 {
3721 	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3722 	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3723 	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3724 
3725 	if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
3726 	    rt != net->ipv6.fib6_null_entry &&
3727 	    ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3728 		spin_lock_bh(&rt6_exception_lock);
3729 		/* remove prefsrc entry */
3730 		rt->fib6_prefsrc.plen = 0;
3731 		/* need to update cache as well */
3732 		rt6_exceptions_remove_prefsrc(rt);
3733 		spin_unlock_bh(&rt6_exception_lock);
3734 	}
3735 	return 0;
3736 }
3737 
3738 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3739 {
3740 	struct net *net = dev_net(ifp->idev->dev);
3741 	struct arg_dev_net_ip adni = {
3742 		.dev = ifp->idev->dev,
3743 		.net = net,
3744 		.addr = &ifp->addr,
3745 	};
3746 	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3747 }
3748 
3749 #define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3750 
3751 /* Remove routers and update dst entries when gateway turn into host. */
3752 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3753 {
3754 	struct in6_addr *gateway = (struct in6_addr *)arg;
3755 
3756 	if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3757 	    ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
3758 		return -1;
3759 	}
3760 
3761 	/* Further clean up cached routes in exception table.
3762 	 * This is needed because cached route may have a different
3763 	 * gateway than its 'parent' in the case of an ip redirect.
3764 	 */
3765 	rt6_exceptions_clean_tohost(rt, gateway);
3766 
3767 	return 0;
3768 }
3769 
3770 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3771 {
3772 	fib6_clean_all(net, fib6_clean_tohost, gateway);
3773 }
3774 
/*
 * Argument for the fib6_ifup() / fib6_ifdown() tree walkers. Only one
 * member of the union is meaningful per walk.
 */
struct arg_netdev_event {
	const struct net_device *dev;	/* device the event is about */
	union {
		unsigned int nh_flags;	/* fib6_ifup(): nexthop flags to clear */
		unsigned long event;	/* fib6_ifdown(): NETDEV_* event */
	};
};
3782 
3783 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
3784 {
3785 	struct fib6_info *iter;
3786 	struct fib6_node *fn;
3787 
3788 	fn = rcu_dereference_protected(rt->fib6_node,
3789 			lockdep_is_held(&rt->fib6_table->tb6_lock));
3790 	iter = rcu_dereference_protected(fn->leaf,
3791 			lockdep_is_held(&rt->fib6_table->tb6_lock));
3792 	while (iter) {
3793 		if (iter->fib6_metric == rt->fib6_metric &&
3794 		    iter->fib6_nsiblings)
3795 			return iter;
3796 		iter = rcu_dereference_protected(iter->fib6_next,
3797 				lockdep_is_held(&rt->fib6_table->tb6_lock));
3798 	}
3799 
3800 	return NULL;
3801 }
3802 
3803 static bool rt6_is_dead(const struct fib6_info *rt)
3804 {
3805 	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
3806 	    (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
3807 	     fib6_ignore_linkdown(rt)))
3808 		return true;
3809 
3810 	return false;
3811 }
3812 
3813 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3814 {
3815 	struct fib6_info *iter;
3816 	int total = 0;
3817 
3818 	if (!rt6_is_dead(rt))
3819 		total += rt->fib6_nh.nh_weight;
3820 
3821 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3822 		if (!rt6_is_dead(iter))
3823 			total += iter->fib6_nh.nh_weight;
3824 	}
3825 
3826 	return total;
3827 }
3828 
3829 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
3830 {
3831 	int upper_bound = -1;
3832 
3833 	if (!rt6_is_dead(rt)) {
3834 		*weight += rt->fib6_nh.nh_weight;
3835 		upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3836 						    total) - 1;
3837 	}
3838 	atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound);
3839 }
3840 
3841 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3842 {
3843 	struct fib6_info *iter;
3844 	int weight = 0;
3845 
3846 	rt6_upper_bound_set(rt, &weight, total);
3847 
3848 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3849 		rt6_upper_bound_set(iter, &weight, total);
3850 }
3851 
3852 void rt6_multipath_rebalance(struct fib6_info *rt)
3853 {
3854 	struct fib6_info *first;
3855 	int total;
3856 
3857 	/* In case the entire multipath route was marked for flushing,
3858 	 * then there is no need to rebalance upon the removal of every
3859 	 * sibling route.
3860 	 */
3861 	if (!rt->fib6_nsiblings || rt->should_flush)
3862 		return;
3863 
3864 	/* During lookup routes are evaluated in order, so we need to
3865 	 * make sure upper bounds are assigned from the first sibling
3866 	 * onwards.
3867 	 */
3868 	first = rt6_multipath_first_sibling(rt);
3869 	if (WARN_ON_ONCE(!first))
3870 		return;
3871 
3872 	total = rt6_multipath_total_weight(first);
3873 	rt6_multipath_upper_bound_set(first, total);
3874 }
3875 
3876 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
3877 {
3878 	const struct arg_netdev_event *arg = p_arg;
3879 	struct net *net = dev_net(arg->dev);
3880 
3881 	if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
3882 		rt->fib6_nh.nh_flags &= ~arg->nh_flags;
3883 		fib6_update_sernum_upto_root(net, rt);
3884 		rt6_multipath_rebalance(rt);
3885 	}
3886 
3887 	return 0;
3888 }
3889 
3890 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
3891 {
3892 	struct arg_netdev_event arg = {
3893 		.dev = dev,
3894 		{
3895 			.nh_flags = nh_flags,
3896 		},
3897 	};
3898 
3899 	if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
3900 		arg.nh_flags |= RTNH_F_LINKDOWN;
3901 
3902 	fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
3903 }
3904 
3905 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
3906 				   const struct net_device *dev)
3907 {
3908 	struct fib6_info *iter;
3909 
3910 	if (rt->fib6_nh.nh_dev == dev)
3911 		return true;
3912 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3913 		if (iter->fib6_nh.nh_dev == dev)
3914 			return true;
3915 
3916 	return false;
3917 }
3918 
3919 static void rt6_multipath_flush(struct fib6_info *rt)
3920 {
3921 	struct fib6_info *iter;
3922 
3923 	rt->should_flush = 1;
3924 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3925 		iter->should_flush = 1;
3926 }
3927 
3928 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
3929 					     const struct net_device *down_dev)
3930 {
3931 	struct fib6_info *iter;
3932 	unsigned int dead = 0;
3933 
3934 	if (rt->fib6_nh.nh_dev == down_dev ||
3935 	    rt->fib6_nh.nh_flags & RTNH_F_DEAD)
3936 		dead++;
3937 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3938 		if (iter->fib6_nh.nh_dev == down_dev ||
3939 		    iter->fib6_nh.nh_flags & RTNH_F_DEAD)
3940 			dead++;
3941 
3942 	return dead;
3943 }
3944 
3945 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
3946 				       const struct net_device *dev,
3947 				       unsigned int nh_flags)
3948 {
3949 	struct fib6_info *iter;
3950 
3951 	if (rt->fib6_nh.nh_dev == dev)
3952 		rt->fib6_nh.nh_flags |= nh_flags;
3953 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3954 		if (iter->fib6_nh.nh_dev == dev)
3955 			iter->fib6_nh.nh_flags |= nh_flags;
3956 }
3957 
3958 /* called with write lock held for table with rt */
3959 static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
3960 {
3961 	const struct arg_netdev_event *arg = p_arg;
3962 	const struct net_device *dev = arg->dev;
3963 	struct net *net = dev_net(dev);
3964 
3965 	if (rt == net->ipv6.fib6_null_entry)
3966 		return 0;
3967 
3968 	switch (arg->event) {
3969 	case NETDEV_UNREGISTER:
3970 		return rt->fib6_nh.nh_dev == dev ? -1 : 0;
3971 	case NETDEV_DOWN:
3972 		if (rt->should_flush)
3973 			return -1;
3974 		if (!rt->fib6_nsiblings)
3975 			return rt->fib6_nh.nh_dev == dev ? -1 : 0;
3976 		if (rt6_multipath_uses_dev(rt, dev)) {
3977 			unsigned int count;
3978 
3979 			count = rt6_multipath_dead_count(rt, dev);
3980 			if (rt->fib6_nsiblings + 1 == count) {
3981 				rt6_multipath_flush(rt);
3982 				return -1;
3983 			}
3984 			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
3985 						   RTNH_F_LINKDOWN);
3986 			fib6_update_sernum(net, rt);
3987 			rt6_multipath_rebalance(rt);
3988 		}
3989 		return -2;
3990 	case NETDEV_CHANGE:
3991 		if (rt->fib6_nh.nh_dev != dev ||
3992 		    rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
3993 			break;
3994 		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
3995 		rt6_multipath_rebalance(rt);
3996 		break;
3997 	}
3998 
3999 	return 0;
4000 }
4001 
4002 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
4003 {
4004 	struct arg_netdev_event arg = {
4005 		.dev = dev,
4006 		{
4007 			.event = event,
4008 		},
4009 	};
4010 
4011 	fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
4012 }
4013 
4014 void rt6_disable_ip(struct net_device *dev, unsigned long event)
4015 {
4016 	rt6_sync_down_dev(dev, event);
4017 	rt6_uncached_list_flush_dev(dev_net(dev), dev);
4018 	neigh_ifdown(&nd_tbl, dev);
4019 }
4020 
/* Argument passed to rt6_mtu_change_route() through fib6_clean_all(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* the new MTU */
};
4025 
4026 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
4027 {
4028 	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
4029 	struct inet6_dev *idev;
4030 
4031 	/* In IPv6 pmtu discovery is not optional,
4032 	   so that RTAX_MTU lock cannot disable it.
4033 	   We still use this lock to block changes
4034 	   caused by addrconf/ndisc.
4035 	*/
4036 
4037 	idev = __in6_dev_get(arg->dev);
4038 	if (!idev)
4039 		return 0;
4040 
4041 	/* For administrative MTU increase, there is no way to discover
4042 	   IPv6 PMTU increase, so PMTU increase should be updated here.
4043 	   Since RFC 1981 doesn't include administrative MTU increase
4044 	   update PMTU increase is a MUST. (i.e. jumbo frame)
4045 	 */
4046 	if (rt->fib6_nh.nh_dev == arg->dev &&
4047 	    !fib6_metric_locked(rt, RTAX_MTU)) {
4048 		u32 mtu = rt->fib6_pmtu;
4049 
4050 		if (mtu >= arg->mtu ||
4051 		    (mtu < arg->mtu && mtu == idev->cnf.mtu6))
4052 			fib6_metric_set(rt, RTAX_MTU, arg->mtu);
4053 
4054 		spin_lock_bh(&rt6_exception_lock);
4055 		rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
4056 		spin_unlock_bh(&rt6_exception_lock);
4057 	}
4058 	return 0;
4059 }
4060 
4061 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4062 {
4063 	struct rt6_mtu_change_arg arg = {
4064 		.dev = dev,
4065 		.mtu = mtu,
4066 	};
4067 
4068 	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
4069 }
4070 
/*
 * Netlink attribute policy for IPv6 route messages; handed to nlmsg_parse()
 * by rtm_to_fib6_config(). Attributes without an entry here (e.g. RTA_DST,
 * RTA_SRC) are length-checked by the parser's caller instead.
 */
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
	[RTA_PREFSRC]		= { .len = sizeof(struct in6_addr) },
	[RTA_OIF]               = { .type = NLA_U32 },
	[RTA_IIF]		= { .type = NLA_U32 },
	[RTA_PRIORITY]          = { .type = NLA_U32 },
	[RTA_METRICS]           = { .type = NLA_NESTED },
	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
	[RTA_PREF]              = { .type = NLA_U8 },
	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
	[RTA_ENCAP]		= { .type = NLA_NESTED },
	[RTA_EXPIRES]		= { .type = NLA_U32 },
	[RTA_UID]		= { .type = NLA_U32 },
	[RTA_MARK]		= { .type = NLA_U32 },
	[RTA_TABLE]		= { .type = NLA_U32 },
};
4087 
4088 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
4089 			      struct fib6_config *cfg,
4090 			      struct netlink_ext_ack *extack)
4091 {
4092 	struct rtmsg *rtm;
4093 	struct nlattr *tb[RTA_MAX+1];
4094 	unsigned int pref;
4095 	int err;
4096 
4097 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4098 			  NULL);
4099 	if (err < 0)
4100 		goto errout;
4101 
4102 	err = -EINVAL;
4103 	rtm = nlmsg_data(nlh);
4104 	memset(cfg, 0, sizeof(*cfg));
4105 
4106 	cfg->fc_table = rtm->rtm_table;
4107 	cfg->fc_dst_len = rtm->rtm_dst_len;
4108 	cfg->fc_src_len = rtm->rtm_src_len;
4109 	cfg->fc_flags = RTF_UP;
4110 	cfg->fc_protocol = rtm->rtm_protocol;
4111 	cfg->fc_type = rtm->rtm_type;
4112 
4113 	if (rtm->rtm_type == RTN_UNREACHABLE ||
4114 	    rtm->rtm_type == RTN_BLACKHOLE ||
4115 	    rtm->rtm_type == RTN_PROHIBIT ||
4116 	    rtm->rtm_type == RTN_THROW)
4117 		cfg->fc_flags |= RTF_REJECT;
4118 
4119 	if (rtm->rtm_type == RTN_LOCAL)
4120 		cfg->fc_flags |= RTF_LOCAL;
4121 
4122 	if (rtm->rtm_flags & RTM_F_CLONED)
4123 		cfg->fc_flags |= RTF_CACHE;
4124 
4125 	cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4126 
4127 	cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
4128 	cfg->fc_nlinfo.nlh = nlh;
4129 	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
4130 
4131 	if (tb[RTA_GATEWAY]) {
4132 		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4133 		cfg->fc_flags |= RTF_GATEWAY;
4134 	}
4135 
4136 	if (tb[RTA_DST]) {
4137 		int plen = (rtm->rtm_dst_len + 7) >> 3;
4138 
4139 		if (nla_len(tb[RTA_DST]) < plen)
4140 			goto errout;
4141 
4142 		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
4143 	}
4144 
4145 	if (tb[RTA_SRC]) {
4146 		int plen = (rtm->rtm_src_len + 7) >> 3;
4147 
4148 		if (nla_len(tb[RTA_SRC]) < plen)
4149 			goto errout;
4150 
4151 		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
4152 	}
4153 
4154 	if (tb[RTA_PREFSRC])
4155 		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4156 
4157 	if (tb[RTA_OIF])
4158 		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4159 
4160 	if (tb[RTA_PRIORITY])
4161 		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4162 
4163 	if (tb[RTA_METRICS]) {
4164 		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4165 		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
4166 	}
4167 
4168 	if (tb[RTA_TABLE])
4169 		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4170 
4171 	if (tb[RTA_MULTIPATH]) {
4172 		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4173 		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4174 
4175 		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4176 						     cfg->fc_mp_len, extack);
4177 		if (err < 0)
4178 			goto errout;
4179 	}
4180 
4181 	if (tb[RTA_PREF]) {
4182 		pref = nla_get_u8(tb[RTA_PREF]);
4183 		if (pref != ICMPV6_ROUTER_PREF_LOW &&
4184 		    pref != ICMPV6_ROUTER_PREF_HIGH)
4185 			pref = ICMPV6_ROUTER_PREF_MEDIUM;
4186 		cfg->fc_flags |= RTF_PREF(pref);
4187 	}
4188 
4189 	if (tb[RTA_ENCAP])
4190 		cfg->fc_encap = tb[RTA_ENCAP];
4191 
4192 	if (tb[RTA_ENCAP_TYPE]) {
4193 		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4194 
4195 		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4196 		if (err < 0)
4197 			goto errout;
4198 	}
4199 
4200 	if (tb[RTA_EXPIRES]) {
4201 		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4202 
4203 		if (addrconf_finite_timeout(timeout)) {
4204 			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4205 			cfg->fc_flags |= RTF_EXPIRES;
4206 		}
4207 	}
4208 
4209 	err = 0;
4210 errout:
4211 	return err;
4212 }
4213 
/* One nexthop of a multipath request, queued on a local list while the
 * request is processed.
 */
struct rt6_nh {
	struct fib6_info *fib6_info;	/* fib entry built for this nexthop */
	struct fib6_config r_cfg;	/* copy of the per-nexthop configuration */
	struct list_head next;		/* linkage on the nexthop list */
};
4219 
4220 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
4221 {
4222 	struct rt6_nh *nh;
4223 
4224 	list_for_each_entry(nh, rt6_nh_list, next) {
4225 		pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
4226 		        &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
4227 		        nh->r_cfg.fc_ifindex);
4228 	}
4229 }
4230 
4231 static int ip6_route_info_append(struct net *net,
4232 				 struct list_head *rt6_nh_list,
4233 				 struct fib6_info *rt,
4234 				 struct fib6_config *r_cfg)
4235 {
4236 	struct rt6_nh *nh;
4237 	int err = -EEXIST;
4238 
4239 	list_for_each_entry(nh, rt6_nh_list, next) {
4240 		/* check if fib6_info already exists */
4241 		if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4242 			return err;
4243 	}
4244 
4245 	nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4246 	if (!nh)
4247 		return -ENOMEM;
4248 	nh->fib6_info = rt;
4249 	err = ip6_convert_metrics(net, rt, r_cfg);
4250 	if (err) {
4251 		kfree(nh);
4252 		return err;
4253 	}
4254 	memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4255 	list_add_tail(&nh->next, rt6_nh_list);
4256 
4257 	return 0;
4258 }
4259 
4260 static void ip6_route_mpath_notify(struct fib6_info *rt,
4261 				   struct fib6_info *rt_last,
4262 				   struct nl_info *info,
4263 				   __u16 nlflags)
4264 {
4265 	/* if this is an APPEND route, then rt points to the first route
4266 	 * inserted and rt_last points to last route inserted. Userspace
4267 	 * wants a consistent dump of the route which starts at the first
4268 	 * nexthop. Since sibling routes are always added at the end of
4269 	 * the list, find the first sibling of the last route appended
4270 	 */
4271 	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4272 		rt = list_first_entry(&rt_last->fib6_siblings,
4273 				      struct fib6_info,
4274 				      fib6_siblings);
4275 	}
4276 
4277 	if (rt)
4278 		inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4279 }
4280 
4281 static int ip6_route_multipath_add(struct fib6_config *cfg,
4282 				   struct netlink_ext_ack *extack)
4283 {
4284 	struct fib6_info *rt_notif = NULL, *rt_last = NULL;
4285 	struct nl_info *info = &cfg->fc_nlinfo;
4286 	struct fib6_config r_cfg;
4287 	struct rtnexthop *rtnh;
4288 	struct fib6_info *rt;
4289 	struct rt6_nh *err_nh;
4290 	struct rt6_nh *nh, *nh_safe;
4291 	__u16 nlflags;
4292 	int remaining;
4293 	int attrlen;
4294 	int err = 1;
4295 	int nhn = 0;
4296 	int replace = (cfg->fc_nlinfo.nlh &&
4297 		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4298 	LIST_HEAD(rt6_nh_list);
4299 
4300 	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4301 	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4302 		nlflags |= NLM_F_APPEND;
4303 
4304 	remaining = cfg->fc_mp_len;
4305 	rtnh = (struct rtnexthop *)cfg->fc_mp;
4306 
4307 	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
4308 	 * fib6_info structs per nexthop
4309 	 */
4310 	while (rtnh_ok(rtnh, remaining)) {
4311 		memcpy(&r_cfg, cfg, sizeof(*cfg));
4312 		if (rtnh->rtnh_ifindex)
4313 			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4314 
4315 		attrlen = rtnh_attrlen(rtnh);
4316 		if (attrlen > 0) {
4317 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4318 
4319 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4320 			if (nla) {
4321 				r_cfg.fc_gateway = nla_get_in6_addr(nla);
4322 				r_cfg.fc_flags |= RTF_GATEWAY;
4323 			}
4324 			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4325 			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4326 			if (nla)
4327 				r_cfg.fc_encap_type = nla_get_u16(nla);
4328 		}
4329 
4330 		r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4331 		rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
4332 		if (IS_ERR(rt)) {
4333 			err = PTR_ERR(rt);
4334 			rt = NULL;
4335 			goto cleanup;
4336 		}
4337 
4338 		rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;
4339 
4340 		err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
4341 					    rt, &r_cfg);
4342 		if (err) {
4343 			fib6_info_release(rt);
4344 			goto cleanup;
4345 		}
4346 
4347 		rtnh = rtnh_next(rtnh, &remaining);
4348 	}
4349 
4350 	/* for add and replace send one notification with all nexthops.
4351 	 * Skip the notification in fib6_add_rt2node and send one with
4352 	 * the full route when done
4353 	 */
4354 	info->skip_notify = 1;
4355 
4356 	err_nh = NULL;
4357 	list_for_each_entry(nh, &rt6_nh_list, next) {
4358 		rt_last = nh->fib6_info;
4359 		err = __ip6_ins_rt(nh->fib6_info, info, extack);
4360 		fib6_info_release(nh->fib6_info);
4361 
4362 		/* save reference to first route for notification */
4363 		if (!rt_notif && !err)
4364 			rt_notif = nh->fib6_info;
4365 
4366 		/* nh->fib6_info is used or freed at this point, reset to NULL*/
4367 		nh->fib6_info = NULL;
4368 		if (err) {
4369 			if (replace && nhn)
4370 				ip6_print_replace_route_err(&rt6_nh_list);
4371 			err_nh = nh;
4372 			goto add_errout;
4373 		}
4374 
4375 		/* Because each route is added like a single route we remove
4376 		 * these flags after the first nexthop: if there is a collision,
4377 		 * we have already failed to add the first nexthop:
4378 		 * fib6_add_rt2node() has rejected it; when replacing, old
4379 		 * nexthops have been replaced by first new, the rest should
4380 		 * be added to it.
4381 		 */
4382 		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4383 						     NLM_F_REPLACE);
4384 		cfg->fc_nlinfo.nlh->nlmsg_flags |= NLM_F_APPEND;
4385 		nhn++;
4386 	}
4387 
4388 	/* success ... tell user about new route */
4389 	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4390 	goto cleanup;
4391 
4392 add_errout:
4393 	/* send notification for routes that were added so that
4394 	 * the delete notifications sent by ip6_route_del are
4395 	 * coherent
4396 	 */
4397 	if (rt_notif)
4398 		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4399 
4400 	/* Delete routes that were already added */
4401 	list_for_each_entry(nh, &rt6_nh_list, next) {
4402 		if (err_nh == nh)
4403 			break;
4404 		ip6_route_del(&nh->r_cfg, extack);
4405 	}
4406 
4407 cleanup:
4408 	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4409 		if (nh->fib6_info)
4410 			fib6_info_release(nh->fib6_info);
4411 		list_del(&nh->next);
4412 		kfree(nh);
4413 	}
4414 
4415 	return err;
4416 }
4417 
4418 static int ip6_route_multipath_del(struct fib6_config *cfg,
4419 				   struct netlink_ext_ack *extack)
4420 {
4421 	struct fib6_config r_cfg;
4422 	struct rtnexthop *rtnh;
4423 	int remaining;
4424 	int attrlen;
4425 	int err = 1, last_err = 0;
4426 
4427 	remaining = cfg->fc_mp_len;
4428 	rtnh = (struct rtnexthop *)cfg->fc_mp;
4429 
4430 	/* Parse a Multipath Entry */
4431 	while (rtnh_ok(rtnh, remaining)) {
4432 		memcpy(&r_cfg, cfg, sizeof(*cfg));
4433 		if (rtnh->rtnh_ifindex)
4434 			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4435 
4436 		attrlen = rtnh_attrlen(rtnh);
4437 		if (attrlen > 0) {
4438 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4439 
4440 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4441 			if (nla) {
4442 				nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4443 				r_cfg.fc_flags |= RTF_GATEWAY;
4444 			}
4445 		}
4446 		err = ip6_route_del(&r_cfg, extack);
4447 		if (err)
4448 			last_err = err;
4449 
4450 		rtnh = rtnh_next(rtnh, &remaining);
4451 	}
4452 
4453 	return last_err;
4454 }
4455 
4456 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4457 			      struct netlink_ext_ack *extack)
4458 {
4459 	struct fib6_config cfg;
4460 	int err;
4461 
4462 	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4463 	if (err < 0)
4464 		return err;
4465 
4466 	if (cfg.fc_mp)
4467 		return ip6_route_multipath_del(&cfg, extack);
4468 	else {
4469 		cfg.fc_delete_all_nh = 1;
4470 		return ip6_route_del(&cfg, extack);
4471 	}
4472 }
4473 
4474 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4475 			      struct netlink_ext_ack *extack)
4476 {
4477 	struct fib6_config cfg;
4478 	int err;
4479 
4480 	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4481 	if (err < 0)
4482 		return err;
4483 
4484 	if (cfg.fc_mp)
4485 		return ip6_route_multipath_add(&cfg, extack);
4486 	else
4487 		return ip6_route_add(&cfg, GFP_KERNEL, extack);
4488 }
4489 
4490 static size_t rt6_nlmsg_size(struct fib6_info *rt)
4491 {
4492 	int nexthop_len = 0;
4493 
4494 	if (rt->fib6_nsiblings) {
4495 		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
4496 			    + NLA_ALIGN(sizeof(struct rtnexthop))
4497 			    + nla_total_size(16) /* RTA_GATEWAY */
4498 			    + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);
4499 
4500 		nexthop_len *= rt->fib6_nsiblings;
4501 	}
4502 
4503 	return NLMSG_ALIGN(sizeof(struct rtmsg))
4504 	       + nla_total_size(16) /* RTA_SRC */
4505 	       + nla_total_size(16) /* RTA_DST */
4506 	       + nla_total_size(16) /* RTA_GATEWAY */
4507 	       + nla_total_size(16) /* RTA_PREFSRC */
4508 	       + nla_total_size(4) /* RTA_TABLE */
4509 	       + nla_total_size(4) /* RTA_IIF */
4510 	       + nla_total_size(4) /* RTA_OIF */
4511 	       + nla_total_size(4) /* RTA_PRIORITY */
4512 	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4513 	       + nla_total_size(sizeof(struct rta_cacheinfo))
4514 	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4515 	       + nla_total_size(1) /* RTA_PREF */
4516 	       + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
4517 	       + nexthop_len;
4518 }
4519 
4520 static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
4521 			    unsigned int *flags, bool skip_oif)
4522 {
4523 	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
4524 		*flags |= RTNH_F_DEAD;
4525 
4526 	if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
4527 		*flags |= RTNH_F_LINKDOWN;
4528 
4529 		rcu_read_lock();
4530 		if (fib6_ignore_linkdown(rt))
4531 			*flags |= RTNH_F_DEAD;
4532 		rcu_read_unlock();
4533 	}
4534 
4535 	if (rt->fib6_flags & RTF_GATEWAY) {
4536 		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
4537 			goto nla_put_failure;
4538 	}
4539 
4540 	*flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
4541 	if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
4542 		*flags |= RTNH_F_OFFLOAD;
4543 
4544 	/* not needed for multipath encoding b/c it has a rtnexthop struct */
4545 	if (!skip_oif && rt->fib6_nh.nh_dev &&
4546 	    nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
4547 		goto nla_put_failure;
4548 
4549 	if (rt->fib6_nh.nh_lwtstate &&
4550 	    lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
4551 		goto nla_put_failure;
4552 
4553 	return 0;
4554 
4555 nla_put_failure:
4556 	return -EMSGSIZE;
4557 }
4558 
4559 /* add multipath next hop */
4560 static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
4561 {
4562 	const struct net_device *dev = rt->fib6_nh.nh_dev;
4563 	struct rtnexthop *rtnh;
4564 	unsigned int flags = 0;
4565 
4566 	rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4567 	if (!rtnh)
4568 		goto nla_put_failure;
4569 
4570 	rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
4571 	rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;
4572 
4573 	if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4574 		goto nla_put_failure;
4575 
4576 	rtnh->rtnh_flags = flags;
4577 
4578 	/* length of rtnetlink header + attributes */
4579 	rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
4580 
4581 	return 0;
4582 
4583 nla_put_failure:
4584 	return -EMSGSIZE;
4585 }
4586 
4587 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
4588 			 struct fib6_info *rt, struct dst_entry *dst,
4589 			 struct in6_addr *dest, struct in6_addr *src,
4590 			 int iif, int type, u32 portid, u32 seq,
4591 			 unsigned int flags)
4592 {
4593 	struct rtmsg *rtm;
4594 	struct nlmsghdr *nlh;
4595 	long expires = 0;
4596 	u32 *pmetrics;
4597 	u32 table;
4598 
4599 	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4600 	if (!nlh)
4601 		return -EMSGSIZE;
4602 
4603 	rtm = nlmsg_data(nlh);
4604 	rtm->rtm_family = AF_INET6;
4605 	rtm->rtm_dst_len = rt->fib6_dst.plen;
4606 	rtm->rtm_src_len = rt->fib6_src.plen;
4607 	rtm->rtm_tos = 0;
4608 	if (rt->fib6_table)
4609 		table = rt->fib6_table->tb6_id;
4610 	else
4611 		table = RT6_TABLE_UNSPEC;
4612 	rtm->rtm_table = table;
4613 	if (nla_put_u32(skb, RTA_TABLE, table))
4614 		goto nla_put_failure;
4615 
4616 	rtm->rtm_type = rt->fib6_type;
4617 	rtm->rtm_flags = 0;
4618 	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4619 	rtm->rtm_protocol = rt->fib6_protocol;
4620 
4621 	if (rt->fib6_flags & RTF_CACHE)
4622 		rtm->rtm_flags |= RTM_F_CLONED;
4623 
4624 	if (dest) {
4625 		if (nla_put_in6_addr(skb, RTA_DST, dest))
4626 			goto nla_put_failure;
4627 		rtm->rtm_dst_len = 128;
4628 	} else if (rtm->rtm_dst_len)
4629 		if (nla_put_in6_addr(skb, RTA_DST, &rt->fib6_dst.addr))
4630 			goto nla_put_failure;
4631 #ifdef CONFIG_IPV6_SUBTREES
4632 	if (src) {
4633 		if (nla_put_in6_addr(skb, RTA_SRC, src))
4634 			goto nla_put_failure;
4635 		rtm->rtm_src_len = 128;
4636 	} else if (rtm->rtm_src_len &&
4637 		   nla_put_in6_addr(skb, RTA_SRC, &rt->fib6_src.addr))
4638 		goto nla_put_failure;
4639 #endif
4640 	if (iif) {
4641 #ifdef CONFIG_IPV6_MROUTE
4642 		if (ipv6_addr_is_multicast(&rt->fib6_dst.addr)) {
4643 			int err = ip6mr_get_route(net, skb, rtm, portid);
4644 
4645 			if (err == 0)
4646 				return 0;
4647 			if (err < 0)
4648 				goto nla_put_failure;
4649 		} else
4650 #endif
4651 			if (nla_put_u32(skb, RTA_IIF, iif))
4652 				goto nla_put_failure;
4653 	} else if (dest) {
4654 		struct in6_addr saddr_buf;
4655 		if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
4656 		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4657 			goto nla_put_failure;
4658 	}
4659 
4660 	if (rt->fib6_prefsrc.plen) {
4661 		struct in6_addr saddr_buf;
4662 		saddr_buf = rt->fib6_prefsrc.addr;
4663 		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4664 			goto nla_put_failure;
4665 	}
4666 
4667 	pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
4668 	if (rtnetlink_put_metrics(skb, pmetrics) < 0)
4669 		goto nla_put_failure;
4670 
4671 	if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
4672 		goto nla_put_failure;
4673 
4674 	/* For multipath routes, walk the siblings list and add
4675 	 * each as a nexthop within RTA_MULTIPATH.
4676 	 */
4677 	if (rt->fib6_nsiblings) {
4678 		struct fib6_info *sibling, *next_sibling;
4679 		struct nlattr *mp;
4680 
4681 		mp = nla_nest_start(skb, RTA_MULTIPATH);
4682 		if (!mp)
4683 			goto nla_put_failure;
4684 
4685 		if (rt6_add_nexthop(skb, rt) < 0)
4686 			goto nla_put_failure;
4687 
4688 		list_for_each_entry_safe(sibling, next_sibling,
4689 					 &rt->fib6_siblings, fib6_siblings) {
4690 			if (rt6_add_nexthop(skb, sibling) < 0)
4691 				goto nla_put_failure;
4692 		}
4693 
4694 		nla_nest_end(skb, mp);
4695 	} else {
4696 		if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
4697 			goto nla_put_failure;
4698 	}
4699 
4700 	if (rt->fib6_flags & RTF_EXPIRES) {
4701 		expires = dst ? dst->expires : rt->expires;
4702 		expires -= jiffies;
4703 	}
4704 
4705 	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
4706 		goto nla_put_failure;
4707 
4708 	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->fib6_flags)))
4709 		goto nla_put_failure;
4710 
4711 
4712 	nlmsg_end(skb, nlh);
4713 	return 0;
4714 
4715 nla_put_failure:
4716 	nlmsg_cancel(skb, nlh);
4717 	return -EMSGSIZE;
4718 }
4719 
4720 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4721 {
4722 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4723 	struct net *net = arg->net;
4724 
4725 	if (rt == net->ipv6.fib6_null_entry)
4726 		return 0;
4727 
4728 	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4729 		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4730 
4731 		/* user wants prefix routes only */
4732 		if (rtm->rtm_flags & RTM_F_PREFIX &&
4733 		    !(rt->fib6_flags & RTF_PREFIX_RT)) {
4734 			/* success since this is not a prefix route */
4735 			return 1;
4736 		}
4737 	}
4738 
4739 	return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4740 			     RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4741 			     arg->cb->nlh->nlmsg_seq, NLM_F_MULTI);
4742 }
4743 
4744 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4745 			      struct netlink_ext_ack *extack)
4746 {
4747 	struct net *net = sock_net(in_skb->sk);
4748 	struct nlattr *tb[RTA_MAX+1];
4749 	int err, iif = 0, oif = 0;
4750 	struct fib6_info *from;
4751 	struct dst_entry *dst;
4752 	struct rt6_info *rt;
4753 	struct sk_buff *skb;
4754 	struct rtmsg *rtm;
4755 	struct flowi6 fl6;
4756 	bool fibmatch;
4757 
4758 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4759 			  extack);
4760 	if (err < 0)
4761 		goto errout;
4762 
4763 	err = -EINVAL;
4764 	memset(&fl6, 0, sizeof(fl6));
4765 	rtm = nlmsg_data(nlh);
4766 	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4767 	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4768 
4769 	if (tb[RTA_SRC]) {
4770 		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4771 			goto errout;
4772 
4773 		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4774 	}
4775 
4776 	if (tb[RTA_DST]) {
4777 		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4778 			goto errout;
4779 
4780 		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4781 	}
4782 
4783 	if (tb[RTA_IIF])
4784 		iif = nla_get_u32(tb[RTA_IIF]);
4785 
4786 	if (tb[RTA_OIF])
4787 		oif = nla_get_u32(tb[RTA_OIF]);
4788 
4789 	if (tb[RTA_MARK])
4790 		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4791 
4792 	if (tb[RTA_UID])
4793 		fl6.flowi6_uid = make_kuid(current_user_ns(),
4794 					   nla_get_u32(tb[RTA_UID]));
4795 	else
4796 		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4797 
4798 	if (iif) {
4799 		struct net_device *dev;
4800 		int flags = 0;
4801 
4802 		rcu_read_lock();
4803 
4804 		dev = dev_get_by_index_rcu(net, iif);
4805 		if (!dev) {
4806 			rcu_read_unlock();
4807 			err = -ENODEV;
4808 			goto errout;
4809 		}
4810 
4811 		fl6.flowi6_iif = iif;
4812 
4813 		if (!ipv6_addr_any(&fl6.saddr))
4814 			flags |= RT6_LOOKUP_F_HAS_SADDR;
4815 
4816 		dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
4817 
4818 		rcu_read_unlock();
4819 	} else {
4820 		fl6.flowi6_oif = oif;
4821 
4822 		dst = ip6_route_output(net, NULL, &fl6);
4823 	}
4824 
4825 
4826 	rt = container_of(dst, struct rt6_info, dst);
4827 	if (rt->dst.error) {
4828 		err = rt->dst.error;
4829 		ip6_rt_put(rt);
4830 		goto errout;
4831 	}
4832 
4833 	if (rt == net->ipv6.ip6_null_entry) {
4834 		err = rt->dst.error;
4835 		ip6_rt_put(rt);
4836 		goto errout;
4837 	}
4838 
4839 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
4840 	if (!skb) {
4841 		ip6_rt_put(rt);
4842 		err = -ENOBUFS;
4843 		goto errout;
4844 	}
4845 
4846 	skb_dst_set(skb, &rt->dst);
4847 
4848 	rcu_read_lock();
4849 	from = rcu_dereference(rt->from);
4850 
4851 	if (fibmatch)
4852 		err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
4853 				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4854 				    nlh->nlmsg_seq, 0);
4855 	else
4856 		err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
4857 				    &fl6.saddr, iif, RTM_NEWROUTE,
4858 				    NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
4859 				    0);
4860 	rcu_read_unlock();
4861 
4862 	if (err < 0) {
4863 		kfree_skb(skb);
4864 		goto errout;
4865 	}
4866 
4867 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
4868 errout:
4869 	return err;
4870 }
4871 
4872 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
4873 		     unsigned int nlm_flags)
4874 {
4875 	struct sk_buff *skb;
4876 	struct net *net = info->nl_net;
4877 	u32 seq;
4878 	int err;
4879 
4880 	err = -ENOBUFS;
4881 	seq = info->nlh ? info->nlh->nlmsg_seq : 0;
4882 
4883 	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4884 	if (!skb)
4885 		goto errout;
4886 
4887 	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
4888 			    event, info->portid, seq, nlm_flags);
4889 	if (err < 0) {
4890 		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4891 		WARN_ON(err == -EMSGSIZE);
4892 		kfree_skb(skb);
4893 		goto errout;
4894 	}
4895 	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4896 		    info->nlh, gfp_any());
4897 	return;
4898 errout:
4899 	if (err < 0)
4900 		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
4901 }
4902 
4903 static int ip6_route_dev_notify(struct notifier_block *this,
4904 				unsigned long event, void *ptr)
4905 {
4906 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4907 	struct net *net = dev_net(dev);
4908 
4909 	if (!(dev->flags & IFF_LOOPBACK))
4910 		return NOTIFY_OK;
4911 
4912 	if (event == NETDEV_REGISTER) {
4913 		net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
4914 		net->ipv6.ip6_null_entry->dst.dev = dev;
4915 		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
4916 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4917 		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
4918 		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
4919 		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
4920 		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
4921 #endif
4922 	 } else if (event == NETDEV_UNREGISTER &&
4923 		    dev->reg_state != NETREG_UNREGISTERED) {
4924 		/* NETDEV_UNREGISTER could be fired for multiple times by
4925 		 * netdev_wait_allrefs(). Make sure we only call this once.
4926 		 */
4927 		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
4928 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4929 		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
4930 		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
4931 #endif
4932 	}
4933 
4934 	return NOTIFY_OK;
4935 }
4936 
4937 /*
4938  *	/proc
4939  */
4940 
4941 #ifdef CONFIG_PROC_FS
4942 
4943 static const struct file_operations ipv6_route_proc_fops = {
4944 	.open		= ipv6_route_open,
4945 	.read		= seq_read,
4946 	.llseek		= seq_lseek,
4947 	.release	= seq_release_net,
4948 };
4949 
4950 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
4951 {
4952 	struct net *net = (struct net *)seq->private;
4953 	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
4954 		   net->ipv6.rt6_stats->fib_nodes,
4955 		   net->ipv6.rt6_stats->fib_route_nodes,
4956 		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
4957 		   net->ipv6.rt6_stats->fib_rt_entries,
4958 		   net->ipv6.rt6_stats->fib_rt_cache,
4959 		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
4960 		   net->ipv6.rt6_stats->fib_discarded_routes);
4961 
4962 	return 0;
4963 }
4964 
4965 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
4966 {
4967 	return single_open_net(inode, file, rt6_stats_seq_show);
4968 }
4969 
4970 static const struct file_operations rt6_stats_seq_fops = {
4971 	.open	 = rt6_stats_seq_open,
4972 	.read	 = seq_read,
4973 	.llseek	 = seq_lseek,
4974 	.release = single_release_net,
4975 };
4976 #endif	/* CONFIG_PROC_FS */
4977 
4978 #ifdef CONFIG_SYSCTL
4979 
4980 static
4981 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
4982 			      void __user *buffer, size_t *lenp, loff_t *ppos)
4983 {
4984 	struct net *net;
4985 	int delay;
4986 	if (!write)
4987 		return -EINVAL;
4988 
4989 	net = (struct net *)ctl->extra1;
4990 	delay = net->ipv6.sysctl.flush_delay;
4991 	proc_dointvec(ctl, write, buffer, lenp, ppos);
4992 	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
4993 	return 0;
4994 }
4995 
4996 struct ctl_table ipv6_route_table_template[] = {
4997 	{
4998 		.procname	=	"flush",
4999 		.data		=	&init_net.ipv6.sysctl.flush_delay,
5000 		.maxlen		=	sizeof(int),
5001 		.mode		=	0200,
5002 		.proc_handler	=	ipv6_sysctl_rtcache_flush
5003 	},
5004 	{
5005 		.procname	=	"gc_thresh",
5006 		.data		=	&ip6_dst_ops_template.gc_thresh,
5007 		.maxlen		=	sizeof(int),
5008 		.mode		=	0644,
5009 		.proc_handler	=	proc_dointvec,
5010 	},
5011 	{
5012 		.procname	=	"max_size",
5013 		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
5014 		.maxlen		=	sizeof(int),
5015 		.mode		=	0644,
5016 		.proc_handler	=	proc_dointvec,
5017 	},
5018 	{
5019 		.procname	=	"gc_min_interval",
5020 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5021 		.maxlen		=	sizeof(int),
5022 		.mode		=	0644,
5023 		.proc_handler	=	proc_dointvec_jiffies,
5024 	},
5025 	{
5026 		.procname	=	"gc_timeout",
5027 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
5028 		.maxlen		=	sizeof(int),
5029 		.mode		=	0644,
5030 		.proc_handler	=	proc_dointvec_jiffies,
5031 	},
5032 	{
5033 		.procname	=	"gc_interval",
5034 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
5035 		.maxlen		=	sizeof(int),
5036 		.mode		=	0644,
5037 		.proc_handler	=	proc_dointvec_jiffies,
5038 	},
5039 	{
5040 		.procname	=	"gc_elasticity",
5041 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
5042 		.maxlen		=	sizeof(int),
5043 		.mode		=	0644,
5044 		.proc_handler	=	proc_dointvec,
5045 	},
5046 	{
5047 		.procname	=	"mtu_expires",
5048 		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
5049 		.maxlen		=	sizeof(int),
5050 		.mode		=	0644,
5051 		.proc_handler	=	proc_dointvec_jiffies,
5052 	},
5053 	{
5054 		.procname	=	"min_adv_mss",
5055 		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
5056 		.maxlen		=	sizeof(int),
5057 		.mode		=	0644,
5058 		.proc_handler	=	proc_dointvec,
5059 	},
5060 	{
5061 		.procname	=	"gc_min_interval_ms",
5062 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5063 		.maxlen		=	sizeof(int),
5064 		.mode		=	0644,
5065 		.proc_handler	=	proc_dointvec_ms_jiffies,
5066 	},
5067 	{ }
5068 };
5069 
5070 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
5071 {
5072 	struct ctl_table *table;
5073 
5074 	table = kmemdup(ipv6_route_table_template,
5075 			sizeof(ipv6_route_table_template),
5076 			GFP_KERNEL);
5077 
5078 	if (table) {
5079 		table[0].data = &net->ipv6.sysctl.flush_delay;
5080 		table[0].extra1 = net;
5081 		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
5082 		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
5083 		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5084 		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
5085 		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
5086 		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
5087 		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
5088 		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
5089 		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5090 
5091 		/* Don't export sysctls to unprivileged users */
5092 		if (net->user_ns != &init_user_ns)
5093 			table[0].procname = NULL;
5094 	}
5095 
5096 	return table;
5097 }
5098 #endif
5099 
5100 static int __net_init ip6_route_net_init(struct net *net)
5101 {
5102 	int ret = -ENOMEM;
5103 
5104 	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
5105 	       sizeof(net->ipv6.ip6_dst_ops));
5106 
5107 	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
5108 		goto out_ip6_dst_ops;
5109 
5110 	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
5111 					    sizeof(*net->ipv6.fib6_null_entry),
5112 					    GFP_KERNEL);
5113 	if (!net->ipv6.fib6_null_entry)
5114 		goto out_ip6_dst_entries;
5115 
5116 	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
5117 					   sizeof(*net->ipv6.ip6_null_entry),
5118 					   GFP_KERNEL);
5119 	if (!net->ipv6.ip6_null_entry)
5120 		goto out_fib6_null_entry;
5121 	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5122 	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
5123 			 ip6_template_metrics, true);
5124 
5125 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5126 	net->ipv6.fib6_has_custom_rules = false;
5127 	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
5128 					       sizeof(*net->ipv6.ip6_prohibit_entry),
5129 					       GFP_KERNEL);
5130 	if (!net->ipv6.ip6_prohibit_entry)
5131 		goto out_ip6_null_entry;
5132 	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5133 	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
5134 			 ip6_template_metrics, true);
5135 
5136 	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
5137 					       sizeof(*net->ipv6.ip6_blk_hole_entry),
5138 					       GFP_KERNEL);
5139 	if (!net->ipv6.ip6_blk_hole_entry)
5140 		goto out_ip6_prohibit_entry;
5141 	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5142 	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
5143 			 ip6_template_metrics, true);
5144 #endif
5145 
5146 	net->ipv6.sysctl.flush_delay = 0;
5147 	net->ipv6.sysctl.ip6_rt_max_size = 4096;
5148 	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
5149 	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
5150 	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
5151 	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
5152 	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
5153 	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
5154 
5155 	net->ipv6.ip6_rt_gc_expire = 30*HZ;
5156 
5157 	ret = 0;
5158 out:
5159 	return ret;
5160 
5161 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5162 out_ip6_prohibit_entry:
5163 	kfree(net->ipv6.ip6_prohibit_entry);
5164 out_ip6_null_entry:
5165 	kfree(net->ipv6.ip6_null_entry);
5166 #endif
5167 out_fib6_null_entry:
5168 	kfree(net->ipv6.fib6_null_entry);
5169 out_ip6_dst_entries:
5170 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5171 out_ip6_dst_ops:
5172 	goto out;
5173 }
5174 
5175 static void __net_exit ip6_route_net_exit(struct net *net)
5176 {
5177 	kfree(net->ipv6.fib6_null_entry);
5178 	kfree(net->ipv6.ip6_null_entry);
5179 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5180 	kfree(net->ipv6.ip6_prohibit_entry);
5181 	kfree(net->ipv6.ip6_blk_hole_entry);
5182 #endif
5183 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5184 }
5185 
5186 static int __net_init ip6_route_net_init_late(struct net *net)
5187 {
5188 #ifdef CONFIG_PROC_FS
5189 	proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
5190 	proc_create("rt6_stats", 0444, net->proc_net, &rt6_stats_seq_fops);
5191 #endif
5192 	return 0;
5193 }
5194 
5195 static void __net_exit ip6_route_net_exit_late(struct net *net)
5196 {
5197 #ifdef CONFIG_PROC_FS
5198 	remove_proc_entry("ipv6_route", net->proc_net);
5199 	remove_proc_entry("rt6_stats", net->proc_net);
5200 #endif
5201 }
5202 
5203 static struct pernet_operations ip6_route_net_ops = {
5204 	.init = ip6_route_net_init,
5205 	.exit = ip6_route_net_exit,
5206 };
5207 
5208 static int __net_init ipv6_inetpeer_init(struct net *net)
5209 {
5210 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5211 
5212 	if (!bp)
5213 		return -ENOMEM;
5214 	inet_peer_base_init(bp);
5215 	net->ipv6.peers = bp;
5216 	return 0;
5217 }
5218 
5219 static void __net_exit ipv6_inetpeer_exit(struct net *net)
5220 {
5221 	struct inet_peer_base *bp = net->ipv6.peers;
5222 
5223 	net->ipv6.peers = NULL;
5224 	inetpeer_invalidate_tree(bp);
5225 	kfree(bp);
5226 }
5227 
5228 static struct pernet_operations ipv6_inetpeer_ops = {
5229 	.init	=	ipv6_inetpeer_init,
5230 	.exit	=	ipv6_inetpeer_exit,
5231 };
5232 
5233 static struct pernet_operations ip6_route_net_late_ops = {
5234 	.init = ip6_route_net_init_late,
5235 	.exit = ip6_route_net_exit_late,
5236 };
5237 
5238 static struct notifier_block ip6_route_dev_notifier = {
5239 	.notifier_call = ip6_route_dev_notify,
5240 	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
5241 };
5242 
5243 void __init ip6_route_init_special_entries(void)
5244 {
5245 	/* Registering of the loopback is done before this portion of code,
5246 	 * the loopback reference in rt6_info will not be taken, do it
5247 	 * manually for init_net */
5248 	init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
5249 	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
5250 	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5251   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5252 	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
5253 	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5254 	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
5255 	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5256   #endif
5257 }
5258 
5259 int __init ip6_route_init(void)
5260 {
5261 	int ret;
5262 	int cpu;
5263 
5264 	ret = -ENOMEM;
5265 	ip6_dst_ops_template.kmem_cachep =
5266 		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
5267 				  SLAB_HWCACHE_ALIGN, NULL);
5268 	if (!ip6_dst_ops_template.kmem_cachep)
5269 		goto out;
5270 
5271 	ret = dst_entries_init(&ip6_dst_blackhole_ops);
5272 	if (ret)
5273 		goto out_kmem_cache;
5274 
5275 	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
5276 	if (ret)
5277 		goto out_dst_entries;
5278 
5279 	ret = register_pernet_subsys(&ip6_route_net_ops);
5280 	if (ret)
5281 		goto out_register_inetpeer;
5282 
5283 	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
5284 
5285 	ret = fib6_init();
5286 	if (ret)
5287 		goto out_register_subsys;
5288 
5289 	ret = xfrm6_init();
5290 	if (ret)
5291 		goto out_fib6_init;
5292 
5293 	ret = fib6_rules_init();
5294 	if (ret)
5295 		goto xfrm6_init;
5296 
5297 	ret = register_pernet_subsys(&ip6_route_net_late_ops);
5298 	if (ret)
5299 		goto fib6_rules_init;
5300 
5301 	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
5302 				   inet6_rtm_newroute, NULL, 0);
5303 	if (ret < 0)
5304 		goto out_register_late_subsys;
5305 
5306 	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
5307 				   inet6_rtm_delroute, NULL, 0);
5308 	if (ret < 0)
5309 		goto out_register_late_subsys;
5310 
5311 	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
5312 				   inet6_rtm_getroute, NULL,
5313 				   RTNL_FLAG_DOIT_UNLOCKED);
5314 	if (ret < 0)
5315 		goto out_register_late_subsys;
5316 
5317 	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
5318 	if (ret)
5319 		goto out_register_late_subsys;
5320 
5321 	for_each_possible_cpu(cpu) {
5322 		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
5323 
5324 		INIT_LIST_HEAD(&ul->head);
5325 		spin_lock_init(&ul->lock);
5326 	}
5327 
5328 out:
5329 	return ret;
5330 
5331 out_register_late_subsys:
5332 	rtnl_unregister_all(PF_INET6);
5333 	unregister_pernet_subsys(&ip6_route_net_late_ops);
5334 fib6_rules_init:
5335 	fib6_rules_cleanup();
5336 xfrm6_init:
5337 	xfrm6_fini();
5338 out_fib6_init:
5339 	fib6_gc_cleanup();
5340 out_register_subsys:
5341 	unregister_pernet_subsys(&ip6_route_net_ops);
5342 out_register_inetpeer:
5343 	unregister_pernet_subsys(&ipv6_inetpeer_ops);
5344 out_dst_entries:
5345 	dst_entries_destroy(&ip6_dst_blackhole_ops);
5346 out_kmem_cache:
5347 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5348 	goto out;
5349 }
5350 
5351 void ip6_route_cleanup(void)
5352 {
5353 	unregister_netdevice_notifier(&ip6_route_dev_notifier);
5354 	unregister_pernet_subsys(&ip6_route_net_late_ops);
5355 	fib6_rules_cleanup();
5356 	xfrm6_fini();
5357 	fib6_gc_cleanup();
5358 	unregister_pernet_subsys(&ipv6_inetpeer_ops);
5359 	unregister_pernet_subsys(&ip6_route_net_ops);
5360 	dst_entries_destroy(&ip6_dst_blackhole_ops);
5361 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5362 }
5363