xref: /openbmc/linux/net/ipv6/route.c (revision 534420c6)
1 /*
2  *	Linux INET6 implementation
3  *	FIB front-end.
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13 
14 /*	Changes:
15  *
16  *	YOSHIFUJI Hideaki @USAGI
17  *		reworked default router selection.
18  *		- respect outgoing interface
19  *		- select from (probably) reachable routers (i.e.
20  *		routers in REACHABLE, STALE, DELAY or PROBE states).
21  *		- always select the same router if it is (probably)
22  *		reachable.  otherwise, round-robin the list.
23  *	Ville Nuorvala
24  *		Fixed routing subtrees.
25  */
26 
27 #define pr_fmt(fmt) "IPv6: " fmt
28 
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <net/ip.h>
67 #include <linux/uaccess.h>
68 
69 #ifdef CONFIG_SYSCTL
70 #include <linux/sysctl.h>
71 #endif
72 
73 static int ip6_rt_type_to_error(u8 fib6_type);
74 
75 #define CREATE_TRACE_POINTS
76 #include <trace/events/fib6.h>
77 EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
78 #undef CREATE_TRACE_POINTS
79 
80 enum rt6_nud_state {
81 	RT6_NUD_FAIL_HARD = -3,
82 	RT6_NUD_FAIL_PROBE = -2,
83 	RT6_NUD_FAIL_DO_RR = -1,
84 	RT6_NUD_SUCCEED = 1
85 };
86 
87 static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
88 static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
89 static unsigned int	 ip6_mtu(const struct dst_entry *dst);
90 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
91 static void		ip6_dst_destroy(struct dst_entry *);
92 static void		ip6_dst_ifdown(struct dst_entry *,
93 				       struct net_device *dev, int how);
94 static int		 ip6_dst_gc(struct dst_ops *ops);
95 
96 static int		ip6_pkt_discard(struct sk_buff *skb);
97 static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
98 static int		ip6_pkt_prohibit(struct sk_buff *skb);
99 static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
100 static void		ip6_link_failure(struct sk_buff *skb);
101 static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
102 					   struct sk_buff *skb, u32 mtu);
103 static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
104 					struct sk_buff *skb);
105 static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
106 static size_t rt6_nlmsg_size(struct fib6_info *rt);
107 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
108 			 struct fib6_info *rt, struct dst_entry *dst,
109 			 struct in6_addr *dest, struct in6_addr *src,
110 			 int iif, int type, u32 portid, u32 seq,
111 			 unsigned int flags);
112 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
113 					   struct in6_addr *daddr,
114 					   struct in6_addr *saddr);
115 
116 #ifdef CONFIG_IPV6_ROUTE_INFO
117 static struct fib6_info *rt6_add_route_info(struct net *net,
118 					   const struct in6_addr *prefix, int prefixlen,
119 					   const struct in6_addr *gwaddr,
120 					   struct net_device *dev,
121 					   unsigned int pref);
122 static struct fib6_info *rt6_get_route_info(struct net *net,
123 					   const struct in6_addr *prefix, int prefixlen,
124 					   const struct in6_addr *gwaddr,
125 					   struct net_device *dev);
126 #endif
127 
128 struct uncached_list {
129 	spinlock_t		lock;
130 	struct list_head	head;
131 };
132 
133 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
134 
135 void rt6_uncached_list_add(struct rt6_info *rt)
136 {
137 	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
138 
139 	rt->rt6i_uncached_list = ul;
140 
141 	spin_lock_bh(&ul->lock);
142 	list_add_tail(&rt->rt6i_uncached, &ul->head);
143 	spin_unlock_bh(&ul->lock);
144 }
145 
146 void rt6_uncached_list_del(struct rt6_info *rt)
147 {
148 	if (!list_empty(&rt->rt6i_uncached)) {
149 		struct uncached_list *ul = rt->rt6i_uncached_list;
150 		struct net *net = dev_net(rt->dst.dev);
151 
152 		spin_lock_bh(&ul->lock);
153 		list_del(&rt->rt6i_uncached);
154 		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
155 		spin_unlock_bh(&ul->lock);
156 	}
157 }
158 
159 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
160 {
161 	struct net_device *loopback_dev = net->loopback_dev;
162 	int cpu;
163 
164 	if (dev == loopback_dev)
165 		return;
166 
167 	for_each_possible_cpu(cpu) {
168 		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
169 		struct rt6_info *rt;
170 
171 		spin_lock_bh(&ul->lock);
172 		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
173 			struct inet6_dev *rt_idev = rt->rt6i_idev;
174 			struct net_device *rt_dev = rt->dst.dev;
175 
176 			if (rt_idev->dev == dev) {
177 				rt->rt6i_idev = in6_dev_get(loopback_dev);
178 				in6_dev_put(rt_idev);
179 			}
180 
181 			if (rt_dev == dev) {
182 				rt->dst.dev = loopback_dev;
183 				dev_hold(rt->dst.dev);
184 				dev_put(rt_dev);
185 			}
186 		}
187 		spin_unlock_bh(&ul->lock);
188 	}
189 }
190 
191 static inline const void *choose_neigh_daddr(const struct in6_addr *p,
192 					     struct sk_buff *skb,
193 					     const void *daddr)
194 {
195 	if (!ipv6_addr_any(p))
196 		return (const void *) p;
197 	else if (skb)
198 		return &ipv6_hdr(skb)->daddr;
199 	return daddr;
200 }
201 
202 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
203 				   struct net_device *dev,
204 				   struct sk_buff *skb,
205 				   const void *daddr)
206 {
207 	struct neighbour *n;
208 
209 	daddr = choose_neigh_daddr(gw, skb, daddr);
210 	n = __ipv6_neigh_lookup(dev, daddr);
211 	if (n)
212 		return n;
213 
214 	n = neigh_create(&nd_tbl, daddr, dev);
215 	return IS_ERR(n) ? NULL : n;
216 }
217 
218 static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
219 					      struct sk_buff *skb,
220 					      const void *daddr)
221 {
222 	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);
223 
224 	return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
225 }
226 
227 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
228 {
229 	struct net_device *dev = dst->dev;
230 	struct rt6_info *rt = (struct rt6_info *)dst;
231 
232 	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
233 	if (!daddr)
234 		return;
235 	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
236 		return;
237 	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
238 		return;
239 	__ipv6_confirm_neigh(dev, daddr);
240 }
241 
242 static struct dst_ops ip6_dst_ops_template = {
243 	.family			=	AF_INET6,
244 	.gc			=	ip6_dst_gc,
245 	.gc_thresh		=	1024,
246 	.check			=	ip6_dst_check,
247 	.default_advmss		=	ip6_default_advmss,
248 	.mtu			=	ip6_mtu,
249 	.cow_metrics		=	dst_cow_metrics_generic,
250 	.destroy		=	ip6_dst_destroy,
251 	.ifdown			=	ip6_dst_ifdown,
252 	.negative_advice	=	ip6_negative_advice,
253 	.link_failure		=	ip6_link_failure,
254 	.update_pmtu		=	ip6_rt_update_pmtu,
255 	.redirect		=	rt6_do_redirect,
256 	.local_out		=	__ip6_local_out,
257 	.neigh_lookup		=	ip6_dst_neigh_lookup,
258 	.confirm_neigh		=	ip6_confirm_neigh,
259 };
260 
261 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
262 {
263 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
264 
265 	return mtu ? : dst->dev->mtu;
266 }
267 
268 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
269 					 struct sk_buff *skb, u32 mtu)
270 {
271 }
272 
273 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
274 				      struct sk_buff *skb)
275 {
276 }
277 
278 static struct dst_ops ip6_dst_blackhole_ops = {
279 	.family			=	AF_INET6,
280 	.destroy		=	ip6_dst_destroy,
281 	.check			=	ip6_dst_check,
282 	.mtu			=	ip6_blackhole_mtu,
283 	.default_advmss		=	ip6_default_advmss,
284 	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
285 	.redirect		=	ip6_rt_blackhole_redirect,
286 	.cow_metrics		=	dst_cow_metrics_generic,
287 	.neigh_lookup		=	ip6_dst_neigh_lookup,
288 };
289 
290 static const u32 ip6_template_metrics[RTAX_MAX] = {
291 	[RTAX_HOPLIMIT - 1] = 0,
292 };
293 
294 static const struct fib6_info fib6_null_entry_template = {
295 	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
296 	.fib6_protocol  = RTPROT_KERNEL,
297 	.fib6_metric	= ~(u32)0,
298 	.fib6_ref	= ATOMIC_INIT(1),
299 	.fib6_type	= RTN_UNREACHABLE,
300 	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
301 };
302 
303 static const struct rt6_info ip6_null_entry_template = {
304 	.dst = {
305 		.__refcnt	= ATOMIC_INIT(1),
306 		.__use		= 1,
307 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
308 		.error		= -ENETUNREACH,
309 		.input		= ip6_pkt_discard,
310 		.output		= ip6_pkt_discard_out,
311 	},
312 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
313 };
314 
315 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
316 
317 static const struct rt6_info ip6_prohibit_entry_template = {
318 	.dst = {
319 		.__refcnt	= ATOMIC_INIT(1),
320 		.__use		= 1,
321 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
322 		.error		= -EACCES,
323 		.input		= ip6_pkt_prohibit,
324 		.output		= ip6_pkt_prohibit_out,
325 	},
326 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
327 };
328 
329 static const struct rt6_info ip6_blk_hole_entry_template = {
330 	.dst = {
331 		.__refcnt	= ATOMIC_INIT(1),
332 		.__use		= 1,
333 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
334 		.error		= -EINVAL,
335 		.input		= dst_discard,
336 		.output		= dst_discard_out,
337 	},
338 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
339 };
340 
341 #endif
342 
343 static void rt6_info_init(struct rt6_info *rt)
344 {
345 	struct dst_entry *dst = &rt->dst;
346 
347 	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
348 	INIT_LIST_HEAD(&rt->rt6i_uncached);
349 }
350 
351 /* allocate dst with ip6_dst_ops */
352 struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
353 			       int flags)
354 {
355 	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
356 					1, DST_OBSOLETE_FORCE_CHK, flags);
357 
358 	if (rt) {
359 		rt6_info_init(rt);
360 		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
361 	}
362 
363 	return rt;
364 }
365 EXPORT_SYMBOL(ip6_dst_alloc);
366 
367 static void ip6_dst_destroy(struct dst_entry *dst)
368 {
369 	struct rt6_info *rt = (struct rt6_info *)dst;
370 	struct fib6_info *from;
371 	struct inet6_dev *idev;
372 
373 	ip_dst_metrics_put(dst);
374 	rt6_uncached_list_del(rt);
375 
376 	idev = rt->rt6i_idev;
377 	if (idev) {
378 		rt->rt6i_idev = NULL;
379 		in6_dev_put(idev);
380 	}
381 
382 	from = xchg((__force struct fib6_info **)&rt->from, NULL);
383 	fib6_info_release(from);
384 }
385 
386 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
387 			   int how)
388 {
389 	struct rt6_info *rt = (struct rt6_info *)dst;
390 	struct inet6_dev *idev = rt->rt6i_idev;
391 	struct net_device *loopback_dev =
392 		dev_net(dev)->loopback_dev;
393 
394 	if (idev && idev->dev != loopback_dev) {
395 		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
396 		if (loopback_idev) {
397 			rt->rt6i_idev = loopback_idev;
398 			in6_dev_put(idev);
399 		}
400 	}
401 }
402 
403 static bool __rt6_check_expired(const struct rt6_info *rt)
404 {
405 	if (rt->rt6i_flags & RTF_EXPIRES)
406 		return time_after(jiffies, rt->dst.expires);
407 	else
408 		return false;
409 }
410 
411 static bool rt6_check_expired(const struct rt6_info *rt)
412 {
413 	struct fib6_info *from;
414 
415 	from = rcu_dereference(rt->from);
416 
417 	if (rt->rt6i_flags & RTF_EXPIRES) {
418 		if (time_after(jiffies, rt->dst.expires))
419 			return true;
420 	} else if (from) {
421 		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
422 			fib6_check_expired(from);
423 	}
424 	return false;
425 }
426 
427 struct fib6_info *fib6_multipath_select(const struct net *net,
428 					struct fib6_info *match,
429 					struct flowi6 *fl6, int oif,
430 					const struct sk_buff *skb,
431 					int strict)
432 {
433 	struct fib6_info *sibling, *next_sibling;
434 
435 	/* We might have already computed the hash for ICMPv6 errors. In such
436 	 * case it will always be non-zero. Otherwise now is the time to do it.
437 	 */
438 	if (!fl6->mp_hash)
439 		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
440 
441 	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
442 		return match;
443 
444 	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
445 				 fib6_siblings) {
446 		int nh_upper_bound;
447 
448 		nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
449 		if (fl6->mp_hash > nh_upper_bound)
450 			continue;
451 		if (rt6_score_route(sibling, oif, strict) < 0)
452 			break;
453 		match = sibling;
454 		break;
455 	}
456 
457 	return match;
458 }
459 
460 /*
461  *	Route lookup. rcu_read_lock() should be held.
462  */
463 
464 static inline struct fib6_info *rt6_device_match(struct net *net,
465 						 struct fib6_info *rt,
466 						    const struct in6_addr *saddr,
467 						    int oif,
468 						    int flags)
469 {
470 	struct fib6_info *sprt;
471 
472 	if (!oif && ipv6_addr_any(saddr) &&
473 	    !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
474 		return rt;
475 
476 	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
477 		const struct net_device *dev = sprt->fib6_nh.nh_dev;
478 
479 		if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
480 			continue;
481 
482 		if (oif) {
483 			if (dev->ifindex == oif)
484 				return sprt;
485 		} else {
486 			if (ipv6_chk_addr(net, saddr, dev,
487 					  flags & RT6_LOOKUP_F_IFACE))
488 				return sprt;
489 		}
490 	}
491 
492 	if (oif && flags & RT6_LOOKUP_F_IFACE)
493 		return net->ipv6.fib6_null_entry;
494 
495 	return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
496 }
497 
498 #ifdef CONFIG_IPV6_ROUTER_PREF
499 struct __rt6_probe_work {
500 	struct work_struct work;
501 	struct in6_addr target;
502 	struct net_device *dev;
503 };
504 
505 static void rt6_probe_deferred(struct work_struct *w)
506 {
507 	struct in6_addr mcaddr;
508 	struct __rt6_probe_work *work =
509 		container_of(w, struct __rt6_probe_work, work);
510 
511 	addrconf_addr_solict_mult(&work->target, &mcaddr);
512 	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
513 	dev_put(work->dev);
514 	kfree(work);
515 }
516 
517 static void rt6_probe(struct fib6_info *rt)
518 {
519 	struct __rt6_probe_work *work = NULL;
520 	const struct in6_addr *nh_gw;
521 	struct neighbour *neigh;
522 	struct net_device *dev;
523 	struct inet6_dev *idev;
524 
525 	/*
526 	 * Okay, this does not seem to be appropriate
527 	 * for now, however, we need to check if it
528 	 * is really so; aka Router Reachability Probing.
529 	 *
530 	 * Router Reachability Probe MUST be rate-limited
531 	 * to no more than one per minute.
532 	 */
533 	if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
534 		return;
535 
536 	nh_gw = &rt->fib6_nh.nh_gw;
537 	dev = rt->fib6_nh.nh_dev;
538 	rcu_read_lock_bh();
539 	idev = __in6_dev_get(dev);
540 	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
541 	if (neigh) {
542 		if (neigh->nud_state & NUD_VALID)
543 			goto out;
544 
545 		write_lock(&neigh->lock);
546 		if (!(neigh->nud_state & NUD_VALID) &&
547 		    time_after(jiffies,
548 			       neigh->updated + idev->cnf.rtr_probe_interval)) {
549 			work = kmalloc(sizeof(*work), GFP_ATOMIC);
550 			if (work)
551 				__neigh_set_probe_once(neigh);
552 		}
553 		write_unlock(&neigh->lock);
554 	} else if (time_after(jiffies, rt->last_probe +
555 				       idev->cnf.rtr_probe_interval)) {
556 		work = kmalloc(sizeof(*work), GFP_ATOMIC);
557 	}
558 
559 	if (work) {
560 		rt->last_probe = jiffies;
561 		INIT_WORK(&work->work, rt6_probe_deferred);
562 		work->target = *nh_gw;
563 		dev_hold(dev);
564 		work->dev = dev;
565 		schedule_work(&work->work);
566 	}
567 
568 out:
569 	rcu_read_unlock_bh();
570 }
571 #else
572 static inline void rt6_probe(struct fib6_info *rt)
573 {
574 }
575 #endif
576 
577 /*
578  * Default Router Selection (RFC 2461 6.3.6)
579  */
580 static inline int rt6_check_dev(struct fib6_info *rt, int oif)
581 {
582 	const struct net_device *dev = rt->fib6_nh.nh_dev;
583 
584 	if (!oif || dev->ifindex == oif)
585 		return 2;
586 	return 0;
587 }
588 
589 static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
590 {
591 	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
592 	struct neighbour *neigh;
593 
594 	if (rt->fib6_flags & RTF_NONEXTHOP ||
595 	    !(rt->fib6_flags & RTF_GATEWAY))
596 		return RT6_NUD_SUCCEED;
597 
598 	rcu_read_lock_bh();
599 	neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
600 					  &rt->fib6_nh.nh_gw);
601 	if (neigh) {
602 		read_lock(&neigh->lock);
603 		if (neigh->nud_state & NUD_VALID)
604 			ret = RT6_NUD_SUCCEED;
605 #ifdef CONFIG_IPV6_ROUTER_PREF
606 		else if (!(neigh->nud_state & NUD_FAILED))
607 			ret = RT6_NUD_SUCCEED;
608 		else
609 			ret = RT6_NUD_FAIL_PROBE;
610 #endif
611 		read_unlock(&neigh->lock);
612 	} else {
613 		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
614 		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
615 	}
616 	rcu_read_unlock_bh();
617 
618 	return ret;
619 }
620 
621 static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
622 {
623 	int m;
624 
625 	m = rt6_check_dev(rt, oif);
626 	if (!m && (strict & RT6_LOOKUP_F_IFACE))
627 		return RT6_NUD_FAIL_HARD;
628 #ifdef CONFIG_IPV6_ROUTER_PREF
629 	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
630 #endif
631 	if (strict & RT6_LOOKUP_F_REACHABLE) {
632 		int n = rt6_check_neigh(rt);
633 		if (n < 0)
634 			return n;
635 	}
636 	return m;
637 }
638 
639 /* called with rc_read_lock held */
640 static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
641 {
642 	const struct net_device *dev = fib6_info_nh_dev(f6i);
643 	bool rc = false;
644 
645 	if (dev) {
646 		const struct inet6_dev *idev = __in6_dev_get(dev);
647 
648 		rc = !!idev->cnf.ignore_routes_with_linkdown;
649 	}
650 
651 	return rc;
652 }
653 
654 static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
655 				   int *mpri, struct fib6_info *match,
656 				   bool *do_rr)
657 {
658 	int m;
659 	bool match_do_rr = false;
660 
661 	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
662 		goto out;
663 
664 	if (fib6_ignore_linkdown(rt) &&
665 	    rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
666 	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
667 		goto out;
668 
669 	if (fib6_check_expired(rt))
670 		goto out;
671 
672 	m = rt6_score_route(rt, oif, strict);
673 	if (m == RT6_NUD_FAIL_DO_RR) {
674 		match_do_rr = true;
675 		m = 0; /* lowest valid score */
676 	} else if (m == RT6_NUD_FAIL_HARD) {
677 		goto out;
678 	}
679 
680 	if (strict & RT6_LOOKUP_F_REACHABLE)
681 		rt6_probe(rt);
682 
683 	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
684 	if (m > *mpri) {
685 		*do_rr = match_do_rr;
686 		*mpri = m;
687 		match = rt;
688 	}
689 out:
690 	return match;
691 }
692 
693 static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
694 				     struct fib6_info *leaf,
695 				     struct fib6_info *rr_head,
696 				     u32 metric, int oif, int strict,
697 				     bool *do_rr)
698 {
699 	struct fib6_info *rt, *match, *cont;
700 	int mpri = -1;
701 
702 	match = NULL;
703 	cont = NULL;
704 	for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
705 		if (rt->fib6_metric != metric) {
706 			cont = rt;
707 			break;
708 		}
709 
710 		match = find_match(rt, oif, strict, &mpri, match, do_rr);
711 	}
712 
713 	for (rt = leaf; rt && rt != rr_head;
714 	     rt = rcu_dereference(rt->fib6_next)) {
715 		if (rt->fib6_metric != metric) {
716 			cont = rt;
717 			break;
718 		}
719 
720 		match = find_match(rt, oif, strict, &mpri, match, do_rr);
721 	}
722 
723 	if (match || !cont)
724 		return match;
725 
726 	for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
727 		match = find_match(rt, oif, strict, &mpri, match, do_rr);
728 
729 	return match;
730 }
731 
732 static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
733 				   int oif, int strict)
734 {
735 	struct fib6_info *leaf = rcu_dereference(fn->leaf);
736 	struct fib6_info *match, *rt0;
737 	bool do_rr = false;
738 	int key_plen;
739 
740 	if (!leaf || leaf == net->ipv6.fib6_null_entry)
741 		return net->ipv6.fib6_null_entry;
742 
743 	rt0 = rcu_dereference(fn->rr_ptr);
744 	if (!rt0)
745 		rt0 = leaf;
746 
747 	/* Double check to make sure fn is not an intermediate node
748 	 * and fn->leaf does not points to its child's leaf
749 	 * (This might happen if all routes under fn are deleted from
750 	 * the tree and fib6_repair_tree() is called on the node.)
751 	 */
752 	key_plen = rt0->fib6_dst.plen;
753 #ifdef CONFIG_IPV6_SUBTREES
754 	if (rt0->fib6_src.plen)
755 		key_plen = rt0->fib6_src.plen;
756 #endif
757 	if (fn->fn_bit != key_plen)
758 		return net->ipv6.fib6_null_entry;
759 
760 	match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
761 			     &do_rr);
762 
763 	if (do_rr) {
764 		struct fib6_info *next = rcu_dereference(rt0->fib6_next);
765 
766 		/* no entries matched; do round-robin */
767 		if (!next || next->fib6_metric != rt0->fib6_metric)
768 			next = leaf;
769 
770 		if (next != rt0) {
771 			spin_lock_bh(&leaf->fib6_table->tb6_lock);
772 			/* make sure next is not being deleted from the tree */
773 			if (next->fib6_node)
774 				rcu_assign_pointer(fn->rr_ptr, next);
775 			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
776 		}
777 	}
778 
779 	return match ? match : net->ipv6.fib6_null_entry;
780 }
781 
782 static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
783 {
784 	return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
785 }
786 
787 #ifdef CONFIG_IPV6_ROUTE_INFO
788 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
789 		  const struct in6_addr *gwaddr)
790 {
791 	struct net *net = dev_net(dev);
792 	struct route_info *rinfo = (struct route_info *) opt;
793 	struct in6_addr prefix_buf, *prefix;
794 	unsigned int pref;
795 	unsigned long lifetime;
796 	struct fib6_info *rt;
797 
798 	if (len < sizeof(struct route_info)) {
799 		return -EINVAL;
800 	}
801 
802 	/* Sanity check for prefix_len and length */
803 	if (rinfo->length > 3) {
804 		return -EINVAL;
805 	} else if (rinfo->prefix_len > 128) {
806 		return -EINVAL;
807 	} else if (rinfo->prefix_len > 64) {
808 		if (rinfo->length < 2) {
809 			return -EINVAL;
810 		}
811 	} else if (rinfo->prefix_len > 0) {
812 		if (rinfo->length < 1) {
813 			return -EINVAL;
814 		}
815 	}
816 
817 	pref = rinfo->route_pref;
818 	if (pref == ICMPV6_ROUTER_PREF_INVALID)
819 		return -EINVAL;
820 
821 	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
822 
823 	if (rinfo->length == 3)
824 		prefix = (struct in6_addr *)rinfo->prefix;
825 	else {
826 		/* this function is safe */
827 		ipv6_addr_prefix(&prefix_buf,
828 				 (struct in6_addr *)rinfo->prefix,
829 				 rinfo->prefix_len);
830 		prefix = &prefix_buf;
831 	}
832 
833 	if (rinfo->prefix_len == 0)
834 		rt = rt6_get_dflt_router(net, gwaddr, dev);
835 	else
836 		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
837 					gwaddr, dev);
838 
839 	if (rt && !lifetime) {
840 		ip6_del_rt(net, rt);
841 		rt = NULL;
842 	}
843 
844 	if (!rt && lifetime)
845 		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
846 					dev, pref);
847 	else if (rt)
848 		rt->fib6_flags = RTF_ROUTEINFO |
849 				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
850 
851 	if (rt) {
852 		if (!addrconf_finite_timeout(lifetime))
853 			fib6_clean_expires(rt);
854 		else
855 			fib6_set_expires(rt, jiffies + HZ * lifetime);
856 
857 		fib6_info_release(rt);
858 	}
859 	return 0;
860 }
861 #endif
862 
863 /*
864  *	Misc support functions
865  */
866 
867 /* called with rcu_lock held */
868 static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
869 {
870 	struct net_device *dev = rt->fib6_nh.nh_dev;
871 
872 	if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
873 		/* for copies of local routes, dst->dev needs to be the
874 		 * device if it is a master device, the master device if
875 		 * device is enslaved, and the loopback as the default
876 		 */
877 		if (netif_is_l3_slave(dev) &&
878 		    !rt6_need_strict(&rt->fib6_dst.addr))
879 			dev = l3mdev_master_dev_rcu(dev);
880 		else if (!netif_is_l3_master(dev))
881 			dev = dev_net(dev)->loopback_dev;
882 		/* last case is netif_is_l3_master(dev) is true in which
883 		 * case we want dev returned to be dev
884 		 */
885 	}
886 
887 	return dev;
888 }
889 
890 static const int fib6_prop[RTN_MAX + 1] = {
891 	[RTN_UNSPEC]	= 0,
892 	[RTN_UNICAST]	= 0,
893 	[RTN_LOCAL]	= 0,
894 	[RTN_BROADCAST]	= 0,
895 	[RTN_ANYCAST]	= 0,
896 	[RTN_MULTICAST]	= 0,
897 	[RTN_BLACKHOLE]	= -EINVAL,
898 	[RTN_UNREACHABLE] = -EHOSTUNREACH,
899 	[RTN_PROHIBIT]	= -EACCES,
900 	[RTN_THROW]	= -EAGAIN,
901 	[RTN_NAT]	= -EINVAL,
902 	[RTN_XRESOLVE]	= -EINVAL,
903 };
904 
905 static int ip6_rt_type_to_error(u8 fib6_type)
906 {
907 	return fib6_prop[fib6_type];
908 }
909 
910 static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
911 {
912 	unsigned short flags = 0;
913 
914 	if (rt->dst_nocount)
915 		flags |= DST_NOCOUNT;
916 	if (rt->dst_nopolicy)
917 		flags |= DST_NOPOLICY;
918 	if (rt->dst_host)
919 		flags |= DST_HOST;
920 
921 	return flags;
922 }
923 
924 static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
925 {
926 	rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);
927 
928 	switch (ort->fib6_type) {
929 	case RTN_BLACKHOLE:
930 		rt->dst.output = dst_discard_out;
931 		rt->dst.input = dst_discard;
932 		break;
933 	case RTN_PROHIBIT:
934 		rt->dst.output = ip6_pkt_prohibit_out;
935 		rt->dst.input = ip6_pkt_prohibit;
936 		break;
937 	case RTN_THROW:
938 	case RTN_UNREACHABLE:
939 	default:
940 		rt->dst.output = ip6_pkt_discard_out;
941 		rt->dst.input = ip6_pkt_discard;
942 		break;
943 	}
944 }
945 
946 static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
947 {
948 	if (ort->fib6_flags & RTF_REJECT) {
949 		ip6_rt_init_dst_reject(rt, ort);
950 		return;
951 	}
952 
953 	rt->dst.error = 0;
954 	rt->dst.output = ip6_output;
955 
956 	if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) {
957 		rt->dst.input = ip6_input;
958 	} else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
959 		rt->dst.input = ip6_mc_input;
960 	} else {
961 		rt->dst.input = ip6_forward;
962 	}
963 
964 	if (ort->fib6_nh.nh_lwtstate) {
965 		rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
966 		lwtunnel_set_redirect(&rt->dst);
967 	}
968 
969 	rt->dst.lastuse = jiffies;
970 }
971 
972 /* Caller must already hold reference to @from */
973 static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
974 {
975 	rt->rt6i_flags &= ~RTF_EXPIRES;
976 	rcu_assign_pointer(rt->from, from);
977 	ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
978 }
979 
980 /* Caller must already hold reference to @ort */
981 static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
982 {
983 	struct net_device *dev = fib6_info_nh_dev(ort);
984 
985 	ip6_rt_init_dst(rt, ort);
986 
987 	rt->rt6i_dst = ort->fib6_dst;
988 	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
989 	rt->rt6i_gateway = ort->fib6_nh.nh_gw;
990 	rt->rt6i_flags = ort->fib6_flags;
991 	rt6_set_from(rt, ort);
992 #ifdef CONFIG_IPV6_SUBTREES
993 	rt->rt6i_src = ort->fib6_src;
994 #endif
995 }
996 
997 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
998 					struct in6_addr *saddr)
999 {
1000 	struct fib6_node *pn, *sn;
1001 	while (1) {
1002 		if (fn->fn_flags & RTN_TL_ROOT)
1003 			return NULL;
1004 		pn = rcu_dereference(fn->parent);
1005 		sn = FIB6_SUBTREE(pn);
1006 		if (sn && sn != fn)
1007 			fn = fib6_node_lookup(sn, NULL, saddr);
1008 		else
1009 			fn = pn;
1010 		if (fn->fn_flags & RTN_RTINFO)
1011 			return fn;
1012 	}
1013 }
1014 
1015 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
1016 			  bool null_fallback)
1017 {
1018 	struct rt6_info *rt = *prt;
1019 
1020 	if (dst_hold_safe(&rt->dst))
1021 		return true;
1022 	if (null_fallback) {
1023 		rt = net->ipv6.ip6_null_entry;
1024 		dst_hold(&rt->dst);
1025 	} else {
1026 		rt = NULL;
1027 	}
1028 	*prt = rt;
1029 	return false;
1030 }
1031 
1032 /* called with rcu_lock held */
1033 static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
1034 {
1035 	unsigned short flags = fib6_info_dst_flags(rt);
1036 	struct net_device *dev = rt->fib6_nh.nh_dev;
1037 	struct rt6_info *nrt;
1038 
1039 	if (!fib6_info_hold_safe(rt))
1040 		goto fallback;
1041 
1042 	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
1043 	if (!nrt) {
1044 		fib6_info_release(rt);
1045 		goto fallback;
1046 	}
1047 
1048 	ip6_rt_copy_init(nrt, rt);
1049 	return nrt;
1050 
1051 fallback:
1052 	nrt = dev_net(dev)->ipv6.ip6_null_entry;
1053 	dst_hold(&nrt->dst);
1054 	return nrt;
1055 }
1056 
1057 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
1058 					     struct fib6_table *table,
1059 					     struct flowi6 *fl6,
1060 					     const struct sk_buff *skb,
1061 					     int flags)
1062 {
1063 	struct fib6_info *f6i;
1064 	struct fib6_node *fn;
1065 	struct rt6_info *rt;
1066 
1067 	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1068 		flags &= ~RT6_LOOKUP_F_IFACE;
1069 
1070 	rcu_read_lock();
1071 	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1072 restart:
1073 	f6i = rcu_dereference(fn->leaf);
1074 	if (!f6i) {
1075 		f6i = net->ipv6.fib6_null_entry;
1076 	} else {
1077 		f6i = rt6_device_match(net, f6i, &fl6->saddr,
1078 				      fl6->flowi6_oif, flags);
1079 		if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
1080 			f6i = fib6_multipath_select(net, f6i, fl6,
1081 						    fl6->flowi6_oif, skb,
1082 						    flags);
1083 	}
1084 	if (f6i == net->ipv6.fib6_null_entry) {
1085 		fn = fib6_backtrack(fn, &fl6->saddr);
1086 		if (fn)
1087 			goto restart;
1088 	}
1089 
1090 	trace_fib6_table_lookup(net, f6i, table, fl6);
1091 
1092 	/* Search through exception table */
1093 	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
1094 	if (rt) {
1095 		if (ip6_hold_safe(net, &rt, true))
1096 			dst_use_noref(&rt->dst, jiffies);
1097 	} else if (f6i == net->ipv6.fib6_null_entry) {
1098 		rt = net->ipv6.ip6_null_entry;
1099 		dst_hold(&rt->dst);
1100 	} else {
1101 		rt = ip6_create_rt_rcu(f6i);
1102 	}
1103 
1104 	rcu_read_unlock();
1105 
1106 	return rt;
1107 }
1108 
1109 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
1110 				   const struct sk_buff *skb, int flags)
1111 {
1112 	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
1113 }
1114 EXPORT_SYMBOL_GPL(ip6_route_lookup);
1115 
1116 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
1117 			    const struct in6_addr *saddr, int oif,
1118 			    const struct sk_buff *skb, int strict)
1119 {
1120 	struct flowi6 fl6 = {
1121 		.flowi6_oif = oif,
1122 		.daddr = *daddr,
1123 	};
1124 	struct dst_entry *dst;
1125 	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
1126 
1127 	if (saddr) {
1128 		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
1129 		flags |= RT6_LOOKUP_F_HAS_SADDR;
1130 	}
1131 
1132 	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
1133 	if (dst->error == 0)
1134 		return (struct rt6_info *) dst;
1135 
1136 	dst_release(dst);
1137 
1138 	return NULL;
1139 }
1140 EXPORT_SYMBOL(rt6_lookup);
1141 
1142 /* ip6_ins_rt is called with FREE table->tb6_lock.
1143  * It takes new route entry, the addition fails by any reason the
1144  * route is released.
1145  * Caller must hold dst before calling it.
1146  */
1147 
1148 static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
1149 			struct netlink_ext_ack *extack)
1150 {
1151 	int err;
1152 	struct fib6_table *table;
1153 
1154 	table = rt->fib6_table;
1155 	spin_lock_bh(&table->tb6_lock);
1156 	err = fib6_add(&table->tb6_root, rt, info, extack);
1157 	spin_unlock_bh(&table->tb6_lock);
1158 
1159 	return err;
1160 }
1161 
1162 int ip6_ins_rt(struct net *net, struct fib6_info *rt)
1163 {
1164 	struct nl_info info = {	.nl_net = net, };
1165 
1166 	return __ip6_ins_rt(rt, &info, NULL);
1167 }
1168 
1169 static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
1170 					   const struct in6_addr *daddr,
1171 					   const struct in6_addr *saddr)
1172 {
1173 	struct net_device *dev;
1174 	struct rt6_info *rt;
1175 
1176 	/*
1177 	 *	Clone the route.
1178 	 */
1179 
1180 	if (!fib6_info_hold_safe(ort))
1181 		return NULL;
1182 
1183 	dev = ip6_rt_get_dev_rcu(ort);
1184 	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
1185 	if (!rt) {
1186 		fib6_info_release(ort);
1187 		return NULL;
1188 	}
1189 
1190 	ip6_rt_copy_init(rt, ort);
1191 	rt->rt6i_flags |= RTF_CACHE;
1192 	rt->dst.flags |= DST_HOST;
1193 	rt->rt6i_dst.addr = *daddr;
1194 	rt->rt6i_dst.plen = 128;
1195 
1196 	if (!rt6_is_gw_or_nonexthop(ort)) {
1197 		if (ort->fib6_dst.plen != 128 &&
1198 		    ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
1199 			rt->rt6i_flags |= RTF_ANYCAST;
1200 #ifdef CONFIG_IPV6_SUBTREES
1201 		if (rt->rt6i_src.plen && saddr) {
1202 			rt->rt6i_src.addr = *saddr;
1203 			rt->rt6i_src.plen = 128;
1204 		}
1205 #endif
1206 	}
1207 
1208 	return rt;
1209 }
1210 
1211 static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
1212 {
1213 	unsigned short flags = fib6_info_dst_flags(rt);
1214 	struct net_device *dev;
1215 	struct rt6_info *pcpu_rt;
1216 
1217 	if (!fib6_info_hold_safe(rt))
1218 		return NULL;
1219 
1220 	rcu_read_lock();
1221 	dev = ip6_rt_get_dev_rcu(rt);
1222 	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
1223 	rcu_read_unlock();
1224 	if (!pcpu_rt) {
1225 		fib6_info_release(rt);
1226 		return NULL;
1227 	}
1228 	ip6_rt_copy_init(pcpu_rt, rt);
1229 	pcpu_rt->rt6i_flags |= RTF_PCPU;
1230 	return pcpu_rt;
1231 }
1232 
1233 /* It should be called with rcu_read_lock() acquired */
1234 static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
1235 {
1236 	struct rt6_info *pcpu_rt, **p;
1237 
1238 	p = this_cpu_ptr(rt->rt6i_pcpu);
1239 	pcpu_rt = *p;
1240 
1241 	if (pcpu_rt)
1242 		ip6_hold_safe(NULL, &pcpu_rt, false);
1243 
1244 	return pcpu_rt;
1245 }
1246 
1247 static struct rt6_info *rt6_make_pcpu_route(struct net *net,
1248 					    struct fib6_info *rt)
1249 {
1250 	struct rt6_info *pcpu_rt, *prev, **p;
1251 
1252 	pcpu_rt = ip6_rt_pcpu_alloc(rt);
1253 	if (!pcpu_rt) {
1254 		dst_hold(&net->ipv6.ip6_null_entry->dst);
1255 		return net->ipv6.ip6_null_entry;
1256 	}
1257 
1258 	dst_hold(&pcpu_rt->dst);
1259 	p = this_cpu_ptr(rt->rt6i_pcpu);
1260 	prev = cmpxchg(p, NULL, pcpu_rt);
1261 	BUG_ON(prev);
1262 
1263 	return pcpu_rt;
1264 }
1265 
1266 /* exception hash table implementation
1267  */
1268 static DEFINE_SPINLOCK(rt6_exception_lock);
1269 
1270 /* Remove rt6_ex from hash table and free the memory
1271  * Caller must hold rt6_exception_lock
1272  */
1273 static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1274 				 struct rt6_exception *rt6_ex)
1275 {
1276 	struct fib6_info *from;
1277 	struct net *net;
1278 
1279 	if (!bucket || !rt6_ex)
1280 		return;
1281 
1282 	net = dev_net(rt6_ex->rt6i->dst.dev);
1283 	net->ipv6.rt6_stats->fib_rt_cache--;
1284 
1285 	/* purge completely the exception to allow releasing the held resources:
1286 	 * some [sk] cache may keep the dst around for unlimited time
1287 	 */
1288 	from = xchg((__force struct fib6_info **)&rt6_ex->rt6i->from, NULL);
1289 	fib6_info_release(from);
1290 	dst_dev_put(&rt6_ex->rt6i->dst);
1291 
1292 	hlist_del_rcu(&rt6_ex->hlist);
1293 	dst_release(&rt6_ex->rt6i->dst);
1294 	kfree_rcu(rt6_ex, rcu);
1295 	WARN_ON_ONCE(!bucket->depth);
1296 	bucket->depth--;
1297 }
1298 
1299 /* Remove oldest rt6_ex in bucket and free the memory
1300  * Caller must hold rt6_exception_lock
1301  */
1302 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1303 {
1304 	struct rt6_exception *rt6_ex, *oldest = NULL;
1305 
1306 	if (!bucket)
1307 		return;
1308 
1309 	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1310 		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1311 			oldest = rt6_ex;
1312 	}
1313 	rt6_remove_exception(bucket, oldest);
1314 }
1315 
1316 static u32 rt6_exception_hash(const struct in6_addr *dst,
1317 			      const struct in6_addr *src)
1318 {
1319 	static u32 seed __read_mostly;
1320 	u32 val;
1321 
1322 	net_get_random_once(&seed, sizeof(seed));
1323 	val = jhash(dst, sizeof(*dst), seed);
1324 
1325 #ifdef CONFIG_IPV6_SUBTREES
1326 	if (src)
1327 		val = jhash(src, sizeof(*src), val);
1328 #endif
1329 	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1330 }
1331 
1332 /* Helper function to find the cached rt in the hash table
1333  * and update bucket pointer to point to the bucket for this
1334  * (daddr, saddr) pair
1335  * Caller must hold rt6_exception_lock
1336  */
1337 static struct rt6_exception *
1338 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1339 			      const struct in6_addr *daddr,
1340 			      const struct in6_addr *saddr)
1341 {
1342 	struct rt6_exception *rt6_ex;
1343 	u32 hval;
1344 
1345 	if (!(*bucket) || !daddr)
1346 		return NULL;
1347 
1348 	hval = rt6_exception_hash(daddr, saddr);
1349 	*bucket += hval;
1350 
1351 	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1352 		struct rt6_info *rt6 = rt6_ex->rt6i;
1353 		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1354 
1355 #ifdef CONFIG_IPV6_SUBTREES
1356 		if (matched && saddr)
1357 			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1358 #endif
1359 		if (matched)
1360 			return rt6_ex;
1361 	}
1362 	return NULL;
1363 }
1364 
1365 /* Helper function to find the cached rt in the hash table
1366  * and update bucket pointer to point to the bucket for this
1367  * (daddr, saddr) pair
1368  * Caller must hold rcu_read_lock()
1369  */
1370 static struct rt6_exception *
1371 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1372 			 const struct in6_addr *daddr,
1373 			 const struct in6_addr *saddr)
1374 {
1375 	struct rt6_exception *rt6_ex;
1376 	u32 hval;
1377 
1378 	WARN_ON_ONCE(!rcu_read_lock_held());
1379 
1380 	if (!(*bucket) || !daddr)
1381 		return NULL;
1382 
1383 	hval = rt6_exception_hash(daddr, saddr);
1384 	*bucket += hval;
1385 
1386 	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1387 		struct rt6_info *rt6 = rt6_ex->rt6i;
1388 		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1389 
1390 #ifdef CONFIG_IPV6_SUBTREES
1391 		if (matched && saddr)
1392 			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1393 #endif
1394 		if (matched)
1395 			return rt6_ex;
1396 	}
1397 	return NULL;
1398 }
1399 
1400 static unsigned int fib6_mtu(const struct fib6_info *rt)
1401 {
1402 	unsigned int mtu;
1403 
1404 	if (rt->fib6_pmtu) {
1405 		mtu = rt->fib6_pmtu;
1406 	} else {
1407 		struct net_device *dev = fib6_info_nh_dev(rt);
1408 		struct inet6_dev *idev;
1409 
1410 		rcu_read_lock();
1411 		idev = __in6_dev_get(dev);
1412 		mtu = idev->cnf.mtu6;
1413 		rcu_read_unlock();
1414 	}
1415 
1416 	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1417 
1418 	return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
1419 }
1420 
1421 static int rt6_insert_exception(struct rt6_info *nrt,
1422 				struct fib6_info *ort)
1423 {
1424 	struct net *net = dev_net(nrt->dst.dev);
1425 	struct rt6_exception_bucket *bucket;
1426 	struct in6_addr *src_key = NULL;
1427 	struct rt6_exception *rt6_ex;
1428 	int err = 0;
1429 
1430 	spin_lock_bh(&rt6_exception_lock);
1431 
1432 	if (ort->exception_bucket_flushed) {
1433 		err = -EINVAL;
1434 		goto out;
1435 	}
1436 
1437 	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
1438 					lockdep_is_held(&rt6_exception_lock));
1439 	if (!bucket) {
1440 		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
1441 				 GFP_ATOMIC);
1442 		if (!bucket) {
1443 			err = -ENOMEM;
1444 			goto out;
1445 		}
1446 		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
1447 	}
1448 
1449 #ifdef CONFIG_IPV6_SUBTREES
1450 	/* rt6i_src.plen != 0 indicates ort is in subtree
1451 	 * and exception table is indexed by a hash of
1452 	 * both rt6i_dst and rt6i_src.
1453 	 * Otherwise, the exception table is indexed by
1454 	 * a hash of only rt6i_dst.
1455 	 */
1456 	if (ort->fib6_src.plen)
1457 		src_key = &nrt->rt6i_src.addr;
1458 #endif
1459 	/* rt6_mtu_change() might lower mtu on ort.
1460 	 * Only insert this exception route if its mtu
1461 	 * is less than ort's mtu value.
1462 	 */
1463 	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
1464 		err = -EINVAL;
1465 		goto out;
1466 	}
1467 
1468 	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
1469 					       src_key);
1470 	if (rt6_ex)
1471 		rt6_remove_exception(bucket, rt6_ex);
1472 
1473 	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
1474 	if (!rt6_ex) {
1475 		err = -ENOMEM;
1476 		goto out;
1477 	}
1478 	rt6_ex->rt6i = nrt;
1479 	rt6_ex->stamp = jiffies;
1480 	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
1481 	bucket->depth++;
1482 	net->ipv6.rt6_stats->fib_rt_cache++;
1483 
1484 	if (bucket->depth > FIB6_MAX_DEPTH)
1485 		rt6_exception_remove_oldest(bucket);
1486 
1487 out:
1488 	spin_unlock_bh(&rt6_exception_lock);
1489 
1490 	/* Update fn->fn_sernum to invalidate all cached dst */
1491 	if (!err) {
1492 		spin_lock_bh(&ort->fib6_table->tb6_lock);
1493 		fib6_update_sernum(net, ort);
1494 		spin_unlock_bh(&ort->fib6_table->tb6_lock);
1495 		fib6_force_start_gc(net);
1496 	}
1497 
1498 	return err;
1499 }
1500 
1501 void rt6_flush_exceptions(struct fib6_info *rt)
1502 {
1503 	struct rt6_exception_bucket *bucket;
1504 	struct rt6_exception *rt6_ex;
1505 	struct hlist_node *tmp;
1506 	int i;
1507 
1508 	spin_lock_bh(&rt6_exception_lock);
1509 	/* Prevent rt6_insert_exception() to recreate the bucket list */
1510 	rt->exception_bucket_flushed = 1;
1511 
1512 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1513 				    lockdep_is_held(&rt6_exception_lock));
1514 	if (!bucket)
1515 		goto out;
1516 
1517 	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1518 		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1519 			rt6_remove_exception(bucket, rt6_ex);
1520 		WARN_ON_ONCE(bucket->depth);
1521 		bucket++;
1522 	}
1523 
1524 out:
1525 	spin_unlock_bh(&rt6_exception_lock);
1526 }
1527 
1528 /* Find cached rt in the hash table inside passed in rt
1529  * Caller has to hold rcu_read_lock()
1530  */
1531 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
1532 					   struct in6_addr *daddr,
1533 					   struct in6_addr *saddr)
1534 {
1535 	struct rt6_exception_bucket *bucket;
1536 	struct in6_addr *src_key = NULL;
1537 	struct rt6_exception *rt6_ex;
1538 	struct rt6_info *res = NULL;
1539 
1540 	bucket = rcu_dereference(rt->rt6i_exception_bucket);
1541 
1542 #ifdef CONFIG_IPV6_SUBTREES
1543 	/* rt6i_src.plen != 0 indicates rt is in subtree
1544 	 * and exception table is indexed by a hash of
1545 	 * both rt6i_dst and rt6i_src.
1546 	 * Otherwise, the exception table is indexed by
1547 	 * a hash of only rt6i_dst.
1548 	 */
1549 	if (rt->fib6_src.plen)
1550 		src_key = saddr;
1551 #endif
1552 	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
1553 
1554 	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1555 		res = rt6_ex->rt6i;
1556 
1557 	return res;
1558 }
1559 
1560 /* Remove the passed in cached rt from the hash table that contains it */
1561 static int rt6_remove_exception_rt(struct rt6_info *rt)
1562 {
1563 	struct rt6_exception_bucket *bucket;
1564 	struct in6_addr *src_key = NULL;
1565 	struct rt6_exception *rt6_ex;
1566 	struct fib6_info *from;
1567 	int err;
1568 
1569 	from = rcu_dereference(rt->from);
1570 	if (!from ||
1571 	    !(rt->rt6i_flags & RTF_CACHE))
1572 		return -EINVAL;
1573 
1574 	if (!rcu_access_pointer(from->rt6i_exception_bucket))
1575 		return -ENOENT;
1576 
1577 	spin_lock_bh(&rt6_exception_lock);
1578 	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1579 				    lockdep_is_held(&rt6_exception_lock));
1580 #ifdef CONFIG_IPV6_SUBTREES
1581 	/* rt6i_src.plen != 0 indicates 'from' is in subtree
1582 	 * and exception table is indexed by a hash of
1583 	 * both rt6i_dst and rt6i_src.
1584 	 * Otherwise, the exception table is indexed by
1585 	 * a hash of only rt6i_dst.
1586 	 */
1587 	if (from->fib6_src.plen)
1588 		src_key = &rt->rt6i_src.addr;
1589 #endif
1590 	rt6_ex = __rt6_find_exception_spinlock(&bucket,
1591 					       &rt->rt6i_dst.addr,
1592 					       src_key);
1593 	if (rt6_ex) {
1594 		rt6_remove_exception(bucket, rt6_ex);
1595 		err = 0;
1596 	} else {
1597 		err = -ENOENT;
1598 	}
1599 
1600 	spin_unlock_bh(&rt6_exception_lock);
1601 	return err;
1602 }
1603 
1604 /* Find rt6_ex which contains the passed in rt cache and
1605  * refresh its stamp
1606  */
1607 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1608 {
1609 	struct rt6_exception_bucket *bucket;
1610 	struct in6_addr *src_key = NULL;
1611 	struct rt6_exception *rt6_ex;
1612 	struct fib6_info *from;
1613 
1614 	rcu_read_lock();
1615 	from = rcu_dereference(rt->from);
1616 	if (!from || !(rt->rt6i_flags & RTF_CACHE))
1617 		goto unlock;
1618 
1619 	bucket = rcu_dereference(from->rt6i_exception_bucket);
1620 
1621 #ifdef CONFIG_IPV6_SUBTREES
1622 	/* rt6i_src.plen != 0 indicates 'from' is in subtree
1623 	 * and exception table is indexed by a hash of
1624 	 * both rt6i_dst and rt6i_src.
1625 	 * Otherwise, the exception table is indexed by
1626 	 * a hash of only rt6i_dst.
1627 	 */
1628 	if (from->fib6_src.plen)
1629 		src_key = &rt->rt6i_src.addr;
1630 #endif
1631 	rt6_ex = __rt6_find_exception_rcu(&bucket,
1632 					  &rt->rt6i_dst.addr,
1633 					  src_key);
1634 	if (rt6_ex)
1635 		rt6_ex->stamp = jiffies;
1636 
1637 unlock:
1638 	rcu_read_unlock();
1639 }
1640 
1641 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1642 					 struct rt6_info *rt, int mtu)
1643 {
1644 	/* If the new MTU is lower than the route PMTU, this new MTU will be the
1645 	 * lowest MTU in the path: always allow updating the route PMTU to
1646 	 * reflect PMTU decreases.
1647 	 *
1648 	 * If the new MTU is higher, and the route PMTU is equal to the local
1649 	 * MTU, this means the old MTU is the lowest in the path, so allow
1650 	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
1651 	 * handle this.
1652 	 */
1653 
1654 	if (dst_mtu(&rt->dst) >= mtu)
1655 		return true;
1656 
1657 	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
1658 		return true;
1659 
1660 	return false;
1661 }
1662 
1663 static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
1664 				       struct fib6_info *rt, int mtu)
1665 {
1666 	struct rt6_exception_bucket *bucket;
1667 	struct rt6_exception *rt6_ex;
1668 	int i;
1669 
1670 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1671 					lockdep_is_held(&rt6_exception_lock));
1672 
1673 	if (!bucket)
1674 		return;
1675 
1676 	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1677 		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1678 			struct rt6_info *entry = rt6_ex->rt6i;
1679 
1680 			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
1681 			 * route), the metrics of its rt->from have already
1682 			 * been updated.
1683 			 */
1684 			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
1685 			    rt6_mtu_change_route_allowed(idev, entry, mtu))
1686 				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
1687 		}
1688 		bucket++;
1689 	}
1690 }
1691 
1692 #define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)
1693 
1694 static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
1695 					struct in6_addr *gateway)
1696 {
1697 	struct rt6_exception_bucket *bucket;
1698 	struct rt6_exception *rt6_ex;
1699 	struct hlist_node *tmp;
1700 	int i;
1701 
1702 	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1703 		return;
1704 
1705 	spin_lock_bh(&rt6_exception_lock);
1706 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1707 				     lockdep_is_held(&rt6_exception_lock));
1708 
1709 	if (bucket) {
1710 		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1711 			hlist_for_each_entry_safe(rt6_ex, tmp,
1712 						  &bucket->chain, hlist) {
1713 				struct rt6_info *entry = rt6_ex->rt6i;
1714 
1715 				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
1716 				    RTF_CACHE_GATEWAY &&
1717 				    ipv6_addr_equal(gateway,
1718 						    &entry->rt6i_gateway)) {
1719 					rt6_remove_exception(bucket, rt6_ex);
1720 				}
1721 			}
1722 			bucket++;
1723 		}
1724 	}
1725 
1726 	spin_unlock_bh(&rt6_exception_lock);
1727 }
1728 
1729 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
1730 				      struct rt6_exception *rt6_ex,
1731 				      struct fib6_gc_args *gc_args,
1732 				      unsigned long now)
1733 {
1734 	struct rt6_info *rt = rt6_ex->rt6i;
1735 
1736 	/* we are pruning and obsoleting aged-out and non gateway exceptions
1737 	 * even if others have still references to them, so that on next
1738 	 * dst_check() such references can be dropped.
1739 	 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
1740 	 * expired, independently from their aging, as per RFC 8201 section 4
1741 	 */
1742 	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
1743 		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
1744 			RT6_TRACE("aging clone %p\n", rt);
1745 			rt6_remove_exception(bucket, rt6_ex);
1746 			return;
1747 		}
1748 	} else if (time_after(jiffies, rt->dst.expires)) {
1749 		RT6_TRACE("purging expired route %p\n", rt);
1750 		rt6_remove_exception(bucket, rt6_ex);
1751 		return;
1752 	}
1753 
1754 	if (rt->rt6i_flags & RTF_GATEWAY) {
1755 		struct neighbour *neigh;
1756 		__u8 neigh_flags = 0;
1757 
1758 		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
1759 		if (neigh)
1760 			neigh_flags = neigh->flags;
1761 
1762 		if (!(neigh_flags & NTF_ROUTER)) {
1763 			RT6_TRACE("purging route %p via non-router but gateway\n",
1764 				  rt);
1765 			rt6_remove_exception(bucket, rt6_ex);
1766 			return;
1767 		}
1768 	}
1769 
1770 	gc_args->more++;
1771 }
1772 
1773 void rt6_age_exceptions(struct fib6_info *rt,
1774 			struct fib6_gc_args *gc_args,
1775 			unsigned long now)
1776 {
1777 	struct rt6_exception_bucket *bucket;
1778 	struct rt6_exception *rt6_ex;
1779 	struct hlist_node *tmp;
1780 	int i;
1781 
1782 	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1783 		return;
1784 
1785 	rcu_read_lock_bh();
1786 	spin_lock(&rt6_exception_lock);
1787 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1788 				    lockdep_is_held(&rt6_exception_lock));
1789 
1790 	if (bucket) {
1791 		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1792 			hlist_for_each_entry_safe(rt6_ex, tmp,
1793 						  &bucket->chain, hlist) {
1794 				rt6_age_examine_exception(bucket, rt6_ex,
1795 							  gc_args, now);
1796 			}
1797 			bucket++;
1798 		}
1799 	}
1800 	spin_unlock(&rt6_exception_lock);
1801 	rcu_read_unlock_bh();
1802 }
1803 
1804 /* must be called with rcu lock held */
1805 struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
1806 				    int oif, struct flowi6 *fl6, int strict)
1807 {
1808 	struct fib6_node *fn, *saved_fn;
1809 	struct fib6_info *f6i;
1810 
1811 	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1812 	saved_fn = fn;
1813 
1814 	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1815 		oif = 0;
1816 
1817 redo_rt6_select:
1818 	f6i = rt6_select(net, fn, oif, strict);
1819 	if (f6i == net->ipv6.fib6_null_entry) {
1820 		fn = fib6_backtrack(fn, &fl6->saddr);
1821 		if (fn)
1822 			goto redo_rt6_select;
1823 		else if (strict & RT6_LOOKUP_F_REACHABLE) {
1824 			/* also consider unreachable route */
1825 			strict &= ~RT6_LOOKUP_F_REACHABLE;
1826 			fn = saved_fn;
1827 			goto redo_rt6_select;
1828 		}
1829 	}
1830 
1831 	trace_fib6_table_lookup(net, f6i, table, fl6);
1832 
1833 	return f6i;
1834 }
1835 
1836 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1837 			       int oif, struct flowi6 *fl6,
1838 			       const struct sk_buff *skb, int flags)
1839 {
1840 	struct fib6_info *f6i;
1841 	struct rt6_info *rt;
1842 	int strict = 0;
1843 
1844 	strict |= flags & RT6_LOOKUP_F_IFACE;
1845 	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1846 	if (net->ipv6.devconf_all->forwarding == 0)
1847 		strict |= RT6_LOOKUP_F_REACHABLE;
1848 
1849 	rcu_read_lock();
1850 
1851 	f6i = fib6_table_lookup(net, table, oif, fl6, strict);
1852 	if (f6i->fib6_nsiblings)
1853 		f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);
1854 
1855 	if (f6i == net->ipv6.fib6_null_entry) {
1856 		rt = net->ipv6.ip6_null_entry;
1857 		rcu_read_unlock();
1858 		dst_hold(&rt->dst);
1859 		return rt;
1860 	}
1861 
1862 	/*Search through exception table */
1863 	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
1864 	if (rt) {
1865 		if (ip6_hold_safe(net, &rt, true))
1866 			dst_use_noref(&rt->dst, jiffies);
1867 
1868 		rcu_read_unlock();
1869 		return rt;
1870 	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1871 			    !(f6i->fib6_flags & RTF_GATEWAY))) {
1872 		/* Create a RTF_CACHE clone which will not be
1873 		 * owned by the fib6 tree.  It is for the special case where
1874 		 * the daddr in the skb during the neighbor look-up is different
1875 		 * from the fl6->daddr used to look-up route here.
1876 		 */
1877 		struct rt6_info *uncached_rt;
1878 
1879 		uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);
1880 
1881 		rcu_read_unlock();
1882 
1883 		if (uncached_rt) {
1884 			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1885 			 * No need for another dst_hold()
1886 			 */
1887 			rt6_uncached_list_add(uncached_rt);
1888 			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
1889 		} else {
1890 			uncached_rt = net->ipv6.ip6_null_entry;
1891 			dst_hold(&uncached_rt->dst);
1892 		}
1893 
1894 		return uncached_rt;
1895 	} else {
1896 		/* Get a percpu copy */
1897 
1898 		struct rt6_info *pcpu_rt;
1899 
1900 		local_bh_disable();
1901 		pcpu_rt = rt6_get_pcpu_route(f6i);
1902 
1903 		if (!pcpu_rt)
1904 			pcpu_rt = rt6_make_pcpu_route(net, f6i);
1905 
1906 		local_bh_enable();
1907 		rcu_read_unlock();
1908 
1909 		return pcpu_rt;
1910 	}
1911 }
1912 EXPORT_SYMBOL_GPL(ip6_pol_route);
1913 
1914 static struct rt6_info *ip6_pol_route_input(struct net *net,
1915 					    struct fib6_table *table,
1916 					    struct flowi6 *fl6,
1917 					    const struct sk_buff *skb,
1918 					    int flags)
1919 {
1920 	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
1921 }
1922 
1923 struct dst_entry *ip6_route_input_lookup(struct net *net,
1924 					 struct net_device *dev,
1925 					 struct flowi6 *fl6,
1926 					 const struct sk_buff *skb,
1927 					 int flags)
1928 {
1929 	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1930 		flags |= RT6_LOOKUP_F_IFACE;
1931 
1932 	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
1933 }
1934 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1935 
1936 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1937 				  struct flow_keys *keys,
1938 				  struct flow_keys *flkeys)
1939 {
1940 	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1941 	const struct ipv6hdr *key_iph = outer_iph;
1942 	struct flow_keys *_flkeys = flkeys;
1943 	const struct ipv6hdr *inner_iph;
1944 	const struct icmp6hdr *icmph;
1945 	struct ipv6hdr _inner_iph;
1946 	struct icmp6hdr _icmph;
1947 
1948 	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1949 		goto out;
1950 
1951 	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
1952 				   sizeof(_icmph), &_icmph);
1953 	if (!icmph)
1954 		goto out;
1955 
1956 	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1957 	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1958 	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1959 	    icmph->icmp6_type != ICMPV6_PARAMPROB)
1960 		goto out;
1961 
1962 	inner_iph = skb_header_pointer(skb,
1963 				       skb_transport_offset(skb) + sizeof(*icmph),
1964 				       sizeof(_inner_iph), &_inner_iph);
1965 	if (!inner_iph)
1966 		goto out;
1967 
1968 	key_iph = inner_iph;
1969 	_flkeys = NULL;
1970 out:
1971 	if (_flkeys) {
1972 		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
1973 		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
1974 		keys->tags.flow_label = _flkeys->tags.flow_label;
1975 		keys->basic.ip_proto = _flkeys->basic.ip_proto;
1976 	} else {
1977 		keys->addrs.v6addrs.src = key_iph->saddr;
1978 		keys->addrs.v6addrs.dst = key_iph->daddr;
1979 		keys->tags.flow_label = ip6_flowlabel(key_iph);
1980 		keys->basic.ip_proto = key_iph->nexthdr;
1981 	}
1982 }
1983 
1984 /* if skb is set it will be used and fl6 can be NULL */
1985 u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
1986 		       const struct sk_buff *skb, struct flow_keys *flkeys)
1987 {
1988 	struct flow_keys hash_keys;
1989 	u32 mhash;
1990 
1991 	switch (ip6_multipath_hash_policy(net)) {
1992 	case 0:
1993 		memset(&hash_keys, 0, sizeof(hash_keys));
1994 		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1995 		if (skb) {
1996 			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
1997 		} else {
1998 			hash_keys.addrs.v6addrs.src = fl6->saddr;
1999 			hash_keys.addrs.v6addrs.dst = fl6->daddr;
2000 			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
2001 			hash_keys.basic.ip_proto = fl6->flowi6_proto;
2002 		}
2003 		break;
2004 	case 1:
2005 		if (skb) {
2006 			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
2007 			struct flow_keys keys;
2008 
2009 			/* short-circuit if we already have L4 hash present */
2010 			if (skb->l4_hash)
2011 				return skb_get_hash_raw(skb) >> 1;
2012 
2013 			memset(&hash_keys, 0, sizeof(hash_keys));
2014 
2015                         if (!flkeys) {
2016 				skb_flow_dissect_flow_keys(skb, &keys, flag);
2017 				flkeys = &keys;
2018 			}
2019 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2020 			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
2021 			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
2022 			hash_keys.ports.src = flkeys->ports.src;
2023 			hash_keys.ports.dst = flkeys->ports.dst;
2024 			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2025 		} else {
2026 			memset(&hash_keys, 0, sizeof(hash_keys));
2027 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2028 			hash_keys.addrs.v6addrs.src = fl6->saddr;
2029 			hash_keys.addrs.v6addrs.dst = fl6->daddr;
2030 			hash_keys.ports.src = fl6->fl6_sport;
2031 			hash_keys.ports.dst = fl6->fl6_dport;
2032 			hash_keys.basic.ip_proto = fl6->flowi6_proto;
2033 		}
2034 		break;
2035 	}
2036 	mhash = flow_hash_from_keys(&hash_keys);
2037 
2038 	return mhash >> 1;
2039 }
2040 
2041 void ip6_route_input(struct sk_buff *skb)
2042 {
2043 	const struct ipv6hdr *iph = ipv6_hdr(skb);
2044 	struct net *net = dev_net(skb->dev);
2045 	int flags = RT6_LOOKUP_F_HAS_SADDR;
2046 	struct ip_tunnel_info *tun_info;
2047 	struct flowi6 fl6 = {
2048 		.flowi6_iif = skb->dev->ifindex,
2049 		.daddr = iph->daddr,
2050 		.saddr = iph->saddr,
2051 		.flowlabel = ip6_flowinfo(iph),
2052 		.flowi6_mark = skb->mark,
2053 		.flowi6_proto = iph->nexthdr,
2054 	};
2055 	struct flow_keys *flkeys = NULL, _flkeys;
2056 
2057 	tun_info = skb_tunnel_info(skb);
2058 	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2059 		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
2060 
2061 	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
2062 		flkeys = &_flkeys;
2063 
2064 	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
2065 		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
2066 	skb_dst_drop(skb);
2067 	skb_dst_set(skb,
2068 		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
2069 }
2070 
2071 static struct rt6_info *ip6_pol_route_output(struct net *net,
2072 					     struct fib6_table *table,
2073 					     struct flowi6 *fl6,
2074 					     const struct sk_buff *skb,
2075 					     int flags)
2076 {
2077 	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
2078 }
2079 
2080 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
2081 					 struct flowi6 *fl6, int flags)
2082 {
2083 	bool any_src;
2084 
2085 	if (ipv6_addr_type(&fl6->daddr) &
2086 	    (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
2087 		struct dst_entry *dst;
2088 
2089 		dst = l3mdev_link_scope_lookup(net, fl6);
2090 		if (dst)
2091 			return dst;
2092 	}
2093 
2094 	fl6->flowi6_iif = LOOPBACK_IFINDEX;
2095 
2096 	any_src = ipv6_addr_any(&fl6->saddr);
2097 	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
2098 	    (fl6->flowi6_oif && any_src))
2099 		flags |= RT6_LOOKUP_F_IFACE;
2100 
2101 	if (!any_src)
2102 		flags |= RT6_LOOKUP_F_HAS_SADDR;
2103 	else if (sk)
2104 		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
2105 
2106 	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
2107 }
2108 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
2109 
2110 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2111 {
2112 	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
2113 	struct net_device *loopback_dev = net->loopback_dev;
2114 	struct dst_entry *new = NULL;
2115 
2116 	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
2117 		       DST_OBSOLETE_DEAD, 0);
2118 	if (rt) {
2119 		rt6_info_init(rt);
2120 		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
2121 
2122 		new = &rt->dst;
2123 		new->__use = 1;
2124 		new->input = dst_discard;
2125 		new->output = dst_discard_out;
2126 
2127 		dst_copy_metrics(new, &ort->dst);
2128 
2129 		rt->rt6i_idev = in6_dev_get(loopback_dev);
2130 		rt->rt6i_gateway = ort->rt6i_gateway;
2131 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
2132 
2133 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
2134 #ifdef CONFIG_IPV6_SUBTREES
2135 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
2136 #endif
2137 	}
2138 
2139 	dst_release(dst_orig);
2140 	return new ? new : ERR_PTR(-ENOMEM);
2141 }
2142 
2143 /*
2144  *	Destination cache support functions
2145  */
2146 
2147 static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2148 {
2149 	u32 rt_cookie = 0;
2150 
2151 	if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2152 		return false;
2153 
2154 	if (fib6_check_expired(f6i))
2155 		return false;
2156 
2157 	return true;
2158 }
2159 
2160 static struct dst_entry *rt6_check(struct rt6_info *rt,
2161 				   struct fib6_info *from,
2162 				   u32 cookie)
2163 {
2164 	u32 rt_cookie = 0;
2165 
2166 	if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
2167 	    rt_cookie != cookie)
2168 		return NULL;
2169 
2170 	if (rt6_check_expired(rt))
2171 		return NULL;
2172 
2173 	return &rt->dst;
2174 }
2175 
2176 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2177 					    struct fib6_info *from,
2178 					    u32 cookie)
2179 {
2180 	if (!__rt6_check_expired(rt) &&
2181 	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2182 	    fib6_check(from, cookie))
2183 		return &rt->dst;
2184 	else
2185 		return NULL;
2186 }
2187 
2188 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
2189 {
2190 	struct dst_entry *dst_ret;
2191 	struct fib6_info *from;
2192 	struct rt6_info *rt;
2193 
2194 	rt = container_of(dst, struct rt6_info, dst);
2195 
2196 	rcu_read_lock();
2197 
2198 	/* All IPV6 dsts are created with ->obsolete set to the value
2199 	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
2200 	 * into this function always.
2201 	 */
2202 
2203 	from = rcu_dereference(rt->from);
2204 
2205 	if (from && (rt->rt6i_flags & RTF_PCPU ||
2206 	    unlikely(!list_empty(&rt->rt6i_uncached))))
2207 		dst_ret = rt6_dst_from_check(rt, from, cookie);
2208 	else
2209 		dst_ret = rt6_check(rt, from, cookie);
2210 
2211 	rcu_read_unlock();
2212 
2213 	return dst_ret;
2214 }
2215 
2216 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
2217 {
2218 	struct rt6_info *rt = (struct rt6_info *) dst;
2219 
2220 	if (rt) {
2221 		if (rt->rt6i_flags & RTF_CACHE) {
2222 			rcu_read_lock();
2223 			if (rt6_check_expired(rt)) {
2224 				rt6_remove_exception_rt(rt);
2225 				dst = NULL;
2226 			}
2227 			rcu_read_unlock();
2228 		} else {
2229 			dst_release(dst);
2230 			dst = NULL;
2231 		}
2232 	}
2233 	return dst;
2234 }
2235 
2236 static void ip6_link_failure(struct sk_buff *skb)
2237 {
2238 	struct rt6_info *rt;
2239 
2240 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2241 
2242 	rt = (struct rt6_info *) skb_dst(skb);
2243 	if (rt) {
2244 		rcu_read_lock();
2245 		if (rt->rt6i_flags & RTF_CACHE) {
2246 			rt6_remove_exception_rt(rt);
2247 		} else {
2248 			struct fib6_info *from;
2249 			struct fib6_node *fn;
2250 
2251 			from = rcu_dereference(rt->from);
2252 			if (from) {
2253 				fn = rcu_dereference(from->fib6_node);
2254 				if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2255 					fn->fn_sernum = -1;
2256 			}
2257 		}
2258 		rcu_read_unlock();
2259 	}
2260 }
2261 
2262 static void rt6_update_expires(struct rt6_info *rt0, int timeout)
2263 {
2264 	if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
2265 		struct fib6_info *from;
2266 
2267 		rcu_read_lock();
2268 		from = rcu_dereference(rt0->from);
2269 		if (from)
2270 			rt0->dst.expires = from->expires;
2271 		rcu_read_unlock();
2272 	}
2273 
2274 	dst_set_expires(&rt0->dst, timeout);
2275 	rt0->rt6i_flags |= RTF_EXPIRES;
2276 }
2277 
2278 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2279 {
2280 	struct net *net = dev_net(rt->dst.dev);
2281 
2282 	dst_metric_set(&rt->dst, RTAX_MTU, mtu);
2283 	rt->rt6i_flags |= RTF_MODIFIED;
2284 	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2285 }
2286 
2287 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2288 {
2289 	return !(rt->rt6i_flags & RTF_CACHE) &&
2290 		(rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from));
2291 }
2292 
2293 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2294 				 const struct ipv6hdr *iph, u32 mtu)
2295 {
2296 	const struct in6_addr *daddr, *saddr;
2297 	struct rt6_info *rt6 = (struct rt6_info *)dst;
2298 
2299 	if (dst_metric_locked(dst, RTAX_MTU))
2300 		return;
2301 
2302 	if (iph) {
2303 		daddr = &iph->daddr;
2304 		saddr = &iph->saddr;
2305 	} else if (sk) {
2306 		daddr = &sk->sk_v6_daddr;
2307 		saddr = &inet6_sk(sk)->saddr;
2308 	} else {
2309 		daddr = NULL;
2310 		saddr = NULL;
2311 	}
2312 	dst_confirm_neigh(dst, daddr);
2313 	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2314 	if (mtu >= dst_mtu(dst))
2315 		return;
2316 
2317 	if (!rt6_cache_allowed_for_pmtu(rt6)) {
2318 		rt6_do_update_pmtu(rt6, mtu);
2319 		/* update rt6_ex->stamp for cache */
2320 		if (rt6->rt6i_flags & RTF_CACHE)
2321 			rt6_update_exception_stamp_rt(rt6);
2322 	} else if (daddr) {
2323 		struct fib6_info *from;
2324 		struct rt6_info *nrt6;
2325 
2326 		rcu_read_lock();
2327 		from = rcu_dereference(rt6->from);
2328 		if (!from) {
2329 			rcu_read_unlock();
2330 			return;
2331 		}
2332 		nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
2333 		if (nrt6) {
2334 			rt6_do_update_pmtu(nrt6, mtu);
2335 			if (rt6_insert_exception(nrt6, from))
2336 				dst_release_immediate(&nrt6->dst);
2337 		}
2338 		rcu_read_unlock();
2339 	}
2340 }
2341 
2342 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2343 			       struct sk_buff *skb, u32 mtu)
2344 {
2345 	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2346 }
2347 
2348 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2349 		     int oif, u32 mark, kuid_t uid)
2350 {
2351 	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2352 	struct dst_entry *dst;
2353 	struct flowi6 fl6 = {
2354 		.flowi6_oif = oif,
2355 		.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark),
2356 		.daddr = iph->daddr,
2357 		.saddr = iph->saddr,
2358 		.flowlabel = ip6_flowinfo(iph),
2359 		.flowi6_uid = uid,
2360 	};
2361 
2362 	dst = ip6_route_output(net, NULL, &fl6);
2363 	if (!dst->error)
2364 		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2365 	dst_release(dst);
2366 }
2367 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2368 
2369 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2370 {
2371 	int oif = sk->sk_bound_dev_if;
2372 	struct dst_entry *dst;
2373 
2374 	if (!oif && skb->dev)
2375 		oif = l3mdev_master_ifindex(skb->dev);
2376 
2377 	ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid);
2378 
2379 	dst = __sk_dst_get(sk);
2380 	if (!dst || !dst->obsolete ||
2381 	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2382 		return;
2383 
2384 	bh_lock_sock(sk);
2385 	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2386 		ip6_datagram_dst_update(sk, false);
2387 	bh_unlock_sock(sk);
2388 }
2389 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2390 
2391 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
2392 			   const struct flowi6 *fl6)
2393 {
2394 #ifdef CONFIG_IPV6_SUBTREES
2395 	struct ipv6_pinfo *np = inet6_sk(sk);
2396 #endif
2397 
2398 	ip6_dst_store(sk, dst,
2399 		      ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
2400 		      &sk->sk_v6_daddr : NULL,
2401 #ifdef CONFIG_IPV6_SUBTREES
2402 		      ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
2403 		      &np->saddr :
2404 #endif
2405 		      NULL);
2406 }
2407 
2408 /* Handle redirects */
2409 struct ip6rd_flowi {
2410 	struct flowi6 fl6;
2411 	struct in6_addr gateway;
2412 };
2413 
2414 static struct rt6_info *__ip6_route_redirect(struct net *net,
2415 					     struct fib6_table *table,
2416 					     struct flowi6 *fl6,
2417 					     const struct sk_buff *skb,
2418 					     int flags)
2419 {
2420 	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2421 	struct rt6_info *ret = NULL, *rt_cache;
2422 	struct fib6_info *rt;
2423 	struct fib6_node *fn;
2424 
2425 	/* Get the "current" route for this destination and
2426 	 * check if the redirect has come from appropriate router.
2427 	 *
2428 	 * RFC 4861 specifies that redirects should only be
2429 	 * accepted if they come from the nexthop to the target.
2430 	 * Due to the way the routes are chosen, this notion
2431 	 * is a bit fuzzy and one might need to check all possible
2432 	 * routes.
2433 	 */
2434 
2435 	rcu_read_lock();
2436 	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2437 restart:
2438 	for_each_fib6_node_rt_rcu(fn) {
2439 		if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
2440 			continue;
2441 		if (fib6_check_expired(rt))
2442 			continue;
2443 		if (rt->fib6_flags & RTF_REJECT)
2444 			break;
2445 		if (!(rt->fib6_flags & RTF_GATEWAY))
2446 			continue;
2447 		if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
2448 			continue;
2449 		/* rt_cache's gateway might be different from its 'parent'
2450 		 * in the case of an ip redirect.
2451 		 * So we keep searching in the exception table if the gateway
2452 		 * is different.
2453 		 */
2454 		if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) {
2455 			rt_cache = rt6_find_cached_rt(rt,
2456 						      &fl6->daddr,
2457 						      &fl6->saddr);
2458 			if (rt_cache &&
2459 			    ipv6_addr_equal(&rdfl->gateway,
2460 					    &rt_cache->rt6i_gateway)) {
2461 				ret = rt_cache;
2462 				break;
2463 			}
2464 			continue;
2465 		}
2466 		break;
2467 	}
2468 
2469 	if (!rt)
2470 		rt = net->ipv6.fib6_null_entry;
2471 	else if (rt->fib6_flags & RTF_REJECT) {
2472 		ret = net->ipv6.ip6_null_entry;
2473 		goto out;
2474 	}
2475 
2476 	if (rt == net->ipv6.fib6_null_entry) {
2477 		fn = fib6_backtrack(fn, &fl6->saddr);
2478 		if (fn)
2479 			goto restart;
2480 	}
2481 
2482 out:
2483 	if (ret)
2484 		ip6_hold_safe(net, &ret, true);
2485 	else
2486 		ret = ip6_create_rt_rcu(rt);
2487 
2488 	rcu_read_unlock();
2489 
2490 	trace_fib6_table_lookup(net, rt, table, fl6);
2491 	return ret;
2492 };
2493 
2494 static struct dst_entry *ip6_route_redirect(struct net *net,
2495 					    const struct flowi6 *fl6,
2496 					    const struct sk_buff *skb,
2497 					    const struct in6_addr *gateway)
2498 {
2499 	int flags = RT6_LOOKUP_F_HAS_SADDR;
2500 	struct ip6rd_flowi rdfl;
2501 
2502 	rdfl.fl6 = *fl6;
2503 	rdfl.gateway = *gateway;
2504 
2505 	return fib6_rule_lookup(net, &rdfl.fl6, skb,
2506 				flags, __ip6_route_redirect);
2507 }
2508 
2509 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2510 		  kuid_t uid)
2511 {
2512 	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2513 	struct dst_entry *dst;
2514 	struct flowi6 fl6 = {
2515 		.flowi6_iif = LOOPBACK_IFINDEX,
2516 		.flowi6_oif = oif,
2517 		.flowi6_mark = mark,
2518 		.daddr = iph->daddr,
2519 		.saddr = iph->saddr,
2520 		.flowlabel = ip6_flowinfo(iph),
2521 		.flowi6_uid = uid,
2522 	};
2523 
2524 	dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2525 	rt6_do_redirect(dst, NULL, skb);
2526 	dst_release(dst);
2527 }
2528 EXPORT_SYMBOL_GPL(ip6_redirect);
2529 
2530 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif)
2531 {
2532 	const struct ipv6hdr *iph = ipv6_hdr(skb);
2533 	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2534 	struct dst_entry *dst;
2535 	struct flowi6 fl6 = {
2536 		.flowi6_iif = LOOPBACK_IFINDEX,
2537 		.flowi6_oif = oif,
2538 		.daddr = msg->dest,
2539 		.saddr = iph->daddr,
2540 		.flowi6_uid = sock_net_uid(net, NULL),
2541 	};
2542 
2543 	dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2544 	rt6_do_redirect(dst, NULL, skb);
2545 	dst_release(dst);
2546 }
2547 
2548 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2549 {
2550 	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2551 		     sk->sk_uid);
2552 }
2553 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2554 
2555 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2556 {
2557 	struct net_device *dev = dst->dev;
2558 	unsigned int mtu = dst_mtu(dst);
2559 	struct net *net = dev_net(dev);
2560 
2561 	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2562 
2563 	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2564 		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2565 
2566 	/*
2567 	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2568 	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2569 	 * IPV6_MAXPLEN is also valid and means: "any MSS,
2570 	 * rely only on pmtu discovery"
2571 	 */
2572 	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2573 		mtu = IPV6_MAXPLEN;
2574 	return mtu;
2575 }
2576 
2577 static unsigned int ip6_mtu(const struct dst_entry *dst)
2578 {
2579 	struct inet6_dev *idev;
2580 	unsigned int mtu;
2581 
2582 	mtu = dst_metric_raw(dst, RTAX_MTU);
2583 	if (mtu)
2584 		goto out;
2585 
2586 	mtu = IPV6_MIN_MTU;
2587 
2588 	rcu_read_lock();
2589 	idev = __in6_dev_get(dst->dev);
2590 	if (idev)
2591 		mtu = idev->cnf.mtu6;
2592 	rcu_read_unlock();
2593 
2594 out:
2595 	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2596 
2597 	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2598 }
2599 
2600 /* MTU selection:
2601  * 1. mtu on route is locked - use it
2602  * 2. mtu from nexthop exception
2603  * 3. mtu from egress device
2604  *
2605  * based on ip6_dst_mtu_forward and exception logic of
2606  * rt6_find_cached_rt; called with rcu_read_lock
2607  */
2608 u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
2609 		      struct in6_addr *saddr)
2610 {
2611 	struct rt6_exception_bucket *bucket;
2612 	struct rt6_exception *rt6_ex;
2613 	struct in6_addr *src_key;
2614 	struct inet6_dev *idev;
2615 	u32 mtu = 0;
2616 
2617 	if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
2618 		mtu = f6i->fib6_pmtu;
2619 		if (mtu)
2620 			goto out;
2621 	}
2622 
2623 	src_key = NULL;
2624 #ifdef CONFIG_IPV6_SUBTREES
2625 	if (f6i->fib6_src.plen)
2626 		src_key = saddr;
2627 #endif
2628 
2629 	bucket = rcu_dereference(f6i->rt6i_exception_bucket);
2630 	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
2631 	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
2632 		mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU);
2633 
2634 	if (likely(!mtu)) {
2635 		struct net_device *dev = fib6_info_nh_dev(f6i);
2636 
2637 		mtu = IPV6_MIN_MTU;
2638 		idev = __in6_dev_get(dev);
2639 		if (idev && idev->cnf.mtu6 > mtu)
2640 			mtu = idev->cnf.mtu6;
2641 	}
2642 
2643 	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2644 out:
2645 	return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu);
2646 }
2647 
2648 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2649 				  struct flowi6 *fl6)
2650 {
2651 	struct dst_entry *dst;
2652 	struct rt6_info *rt;
2653 	struct inet6_dev *idev = in6_dev_get(dev);
2654 	struct net *net = dev_net(dev);
2655 
2656 	if (unlikely(!idev))
2657 		return ERR_PTR(-ENODEV);
2658 
2659 	rt = ip6_dst_alloc(net, dev, 0);
2660 	if (unlikely(!rt)) {
2661 		in6_dev_put(idev);
2662 		dst = ERR_PTR(-ENOMEM);
2663 		goto out;
2664 	}
2665 
2666 	rt->dst.flags |= DST_HOST;
2667 	rt->dst.input = ip6_input;
2668 	rt->dst.output  = ip6_output;
2669 	rt->rt6i_gateway  = fl6->daddr;
2670 	rt->rt6i_dst.addr = fl6->daddr;
2671 	rt->rt6i_dst.plen = 128;
2672 	rt->rt6i_idev     = idev;
2673 	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2674 
2675 	/* Add this dst into uncached_list so that rt6_disable_ip() can
2676 	 * do proper release of the net_device
2677 	 */
2678 	rt6_uncached_list_add(rt);
2679 	atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2680 
2681 	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2682 
2683 out:
2684 	return dst;
2685 }
2686 
2687 static int ip6_dst_gc(struct dst_ops *ops)
2688 {
2689 	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2690 	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2691 	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2692 	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2693 	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2694 	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2695 	int entries;
2696 
2697 	entries = dst_entries_get_fast(ops);
2698 	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2699 	    entries <= rt_max_size)
2700 		goto out;
2701 
2702 	net->ipv6.ip6_rt_gc_expire++;
2703 	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2704 	entries = dst_entries_get_slow(ops);
2705 	if (entries < ops->gc_thresh)
2706 		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2707 out:
2708 	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2709 	return entries > rt_max_size;
2710 }
2711 
2712 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2713 					    struct fib6_config *cfg,
2714 					    const struct in6_addr *gw_addr,
2715 					    u32 tbid, int flags)
2716 {
2717 	struct flowi6 fl6 = {
2718 		.flowi6_oif = cfg->fc_ifindex,
2719 		.daddr = *gw_addr,
2720 		.saddr = cfg->fc_prefsrc,
2721 	};
2722 	struct fib6_table *table;
2723 	struct rt6_info *rt;
2724 
2725 	table = fib6_get_table(net, tbid);
2726 	if (!table)
2727 		return NULL;
2728 
2729 	if (!ipv6_addr_any(&cfg->fc_prefsrc))
2730 		flags |= RT6_LOOKUP_F_HAS_SADDR;
2731 
2732 	flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2733 	rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2734 
2735 	/* if table lookup failed, fall back to full lookup */
2736 	if (rt == net->ipv6.ip6_null_entry) {
2737 		ip6_rt_put(rt);
2738 		rt = NULL;
2739 	}
2740 
2741 	return rt;
2742 }
2743 
2744 static int ip6_route_check_nh_onlink(struct net *net,
2745 				     struct fib6_config *cfg,
2746 				     const struct net_device *dev,
2747 				     struct netlink_ext_ack *extack)
2748 {
2749 	u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2750 	const struct in6_addr *gw_addr = &cfg->fc_gateway;
2751 	u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2752 	struct fib6_info *from;
2753 	struct rt6_info *grt;
2754 	int err;
2755 
2756 	err = 0;
2757 	grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2758 	if (grt) {
2759 		rcu_read_lock();
2760 		from = rcu_dereference(grt->from);
2761 		if (!grt->dst.error &&
2762 		    /* ignore match if it is the default route */
2763 		    from && !ipv6_addr_any(&from->fib6_dst.addr) &&
2764 		    (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2765 			NL_SET_ERR_MSG(extack,
2766 				       "Nexthop has invalid gateway or device mismatch");
2767 			err = -EINVAL;
2768 		}
2769 		rcu_read_unlock();
2770 
2771 		ip6_rt_put(grt);
2772 	}
2773 
2774 	return err;
2775 }
2776 
2777 static int ip6_route_check_nh(struct net *net,
2778 			      struct fib6_config *cfg,
2779 			      struct net_device **_dev,
2780 			      struct inet6_dev **idev)
2781 {
2782 	const struct in6_addr *gw_addr = &cfg->fc_gateway;
2783 	struct net_device *dev = _dev ? *_dev : NULL;
2784 	struct rt6_info *grt = NULL;
2785 	int err = -EHOSTUNREACH;
2786 
2787 	if (cfg->fc_table) {
2788 		int flags = RT6_LOOKUP_F_IFACE;
2789 
2790 		grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2791 					  cfg->fc_table, flags);
2792 		if (grt) {
2793 			if (grt->rt6i_flags & RTF_GATEWAY ||
2794 			    (dev && dev != grt->dst.dev)) {
2795 				ip6_rt_put(grt);
2796 				grt = NULL;
2797 			}
2798 		}
2799 	}
2800 
2801 	if (!grt)
2802 		grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
2803 
2804 	if (!grt)
2805 		goto out;
2806 
2807 	if (dev) {
2808 		if (dev != grt->dst.dev) {
2809 			ip6_rt_put(grt);
2810 			goto out;
2811 		}
2812 	} else {
2813 		*_dev = dev = grt->dst.dev;
2814 		*idev = grt->rt6i_idev;
2815 		dev_hold(dev);
2816 		in6_dev_hold(grt->rt6i_idev);
2817 	}
2818 
2819 	if (!(grt->rt6i_flags & RTF_GATEWAY))
2820 		err = 0;
2821 
2822 	ip6_rt_put(grt);
2823 
2824 out:
2825 	return err;
2826 }
2827 
2828 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
2829 			   struct net_device **_dev, struct inet6_dev **idev,
2830 			   struct netlink_ext_ack *extack)
2831 {
2832 	const struct in6_addr *gw_addr = &cfg->fc_gateway;
2833 	int gwa_type = ipv6_addr_type(gw_addr);
2834 	bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
2835 	const struct net_device *dev = *_dev;
2836 	bool need_addr_check = !dev;
2837 	int err = -EINVAL;
2838 
2839 	/* if gw_addr is local we will fail to detect this in case
2840 	 * address is still TENTATIVE (DAD in progress). rt6_lookup()
2841 	 * will return already-added prefix route via interface that
2842 	 * prefix route was assigned to, which might be non-loopback.
2843 	 */
2844 	if (dev &&
2845 	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2846 		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2847 		goto out;
2848 	}
2849 
2850 	if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
2851 		/* IPv6 strictly inhibits using not link-local
2852 		 * addresses as nexthop address.
2853 		 * Otherwise, router will not able to send redirects.
2854 		 * It is very good, but in some (rare!) circumstances
2855 		 * (SIT, PtP, NBMA NOARP links) it is handy to allow
2856 		 * some exceptions. --ANK
2857 		 * We allow IPv4-mapped nexthops to support RFC4798-type
2858 		 * addressing
2859 		 */
2860 		if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
2861 			NL_SET_ERR_MSG(extack, "Invalid gateway address");
2862 			goto out;
2863 		}
2864 
2865 		if (cfg->fc_flags & RTNH_F_ONLINK)
2866 			err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
2867 		else
2868 			err = ip6_route_check_nh(net, cfg, _dev, idev);
2869 
2870 		if (err)
2871 			goto out;
2872 	}
2873 
2874 	/* reload in case device was changed */
2875 	dev = *_dev;
2876 
2877 	err = -EINVAL;
2878 	if (!dev) {
2879 		NL_SET_ERR_MSG(extack, "Egress device not specified");
2880 		goto out;
2881 	} else if (dev->flags & IFF_LOOPBACK) {
2882 		NL_SET_ERR_MSG(extack,
2883 			       "Egress device can not be loopback device for this route");
2884 		goto out;
2885 	}
2886 
2887 	/* if we did not check gw_addr above, do so now that the
2888 	 * egress device has been resolved.
2889 	 */
2890 	if (need_addr_check &&
2891 	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2892 		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2893 		goto out;
2894 	}
2895 
2896 	err = 0;
2897 out:
2898 	return err;
2899 }
2900 
2901 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
2902 					      gfp_t gfp_flags,
2903 					      struct netlink_ext_ack *extack)
2904 {
2905 	struct net *net = cfg->fc_nlinfo.nl_net;
2906 	struct fib6_info *rt = NULL;
2907 	struct net_device *dev = NULL;
2908 	struct inet6_dev *idev = NULL;
2909 	struct fib6_table *table;
2910 	int addr_type;
2911 	int err = -EINVAL;
2912 
2913 	/* RTF_PCPU is an internal flag; can not be set by userspace */
2914 	if (cfg->fc_flags & RTF_PCPU) {
2915 		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2916 		goto out;
2917 	}
2918 
2919 	/* RTF_CACHE is an internal flag; can not be set by userspace */
2920 	if (cfg->fc_flags & RTF_CACHE) {
2921 		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
2922 		goto out;
2923 	}
2924 
2925 	if (cfg->fc_type > RTN_MAX) {
2926 		NL_SET_ERR_MSG(extack, "Invalid route type");
2927 		goto out;
2928 	}
2929 
2930 	if (cfg->fc_dst_len > 128) {
2931 		NL_SET_ERR_MSG(extack, "Invalid prefix length");
2932 		goto out;
2933 	}
2934 	if (cfg->fc_src_len > 128) {
2935 		NL_SET_ERR_MSG(extack, "Invalid source address length");
2936 		goto out;
2937 	}
2938 #ifndef CONFIG_IPV6_SUBTREES
2939 	if (cfg->fc_src_len) {
2940 		NL_SET_ERR_MSG(extack,
2941 			       "Specifying source address requires IPV6_SUBTREES to be enabled");
2942 		goto out;
2943 	}
2944 #endif
2945 	if (cfg->fc_ifindex) {
2946 		err = -ENODEV;
2947 		dev = dev_get_by_index(net, cfg->fc_ifindex);
2948 		if (!dev)
2949 			goto out;
2950 		idev = in6_dev_get(dev);
2951 		if (!idev)
2952 			goto out;
2953 	}
2954 
2955 	if (cfg->fc_metric == 0)
2956 		cfg->fc_metric = IP6_RT_PRIO_USER;
2957 
2958 	if (cfg->fc_flags & RTNH_F_ONLINK) {
2959 		if (!dev) {
2960 			NL_SET_ERR_MSG(extack,
2961 				       "Nexthop device required for onlink");
2962 			err = -ENODEV;
2963 			goto out;
2964 		}
2965 
2966 		if (!(dev->flags & IFF_UP)) {
2967 			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
2968 			err = -ENETDOWN;
2969 			goto out;
2970 		}
2971 	}
2972 
2973 	err = -ENOBUFS;
2974 	if (cfg->fc_nlinfo.nlh &&
2975 	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
2976 		table = fib6_get_table(net, cfg->fc_table);
2977 		if (!table) {
2978 			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
2979 			table = fib6_new_table(net, cfg->fc_table);
2980 		}
2981 	} else {
2982 		table = fib6_new_table(net, cfg->fc_table);
2983 	}
2984 
2985 	if (!table)
2986 		goto out;
2987 
2988 	err = -ENOMEM;
2989 	rt = fib6_info_alloc(gfp_flags);
2990 	if (!rt)
2991 		goto out;
2992 
2993 	rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len,
2994 					       extack);
2995 	if (IS_ERR(rt->fib6_metrics)) {
2996 		err = PTR_ERR(rt->fib6_metrics);
2997 		/* Do not leave garbage there. */
2998 		rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
2999 		goto out;
3000 	}
3001 
3002 	if (cfg->fc_flags & RTF_ADDRCONF)
3003 		rt->dst_nocount = true;
3004 
3005 	if (cfg->fc_flags & RTF_EXPIRES)
3006 		fib6_set_expires(rt, jiffies +
3007 				clock_t_to_jiffies(cfg->fc_expires));
3008 	else
3009 		fib6_clean_expires(rt);
3010 
3011 	if (cfg->fc_protocol == RTPROT_UNSPEC)
3012 		cfg->fc_protocol = RTPROT_BOOT;
3013 	rt->fib6_protocol = cfg->fc_protocol;
3014 
3015 	addr_type = ipv6_addr_type(&cfg->fc_dst);
3016 
3017 	if (cfg->fc_encap) {
3018 		struct lwtunnel_state *lwtstate;
3019 
3020 		err = lwtunnel_build_state(cfg->fc_encap_type,
3021 					   cfg->fc_encap, AF_INET6, cfg,
3022 					   &lwtstate, extack);
3023 		if (err)
3024 			goto out;
3025 		rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
3026 	}
3027 
3028 	ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
3029 	rt->fib6_dst.plen = cfg->fc_dst_len;
3030 	if (rt->fib6_dst.plen == 128)
3031 		rt->dst_host = true;
3032 
3033 #ifdef CONFIG_IPV6_SUBTREES
3034 	ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
3035 	rt->fib6_src.plen = cfg->fc_src_len;
3036 #endif
3037 
3038 	rt->fib6_metric = cfg->fc_metric;
3039 	rt->fib6_nh.nh_weight = 1;
3040 
3041 	rt->fib6_type = cfg->fc_type;
3042 
3043 	/* We cannot add true routes via loopback here,
3044 	   they would result in kernel looping; promote them to reject routes
3045 	 */
3046 	if ((cfg->fc_flags & RTF_REJECT) ||
3047 	    (dev && (dev->flags & IFF_LOOPBACK) &&
3048 	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
3049 	     !(cfg->fc_flags & RTF_LOCAL))) {
3050 		/* hold loopback dev/idev if we haven't done so. */
3051 		if (dev != net->loopback_dev) {
3052 			if (dev) {
3053 				dev_put(dev);
3054 				in6_dev_put(idev);
3055 			}
3056 			dev = net->loopback_dev;
3057 			dev_hold(dev);
3058 			idev = in6_dev_get(dev);
3059 			if (!idev) {
3060 				err = -ENODEV;
3061 				goto out;
3062 			}
3063 		}
3064 		rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP;
3065 		goto install_route;
3066 	}
3067 
3068 	if (cfg->fc_flags & RTF_GATEWAY) {
3069 		err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
3070 		if (err)
3071 			goto out;
3072 
3073 		rt->fib6_nh.nh_gw = cfg->fc_gateway;
3074 	}
3075 
3076 	err = -ENODEV;
3077 	if (!dev)
3078 		goto out;
3079 
3080 	if (idev->cnf.disable_ipv6) {
3081 		NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
3082 		err = -EACCES;
3083 		goto out;
3084 	}
3085 
3086 	if (!(dev->flags & IFF_UP)) {
3087 		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3088 		err = -ENETDOWN;
3089 		goto out;
3090 	}
3091 
3092 	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
3093 		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
3094 			NL_SET_ERR_MSG(extack, "Invalid source address");
3095 			err = -EINVAL;
3096 			goto out;
3097 		}
3098 		rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
3099 		rt->fib6_prefsrc.plen = 128;
3100 	} else
3101 		rt->fib6_prefsrc.plen = 0;
3102 
3103 	rt->fib6_flags = cfg->fc_flags;
3104 
3105 install_route:
3106 	if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
3107 	    !netif_carrier_ok(dev))
3108 		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
3109 	rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
3110 	rt->fib6_nh.nh_dev = dev;
3111 	rt->fib6_table = table;
3112 
3113 	if (idev)
3114 		in6_dev_put(idev);
3115 
3116 	return rt;
3117 out:
3118 	if (dev)
3119 		dev_put(dev);
3120 	if (idev)
3121 		in6_dev_put(idev);
3122 
3123 	fib6_info_release(rt);
3124 	return ERR_PTR(err);
3125 }
3126 
3127 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3128 		  struct netlink_ext_ack *extack)
3129 {
3130 	struct fib6_info *rt;
3131 	int err;
3132 
3133 	rt = ip6_route_info_create(cfg, gfp_flags, extack);
3134 	if (IS_ERR(rt))
3135 		return PTR_ERR(rt);
3136 
3137 	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3138 	fib6_info_release(rt);
3139 
3140 	return err;
3141 }
3142 
3143 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3144 {
3145 	struct net *net = info->nl_net;
3146 	struct fib6_table *table;
3147 	int err;
3148 
3149 	if (rt == net->ipv6.fib6_null_entry) {
3150 		err = -ENOENT;
3151 		goto out;
3152 	}
3153 
3154 	table = rt->fib6_table;
3155 	spin_lock_bh(&table->tb6_lock);
3156 	err = fib6_del(rt, info);
3157 	spin_unlock_bh(&table->tb6_lock);
3158 
3159 out:
3160 	fib6_info_release(rt);
3161 	return err;
3162 }
3163 
3164 int ip6_del_rt(struct net *net, struct fib6_info *rt)
3165 {
3166 	struct nl_info info = { .nl_net = net };
3167 
3168 	return __ip6_del_rt(rt, &info);
3169 }
3170 
3171 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
3172 {
3173 	struct nl_info *info = &cfg->fc_nlinfo;
3174 	struct net *net = info->nl_net;
3175 	struct sk_buff *skb = NULL;
3176 	struct fib6_table *table;
3177 	int err = -ENOENT;
3178 
3179 	if (rt == net->ipv6.fib6_null_entry)
3180 		goto out_put;
3181 	table = rt->fib6_table;
3182 	spin_lock_bh(&table->tb6_lock);
3183 
3184 	if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
3185 		struct fib6_info *sibling, *next_sibling;
3186 
3187 		/* prefer to send a single notification with all hops */
3188 		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3189 		if (skb) {
3190 			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3191 
3192 			if (rt6_fill_node(net, skb, rt, NULL,
3193 					  NULL, NULL, 0, RTM_DELROUTE,
3194 					  info->portid, seq, 0) < 0) {
3195 				kfree_skb(skb);
3196 				skb = NULL;
3197 			} else
3198 				info->skip_notify = 1;
3199 		}
3200 
3201 		list_for_each_entry_safe(sibling, next_sibling,
3202 					 &rt->fib6_siblings,
3203 					 fib6_siblings) {
3204 			err = fib6_del(sibling, info);
3205 			if (err)
3206 				goto out_unlock;
3207 		}
3208 	}
3209 
3210 	err = fib6_del(rt, info);
3211 out_unlock:
3212 	spin_unlock_bh(&table->tb6_lock);
3213 out_put:
3214 	fib6_info_release(rt);
3215 
3216 	if (skb) {
3217 		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3218 			    info->nlh, gfp_any());
3219 	}
3220 	return err;
3221 }
3222 
3223 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3224 {
3225 	int rc = -ESRCH;
3226 
3227 	if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3228 		goto out;
3229 
3230 	if (cfg->fc_flags & RTF_GATEWAY &&
3231 	    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3232 		goto out;
3233 
3234 	rc = rt6_remove_exception_rt(rt);
3235 out:
3236 	return rc;
3237 }
3238 
3239 static int ip6_route_del(struct fib6_config *cfg,
3240 			 struct netlink_ext_ack *extack)
3241 {
3242 	struct rt6_info *rt_cache;
3243 	struct fib6_table *table;
3244 	struct fib6_info *rt;
3245 	struct fib6_node *fn;
3246 	int err = -ESRCH;
3247 
3248 	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3249 	if (!table) {
3250 		NL_SET_ERR_MSG(extack, "FIB table does not exist");
3251 		return err;
3252 	}
3253 
3254 	rcu_read_lock();
3255 
3256 	fn = fib6_locate(&table->tb6_root,
3257 			 &cfg->fc_dst, cfg->fc_dst_len,
3258 			 &cfg->fc_src, cfg->fc_src_len,
3259 			 !(cfg->fc_flags & RTF_CACHE));
3260 
3261 	if (fn) {
3262 		for_each_fib6_node_rt_rcu(fn) {
3263 			if (cfg->fc_flags & RTF_CACHE) {
3264 				int rc;
3265 
3266 				rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
3267 							      &cfg->fc_src);
3268 				if (rt_cache) {
3269 					rc = ip6_del_cached_rt(rt_cache, cfg);
3270 					if (rc != -ESRCH) {
3271 						rcu_read_unlock();
3272 						return rc;
3273 					}
3274 				}
3275 				continue;
3276 			}
3277 			if (cfg->fc_ifindex &&
3278 			    (!rt->fib6_nh.nh_dev ||
3279 			     rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex))
3280 				continue;
3281 			if (cfg->fc_flags & RTF_GATEWAY &&
3282 			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
3283 				continue;
3284 			if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
3285 				continue;
3286 			if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
3287 				continue;
3288 			if (!fib6_info_hold_safe(rt))
3289 				continue;
3290 			rcu_read_unlock();
3291 
3292 			/* if gateway was specified only delete the one hop */
3293 			if (cfg->fc_flags & RTF_GATEWAY)
3294 				return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3295 
3296 			return __ip6_del_rt_siblings(rt, cfg);
3297 		}
3298 	}
3299 	rcu_read_unlock();
3300 
3301 	return err;
3302 }
3303 
3304 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3305 {
3306 	struct netevent_redirect netevent;
3307 	struct rt6_info *rt, *nrt = NULL;
3308 	struct ndisc_options ndopts;
3309 	struct inet6_dev *in6_dev;
3310 	struct neighbour *neigh;
3311 	struct fib6_info *from;
3312 	struct rd_msg *msg;
3313 	int optlen, on_link;
3314 	u8 *lladdr;
3315 
3316 	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3317 	optlen -= sizeof(*msg);
3318 
3319 	if (optlen < 0) {
3320 		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3321 		return;
3322 	}
3323 
3324 	msg = (struct rd_msg *)icmp6_hdr(skb);
3325 
3326 	if (ipv6_addr_is_multicast(&msg->dest)) {
3327 		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3328 		return;
3329 	}
3330 
3331 	on_link = 0;
3332 	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3333 		on_link = 1;
3334 	} else if (ipv6_addr_type(&msg->target) !=
3335 		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3336 		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3337 		return;
3338 	}
3339 
3340 	in6_dev = __in6_dev_get(skb->dev);
3341 	if (!in6_dev)
3342 		return;
3343 	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3344 		return;
3345 
3346 	/* RFC2461 8.1:
3347 	 *	The IP source address of the Redirect MUST be the same as the current
3348 	 *	first-hop router for the specified ICMP Destination Address.
3349 	 */
3350 
3351 	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3352 		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3353 		return;
3354 	}
3355 
3356 	lladdr = NULL;
3357 	if (ndopts.nd_opts_tgt_lladdr) {
3358 		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3359 					     skb->dev);
3360 		if (!lladdr) {
3361 			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3362 			return;
3363 		}
3364 	}
3365 
3366 	rt = (struct rt6_info *) dst;
3367 	if (rt->rt6i_flags & RTF_REJECT) {
3368 		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3369 		return;
3370 	}
3371 
3372 	/* Redirect received -> path was valid.
3373 	 * Look, redirects are sent only in response to data packets,
3374 	 * so that this nexthop apparently is reachable. --ANK
3375 	 */
3376 	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3377 
3378 	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3379 	if (!neigh)
3380 		return;
3381 
3382 	/*
3383 	 *	We have finally decided to accept it.
3384 	 */
3385 
3386 	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3387 		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
3388 		     NEIGH_UPDATE_F_OVERRIDE|
3389 		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3390 				     NEIGH_UPDATE_F_ISROUTER)),
3391 		     NDISC_REDIRECT, &ndopts);
3392 
3393 	rcu_read_lock();
3394 	from = rcu_dereference(rt->from);
3395 	if (!from)
3396 		goto out;
3397 
3398 	nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
3399 	if (!nrt)
3400 		goto out;
3401 
3402 	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3403 	if (on_link)
3404 		nrt->rt6i_flags &= ~RTF_GATEWAY;
3405 
3406 	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3407 
3408 	/* rt6_insert_exception() will take care of duplicated exceptions */
3409 	if (rt6_insert_exception(nrt, from)) {
3410 		dst_release_immediate(&nrt->dst);
3411 		goto out;
3412 	}
3413 
3414 	netevent.old = &rt->dst;
3415 	netevent.new = &nrt->dst;
3416 	netevent.daddr = &msg->dest;
3417 	netevent.neigh = neigh;
3418 	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3419 
3420 out:
3421 	rcu_read_unlock();
3422 	neigh_release(neigh);
3423 }
3424 
3425 #ifdef CONFIG_IPV6_ROUTE_INFO
3426 static struct fib6_info *rt6_get_route_info(struct net *net,
3427 					   const struct in6_addr *prefix, int prefixlen,
3428 					   const struct in6_addr *gwaddr,
3429 					   struct net_device *dev)
3430 {
3431 	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3432 	int ifindex = dev->ifindex;
3433 	struct fib6_node *fn;
3434 	struct fib6_info *rt = NULL;
3435 	struct fib6_table *table;
3436 
3437 	table = fib6_get_table(net, tb_id);
3438 	if (!table)
3439 		return NULL;
3440 
3441 	rcu_read_lock();
3442 	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3443 	if (!fn)
3444 		goto out;
3445 
3446 	for_each_fib6_node_rt_rcu(fn) {
3447 		if (rt->fib6_nh.nh_dev->ifindex != ifindex)
3448 			continue;
3449 		if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3450 			continue;
3451 		if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
3452 			continue;
3453 		if (!fib6_info_hold_safe(rt))
3454 			continue;
3455 		break;
3456 	}
3457 out:
3458 	rcu_read_unlock();
3459 	return rt;
3460 }
3461 
3462 static struct fib6_info *rt6_add_route_info(struct net *net,
3463 					   const struct in6_addr *prefix, int prefixlen,
3464 					   const struct in6_addr *gwaddr,
3465 					   struct net_device *dev,
3466 					   unsigned int pref)
3467 {
3468 	struct fib6_config cfg = {
3469 		.fc_metric	= IP6_RT_PRIO_USER,
3470 		.fc_ifindex	= dev->ifindex,
3471 		.fc_dst_len	= prefixlen,
3472 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3473 				  RTF_UP | RTF_PREF(pref),
3474 		.fc_protocol = RTPROT_RA,
3475 		.fc_type = RTN_UNICAST,
3476 		.fc_nlinfo.portid = 0,
3477 		.fc_nlinfo.nlh = NULL,
3478 		.fc_nlinfo.nl_net = net,
3479 	};
3480 
3481 	cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3482 	cfg.fc_dst = *prefix;
3483 	cfg.fc_gateway = *gwaddr;
3484 
3485 	/* We should treat it as a default route if prefix length is 0. */
3486 	if (!prefixlen)
3487 		cfg.fc_flags |= RTF_DEFAULT;
3488 
3489 	ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3490 
3491 	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3492 }
3493 #endif
3494 
3495 struct fib6_info *rt6_get_dflt_router(struct net *net,
3496 				     const struct in6_addr *addr,
3497 				     struct net_device *dev)
3498 {
3499 	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3500 	struct fib6_info *rt;
3501 	struct fib6_table *table;
3502 
3503 	table = fib6_get_table(net, tb_id);
3504 	if (!table)
3505 		return NULL;
3506 
3507 	rcu_read_lock();
3508 	for_each_fib6_node_rt_rcu(&table->tb6_root) {
3509 		if (dev == rt->fib6_nh.nh_dev &&
3510 		    ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3511 		    ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
3512 			break;
3513 	}
3514 	if (rt && !fib6_info_hold_safe(rt))
3515 		rt = NULL;
3516 	rcu_read_unlock();
3517 	return rt;
3518 }
3519 
3520 struct fib6_info *rt6_add_dflt_router(struct net *net,
3521 				     const struct in6_addr *gwaddr,
3522 				     struct net_device *dev,
3523 				     unsigned int pref)
3524 {
3525 	struct fib6_config cfg = {
3526 		.fc_table	= l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3527 		.fc_metric	= IP6_RT_PRIO_USER,
3528 		.fc_ifindex	= dev->ifindex,
3529 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3530 				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3531 		.fc_protocol = RTPROT_RA,
3532 		.fc_type = RTN_UNICAST,
3533 		.fc_nlinfo.portid = 0,
3534 		.fc_nlinfo.nlh = NULL,
3535 		.fc_nlinfo.nl_net = net,
3536 	};
3537 
3538 	cfg.fc_gateway = *gwaddr;
3539 
3540 	if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3541 		struct fib6_table *table;
3542 
3543 		table = fib6_get_table(dev_net(dev), cfg.fc_table);
3544 		if (table)
3545 			table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3546 	}
3547 
3548 	return rt6_get_dflt_router(net, gwaddr, dev);
3549 }
3550 
3551 static void __rt6_purge_dflt_routers(struct net *net,
3552 				     struct fib6_table *table)
3553 {
3554 	struct fib6_info *rt;
3555 
3556 restart:
3557 	rcu_read_lock();
3558 	for_each_fib6_node_rt_rcu(&table->tb6_root) {
3559 		struct net_device *dev = fib6_info_nh_dev(rt);
3560 		struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;
3561 
3562 		if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3563 		    (!idev || idev->cnf.accept_ra != 2) &&
3564 		    fib6_info_hold_safe(rt)) {
3565 			rcu_read_unlock();
3566 			ip6_del_rt(net, rt);
3567 			goto restart;
3568 		}
3569 	}
3570 	rcu_read_unlock();
3571 
3572 	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3573 }
3574 
3575 void rt6_purge_dflt_routers(struct net *net)
3576 {
3577 	struct fib6_table *table;
3578 	struct hlist_head *head;
3579 	unsigned int h;
3580 
3581 	rcu_read_lock();
3582 
3583 	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3584 		head = &net->ipv6.fib_table_hash[h];
3585 		hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3586 			if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3587 				__rt6_purge_dflt_routers(net, table);
3588 		}
3589 	}
3590 
3591 	rcu_read_unlock();
3592 }
3593 
3594 static void rtmsg_to_fib6_config(struct net *net,
3595 				 struct in6_rtmsg *rtmsg,
3596 				 struct fib6_config *cfg)
3597 {
3598 	*cfg = (struct fib6_config){
3599 		.fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3600 			 : RT6_TABLE_MAIN,
3601 		.fc_ifindex = rtmsg->rtmsg_ifindex,
3602 		.fc_metric = rtmsg->rtmsg_metric,
3603 		.fc_expires = rtmsg->rtmsg_info,
3604 		.fc_dst_len = rtmsg->rtmsg_dst_len,
3605 		.fc_src_len = rtmsg->rtmsg_src_len,
3606 		.fc_flags = rtmsg->rtmsg_flags,
3607 		.fc_type = rtmsg->rtmsg_type,
3608 
3609 		.fc_nlinfo.nl_net = net,
3610 
3611 		.fc_dst = rtmsg->rtmsg_dst,
3612 		.fc_src = rtmsg->rtmsg_src,
3613 		.fc_gateway = rtmsg->rtmsg_gateway,
3614 	};
3615 }
3616 
3617 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3618 {
3619 	struct fib6_config cfg;
3620 	struct in6_rtmsg rtmsg;
3621 	int err;
3622 
3623 	switch (cmd) {
3624 	case SIOCADDRT:		/* Add a route */
3625 	case SIOCDELRT:		/* Delete a route */
3626 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3627 			return -EPERM;
3628 		err = copy_from_user(&rtmsg, arg,
3629 				     sizeof(struct in6_rtmsg));
3630 		if (err)
3631 			return -EFAULT;
3632 
3633 		rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3634 
3635 		rtnl_lock();
3636 		switch (cmd) {
3637 		case SIOCADDRT:
3638 			err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3639 			break;
3640 		case SIOCDELRT:
3641 			err = ip6_route_del(&cfg, NULL);
3642 			break;
3643 		default:
3644 			err = -EINVAL;
3645 		}
3646 		rtnl_unlock();
3647 
3648 		return err;
3649 	}
3650 
3651 	return -EINVAL;
3652 }
3653 
3654 /*
3655  *	Drop the packet on the floor
3656  */
3657 
3658 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3659 {
3660 	struct dst_entry *dst = skb_dst(skb);
3661 	struct net *net = dev_net(dst->dev);
3662 	struct inet6_dev *idev;
3663 	int type;
3664 
3665 	if (netif_is_l3_master(skb->dev) &&
3666 	    dst->dev == net->loopback_dev)
3667 		idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
3668 	else
3669 		idev = ip6_dst_idev(dst);
3670 
3671 	switch (ipstats_mib_noroutes) {
3672 	case IPSTATS_MIB_INNOROUTES:
3673 		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3674 		if (type == IPV6_ADDR_ANY) {
3675 			IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
3676 			break;
3677 		}
3678 		/* FALLTHROUGH */
3679 	case IPSTATS_MIB_OUTNOROUTES:
3680 		IP6_INC_STATS(net, idev, ipstats_mib_noroutes);
3681 		break;
3682 	}
3683 
3684 	/* Start over by dropping the dst for l3mdev case */
3685 	if (netif_is_l3_master(skb->dev))
3686 		skb_dst_drop(skb);
3687 
3688 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3689 	kfree_skb(skb);
3690 	return 0;
3691 }
3692 
3693 static int ip6_pkt_discard(struct sk_buff *skb)
3694 {
3695 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3696 }
3697 
3698 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3699 {
3700 	skb->dev = skb_dst(skb)->dev;
3701 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3702 }
3703 
3704 static int ip6_pkt_prohibit(struct sk_buff *skb)
3705 {
3706 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3707 }
3708 
3709 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3710 {
3711 	skb->dev = skb_dst(skb)->dev;
3712 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3713 }
3714 
3715 /*
3716  *	Allocate a dst for local (unicast / anycast) address.
3717  */
3718 
3719 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3720 				     struct inet6_dev *idev,
3721 				     const struct in6_addr *addr,
3722 				     bool anycast, gfp_t gfp_flags)
3723 {
3724 	u32 tb_id;
3725 	struct net_device *dev = idev->dev;
3726 	struct fib6_info *f6i;
3727 
3728 	f6i = fib6_info_alloc(gfp_flags);
3729 	if (!f6i)
3730 		return ERR_PTR(-ENOMEM);
3731 
3732 	f6i->fib6_metrics = ip_fib_metrics_init(net, NULL, 0, NULL);
3733 	f6i->dst_nocount = true;
3734 	f6i->dst_host = true;
3735 	f6i->fib6_protocol = RTPROT_KERNEL;
3736 	f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP;
3737 	if (anycast) {
3738 		f6i->fib6_type = RTN_ANYCAST;
3739 		f6i->fib6_flags |= RTF_ANYCAST;
3740 	} else {
3741 		f6i->fib6_type = RTN_LOCAL;
3742 		f6i->fib6_flags |= RTF_LOCAL;
3743 	}
3744 
3745 	f6i->fib6_nh.nh_gw = *addr;
3746 	dev_hold(dev);
3747 	f6i->fib6_nh.nh_dev = dev;
3748 	f6i->fib6_dst.addr = *addr;
3749 	f6i->fib6_dst.plen = 128;
3750 	tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3751 	f6i->fib6_table = fib6_get_table(net, tb_id);
3752 
3753 	return f6i;
3754 }
3755 
3756 /* remove deleted ip from prefsrc entries */
3757 struct arg_dev_net_ip {
3758 	struct net_device *dev;
3759 	struct net *net;
3760 	struct in6_addr *addr;
3761 };
3762 
3763 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3764 {
3765 	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3766 	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3767 	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3768 
3769 	if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
3770 	    rt != net->ipv6.fib6_null_entry &&
3771 	    ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3772 		spin_lock_bh(&rt6_exception_lock);
3773 		/* remove prefsrc entry */
3774 		rt->fib6_prefsrc.plen = 0;
3775 		spin_unlock_bh(&rt6_exception_lock);
3776 	}
3777 	return 0;
3778 }
3779 
3780 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3781 {
3782 	struct net *net = dev_net(ifp->idev->dev);
3783 	struct arg_dev_net_ip adni = {
3784 		.dev = ifp->idev->dev,
3785 		.net = net,
3786 		.addr = &ifp->addr,
3787 	};
3788 	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3789 }
3790 
3791 #define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3792 
3793 /* Remove routers and update dst entries when gateway turn into host. */
3794 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3795 {
3796 	struct in6_addr *gateway = (struct in6_addr *)arg;
3797 
3798 	if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3799 	    ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
3800 		return -1;
3801 	}
3802 
3803 	/* Further clean up cached routes in exception table.
3804 	 * This is needed because cached route may have a different
3805 	 * gateway than its 'parent' in the case of an ip redirect.
3806 	 */
3807 	rt6_exceptions_clean_tohost(rt, gateway);
3808 
3809 	return 0;
3810 }
3811 
3812 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3813 {
3814 	fib6_clean_all(net, fib6_clean_tohost, gateway);
3815 }
3816 
3817 struct arg_netdev_event {
3818 	const struct net_device *dev;
3819 	union {
3820 		unsigned int nh_flags;
3821 		unsigned long event;
3822 	};
3823 };
3824 
3825 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
3826 {
3827 	struct fib6_info *iter;
3828 	struct fib6_node *fn;
3829 
3830 	fn = rcu_dereference_protected(rt->fib6_node,
3831 			lockdep_is_held(&rt->fib6_table->tb6_lock));
3832 	iter = rcu_dereference_protected(fn->leaf,
3833 			lockdep_is_held(&rt->fib6_table->tb6_lock));
3834 	while (iter) {
3835 		if (iter->fib6_metric == rt->fib6_metric &&
3836 		    rt6_qualify_for_ecmp(iter))
3837 			return iter;
3838 		iter = rcu_dereference_protected(iter->fib6_next,
3839 				lockdep_is_held(&rt->fib6_table->tb6_lock));
3840 	}
3841 
3842 	return NULL;
3843 }
3844 
3845 static bool rt6_is_dead(const struct fib6_info *rt)
3846 {
3847 	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
3848 	    (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
3849 	     fib6_ignore_linkdown(rt)))
3850 		return true;
3851 
3852 	return false;
3853 }
3854 
3855 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3856 {
3857 	struct fib6_info *iter;
3858 	int total = 0;
3859 
3860 	if (!rt6_is_dead(rt))
3861 		total += rt->fib6_nh.nh_weight;
3862 
3863 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3864 		if (!rt6_is_dead(iter))
3865 			total += iter->fib6_nh.nh_weight;
3866 	}
3867 
3868 	return total;
3869 }
3870 
3871 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
3872 {
3873 	int upper_bound = -1;
3874 
3875 	if (!rt6_is_dead(rt)) {
3876 		*weight += rt->fib6_nh.nh_weight;
3877 		upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3878 						    total) - 1;
3879 	}
3880 	atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound);
3881 }
3882 
3883 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3884 {
3885 	struct fib6_info *iter;
3886 	int weight = 0;
3887 
3888 	rt6_upper_bound_set(rt, &weight, total);
3889 
3890 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3891 		rt6_upper_bound_set(iter, &weight, total);
3892 }
3893 
3894 void rt6_multipath_rebalance(struct fib6_info *rt)
3895 {
3896 	struct fib6_info *first;
3897 	int total;
3898 
3899 	/* In case the entire multipath route was marked for flushing,
3900 	 * then there is no need to rebalance upon the removal of every
3901 	 * sibling route.
3902 	 */
3903 	if (!rt->fib6_nsiblings || rt->should_flush)
3904 		return;
3905 
3906 	/* During lookup routes are evaluated in order, so we need to
3907 	 * make sure upper bounds are assigned from the first sibling
3908 	 * onwards.
3909 	 */
3910 	first = rt6_multipath_first_sibling(rt);
3911 	if (WARN_ON_ONCE(!first))
3912 		return;
3913 
3914 	total = rt6_multipath_total_weight(first);
3915 	rt6_multipath_upper_bound_set(first, total);
3916 }
3917 
3918 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
3919 {
3920 	const struct arg_netdev_event *arg = p_arg;
3921 	struct net *net = dev_net(arg->dev);
3922 
3923 	if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
3924 		rt->fib6_nh.nh_flags &= ~arg->nh_flags;
3925 		fib6_update_sernum_upto_root(net, rt);
3926 		rt6_multipath_rebalance(rt);
3927 	}
3928 
3929 	return 0;
3930 }
3931 
3932 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
3933 {
3934 	struct arg_netdev_event arg = {
3935 		.dev = dev,
3936 		{
3937 			.nh_flags = nh_flags,
3938 		},
3939 	};
3940 
3941 	if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
3942 		arg.nh_flags |= RTNH_F_LINKDOWN;
3943 
3944 	fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
3945 }
3946 
3947 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
3948 				   const struct net_device *dev)
3949 {
3950 	struct fib6_info *iter;
3951 
3952 	if (rt->fib6_nh.nh_dev == dev)
3953 		return true;
3954 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3955 		if (iter->fib6_nh.nh_dev == dev)
3956 			return true;
3957 
3958 	return false;
3959 }
3960 
3961 static void rt6_multipath_flush(struct fib6_info *rt)
3962 {
3963 	struct fib6_info *iter;
3964 
3965 	rt->should_flush = 1;
3966 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3967 		iter->should_flush = 1;
3968 }
3969 
3970 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
3971 					     const struct net_device *down_dev)
3972 {
3973 	struct fib6_info *iter;
3974 	unsigned int dead = 0;
3975 
3976 	if (rt->fib6_nh.nh_dev == down_dev ||
3977 	    rt->fib6_nh.nh_flags & RTNH_F_DEAD)
3978 		dead++;
3979 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3980 		if (iter->fib6_nh.nh_dev == down_dev ||
3981 		    iter->fib6_nh.nh_flags & RTNH_F_DEAD)
3982 			dead++;
3983 
3984 	return dead;
3985 }
3986 
3987 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
3988 				       const struct net_device *dev,
3989 				       unsigned int nh_flags)
3990 {
3991 	struct fib6_info *iter;
3992 
3993 	if (rt->fib6_nh.nh_dev == dev)
3994 		rt->fib6_nh.nh_flags |= nh_flags;
3995 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3996 		if (iter->fib6_nh.nh_dev == dev)
3997 			iter->fib6_nh.nh_flags |= nh_flags;
3998 }
3999 
4000 /* called with write lock held for table with rt */
4001 static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
4002 {
4003 	const struct arg_netdev_event *arg = p_arg;
4004 	const struct net_device *dev = arg->dev;
4005 	struct net *net = dev_net(dev);
4006 
4007 	if (rt == net->ipv6.fib6_null_entry)
4008 		return 0;
4009 
4010 	switch (arg->event) {
4011 	case NETDEV_UNREGISTER:
4012 		return rt->fib6_nh.nh_dev == dev ? -1 : 0;
4013 	case NETDEV_DOWN:
4014 		if (rt->should_flush)
4015 			return -1;
4016 		if (!rt->fib6_nsiblings)
4017 			return rt->fib6_nh.nh_dev == dev ? -1 : 0;
4018 		if (rt6_multipath_uses_dev(rt, dev)) {
4019 			unsigned int count;
4020 
4021 			count = rt6_multipath_dead_count(rt, dev);
4022 			if (rt->fib6_nsiblings + 1 == count) {
4023 				rt6_multipath_flush(rt);
4024 				return -1;
4025 			}
4026 			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
4027 						   RTNH_F_LINKDOWN);
4028 			fib6_update_sernum(net, rt);
4029 			rt6_multipath_rebalance(rt);
4030 		}
4031 		return -2;
4032 	case NETDEV_CHANGE:
4033 		if (rt->fib6_nh.nh_dev != dev ||
4034 		    rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
4035 			break;
4036 		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
4037 		rt6_multipath_rebalance(rt);
4038 		break;
4039 	}
4040 
4041 	return 0;
4042 }
4043 
4044 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
4045 {
4046 	struct arg_netdev_event arg = {
4047 		.dev = dev,
4048 		{
4049 			.event = event,
4050 		},
4051 	};
4052 	struct net *net = dev_net(dev);
4053 
4054 	if (net->ipv6.sysctl.skip_notify_on_dev_down)
4055 		fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
4056 	else
4057 		fib6_clean_all(net, fib6_ifdown, &arg);
4058 }
4059 
4060 void rt6_disable_ip(struct net_device *dev, unsigned long event)
4061 {
4062 	rt6_sync_down_dev(dev, event);
4063 	rt6_uncached_list_flush_dev(dev_net(dev), dev);
4064 	neigh_ifdown(&nd_tbl, dev);
4065 }
4066 
4067 struct rt6_mtu_change_arg {
4068 	struct net_device *dev;
4069 	unsigned int mtu;
4070 };
4071 
4072 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
4073 {
4074 	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
4075 	struct inet6_dev *idev;
4076 
4077 	/* In IPv6 pmtu discovery is not optional,
4078 	   so that RTAX_MTU lock cannot disable it.
4079 	   We still use this lock to block changes
4080 	   caused by addrconf/ndisc.
4081 	*/
4082 
4083 	idev = __in6_dev_get(arg->dev);
4084 	if (!idev)
4085 		return 0;
4086 
4087 	/* For administrative MTU increase, there is no way to discover
4088 	   IPv6 PMTU increase, so PMTU increase should be updated here.
4089 	   Since RFC 1981 doesn't include administrative MTU increase
4090 	   update PMTU increase is a MUST. (i.e. jumbo frame)
4091 	 */
4092 	if (rt->fib6_nh.nh_dev == arg->dev &&
4093 	    !fib6_metric_locked(rt, RTAX_MTU)) {
4094 		u32 mtu = rt->fib6_pmtu;
4095 
4096 		if (mtu >= arg->mtu ||
4097 		    (mtu < arg->mtu && mtu == idev->cnf.mtu6))
4098 			fib6_metric_set(rt, RTAX_MTU, arg->mtu);
4099 
4100 		spin_lock_bh(&rt6_exception_lock);
4101 		rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
4102 		spin_unlock_bh(&rt6_exception_lock);
4103 	}
4104 	return 0;
4105 }
4106 
4107 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4108 {
4109 	struct rt6_mtu_change_arg arg = {
4110 		.dev = dev,
4111 		.mtu = mtu,
4112 	};
4113 
4114 	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
4115 }
4116 
4117 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
4118 	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
4119 	[RTA_PREFSRC]		= { .len = sizeof(struct in6_addr) },
4120 	[RTA_OIF]               = { .type = NLA_U32 },
4121 	[RTA_IIF]		= { .type = NLA_U32 },
4122 	[RTA_PRIORITY]          = { .type = NLA_U32 },
4123 	[RTA_METRICS]           = { .type = NLA_NESTED },
4124 	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
4125 	[RTA_PREF]              = { .type = NLA_U8 },
4126 	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
4127 	[RTA_ENCAP]		= { .type = NLA_NESTED },
4128 	[RTA_EXPIRES]		= { .type = NLA_U32 },
4129 	[RTA_UID]		= { .type = NLA_U32 },
4130 	[RTA_MARK]		= { .type = NLA_U32 },
4131 	[RTA_TABLE]		= { .type = NLA_U32 },
4132 	[RTA_IP_PROTO]		= { .type = NLA_U8 },
4133 	[RTA_SPORT]		= { .type = NLA_U16 },
4134 	[RTA_DPORT]		= { .type = NLA_U16 },
4135 };
4136 
4137 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
4138 			      struct fib6_config *cfg,
4139 			      struct netlink_ext_ack *extack)
4140 {
4141 	struct rtmsg *rtm;
4142 	struct nlattr *tb[RTA_MAX+1];
4143 	unsigned int pref;
4144 	int err;
4145 
4146 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4147 			  extack);
4148 	if (err < 0)
4149 		goto errout;
4150 
4151 	err = -EINVAL;
4152 	rtm = nlmsg_data(nlh);
4153 
4154 	*cfg = (struct fib6_config){
4155 		.fc_table = rtm->rtm_table,
4156 		.fc_dst_len = rtm->rtm_dst_len,
4157 		.fc_src_len = rtm->rtm_src_len,
4158 		.fc_flags = RTF_UP,
4159 		.fc_protocol = rtm->rtm_protocol,
4160 		.fc_type = rtm->rtm_type,
4161 
4162 		.fc_nlinfo.portid = NETLINK_CB(skb).portid,
4163 		.fc_nlinfo.nlh = nlh,
4164 		.fc_nlinfo.nl_net = sock_net(skb->sk),
4165 	};
4166 
4167 	if (rtm->rtm_type == RTN_UNREACHABLE ||
4168 	    rtm->rtm_type == RTN_BLACKHOLE ||
4169 	    rtm->rtm_type == RTN_PROHIBIT ||
4170 	    rtm->rtm_type == RTN_THROW)
4171 		cfg->fc_flags |= RTF_REJECT;
4172 
4173 	if (rtm->rtm_type == RTN_LOCAL)
4174 		cfg->fc_flags |= RTF_LOCAL;
4175 
4176 	if (rtm->rtm_flags & RTM_F_CLONED)
4177 		cfg->fc_flags |= RTF_CACHE;
4178 
4179 	cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4180 
4181 	if (tb[RTA_GATEWAY]) {
4182 		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4183 		cfg->fc_flags |= RTF_GATEWAY;
4184 	}
4185 	if (tb[RTA_VIA]) {
4186 		NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute");
4187 		goto errout;
4188 	}
4189 
4190 	if (tb[RTA_DST]) {
4191 		int plen = (rtm->rtm_dst_len + 7) >> 3;
4192 
4193 		if (nla_len(tb[RTA_DST]) < plen)
4194 			goto errout;
4195 
4196 		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
4197 	}
4198 
4199 	if (tb[RTA_SRC]) {
4200 		int plen = (rtm->rtm_src_len + 7) >> 3;
4201 
4202 		if (nla_len(tb[RTA_SRC]) < plen)
4203 			goto errout;
4204 
4205 		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
4206 	}
4207 
4208 	if (tb[RTA_PREFSRC])
4209 		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4210 
4211 	if (tb[RTA_OIF])
4212 		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4213 
4214 	if (tb[RTA_PRIORITY])
4215 		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4216 
4217 	if (tb[RTA_METRICS]) {
4218 		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4219 		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
4220 	}
4221 
4222 	if (tb[RTA_TABLE])
4223 		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4224 
4225 	if (tb[RTA_MULTIPATH]) {
4226 		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4227 		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4228 
4229 		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4230 						     cfg->fc_mp_len, extack);
4231 		if (err < 0)
4232 			goto errout;
4233 	}
4234 
4235 	if (tb[RTA_PREF]) {
4236 		pref = nla_get_u8(tb[RTA_PREF]);
4237 		if (pref != ICMPV6_ROUTER_PREF_LOW &&
4238 		    pref != ICMPV6_ROUTER_PREF_HIGH)
4239 			pref = ICMPV6_ROUTER_PREF_MEDIUM;
4240 		cfg->fc_flags |= RTF_PREF(pref);
4241 	}
4242 
4243 	if (tb[RTA_ENCAP])
4244 		cfg->fc_encap = tb[RTA_ENCAP];
4245 
4246 	if (tb[RTA_ENCAP_TYPE]) {
4247 		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4248 
4249 		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4250 		if (err < 0)
4251 			goto errout;
4252 	}
4253 
4254 	if (tb[RTA_EXPIRES]) {
4255 		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4256 
4257 		if (addrconf_finite_timeout(timeout)) {
4258 			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4259 			cfg->fc_flags |= RTF_EXPIRES;
4260 		}
4261 	}
4262 
4263 	err = 0;
4264 errout:
4265 	return err;
4266 }
4267 
4268 struct rt6_nh {
4269 	struct fib6_info *fib6_info;
4270 	struct fib6_config r_cfg;
4271 	struct list_head next;
4272 };
4273 
4274 static int ip6_route_info_append(struct net *net,
4275 				 struct list_head *rt6_nh_list,
4276 				 struct fib6_info *rt,
4277 				 struct fib6_config *r_cfg)
4278 {
4279 	struct rt6_nh *nh;
4280 	int err = -EEXIST;
4281 
4282 	list_for_each_entry(nh, rt6_nh_list, next) {
4283 		/* check if fib6_info already exists */
4284 		if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4285 			return err;
4286 	}
4287 
4288 	nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4289 	if (!nh)
4290 		return -ENOMEM;
4291 	nh->fib6_info = rt;
4292 	memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4293 	list_add_tail(&nh->next, rt6_nh_list);
4294 
4295 	return 0;
4296 }
4297 
4298 static void ip6_route_mpath_notify(struct fib6_info *rt,
4299 				   struct fib6_info *rt_last,
4300 				   struct nl_info *info,
4301 				   __u16 nlflags)
4302 {
4303 	/* if this is an APPEND route, then rt points to the first route
4304 	 * inserted and rt_last points to last route inserted. Userspace
4305 	 * wants a consistent dump of the route which starts at the first
4306 	 * nexthop. Since sibling routes are always added at the end of
4307 	 * the list, find the first sibling of the last route appended
4308 	 */
4309 	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4310 		rt = list_first_entry(&rt_last->fib6_siblings,
4311 				      struct fib6_info,
4312 				      fib6_siblings);
4313 	}
4314 
4315 	if (rt)
4316 		inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4317 }
4318 
4319 static int ip6_route_multipath_add(struct fib6_config *cfg,
4320 				   struct netlink_ext_ack *extack)
4321 {
4322 	struct fib6_info *rt_notif = NULL, *rt_last = NULL;
4323 	struct nl_info *info = &cfg->fc_nlinfo;
4324 	struct fib6_config r_cfg;
4325 	struct rtnexthop *rtnh;
4326 	struct fib6_info *rt;
4327 	struct rt6_nh *err_nh;
4328 	struct rt6_nh *nh, *nh_safe;
4329 	__u16 nlflags;
4330 	int remaining;
4331 	int attrlen;
4332 	int err = 1;
4333 	int nhn = 0;
4334 	int replace = (cfg->fc_nlinfo.nlh &&
4335 		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4336 	LIST_HEAD(rt6_nh_list);
4337 
4338 	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4339 	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4340 		nlflags |= NLM_F_APPEND;
4341 
4342 	remaining = cfg->fc_mp_len;
4343 	rtnh = (struct rtnexthop *)cfg->fc_mp;
4344 
4345 	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
4346 	 * fib6_info structs per nexthop
4347 	 */
4348 	while (rtnh_ok(rtnh, remaining)) {
4349 		memcpy(&r_cfg, cfg, sizeof(*cfg));
4350 		if (rtnh->rtnh_ifindex)
4351 			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4352 
4353 		attrlen = rtnh_attrlen(rtnh);
4354 		if (attrlen > 0) {
4355 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4356 
4357 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4358 			if (nla) {
4359 				r_cfg.fc_gateway = nla_get_in6_addr(nla);
4360 				r_cfg.fc_flags |= RTF_GATEWAY;
4361 			}
4362 			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4363 			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4364 			if (nla)
4365 				r_cfg.fc_encap_type = nla_get_u16(nla);
4366 		}
4367 
4368 		r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4369 		rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
4370 		if (IS_ERR(rt)) {
4371 			err = PTR_ERR(rt);
4372 			rt = NULL;
4373 			goto cleanup;
4374 		}
4375 		if (!rt6_qualify_for_ecmp(rt)) {
4376 			err = -EINVAL;
4377 			NL_SET_ERR_MSG(extack,
4378 				       "Device only routes can not be added for IPv6 using the multipath API.");
4379 			fib6_info_release(rt);
4380 			goto cleanup;
4381 		}
4382 
4383 		rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;
4384 
4385 		err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
4386 					    rt, &r_cfg);
4387 		if (err) {
4388 			fib6_info_release(rt);
4389 			goto cleanup;
4390 		}
4391 
4392 		rtnh = rtnh_next(rtnh, &remaining);
4393 	}
4394 
4395 	/* for add and replace send one notification with all nexthops.
4396 	 * Skip the notification in fib6_add_rt2node and send one with
4397 	 * the full route when done
4398 	 */
4399 	info->skip_notify = 1;
4400 
4401 	err_nh = NULL;
4402 	list_for_each_entry(nh, &rt6_nh_list, next) {
4403 		err = __ip6_ins_rt(nh->fib6_info, info, extack);
4404 		fib6_info_release(nh->fib6_info);
4405 
4406 		if (!err) {
4407 			/* save reference to last route successfully inserted */
4408 			rt_last = nh->fib6_info;
4409 
4410 			/* save reference to first route for notification */
4411 			if (!rt_notif)
4412 				rt_notif = nh->fib6_info;
4413 		}
4414 
4415 		/* nh->fib6_info is used or freed at this point, reset to NULL*/
4416 		nh->fib6_info = NULL;
4417 		if (err) {
4418 			if (replace && nhn)
4419 				NL_SET_ERR_MSG_MOD(extack,
4420 						   "multipath route replace failed (check consistency of installed routes)");
4421 			err_nh = nh;
4422 			goto add_errout;
4423 		}
4424 
4425 		/* Because each route is added like a single route we remove
4426 		 * these flags after the first nexthop: if there is a collision,
4427 		 * we have already failed to add the first nexthop:
4428 		 * fib6_add_rt2node() has rejected it; when replacing, old
4429 		 * nexthops have been replaced by first new, the rest should
4430 		 * be added to it.
4431 		 */
4432 		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4433 						     NLM_F_REPLACE);
4434 		nhn++;
4435 	}
4436 
4437 	/* success ... tell user about new route */
4438 	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4439 	goto cleanup;
4440 
4441 add_errout:
4442 	/* send notification for routes that were added so that
4443 	 * the delete notifications sent by ip6_route_del are
4444 	 * coherent
4445 	 */
4446 	if (rt_notif)
4447 		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4448 
4449 	/* Delete routes that were already added */
4450 	list_for_each_entry(nh, &rt6_nh_list, next) {
4451 		if (err_nh == nh)
4452 			break;
4453 		ip6_route_del(&nh->r_cfg, extack);
4454 	}
4455 
4456 cleanup:
4457 	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4458 		if (nh->fib6_info)
4459 			fib6_info_release(nh->fib6_info);
4460 		list_del(&nh->next);
4461 		kfree(nh);
4462 	}
4463 
4464 	return err;
4465 }
4466 
4467 static int ip6_route_multipath_del(struct fib6_config *cfg,
4468 				   struct netlink_ext_ack *extack)
4469 {
4470 	struct fib6_config r_cfg;
4471 	struct rtnexthop *rtnh;
4472 	int remaining;
4473 	int attrlen;
4474 	int err = 1, last_err = 0;
4475 
4476 	remaining = cfg->fc_mp_len;
4477 	rtnh = (struct rtnexthop *)cfg->fc_mp;
4478 
4479 	/* Parse a Multipath Entry */
4480 	while (rtnh_ok(rtnh, remaining)) {
4481 		memcpy(&r_cfg, cfg, sizeof(*cfg));
4482 		if (rtnh->rtnh_ifindex)
4483 			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4484 
4485 		attrlen = rtnh_attrlen(rtnh);
4486 		if (attrlen > 0) {
4487 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4488 
4489 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4490 			if (nla) {
4491 				nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4492 				r_cfg.fc_flags |= RTF_GATEWAY;
4493 			}
4494 		}
4495 		err = ip6_route_del(&r_cfg, extack);
4496 		if (err)
4497 			last_err = err;
4498 
4499 		rtnh = rtnh_next(rtnh, &remaining);
4500 	}
4501 
4502 	return last_err;
4503 }
4504 
4505 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4506 			      struct netlink_ext_ack *extack)
4507 {
4508 	struct fib6_config cfg;
4509 	int err;
4510 
4511 	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4512 	if (err < 0)
4513 		return err;
4514 
4515 	if (cfg.fc_mp)
4516 		return ip6_route_multipath_del(&cfg, extack);
4517 	else {
4518 		cfg.fc_delete_all_nh = 1;
4519 		return ip6_route_del(&cfg, extack);
4520 	}
4521 }
4522 
4523 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4524 			      struct netlink_ext_ack *extack)
4525 {
4526 	struct fib6_config cfg;
4527 	int err;
4528 
4529 	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4530 	if (err < 0)
4531 		return err;
4532 
4533 	if (cfg.fc_mp)
4534 		return ip6_route_multipath_add(&cfg, extack);
4535 	else
4536 		return ip6_route_add(&cfg, GFP_KERNEL, extack);
4537 }
4538 
4539 static size_t rt6_nlmsg_size(struct fib6_info *rt)
4540 {
4541 	int nexthop_len = 0;
4542 
4543 	if (rt->fib6_nsiblings) {
4544 		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
4545 			    + NLA_ALIGN(sizeof(struct rtnexthop))
4546 			    + nla_total_size(16) /* RTA_GATEWAY */
4547 			    + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);
4548 
4549 		nexthop_len *= rt->fib6_nsiblings;
4550 	}
4551 
4552 	return NLMSG_ALIGN(sizeof(struct rtmsg))
4553 	       + nla_total_size(16) /* RTA_SRC */
4554 	       + nla_total_size(16) /* RTA_DST */
4555 	       + nla_total_size(16) /* RTA_GATEWAY */
4556 	       + nla_total_size(16) /* RTA_PREFSRC */
4557 	       + nla_total_size(4) /* RTA_TABLE */
4558 	       + nla_total_size(4) /* RTA_IIF */
4559 	       + nla_total_size(4) /* RTA_OIF */
4560 	       + nla_total_size(4) /* RTA_PRIORITY */
4561 	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4562 	       + nla_total_size(sizeof(struct rta_cacheinfo))
4563 	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4564 	       + nla_total_size(1) /* RTA_PREF */
4565 	       + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
4566 	       + nexthop_len;
4567 }
4568 
4569 static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
4570 			    unsigned int *flags, bool skip_oif)
4571 {
4572 	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
4573 		*flags |= RTNH_F_DEAD;
4574 
4575 	if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
4576 		*flags |= RTNH_F_LINKDOWN;
4577 
4578 		rcu_read_lock();
4579 		if (fib6_ignore_linkdown(rt))
4580 			*flags |= RTNH_F_DEAD;
4581 		rcu_read_unlock();
4582 	}
4583 
4584 	if (rt->fib6_flags & RTF_GATEWAY) {
4585 		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
4586 			goto nla_put_failure;
4587 	}
4588 
4589 	*flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
4590 	if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
4591 		*flags |= RTNH_F_OFFLOAD;
4592 
4593 	/* not needed for multipath encoding b/c it has a rtnexthop struct */
4594 	if (!skip_oif && rt->fib6_nh.nh_dev &&
4595 	    nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
4596 		goto nla_put_failure;
4597 
4598 	if (rt->fib6_nh.nh_lwtstate &&
4599 	    lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
4600 		goto nla_put_failure;
4601 
4602 	return 0;
4603 
4604 nla_put_failure:
4605 	return -EMSGSIZE;
4606 }
4607 
4608 /* add multipath next hop */
4609 static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
4610 {
4611 	const struct net_device *dev = rt->fib6_nh.nh_dev;
4612 	struct rtnexthop *rtnh;
4613 	unsigned int flags = 0;
4614 
4615 	rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4616 	if (!rtnh)
4617 		goto nla_put_failure;
4618 
4619 	rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
4620 	rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;
4621 
4622 	if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4623 		goto nla_put_failure;
4624 
4625 	rtnh->rtnh_flags = flags;
4626 
4627 	/* length of rtnetlink header + attributes */
4628 	rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
4629 
4630 	return 0;
4631 
4632 nla_put_failure:
4633 	return -EMSGSIZE;
4634 }
4635 
4636 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
4637 			 struct fib6_info *rt, struct dst_entry *dst,
4638 			 struct in6_addr *dest, struct in6_addr *src,
4639 			 int iif, int type, u32 portid, u32 seq,
4640 			 unsigned int flags)
4641 {
4642 	struct rt6_info *rt6 = (struct rt6_info *)dst;
4643 	struct rt6key *rt6_dst, *rt6_src;
4644 	u32 *pmetrics, table, rt6_flags;
4645 	struct nlmsghdr *nlh;
4646 	struct rtmsg *rtm;
4647 	long expires = 0;
4648 
4649 	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4650 	if (!nlh)
4651 		return -EMSGSIZE;
4652 
4653 	if (rt6) {
4654 		rt6_dst = &rt6->rt6i_dst;
4655 		rt6_src = &rt6->rt6i_src;
4656 		rt6_flags = rt6->rt6i_flags;
4657 	} else {
4658 		rt6_dst = &rt->fib6_dst;
4659 		rt6_src = &rt->fib6_src;
4660 		rt6_flags = rt->fib6_flags;
4661 	}
4662 
4663 	rtm = nlmsg_data(nlh);
4664 	rtm->rtm_family = AF_INET6;
4665 	rtm->rtm_dst_len = rt6_dst->plen;
4666 	rtm->rtm_src_len = rt6_src->plen;
4667 	rtm->rtm_tos = 0;
4668 	if (rt->fib6_table)
4669 		table = rt->fib6_table->tb6_id;
4670 	else
4671 		table = RT6_TABLE_UNSPEC;
4672 	rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT;
4673 	if (nla_put_u32(skb, RTA_TABLE, table))
4674 		goto nla_put_failure;
4675 
4676 	rtm->rtm_type = rt->fib6_type;
4677 	rtm->rtm_flags = 0;
4678 	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4679 	rtm->rtm_protocol = rt->fib6_protocol;
4680 
4681 	if (rt6_flags & RTF_CACHE)
4682 		rtm->rtm_flags |= RTM_F_CLONED;
4683 
4684 	if (dest) {
4685 		if (nla_put_in6_addr(skb, RTA_DST, dest))
4686 			goto nla_put_failure;
4687 		rtm->rtm_dst_len = 128;
4688 	} else if (rtm->rtm_dst_len)
4689 		if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
4690 			goto nla_put_failure;
4691 #ifdef CONFIG_IPV6_SUBTREES
4692 	if (src) {
4693 		if (nla_put_in6_addr(skb, RTA_SRC, src))
4694 			goto nla_put_failure;
4695 		rtm->rtm_src_len = 128;
4696 	} else if (rtm->rtm_src_len &&
4697 		   nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
4698 		goto nla_put_failure;
4699 #endif
4700 	if (iif) {
4701 #ifdef CONFIG_IPV6_MROUTE
4702 		if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
4703 			int err = ip6mr_get_route(net, skb, rtm, portid);
4704 
4705 			if (err == 0)
4706 				return 0;
4707 			if (err < 0)
4708 				goto nla_put_failure;
4709 		} else
4710 #endif
4711 			if (nla_put_u32(skb, RTA_IIF, iif))
4712 				goto nla_put_failure;
4713 	} else if (dest) {
4714 		struct in6_addr saddr_buf;
4715 		if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
4716 		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4717 			goto nla_put_failure;
4718 	}
4719 
4720 	if (rt->fib6_prefsrc.plen) {
4721 		struct in6_addr saddr_buf;
4722 		saddr_buf = rt->fib6_prefsrc.addr;
4723 		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4724 			goto nla_put_failure;
4725 	}
4726 
4727 	pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
4728 	if (rtnetlink_put_metrics(skb, pmetrics) < 0)
4729 		goto nla_put_failure;
4730 
4731 	if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
4732 		goto nla_put_failure;
4733 
4734 	/* For multipath routes, walk the siblings list and add
4735 	 * each as a nexthop within RTA_MULTIPATH.
4736 	 */
4737 	if (rt6) {
4738 		if (rt6_flags & RTF_GATEWAY &&
4739 		    nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
4740 			goto nla_put_failure;
4741 
4742 		if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
4743 			goto nla_put_failure;
4744 	} else if (rt->fib6_nsiblings) {
4745 		struct fib6_info *sibling, *next_sibling;
4746 		struct nlattr *mp;
4747 
4748 		mp = nla_nest_start(skb, RTA_MULTIPATH);
4749 		if (!mp)
4750 			goto nla_put_failure;
4751 
4752 		if (rt6_add_nexthop(skb, rt) < 0)
4753 			goto nla_put_failure;
4754 
4755 		list_for_each_entry_safe(sibling, next_sibling,
4756 					 &rt->fib6_siblings, fib6_siblings) {
4757 			if (rt6_add_nexthop(skb, sibling) < 0)
4758 				goto nla_put_failure;
4759 		}
4760 
4761 		nla_nest_end(skb, mp);
4762 	} else {
4763 		if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
4764 			goto nla_put_failure;
4765 	}
4766 
4767 	if (rt6_flags & RTF_EXPIRES) {
4768 		expires = dst ? dst->expires : rt->expires;
4769 		expires -= jiffies;
4770 	}
4771 
4772 	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
4773 		goto nla_put_failure;
4774 
4775 	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
4776 		goto nla_put_failure;
4777 
4778 
4779 	nlmsg_end(skb, nlh);
4780 	return 0;
4781 
4782 nla_put_failure:
4783 	nlmsg_cancel(skb, nlh);
4784 	return -EMSGSIZE;
4785 }
4786 
4787 static bool fib6_info_uses_dev(const struct fib6_info *f6i,
4788 			       const struct net_device *dev)
4789 {
4790 	if (f6i->fib6_nh.nh_dev == dev)
4791 		return true;
4792 
4793 	if (f6i->fib6_nsiblings) {
4794 		struct fib6_info *sibling, *next_sibling;
4795 
4796 		list_for_each_entry_safe(sibling, next_sibling,
4797 					 &f6i->fib6_siblings, fib6_siblings) {
4798 			if (sibling->fib6_nh.nh_dev == dev)
4799 				return true;
4800 		}
4801 	}
4802 
4803 	return false;
4804 }
4805 
4806 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4807 {
4808 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4809 	struct fib_dump_filter *filter = &arg->filter;
4810 	unsigned int flags = NLM_F_MULTI;
4811 	struct net *net = arg->net;
4812 
4813 	if (rt == net->ipv6.fib6_null_entry)
4814 		return 0;
4815 
4816 	if ((filter->flags & RTM_F_PREFIX) &&
4817 	    !(rt->fib6_flags & RTF_PREFIX_RT)) {
4818 		/* success since this is not a prefix route */
4819 		return 1;
4820 	}
4821 	if (filter->filter_set) {
4822 		if ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
4823 		    (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
4824 		    (filter->protocol && rt->fib6_protocol != filter->protocol)) {
4825 			return 1;
4826 		}
4827 		flags |= NLM_F_DUMP_FILTERED;
4828 	}
4829 
4830 	return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4831 			     RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4832 			     arg->cb->nlh->nlmsg_seq, flags);
4833 }
4834 
4835 static int inet6_rtm_valid_getroute_req(struct sk_buff *skb,
4836 					const struct nlmsghdr *nlh,
4837 					struct nlattr **tb,
4838 					struct netlink_ext_ack *extack)
4839 {
4840 	struct rtmsg *rtm;
4841 	int i, err;
4842 
4843 	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
4844 		NL_SET_ERR_MSG_MOD(extack,
4845 				   "Invalid header for get route request");
4846 		return -EINVAL;
4847 	}
4848 
4849 	if (!netlink_strict_get_check(skb))
4850 		return nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX,
4851 				   rtm_ipv6_policy, extack);
4852 
4853 	rtm = nlmsg_data(nlh);
4854 	if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) ||
4855 	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) ||
4856 	    rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope ||
4857 	    rtm->rtm_type) {
4858 		NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request");
4859 		return -EINVAL;
4860 	}
4861 	if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) {
4862 		NL_SET_ERR_MSG_MOD(extack,
4863 				   "Invalid flags for get route request");
4864 		return -EINVAL;
4865 	}
4866 
4867 	err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
4868 				 rtm_ipv6_policy, extack);
4869 	if (err)
4870 		return err;
4871 
4872 	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
4873 	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
4874 		NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6");
4875 		return -EINVAL;
4876 	}
4877 
4878 	for (i = 0; i <= RTA_MAX; i++) {
4879 		if (!tb[i])
4880 			continue;
4881 
4882 		switch (i) {
4883 		case RTA_SRC:
4884 		case RTA_DST:
4885 		case RTA_IIF:
4886 		case RTA_OIF:
4887 		case RTA_MARK:
4888 		case RTA_UID:
4889 		case RTA_SPORT:
4890 		case RTA_DPORT:
4891 		case RTA_IP_PROTO:
4892 			break;
4893 		default:
4894 			NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request");
4895 			return -EINVAL;
4896 		}
4897 	}
4898 
4899 	return 0;
4900 }
4901 
4902 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4903 			      struct netlink_ext_ack *extack)
4904 {
4905 	struct net *net = sock_net(in_skb->sk);
4906 	struct nlattr *tb[RTA_MAX+1];
4907 	int err, iif = 0, oif = 0;
4908 	struct fib6_info *from;
4909 	struct dst_entry *dst;
4910 	struct rt6_info *rt;
4911 	struct sk_buff *skb;
4912 	struct rtmsg *rtm;
4913 	struct flowi6 fl6 = {};
4914 	bool fibmatch;
4915 
4916 	err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
4917 	if (err < 0)
4918 		goto errout;
4919 
4920 	err = -EINVAL;
4921 	rtm = nlmsg_data(nlh);
4922 	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4923 	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4924 
4925 	if (tb[RTA_SRC]) {
4926 		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4927 			goto errout;
4928 
4929 		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4930 	}
4931 
4932 	if (tb[RTA_DST]) {
4933 		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4934 			goto errout;
4935 
4936 		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4937 	}
4938 
4939 	if (tb[RTA_IIF])
4940 		iif = nla_get_u32(tb[RTA_IIF]);
4941 
4942 	if (tb[RTA_OIF])
4943 		oif = nla_get_u32(tb[RTA_OIF]);
4944 
4945 	if (tb[RTA_MARK])
4946 		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4947 
4948 	if (tb[RTA_UID])
4949 		fl6.flowi6_uid = make_kuid(current_user_ns(),
4950 					   nla_get_u32(tb[RTA_UID]));
4951 	else
4952 		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4953 
4954 	if (tb[RTA_SPORT])
4955 		fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);
4956 
4957 	if (tb[RTA_DPORT])
4958 		fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);
4959 
4960 	if (tb[RTA_IP_PROTO]) {
4961 		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
4962 						  &fl6.flowi6_proto, AF_INET6,
4963 						  extack);
4964 		if (err)
4965 			goto errout;
4966 	}
4967 
4968 	if (iif) {
4969 		struct net_device *dev;
4970 		int flags = 0;
4971 
4972 		rcu_read_lock();
4973 
4974 		dev = dev_get_by_index_rcu(net, iif);
4975 		if (!dev) {
4976 			rcu_read_unlock();
4977 			err = -ENODEV;
4978 			goto errout;
4979 		}
4980 
4981 		fl6.flowi6_iif = iif;
4982 
4983 		if (!ipv6_addr_any(&fl6.saddr))
4984 			flags |= RT6_LOOKUP_F_HAS_SADDR;
4985 
4986 		dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
4987 
4988 		rcu_read_unlock();
4989 	} else {
4990 		fl6.flowi6_oif = oif;
4991 
4992 		dst = ip6_route_output(net, NULL, &fl6);
4993 	}
4994 
4995 
4996 	rt = container_of(dst, struct rt6_info, dst);
4997 	if (rt->dst.error) {
4998 		err = rt->dst.error;
4999 		ip6_rt_put(rt);
5000 		goto errout;
5001 	}
5002 
5003 	if (rt == net->ipv6.ip6_null_entry) {
5004 		err = rt->dst.error;
5005 		ip6_rt_put(rt);
5006 		goto errout;
5007 	}
5008 
5009 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
5010 	if (!skb) {
5011 		ip6_rt_put(rt);
5012 		err = -ENOBUFS;
5013 		goto errout;
5014 	}
5015 
5016 	skb_dst_set(skb, &rt->dst);
5017 
5018 	rcu_read_lock();
5019 	from = rcu_dereference(rt->from);
5020 	if (from) {
5021 		if (fibmatch)
5022 			err = rt6_fill_node(net, skb, from, NULL, NULL, NULL,
5023 					    iif, RTM_NEWROUTE,
5024 					    NETLINK_CB(in_skb).portid,
5025 					    nlh->nlmsg_seq, 0);
5026 		else
5027 			err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
5028 					    &fl6.saddr, iif, RTM_NEWROUTE,
5029 					    NETLINK_CB(in_skb).portid,
5030 					    nlh->nlmsg_seq, 0);
5031 	} else {
5032 		err = -ENETUNREACH;
5033 	}
5034 	rcu_read_unlock();
5035 
5036 	if (err < 0) {
5037 		kfree_skb(skb);
5038 		goto errout;
5039 	}
5040 
5041 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
5042 errout:
5043 	return err;
5044 }
5045 
5046 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
5047 		     unsigned int nlm_flags)
5048 {
5049 	struct sk_buff *skb;
5050 	struct net *net = info->nl_net;
5051 	u32 seq;
5052 	int err;
5053 
5054 	err = -ENOBUFS;
5055 	seq = info->nlh ? info->nlh->nlmsg_seq : 0;
5056 
5057 	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
5058 	if (!skb)
5059 		goto errout;
5060 
5061 	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
5062 			    event, info->portid, seq, nlm_flags);
5063 	if (err < 0) {
5064 		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
5065 		WARN_ON(err == -EMSGSIZE);
5066 		kfree_skb(skb);
5067 		goto errout;
5068 	}
5069 	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
5070 		    info->nlh, gfp_any());
5071 	return;
5072 errout:
5073 	if (err < 0)
5074 		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
5075 }
5076 
5077 static int ip6_route_dev_notify(struct notifier_block *this,
5078 				unsigned long event, void *ptr)
5079 {
5080 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
5081 	struct net *net = dev_net(dev);
5082 
5083 	if (!(dev->flags & IFF_LOOPBACK))
5084 		return NOTIFY_OK;
5085 
5086 	if (event == NETDEV_REGISTER) {
5087 		net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
5088 		net->ipv6.ip6_null_entry->dst.dev = dev;
5089 		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
5090 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5091 		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
5092 		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
5093 		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
5094 		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
5095 #endif
5096 	 } else if (event == NETDEV_UNREGISTER &&
5097 		    dev->reg_state != NETREG_UNREGISTERED) {
5098 		/* NETDEV_UNREGISTER could be fired for multiple times by
5099 		 * netdev_wait_allrefs(). Make sure we only call this once.
5100 		 */
5101 		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
5102 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5103 		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
5104 		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
5105 #endif
5106 	}
5107 
5108 	return NOTIFY_OK;
5109 }
5110 
5111 /*
5112  *	/proc
5113  */
5114 
5115 #ifdef CONFIG_PROC_FS
5116 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
5117 {
5118 	struct net *net = (struct net *)seq->private;
5119 	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
5120 		   net->ipv6.rt6_stats->fib_nodes,
5121 		   net->ipv6.rt6_stats->fib_route_nodes,
5122 		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
5123 		   net->ipv6.rt6_stats->fib_rt_entries,
5124 		   net->ipv6.rt6_stats->fib_rt_cache,
5125 		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
5126 		   net->ipv6.rt6_stats->fib_discarded_routes);
5127 
5128 	return 0;
5129 }
5130 #endif	/* CONFIG_PROC_FS */
5131 
5132 #ifdef CONFIG_SYSCTL
5133 
5134 static
5135 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
5136 			      void __user *buffer, size_t *lenp, loff_t *ppos)
5137 {
5138 	struct net *net;
5139 	int delay;
5140 	int ret;
5141 	if (!write)
5142 		return -EINVAL;
5143 
5144 	net = (struct net *)ctl->extra1;
5145 	delay = net->ipv6.sysctl.flush_delay;
5146 	ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
5147 	if (ret)
5148 		return ret;
5149 
5150 	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
5151 	return 0;
5152 }
5153 
5154 static int zero;
5155 static int one = 1;
5156 
5157 static struct ctl_table ipv6_route_table_template[] = {
5158 	{
5159 		.procname	=	"flush",
5160 		.data		=	&init_net.ipv6.sysctl.flush_delay,
5161 		.maxlen		=	sizeof(int),
5162 		.mode		=	0200,
5163 		.proc_handler	=	ipv6_sysctl_rtcache_flush
5164 	},
5165 	{
5166 		.procname	=	"gc_thresh",
5167 		.data		=	&ip6_dst_ops_template.gc_thresh,
5168 		.maxlen		=	sizeof(int),
5169 		.mode		=	0644,
5170 		.proc_handler	=	proc_dointvec,
5171 	},
5172 	{
5173 		.procname	=	"max_size",
5174 		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
5175 		.maxlen		=	sizeof(int),
5176 		.mode		=	0644,
5177 		.proc_handler	=	proc_dointvec,
5178 	},
5179 	{
5180 		.procname	=	"gc_min_interval",
5181 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5182 		.maxlen		=	sizeof(int),
5183 		.mode		=	0644,
5184 		.proc_handler	=	proc_dointvec_jiffies,
5185 	},
5186 	{
5187 		.procname	=	"gc_timeout",
5188 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
5189 		.maxlen		=	sizeof(int),
5190 		.mode		=	0644,
5191 		.proc_handler	=	proc_dointvec_jiffies,
5192 	},
5193 	{
5194 		.procname	=	"gc_interval",
5195 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
5196 		.maxlen		=	sizeof(int),
5197 		.mode		=	0644,
5198 		.proc_handler	=	proc_dointvec_jiffies,
5199 	},
5200 	{
5201 		.procname	=	"gc_elasticity",
5202 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
5203 		.maxlen		=	sizeof(int),
5204 		.mode		=	0644,
5205 		.proc_handler	=	proc_dointvec,
5206 	},
5207 	{
5208 		.procname	=	"mtu_expires",
5209 		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
5210 		.maxlen		=	sizeof(int),
5211 		.mode		=	0644,
5212 		.proc_handler	=	proc_dointvec_jiffies,
5213 	},
5214 	{
5215 		.procname	=	"min_adv_mss",
5216 		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
5217 		.maxlen		=	sizeof(int),
5218 		.mode		=	0644,
5219 		.proc_handler	=	proc_dointvec,
5220 	},
5221 	{
5222 		.procname	=	"gc_min_interval_ms",
5223 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5224 		.maxlen		=	sizeof(int),
5225 		.mode		=	0644,
5226 		.proc_handler	=	proc_dointvec_ms_jiffies,
5227 	},
5228 	{
5229 		.procname	=	"skip_notify_on_dev_down",
5230 		.data		=	&init_net.ipv6.sysctl.skip_notify_on_dev_down,
5231 		.maxlen		=	sizeof(int),
5232 		.mode		=	0644,
5233 		.proc_handler	=	proc_dointvec,
5234 		.extra1		=	&zero,
5235 		.extra2		=	&one,
5236 	},
5237 	{ }
5238 };
5239 
5240 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
5241 {
5242 	struct ctl_table *table;
5243 
5244 	table = kmemdup(ipv6_route_table_template,
5245 			sizeof(ipv6_route_table_template),
5246 			GFP_KERNEL);
5247 
5248 	if (table) {
5249 		table[0].data = &net->ipv6.sysctl.flush_delay;
5250 		table[0].extra1 = net;
5251 		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
5252 		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
5253 		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5254 		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
5255 		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
5256 		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
5257 		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
5258 		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
5259 		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5260 		table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;
5261 
5262 		/* Don't export sysctls to unprivileged users */
5263 		if (net->user_ns != &init_user_ns)
5264 			table[0].procname = NULL;
5265 	}
5266 
5267 	return table;
5268 }
5269 #endif
5270 
5271 static int __net_init ip6_route_net_init(struct net *net)
5272 {
5273 	int ret = -ENOMEM;
5274 
5275 	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
5276 	       sizeof(net->ipv6.ip6_dst_ops));
5277 
5278 	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
5279 		goto out_ip6_dst_ops;
5280 
5281 	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
5282 					    sizeof(*net->ipv6.fib6_null_entry),
5283 					    GFP_KERNEL);
5284 	if (!net->ipv6.fib6_null_entry)
5285 		goto out_ip6_dst_entries;
5286 
5287 	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
5288 					   sizeof(*net->ipv6.ip6_null_entry),
5289 					   GFP_KERNEL);
5290 	if (!net->ipv6.ip6_null_entry)
5291 		goto out_fib6_null_entry;
5292 	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5293 	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
5294 			 ip6_template_metrics, true);
5295 
5296 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5297 	net->ipv6.fib6_has_custom_rules = false;
5298 	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
5299 					       sizeof(*net->ipv6.ip6_prohibit_entry),
5300 					       GFP_KERNEL);
5301 	if (!net->ipv6.ip6_prohibit_entry)
5302 		goto out_ip6_null_entry;
5303 	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5304 	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
5305 			 ip6_template_metrics, true);
5306 
5307 	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
5308 					       sizeof(*net->ipv6.ip6_blk_hole_entry),
5309 					       GFP_KERNEL);
5310 	if (!net->ipv6.ip6_blk_hole_entry)
5311 		goto out_ip6_prohibit_entry;
5312 	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5313 	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
5314 			 ip6_template_metrics, true);
5315 #endif
5316 
5317 	net->ipv6.sysctl.flush_delay = 0;
5318 	net->ipv6.sysctl.ip6_rt_max_size = 4096;
5319 	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
5320 	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
5321 	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
5322 	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
5323 	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
5324 	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
5325 	net->ipv6.sysctl.skip_notify_on_dev_down = 0;
5326 
5327 	net->ipv6.ip6_rt_gc_expire = 30*HZ;
5328 
5329 	ret = 0;
5330 out:
5331 	return ret;
5332 
5333 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5334 out_ip6_prohibit_entry:
5335 	kfree(net->ipv6.ip6_prohibit_entry);
5336 out_ip6_null_entry:
5337 	kfree(net->ipv6.ip6_null_entry);
5338 #endif
5339 out_fib6_null_entry:
5340 	kfree(net->ipv6.fib6_null_entry);
5341 out_ip6_dst_entries:
5342 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5343 out_ip6_dst_ops:
5344 	goto out;
5345 }
5346 
5347 static void __net_exit ip6_route_net_exit(struct net *net)
5348 {
5349 	kfree(net->ipv6.fib6_null_entry);
5350 	kfree(net->ipv6.ip6_null_entry);
5351 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5352 	kfree(net->ipv6.ip6_prohibit_entry);
5353 	kfree(net->ipv6.ip6_blk_hole_entry);
5354 #endif
5355 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5356 }
5357 
5358 static int __net_init ip6_route_net_init_late(struct net *net)
5359 {
5360 #ifdef CONFIG_PROC_FS
5361 	proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
5362 			sizeof(struct ipv6_route_iter));
5363 	proc_create_net_single("rt6_stats", 0444, net->proc_net,
5364 			rt6_stats_seq_show, NULL);
5365 #endif
5366 	return 0;
5367 }
5368 
5369 static void __net_exit ip6_route_net_exit_late(struct net *net)
5370 {
5371 #ifdef CONFIG_PROC_FS
5372 	remove_proc_entry("ipv6_route", net->proc_net);
5373 	remove_proc_entry("rt6_stats", net->proc_net);
5374 #endif
5375 }
5376 
5377 static struct pernet_operations ip6_route_net_ops = {
5378 	.init = ip6_route_net_init,
5379 	.exit = ip6_route_net_exit,
5380 };
5381 
5382 static int __net_init ipv6_inetpeer_init(struct net *net)
5383 {
5384 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5385 
5386 	if (!bp)
5387 		return -ENOMEM;
5388 	inet_peer_base_init(bp);
5389 	net->ipv6.peers = bp;
5390 	return 0;
5391 }
5392 
5393 static void __net_exit ipv6_inetpeer_exit(struct net *net)
5394 {
5395 	struct inet_peer_base *bp = net->ipv6.peers;
5396 
5397 	net->ipv6.peers = NULL;
5398 	inetpeer_invalidate_tree(bp);
5399 	kfree(bp);
5400 }
5401 
5402 static struct pernet_operations ipv6_inetpeer_ops = {
5403 	.init	=	ipv6_inetpeer_init,
5404 	.exit	=	ipv6_inetpeer_exit,
5405 };
5406 
5407 static struct pernet_operations ip6_route_net_late_ops = {
5408 	.init = ip6_route_net_init_late,
5409 	.exit = ip6_route_net_exit_late,
5410 };
5411 
5412 static struct notifier_block ip6_route_dev_notifier = {
5413 	.notifier_call = ip6_route_dev_notify,
5414 	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
5415 };
5416 
5417 void __init ip6_route_init_special_entries(void)
5418 {
5419 	/* Registering of the loopback is done before this portion of code,
5420 	 * the loopback reference in rt6_info will not be taken, do it
5421 	 * manually for init_net */
5422 	init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
5423 	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
5424 	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5425   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5426 	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
5427 	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5428 	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
5429 	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5430   #endif
5431 }
5432 
5433 int __init ip6_route_init(void)
5434 {
5435 	int ret;
5436 	int cpu;
5437 
5438 	ret = -ENOMEM;
5439 	ip6_dst_ops_template.kmem_cachep =
5440 		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
5441 				  SLAB_HWCACHE_ALIGN, NULL);
5442 	if (!ip6_dst_ops_template.kmem_cachep)
5443 		goto out;
5444 
5445 	ret = dst_entries_init(&ip6_dst_blackhole_ops);
5446 	if (ret)
5447 		goto out_kmem_cache;
5448 
5449 	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
5450 	if (ret)
5451 		goto out_dst_entries;
5452 
5453 	ret = register_pernet_subsys(&ip6_route_net_ops);
5454 	if (ret)
5455 		goto out_register_inetpeer;
5456 
5457 	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
5458 
5459 	ret = fib6_init();
5460 	if (ret)
5461 		goto out_register_subsys;
5462 
5463 	ret = xfrm6_init();
5464 	if (ret)
5465 		goto out_fib6_init;
5466 
5467 	ret = fib6_rules_init();
5468 	if (ret)
5469 		goto xfrm6_init;
5470 
5471 	ret = register_pernet_subsys(&ip6_route_net_late_ops);
5472 	if (ret)
5473 		goto fib6_rules_init;
5474 
5475 	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
5476 				   inet6_rtm_newroute, NULL, 0);
5477 	if (ret < 0)
5478 		goto out_register_late_subsys;
5479 
5480 	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
5481 				   inet6_rtm_delroute, NULL, 0);
5482 	if (ret < 0)
5483 		goto out_register_late_subsys;
5484 
5485 	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
5486 				   inet6_rtm_getroute, NULL,
5487 				   RTNL_FLAG_DOIT_UNLOCKED);
5488 	if (ret < 0)
5489 		goto out_register_late_subsys;
5490 
5491 	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
5492 	if (ret)
5493 		goto out_register_late_subsys;
5494 
5495 	for_each_possible_cpu(cpu) {
5496 		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
5497 
5498 		INIT_LIST_HEAD(&ul->head);
5499 		spin_lock_init(&ul->lock);
5500 	}
5501 
5502 out:
5503 	return ret;
5504 
5505 out_register_late_subsys:
5506 	rtnl_unregister_all(PF_INET6);
5507 	unregister_pernet_subsys(&ip6_route_net_late_ops);
5508 fib6_rules_init:
5509 	fib6_rules_cleanup();
5510 xfrm6_init:
5511 	xfrm6_fini();
5512 out_fib6_init:
5513 	fib6_gc_cleanup();
5514 out_register_subsys:
5515 	unregister_pernet_subsys(&ip6_route_net_ops);
5516 out_register_inetpeer:
5517 	unregister_pernet_subsys(&ipv6_inetpeer_ops);
5518 out_dst_entries:
5519 	dst_entries_destroy(&ip6_dst_blackhole_ops);
5520 out_kmem_cache:
5521 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5522 	goto out;
5523 }
5524 
5525 void ip6_route_cleanup(void)
5526 {
5527 	unregister_netdevice_notifier(&ip6_route_dev_notifier);
5528 	unregister_pernet_subsys(&ip6_route_net_late_ops);
5529 	fib6_rules_cleanup();
5530 	xfrm6_fini();
5531 	fib6_gc_cleanup();
5532 	unregister_pernet_subsys(&ipv6_inetpeer_ops);
5533 	unregister_pernet_subsys(&ip6_route_net_ops);
5534 	dst_entries_destroy(&ip6_dst_blackhole_ops);
5535 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5536 }
5537