xref: /openbmc/linux/net/ipv6/route.c (revision 0d16158149ab6b02fcd945b2f5a5cf31262a445b)
1 /*
2  *	Linux INET6 implementation
3  *	FIB front-end.
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13 
14 /*	Changes:
15  *
16  *	YOSHIFUJI Hideaki @USAGI
17  *		reworked default router selection.
18  *		- respect outgoing interface
19  *		- select from (probably) reachable routers (i.e.
20  *		routers in REACHABLE, STALE, DELAY or PROBE states).
21  *		- always select the same router if it is (probably)
22  *		reachable.  otherwise, round-robin the list.
23  *	Ville Nuorvala
24  *		Fixed routing subtrees.
25  */
26 
27 #define pr_fmt(fmt) "IPv6: " fmt
28 
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <net/ip.h>
67 #include <linux/uaccess.h>
68 
69 #ifdef CONFIG_SYSCTL
70 #include <linux/sysctl.h>
71 #endif
72 
73 static int ip6_rt_type_to_error(u8 fib6_type);
74 
75 #define CREATE_TRACE_POINTS
76 #include <trace/events/fib6.h>
77 EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
78 #undef CREATE_TRACE_POINTS
79 
80 enum rt6_nud_state {
81 	RT6_NUD_FAIL_HARD = -3,
82 	RT6_NUD_FAIL_PROBE = -2,
83 	RT6_NUD_FAIL_DO_RR = -1,
84 	RT6_NUD_SUCCEED = 1
85 };
86 
87 static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
88 static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
89 static unsigned int	 ip6_mtu(const struct dst_entry *dst);
90 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
91 static void		ip6_dst_destroy(struct dst_entry *);
92 static void		ip6_dst_ifdown(struct dst_entry *,
93 				       struct net_device *dev, int how);
94 static int		 ip6_dst_gc(struct dst_ops *ops);
95 
96 static int		ip6_pkt_discard(struct sk_buff *skb);
97 static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
98 static int		ip6_pkt_prohibit(struct sk_buff *skb);
99 static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
100 static void		ip6_link_failure(struct sk_buff *skb);
101 static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
102 					   struct sk_buff *skb, u32 mtu);
103 static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
104 					struct sk_buff *skb);
105 static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
106 			   int strict);
107 static size_t rt6_nlmsg_size(struct fib6_info *rt);
108 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
109 			 struct fib6_info *rt, struct dst_entry *dst,
110 			 struct in6_addr *dest, struct in6_addr *src,
111 			 int iif, int type, u32 portid, u32 seq,
112 			 unsigned int flags);
113 static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
114 					   struct in6_addr *daddr,
115 					   struct in6_addr *saddr);
116 
117 #ifdef CONFIG_IPV6_ROUTE_INFO
118 static struct fib6_info *rt6_add_route_info(struct net *net,
119 					   const struct in6_addr *prefix, int prefixlen,
120 					   const struct in6_addr *gwaddr,
121 					   struct net_device *dev,
122 					   unsigned int pref);
123 static struct fib6_info *rt6_get_route_info(struct net *net,
124 					   const struct in6_addr *prefix, int prefixlen,
125 					   const struct in6_addr *gwaddr,
126 					   struct net_device *dev);
127 #endif
128 
129 struct uncached_list {
130 	spinlock_t		lock;
131 	struct list_head	head;
132 };
133 
134 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
135 
136 void rt6_uncached_list_add(struct rt6_info *rt)
137 {
138 	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
139 
140 	rt->rt6i_uncached_list = ul;
141 
142 	spin_lock_bh(&ul->lock);
143 	list_add_tail(&rt->rt6i_uncached, &ul->head);
144 	spin_unlock_bh(&ul->lock);
145 }
146 
147 void rt6_uncached_list_del(struct rt6_info *rt)
148 {
149 	if (!list_empty(&rt->rt6i_uncached)) {
150 		struct uncached_list *ul = rt->rt6i_uncached_list;
151 		struct net *net = dev_net(rt->dst.dev);
152 
153 		spin_lock_bh(&ul->lock);
154 		list_del(&rt->rt6i_uncached);
155 		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
156 		spin_unlock_bh(&ul->lock);
157 	}
158 }
159 
160 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
161 {
162 	struct net_device *loopback_dev = net->loopback_dev;
163 	int cpu;
164 
165 	if (dev == loopback_dev)
166 		return;
167 
168 	for_each_possible_cpu(cpu) {
169 		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
170 		struct rt6_info *rt;
171 
172 		spin_lock_bh(&ul->lock);
173 		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
174 			struct inet6_dev *rt_idev = rt->rt6i_idev;
175 			struct net_device *rt_dev = rt->dst.dev;
176 
177 			if (rt_idev->dev == dev) {
178 				rt->rt6i_idev = in6_dev_get(loopback_dev);
179 				in6_dev_put(rt_idev);
180 			}
181 
182 			if (rt_dev == dev) {
183 				rt->dst.dev = loopback_dev;
184 				dev_hold(rt->dst.dev);
185 				dev_put(rt_dev);
186 			}
187 		}
188 		spin_unlock_bh(&ul->lock);
189 	}
190 }
191 
192 static inline const void *choose_neigh_daddr(const struct in6_addr *p,
193 					     struct sk_buff *skb,
194 					     const void *daddr)
195 {
196 	if (!ipv6_addr_any(p))
197 		return (const void *) p;
198 	else if (skb)
199 		return &ipv6_hdr(skb)->daddr;
200 	return daddr;
201 }
202 
203 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
204 				   struct net_device *dev,
205 				   struct sk_buff *skb,
206 				   const void *daddr)
207 {
208 	struct neighbour *n;
209 
210 	daddr = choose_neigh_daddr(gw, skb, daddr);
211 	n = __ipv6_neigh_lookup(dev, daddr);
212 	if (n)
213 		return n;
214 
215 	n = neigh_create(&nd_tbl, daddr, dev);
216 	return IS_ERR(n) ? NULL : n;
217 }
218 
219 static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
220 					      struct sk_buff *skb,
221 					      const void *daddr)
222 {
223 	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);
224 
225 	return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
226 }
227 
228 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
229 {
230 	struct net_device *dev = dst->dev;
231 	struct rt6_info *rt = (struct rt6_info *)dst;
232 
233 	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
234 	if (!daddr)
235 		return;
236 	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
237 		return;
238 	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
239 		return;
240 	__ipv6_confirm_neigh(dev, daddr);
241 }
242 
243 static struct dst_ops ip6_dst_ops_template = {
244 	.family			=	AF_INET6,
245 	.gc			=	ip6_dst_gc,
246 	.gc_thresh		=	1024,
247 	.check			=	ip6_dst_check,
248 	.default_advmss		=	ip6_default_advmss,
249 	.mtu			=	ip6_mtu,
250 	.cow_metrics		=	dst_cow_metrics_generic,
251 	.destroy		=	ip6_dst_destroy,
252 	.ifdown			=	ip6_dst_ifdown,
253 	.negative_advice	=	ip6_negative_advice,
254 	.link_failure		=	ip6_link_failure,
255 	.update_pmtu		=	ip6_rt_update_pmtu,
256 	.redirect		=	rt6_do_redirect,
257 	.local_out		=	__ip6_local_out,
258 	.neigh_lookup		=	ip6_dst_neigh_lookup,
259 	.confirm_neigh		=	ip6_confirm_neigh,
260 };
261 
262 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
263 {
264 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
265 
266 	return mtu ? : dst->dev->mtu;
267 }
268 
269 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
270 					 struct sk_buff *skb, u32 mtu)
271 {
272 }
273 
274 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
275 				      struct sk_buff *skb)
276 {
277 }
278 
279 static struct dst_ops ip6_dst_blackhole_ops = {
280 	.family			=	AF_INET6,
281 	.destroy		=	ip6_dst_destroy,
282 	.check			=	ip6_dst_check,
283 	.mtu			=	ip6_blackhole_mtu,
284 	.default_advmss		=	ip6_default_advmss,
285 	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
286 	.redirect		=	ip6_rt_blackhole_redirect,
287 	.cow_metrics		=	dst_cow_metrics_generic,
288 	.neigh_lookup		=	ip6_dst_neigh_lookup,
289 };
290 
291 static const u32 ip6_template_metrics[RTAX_MAX] = {
292 	[RTAX_HOPLIMIT - 1] = 0,
293 };
294 
295 static const struct fib6_info fib6_null_entry_template = {
296 	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
297 	.fib6_protocol  = RTPROT_KERNEL,
298 	.fib6_metric	= ~(u32)0,
299 	.fib6_ref	= ATOMIC_INIT(1),
300 	.fib6_type	= RTN_UNREACHABLE,
301 	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
302 };
303 
304 static const struct rt6_info ip6_null_entry_template = {
305 	.dst = {
306 		.__refcnt	= ATOMIC_INIT(1),
307 		.__use		= 1,
308 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
309 		.error		= -ENETUNREACH,
310 		.input		= ip6_pkt_discard,
311 		.output		= ip6_pkt_discard_out,
312 	},
313 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
314 };
315 
316 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
317 
318 static const struct rt6_info ip6_prohibit_entry_template = {
319 	.dst = {
320 		.__refcnt	= ATOMIC_INIT(1),
321 		.__use		= 1,
322 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
323 		.error		= -EACCES,
324 		.input		= ip6_pkt_prohibit,
325 		.output		= ip6_pkt_prohibit_out,
326 	},
327 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
328 };
329 
330 static const struct rt6_info ip6_blk_hole_entry_template = {
331 	.dst = {
332 		.__refcnt	= ATOMIC_INIT(1),
333 		.__use		= 1,
334 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
335 		.error		= -EINVAL,
336 		.input		= dst_discard,
337 		.output		= dst_discard_out,
338 	},
339 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
340 };
341 
342 #endif
343 
344 static void rt6_info_init(struct rt6_info *rt)
345 {
346 	struct dst_entry *dst = &rt->dst;
347 
348 	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
349 	INIT_LIST_HEAD(&rt->rt6i_uncached);
350 }
351 
352 /* allocate dst with ip6_dst_ops */
353 struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
354 			       int flags)
355 {
356 	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
357 					1, DST_OBSOLETE_FORCE_CHK, flags);
358 
359 	if (rt) {
360 		rt6_info_init(rt);
361 		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
362 	}
363 
364 	return rt;
365 }
366 EXPORT_SYMBOL(ip6_dst_alloc);
367 
368 static void ip6_dst_destroy(struct dst_entry *dst)
369 {
370 	struct rt6_info *rt = (struct rt6_info *)dst;
371 	struct fib6_info *from;
372 	struct inet6_dev *idev;
373 
374 	ip_dst_metrics_put(dst);
375 	rt6_uncached_list_del(rt);
376 
377 	idev = rt->rt6i_idev;
378 	if (idev) {
379 		rt->rt6i_idev = NULL;
380 		in6_dev_put(idev);
381 	}
382 
383 	rcu_read_lock();
384 	from = rcu_dereference(rt->from);
385 	rcu_assign_pointer(rt->from, NULL);
386 	fib6_info_release(from);
387 	rcu_read_unlock();
388 }
389 
390 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
391 			   int how)
392 {
393 	struct rt6_info *rt = (struct rt6_info *)dst;
394 	struct inet6_dev *idev = rt->rt6i_idev;
395 	struct net_device *loopback_dev =
396 		dev_net(dev)->loopback_dev;
397 
398 	if (idev && idev->dev != loopback_dev) {
399 		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
400 		if (loopback_idev) {
401 			rt->rt6i_idev = loopback_idev;
402 			in6_dev_put(idev);
403 		}
404 	}
405 }
406 
407 static bool __rt6_check_expired(const struct rt6_info *rt)
408 {
409 	if (rt->rt6i_flags & RTF_EXPIRES)
410 		return time_after(jiffies, rt->dst.expires);
411 	else
412 		return false;
413 }
414 
415 static bool rt6_check_expired(const struct rt6_info *rt)
416 {
417 	struct fib6_info *from;
418 
419 	from = rcu_dereference(rt->from);
420 
421 	if (rt->rt6i_flags & RTF_EXPIRES) {
422 		if (time_after(jiffies, rt->dst.expires))
423 			return true;
424 	} else if (from) {
425 		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
426 			fib6_check_expired(from);
427 	}
428 	return false;
429 }
430 
431 void fib6_select_path(const struct net *net, struct fib6_result *res,
432 		      struct flowi6 *fl6, int oif, bool have_oif_match,
433 		      const struct sk_buff *skb, int strict)
434 {
435 	struct fib6_info *sibling, *next_sibling;
436 	struct fib6_info *match = res->f6i;
437 
438 	if (!match->fib6_nsiblings || have_oif_match)
439 		goto out;
440 
441 	/* We might have already computed the hash for ICMPv6 errors. In such
442 	 * case it will always be non-zero. Otherwise now is the time to do it.
443 	 */
444 	if (!fl6->mp_hash)
445 		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
446 
447 	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.fib_nh_upper_bound))
448 		goto out;
449 
450 	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
451 				 fib6_siblings) {
452 		const struct fib6_nh *nh = &sibling->fib6_nh;
453 		int nh_upper_bound;
454 
455 		nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound);
456 		if (fl6->mp_hash > nh_upper_bound)
457 			continue;
458 		if (rt6_score_route(nh, sibling->fib6_flags, oif, strict) < 0)
459 			break;
460 		match = sibling;
461 		break;
462 	}
463 
464 out:
465 	res->f6i = match;
466 	res->nh = &match->fib6_nh;
467 }
468 
469 /*
470  *	Route lookup. rcu_read_lock() should be held.
471  */
472 
473 static bool __rt6_device_match(struct net *net, const struct fib6_nh *nh,
474 			       const struct in6_addr *saddr, int oif, int flags)
475 {
476 	const struct net_device *dev;
477 
478 	if (nh->fib_nh_flags & RTNH_F_DEAD)
479 		return false;
480 
481 	dev = nh->fib_nh_dev;
482 	if (oif) {
483 		if (dev->ifindex == oif)
484 			return true;
485 	} else {
486 		if (ipv6_chk_addr(net, saddr, dev,
487 				  flags & RT6_LOOKUP_F_IFACE))
488 			return true;
489 	}
490 
491 	return false;
492 }
493 
494 static inline struct fib6_info *rt6_device_match(struct net *net,
495 						 struct fib6_info *rt,
496 						    const struct in6_addr *saddr,
497 						    int oif,
498 						    int flags)
499 {
500 	const struct fib6_nh *nh;
501 	struct fib6_info *sprt;
502 
503 	if (!oif && ipv6_addr_any(saddr) &&
504 	    !(rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD))
505 		return rt;
506 
507 	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
508 		nh = &sprt->fib6_nh;
509 		if (__rt6_device_match(net, nh, saddr, oif, flags))
510 			return sprt;
511 	}
512 
513 	if (oif && flags & RT6_LOOKUP_F_IFACE)
514 		return net->ipv6.fib6_null_entry;
515 
516 	return rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
517 }
518 
519 #ifdef CONFIG_IPV6_ROUTER_PREF
520 struct __rt6_probe_work {
521 	struct work_struct work;
522 	struct in6_addr target;
523 	struct net_device *dev;
524 };
525 
526 static void rt6_probe_deferred(struct work_struct *w)
527 {
528 	struct in6_addr mcaddr;
529 	struct __rt6_probe_work *work =
530 		container_of(w, struct __rt6_probe_work, work);
531 
532 	addrconf_addr_solict_mult(&work->target, &mcaddr);
533 	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
534 	dev_put(work->dev);
535 	kfree(work);
536 }
537 
538 static void rt6_probe(struct fib6_nh *fib6_nh)
539 {
540 	struct __rt6_probe_work *work = NULL;
541 	const struct in6_addr *nh_gw;
542 	struct neighbour *neigh;
543 	struct net_device *dev;
544 	struct inet6_dev *idev;
545 
546 	/*
547 	 * Okay, this does not seem to be appropriate
548 	 * for now, however, we need to check if it
549 	 * is really so; aka Router Reachability Probing.
550 	 *
551 	 * Router Reachability Probe MUST be rate-limited
552 	 * to no more than one per minute.
553 	 */
554 	if (fib6_nh->fib_nh_gw_family)
555 		return;
556 
557 	nh_gw = &fib6_nh->fib_nh_gw6;
558 	dev = fib6_nh->fib_nh_dev;
559 	rcu_read_lock_bh();
560 	idev = __in6_dev_get(dev);
561 	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
562 	if (neigh) {
563 		if (neigh->nud_state & NUD_VALID)
564 			goto out;
565 
566 		write_lock(&neigh->lock);
567 		if (!(neigh->nud_state & NUD_VALID) &&
568 		    time_after(jiffies,
569 			       neigh->updated + idev->cnf.rtr_probe_interval)) {
570 			work = kmalloc(sizeof(*work), GFP_ATOMIC);
571 			if (work)
572 				__neigh_set_probe_once(neigh);
573 		}
574 		write_unlock(&neigh->lock);
575 	} else if (time_after(jiffies, fib6_nh->last_probe +
576 				       idev->cnf.rtr_probe_interval)) {
577 		work = kmalloc(sizeof(*work), GFP_ATOMIC);
578 	}
579 
580 	if (work) {
581 		fib6_nh->last_probe = jiffies;
582 		INIT_WORK(&work->work, rt6_probe_deferred);
583 		work->target = *nh_gw;
584 		dev_hold(dev);
585 		work->dev = dev;
586 		schedule_work(&work->work);
587 	}
588 
589 out:
590 	rcu_read_unlock_bh();
591 }
592 #else
593 static inline void rt6_probe(struct fib6_nh *fib6_nh)
594 {
595 }
596 #endif
597 
598 /*
599  * Default Router Selection (RFC 2461 6.3.6)
600  */
601 static enum rt6_nud_state rt6_check_neigh(const struct fib6_nh *fib6_nh)
602 {
603 	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
604 	struct neighbour *neigh;
605 
606 	rcu_read_lock_bh();
607 	neigh = __ipv6_neigh_lookup_noref(fib6_nh->fib_nh_dev,
608 					  &fib6_nh->fib_nh_gw6);
609 	if (neigh) {
610 		read_lock(&neigh->lock);
611 		if (neigh->nud_state & NUD_VALID)
612 			ret = RT6_NUD_SUCCEED;
613 #ifdef CONFIG_IPV6_ROUTER_PREF
614 		else if (!(neigh->nud_state & NUD_FAILED))
615 			ret = RT6_NUD_SUCCEED;
616 		else
617 			ret = RT6_NUD_FAIL_PROBE;
618 #endif
619 		read_unlock(&neigh->lock);
620 	} else {
621 		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
622 		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
623 	}
624 	rcu_read_unlock_bh();
625 
626 	return ret;
627 }
628 
629 static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
630 			   int strict)
631 {
632 	int m = 0;
633 
634 	if (!oif || nh->fib_nh_dev->ifindex == oif)
635 		m = 2;
636 
637 	if (!m && (strict & RT6_LOOKUP_F_IFACE))
638 		return RT6_NUD_FAIL_HARD;
639 #ifdef CONFIG_IPV6_ROUTER_PREF
640 	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(fib6_flags)) << 2;
641 #endif
642 	if ((strict & RT6_LOOKUP_F_REACHABLE) &&
643 	    !(fib6_flags & RTF_NONEXTHOP) && nh->fib_nh_gw_family) {
644 		int n = rt6_check_neigh(nh);
645 		if (n < 0)
646 			return n;
647 	}
648 	return m;
649 }
650 
651 static bool find_match(struct fib6_nh *nh, u32 fib6_flags,
652 		       int oif, int strict, int *mpri, bool *do_rr)
653 {
654 	bool match_do_rr = false;
655 	bool rc = false;
656 	int m;
657 
658 	if (nh->fib_nh_flags & RTNH_F_DEAD)
659 		goto out;
660 
661 	if (ip6_ignore_linkdown(nh->fib_nh_dev) &&
662 	    nh->fib_nh_flags & RTNH_F_LINKDOWN &&
663 	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
664 		goto out;
665 
666 	m = rt6_score_route(nh, fib6_flags, oif, strict);
667 	if (m == RT6_NUD_FAIL_DO_RR) {
668 		match_do_rr = true;
669 		m = 0; /* lowest valid score */
670 	} else if (m == RT6_NUD_FAIL_HARD) {
671 		goto out;
672 	}
673 
674 	if (strict & RT6_LOOKUP_F_REACHABLE)
675 		rt6_probe(nh);
676 
677 	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
678 	if (m > *mpri) {
679 		*do_rr = match_do_rr;
680 		*mpri = m;
681 		rc = true;
682 	}
683 out:
684 	return rc;
685 }
686 
687 static void __find_rr_leaf(struct fib6_info *rt_start,
688 			   struct fib6_info *nomatch, u32 metric,
689 			   struct fib6_info **match, struct fib6_info **cont,
690 			   int oif, int strict, bool *do_rr, int *mpri)
691 {
692 	struct fib6_info *rt;
693 
694 	for (rt = rt_start;
695 	     rt && rt != nomatch;
696 	     rt = rcu_dereference(rt->fib6_next)) {
697 		struct fib6_nh *nh;
698 
699 		if (cont && rt->fib6_metric != metric) {
700 			*cont = rt;
701 			return;
702 		}
703 
704 		if (fib6_check_expired(rt))
705 			continue;
706 
707 		nh = &rt->fib6_nh;
708 		if (find_match(nh, rt->fib6_flags, oif, strict, mpri, do_rr))
709 			*match = rt;
710 	}
711 }
712 
713 static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
714 				      struct fib6_info *leaf,
715 				      struct fib6_info *rr_head,
716 				      u32 metric, int oif, int strict,
717 				      bool *do_rr)
718 {
719 	struct fib6_info *match = NULL, *cont = NULL;
720 	int mpri = -1;
721 
722 	__find_rr_leaf(rr_head, NULL, metric, &match, &cont,
723 		       oif, strict, do_rr, &mpri);
724 
725 	__find_rr_leaf(leaf, rr_head, metric, &match, &cont,
726 		       oif, strict, do_rr, &mpri);
727 
728 	if (match || !cont)
729 		return match;
730 
731 	__find_rr_leaf(cont, NULL, metric, &match, NULL,
732 		       oif, strict, do_rr, &mpri);
733 
734 	return match;
735 }
736 
737 static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
738 				   int oif, int strict)
739 {
740 	struct fib6_info *leaf = rcu_dereference(fn->leaf);
741 	struct fib6_info *match, *rt0;
742 	bool do_rr = false;
743 	int key_plen;
744 
745 	if (!leaf || leaf == net->ipv6.fib6_null_entry)
746 		return net->ipv6.fib6_null_entry;
747 
748 	rt0 = rcu_dereference(fn->rr_ptr);
749 	if (!rt0)
750 		rt0 = leaf;
751 
752 	/* Double check to make sure fn is not an intermediate node
753 	 * and fn->leaf does not points to its child's leaf
754 	 * (This might happen if all routes under fn are deleted from
755 	 * the tree and fib6_repair_tree() is called on the node.)
756 	 */
757 	key_plen = rt0->fib6_dst.plen;
758 #ifdef CONFIG_IPV6_SUBTREES
759 	if (rt0->fib6_src.plen)
760 		key_plen = rt0->fib6_src.plen;
761 #endif
762 	if (fn->fn_bit != key_plen)
763 		return net->ipv6.fib6_null_entry;
764 
765 	match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
766 			     &do_rr);
767 
768 	if (do_rr) {
769 		struct fib6_info *next = rcu_dereference(rt0->fib6_next);
770 
771 		/* no entries matched; do round-robin */
772 		if (!next || next->fib6_metric != rt0->fib6_metric)
773 			next = leaf;
774 
775 		if (next != rt0) {
776 			spin_lock_bh(&leaf->fib6_table->tb6_lock);
777 			/* make sure next is not being deleted from the tree */
778 			if (next->fib6_node)
779 				rcu_assign_pointer(fn->rr_ptr, next);
780 			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
781 		}
782 	}
783 
784 	return match ? match : net->ipv6.fib6_null_entry;
785 }
786 
787 static bool rt6_is_gw_or_nonexthop(const struct fib6_result *res)
788 {
789 	return (res->f6i->fib6_flags & RTF_NONEXTHOP) ||
790 	       res->nh->fib_nh_gw_family;
791 }
792 
793 #ifdef CONFIG_IPV6_ROUTE_INFO
794 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
795 		  const struct in6_addr *gwaddr)
796 {
797 	struct net *net = dev_net(dev);
798 	struct route_info *rinfo = (struct route_info *) opt;
799 	struct in6_addr prefix_buf, *prefix;
800 	unsigned int pref;
801 	unsigned long lifetime;
802 	struct fib6_info *rt;
803 
804 	if (len < sizeof(struct route_info)) {
805 		return -EINVAL;
806 	}
807 
808 	/* Sanity check for prefix_len and length */
809 	if (rinfo->length > 3) {
810 		return -EINVAL;
811 	} else if (rinfo->prefix_len > 128) {
812 		return -EINVAL;
813 	} else if (rinfo->prefix_len > 64) {
814 		if (rinfo->length < 2) {
815 			return -EINVAL;
816 		}
817 	} else if (rinfo->prefix_len > 0) {
818 		if (rinfo->length < 1) {
819 			return -EINVAL;
820 		}
821 	}
822 
823 	pref = rinfo->route_pref;
824 	if (pref == ICMPV6_ROUTER_PREF_INVALID)
825 		return -EINVAL;
826 
827 	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
828 
829 	if (rinfo->length == 3)
830 		prefix = (struct in6_addr *)rinfo->prefix;
831 	else {
832 		/* this function is safe */
833 		ipv6_addr_prefix(&prefix_buf,
834 				 (struct in6_addr *)rinfo->prefix,
835 				 rinfo->prefix_len);
836 		prefix = &prefix_buf;
837 	}
838 
839 	if (rinfo->prefix_len == 0)
840 		rt = rt6_get_dflt_router(net, gwaddr, dev);
841 	else
842 		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
843 					gwaddr, dev);
844 
845 	if (rt && !lifetime) {
846 		ip6_del_rt(net, rt);
847 		rt = NULL;
848 	}
849 
850 	if (!rt && lifetime)
851 		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
852 					dev, pref);
853 	else if (rt)
854 		rt->fib6_flags = RTF_ROUTEINFO |
855 				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
856 
857 	if (rt) {
858 		if (!addrconf_finite_timeout(lifetime))
859 			fib6_clean_expires(rt);
860 		else
861 			fib6_set_expires(rt, jiffies + HZ * lifetime);
862 
863 		fib6_info_release(rt);
864 	}
865 	return 0;
866 }
867 #endif
868 
869 /*
870  *	Misc support functions
871  */
872 
873 /* called with rcu_lock held */
874 static struct net_device *ip6_rt_get_dev_rcu(const struct fib6_result *res)
875 {
876 	struct net_device *dev = res->nh->fib_nh_dev;
877 	const struct fib6_info *f6i = res->f6i;
878 
879 	if (f6i->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
880 		/* for copies of local routes, dst->dev needs to be the
881 		 * device if it is a master device, the master device if
882 		 * device is enslaved, and the loopback as the default
883 		 */
884 		if (netif_is_l3_slave(dev) &&
885 		    !rt6_need_strict(&f6i->fib6_dst.addr))
886 			dev = l3mdev_master_dev_rcu(dev);
887 		else if (!netif_is_l3_master(dev))
888 			dev = dev_net(dev)->loopback_dev;
889 		/* last case is netif_is_l3_master(dev) is true in which
890 		 * case we want dev returned to be dev
891 		 */
892 	}
893 
894 	return dev;
895 }
896 
897 static const int fib6_prop[RTN_MAX + 1] = {
898 	[RTN_UNSPEC]	= 0,
899 	[RTN_UNICAST]	= 0,
900 	[RTN_LOCAL]	= 0,
901 	[RTN_BROADCAST]	= 0,
902 	[RTN_ANYCAST]	= 0,
903 	[RTN_MULTICAST]	= 0,
904 	[RTN_BLACKHOLE]	= -EINVAL,
905 	[RTN_UNREACHABLE] = -EHOSTUNREACH,
906 	[RTN_PROHIBIT]	= -EACCES,
907 	[RTN_THROW]	= -EAGAIN,
908 	[RTN_NAT]	= -EINVAL,
909 	[RTN_XRESOLVE]	= -EINVAL,
910 };
911 
912 static int ip6_rt_type_to_error(u8 fib6_type)
913 {
914 	return fib6_prop[fib6_type];
915 }
916 
917 static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
918 {
919 	unsigned short flags = 0;
920 
921 	if (rt->dst_nocount)
922 		flags |= DST_NOCOUNT;
923 	if (rt->dst_nopolicy)
924 		flags |= DST_NOPOLICY;
925 	if (rt->dst_host)
926 		flags |= DST_HOST;
927 
928 	return flags;
929 }
930 
931 static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
932 {
933 	rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);
934 
935 	switch (ort->fib6_type) {
936 	case RTN_BLACKHOLE:
937 		rt->dst.output = dst_discard_out;
938 		rt->dst.input = dst_discard;
939 		break;
940 	case RTN_PROHIBIT:
941 		rt->dst.output = ip6_pkt_prohibit_out;
942 		rt->dst.input = ip6_pkt_prohibit;
943 		break;
944 	case RTN_THROW:
945 	case RTN_UNREACHABLE:
946 	default:
947 		rt->dst.output = ip6_pkt_discard_out;
948 		rt->dst.input = ip6_pkt_discard;
949 		break;
950 	}
951 }
952 
953 static void ip6_rt_init_dst(struct rt6_info *rt, const struct fib6_result *res)
954 {
955 	struct fib6_info *ort = res->f6i;
956 
957 	if (ort->fib6_flags & RTF_REJECT) {
958 		ip6_rt_init_dst_reject(rt, ort);
959 		return;
960 	}
961 
962 	rt->dst.error = 0;
963 	rt->dst.output = ip6_output;
964 
965 	if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) {
966 		rt->dst.input = ip6_input;
967 	} else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
968 		rt->dst.input = ip6_mc_input;
969 	} else {
970 		rt->dst.input = ip6_forward;
971 	}
972 
973 	if (res->nh->fib_nh_lws) {
974 		rt->dst.lwtstate = lwtstate_get(res->nh->fib_nh_lws);
975 		lwtunnel_set_redirect(&rt->dst);
976 	}
977 
978 	rt->dst.lastuse = jiffies;
979 }
980 
981 /* Caller must already hold reference to @from */
982 static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
983 {
984 	rt->rt6i_flags &= ~RTF_EXPIRES;
985 	rcu_assign_pointer(rt->from, from);
986 	ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
987 }
988 
989 /* Caller must already hold reference to f6i in result */
990 static void ip6_rt_copy_init(struct rt6_info *rt, const struct fib6_result *res)
991 {
992 	const struct fib6_nh *nh = res->nh;
993 	const struct net_device *dev = nh->fib_nh_dev;
994 	struct fib6_info *f6i = res->f6i;
995 
996 	ip6_rt_init_dst(rt, res);
997 
998 	rt->rt6i_dst = f6i->fib6_dst;
999 	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
1000 	rt->rt6i_flags = f6i->fib6_flags;
1001 	if (nh->fib_nh_gw_family) {
1002 		rt->rt6i_gateway = nh->fib_nh_gw6;
1003 		rt->rt6i_flags |= RTF_GATEWAY;
1004 	}
1005 	rt6_set_from(rt, f6i);
1006 #ifdef CONFIG_IPV6_SUBTREES
1007 	rt->rt6i_src = f6i->fib6_src;
1008 #endif
1009 }
1010 
1011 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
1012 					struct in6_addr *saddr)
1013 {
1014 	struct fib6_node *pn, *sn;
1015 	while (1) {
1016 		if (fn->fn_flags & RTN_TL_ROOT)
1017 			return NULL;
1018 		pn = rcu_dereference(fn->parent);
1019 		sn = FIB6_SUBTREE(pn);
1020 		if (sn && sn != fn)
1021 			fn = fib6_node_lookup(sn, NULL, saddr);
1022 		else
1023 			fn = pn;
1024 		if (fn->fn_flags & RTN_RTINFO)
1025 			return fn;
1026 	}
1027 }
1028 
1029 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt)
1030 {
1031 	struct rt6_info *rt = *prt;
1032 
1033 	if (dst_hold_safe(&rt->dst))
1034 		return true;
1035 	if (net) {
1036 		rt = net->ipv6.ip6_null_entry;
1037 		dst_hold(&rt->dst);
1038 	} else {
1039 		rt = NULL;
1040 	}
1041 	*prt = rt;
1042 	return false;
1043 }
1044 
1045 /* called with rcu_lock held */
1046 static struct rt6_info *ip6_create_rt_rcu(const struct fib6_result *res)
1047 {
1048 	struct net_device *dev = res->nh->fib_nh_dev;
1049 	struct fib6_info *f6i = res->f6i;
1050 	unsigned short flags;
1051 	struct rt6_info *nrt;
1052 
1053 	if (!fib6_info_hold_safe(f6i))
1054 		goto fallback;
1055 
1056 	flags = fib6_info_dst_flags(f6i);
1057 	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
1058 	if (!nrt) {
1059 		fib6_info_release(f6i);
1060 		goto fallback;
1061 	}
1062 
1063 	ip6_rt_copy_init(nrt, res);
1064 	return nrt;
1065 
1066 fallback:
1067 	nrt = dev_net(dev)->ipv6.ip6_null_entry;
1068 	dst_hold(&nrt->dst);
1069 	return nrt;
1070 }
1071 
1072 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
1073 					     struct fib6_table *table,
1074 					     struct flowi6 *fl6,
1075 					     const struct sk_buff *skb,
1076 					     int flags)
1077 {
1078 	struct fib6_result res = {};
1079 	struct fib6_node *fn;
1080 	struct rt6_info *rt;
1081 
1082 	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1083 		flags &= ~RT6_LOOKUP_F_IFACE;
1084 
1085 	rcu_read_lock();
1086 	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1087 restart:
1088 	res.f6i = rcu_dereference(fn->leaf);
1089 	if (!res.f6i)
1090 		res.f6i = net->ipv6.fib6_null_entry;
1091 	else
1092 		res.f6i = rt6_device_match(net, res.f6i, &fl6->saddr,
1093 					   fl6->flowi6_oif, flags);
1094 
1095 	if (res.f6i == net->ipv6.fib6_null_entry) {
1096 		fn = fib6_backtrack(fn, &fl6->saddr);
1097 		if (fn)
1098 			goto restart;
1099 
1100 		rt = net->ipv6.ip6_null_entry;
1101 		dst_hold(&rt->dst);
1102 		goto out;
1103 	}
1104 
1105 	fib6_select_path(net, &res, fl6, fl6->flowi6_oif,
1106 			 fl6->flowi6_oif != 0, skb, flags);
1107 
1108 	/* Search through exception table */
1109 	rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
1110 	if (rt) {
1111 		if (ip6_hold_safe(net, &rt))
1112 			dst_use_noref(&rt->dst, jiffies);
1113 	} else {
1114 		rt = ip6_create_rt_rcu(&res);
1115 	}
1116 
1117 out:
1118 	trace_fib6_table_lookup(net, res.f6i, table, fl6);
1119 
1120 	rcu_read_unlock();
1121 
1122 	return rt;
1123 }
1124 
1125 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
1126 				   const struct sk_buff *skb, int flags)
1127 {
1128 	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
1129 }
1130 EXPORT_SYMBOL_GPL(ip6_route_lookup);
1131 
1132 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
1133 			    const struct in6_addr *saddr, int oif,
1134 			    const struct sk_buff *skb, int strict)
1135 {
1136 	struct flowi6 fl6 = {
1137 		.flowi6_oif = oif,
1138 		.daddr = *daddr,
1139 	};
1140 	struct dst_entry *dst;
1141 	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
1142 
1143 	if (saddr) {
1144 		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
1145 		flags |= RT6_LOOKUP_F_HAS_SADDR;
1146 	}
1147 
1148 	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
1149 	if (dst->error == 0)
1150 		return (struct rt6_info *) dst;
1151 
1152 	dst_release(dst);
1153 
1154 	return NULL;
1155 }
1156 EXPORT_SYMBOL(rt6_lookup);
1157 
1158 /* ip6_ins_rt is called with FREE table->tb6_lock.
1159  * It takes new route entry, the addition fails by any reason the
1160  * route is released.
1161  * Caller must hold dst before calling it.
1162  */
1163 
1164 static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
1165 			struct netlink_ext_ack *extack)
1166 {
1167 	int err;
1168 	struct fib6_table *table;
1169 
1170 	table = rt->fib6_table;
1171 	spin_lock_bh(&table->tb6_lock);
1172 	err = fib6_add(&table->tb6_root, rt, info, extack);
1173 	spin_unlock_bh(&table->tb6_lock);
1174 
1175 	return err;
1176 }
1177 
1178 int ip6_ins_rt(struct net *net, struct fib6_info *rt)
1179 {
1180 	struct nl_info info = {	.nl_net = net, };
1181 
1182 	return __ip6_ins_rt(rt, &info, NULL);
1183 }
1184 
1185 static struct rt6_info *ip6_rt_cache_alloc(const struct fib6_result *res,
1186 					   const struct in6_addr *daddr,
1187 					   const struct in6_addr *saddr)
1188 {
1189 	struct fib6_info *f6i = res->f6i;
1190 	struct net_device *dev;
1191 	struct rt6_info *rt;
1192 
1193 	/*
1194 	 *	Clone the route.
1195 	 */
1196 
1197 	if (!fib6_info_hold_safe(f6i))
1198 		return NULL;
1199 
1200 	dev = ip6_rt_get_dev_rcu(res);
1201 	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
1202 	if (!rt) {
1203 		fib6_info_release(f6i);
1204 		return NULL;
1205 	}
1206 
1207 	ip6_rt_copy_init(rt, res);
1208 	rt->rt6i_flags |= RTF_CACHE;
1209 	rt->dst.flags |= DST_HOST;
1210 	rt->rt6i_dst.addr = *daddr;
1211 	rt->rt6i_dst.plen = 128;
1212 
1213 	if (!rt6_is_gw_or_nonexthop(res)) {
1214 		if (f6i->fib6_dst.plen != 128 &&
1215 		    ipv6_addr_equal(&f6i->fib6_dst.addr, daddr))
1216 			rt->rt6i_flags |= RTF_ANYCAST;
1217 #ifdef CONFIG_IPV6_SUBTREES
1218 		if (rt->rt6i_src.plen && saddr) {
1219 			rt->rt6i_src.addr = *saddr;
1220 			rt->rt6i_src.plen = 128;
1221 		}
1222 #endif
1223 	}
1224 
1225 	return rt;
1226 }
1227 
1228 static struct rt6_info *ip6_rt_pcpu_alloc(const struct fib6_result *res)
1229 {
1230 	struct fib6_info *f6i = res->f6i;
1231 	unsigned short flags = fib6_info_dst_flags(f6i);
1232 	struct net_device *dev;
1233 	struct rt6_info *pcpu_rt;
1234 
1235 	if (!fib6_info_hold_safe(f6i))
1236 		return NULL;
1237 
1238 	rcu_read_lock();
1239 	dev = ip6_rt_get_dev_rcu(res);
1240 	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
1241 	rcu_read_unlock();
1242 	if (!pcpu_rt) {
1243 		fib6_info_release(f6i);
1244 		return NULL;
1245 	}
1246 	ip6_rt_copy_init(pcpu_rt, res);
1247 	pcpu_rt->rt6i_flags |= RTF_PCPU;
1248 	return pcpu_rt;
1249 }
1250 
1251 /* It should be called with rcu_read_lock() acquired */
1252 static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res)
1253 {
1254 	struct rt6_info *pcpu_rt, **p;
1255 
1256 	p = this_cpu_ptr(res->f6i->rt6i_pcpu);
1257 	pcpu_rt = *p;
1258 
1259 	if (pcpu_rt)
1260 		ip6_hold_safe(NULL, &pcpu_rt);
1261 
1262 	return pcpu_rt;
1263 }
1264 
1265 static struct rt6_info *rt6_make_pcpu_route(struct net *net,
1266 					    const struct fib6_result *res)
1267 {
1268 	struct rt6_info *pcpu_rt, *prev, **p;
1269 
1270 	pcpu_rt = ip6_rt_pcpu_alloc(res);
1271 	if (!pcpu_rt) {
1272 		dst_hold(&net->ipv6.ip6_null_entry->dst);
1273 		return net->ipv6.ip6_null_entry;
1274 	}
1275 
1276 	dst_hold(&pcpu_rt->dst);
1277 	p = this_cpu_ptr(res->f6i->rt6i_pcpu);
1278 	prev = cmpxchg(p, NULL, pcpu_rt);
1279 	BUG_ON(prev);
1280 
1281 	return pcpu_rt;
1282 }
1283 
1284 /* exception hash table implementation
1285  */
1286 static DEFINE_SPINLOCK(rt6_exception_lock);
1287 
1288 /* Remove rt6_ex from hash table and free the memory
1289  * Caller must hold rt6_exception_lock
1290  */
1291 static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1292 				 struct rt6_exception *rt6_ex)
1293 {
1294 	struct fib6_info *from;
1295 	struct net *net;
1296 
1297 	if (!bucket || !rt6_ex)
1298 		return;
1299 
1300 	net = dev_net(rt6_ex->rt6i->dst.dev);
1301 	net->ipv6.rt6_stats->fib_rt_cache--;
1302 
1303 	/* purge completely the exception to allow releasing the held resources:
1304 	 * some [sk] cache may keep the dst around for unlimited time
1305 	 */
1306 	from = rcu_dereference_protected(rt6_ex->rt6i->from,
1307 					 lockdep_is_held(&rt6_exception_lock));
1308 	rcu_assign_pointer(rt6_ex->rt6i->from, NULL);
1309 	fib6_info_release(from);
1310 	dst_dev_put(&rt6_ex->rt6i->dst);
1311 
1312 	hlist_del_rcu(&rt6_ex->hlist);
1313 	dst_release(&rt6_ex->rt6i->dst);
1314 	kfree_rcu(rt6_ex, rcu);
1315 	WARN_ON_ONCE(!bucket->depth);
1316 	bucket->depth--;
1317 }
1318 
1319 /* Remove oldest rt6_ex in bucket and free the memory
1320  * Caller must hold rt6_exception_lock
1321  */
1322 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1323 {
1324 	struct rt6_exception *rt6_ex, *oldest = NULL;
1325 
1326 	if (!bucket)
1327 		return;
1328 
1329 	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1330 		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1331 			oldest = rt6_ex;
1332 	}
1333 	rt6_remove_exception(bucket, oldest);
1334 }
1335 
1336 static u32 rt6_exception_hash(const struct in6_addr *dst,
1337 			      const struct in6_addr *src)
1338 {
1339 	static u32 seed __read_mostly;
1340 	u32 val;
1341 
1342 	net_get_random_once(&seed, sizeof(seed));
1343 	val = jhash(dst, sizeof(*dst), seed);
1344 
1345 #ifdef CONFIG_IPV6_SUBTREES
1346 	if (src)
1347 		val = jhash(src, sizeof(*src), val);
1348 #endif
1349 	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1350 }
1351 
1352 /* Helper function to find the cached rt in the hash table
1353  * and update bucket pointer to point to the bucket for this
1354  * (daddr, saddr) pair
1355  * Caller must hold rt6_exception_lock
1356  */
1357 static struct rt6_exception *
1358 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1359 			      const struct in6_addr *daddr,
1360 			      const struct in6_addr *saddr)
1361 {
1362 	struct rt6_exception *rt6_ex;
1363 	u32 hval;
1364 
1365 	if (!(*bucket) || !daddr)
1366 		return NULL;
1367 
1368 	hval = rt6_exception_hash(daddr, saddr);
1369 	*bucket += hval;
1370 
1371 	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1372 		struct rt6_info *rt6 = rt6_ex->rt6i;
1373 		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1374 
1375 #ifdef CONFIG_IPV6_SUBTREES
1376 		if (matched && saddr)
1377 			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1378 #endif
1379 		if (matched)
1380 			return rt6_ex;
1381 	}
1382 	return NULL;
1383 }
1384 
1385 /* Helper function to find the cached rt in the hash table
1386  * and update bucket pointer to point to the bucket for this
1387  * (daddr, saddr) pair
1388  * Caller must hold rcu_read_lock()
1389  */
1390 static struct rt6_exception *
1391 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1392 			 const struct in6_addr *daddr,
1393 			 const struct in6_addr *saddr)
1394 {
1395 	struct rt6_exception *rt6_ex;
1396 	u32 hval;
1397 
1398 	WARN_ON_ONCE(!rcu_read_lock_held());
1399 
1400 	if (!(*bucket) || !daddr)
1401 		return NULL;
1402 
1403 	hval = rt6_exception_hash(daddr, saddr);
1404 	*bucket += hval;
1405 
1406 	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1407 		struct rt6_info *rt6 = rt6_ex->rt6i;
1408 		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1409 
1410 #ifdef CONFIG_IPV6_SUBTREES
1411 		if (matched && saddr)
1412 			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1413 #endif
1414 		if (matched)
1415 			return rt6_ex;
1416 	}
1417 	return NULL;
1418 }
1419 
1420 static unsigned int fib6_mtu(const struct fib6_info *rt)
1421 {
1422 	unsigned int mtu;
1423 
1424 	if (rt->fib6_pmtu) {
1425 		mtu = rt->fib6_pmtu;
1426 	} else {
1427 		struct net_device *dev = fib6_info_nh_dev(rt);
1428 		struct inet6_dev *idev;
1429 
1430 		rcu_read_lock();
1431 		idev = __in6_dev_get(dev);
1432 		mtu = idev->cnf.mtu6;
1433 		rcu_read_unlock();
1434 	}
1435 
1436 	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1437 
1438 	return mtu - lwtunnel_headroom(rt->fib6_nh.fib_nh_lws, mtu);
1439 }
1440 
1441 static int rt6_insert_exception(struct rt6_info *nrt,
1442 				struct fib6_info *ort)
1443 {
1444 	struct net *net = dev_net(nrt->dst.dev);
1445 	struct rt6_exception_bucket *bucket;
1446 	struct in6_addr *src_key = NULL;
1447 	struct rt6_exception *rt6_ex;
1448 	int err = 0;
1449 
1450 	spin_lock_bh(&rt6_exception_lock);
1451 
1452 	if (ort->exception_bucket_flushed) {
1453 		err = -EINVAL;
1454 		goto out;
1455 	}
1456 
1457 	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
1458 					lockdep_is_held(&rt6_exception_lock));
1459 	if (!bucket) {
1460 		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
1461 				 GFP_ATOMIC);
1462 		if (!bucket) {
1463 			err = -ENOMEM;
1464 			goto out;
1465 		}
1466 		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
1467 	}
1468 
1469 #ifdef CONFIG_IPV6_SUBTREES
1470 	/* rt6i_src.plen != 0 indicates ort is in subtree
1471 	 * and exception table is indexed by a hash of
1472 	 * both rt6i_dst and rt6i_src.
1473 	 * Otherwise, the exception table is indexed by
1474 	 * a hash of only rt6i_dst.
1475 	 */
1476 	if (ort->fib6_src.plen)
1477 		src_key = &nrt->rt6i_src.addr;
1478 #endif
1479 	/* rt6_mtu_change() might lower mtu on ort.
1480 	 * Only insert this exception route if its mtu
1481 	 * is less than ort's mtu value.
1482 	 */
1483 	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
1484 		err = -EINVAL;
1485 		goto out;
1486 	}
1487 
1488 	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
1489 					       src_key);
1490 	if (rt6_ex)
1491 		rt6_remove_exception(bucket, rt6_ex);
1492 
1493 	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
1494 	if (!rt6_ex) {
1495 		err = -ENOMEM;
1496 		goto out;
1497 	}
1498 	rt6_ex->rt6i = nrt;
1499 	rt6_ex->stamp = jiffies;
1500 	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
1501 	bucket->depth++;
1502 	net->ipv6.rt6_stats->fib_rt_cache++;
1503 
1504 	if (bucket->depth > FIB6_MAX_DEPTH)
1505 		rt6_exception_remove_oldest(bucket);
1506 
1507 out:
1508 	spin_unlock_bh(&rt6_exception_lock);
1509 
1510 	/* Update fn->fn_sernum to invalidate all cached dst */
1511 	if (!err) {
1512 		spin_lock_bh(&ort->fib6_table->tb6_lock);
1513 		fib6_update_sernum(net, ort);
1514 		spin_unlock_bh(&ort->fib6_table->tb6_lock);
1515 		fib6_force_start_gc(net);
1516 	}
1517 
1518 	return err;
1519 }
1520 
1521 void rt6_flush_exceptions(struct fib6_info *rt)
1522 {
1523 	struct rt6_exception_bucket *bucket;
1524 	struct rt6_exception *rt6_ex;
1525 	struct hlist_node *tmp;
1526 	int i;
1527 
1528 	spin_lock_bh(&rt6_exception_lock);
1529 	/* Prevent rt6_insert_exception() to recreate the bucket list */
1530 	rt->exception_bucket_flushed = 1;
1531 
1532 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1533 				    lockdep_is_held(&rt6_exception_lock));
1534 	if (!bucket)
1535 		goto out;
1536 
1537 	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1538 		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1539 			rt6_remove_exception(bucket, rt6_ex);
1540 		WARN_ON_ONCE(bucket->depth);
1541 		bucket++;
1542 	}
1543 
1544 out:
1545 	spin_unlock_bh(&rt6_exception_lock);
1546 }
1547 
1548 /* Find cached rt in the hash table inside passed in rt
1549  * Caller has to hold rcu_read_lock()
1550  */
1551 static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
1552 					   struct in6_addr *daddr,
1553 					   struct in6_addr *saddr)
1554 {
1555 	struct rt6_exception_bucket *bucket;
1556 	struct in6_addr *src_key = NULL;
1557 	struct rt6_exception *rt6_ex;
1558 	struct rt6_info *ret = NULL;
1559 
1560 	bucket = rcu_dereference(res->f6i->rt6i_exception_bucket);
1561 
1562 #ifdef CONFIG_IPV6_SUBTREES
1563 	/* fib6i_src.plen != 0 indicates f6i is in subtree
1564 	 * and exception table is indexed by a hash of
1565 	 * both fib6_dst and fib6_src.
1566 	 * Otherwise, the exception table is indexed by
1567 	 * a hash of only fib6_dst.
1568 	 */
1569 	if (res->f6i->fib6_src.plen)
1570 		src_key = saddr;
1571 #endif
1572 	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
1573 
1574 	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1575 		ret = rt6_ex->rt6i;
1576 
1577 	return ret;
1578 }
1579 
1580 /* Remove the passed in cached rt from the hash table that contains it */
1581 static int rt6_remove_exception_rt(struct rt6_info *rt)
1582 {
1583 	struct rt6_exception_bucket *bucket;
1584 	struct in6_addr *src_key = NULL;
1585 	struct rt6_exception *rt6_ex;
1586 	struct fib6_info *from;
1587 	int err;
1588 
1589 	from = rcu_dereference(rt->from);
1590 	if (!from ||
1591 	    !(rt->rt6i_flags & RTF_CACHE))
1592 		return -EINVAL;
1593 
1594 	if (!rcu_access_pointer(from->rt6i_exception_bucket))
1595 		return -ENOENT;
1596 
1597 	spin_lock_bh(&rt6_exception_lock);
1598 	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1599 				    lockdep_is_held(&rt6_exception_lock));
1600 #ifdef CONFIG_IPV6_SUBTREES
1601 	/* rt6i_src.plen != 0 indicates 'from' is in subtree
1602 	 * and exception table is indexed by a hash of
1603 	 * both rt6i_dst and rt6i_src.
1604 	 * Otherwise, the exception table is indexed by
1605 	 * a hash of only rt6i_dst.
1606 	 */
1607 	if (from->fib6_src.plen)
1608 		src_key = &rt->rt6i_src.addr;
1609 #endif
1610 	rt6_ex = __rt6_find_exception_spinlock(&bucket,
1611 					       &rt->rt6i_dst.addr,
1612 					       src_key);
1613 	if (rt6_ex) {
1614 		rt6_remove_exception(bucket, rt6_ex);
1615 		err = 0;
1616 	} else {
1617 		err = -ENOENT;
1618 	}
1619 
1620 	spin_unlock_bh(&rt6_exception_lock);
1621 	return err;
1622 }
1623 
1624 /* Find rt6_ex which contains the passed in rt cache and
1625  * refresh its stamp
1626  */
1627 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1628 {
1629 	struct rt6_exception_bucket *bucket;
1630 	struct in6_addr *src_key = NULL;
1631 	struct rt6_exception *rt6_ex;
1632 	struct fib6_info *from;
1633 
1634 	rcu_read_lock();
1635 	from = rcu_dereference(rt->from);
1636 	if (!from || !(rt->rt6i_flags & RTF_CACHE))
1637 		goto unlock;
1638 
1639 	bucket = rcu_dereference(from->rt6i_exception_bucket);
1640 
1641 #ifdef CONFIG_IPV6_SUBTREES
1642 	/* rt6i_src.plen != 0 indicates 'from' is in subtree
1643 	 * and exception table is indexed by a hash of
1644 	 * both rt6i_dst and rt6i_src.
1645 	 * Otherwise, the exception table is indexed by
1646 	 * a hash of only rt6i_dst.
1647 	 */
1648 	if (from->fib6_src.plen)
1649 		src_key = &rt->rt6i_src.addr;
1650 #endif
1651 	rt6_ex = __rt6_find_exception_rcu(&bucket,
1652 					  &rt->rt6i_dst.addr,
1653 					  src_key);
1654 	if (rt6_ex)
1655 		rt6_ex->stamp = jiffies;
1656 
1657 unlock:
1658 	rcu_read_unlock();
1659 }
1660 
1661 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1662 					 struct rt6_info *rt, int mtu)
1663 {
1664 	/* If the new MTU is lower than the route PMTU, this new MTU will be the
1665 	 * lowest MTU in the path: always allow updating the route PMTU to
1666 	 * reflect PMTU decreases.
1667 	 *
1668 	 * If the new MTU is higher, and the route PMTU is equal to the local
1669 	 * MTU, this means the old MTU is the lowest in the path, so allow
1670 	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
1671 	 * handle this.
1672 	 */
1673 
1674 	if (dst_mtu(&rt->dst) >= mtu)
1675 		return true;
1676 
1677 	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
1678 		return true;
1679 
1680 	return false;
1681 }
1682 
1683 static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
1684 				       struct fib6_info *rt, int mtu)
1685 {
1686 	struct rt6_exception_bucket *bucket;
1687 	struct rt6_exception *rt6_ex;
1688 	int i;
1689 
1690 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1691 					lockdep_is_held(&rt6_exception_lock));
1692 
1693 	if (!bucket)
1694 		return;
1695 
1696 	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1697 		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1698 			struct rt6_info *entry = rt6_ex->rt6i;
1699 
1700 			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
1701 			 * route), the metrics of its rt->from have already
1702 			 * been updated.
1703 			 */
1704 			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
1705 			    rt6_mtu_change_route_allowed(idev, entry, mtu))
1706 				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
1707 		}
1708 		bucket++;
1709 	}
1710 }
1711 
1712 #define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)
1713 
1714 static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
1715 					struct in6_addr *gateway)
1716 {
1717 	struct rt6_exception_bucket *bucket;
1718 	struct rt6_exception *rt6_ex;
1719 	struct hlist_node *tmp;
1720 	int i;
1721 
1722 	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1723 		return;
1724 
1725 	spin_lock_bh(&rt6_exception_lock);
1726 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1727 				     lockdep_is_held(&rt6_exception_lock));
1728 
1729 	if (bucket) {
1730 		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1731 			hlist_for_each_entry_safe(rt6_ex, tmp,
1732 						  &bucket->chain, hlist) {
1733 				struct rt6_info *entry = rt6_ex->rt6i;
1734 
1735 				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
1736 				    RTF_CACHE_GATEWAY &&
1737 				    ipv6_addr_equal(gateway,
1738 						    &entry->rt6i_gateway)) {
1739 					rt6_remove_exception(bucket, rt6_ex);
1740 				}
1741 			}
1742 			bucket++;
1743 		}
1744 	}
1745 
1746 	spin_unlock_bh(&rt6_exception_lock);
1747 }
1748 
1749 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
1750 				      struct rt6_exception *rt6_ex,
1751 				      struct fib6_gc_args *gc_args,
1752 				      unsigned long now)
1753 {
1754 	struct rt6_info *rt = rt6_ex->rt6i;
1755 
1756 	/* we are pruning and obsoleting aged-out and non gateway exceptions
1757 	 * even if others have still references to them, so that on next
1758 	 * dst_check() such references can be dropped.
1759 	 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
1760 	 * expired, independently from their aging, as per RFC 8201 section 4
1761 	 */
1762 	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
1763 		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
1764 			RT6_TRACE("aging clone %p\n", rt);
1765 			rt6_remove_exception(bucket, rt6_ex);
1766 			return;
1767 		}
1768 	} else if (time_after(jiffies, rt->dst.expires)) {
1769 		RT6_TRACE("purging expired route %p\n", rt);
1770 		rt6_remove_exception(bucket, rt6_ex);
1771 		return;
1772 	}
1773 
1774 	if (rt->rt6i_flags & RTF_GATEWAY) {
1775 		struct neighbour *neigh;
1776 		__u8 neigh_flags = 0;
1777 
1778 		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
1779 		if (neigh)
1780 			neigh_flags = neigh->flags;
1781 
1782 		if (!(neigh_flags & NTF_ROUTER)) {
1783 			RT6_TRACE("purging route %p via non-router but gateway\n",
1784 				  rt);
1785 			rt6_remove_exception(bucket, rt6_ex);
1786 			return;
1787 		}
1788 	}
1789 
1790 	gc_args->more++;
1791 }
1792 
1793 void rt6_age_exceptions(struct fib6_info *rt,
1794 			struct fib6_gc_args *gc_args,
1795 			unsigned long now)
1796 {
1797 	struct rt6_exception_bucket *bucket;
1798 	struct rt6_exception *rt6_ex;
1799 	struct hlist_node *tmp;
1800 	int i;
1801 
1802 	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1803 		return;
1804 
1805 	rcu_read_lock_bh();
1806 	spin_lock(&rt6_exception_lock);
1807 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1808 				    lockdep_is_held(&rt6_exception_lock));
1809 
1810 	if (bucket) {
1811 		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1812 			hlist_for_each_entry_safe(rt6_ex, tmp,
1813 						  &bucket->chain, hlist) {
1814 				rt6_age_examine_exception(bucket, rt6_ex,
1815 							  gc_args, now);
1816 			}
1817 			bucket++;
1818 		}
1819 	}
1820 	spin_unlock(&rt6_exception_lock);
1821 	rcu_read_unlock_bh();
1822 }
1823 
1824 /* must be called with rcu lock held */
1825 struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
1826 				    int oif, struct flowi6 *fl6, int strict)
1827 {
1828 	struct fib6_node *fn, *saved_fn;
1829 	struct fib6_info *f6i;
1830 
1831 	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1832 	saved_fn = fn;
1833 
1834 	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1835 		oif = 0;
1836 
1837 redo_rt6_select:
1838 	f6i = rt6_select(net, fn, oif, strict);
1839 	if (f6i == net->ipv6.fib6_null_entry) {
1840 		fn = fib6_backtrack(fn, &fl6->saddr);
1841 		if (fn)
1842 			goto redo_rt6_select;
1843 		else if (strict & RT6_LOOKUP_F_REACHABLE) {
1844 			/* also consider unreachable route */
1845 			strict &= ~RT6_LOOKUP_F_REACHABLE;
1846 			fn = saved_fn;
1847 			goto redo_rt6_select;
1848 		}
1849 	}
1850 
1851 	trace_fib6_table_lookup(net, f6i, table, fl6);
1852 
1853 	return f6i;
1854 }
1855 
1856 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1857 			       int oif, struct flowi6 *fl6,
1858 			       const struct sk_buff *skb, int flags)
1859 {
1860 	struct fib6_result res = {};
1861 	struct rt6_info *rt;
1862 	int strict = 0;
1863 
1864 	strict |= flags & RT6_LOOKUP_F_IFACE;
1865 	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1866 	if (net->ipv6.devconf_all->forwarding == 0)
1867 		strict |= RT6_LOOKUP_F_REACHABLE;
1868 
1869 	rcu_read_lock();
1870 
1871 	res.f6i = fib6_table_lookup(net, table, oif, fl6, strict);
1872 	if (res.f6i == net->ipv6.fib6_null_entry) {
1873 		rt = net->ipv6.ip6_null_entry;
1874 		rcu_read_unlock();
1875 		dst_hold(&rt->dst);
1876 		return rt;
1877 	}
1878 
1879 	fib6_select_path(net, &res, fl6, oif, false, skb, strict);
1880 
1881 	/*Search through exception table */
1882 	rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
1883 	if (rt) {
1884 		if (ip6_hold_safe(net, &rt))
1885 			dst_use_noref(&rt->dst, jiffies);
1886 
1887 		rcu_read_unlock();
1888 		return rt;
1889 	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1890 			    !res.nh->fib_nh_gw_family)) {
1891 		/* Create a RTF_CACHE clone which will not be
1892 		 * owned by the fib6 tree.  It is for the special case where
1893 		 * the daddr in the skb during the neighbor look-up is different
1894 		 * from the fl6->daddr used to look-up route here.
1895 		 */
1896 		struct rt6_info *uncached_rt;
1897 
1898 		uncached_rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL);
1899 
1900 		rcu_read_unlock();
1901 
1902 		if (uncached_rt) {
1903 			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1904 			 * No need for another dst_hold()
1905 			 */
1906 			rt6_uncached_list_add(uncached_rt);
1907 			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
1908 		} else {
1909 			uncached_rt = net->ipv6.ip6_null_entry;
1910 			dst_hold(&uncached_rt->dst);
1911 		}
1912 
1913 		return uncached_rt;
1914 	} else {
1915 		/* Get a percpu copy */
1916 
1917 		struct rt6_info *pcpu_rt;
1918 
1919 		local_bh_disable();
1920 		pcpu_rt = rt6_get_pcpu_route(&res);
1921 
1922 		if (!pcpu_rt)
1923 			pcpu_rt = rt6_make_pcpu_route(net, &res);
1924 
1925 		local_bh_enable();
1926 		rcu_read_unlock();
1927 
1928 		return pcpu_rt;
1929 	}
1930 }
1931 EXPORT_SYMBOL_GPL(ip6_pol_route);
1932 
1933 static struct rt6_info *ip6_pol_route_input(struct net *net,
1934 					    struct fib6_table *table,
1935 					    struct flowi6 *fl6,
1936 					    const struct sk_buff *skb,
1937 					    int flags)
1938 {
1939 	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
1940 }
1941 
1942 struct dst_entry *ip6_route_input_lookup(struct net *net,
1943 					 struct net_device *dev,
1944 					 struct flowi6 *fl6,
1945 					 const struct sk_buff *skb,
1946 					 int flags)
1947 {
1948 	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1949 		flags |= RT6_LOOKUP_F_IFACE;
1950 
1951 	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
1952 }
1953 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1954 
1955 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1956 				  struct flow_keys *keys,
1957 				  struct flow_keys *flkeys)
1958 {
1959 	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1960 	const struct ipv6hdr *key_iph = outer_iph;
1961 	struct flow_keys *_flkeys = flkeys;
1962 	const struct ipv6hdr *inner_iph;
1963 	const struct icmp6hdr *icmph;
1964 	struct ipv6hdr _inner_iph;
1965 	struct icmp6hdr _icmph;
1966 
1967 	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1968 		goto out;
1969 
1970 	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
1971 				   sizeof(_icmph), &_icmph);
1972 	if (!icmph)
1973 		goto out;
1974 
1975 	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1976 	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1977 	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1978 	    icmph->icmp6_type != ICMPV6_PARAMPROB)
1979 		goto out;
1980 
1981 	inner_iph = skb_header_pointer(skb,
1982 				       skb_transport_offset(skb) + sizeof(*icmph),
1983 				       sizeof(_inner_iph), &_inner_iph);
1984 	if (!inner_iph)
1985 		goto out;
1986 
1987 	key_iph = inner_iph;
1988 	_flkeys = NULL;
1989 out:
1990 	if (_flkeys) {
1991 		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
1992 		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
1993 		keys->tags.flow_label = _flkeys->tags.flow_label;
1994 		keys->basic.ip_proto = _flkeys->basic.ip_proto;
1995 	} else {
1996 		keys->addrs.v6addrs.src = key_iph->saddr;
1997 		keys->addrs.v6addrs.dst = key_iph->daddr;
1998 		keys->tags.flow_label = ip6_flowlabel(key_iph);
1999 		keys->basic.ip_proto = key_iph->nexthdr;
2000 	}
2001 }
2002 
2003 /* if skb is set it will be used and fl6 can be NULL */
2004 u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
2005 		       const struct sk_buff *skb, struct flow_keys *flkeys)
2006 {
2007 	struct flow_keys hash_keys;
2008 	u32 mhash;
2009 
2010 	switch (ip6_multipath_hash_policy(net)) {
2011 	case 0:
2012 		memset(&hash_keys, 0, sizeof(hash_keys));
2013 		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2014 		if (skb) {
2015 			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
2016 		} else {
2017 			hash_keys.addrs.v6addrs.src = fl6->saddr;
2018 			hash_keys.addrs.v6addrs.dst = fl6->daddr;
2019 			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
2020 			hash_keys.basic.ip_proto = fl6->flowi6_proto;
2021 		}
2022 		break;
2023 	case 1:
2024 		if (skb) {
2025 			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
2026 			struct flow_keys keys;
2027 
2028 			/* short-circuit if we already have L4 hash present */
2029 			if (skb->l4_hash)
2030 				return skb_get_hash_raw(skb) >> 1;
2031 
2032 			memset(&hash_keys, 0, sizeof(hash_keys));
2033 
2034                         if (!flkeys) {
2035 				skb_flow_dissect_flow_keys(skb, &keys, flag);
2036 				flkeys = &keys;
2037 			}
2038 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2039 			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
2040 			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
2041 			hash_keys.ports.src = flkeys->ports.src;
2042 			hash_keys.ports.dst = flkeys->ports.dst;
2043 			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2044 		} else {
2045 			memset(&hash_keys, 0, sizeof(hash_keys));
2046 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2047 			hash_keys.addrs.v6addrs.src = fl6->saddr;
2048 			hash_keys.addrs.v6addrs.dst = fl6->daddr;
2049 			hash_keys.ports.src = fl6->fl6_sport;
2050 			hash_keys.ports.dst = fl6->fl6_dport;
2051 			hash_keys.basic.ip_proto = fl6->flowi6_proto;
2052 		}
2053 		break;
2054 	}
2055 	mhash = flow_hash_from_keys(&hash_keys);
2056 
2057 	return mhash >> 1;
2058 }
2059 
2060 void ip6_route_input(struct sk_buff *skb)
2061 {
2062 	const struct ipv6hdr *iph = ipv6_hdr(skb);
2063 	struct net *net = dev_net(skb->dev);
2064 	int flags = RT6_LOOKUP_F_HAS_SADDR;
2065 	struct ip_tunnel_info *tun_info;
2066 	struct flowi6 fl6 = {
2067 		.flowi6_iif = skb->dev->ifindex,
2068 		.daddr = iph->daddr,
2069 		.saddr = iph->saddr,
2070 		.flowlabel = ip6_flowinfo(iph),
2071 		.flowi6_mark = skb->mark,
2072 		.flowi6_proto = iph->nexthdr,
2073 	};
2074 	struct flow_keys *flkeys = NULL, _flkeys;
2075 
2076 	tun_info = skb_tunnel_info(skb);
2077 	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2078 		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
2079 
2080 	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
2081 		flkeys = &_flkeys;
2082 
2083 	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
2084 		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
2085 	skb_dst_drop(skb);
2086 	skb_dst_set(skb,
2087 		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
2088 }
2089 
2090 static struct rt6_info *ip6_pol_route_output(struct net *net,
2091 					     struct fib6_table *table,
2092 					     struct flowi6 *fl6,
2093 					     const struct sk_buff *skb,
2094 					     int flags)
2095 {
2096 	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
2097 }
2098 
2099 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
2100 					 struct flowi6 *fl6, int flags)
2101 {
2102 	bool any_src;
2103 
2104 	if (ipv6_addr_type(&fl6->daddr) &
2105 	    (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
2106 		struct dst_entry *dst;
2107 
2108 		dst = l3mdev_link_scope_lookup(net, fl6);
2109 		if (dst)
2110 			return dst;
2111 	}
2112 
2113 	fl6->flowi6_iif = LOOPBACK_IFINDEX;
2114 
2115 	any_src = ipv6_addr_any(&fl6->saddr);
2116 	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
2117 	    (fl6->flowi6_oif && any_src))
2118 		flags |= RT6_LOOKUP_F_IFACE;
2119 
2120 	if (!any_src)
2121 		flags |= RT6_LOOKUP_F_HAS_SADDR;
2122 	else if (sk)
2123 		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
2124 
2125 	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
2126 }
2127 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
2128 
2129 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2130 {
2131 	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
2132 	struct net_device *loopback_dev = net->loopback_dev;
2133 	struct dst_entry *new = NULL;
2134 
2135 	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
2136 		       DST_OBSOLETE_DEAD, 0);
2137 	if (rt) {
2138 		rt6_info_init(rt);
2139 		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
2140 
2141 		new = &rt->dst;
2142 		new->__use = 1;
2143 		new->input = dst_discard;
2144 		new->output = dst_discard_out;
2145 
2146 		dst_copy_metrics(new, &ort->dst);
2147 
2148 		rt->rt6i_idev = in6_dev_get(loopback_dev);
2149 		rt->rt6i_gateway = ort->rt6i_gateway;
2150 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
2151 
2152 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
2153 #ifdef CONFIG_IPV6_SUBTREES
2154 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
2155 #endif
2156 	}
2157 
2158 	dst_release(dst_orig);
2159 	return new ? new : ERR_PTR(-ENOMEM);
2160 }
2161 
2162 /*
2163  *	Destination cache support functions
2164  */
2165 
2166 static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2167 {
2168 	u32 rt_cookie = 0;
2169 
2170 	if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2171 		return false;
2172 
2173 	if (fib6_check_expired(f6i))
2174 		return false;
2175 
2176 	return true;
2177 }
2178 
2179 static struct dst_entry *rt6_check(struct rt6_info *rt,
2180 				   struct fib6_info *from,
2181 				   u32 cookie)
2182 {
2183 	u32 rt_cookie = 0;
2184 
2185 	if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
2186 	    rt_cookie != cookie)
2187 		return NULL;
2188 
2189 	if (rt6_check_expired(rt))
2190 		return NULL;
2191 
2192 	return &rt->dst;
2193 }
2194 
2195 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2196 					    struct fib6_info *from,
2197 					    u32 cookie)
2198 {
2199 	if (!__rt6_check_expired(rt) &&
2200 	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2201 	    fib6_check(from, cookie))
2202 		return &rt->dst;
2203 	else
2204 		return NULL;
2205 }
2206 
2207 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
2208 {
2209 	struct dst_entry *dst_ret;
2210 	struct fib6_info *from;
2211 	struct rt6_info *rt;
2212 
2213 	rt = container_of(dst, struct rt6_info, dst);
2214 
2215 	rcu_read_lock();
2216 
2217 	/* All IPV6 dsts are created with ->obsolete set to the value
2218 	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
2219 	 * into this function always.
2220 	 */
2221 
2222 	from = rcu_dereference(rt->from);
2223 
2224 	if (from && (rt->rt6i_flags & RTF_PCPU ||
2225 	    unlikely(!list_empty(&rt->rt6i_uncached))))
2226 		dst_ret = rt6_dst_from_check(rt, from, cookie);
2227 	else
2228 		dst_ret = rt6_check(rt, from, cookie);
2229 
2230 	rcu_read_unlock();
2231 
2232 	return dst_ret;
2233 }
2234 
2235 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
2236 {
2237 	struct rt6_info *rt = (struct rt6_info *) dst;
2238 
2239 	if (rt) {
2240 		if (rt->rt6i_flags & RTF_CACHE) {
2241 			rcu_read_lock();
2242 			if (rt6_check_expired(rt)) {
2243 				rt6_remove_exception_rt(rt);
2244 				dst = NULL;
2245 			}
2246 			rcu_read_unlock();
2247 		} else {
2248 			dst_release(dst);
2249 			dst = NULL;
2250 		}
2251 	}
2252 	return dst;
2253 }
2254 
2255 static void ip6_link_failure(struct sk_buff *skb)
2256 {
2257 	struct rt6_info *rt;
2258 
2259 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2260 
2261 	rt = (struct rt6_info *) skb_dst(skb);
2262 	if (rt) {
2263 		rcu_read_lock();
2264 		if (rt->rt6i_flags & RTF_CACHE) {
2265 			rt6_remove_exception_rt(rt);
2266 		} else {
2267 			struct fib6_info *from;
2268 			struct fib6_node *fn;
2269 
2270 			from = rcu_dereference(rt->from);
2271 			if (from) {
2272 				fn = rcu_dereference(from->fib6_node);
2273 				if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2274 					fn->fn_sernum = -1;
2275 			}
2276 		}
2277 		rcu_read_unlock();
2278 	}
2279 }
2280 
2281 static void rt6_update_expires(struct rt6_info *rt0, int timeout)
2282 {
2283 	if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
2284 		struct fib6_info *from;
2285 
2286 		rcu_read_lock();
2287 		from = rcu_dereference(rt0->from);
2288 		if (from)
2289 			rt0->dst.expires = from->expires;
2290 		rcu_read_unlock();
2291 	}
2292 
2293 	dst_set_expires(&rt0->dst, timeout);
2294 	rt0->rt6i_flags |= RTF_EXPIRES;
2295 }
2296 
2297 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2298 {
2299 	struct net *net = dev_net(rt->dst.dev);
2300 
2301 	dst_metric_set(&rt->dst, RTAX_MTU, mtu);
2302 	rt->rt6i_flags |= RTF_MODIFIED;
2303 	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2304 }
2305 
2306 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2307 {
2308 	return !(rt->rt6i_flags & RTF_CACHE) &&
2309 		(rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from));
2310 }
2311 
2312 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2313 				 const struct ipv6hdr *iph, u32 mtu)
2314 {
2315 	const struct in6_addr *daddr, *saddr;
2316 	struct rt6_info *rt6 = (struct rt6_info *)dst;
2317 
2318 	if (dst_metric_locked(dst, RTAX_MTU))
2319 		return;
2320 
2321 	if (iph) {
2322 		daddr = &iph->daddr;
2323 		saddr = &iph->saddr;
2324 	} else if (sk) {
2325 		daddr = &sk->sk_v6_daddr;
2326 		saddr = &inet6_sk(sk)->saddr;
2327 	} else {
2328 		daddr = NULL;
2329 		saddr = NULL;
2330 	}
2331 	dst_confirm_neigh(dst, daddr);
2332 	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2333 	if (mtu >= dst_mtu(dst))
2334 		return;
2335 
2336 	if (!rt6_cache_allowed_for_pmtu(rt6)) {
2337 		rt6_do_update_pmtu(rt6, mtu);
2338 		/* update rt6_ex->stamp for cache */
2339 		if (rt6->rt6i_flags & RTF_CACHE)
2340 			rt6_update_exception_stamp_rt(rt6);
2341 	} else if (daddr) {
2342 		struct fib6_result res = {};
2343 		struct rt6_info *nrt6;
2344 
2345 		rcu_read_lock();
2346 		res.f6i = rcu_dereference(rt6->from);
2347 		if (!res.f6i) {
2348 			rcu_read_unlock();
2349 			return;
2350 		}
2351 		res.nh = &res.f6i->fib6_nh;
2352 		nrt6 = ip6_rt_cache_alloc(&res, daddr, saddr);
2353 		if (nrt6) {
2354 			rt6_do_update_pmtu(nrt6, mtu);
2355 			if (rt6_insert_exception(nrt6, res.f6i))
2356 				dst_release_immediate(&nrt6->dst);
2357 		}
2358 		rcu_read_unlock();
2359 	}
2360 }
2361 
2362 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2363 			       struct sk_buff *skb, u32 mtu)
2364 {
2365 	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2366 }
2367 
2368 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2369 		     int oif, u32 mark, kuid_t uid)
2370 {
2371 	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2372 	struct dst_entry *dst;
2373 	struct flowi6 fl6 = {
2374 		.flowi6_oif = oif,
2375 		.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark),
2376 		.daddr = iph->daddr,
2377 		.saddr = iph->saddr,
2378 		.flowlabel = ip6_flowinfo(iph),
2379 		.flowi6_uid = uid,
2380 	};
2381 
2382 	dst = ip6_route_output(net, NULL, &fl6);
2383 	if (!dst->error)
2384 		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2385 	dst_release(dst);
2386 }
2387 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2388 
2389 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2390 {
2391 	int oif = sk->sk_bound_dev_if;
2392 	struct dst_entry *dst;
2393 
2394 	if (!oif && skb->dev)
2395 		oif = l3mdev_master_ifindex(skb->dev);
2396 
2397 	ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid);
2398 
2399 	dst = __sk_dst_get(sk);
2400 	if (!dst || !dst->obsolete ||
2401 	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2402 		return;
2403 
2404 	bh_lock_sock(sk);
2405 	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2406 		ip6_datagram_dst_update(sk, false);
2407 	bh_unlock_sock(sk);
2408 }
2409 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2410 
2411 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
2412 			   const struct flowi6 *fl6)
2413 {
2414 #ifdef CONFIG_IPV6_SUBTREES
2415 	struct ipv6_pinfo *np = inet6_sk(sk);
2416 #endif
2417 
2418 	ip6_dst_store(sk, dst,
2419 		      ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
2420 		      &sk->sk_v6_daddr : NULL,
2421 #ifdef CONFIG_IPV6_SUBTREES
2422 		      ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
2423 		      &np->saddr :
2424 #endif
2425 		      NULL);
2426 }
2427 
2428 static bool ip6_redirect_nh_match(const struct fib6_result *res,
2429 				  struct flowi6 *fl6,
2430 				  const struct in6_addr *gw,
2431 				  struct rt6_info **ret)
2432 {
2433 	const struct fib6_nh *nh = res->nh;
2434 
2435 	if (nh->fib_nh_flags & RTNH_F_DEAD || !nh->fib_nh_gw_family ||
2436 	    fl6->flowi6_oif != nh->fib_nh_dev->ifindex)
2437 		return false;
2438 
2439 	/* rt_cache's gateway might be different from its 'parent'
2440 	 * in the case of an ip redirect.
2441 	 * So we keep searching in the exception table if the gateway
2442 	 * is different.
2443 	 */
2444 	if (!ipv6_addr_equal(gw, &nh->fib_nh_gw6)) {
2445 		struct rt6_info *rt_cache;
2446 
2447 		rt_cache = rt6_find_cached_rt(res, &fl6->daddr, &fl6->saddr);
2448 		if (rt_cache &&
2449 		    ipv6_addr_equal(gw, &rt_cache->rt6i_gateway)) {
2450 			*ret = rt_cache;
2451 			return true;
2452 		}
2453 		return false;
2454 	}
2455 	return true;
2456 }
2457 
2458 /* Handle redirects */
2459 struct ip6rd_flowi {
2460 	struct flowi6 fl6;
2461 	struct in6_addr gateway;
2462 };
2463 
2464 static struct rt6_info *__ip6_route_redirect(struct net *net,
2465 					     struct fib6_table *table,
2466 					     struct flowi6 *fl6,
2467 					     const struct sk_buff *skb,
2468 					     int flags)
2469 {
2470 	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2471 	struct rt6_info *ret = NULL;
2472 	struct fib6_result res = {};
2473 	struct fib6_info *rt;
2474 	struct fib6_node *fn;
2475 
2476 	/* Get the "current" route for this destination and
2477 	 * check if the redirect has come from appropriate router.
2478 	 *
2479 	 * RFC 4861 specifies that redirects should only be
2480 	 * accepted if they come from the nexthop to the target.
2481 	 * Due to the way the routes are chosen, this notion
2482 	 * is a bit fuzzy and one might need to check all possible
2483 	 * routes.
2484 	 */
2485 
2486 	rcu_read_lock();
2487 	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2488 restart:
2489 	for_each_fib6_node_rt_rcu(fn) {
2490 		res.f6i = rt;
2491 		res.nh = &rt->fib6_nh;
2492 
2493 		if (fib6_check_expired(rt))
2494 			continue;
2495 		if (rt->fib6_flags & RTF_REJECT)
2496 			break;
2497 		if (ip6_redirect_nh_match(&res, fl6, &rdfl->gateway, &ret))
2498 			goto out;
2499 	}
2500 
2501 	if (!rt)
2502 		rt = net->ipv6.fib6_null_entry;
2503 	else if (rt->fib6_flags & RTF_REJECT) {
2504 		ret = net->ipv6.ip6_null_entry;
2505 		goto out;
2506 	}
2507 
2508 	if (rt == net->ipv6.fib6_null_entry) {
2509 		fn = fib6_backtrack(fn, &fl6->saddr);
2510 		if (fn)
2511 			goto restart;
2512 	}
2513 
2514 	res.f6i = rt;
2515 	res.nh = &rt->fib6_nh;
2516 out:
2517 	if (ret)
2518 		ip6_hold_safe(net, &ret);
2519 	else
2520 		ret = ip6_create_rt_rcu(&res);
2521 
2522 	rcu_read_unlock();
2523 
2524 	trace_fib6_table_lookup(net, rt, table, fl6);
2525 	return ret;
2526 };
2527 
2528 static struct dst_entry *ip6_route_redirect(struct net *net,
2529 					    const struct flowi6 *fl6,
2530 					    const struct sk_buff *skb,
2531 					    const struct in6_addr *gateway)
2532 {
2533 	int flags = RT6_LOOKUP_F_HAS_SADDR;
2534 	struct ip6rd_flowi rdfl;
2535 
2536 	rdfl.fl6 = *fl6;
2537 	rdfl.gateway = *gateway;
2538 
2539 	return fib6_rule_lookup(net, &rdfl.fl6, skb,
2540 				flags, __ip6_route_redirect);
2541 }
2542 
2543 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2544 		  kuid_t uid)
2545 {
2546 	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2547 	struct dst_entry *dst;
2548 	struct flowi6 fl6 = {
2549 		.flowi6_iif = LOOPBACK_IFINDEX,
2550 		.flowi6_oif = oif,
2551 		.flowi6_mark = mark,
2552 		.daddr = iph->daddr,
2553 		.saddr = iph->saddr,
2554 		.flowlabel = ip6_flowinfo(iph),
2555 		.flowi6_uid = uid,
2556 	};
2557 
2558 	dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2559 	rt6_do_redirect(dst, NULL, skb);
2560 	dst_release(dst);
2561 }
2562 EXPORT_SYMBOL_GPL(ip6_redirect);
2563 
2564 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif)
2565 {
2566 	const struct ipv6hdr *iph = ipv6_hdr(skb);
2567 	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2568 	struct dst_entry *dst;
2569 	struct flowi6 fl6 = {
2570 		.flowi6_iif = LOOPBACK_IFINDEX,
2571 		.flowi6_oif = oif,
2572 		.daddr = msg->dest,
2573 		.saddr = iph->daddr,
2574 		.flowi6_uid = sock_net_uid(net, NULL),
2575 	};
2576 
2577 	dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2578 	rt6_do_redirect(dst, NULL, skb);
2579 	dst_release(dst);
2580 }
2581 
2582 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2583 {
2584 	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2585 		     sk->sk_uid);
2586 }
2587 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2588 
2589 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2590 {
2591 	struct net_device *dev = dst->dev;
2592 	unsigned int mtu = dst_mtu(dst);
2593 	struct net *net = dev_net(dev);
2594 
2595 	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2596 
2597 	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2598 		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2599 
2600 	/*
2601 	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2602 	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2603 	 * IPV6_MAXPLEN is also valid and means: "any MSS,
2604 	 * rely only on pmtu discovery"
2605 	 */
2606 	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2607 		mtu = IPV6_MAXPLEN;
2608 	return mtu;
2609 }
2610 
2611 static unsigned int ip6_mtu(const struct dst_entry *dst)
2612 {
2613 	struct inet6_dev *idev;
2614 	unsigned int mtu;
2615 
2616 	mtu = dst_metric_raw(dst, RTAX_MTU);
2617 	if (mtu)
2618 		goto out;
2619 
2620 	mtu = IPV6_MIN_MTU;
2621 
2622 	rcu_read_lock();
2623 	idev = __in6_dev_get(dst->dev);
2624 	if (idev)
2625 		mtu = idev->cnf.mtu6;
2626 	rcu_read_unlock();
2627 
2628 out:
2629 	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2630 
2631 	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2632 }
2633 
2634 /* MTU selection:
2635  * 1. mtu on route is locked - use it
2636  * 2. mtu from nexthop exception
2637  * 3. mtu from egress device
2638  *
2639  * based on ip6_dst_mtu_forward and exception logic of
2640  * rt6_find_cached_rt; called with rcu_read_lock
2641  */
2642 u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
2643 		      struct in6_addr *saddr)
2644 {
2645 	struct rt6_exception_bucket *bucket;
2646 	struct rt6_exception *rt6_ex;
2647 	struct in6_addr *src_key;
2648 	struct inet6_dev *idev;
2649 	u32 mtu = 0;
2650 
2651 	if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
2652 		mtu = f6i->fib6_pmtu;
2653 		if (mtu)
2654 			goto out;
2655 	}
2656 
2657 	src_key = NULL;
2658 #ifdef CONFIG_IPV6_SUBTREES
2659 	if (f6i->fib6_src.plen)
2660 		src_key = saddr;
2661 #endif
2662 
2663 	bucket = rcu_dereference(f6i->rt6i_exception_bucket);
2664 	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
2665 	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
2666 		mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU);
2667 
2668 	if (likely(!mtu)) {
2669 		struct net_device *dev = fib6_info_nh_dev(f6i);
2670 
2671 		mtu = IPV6_MIN_MTU;
2672 		idev = __in6_dev_get(dev);
2673 		if (idev && idev->cnf.mtu6 > mtu)
2674 			mtu = idev->cnf.mtu6;
2675 	}
2676 
2677 	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2678 out:
2679 	return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu);
2680 }
2681 
2682 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2683 				  struct flowi6 *fl6)
2684 {
2685 	struct dst_entry *dst;
2686 	struct rt6_info *rt;
2687 	struct inet6_dev *idev = in6_dev_get(dev);
2688 	struct net *net = dev_net(dev);
2689 
2690 	if (unlikely(!idev))
2691 		return ERR_PTR(-ENODEV);
2692 
2693 	rt = ip6_dst_alloc(net, dev, 0);
2694 	if (unlikely(!rt)) {
2695 		in6_dev_put(idev);
2696 		dst = ERR_PTR(-ENOMEM);
2697 		goto out;
2698 	}
2699 
2700 	rt->dst.flags |= DST_HOST;
2701 	rt->dst.input = ip6_input;
2702 	rt->dst.output  = ip6_output;
2703 	rt->rt6i_gateway  = fl6->daddr;
2704 	rt->rt6i_dst.addr = fl6->daddr;
2705 	rt->rt6i_dst.plen = 128;
2706 	rt->rt6i_idev     = idev;
2707 	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2708 
2709 	/* Add this dst into uncached_list so that rt6_disable_ip() can
2710 	 * do proper release of the net_device
2711 	 */
2712 	rt6_uncached_list_add(rt);
2713 	atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2714 
2715 	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2716 
2717 out:
2718 	return dst;
2719 }
2720 
2721 static int ip6_dst_gc(struct dst_ops *ops)
2722 {
2723 	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2724 	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2725 	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2726 	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2727 	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2728 	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2729 	int entries;
2730 
2731 	entries = dst_entries_get_fast(ops);
2732 	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2733 	    entries <= rt_max_size)
2734 		goto out;
2735 
2736 	net->ipv6.ip6_rt_gc_expire++;
2737 	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2738 	entries = dst_entries_get_slow(ops);
2739 	if (entries < ops->gc_thresh)
2740 		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2741 out:
2742 	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2743 	return entries > rt_max_size;
2744 }
2745 
2746 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2747 					    struct fib6_config *cfg,
2748 					    const struct in6_addr *gw_addr,
2749 					    u32 tbid, int flags)
2750 {
2751 	struct flowi6 fl6 = {
2752 		.flowi6_oif = cfg->fc_ifindex,
2753 		.daddr = *gw_addr,
2754 		.saddr = cfg->fc_prefsrc,
2755 	};
2756 	struct fib6_table *table;
2757 	struct rt6_info *rt;
2758 
2759 	table = fib6_get_table(net, tbid);
2760 	if (!table)
2761 		return NULL;
2762 
2763 	if (!ipv6_addr_any(&cfg->fc_prefsrc))
2764 		flags |= RT6_LOOKUP_F_HAS_SADDR;
2765 
2766 	flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2767 	rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2768 
2769 	/* if table lookup failed, fall back to full lookup */
2770 	if (rt == net->ipv6.ip6_null_entry) {
2771 		ip6_rt_put(rt);
2772 		rt = NULL;
2773 	}
2774 
2775 	return rt;
2776 }
2777 
2778 static int ip6_route_check_nh_onlink(struct net *net,
2779 				     struct fib6_config *cfg,
2780 				     const struct net_device *dev,
2781 				     struct netlink_ext_ack *extack)
2782 {
2783 	u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2784 	const struct in6_addr *gw_addr = &cfg->fc_gateway;
2785 	u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2786 	struct fib6_info *from;
2787 	struct rt6_info *grt;
2788 	int err;
2789 
2790 	err = 0;
2791 	grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2792 	if (grt) {
2793 		rcu_read_lock();
2794 		from = rcu_dereference(grt->from);
2795 		if (!grt->dst.error &&
2796 		    /* ignore match if it is the default route */
2797 		    from && !ipv6_addr_any(&from->fib6_dst.addr) &&
2798 		    (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2799 			NL_SET_ERR_MSG(extack,
2800 				       "Nexthop has invalid gateway or device mismatch");
2801 			err = -EINVAL;
2802 		}
2803 		rcu_read_unlock();
2804 
2805 		ip6_rt_put(grt);
2806 	}
2807 
2808 	return err;
2809 }
2810 
2811 static int ip6_route_check_nh(struct net *net,
2812 			      struct fib6_config *cfg,
2813 			      struct net_device **_dev,
2814 			      struct inet6_dev **idev)
2815 {
2816 	const struct in6_addr *gw_addr = &cfg->fc_gateway;
2817 	struct net_device *dev = _dev ? *_dev : NULL;
2818 	struct rt6_info *grt = NULL;
2819 	int err = -EHOSTUNREACH;
2820 
2821 	if (cfg->fc_table) {
2822 		int flags = RT6_LOOKUP_F_IFACE;
2823 
2824 		grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2825 					  cfg->fc_table, flags);
2826 		if (grt) {
2827 			if (grt->rt6i_flags & RTF_GATEWAY ||
2828 			    (dev && dev != grt->dst.dev)) {
2829 				ip6_rt_put(grt);
2830 				grt = NULL;
2831 			}
2832 		}
2833 	}
2834 
2835 	if (!grt)
2836 		grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
2837 
2838 	if (!grt)
2839 		goto out;
2840 
2841 	if (dev) {
2842 		if (dev != grt->dst.dev) {
2843 			ip6_rt_put(grt);
2844 			goto out;
2845 		}
2846 	} else {
2847 		*_dev = dev = grt->dst.dev;
2848 		*idev = grt->rt6i_idev;
2849 		dev_hold(dev);
2850 		in6_dev_hold(grt->rt6i_idev);
2851 	}
2852 
2853 	if (!(grt->rt6i_flags & RTF_GATEWAY))
2854 		err = 0;
2855 
2856 	ip6_rt_put(grt);
2857 
2858 out:
2859 	return err;
2860 }
2861 
2862 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
2863 			   struct net_device **_dev, struct inet6_dev **idev,
2864 			   struct netlink_ext_ack *extack)
2865 {
2866 	const struct in6_addr *gw_addr = &cfg->fc_gateway;
2867 	int gwa_type = ipv6_addr_type(gw_addr);
2868 	bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
2869 	const struct net_device *dev = *_dev;
2870 	bool need_addr_check = !dev;
2871 	int err = -EINVAL;
2872 
2873 	/* if gw_addr is local we will fail to detect this in case
2874 	 * address is still TENTATIVE (DAD in progress). rt6_lookup()
2875 	 * will return already-added prefix route via interface that
2876 	 * prefix route was assigned to, which might be non-loopback.
2877 	 */
2878 	if (dev &&
2879 	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2880 		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2881 		goto out;
2882 	}
2883 
2884 	if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
2885 		/* IPv6 strictly inhibits using not link-local
2886 		 * addresses as nexthop address.
2887 		 * Otherwise, router will not able to send redirects.
2888 		 * It is very good, but in some (rare!) circumstances
2889 		 * (SIT, PtP, NBMA NOARP links) it is handy to allow
2890 		 * some exceptions. --ANK
2891 		 * We allow IPv4-mapped nexthops to support RFC4798-type
2892 		 * addressing
2893 		 */
2894 		if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
2895 			NL_SET_ERR_MSG(extack, "Invalid gateway address");
2896 			goto out;
2897 		}
2898 
2899 		if (cfg->fc_flags & RTNH_F_ONLINK)
2900 			err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
2901 		else
2902 			err = ip6_route_check_nh(net, cfg, _dev, idev);
2903 
2904 		if (err)
2905 			goto out;
2906 	}
2907 
2908 	/* reload in case device was changed */
2909 	dev = *_dev;
2910 
2911 	err = -EINVAL;
2912 	if (!dev) {
2913 		NL_SET_ERR_MSG(extack, "Egress device not specified");
2914 		goto out;
2915 	} else if (dev->flags & IFF_LOOPBACK) {
2916 		NL_SET_ERR_MSG(extack,
2917 			       "Egress device can not be loopback device for this route");
2918 		goto out;
2919 	}
2920 
2921 	/* if we did not check gw_addr above, do so now that the
2922 	 * egress device has been resolved.
2923 	 */
2924 	if (need_addr_check &&
2925 	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2926 		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2927 		goto out;
2928 	}
2929 
2930 	err = 0;
2931 out:
2932 	return err;
2933 }
2934 
2935 static bool fib6_is_reject(u32 flags, struct net_device *dev, int addr_type)
2936 {
2937 	if ((flags & RTF_REJECT) ||
2938 	    (dev && (dev->flags & IFF_LOOPBACK) &&
2939 	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
2940 	     !(flags & RTF_LOCAL)))
2941 		return true;
2942 
2943 	return false;
2944 }
2945 
2946 int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
2947 		 struct fib6_config *cfg, gfp_t gfp_flags,
2948 		 struct netlink_ext_ack *extack)
2949 {
2950 	struct net_device *dev = NULL;
2951 	struct inet6_dev *idev = NULL;
2952 	int addr_type;
2953 	int err;
2954 
2955 	fib6_nh->fib_nh_family = AF_INET6;
2956 
2957 	err = -ENODEV;
2958 	if (cfg->fc_ifindex) {
2959 		dev = dev_get_by_index(net, cfg->fc_ifindex);
2960 		if (!dev)
2961 			goto out;
2962 		idev = in6_dev_get(dev);
2963 		if (!idev)
2964 			goto out;
2965 	}
2966 
2967 	if (cfg->fc_flags & RTNH_F_ONLINK) {
2968 		if (!dev) {
2969 			NL_SET_ERR_MSG(extack,
2970 				       "Nexthop device required for onlink");
2971 			goto out;
2972 		}
2973 
2974 		if (!(dev->flags & IFF_UP)) {
2975 			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
2976 			err = -ENETDOWN;
2977 			goto out;
2978 		}
2979 
2980 		fib6_nh->fib_nh_flags |= RTNH_F_ONLINK;
2981 	}
2982 
2983 	fib6_nh->fib_nh_weight = 1;
2984 
2985 	/* We cannot add true routes via loopback here,
2986 	 * they would result in kernel looping; promote them to reject routes
2987 	 */
2988 	addr_type = ipv6_addr_type(&cfg->fc_dst);
2989 	if (fib6_is_reject(cfg->fc_flags, dev, addr_type)) {
2990 		/* hold loopback dev/idev if we haven't done so. */
2991 		if (dev != net->loopback_dev) {
2992 			if (dev) {
2993 				dev_put(dev);
2994 				in6_dev_put(idev);
2995 			}
2996 			dev = net->loopback_dev;
2997 			dev_hold(dev);
2998 			idev = in6_dev_get(dev);
2999 			if (!idev) {
3000 				err = -ENODEV;
3001 				goto out;
3002 			}
3003 		}
3004 		goto set_dev;
3005 	}
3006 
3007 	if (cfg->fc_flags & RTF_GATEWAY) {
3008 		err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
3009 		if (err)
3010 			goto out;
3011 
3012 		fib6_nh->fib_nh_gw6 = cfg->fc_gateway;
3013 		fib6_nh->fib_nh_gw_family = AF_INET6;
3014 	}
3015 
3016 	err = -ENODEV;
3017 	if (!dev)
3018 		goto out;
3019 
3020 	if (idev->cnf.disable_ipv6) {
3021 		NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
3022 		err = -EACCES;
3023 		goto out;
3024 	}
3025 
3026 	if (!(dev->flags & IFF_UP) && !cfg->fc_ignore_dev_down) {
3027 		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3028 		err = -ENETDOWN;
3029 		goto out;
3030 	}
3031 
3032 	if (!(cfg->fc_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
3033 	    !netif_carrier_ok(dev))
3034 		fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;
3035 
3036 	err = fib_nh_common_init(&fib6_nh->nh_common, cfg->fc_encap,
3037 				 cfg->fc_encap_type, cfg, gfp_flags, extack);
3038 	if (err)
3039 		goto out;
3040 set_dev:
3041 	fib6_nh->fib_nh_dev = dev;
3042 	fib6_nh->fib_nh_oif = dev->ifindex;
3043 	err = 0;
3044 out:
3045 	if (idev)
3046 		in6_dev_put(idev);
3047 
3048 	if (err) {
3049 		lwtstate_put(fib6_nh->fib_nh_lws);
3050 		fib6_nh->fib_nh_lws = NULL;
3051 		if (dev)
3052 			dev_put(dev);
3053 	}
3054 
3055 	return err;
3056 }
3057 
3058 void fib6_nh_release(struct fib6_nh *fib6_nh)
3059 {
3060 	fib_nh_common_release(&fib6_nh->nh_common);
3061 }
3062 
3063 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
3064 					      gfp_t gfp_flags,
3065 					      struct netlink_ext_ack *extack)
3066 {
3067 	struct net *net = cfg->fc_nlinfo.nl_net;
3068 	struct fib6_info *rt = NULL;
3069 	struct fib6_table *table;
3070 	int err = -EINVAL;
3071 	int addr_type;
3072 
3073 	/* RTF_PCPU is an internal flag; can not be set by userspace */
3074 	if (cfg->fc_flags & RTF_PCPU) {
3075 		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
3076 		goto out;
3077 	}
3078 
3079 	/* RTF_CACHE is an internal flag; can not be set by userspace */
3080 	if (cfg->fc_flags & RTF_CACHE) {
3081 		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
3082 		goto out;
3083 	}
3084 
3085 	if (cfg->fc_type > RTN_MAX) {
3086 		NL_SET_ERR_MSG(extack, "Invalid route type");
3087 		goto out;
3088 	}
3089 
3090 	if (cfg->fc_dst_len > 128) {
3091 		NL_SET_ERR_MSG(extack, "Invalid prefix length");
3092 		goto out;
3093 	}
3094 	if (cfg->fc_src_len > 128) {
3095 		NL_SET_ERR_MSG(extack, "Invalid source address length");
3096 		goto out;
3097 	}
3098 #ifndef CONFIG_IPV6_SUBTREES
3099 	if (cfg->fc_src_len) {
3100 		NL_SET_ERR_MSG(extack,
3101 			       "Specifying source address requires IPV6_SUBTREES to be enabled");
3102 		goto out;
3103 	}
3104 #endif
3105 
3106 	err = -ENOBUFS;
3107 	if (cfg->fc_nlinfo.nlh &&
3108 	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
3109 		table = fib6_get_table(net, cfg->fc_table);
3110 		if (!table) {
3111 			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
3112 			table = fib6_new_table(net, cfg->fc_table);
3113 		}
3114 	} else {
3115 		table = fib6_new_table(net, cfg->fc_table);
3116 	}
3117 
3118 	if (!table)
3119 		goto out;
3120 
3121 	err = -ENOMEM;
3122 	rt = fib6_info_alloc(gfp_flags);
3123 	if (!rt)
3124 		goto out;
3125 
3126 	rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len,
3127 					       extack);
3128 	if (IS_ERR(rt->fib6_metrics)) {
3129 		err = PTR_ERR(rt->fib6_metrics);
3130 		/* Do not leave garbage there. */
3131 		rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
3132 		goto out;
3133 	}
3134 
3135 	if (cfg->fc_flags & RTF_ADDRCONF)
3136 		rt->dst_nocount = true;
3137 
3138 	if (cfg->fc_flags & RTF_EXPIRES)
3139 		fib6_set_expires(rt, jiffies +
3140 				clock_t_to_jiffies(cfg->fc_expires));
3141 	else
3142 		fib6_clean_expires(rt);
3143 
3144 	if (cfg->fc_protocol == RTPROT_UNSPEC)
3145 		cfg->fc_protocol = RTPROT_BOOT;
3146 	rt->fib6_protocol = cfg->fc_protocol;
3147 
3148 	rt->fib6_table = table;
3149 	rt->fib6_metric = cfg->fc_metric;
3150 	rt->fib6_type = cfg->fc_type;
3151 	rt->fib6_flags = cfg->fc_flags & ~RTF_GATEWAY;
3152 
3153 	ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
3154 	rt->fib6_dst.plen = cfg->fc_dst_len;
3155 	if (rt->fib6_dst.plen == 128)
3156 		rt->dst_host = true;
3157 
3158 #ifdef CONFIG_IPV6_SUBTREES
3159 	ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
3160 	rt->fib6_src.plen = cfg->fc_src_len;
3161 #endif
3162 	err = fib6_nh_init(net, &rt->fib6_nh, cfg, gfp_flags, extack);
3163 	if (err)
3164 		goto out;
3165 
3166 	/* We cannot add true routes via loopback here,
3167 	 * they would result in kernel looping; promote them to reject routes
3168 	 */
3169 	addr_type = ipv6_addr_type(&cfg->fc_dst);
3170 	if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh.fib_nh_dev, addr_type))
3171 		rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP;
3172 
3173 	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
3174 		struct net_device *dev = fib6_info_nh_dev(rt);
3175 
3176 		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
3177 			NL_SET_ERR_MSG(extack, "Invalid source address");
3178 			err = -EINVAL;
3179 			goto out;
3180 		}
3181 		rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
3182 		rt->fib6_prefsrc.plen = 128;
3183 	} else
3184 		rt->fib6_prefsrc.plen = 0;
3185 
3186 	return rt;
3187 out:
3188 	fib6_info_release(rt);
3189 	return ERR_PTR(err);
3190 }
3191 
3192 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3193 		  struct netlink_ext_ack *extack)
3194 {
3195 	struct fib6_info *rt;
3196 	int err;
3197 
3198 	rt = ip6_route_info_create(cfg, gfp_flags, extack);
3199 	if (IS_ERR(rt))
3200 		return PTR_ERR(rt);
3201 
3202 	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3203 	fib6_info_release(rt);
3204 
3205 	return err;
3206 }
3207 
3208 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3209 {
3210 	struct net *net = info->nl_net;
3211 	struct fib6_table *table;
3212 	int err;
3213 
3214 	if (rt == net->ipv6.fib6_null_entry) {
3215 		err = -ENOENT;
3216 		goto out;
3217 	}
3218 
3219 	table = rt->fib6_table;
3220 	spin_lock_bh(&table->tb6_lock);
3221 	err = fib6_del(rt, info);
3222 	spin_unlock_bh(&table->tb6_lock);
3223 
3224 out:
3225 	fib6_info_release(rt);
3226 	return err;
3227 }
3228 
3229 int ip6_del_rt(struct net *net, struct fib6_info *rt)
3230 {
3231 	struct nl_info info = { .nl_net = net };
3232 
3233 	return __ip6_del_rt(rt, &info);
3234 }
3235 
3236 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
3237 {
3238 	struct nl_info *info = &cfg->fc_nlinfo;
3239 	struct net *net = info->nl_net;
3240 	struct sk_buff *skb = NULL;
3241 	struct fib6_table *table;
3242 	int err = -ENOENT;
3243 
3244 	if (rt == net->ipv6.fib6_null_entry)
3245 		goto out_put;
3246 	table = rt->fib6_table;
3247 	spin_lock_bh(&table->tb6_lock);
3248 
3249 	if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
3250 		struct fib6_info *sibling, *next_sibling;
3251 
3252 		/* prefer to send a single notification with all hops */
3253 		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3254 		if (skb) {
3255 			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3256 
3257 			if (rt6_fill_node(net, skb, rt, NULL,
3258 					  NULL, NULL, 0, RTM_DELROUTE,
3259 					  info->portid, seq, 0) < 0) {
3260 				kfree_skb(skb);
3261 				skb = NULL;
3262 			} else
3263 				info->skip_notify = 1;
3264 		}
3265 
3266 		list_for_each_entry_safe(sibling, next_sibling,
3267 					 &rt->fib6_siblings,
3268 					 fib6_siblings) {
3269 			err = fib6_del(sibling, info);
3270 			if (err)
3271 				goto out_unlock;
3272 		}
3273 	}
3274 
3275 	err = fib6_del(rt, info);
3276 out_unlock:
3277 	spin_unlock_bh(&table->tb6_lock);
3278 out_put:
3279 	fib6_info_release(rt);
3280 
3281 	if (skb) {
3282 		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3283 			    info->nlh, gfp_any());
3284 	}
3285 	return err;
3286 }
3287 
3288 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3289 {
3290 	int rc = -ESRCH;
3291 
3292 	if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3293 		goto out;
3294 
3295 	if (cfg->fc_flags & RTF_GATEWAY &&
3296 	    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3297 		goto out;
3298 
3299 	rc = rt6_remove_exception_rt(rt);
3300 out:
3301 	return rc;
3302 }
3303 
3304 static int ip6_route_del(struct fib6_config *cfg,
3305 			 struct netlink_ext_ack *extack)
3306 {
3307 	struct rt6_info *rt_cache;
3308 	struct fib6_table *table;
3309 	struct fib6_info *rt;
3310 	struct fib6_node *fn;
3311 	int err = -ESRCH;
3312 
3313 	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3314 	if (!table) {
3315 		NL_SET_ERR_MSG(extack, "FIB table does not exist");
3316 		return err;
3317 	}
3318 
3319 	rcu_read_lock();
3320 
3321 	fn = fib6_locate(&table->tb6_root,
3322 			 &cfg->fc_dst, cfg->fc_dst_len,
3323 			 &cfg->fc_src, cfg->fc_src_len,
3324 			 !(cfg->fc_flags & RTF_CACHE));
3325 
3326 	if (fn) {
3327 		for_each_fib6_node_rt_rcu(fn) {
3328 			struct fib6_nh *nh;
3329 
3330 			if (cfg->fc_flags & RTF_CACHE) {
3331 				struct fib6_result res = {
3332 					.f6i = rt,
3333 				};
3334 				int rc;
3335 
3336 				rt_cache = rt6_find_cached_rt(&res,
3337 							      &cfg->fc_dst,
3338 							      &cfg->fc_src);
3339 				if (rt_cache) {
3340 					rc = ip6_del_cached_rt(rt_cache, cfg);
3341 					if (rc != -ESRCH) {
3342 						rcu_read_unlock();
3343 						return rc;
3344 					}
3345 				}
3346 				continue;
3347 			}
3348 
3349 			nh = &rt->fib6_nh;
3350 			if (cfg->fc_ifindex &&
3351 			    (!nh->fib_nh_dev ||
3352 			     nh->fib_nh_dev->ifindex != cfg->fc_ifindex))
3353 				continue;
3354 			if (cfg->fc_flags & RTF_GATEWAY &&
3355 			    !ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6))
3356 				continue;
3357 			if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
3358 				continue;
3359 			if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
3360 				continue;
3361 			if (!fib6_info_hold_safe(rt))
3362 				continue;
3363 			rcu_read_unlock();
3364 
3365 			/* if gateway was specified only delete the one hop */
3366 			if (cfg->fc_flags & RTF_GATEWAY)
3367 				return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3368 
3369 			return __ip6_del_rt_siblings(rt, cfg);
3370 		}
3371 	}
3372 	rcu_read_unlock();
3373 
3374 	return err;
3375 }
3376 
3377 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3378 {
3379 	struct netevent_redirect netevent;
3380 	struct rt6_info *rt, *nrt = NULL;
3381 	struct fib6_result res = {};
3382 	struct ndisc_options ndopts;
3383 	struct inet6_dev *in6_dev;
3384 	struct neighbour *neigh;
3385 	struct rd_msg *msg;
3386 	int optlen, on_link;
3387 	u8 *lladdr;
3388 
3389 	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3390 	optlen -= sizeof(*msg);
3391 
3392 	if (optlen < 0) {
3393 		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3394 		return;
3395 	}
3396 
3397 	msg = (struct rd_msg *)icmp6_hdr(skb);
3398 
3399 	if (ipv6_addr_is_multicast(&msg->dest)) {
3400 		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3401 		return;
3402 	}
3403 
3404 	on_link = 0;
3405 	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3406 		on_link = 1;
3407 	} else if (ipv6_addr_type(&msg->target) !=
3408 		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3409 		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3410 		return;
3411 	}
3412 
3413 	in6_dev = __in6_dev_get(skb->dev);
3414 	if (!in6_dev)
3415 		return;
3416 	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3417 		return;
3418 
3419 	/* RFC2461 8.1:
3420 	 *	The IP source address of the Redirect MUST be the same as the current
3421 	 *	first-hop router for the specified ICMP Destination Address.
3422 	 */
3423 
3424 	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3425 		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3426 		return;
3427 	}
3428 
3429 	lladdr = NULL;
3430 	if (ndopts.nd_opts_tgt_lladdr) {
3431 		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3432 					     skb->dev);
3433 		if (!lladdr) {
3434 			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3435 			return;
3436 		}
3437 	}
3438 
3439 	rt = (struct rt6_info *) dst;
3440 	if (rt->rt6i_flags & RTF_REJECT) {
3441 		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3442 		return;
3443 	}
3444 
3445 	/* Redirect received -> path was valid.
3446 	 * Look, redirects are sent only in response to data packets,
3447 	 * so that this nexthop apparently is reachable. --ANK
3448 	 */
3449 	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3450 
3451 	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3452 	if (!neigh)
3453 		return;
3454 
3455 	/*
3456 	 *	We have finally decided to accept it.
3457 	 */
3458 
3459 	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3460 		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
3461 		     NEIGH_UPDATE_F_OVERRIDE|
3462 		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3463 				     NEIGH_UPDATE_F_ISROUTER)),
3464 		     NDISC_REDIRECT, &ndopts);
3465 
3466 	rcu_read_lock();
3467 	res.f6i = rcu_dereference(rt->from);
3468 	/* This fib6_info_hold() is safe here because we hold reference to rt
3469 	 * and rt already holds reference to fib6_info.
3470 	 */
3471 	fib6_info_hold(res.f6i);
3472 	rcu_read_unlock();
3473 
3474 	res.nh = &res.f6i->fib6_nh;
3475 	nrt = ip6_rt_cache_alloc(&res, &msg->dest, NULL);
3476 	if (!nrt)
3477 		goto out;
3478 
3479 	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3480 	if (on_link)
3481 		nrt->rt6i_flags &= ~RTF_GATEWAY;
3482 
3483 	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3484 
3485 	/* No need to remove rt from the exception table if rt is
3486 	 * a cached route because rt6_insert_exception() will
3487 	 * takes care of it
3488 	 */
3489 	if (rt6_insert_exception(nrt, res.f6i)) {
3490 		dst_release_immediate(&nrt->dst);
3491 		goto out;
3492 	}
3493 
3494 	netevent.old = &rt->dst;
3495 	netevent.new = &nrt->dst;
3496 	netevent.daddr = &msg->dest;
3497 	netevent.neigh = neigh;
3498 	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3499 
3500 out:
3501 	fib6_info_release(res.f6i);
3502 	neigh_release(neigh);
3503 }
3504 
3505 #ifdef CONFIG_IPV6_ROUTE_INFO
3506 static struct fib6_info *rt6_get_route_info(struct net *net,
3507 					   const struct in6_addr *prefix, int prefixlen,
3508 					   const struct in6_addr *gwaddr,
3509 					   struct net_device *dev)
3510 {
3511 	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3512 	int ifindex = dev->ifindex;
3513 	struct fib6_node *fn;
3514 	struct fib6_info *rt = NULL;
3515 	struct fib6_table *table;
3516 
3517 	table = fib6_get_table(net, tb_id);
3518 	if (!table)
3519 		return NULL;
3520 
3521 	rcu_read_lock();
3522 	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3523 	if (!fn)
3524 		goto out;
3525 
3526 	for_each_fib6_node_rt_rcu(fn) {
3527 		if (rt->fib6_nh.fib_nh_dev->ifindex != ifindex)
3528 			continue;
3529 		if (!(rt->fib6_flags & RTF_ROUTEINFO) ||
3530 		    !rt->fib6_nh.fib_nh_gw_family)
3531 			continue;
3532 		if (!ipv6_addr_equal(&rt->fib6_nh.fib_nh_gw6, gwaddr))
3533 			continue;
3534 		if (!fib6_info_hold_safe(rt))
3535 			continue;
3536 		break;
3537 	}
3538 out:
3539 	rcu_read_unlock();
3540 	return rt;
3541 }
3542 
3543 static struct fib6_info *rt6_add_route_info(struct net *net,
3544 					   const struct in6_addr *prefix, int prefixlen,
3545 					   const struct in6_addr *gwaddr,
3546 					   struct net_device *dev,
3547 					   unsigned int pref)
3548 {
3549 	struct fib6_config cfg = {
3550 		.fc_metric	= IP6_RT_PRIO_USER,
3551 		.fc_ifindex	= dev->ifindex,
3552 		.fc_dst_len	= prefixlen,
3553 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3554 				  RTF_UP | RTF_PREF(pref),
3555 		.fc_protocol = RTPROT_RA,
3556 		.fc_type = RTN_UNICAST,
3557 		.fc_nlinfo.portid = 0,
3558 		.fc_nlinfo.nlh = NULL,
3559 		.fc_nlinfo.nl_net = net,
3560 	};
3561 
3562 	cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3563 	cfg.fc_dst = *prefix;
3564 	cfg.fc_gateway = *gwaddr;
3565 
3566 	/* We should treat it as a default route if prefix length is 0. */
3567 	if (!prefixlen)
3568 		cfg.fc_flags |= RTF_DEFAULT;
3569 
3570 	ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3571 
3572 	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3573 }
3574 #endif
3575 
3576 struct fib6_info *rt6_get_dflt_router(struct net *net,
3577 				     const struct in6_addr *addr,
3578 				     struct net_device *dev)
3579 {
3580 	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3581 	struct fib6_info *rt;
3582 	struct fib6_table *table;
3583 
3584 	table = fib6_get_table(net, tb_id);
3585 	if (!table)
3586 		return NULL;
3587 
3588 	rcu_read_lock();
3589 	for_each_fib6_node_rt_rcu(&table->tb6_root) {
3590 		struct fib6_nh *nh = &rt->fib6_nh;
3591 
3592 		if (dev == nh->fib_nh_dev &&
3593 		    ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3594 		    ipv6_addr_equal(&nh->fib_nh_gw6, addr))
3595 			break;
3596 	}
3597 	if (rt && !fib6_info_hold_safe(rt))
3598 		rt = NULL;
3599 	rcu_read_unlock();
3600 	return rt;
3601 }
3602 
3603 struct fib6_info *rt6_add_dflt_router(struct net *net,
3604 				     const struct in6_addr *gwaddr,
3605 				     struct net_device *dev,
3606 				     unsigned int pref)
3607 {
3608 	struct fib6_config cfg = {
3609 		.fc_table	= l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3610 		.fc_metric	= IP6_RT_PRIO_USER,
3611 		.fc_ifindex	= dev->ifindex,
3612 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3613 				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3614 		.fc_protocol = RTPROT_RA,
3615 		.fc_type = RTN_UNICAST,
3616 		.fc_nlinfo.portid = 0,
3617 		.fc_nlinfo.nlh = NULL,
3618 		.fc_nlinfo.nl_net = net,
3619 	};
3620 
3621 	cfg.fc_gateway = *gwaddr;
3622 
3623 	if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3624 		struct fib6_table *table;
3625 
3626 		table = fib6_get_table(dev_net(dev), cfg.fc_table);
3627 		if (table)
3628 			table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3629 	}
3630 
3631 	return rt6_get_dflt_router(net, gwaddr, dev);
3632 }
3633 
3634 static void __rt6_purge_dflt_routers(struct net *net,
3635 				     struct fib6_table *table)
3636 {
3637 	struct fib6_info *rt;
3638 
3639 restart:
3640 	rcu_read_lock();
3641 	for_each_fib6_node_rt_rcu(&table->tb6_root) {
3642 		struct net_device *dev = fib6_info_nh_dev(rt);
3643 		struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;
3644 
3645 		if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3646 		    (!idev || idev->cnf.accept_ra != 2) &&
3647 		    fib6_info_hold_safe(rt)) {
3648 			rcu_read_unlock();
3649 			ip6_del_rt(net, rt);
3650 			goto restart;
3651 		}
3652 	}
3653 	rcu_read_unlock();
3654 
3655 	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3656 }
3657 
3658 void rt6_purge_dflt_routers(struct net *net)
3659 {
3660 	struct fib6_table *table;
3661 	struct hlist_head *head;
3662 	unsigned int h;
3663 
3664 	rcu_read_lock();
3665 
3666 	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3667 		head = &net->ipv6.fib_table_hash[h];
3668 		hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3669 			if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3670 				__rt6_purge_dflt_routers(net, table);
3671 		}
3672 	}
3673 
3674 	rcu_read_unlock();
3675 }
3676 
3677 static void rtmsg_to_fib6_config(struct net *net,
3678 				 struct in6_rtmsg *rtmsg,
3679 				 struct fib6_config *cfg)
3680 {
3681 	*cfg = (struct fib6_config){
3682 		.fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3683 			 : RT6_TABLE_MAIN,
3684 		.fc_ifindex = rtmsg->rtmsg_ifindex,
3685 		.fc_metric = rtmsg->rtmsg_metric ? : IP6_RT_PRIO_USER,
3686 		.fc_expires = rtmsg->rtmsg_info,
3687 		.fc_dst_len = rtmsg->rtmsg_dst_len,
3688 		.fc_src_len = rtmsg->rtmsg_src_len,
3689 		.fc_flags = rtmsg->rtmsg_flags,
3690 		.fc_type = rtmsg->rtmsg_type,
3691 
3692 		.fc_nlinfo.nl_net = net,
3693 
3694 		.fc_dst = rtmsg->rtmsg_dst,
3695 		.fc_src = rtmsg->rtmsg_src,
3696 		.fc_gateway = rtmsg->rtmsg_gateway,
3697 	};
3698 }
3699 
3700 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3701 {
3702 	struct fib6_config cfg;
3703 	struct in6_rtmsg rtmsg;
3704 	int err;
3705 
3706 	switch (cmd) {
3707 	case SIOCADDRT:		/* Add a route */
3708 	case SIOCDELRT:		/* Delete a route */
3709 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3710 			return -EPERM;
3711 		err = copy_from_user(&rtmsg, arg,
3712 				     sizeof(struct in6_rtmsg));
3713 		if (err)
3714 			return -EFAULT;
3715 
3716 		rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3717 
3718 		rtnl_lock();
3719 		switch (cmd) {
3720 		case SIOCADDRT:
3721 			err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3722 			break;
3723 		case SIOCDELRT:
3724 			err = ip6_route_del(&cfg, NULL);
3725 			break;
3726 		default:
3727 			err = -EINVAL;
3728 		}
3729 		rtnl_unlock();
3730 
3731 		return err;
3732 	}
3733 
3734 	return -EINVAL;
3735 }
3736 
3737 /*
3738  *	Drop the packet on the floor
3739  */
3740 
3741 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3742 {
3743 	int type;
3744 	struct dst_entry *dst = skb_dst(skb);
3745 	switch (ipstats_mib_noroutes) {
3746 	case IPSTATS_MIB_INNOROUTES:
3747 		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3748 		if (type == IPV6_ADDR_ANY) {
3749 			IP6_INC_STATS(dev_net(dst->dev),
3750 				      __in6_dev_get_safely(skb->dev),
3751 				      IPSTATS_MIB_INADDRERRORS);
3752 			break;
3753 		}
3754 		/* FALLTHROUGH */
3755 	case IPSTATS_MIB_OUTNOROUTES:
3756 		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3757 			      ipstats_mib_noroutes);
3758 		break;
3759 	}
3760 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3761 	kfree_skb(skb);
3762 	return 0;
3763 }
3764 
3765 static int ip6_pkt_discard(struct sk_buff *skb)
3766 {
3767 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3768 }
3769 
3770 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3771 {
3772 	skb->dev = skb_dst(skb)->dev;
3773 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3774 }
3775 
3776 static int ip6_pkt_prohibit(struct sk_buff *skb)
3777 {
3778 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3779 }
3780 
3781 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3782 {
3783 	skb->dev = skb_dst(skb)->dev;
3784 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3785 }
3786 
3787 /*
3788  *	Allocate a dst for local (unicast / anycast) address.
3789  */
3790 
3791 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3792 				     struct inet6_dev *idev,
3793 				     const struct in6_addr *addr,
3794 				     bool anycast, gfp_t gfp_flags)
3795 {
3796 	struct fib6_config cfg = {
3797 		.fc_table = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL,
3798 		.fc_ifindex = idev->dev->ifindex,
3799 		.fc_flags = RTF_UP | RTF_ADDRCONF | RTF_NONEXTHOP,
3800 		.fc_dst = *addr,
3801 		.fc_dst_len = 128,
3802 		.fc_protocol = RTPROT_KERNEL,
3803 		.fc_nlinfo.nl_net = net,
3804 		.fc_ignore_dev_down = true,
3805 	};
3806 
3807 	if (anycast) {
3808 		cfg.fc_type = RTN_ANYCAST;
3809 		cfg.fc_flags |= RTF_ANYCAST;
3810 	} else {
3811 		cfg.fc_type = RTN_LOCAL;
3812 		cfg.fc_flags |= RTF_LOCAL;
3813 	}
3814 
3815 	return ip6_route_info_create(&cfg, gfp_flags, NULL);
3816 }
3817 
3818 /* remove deleted ip from prefsrc entries */
3819 struct arg_dev_net_ip {
3820 	struct net_device *dev;
3821 	struct net *net;
3822 	struct in6_addr *addr;
3823 };
3824 
3825 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3826 {
3827 	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3828 	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3829 	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3830 
3831 	if (((void *)rt->fib6_nh.fib_nh_dev == dev || !dev) &&
3832 	    rt != net->ipv6.fib6_null_entry &&
3833 	    ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3834 		spin_lock_bh(&rt6_exception_lock);
3835 		/* remove prefsrc entry */
3836 		rt->fib6_prefsrc.plen = 0;
3837 		spin_unlock_bh(&rt6_exception_lock);
3838 	}
3839 	return 0;
3840 }
3841 
3842 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3843 {
3844 	struct net *net = dev_net(ifp->idev->dev);
3845 	struct arg_dev_net_ip adni = {
3846 		.dev = ifp->idev->dev,
3847 		.net = net,
3848 		.addr = &ifp->addr,
3849 	};
3850 	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3851 }
3852 
3853 #define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT)
3854 
3855 /* Remove routers and update dst entries when gateway turn into host. */
3856 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3857 {
3858 	struct in6_addr *gateway = (struct in6_addr *)arg;
3859 
3860 	if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3861 	    rt->fib6_nh.fib_nh_gw_family &&
3862 	    ipv6_addr_equal(gateway, &rt->fib6_nh.fib_nh_gw6)) {
3863 		return -1;
3864 	}
3865 
3866 	/* Further clean up cached routes in exception table.
3867 	 * This is needed because cached route may have a different
3868 	 * gateway than its 'parent' in the case of an ip redirect.
3869 	 */
3870 	rt6_exceptions_clean_tohost(rt, gateway);
3871 
3872 	return 0;
3873 }
3874 
3875 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3876 {
3877 	fib6_clean_all(net, fib6_clean_tohost, gateway);
3878 }
3879 
3880 struct arg_netdev_event {
3881 	const struct net_device *dev;
3882 	union {
3883 		unsigned int nh_flags;
3884 		unsigned long event;
3885 	};
3886 };
3887 
3888 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
3889 {
3890 	struct fib6_info *iter;
3891 	struct fib6_node *fn;
3892 
3893 	fn = rcu_dereference_protected(rt->fib6_node,
3894 			lockdep_is_held(&rt->fib6_table->tb6_lock));
3895 	iter = rcu_dereference_protected(fn->leaf,
3896 			lockdep_is_held(&rt->fib6_table->tb6_lock));
3897 	while (iter) {
3898 		if (iter->fib6_metric == rt->fib6_metric &&
3899 		    rt6_qualify_for_ecmp(iter))
3900 			return iter;
3901 		iter = rcu_dereference_protected(iter->fib6_next,
3902 				lockdep_is_held(&rt->fib6_table->tb6_lock));
3903 	}
3904 
3905 	return NULL;
3906 }
3907 
3908 static bool rt6_is_dead(const struct fib6_info *rt)
3909 {
3910 	if (rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD ||
3911 	    (rt->fib6_nh.fib_nh_flags & RTNH_F_LINKDOWN &&
3912 	     ip6_ignore_linkdown(rt->fib6_nh.fib_nh_dev)))
3913 		return true;
3914 
3915 	return false;
3916 }
3917 
3918 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3919 {
3920 	struct fib6_info *iter;
3921 	int total = 0;
3922 
3923 	if (!rt6_is_dead(rt))
3924 		total += rt->fib6_nh.fib_nh_weight;
3925 
3926 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3927 		if (!rt6_is_dead(iter))
3928 			total += iter->fib6_nh.fib_nh_weight;
3929 	}
3930 
3931 	return total;
3932 }
3933 
3934 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
3935 {
3936 	int upper_bound = -1;
3937 
3938 	if (!rt6_is_dead(rt)) {
3939 		*weight += rt->fib6_nh.fib_nh_weight;
3940 		upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3941 						    total) - 1;
3942 	}
3943 	atomic_set(&rt->fib6_nh.fib_nh_upper_bound, upper_bound);
3944 }
3945 
3946 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3947 {
3948 	struct fib6_info *iter;
3949 	int weight = 0;
3950 
3951 	rt6_upper_bound_set(rt, &weight, total);
3952 
3953 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3954 		rt6_upper_bound_set(iter, &weight, total);
3955 }
3956 
3957 void rt6_multipath_rebalance(struct fib6_info *rt)
3958 {
3959 	struct fib6_info *first;
3960 	int total;
3961 
3962 	/* In case the entire multipath route was marked for flushing,
3963 	 * then there is no need to rebalance upon the removal of every
3964 	 * sibling route.
3965 	 */
3966 	if (!rt->fib6_nsiblings || rt->should_flush)
3967 		return;
3968 
3969 	/* During lookup routes are evaluated in order, so we need to
3970 	 * make sure upper bounds are assigned from the first sibling
3971 	 * onwards.
3972 	 */
3973 	first = rt6_multipath_first_sibling(rt);
3974 	if (WARN_ON_ONCE(!first))
3975 		return;
3976 
3977 	total = rt6_multipath_total_weight(first);
3978 	rt6_multipath_upper_bound_set(first, total);
3979 }
3980 
3981 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
3982 {
3983 	const struct arg_netdev_event *arg = p_arg;
3984 	struct net *net = dev_net(arg->dev);
3985 
3986 	if (rt != net->ipv6.fib6_null_entry &&
3987 	    rt->fib6_nh.fib_nh_dev == arg->dev) {
3988 		rt->fib6_nh.fib_nh_flags &= ~arg->nh_flags;
3989 		fib6_update_sernum_upto_root(net, rt);
3990 		rt6_multipath_rebalance(rt);
3991 	}
3992 
3993 	return 0;
3994 }
3995 
3996 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
3997 {
3998 	struct arg_netdev_event arg = {
3999 		.dev = dev,
4000 		{
4001 			.nh_flags = nh_flags,
4002 		},
4003 	};
4004 
4005 	if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
4006 		arg.nh_flags |= RTNH_F_LINKDOWN;
4007 
4008 	fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
4009 }
4010 
4011 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
4012 				   const struct net_device *dev)
4013 {
4014 	struct fib6_info *iter;
4015 
4016 	if (rt->fib6_nh.fib_nh_dev == dev)
4017 		return true;
4018 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4019 		if (iter->fib6_nh.fib_nh_dev == dev)
4020 			return true;
4021 
4022 	return false;
4023 }
4024 
4025 static void rt6_multipath_flush(struct fib6_info *rt)
4026 {
4027 	struct fib6_info *iter;
4028 
4029 	rt->should_flush = 1;
4030 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4031 		iter->should_flush = 1;
4032 }
4033 
4034 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
4035 					     const struct net_device *down_dev)
4036 {
4037 	struct fib6_info *iter;
4038 	unsigned int dead = 0;
4039 
4040 	if (rt->fib6_nh.fib_nh_dev == down_dev ||
4041 	    rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
4042 		dead++;
4043 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4044 		if (iter->fib6_nh.fib_nh_dev == down_dev ||
4045 		    iter->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
4046 			dead++;
4047 
4048 	return dead;
4049 }
4050 
4051 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
4052 				       const struct net_device *dev,
4053 				       unsigned int nh_flags)
4054 {
4055 	struct fib6_info *iter;
4056 
4057 	if (rt->fib6_nh.fib_nh_dev == dev)
4058 		rt->fib6_nh.fib_nh_flags |= nh_flags;
4059 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4060 		if (iter->fib6_nh.fib_nh_dev == dev)
4061 			iter->fib6_nh.fib_nh_flags |= nh_flags;
4062 }
4063 
4064 /* called with write lock held for table with rt */
4065 static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
4066 {
4067 	const struct arg_netdev_event *arg = p_arg;
4068 	const struct net_device *dev = arg->dev;
4069 	struct net *net = dev_net(dev);
4070 
4071 	if (rt == net->ipv6.fib6_null_entry)
4072 		return 0;
4073 
4074 	switch (arg->event) {
4075 	case NETDEV_UNREGISTER:
4076 		return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0;
4077 	case NETDEV_DOWN:
4078 		if (rt->should_flush)
4079 			return -1;
4080 		if (!rt->fib6_nsiblings)
4081 			return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0;
4082 		if (rt6_multipath_uses_dev(rt, dev)) {
4083 			unsigned int count;
4084 
4085 			count = rt6_multipath_dead_count(rt, dev);
4086 			if (rt->fib6_nsiblings + 1 == count) {
4087 				rt6_multipath_flush(rt);
4088 				return -1;
4089 			}
4090 			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
4091 						   RTNH_F_LINKDOWN);
4092 			fib6_update_sernum(net, rt);
4093 			rt6_multipath_rebalance(rt);
4094 		}
4095 		return -2;
4096 	case NETDEV_CHANGE:
4097 		if (rt->fib6_nh.fib_nh_dev != dev ||
4098 		    rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
4099 			break;
4100 		rt->fib6_nh.fib_nh_flags |= RTNH_F_LINKDOWN;
4101 		rt6_multipath_rebalance(rt);
4102 		break;
4103 	}
4104 
4105 	return 0;
4106 }
4107 
4108 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
4109 {
4110 	struct arg_netdev_event arg = {
4111 		.dev = dev,
4112 		{
4113 			.event = event,
4114 		},
4115 	};
4116 	struct net *net = dev_net(dev);
4117 
4118 	if (net->ipv6.sysctl.skip_notify_on_dev_down)
4119 		fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
4120 	else
4121 		fib6_clean_all(net, fib6_ifdown, &arg);
4122 }
4123 
4124 void rt6_disable_ip(struct net_device *dev, unsigned long event)
4125 {
4126 	rt6_sync_down_dev(dev, event);
4127 	rt6_uncached_list_flush_dev(dev_net(dev), dev);
4128 	neigh_ifdown(&nd_tbl, dev);
4129 }
4130 
4131 struct rt6_mtu_change_arg {
4132 	struct net_device *dev;
4133 	unsigned int mtu;
4134 };
4135 
4136 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
4137 {
4138 	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
4139 	struct inet6_dev *idev;
4140 
4141 	/* In IPv6 pmtu discovery is not optional,
4142 	   so that RTAX_MTU lock cannot disable it.
4143 	   We still use this lock to block changes
4144 	   caused by addrconf/ndisc.
4145 	*/
4146 
4147 	idev = __in6_dev_get(arg->dev);
4148 	if (!idev)
4149 		return 0;
4150 
4151 	/* For administrative MTU increase, there is no way to discover
4152 	   IPv6 PMTU increase, so PMTU increase should be updated here.
4153 	   Since RFC 1981 doesn't include administrative MTU increase
4154 	   update PMTU increase is a MUST. (i.e. jumbo frame)
4155 	 */
4156 	if (rt->fib6_nh.fib_nh_dev == arg->dev &&
4157 	    !fib6_metric_locked(rt, RTAX_MTU)) {
4158 		u32 mtu = rt->fib6_pmtu;
4159 
4160 		if (mtu >= arg->mtu ||
4161 		    (mtu < arg->mtu && mtu == idev->cnf.mtu6))
4162 			fib6_metric_set(rt, RTAX_MTU, arg->mtu);
4163 
4164 		spin_lock_bh(&rt6_exception_lock);
4165 		rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
4166 		spin_unlock_bh(&rt6_exception_lock);
4167 	}
4168 	return 0;
4169 }
4170 
4171 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4172 {
4173 	struct rt6_mtu_change_arg arg = {
4174 		.dev = dev,
4175 		.mtu = mtu,
4176 	};
4177 
4178 	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
4179 }
4180 
4181 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
4182 	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
4183 	[RTA_PREFSRC]		= { .len = sizeof(struct in6_addr) },
4184 	[RTA_OIF]               = { .type = NLA_U32 },
4185 	[RTA_IIF]		= { .type = NLA_U32 },
4186 	[RTA_PRIORITY]          = { .type = NLA_U32 },
4187 	[RTA_METRICS]           = { .type = NLA_NESTED },
4188 	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
4189 	[RTA_PREF]              = { .type = NLA_U8 },
4190 	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
4191 	[RTA_ENCAP]		= { .type = NLA_NESTED },
4192 	[RTA_EXPIRES]		= { .type = NLA_U32 },
4193 	[RTA_UID]		= { .type = NLA_U32 },
4194 	[RTA_MARK]		= { .type = NLA_U32 },
4195 	[RTA_TABLE]		= { .type = NLA_U32 },
4196 	[RTA_IP_PROTO]		= { .type = NLA_U8 },
4197 	[RTA_SPORT]		= { .type = NLA_U16 },
4198 	[RTA_DPORT]		= { .type = NLA_U16 },
4199 };
4200 
4201 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
4202 			      struct fib6_config *cfg,
4203 			      struct netlink_ext_ack *extack)
4204 {
4205 	struct rtmsg *rtm;
4206 	struct nlattr *tb[RTA_MAX+1];
4207 	unsigned int pref;
4208 	int err;
4209 
4210 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4211 			  extack);
4212 	if (err < 0)
4213 		goto errout;
4214 
4215 	err = -EINVAL;
4216 	rtm = nlmsg_data(nlh);
4217 
4218 	*cfg = (struct fib6_config){
4219 		.fc_table = rtm->rtm_table,
4220 		.fc_dst_len = rtm->rtm_dst_len,
4221 		.fc_src_len = rtm->rtm_src_len,
4222 		.fc_flags = RTF_UP,
4223 		.fc_protocol = rtm->rtm_protocol,
4224 		.fc_type = rtm->rtm_type,
4225 
4226 		.fc_nlinfo.portid = NETLINK_CB(skb).portid,
4227 		.fc_nlinfo.nlh = nlh,
4228 		.fc_nlinfo.nl_net = sock_net(skb->sk),
4229 	};
4230 
4231 	if (rtm->rtm_type == RTN_UNREACHABLE ||
4232 	    rtm->rtm_type == RTN_BLACKHOLE ||
4233 	    rtm->rtm_type == RTN_PROHIBIT ||
4234 	    rtm->rtm_type == RTN_THROW)
4235 		cfg->fc_flags |= RTF_REJECT;
4236 
4237 	if (rtm->rtm_type == RTN_LOCAL)
4238 		cfg->fc_flags |= RTF_LOCAL;
4239 
4240 	if (rtm->rtm_flags & RTM_F_CLONED)
4241 		cfg->fc_flags |= RTF_CACHE;
4242 
4243 	cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4244 
4245 	if (tb[RTA_GATEWAY]) {
4246 		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4247 		cfg->fc_flags |= RTF_GATEWAY;
4248 	}
4249 	if (tb[RTA_VIA]) {
4250 		NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute");
4251 		goto errout;
4252 	}
4253 
4254 	if (tb[RTA_DST]) {
4255 		int plen = (rtm->rtm_dst_len + 7) >> 3;
4256 
4257 		if (nla_len(tb[RTA_DST]) < plen)
4258 			goto errout;
4259 
4260 		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
4261 	}
4262 
4263 	if (tb[RTA_SRC]) {
4264 		int plen = (rtm->rtm_src_len + 7) >> 3;
4265 
4266 		if (nla_len(tb[RTA_SRC]) < plen)
4267 			goto errout;
4268 
4269 		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
4270 	}
4271 
4272 	if (tb[RTA_PREFSRC])
4273 		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4274 
4275 	if (tb[RTA_OIF])
4276 		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4277 
4278 	if (tb[RTA_PRIORITY])
4279 		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4280 
4281 	if (tb[RTA_METRICS]) {
4282 		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4283 		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
4284 	}
4285 
4286 	if (tb[RTA_TABLE])
4287 		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4288 
4289 	if (tb[RTA_MULTIPATH]) {
4290 		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4291 		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4292 
4293 		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4294 						     cfg->fc_mp_len, extack);
4295 		if (err < 0)
4296 			goto errout;
4297 	}
4298 
4299 	if (tb[RTA_PREF]) {
4300 		pref = nla_get_u8(tb[RTA_PREF]);
4301 		if (pref != ICMPV6_ROUTER_PREF_LOW &&
4302 		    pref != ICMPV6_ROUTER_PREF_HIGH)
4303 			pref = ICMPV6_ROUTER_PREF_MEDIUM;
4304 		cfg->fc_flags |= RTF_PREF(pref);
4305 	}
4306 
4307 	if (tb[RTA_ENCAP])
4308 		cfg->fc_encap = tb[RTA_ENCAP];
4309 
4310 	if (tb[RTA_ENCAP_TYPE]) {
4311 		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4312 
4313 		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4314 		if (err < 0)
4315 			goto errout;
4316 	}
4317 
4318 	if (tb[RTA_EXPIRES]) {
4319 		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4320 
4321 		if (addrconf_finite_timeout(timeout)) {
4322 			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4323 			cfg->fc_flags |= RTF_EXPIRES;
4324 		}
4325 	}
4326 
4327 	err = 0;
4328 errout:
4329 	return err;
4330 }
4331 
4332 struct rt6_nh {
4333 	struct fib6_info *fib6_info;
4334 	struct fib6_config r_cfg;
4335 	struct list_head next;
4336 };
4337 
4338 static int ip6_route_info_append(struct net *net,
4339 				 struct list_head *rt6_nh_list,
4340 				 struct fib6_info *rt,
4341 				 struct fib6_config *r_cfg)
4342 {
4343 	struct rt6_nh *nh;
4344 	int err = -EEXIST;
4345 
4346 	list_for_each_entry(nh, rt6_nh_list, next) {
4347 		/* check if fib6_info already exists */
4348 		if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4349 			return err;
4350 	}
4351 
4352 	nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4353 	if (!nh)
4354 		return -ENOMEM;
4355 	nh->fib6_info = rt;
4356 	memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4357 	list_add_tail(&nh->next, rt6_nh_list);
4358 
4359 	return 0;
4360 }
4361 
4362 static void ip6_route_mpath_notify(struct fib6_info *rt,
4363 				   struct fib6_info *rt_last,
4364 				   struct nl_info *info,
4365 				   __u16 nlflags)
4366 {
4367 	/* if this is an APPEND route, then rt points to the first route
4368 	 * inserted and rt_last points to last route inserted. Userspace
4369 	 * wants a consistent dump of the route which starts at the first
4370 	 * nexthop. Since sibling routes are always added at the end of
4371 	 * the list, find the first sibling of the last route appended
4372 	 */
4373 	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4374 		rt = list_first_entry(&rt_last->fib6_siblings,
4375 				      struct fib6_info,
4376 				      fib6_siblings);
4377 	}
4378 
4379 	if (rt)
4380 		inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4381 }
4382 
4383 static int ip6_route_multipath_add(struct fib6_config *cfg,
4384 				   struct netlink_ext_ack *extack)
4385 {
4386 	struct fib6_info *rt_notif = NULL, *rt_last = NULL;
4387 	struct nl_info *info = &cfg->fc_nlinfo;
4388 	struct fib6_config r_cfg;
4389 	struct rtnexthop *rtnh;
4390 	struct fib6_info *rt;
4391 	struct rt6_nh *err_nh;
4392 	struct rt6_nh *nh, *nh_safe;
4393 	__u16 nlflags;
4394 	int remaining;
4395 	int attrlen;
4396 	int err = 1;
4397 	int nhn = 0;
4398 	int replace = (cfg->fc_nlinfo.nlh &&
4399 		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4400 	LIST_HEAD(rt6_nh_list);
4401 
4402 	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4403 	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4404 		nlflags |= NLM_F_APPEND;
4405 
4406 	remaining = cfg->fc_mp_len;
4407 	rtnh = (struct rtnexthop *)cfg->fc_mp;
4408 
4409 	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
4410 	 * fib6_info structs per nexthop
4411 	 */
4412 	while (rtnh_ok(rtnh, remaining)) {
4413 		memcpy(&r_cfg, cfg, sizeof(*cfg));
4414 		if (rtnh->rtnh_ifindex)
4415 			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4416 
4417 		attrlen = rtnh_attrlen(rtnh);
4418 		if (attrlen > 0) {
4419 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4420 
4421 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4422 			if (nla) {
4423 				r_cfg.fc_gateway = nla_get_in6_addr(nla);
4424 				r_cfg.fc_flags |= RTF_GATEWAY;
4425 			}
4426 			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4427 			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4428 			if (nla)
4429 				r_cfg.fc_encap_type = nla_get_u16(nla);
4430 		}
4431 
4432 		r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4433 		rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
4434 		if (IS_ERR(rt)) {
4435 			err = PTR_ERR(rt);
4436 			rt = NULL;
4437 			goto cleanup;
4438 		}
4439 		if (!rt6_qualify_for_ecmp(rt)) {
4440 			err = -EINVAL;
4441 			NL_SET_ERR_MSG(extack,
4442 				       "Device only routes can not be added for IPv6 using the multipath API.");
4443 			fib6_info_release(rt);
4444 			goto cleanup;
4445 		}
4446 
4447 		rt->fib6_nh.fib_nh_weight = rtnh->rtnh_hops + 1;
4448 
4449 		err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
4450 					    rt, &r_cfg);
4451 		if (err) {
4452 			fib6_info_release(rt);
4453 			goto cleanup;
4454 		}
4455 
4456 		rtnh = rtnh_next(rtnh, &remaining);
4457 	}
4458 
4459 	/* for add and replace send one notification with all nexthops.
4460 	 * Skip the notification in fib6_add_rt2node and send one with
4461 	 * the full route when done
4462 	 */
4463 	info->skip_notify = 1;
4464 
4465 	err_nh = NULL;
4466 	list_for_each_entry(nh, &rt6_nh_list, next) {
4467 		err = __ip6_ins_rt(nh->fib6_info, info, extack);
4468 		fib6_info_release(nh->fib6_info);
4469 
4470 		if (!err) {
4471 			/* save reference to last route successfully inserted */
4472 			rt_last = nh->fib6_info;
4473 
4474 			/* save reference to first route for notification */
4475 			if (!rt_notif)
4476 				rt_notif = nh->fib6_info;
4477 		}
4478 
4479 		/* nh->fib6_info is used or freed at this point, reset to NULL*/
4480 		nh->fib6_info = NULL;
4481 		if (err) {
4482 			if (replace && nhn)
4483 				NL_SET_ERR_MSG_MOD(extack,
4484 						   "multipath route replace failed (check consistency of installed routes)");
4485 			err_nh = nh;
4486 			goto add_errout;
4487 		}
4488 
4489 		/* Because each route is added like a single route we remove
4490 		 * these flags after the first nexthop: if there is a collision,
4491 		 * we have already failed to add the first nexthop:
4492 		 * fib6_add_rt2node() has rejected it; when replacing, old
4493 		 * nexthops have been replaced by first new, the rest should
4494 		 * be added to it.
4495 		 */
4496 		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4497 						     NLM_F_REPLACE);
4498 		nhn++;
4499 	}
4500 
4501 	/* success ... tell user about new route */
4502 	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4503 	goto cleanup;
4504 
4505 add_errout:
4506 	/* send notification for routes that were added so that
4507 	 * the delete notifications sent by ip6_route_del are
4508 	 * coherent
4509 	 */
4510 	if (rt_notif)
4511 		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4512 
4513 	/* Delete routes that were already added */
4514 	list_for_each_entry(nh, &rt6_nh_list, next) {
4515 		if (err_nh == nh)
4516 			break;
4517 		ip6_route_del(&nh->r_cfg, extack);
4518 	}
4519 
4520 cleanup:
4521 	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4522 		if (nh->fib6_info)
4523 			fib6_info_release(nh->fib6_info);
4524 		list_del(&nh->next);
4525 		kfree(nh);
4526 	}
4527 
4528 	return err;
4529 }
4530 
4531 static int ip6_route_multipath_del(struct fib6_config *cfg,
4532 				   struct netlink_ext_ack *extack)
4533 {
4534 	struct fib6_config r_cfg;
4535 	struct rtnexthop *rtnh;
4536 	int remaining;
4537 	int attrlen;
4538 	int err = 1, last_err = 0;
4539 
4540 	remaining = cfg->fc_mp_len;
4541 	rtnh = (struct rtnexthop *)cfg->fc_mp;
4542 
4543 	/* Parse a Multipath Entry */
4544 	while (rtnh_ok(rtnh, remaining)) {
4545 		memcpy(&r_cfg, cfg, sizeof(*cfg));
4546 		if (rtnh->rtnh_ifindex)
4547 			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4548 
4549 		attrlen = rtnh_attrlen(rtnh);
4550 		if (attrlen > 0) {
4551 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4552 
4553 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4554 			if (nla) {
4555 				nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4556 				r_cfg.fc_flags |= RTF_GATEWAY;
4557 			}
4558 		}
4559 		err = ip6_route_del(&r_cfg, extack);
4560 		if (err)
4561 			last_err = err;
4562 
4563 		rtnh = rtnh_next(rtnh, &remaining);
4564 	}
4565 
4566 	return last_err;
4567 }
4568 
4569 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4570 			      struct netlink_ext_ack *extack)
4571 {
4572 	struct fib6_config cfg;
4573 	int err;
4574 
4575 	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4576 	if (err < 0)
4577 		return err;
4578 
4579 	if (cfg.fc_mp)
4580 		return ip6_route_multipath_del(&cfg, extack);
4581 	else {
4582 		cfg.fc_delete_all_nh = 1;
4583 		return ip6_route_del(&cfg, extack);
4584 	}
4585 }
4586 
4587 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4588 			      struct netlink_ext_ack *extack)
4589 {
4590 	struct fib6_config cfg;
4591 	int err;
4592 
4593 	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4594 	if (err < 0)
4595 		return err;
4596 
4597 	if (cfg.fc_metric == 0)
4598 		cfg.fc_metric = IP6_RT_PRIO_USER;
4599 
4600 	if (cfg.fc_mp)
4601 		return ip6_route_multipath_add(&cfg, extack);
4602 	else
4603 		return ip6_route_add(&cfg, GFP_KERNEL, extack);
4604 }
4605 
4606 static size_t rt6_nlmsg_size(struct fib6_info *rt)
4607 {
4608 	int nexthop_len = 0;
4609 
4610 	if (rt->fib6_nsiblings) {
4611 		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
4612 			    + NLA_ALIGN(sizeof(struct rtnexthop))
4613 			    + nla_total_size(16) /* RTA_GATEWAY */
4614 			    + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws);
4615 
4616 		nexthop_len *= rt->fib6_nsiblings;
4617 	}
4618 
4619 	return NLMSG_ALIGN(sizeof(struct rtmsg))
4620 	       + nla_total_size(16) /* RTA_SRC */
4621 	       + nla_total_size(16) /* RTA_DST */
4622 	       + nla_total_size(16) /* RTA_GATEWAY */
4623 	       + nla_total_size(16) /* RTA_PREFSRC */
4624 	       + nla_total_size(4) /* RTA_TABLE */
4625 	       + nla_total_size(4) /* RTA_IIF */
4626 	       + nla_total_size(4) /* RTA_OIF */
4627 	       + nla_total_size(4) /* RTA_PRIORITY */
4628 	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4629 	       + nla_total_size(sizeof(struct rta_cacheinfo))
4630 	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4631 	       + nla_total_size(1) /* RTA_PREF */
4632 	       + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws)
4633 	       + nexthop_len;
4634 }
4635 
4636 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
4637 			 struct fib6_info *rt, struct dst_entry *dst,
4638 			 struct in6_addr *dest, struct in6_addr *src,
4639 			 int iif, int type, u32 portid, u32 seq,
4640 			 unsigned int flags)
4641 {
4642 	struct rt6_info *rt6 = (struct rt6_info *)dst;
4643 	struct rt6key *rt6_dst, *rt6_src;
4644 	u32 *pmetrics, table, rt6_flags;
4645 	struct nlmsghdr *nlh;
4646 	struct rtmsg *rtm;
4647 	long expires = 0;
4648 
4649 	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4650 	if (!nlh)
4651 		return -EMSGSIZE;
4652 
4653 	if (rt6) {
4654 		rt6_dst = &rt6->rt6i_dst;
4655 		rt6_src = &rt6->rt6i_src;
4656 		rt6_flags = rt6->rt6i_flags;
4657 	} else {
4658 		rt6_dst = &rt->fib6_dst;
4659 		rt6_src = &rt->fib6_src;
4660 		rt6_flags = rt->fib6_flags;
4661 	}
4662 
4663 	rtm = nlmsg_data(nlh);
4664 	rtm->rtm_family = AF_INET6;
4665 	rtm->rtm_dst_len = rt6_dst->plen;
4666 	rtm->rtm_src_len = rt6_src->plen;
4667 	rtm->rtm_tos = 0;
4668 	if (rt->fib6_table)
4669 		table = rt->fib6_table->tb6_id;
4670 	else
4671 		table = RT6_TABLE_UNSPEC;
4672 	rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT;
4673 	if (nla_put_u32(skb, RTA_TABLE, table))
4674 		goto nla_put_failure;
4675 
4676 	rtm->rtm_type = rt->fib6_type;
4677 	rtm->rtm_flags = 0;
4678 	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4679 	rtm->rtm_protocol = rt->fib6_protocol;
4680 
4681 	if (rt6_flags & RTF_CACHE)
4682 		rtm->rtm_flags |= RTM_F_CLONED;
4683 
4684 	if (dest) {
4685 		if (nla_put_in6_addr(skb, RTA_DST, dest))
4686 			goto nla_put_failure;
4687 		rtm->rtm_dst_len = 128;
4688 	} else if (rtm->rtm_dst_len)
4689 		if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
4690 			goto nla_put_failure;
4691 #ifdef CONFIG_IPV6_SUBTREES
4692 	if (src) {
4693 		if (nla_put_in6_addr(skb, RTA_SRC, src))
4694 			goto nla_put_failure;
4695 		rtm->rtm_src_len = 128;
4696 	} else if (rtm->rtm_src_len &&
4697 		   nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
4698 		goto nla_put_failure;
4699 #endif
4700 	if (iif) {
4701 #ifdef CONFIG_IPV6_MROUTE
4702 		if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
4703 			int err = ip6mr_get_route(net, skb, rtm, portid);
4704 
4705 			if (err == 0)
4706 				return 0;
4707 			if (err < 0)
4708 				goto nla_put_failure;
4709 		} else
4710 #endif
4711 			if (nla_put_u32(skb, RTA_IIF, iif))
4712 				goto nla_put_failure;
4713 	} else if (dest) {
4714 		struct in6_addr saddr_buf;
4715 		if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
4716 		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4717 			goto nla_put_failure;
4718 	}
4719 
4720 	if (rt->fib6_prefsrc.plen) {
4721 		struct in6_addr saddr_buf;
4722 		saddr_buf = rt->fib6_prefsrc.addr;
4723 		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4724 			goto nla_put_failure;
4725 	}
4726 
4727 	pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
4728 	if (rtnetlink_put_metrics(skb, pmetrics) < 0)
4729 		goto nla_put_failure;
4730 
4731 	if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
4732 		goto nla_put_failure;
4733 
4734 	/* For multipath routes, walk the siblings list and add
4735 	 * each as a nexthop within RTA_MULTIPATH.
4736 	 */
4737 	if (rt6) {
4738 		if (rt6_flags & RTF_GATEWAY &&
4739 		    nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
4740 			goto nla_put_failure;
4741 
4742 		if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
4743 			goto nla_put_failure;
4744 	} else if (rt->fib6_nsiblings) {
4745 		struct fib6_info *sibling, *next_sibling;
4746 		struct nlattr *mp;
4747 
4748 		mp = nla_nest_start(skb, RTA_MULTIPATH);
4749 		if (!mp)
4750 			goto nla_put_failure;
4751 
4752 		if (fib_add_nexthop(skb, &rt->fib6_nh.nh_common,
4753 				    rt->fib6_nh.fib_nh_weight) < 0)
4754 			goto nla_put_failure;
4755 
4756 		list_for_each_entry_safe(sibling, next_sibling,
4757 					 &rt->fib6_siblings, fib6_siblings) {
4758 			if (fib_add_nexthop(skb, &sibling->fib6_nh.nh_common,
4759 					    sibling->fib6_nh.fib_nh_weight) < 0)
4760 				goto nla_put_failure;
4761 		}
4762 
4763 		nla_nest_end(skb, mp);
4764 	} else {
4765 		if (fib_nexthop_info(skb, &rt->fib6_nh.nh_common,
4766 				     &rtm->rtm_flags, false) < 0)
4767 			goto nla_put_failure;
4768 	}
4769 
4770 	if (rt6_flags & RTF_EXPIRES) {
4771 		expires = dst ? dst->expires : rt->expires;
4772 		expires -= jiffies;
4773 	}
4774 
4775 	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
4776 		goto nla_put_failure;
4777 
4778 	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
4779 		goto nla_put_failure;
4780 
4781 
4782 	nlmsg_end(skb, nlh);
4783 	return 0;
4784 
4785 nla_put_failure:
4786 	nlmsg_cancel(skb, nlh);
4787 	return -EMSGSIZE;
4788 }
4789 
4790 static bool fib6_info_uses_dev(const struct fib6_info *f6i,
4791 			       const struct net_device *dev)
4792 {
4793 	if (f6i->fib6_nh.fib_nh_dev == dev)
4794 		return true;
4795 
4796 	if (f6i->fib6_nsiblings) {
4797 		struct fib6_info *sibling, *next_sibling;
4798 
4799 		list_for_each_entry_safe(sibling, next_sibling,
4800 					 &f6i->fib6_siblings, fib6_siblings) {
4801 			if (sibling->fib6_nh.fib_nh_dev == dev)
4802 				return true;
4803 		}
4804 	}
4805 
4806 	return false;
4807 }
4808 
4809 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4810 {
4811 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4812 	struct fib_dump_filter *filter = &arg->filter;
4813 	unsigned int flags = NLM_F_MULTI;
4814 	struct net *net = arg->net;
4815 
4816 	if (rt == net->ipv6.fib6_null_entry)
4817 		return 0;
4818 
4819 	if ((filter->flags & RTM_F_PREFIX) &&
4820 	    !(rt->fib6_flags & RTF_PREFIX_RT)) {
4821 		/* success since this is not a prefix route */
4822 		return 1;
4823 	}
4824 	if (filter->filter_set) {
4825 		if ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
4826 		    (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
4827 		    (filter->protocol && rt->fib6_protocol != filter->protocol)) {
4828 			return 1;
4829 		}
4830 		flags |= NLM_F_DUMP_FILTERED;
4831 	}
4832 
4833 	return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4834 			     RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4835 			     arg->cb->nlh->nlmsg_seq, flags);
4836 }
4837 
4838 static int inet6_rtm_valid_getroute_req(struct sk_buff *skb,
4839 					const struct nlmsghdr *nlh,
4840 					struct nlattr **tb,
4841 					struct netlink_ext_ack *extack)
4842 {
4843 	struct rtmsg *rtm;
4844 	int i, err;
4845 
4846 	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
4847 		NL_SET_ERR_MSG_MOD(extack,
4848 				   "Invalid header for get route request");
4849 		return -EINVAL;
4850 	}
4851 
4852 	if (!netlink_strict_get_check(skb))
4853 		return nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX,
4854 				   rtm_ipv6_policy, extack);
4855 
4856 	rtm = nlmsg_data(nlh);
4857 	if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) ||
4858 	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) ||
4859 	    rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope ||
4860 	    rtm->rtm_type) {
4861 		NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request");
4862 		return -EINVAL;
4863 	}
4864 	if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) {
4865 		NL_SET_ERR_MSG_MOD(extack,
4866 				   "Invalid flags for get route request");
4867 		return -EINVAL;
4868 	}
4869 
4870 	err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
4871 				 rtm_ipv6_policy, extack);
4872 	if (err)
4873 		return err;
4874 
4875 	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
4876 	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
4877 		NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6");
4878 		return -EINVAL;
4879 	}
4880 
4881 	for (i = 0; i <= RTA_MAX; i++) {
4882 		if (!tb[i])
4883 			continue;
4884 
4885 		switch (i) {
4886 		case RTA_SRC:
4887 		case RTA_DST:
4888 		case RTA_IIF:
4889 		case RTA_OIF:
4890 		case RTA_MARK:
4891 		case RTA_UID:
4892 		case RTA_SPORT:
4893 		case RTA_DPORT:
4894 		case RTA_IP_PROTO:
4895 			break;
4896 		default:
4897 			NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request");
4898 			return -EINVAL;
4899 		}
4900 	}
4901 
4902 	return 0;
4903 }
4904 
4905 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4906 			      struct netlink_ext_ack *extack)
4907 {
4908 	struct net *net = sock_net(in_skb->sk);
4909 	struct nlattr *tb[RTA_MAX+1];
4910 	int err, iif = 0, oif = 0;
4911 	struct fib6_info *from;
4912 	struct dst_entry *dst;
4913 	struct rt6_info *rt;
4914 	struct sk_buff *skb;
4915 	struct rtmsg *rtm;
4916 	struct flowi6 fl6 = {};
4917 	bool fibmatch;
4918 
4919 	err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
4920 	if (err < 0)
4921 		goto errout;
4922 
4923 	err = -EINVAL;
4924 	rtm = nlmsg_data(nlh);
4925 	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4926 	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4927 
4928 	if (tb[RTA_SRC]) {
4929 		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4930 			goto errout;
4931 
4932 		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4933 	}
4934 
4935 	if (tb[RTA_DST]) {
4936 		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4937 			goto errout;
4938 
4939 		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4940 	}
4941 
4942 	if (tb[RTA_IIF])
4943 		iif = nla_get_u32(tb[RTA_IIF]);
4944 
4945 	if (tb[RTA_OIF])
4946 		oif = nla_get_u32(tb[RTA_OIF]);
4947 
4948 	if (tb[RTA_MARK])
4949 		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4950 
4951 	if (tb[RTA_UID])
4952 		fl6.flowi6_uid = make_kuid(current_user_ns(),
4953 					   nla_get_u32(tb[RTA_UID]));
4954 	else
4955 		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4956 
4957 	if (tb[RTA_SPORT])
4958 		fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);
4959 
4960 	if (tb[RTA_DPORT])
4961 		fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);
4962 
4963 	if (tb[RTA_IP_PROTO]) {
4964 		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
4965 						  &fl6.flowi6_proto, AF_INET6,
4966 						  extack);
4967 		if (err)
4968 			goto errout;
4969 	}
4970 
4971 	if (iif) {
4972 		struct net_device *dev;
4973 		int flags = 0;
4974 
4975 		rcu_read_lock();
4976 
4977 		dev = dev_get_by_index_rcu(net, iif);
4978 		if (!dev) {
4979 			rcu_read_unlock();
4980 			err = -ENODEV;
4981 			goto errout;
4982 		}
4983 
4984 		fl6.flowi6_iif = iif;
4985 
4986 		if (!ipv6_addr_any(&fl6.saddr))
4987 			flags |= RT6_LOOKUP_F_HAS_SADDR;
4988 
4989 		dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
4990 
4991 		rcu_read_unlock();
4992 	} else {
4993 		fl6.flowi6_oif = oif;
4994 
4995 		dst = ip6_route_output(net, NULL, &fl6);
4996 	}
4997 
4998 
4999 	rt = container_of(dst, struct rt6_info, dst);
5000 	if (rt->dst.error) {
5001 		err = rt->dst.error;
5002 		ip6_rt_put(rt);
5003 		goto errout;
5004 	}
5005 
5006 	if (rt == net->ipv6.ip6_null_entry) {
5007 		err = rt->dst.error;
5008 		ip6_rt_put(rt);
5009 		goto errout;
5010 	}
5011 
5012 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
5013 	if (!skb) {
5014 		ip6_rt_put(rt);
5015 		err = -ENOBUFS;
5016 		goto errout;
5017 	}
5018 
5019 	skb_dst_set(skb, &rt->dst);
5020 
5021 	rcu_read_lock();
5022 	from = rcu_dereference(rt->from);
5023 
5024 	if (fibmatch)
5025 		err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
5026 				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
5027 				    nlh->nlmsg_seq, 0);
5028 	else
5029 		err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
5030 				    &fl6.saddr, iif, RTM_NEWROUTE,
5031 				    NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
5032 				    0);
5033 	rcu_read_unlock();
5034 
5035 	if (err < 0) {
5036 		kfree_skb(skb);
5037 		goto errout;
5038 	}
5039 
5040 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
5041 errout:
5042 	return err;
5043 }
5044 
5045 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
5046 		     unsigned int nlm_flags)
5047 {
5048 	struct sk_buff *skb;
5049 	struct net *net = info->nl_net;
5050 	u32 seq;
5051 	int err;
5052 
5053 	err = -ENOBUFS;
5054 	seq = info->nlh ? info->nlh->nlmsg_seq : 0;
5055 
5056 	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
5057 	if (!skb)
5058 		goto errout;
5059 
5060 	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
5061 			    event, info->portid, seq, nlm_flags);
5062 	if (err < 0) {
5063 		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
5064 		WARN_ON(err == -EMSGSIZE);
5065 		kfree_skb(skb);
5066 		goto errout;
5067 	}
5068 	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
5069 		    info->nlh, gfp_any());
5070 	return;
5071 errout:
5072 	if (err < 0)
5073 		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
5074 }
5075 
5076 static int ip6_route_dev_notify(struct notifier_block *this,
5077 				unsigned long event, void *ptr)
5078 {
5079 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
5080 	struct net *net = dev_net(dev);
5081 
5082 	if (!(dev->flags & IFF_LOOPBACK))
5083 		return NOTIFY_OK;
5084 
5085 	if (event == NETDEV_REGISTER) {
5086 		net->ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = dev;
5087 		net->ipv6.ip6_null_entry->dst.dev = dev;
5088 		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
5089 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5090 		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
5091 		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
5092 		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
5093 		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
5094 #endif
5095 	 } else if (event == NETDEV_UNREGISTER &&
5096 		    dev->reg_state != NETREG_UNREGISTERED) {
5097 		/* NETDEV_UNREGISTER could be fired for multiple times by
5098 		 * netdev_wait_allrefs(). Make sure we only call this once.
5099 		 */
5100 		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
5101 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5102 		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
5103 		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
5104 #endif
5105 	}
5106 
5107 	return NOTIFY_OK;
5108 }
5109 
5110 /*
5111  *	/proc
5112  */
5113 
5114 #ifdef CONFIG_PROC_FS
5115 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
5116 {
5117 	struct net *net = (struct net *)seq->private;
5118 	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
5119 		   net->ipv6.rt6_stats->fib_nodes,
5120 		   net->ipv6.rt6_stats->fib_route_nodes,
5121 		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
5122 		   net->ipv6.rt6_stats->fib_rt_entries,
5123 		   net->ipv6.rt6_stats->fib_rt_cache,
5124 		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
5125 		   net->ipv6.rt6_stats->fib_discarded_routes);
5126 
5127 	return 0;
5128 }
5129 #endif	/* CONFIG_PROC_FS */
5130 
5131 #ifdef CONFIG_SYSCTL
5132 
5133 static
5134 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
5135 			      void __user *buffer, size_t *lenp, loff_t *ppos)
5136 {
5137 	struct net *net;
5138 	int delay;
5139 	int ret;
5140 	if (!write)
5141 		return -EINVAL;
5142 
5143 	net = (struct net *)ctl->extra1;
5144 	delay = net->ipv6.sysctl.flush_delay;
5145 	ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
5146 	if (ret)
5147 		return ret;
5148 
5149 	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
5150 	return 0;
5151 }
5152 
5153 static int zero;
5154 static int one = 1;
5155 
5156 static struct ctl_table ipv6_route_table_template[] = {
5157 	{
5158 		.procname	=	"flush",
5159 		.data		=	&init_net.ipv6.sysctl.flush_delay,
5160 		.maxlen		=	sizeof(int),
5161 		.mode		=	0200,
5162 		.proc_handler	=	ipv6_sysctl_rtcache_flush
5163 	},
5164 	{
5165 		.procname	=	"gc_thresh",
5166 		.data		=	&ip6_dst_ops_template.gc_thresh,
5167 		.maxlen		=	sizeof(int),
5168 		.mode		=	0644,
5169 		.proc_handler	=	proc_dointvec,
5170 	},
5171 	{
5172 		.procname	=	"max_size",
5173 		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
5174 		.maxlen		=	sizeof(int),
5175 		.mode		=	0644,
5176 		.proc_handler	=	proc_dointvec,
5177 	},
5178 	{
5179 		.procname	=	"gc_min_interval",
5180 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5181 		.maxlen		=	sizeof(int),
5182 		.mode		=	0644,
5183 		.proc_handler	=	proc_dointvec_jiffies,
5184 	},
5185 	{
5186 		.procname	=	"gc_timeout",
5187 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
5188 		.maxlen		=	sizeof(int),
5189 		.mode		=	0644,
5190 		.proc_handler	=	proc_dointvec_jiffies,
5191 	},
5192 	{
5193 		.procname	=	"gc_interval",
5194 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
5195 		.maxlen		=	sizeof(int),
5196 		.mode		=	0644,
5197 		.proc_handler	=	proc_dointvec_jiffies,
5198 	},
5199 	{
5200 		.procname	=	"gc_elasticity",
5201 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
5202 		.maxlen		=	sizeof(int),
5203 		.mode		=	0644,
5204 		.proc_handler	=	proc_dointvec,
5205 	},
5206 	{
5207 		.procname	=	"mtu_expires",
5208 		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
5209 		.maxlen		=	sizeof(int),
5210 		.mode		=	0644,
5211 		.proc_handler	=	proc_dointvec_jiffies,
5212 	},
5213 	{
5214 		.procname	=	"min_adv_mss",
5215 		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
5216 		.maxlen		=	sizeof(int),
5217 		.mode		=	0644,
5218 		.proc_handler	=	proc_dointvec,
5219 	},
5220 	{
5221 		.procname	=	"gc_min_interval_ms",
5222 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5223 		.maxlen		=	sizeof(int),
5224 		.mode		=	0644,
5225 		.proc_handler	=	proc_dointvec_ms_jiffies,
5226 	},
5227 	{
5228 		.procname	=	"skip_notify_on_dev_down",
5229 		.data		=	&init_net.ipv6.sysctl.skip_notify_on_dev_down,
5230 		.maxlen		=	sizeof(int),
5231 		.mode		=	0644,
5232 		.proc_handler	=	proc_dointvec,
5233 		.extra1		=	&zero,
5234 		.extra2		=	&one,
5235 	},
5236 	{ }
5237 };
5238 
5239 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
5240 {
5241 	struct ctl_table *table;
5242 
5243 	table = kmemdup(ipv6_route_table_template,
5244 			sizeof(ipv6_route_table_template),
5245 			GFP_KERNEL);
5246 
5247 	if (table) {
5248 		table[0].data = &net->ipv6.sysctl.flush_delay;
5249 		table[0].extra1 = net;
5250 		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
5251 		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
5252 		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5253 		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
5254 		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
5255 		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
5256 		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
5257 		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
5258 		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5259 		table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;
5260 
5261 		/* Don't export sysctls to unprivileged users */
5262 		if (net->user_ns != &init_user_ns)
5263 			table[0].procname = NULL;
5264 	}
5265 
5266 	return table;
5267 }
5268 #endif
5269 
5270 static int __net_init ip6_route_net_init(struct net *net)
5271 {
5272 	int ret = -ENOMEM;
5273 
5274 	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
5275 	       sizeof(net->ipv6.ip6_dst_ops));
5276 
5277 	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
5278 		goto out_ip6_dst_ops;
5279 
5280 	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
5281 					    sizeof(*net->ipv6.fib6_null_entry),
5282 					    GFP_KERNEL);
5283 	if (!net->ipv6.fib6_null_entry)
5284 		goto out_ip6_dst_entries;
5285 
5286 	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
5287 					   sizeof(*net->ipv6.ip6_null_entry),
5288 					   GFP_KERNEL);
5289 	if (!net->ipv6.ip6_null_entry)
5290 		goto out_fib6_null_entry;
5291 	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5292 	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
5293 			 ip6_template_metrics, true);
5294 
5295 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5296 	net->ipv6.fib6_has_custom_rules = false;
5297 	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
5298 					       sizeof(*net->ipv6.ip6_prohibit_entry),
5299 					       GFP_KERNEL);
5300 	if (!net->ipv6.ip6_prohibit_entry)
5301 		goto out_ip6_null_entry;
5302 	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5303 	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
5304 			 ip6_template_metrics, true);
5305 
5306 	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
5307 					       sizeof(*net->ipv6.ip6_blk_hole_entry),
5308 					       GFP_KERNEL);
5309 	if (!net->ipv6.ip6_blk_hole_entry)
5310 		goto out_ip6_prohibit_entry;
5311 	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5312 	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
5313 			 ip6_template_metrics, true);
5314 #endif
5315 
5316 	net->ipv6.sysctl.flush_delay = 0;
5317 	net->ipv6.sysctl.ip6_rt_max_size = 4096;
5318 	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
5319 	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
5320 	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
5321 	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
5322 	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
5323 	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
5324 	net->ipv6.sysctl.skip_notify_on_dev_down = 0;
5325 
5326 	net->ipv6.ip6_rt_gc_expire = 30*HZ;
5327 
5328 	ret = 0;
5329 out:
5330 	return ret;
5331 
5332 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5333 out_ip6_prohibit_entry:
5334 	kfree(net->ipv6.ip6_prohibit_entry);
5335 out_ip6_null_entry:
5336 	kfree(net->ipv6.ip6_null_entry);
5337 #endif
5338 out_fib6_null_entry:
5339 	kfree(net->ipv6.fib6_null_entry);
5340 out_ip6_dst_entries:
5341 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5342 out_ip6_dst_ops:
5343 	goto out;
5344 }
5345 
5346 static void __net_exit ip6_route_net_exit(struct net *net)
5347 {
5348 	kfree(net->ipv6.fib6_null_entry);
5349 	kfree(net->ipv6.ip6_null_entry);
5350 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5351 	kfree(net->ipv6.ip6_prohibit_entry);
5352 	kfree(net->ipv6.ip6_blk_hole_entry);
5353 #endif
5354 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5355 }
5356 
5357 static int __net_init ip6_route_net_init_late(struct net *net)
5358 {
5359 #ifdef CONFIG_PROC_FS
5360 	proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
5361 			sizeof(struct ipv6_route_iter));
5362 	proc_create_net_single("rt6_stats", 0444, net->proc_net,
5363 			rt6_stats_seq_show, NULL);
5364 #endif
5365 	return 0;
5366 }
5367 
5368 static void __net_exit ip6_route_net_exit_late(struct net *net)
5369 {
5370 #ifdef CONFIG_PROC_FS
5371 	remove_proc_entry("ipv6_route", net->proc_net);
5372 	remove_proc_entry("rt6_stats", net->proc_net);
5373 #endif
5374 }
5375 
5376 static struct pernet_operations ip6_route_net_ops = {
5377 	.init = ip6_route_net_init,
5378 	.exit = ip6_route_net_exit,
5379 };
5380 
5381 static int __net_init ipv6_inetpeer_init(struct net *net)
5382 {
5383 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5384 
5385 	if (!bp)
5386 		return -ENOMEM;
5387 	inet_peer_base_init(bp);
5388 	net->ipv6.peers = bp;
5389 	return 0;
5390 }
5391 
5392 static void __net_exit ipv6_inetpeer_exit(struct net *net)
5393 {
5394 	struct inet_peer_base *bp = net->ipv6.peers;
5395 
5396 	net->ipv6.peers = NULL;
5397 	inetpeer_invalidate_tree(bp);
5398 	kfree(bp);
5399 }
5400 
5401 static struct pernet_operations ipv6_inetpeer_ops = {
5402 	.init	=	ipv6_inetpeer_init,
5403 	.exit	=	ipv6_inetpeer_exit,
5404 };
5405 
5406 static struct pernet_operations ip6_route_net_late_ops = {
5407 	.init = ip6_route_net_init_late,
5408 	.exit = ip6_route_net_exit_late,
5409 };
5410 
5411 static struct notifier_block ip6_route_dev_notifier = {
5412 	.notifier_call = ip6_route_dev_notify,
5413 	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
5414 };
5415 
5416 void __init ip6_route_init_special_entries(void)
5417 {
5418 	/* Registering of the loopback is done before this portion of code,
5419 	 * the loopback reference in rt6_info will not be taken, do it
5420 	 * manually for init_net */
5421 	init_net.ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = init_net.loopback_dev;
5422 	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
5423 	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5424   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5425 	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
5426 	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5427 	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
5428 	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5429   #endif
5430 }
5431 
5432 int __init ip6_route_init(void)
5433 {
5434 	int ret;
5435 	int cpu;
5436 
5437 	ret = -ENOMEM;
5438 	ip6_dst_ops_template.kmem_cachep =
5439 		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
5440 				  SLAB_HWCACHE_ALIGN, NULL);
5441 	if (!ip6_dst_ops_template.kmem_cachep)
5442 		goto out;
5443 
5444 	ret = dst_entries_init(&ip6_dst_blackhole_ops);
5445 	if (ret)
5446 		goto out_kmem_cache;
5447 
5448 	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
5449 	if (ret)
5450 		goto out_dst_entries;
5451 
5452 	ret = register_pernet_subsys(&ip6_route_net_ops);
5453 	if (ret)
5454 		goto out_register_inetpeer;
5455 
5456 	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
5457 
5458 	ret = fib6_init();
5459 	if (ret)
5460 		goto out_register_subsys;
5461 
5462 	ret = xfrm6_init();
5463 	if (ret)
5464 		goto out_fib6_init;
5465 
5466 	ret = fib6_rules_init();
5467 	if (ret)
5468 		goto xfrm6_init;
5469 
5470 	ret = register_pernet_subsys(&ip6_route_net_late_ops);
5471 	if (ret)
5472 		goto fib6_rules_init;
5473 
5474 	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
5475 				   inet6_rtm_newroute, NULL, 0);
5476 	if (ret < 0)
5477 		goto out_register_late_subsys;
5478 
5479 	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
5480 				   inet6_rtm_delroute, NULL, 0);
5481 	if (ret < 0)
5482 		goto out_register_late_subsys;
5483 
5484 	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
5485 				   inet6_rtm_getroute, NULL,
5486 				   RTNL_FLAG_DOIT_UNLOCKED);
5487 	if (ret < 0)
5488 		goto out_register_late_subsys;
5489 
5490 	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
5491 	if (ret)
5492 		goto out_register_late_subsys;
5493 
5494 	for_each_possible_cpu(cpu) {
5495 		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
5496 
5497 		INIT_LIST_HEAD(&ul->head);
5498 		spin_lock_init(&ul->lock);
5499 	}
5500 
5501 out:
5502 	return ret;
5503 
5504 out_register_late_subsys:
5505 	rtnl_unregister_all(PF_INET6);
5506 	unregister_pernet_subsys(&ip6_route_net_late_ops);
5507 fib6_rules_init:
5508 	fib6_rules_cleanup();
5509 xfrm6_init:
5510 	xfrm6_fini();
5511 out_fib6_init:
5512 	fib6_gc_cleanup();
5513 out_register_subsys:
5514 	unregister_pernet_subsys(&ip6_route_net_ops);
5515 out_register_inetpeer:
5516 	unregister_pernet_subsys(&ipv6_inetpeer_ops);
5517 out_dst_entries:
5518 	dst_entries_destroy(&ip6_dst_blackhole_ops);
5519 out_kmem_cache:
5520 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5521 	goto out;
5522 }
5523 
5524 void ip6_route_cleanup(void)
5525 {
5526 	unregister_netdevice_notifier(&ip6_route_dev_notifier);
5527 	unregister_pernet_subsys(&ip6_route_net_late_ops);
5528 	fib6_rules_cleanup();
5529 	xfrm6_fini();
5530 	fib6_gc_cleanup();
5531 	unregister_pernet_subsys(&ipv6_inetpeer_ops);
5532 	unregister_pernet_subsys(&ip6_route_net_ops);
5533 	dst_entries_destroy(&ip6_dst_blackhole_ops);
5534 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5535 }
5536