xref: /openbmc/linux/net/ipv6/route.c (revision dd5b2498)
1 /*
2  *	Linux INET6 implementation
3  *	FIB front-end.
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13 
14 /*	Changes:
15  *
16  *	YOSHIFUJI Hideaki @USAGI
17  *		reworked default router selection.
18  *		- respect outgoing interface
19  *		- select from (probably) reachable routers (i.e.
20  *		routers in REACHABLE, STALE, DELAY or PROBE states).
21  *		- always select the same router if it is (probably)
22  *		reachable.  otherwise, round-robin the list.
23  *	Ville Nuorvala
24  *		Fixed routing subtrees.
25  */
26 
27 #define pr_fmt(fmt) "IPv6: " fmt
28 
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <net/ip.h>
67 #include <linux/uaccess.h>
68 
69 #ifdef CONFIG_SYSCTL
70 #include <linux/sysctl.h>
71 #endif
72 
73 static int ip6_rt_type_to_error(u8 fib6_type);
74 
75 #define CREATE_TRACE_POINTS
76 #include <trace/events/fib6.h>
77 EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
78 #undef CREATE_TRACE_POINTS
79 
80 enum rt6_nud_state {
81 	RT6_NUD_FAIL_HARD = -3,
82 	RT6_NUD_FAIL_PROBE = -2,
83 	RT6_NUD_FAIL_DO_RR = -1,
84 	RT6_NUD_SUCCEED = 1
85 };
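/* Editor's annotation (not part of the original source): these values
 * double as neighbour-reachability verdicts in rt6_check_neigh() and as
 * score inputs in find_match()/rt6_score_route():
 *   RT6_NUD_FAIL_HARD  - do not use this nexthop at all;
 *   RT6_NUD_FAIL_PROBE - keep as a low-priority candidate, probe later;
 *   RT6_NUD_FAIL_DO_RR - treat as lowest valid score and trigger
 *                        round-robin to a sibling route;
 *   RT6_NUD_SUCCEED    - the nexthop is (probably) reachable.
 */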
86 
87 static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
88 static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
89 static unsigned int	 ip6_mtu(const struct dst_entry *dst);
90 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
91 static void		ip6_dst_destroy(struct dst_entry *);
92 static void		ip6_dst_ifdown(struct dst_entry *,
93 				       struct net_device *dev, int how);
94 static int		 ip6_dst_gc(struct dst_ops *ops);
95 
96 static int		ip6_pkt_discard(struct sk_buff *skb);
97 static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
98 static int		ip6_pkt_prohibit(struct sk_buff *skb);
99 static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
100 static void		ip6_link_failure(struct sk_buff *skb);
101 static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
102 					   struct sk_buff *skb, u32 mtu);
103 static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
104 					struct sk_buff *skb);
105 static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
106 			   int strict);
107 static size_t rt6_nlmsg_size(struct fib6_info *rt);
108 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
109 			 struct fib6_info *rt, struct dst_entry *dst,
110 			 struct in6_addr *dest, struct in6_addr *src,
111 			 int iif, int type, u32 portid, u32 seq,
112 			 unsigned int flags);
113 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
114 					   struct in6_addr *daddr,
115 					   struct in6_addr *saddr);
116 
117 #ifdef CONFIG_IPV6_ROUTE_INFO
118 static struct fib6_info *rt6_add_route_info(struct net *net,
119 					   const struct in6_addr *prefix, int prefixlen,
120 					   const struct in6_addr *gwaddr,
121 					   struct net_device *dev,
122 					   unsigned int pref);
123 static struct fib6_info *rt6_get_route_info(struct net *net,
124 					   const struct in6_addr *prefix, int prefixlen,
125 					   const struct in6_addr *gwaddr,
126 					   struct net_device *dev);
127 #endif
128 
129 struct uncached_list {
130 	spinlock_t		lock;
131 	struct list_head	head;
132 };
133 
134 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
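/* Editor's annotation (not part of the original source): rt6_info dsts
 * that are not owned by the fib6 tree (see the FLOWI_FLAG_KNOWN_NH path
 * in ip6_pol_route()) are kept on these per-cpu lists so that
 * rt6_uncached_list_flush_dev() can migrate them to the loopback device
 * when the device they reference goes away.
 */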
135 
136 void rt6_uncached_list_add(struct rt6_info *rt)
137 {
138 	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
139 
140 	rt->rt6i_uncached_list = ul;
141 
142 	spin_lock_bh(&ul->lock);
143 	list_add_tail(&rt->rt6i_uncached, &ul->head);
144 	spin_unlock_bh(&ul->lock);
145 }
146 
147 void rt6_uncached_list_del(struct rt6_info *rt)
148 {
149 	if (!list_empty(&rt->rt6i_uncached)) {
150 		struct uncached_list *ul = rt->rt6i_uncached_list;
151 		struct net *net = dev_net(rt->dst.dev);
152 
153 		spin_lock_bh(&ul->lock);
154 		list_del(&rt->rt6i_uncached);
155 		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
156 		spin_unlock_bh(&ul->lock);
157 	}
158 }
159 
160 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
161 {
162 	struct net_device *loopback_dev = net->loopback_dev;
163 	int cpu;
164 
165 	if (dev == loopback_dev)
166 		return;
167 
168 	for_each_possible_cpu(cpu) {
169 		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
170 		struct rt6_info *rt;
171 
172 		spin_lock_bh(&ul->lock);
173 		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
174 			struct inet6_dev *rt_idev = rt->rt6i_idev;
175 			struct net_device *rt_dev = rt->dst.dev;
176 
177 			if (rt_idev->dev == dev) {
178 				rt->rt6i_idev = in6_dev_get(loopback_dev);
179 				in6_dev_put(rt_idev);
180 			}
181 
182 			if (rt_dev == dev) {
183 				rt->dst.dev = loopback_dev;
184 				dev_hold(rt->dst.dev);
185 				dev_put(rt_dev);
186 			}
187 		}
188 		spin_unlock_bh(&ul->lock);
189 	}
190 }
191 
192 static inline const void *choose_neigh_daddr(const struct in6_addr *p,
193 					     struct sk_buff *skb,
194 					     const void *daddr)
195 {
196 	if (!ipv6_addr_any(p))
197 		return (const void *) p;
198 	else if (skb)
199 		return &ipv6_hdr(skb)->daddr;
200 	return daddr;
201 }
202 
203 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
204 				   struct net_device *dev,
205 				   struct sk_buff *skb,
206 				   const void *daddr)
207 {
208 	struct neighbour *n;
209 
210 	daddr = choose_neigh_daddr(gw, skb, daddr);
211 	n = __ipv6_neigh_lookup(dev, daddr);
212 	if (n)
213 		return n;
214 
215 	n = neigh_create(&nd_tbl, daddr, dev);
216 	return IS_ERR(n) ? NULL : n;
217 }
218 
219 static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
220 					      struct sk_buff *skb,
221 					      const void *daddr)
222 {
223 	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);
224 
225 	return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
226 }
227 
228 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
229 {
230 	struct net_device *dev = dst->dev;
231 	struct rt6_info *rt = (struct rt6_info *)dst;
232 
233 	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
234 	if (!daddr)
235 		return;
236 	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
237 		return;
238 	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
239 		return;
240 	__ipv6_confirm_neigh(dev, daddr);
241 }
242 
243 static struct dst_ops ip6_dst_ops_template = {
244 	.family			=	AF_INET6,
245 	.gc			=	ip6_dst_gc,
246 	.gc_thresh		=	1024,
247 	.check			=	ip6_dst_check,
248 	.default_advmss		=	ip6_default_advmss,
249 	.mtu			=	ip6_mtu,
250 	.cow_metrics		=	dst_cow_metrics_generic,
251 	.destroy		=	ip6_dst_destroy,
252 	.ifdown			=	ip6_dst_ifdown,
253 	.negative_advice	=	ip6_negative_advice,
254 	.link_failure		=	ip6_link_failure,
255 	.update_pmtu		=	ip6_rt_update_pmtu,
256 	.redirect		=	rt6_do_redirect,
257 	.local_out		=	__ip6_local_out,
258 	.neigh_lookup		=	ip6_dst_neigh_lookup,
259 	.confirm_neigh		=	ip6_confirm_neigh,
260 };
261 
262 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
263 {
264 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
265 
266 	return mtu ? : dst->dev->mtu;
267 }
268 
269 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
270 					 struct sk_buff *skb, u32 mtu)
271 {
272 }
273 
274 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
275 				      struct sk_buff *skb)
276 {
277 }
278 
279 static struct dst_ops ip6_dst_blackhole_ops = {
280 	.family			=	AF_INET6,
281 	.destroy		=	ip6_dst_destroy,
282 	.check			=	ip6_dst_check,
283 	.mtu			=	ip6_blackhole_mtu,
284 	.default_advmss		=	ip6_default_advmss,
285 	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
286 	.redirect		=	ip6_rt_blackhole_redirect,
287 	.cow_metrics		=	dst_cow_metrics_generic,
288 	.neigh_lookup		=	ip6_dst_neigh_lookup,
289 };
290 
291 static const u32 ip6_template_metrics[RTAX_MAX] = {
292 	[RTAX_HOPLIMIT - 1] = 0,
293 };
294 
295 static const struct fib6_info fib6_null_entry_template = {
296 	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
297 	.fib6_protocol  = RTPROT_KERNEL,
298 	.fib6_metric	= ~(u32)0,
299 	.fib6_ref	= ATOMIC_INIT(1),
300 	.fib6_type	= RTN_UNREACHABLE,
301 	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
302 };
303 
304 static const struct rt6_info ip6_null_entry_template = {
305 	.dst = {
306 		.__refcnt	= ATOMIC_INIT(1),
307 		.__use		= 1,
308 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
309 		.error		= -ENETUNREACH,
310 		.input		= ip6_pkt_discard,
311 		.output		= ip6_pkt_discard_out,
312 	},
313 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
314 };
315 
316 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
317 
318 static const struct rt6_info ip6_prohibit_entry_template = {
319 	.dst = {
320 		.__refcnt	= ATOMIC_INIT(1),
321 		.__use		= 1,
322 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
323 		.error		= -EACCES,
324 		.input		= ip6_pkt_prohibit,
325 		.output		= ip6_pkt_prohibit_out,
326 	},
327 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
328 };
329 
330 static const struct rt6_info ip6_blk_hole_entry_template = {
331 	.dst = {
332 		.__refcnt	= ATOMIC_INIT(1),
333 		.__use		= 1,
334 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
335 		.error		= -EINVAL,
336 		.input		= dst_discard,
337 		.output		= dst_discard_out,
338 	},
339 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
340 };
341 
342 #endif
343 
344 static void rt6_info_init(struct rt6_info *rt)
345 {
346 	struct dst_entry *dst = &rt->dst;
347 
348 	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
349 	INIT_LIST_HEAD(&rt->rt6i_uncached);
350 }
351 
352 /* allocate dst with ip6_dst_ops */
353 struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
354 			       int flags)
355 {
356 	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
357 					1, DST_OBSOLETE_FORCE_CHK, flags);
358 
359 	if (rt) {
360 		rt6_info_init(rt);
361 		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
362 	}
363 
364 	return rt;
365 }
366 EXPORT_SYMBOL(ip6_dst_alloc);
367 
368 static void ip6_dst_destroy(struct dst_entry *dst)
369 {
370 	struct rt6_info *rt = (struct rt6_info *)dst;
371 	struct fib6_info *from;
372 	struct inet6_dev *idev;
373 
374 	ip_dst_metrics_put(dst);
375 	rt6_uncached_list_del(rt);
376 
377 	idev = rt->rt6i_idev;
378 	if (idev) {
379 		rt->rt6i_idev = NULL;
380 		in6_dev_put(idev);
381 	}
382 
383 	rcu_read_lock();
384 	from = rcu_dereference(rt->from);
385 	rcu_assign_pointer(rt->from, NULL);
386 	fib6_info_release(from);
387 	rcu_read_unlock();
388 }
389 
390 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
391 			   int how)
392 {
393 	struct rt6_info *rt = (struct rt6_info *)dst;
394 	struct inet6_dev *idev = rt->rt6i_idev;
395 	struct net_device *loopback_dev =
396 		dev_net(dev)->loopback_dev;
397 
398 	if (idev && idev->dev != loopback_dev) {
399 		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
400 		if (loopback_idev) {
401 			rt->rt6i_idev = loopback_idev;
402 			in6_dev_put(idev);
403 		}
404 	}
405 }
406 
407 static bool __rt6_check_expired(const struct rt6_info *rt)
408 {
409 	if (rt->rt6i_flags & RTF_EXPIRES)
410 		return time_after(jiffies, rt->dst.expires);
411 	else
412 		return false;
413 }
414 
415 static bool rt6_check_expired(const struct rt6_info *rt)
416 {
417 	struct fib6_info *from;
418 
419 	from = rcu_dereference(rt->from);
420 
421 	if (rt->rt6i_flags & RTF_EXPIRES) {
422 		if (time_after(jiffies, rt->dst.expires))
423 			return true;
424 	} else if (from) {
425 		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
426 			fib6_check_expired(from);
427 	}
428 	return false;
429 }
430 
431 struct fib6_info *fib6_multipath_select(const struct net *net,
432 					struct fib6_info *match,
433 					struct flowi6 *fl6, int oif,
434 					const struct sk_buff *skb,
435 					int strict)
436 {
437 	struct fib6_info *sibling, *next_sibling;
438 
439 	/* We might have already computed the hash for ICMPv6 errors. In that
440 	 * case it will always be non-zero. Otherwise now is the time to do it.
441 	 */
442 	if (!fl6->mp_hash)
443 		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
444 
445 	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.fib_nh_upper_bound))
446 		return match;
447 
448 	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
449 				 fib6_siblings) {
450 		const struct fib6_nh *nh = &sibling->fib6_nh;
451 		int nh_upper_bound;
452 
453 		nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound);
454 		if (fl6->mp_hash > nh_upper_bound)
455 			continue;
456 		if (rt6_score_route(nh, sibling->fib6_flags, oif, strict) < 0)
457 			break;
458 		match = sibling;
459 		break;
460 	}
461 
462 	return match;
463 }
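/* Editor's annotation, illustrative only: this is hash-threshold
 * multipath.  Each sibling owns a slice of the 31-bit hash space,
 * bounded above by its fib_nh_upper_bound.  For example, two
 * equal-weight nexthops might carry upper bounds 0x3fffffff and
 * 0x7fffffff; a flow hashing to 0x50000000 exceeds the first bound,
 * so the loop above settles on the second sibling (values hypothetical).
 */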
464 
465 /*
466  *	Route lookup. rcu_read_lock() should be held.
467  */
468 
469 static bool __rt6_device_match(struct net *net, const struct fib6_nh *nh,
470 			       const struct in6_addr *saddr, int oif, int flags)
471 {
472 	const struct net_device *dev;
473 
474 	if (nh->fib_nh_flags & RTNH_F_DEAD)
475 		return false;
476 
477 	dev = nh->fib_nh_dev;
478 	if (oif) {
479 		if (dev->ifindex == oif)
480 			return true;
481 	} else {
482 		if (ipv6_chk_addr(net, saddr, dev,
483 				  flags & RT6_LOOKUP_F_IFACE))
484 			return true;
485 	}
486 
487 	return false;
488 }
489 
490 static inline struct fib6_info *rt6_device_match(struct net *net,
491 						 struct fib6_info *rt,
492 						 const struct in6_addr *saddr,
493 						 int oif,
494 						 int flags)
495 {
496 	const struct fib6_nh *nh;
497 	struct fib6_info *sprt;
498 
499 	if (!oif && ipv6_addr_any(saddr) &&
500 	    !(rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD))
501 		return rt;
502 
503 	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
504 		nh = &sprt->fib6_nh;
505 		if (__rt6_device_match(net, nh, saddr, oif, flags))
506 			return sprt;
507 	}
508 
509 	if (oif && flags & RT6_LOOKUP_F_IFACE)
510 		return net->ipv6.fib6_null_entry;
511 
512 	return rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
513 }
514 
515 #ifdef CONFIG_IPV6_ROUTER_PREF
516 struct __rt6_probe_work {
517 	struct work_struct work;
518 	struct in6_addr target;
519 	struct net_device *dev;
520 };
521 
522 static void rt6_probe_deferred(struct work_struct *w)
523 {
524 	struct in6_addr mcaddr;
525 	struct __rt6_probe_work *work =
526 		container_of(w, struct __rt6_probe_work, work);
527 
528 	addrconf_addr_solict_mult(&work->target, &mcaddr);
529 	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
530 	dev_put(work->dev);
531 	kfree(work);
532 }
533 
534 static void rt6_probe(struct fib6_nh *fib6_nh)
535 {
536 	struct __rt6_probe_work *work = NULL;
537 	const struct in6_addr *nh_gw;
538 	struct neighbour *neigh;
539 	struct net_device *dev;
540 	struct inet6_dev *idev;
541 
542 	/*
543 	 * Okay, this does not seem to be appropriate
544 	 * for now; however, we need to check whether it
545 	 * really is so, aka Router Reachability Probing.
546 	 *
547 	 * Router Reachability Probe MUST be rate-limited
548 	 * to no more than one per minute.
549 	 */
550 	if (fib6_nh->fib_nh_gw_family)
551 		return;
552 
553 	nh_gw = &fib6_nh->fib_nh_gw6;
554 	dev = fib6_nh->fib_nh_dev;
555 	rcu_read_lock_bh();
556 	idev = __in6_dev_get(dev);
557 	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
558 	if (neigh) {
559 		if (neigh->nud_state & NUD_VALID)
560 			goto out;
561 
562 		write_lock(&neigh->lock);
563 		if (!(neigh->nud_state & NUD_VALID) &&
564 		    time_after(jiffies,
565 			       neigh->updated + idev->cnf.rtr_probe_interval)) {
566 			work = kmalloc(sizeof(*work), GFP_ATOMIC);
567 			if (work)
568 				__neigh_set_probe_once(neigh);
569 		}
570 		write_unlock(&neigh->lock);
571 	} else if (time_after(jiffies, fib6_nh->last_probe +
572 				       idev->cnf.rtr_probe_interval)) {
573 		work = kmalloc(sizeof(*work), GFP_ATOMIC);
574 	}
575 
576 	if (work) {
577 		fib6_nh->last_probe = jiffies;
578 		INIT_WORK(&work->work, rt6_probe_deferred);
579 		work->target = *nh_gw;
580 		dev_hold(dev);
581 		work->dev = dev;
582 		schedule_work(&work->work);
583 	}
584 
585 out:
586 	rcu_read_unlock_bh();
587 }
588 #else
589 static inline void rt6_probe(struct fib6_nh *fib6_nh)
590 {
591 }
592 #endif
593 
594 /*
595  * Default Router Selection (RFC 2461 6.3.6)
596  */
597 static enum rt6_nud_state rt6_check_neigh(const struct fib6_nh *fib6_nh)
598 {
599 	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
600 	struct neighbour *neigh;
601 
602 	rcu_read_lock_bh();
603 	neigh = __ipv6_neigh_lookup_noref(fib6_nh->fib_nh_dev,
604 					  &fib6_nh->fib_nh_gw6);
605 	if (neigh) {
606 		read_lock(&neigh->lock);
607 		if (neigh->nud_state & NUD_VALID)
608 			ret = RT6_NUD_SUCCEED;
609 #ifdef CONFIG_IPV6_ROUTER_PREF
610 		else if (!(neigh->nud_state & NUD_FAILED))
611 			ret = RT6_NUD_SUCCEED;
612 		else
613 			ret = RT6_NUD_FAIL_PROBE;
614 #endif
615 		read_unlock(&neigh->lock);
616 	} else {
617 		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
618 		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
619 	}
620 	rcu_read_unlock_bh();
621 
622 	return ret;
623 }
624 
625 static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
626 			   int strict)
627 {
628 	int m = 0;
629 
630 	if (!oif || nh->fib_nh_dev->ifindex == oif)
631 		m = 2;
632 
633 	if (!m && (strict & RT6_LOOKUP_F_IFACE))
634 		return RT6_NUD_FAIL_HARD;
635 #ifdef CONFIG_IPV6_ROUTER_PREF
636 	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(fib6_flags)) << 2;
637 #endif
638 	if ((strict & RT6_LOOKUP_F_REACHABLE) &&
639 	    !(fib6_flags & RTF_NONEXTHOP) && nh->fib_nh_gw_family) {
640 		int n = rt6_check_neigh(nh);
641 		if (n < 0)
642 			return n;
643 	}
644 	return m;
645 }
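/* Editor's annotation (not part of the original source): the score is
 * built from orthogonal bits: bit 1 (m = 2) records an interface match,
 * while with CONFIG_IPV6_ROUTER_PREF the decoded router preference is
 * OR-ed in at bits 2 and up, so preference outweighs the interface bit.
 * Under RT6_LOOKUP_F_REACHABLE, a negative rt6_check_neigh() verdict on
 * a gateway route is returned instead of the score.
 */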
646 
647 static bool find_match(struct fib6_nh *nh, u32 fib6_flags,
648 		       int oif, int strict, int *mpri, bool *do_rr)
649 {
650 	bool match_do_rr = false;
651 	bool rc = false;
652 	int m;
653 
654 	if (nh->fib_nh_flags & RTNH_F_DEAD)
655 		goto out;
656 
657 	if (ip6_ignore_linkdown(nh->fib_nh_dev) &&
658 	    nh->fib_nh_flags & RTNH_F_LINKDOWN &&
659 	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
660 		goto out;
661 
662 	m = rt6_score_route(nh, fib6_flags, oif, strict);
663 	if (m == RT6_NUD_FAIL_DO_RR) {
664 		match_do_rr = true;
665 		m = 0; /* lowest valid score */
666 	} else if (m == RT6_NUD_FAIL_HARD) {
667 		goto out;
668 	}
669 
670 	if (strict & RT6_LOOKUP_F_REACHABLE)
671 		rt6_probe(nh);
672 
673 	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
674 	if (m > *mpri) {
675 		*do_rr = match_do_rr;
676 		*mpri = m;
677 		rc = true;
678 	}
679 out:
680 	return rc;
681 }
682 
683 static void __find_rr_leaf(struct fib6_info *rt_start,
684 			   struct fib6_info *nomatch, u32 metric,
685 			   struct fib6_info **match, struct fib6_info **cont,
686 			   int oif, int strict, bool *do_rr, int *mpri)
687 {
688 	struct fib6_info *rt;
689 
690 	for (rt = rt_start;
691 	     rt && rt != nomatch;
692 	     rt = rcu_dereference(rt->fib6_next)) {
693 		struct fib6_nh *nh;
694 
695 		if (cont && rt->fib6_metric != metric) {
696 			*cont = rt;
697 			return;
698 		}
699 
700 		if (fib6_check_expired(rt))
701 			continue;
702 
703 		nh = &rt->fib6_nh;
704 		if (find_match(nh, rt->fib6_flags, oif, strict, mpri, do_rr))
705 			*match = rt;
706 	}
707 }
708 
709 static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
710 				      struct fib6_info *leaf,
711 				      struct fib6_info *rr_head,
712 				      u32 metric, int oif, int strict,
713 				      bool *do_rr)
714 {
715 	struct fib6_info *match = NULL, *cont = NULL;
716 	int mpri = -1;
717 
718 	__find_rr_leaf(rr_head, NULL, metric, &match, &cont,
719 		       oif, strict, do_rr, &mpri);
720 
721 	__find_rr_leaf(leaf, rr_head, metric, &match, &cont,
722 		       oif, strict, do_rr, &mpri);
723 
724 	if (match || !cont)
725 		return match;
726 
727 	__find_rr_leaf(cont, NULL, metric, &match, NULL,
728 		       oif, strict, do_rr, &mpri);
729 
730 	return match;
731 }
732 
733 static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
734 				   int oif, int strict)
735 {
736 	struct fib6_info *leaf = rcu_dereference(fn->leaf);
737 	struct fib6_info *match, *rt0;
738 	bool do_rr = false;
739 	int key_plen;
740 
741 	if (!leaf || leaf == net->ipv6.fib6_null_entry)
742 		return net->ipv6.fib6_null_entry;
743 
744 	rt0 = rcu_dereference(fn->rr_ptr);
745 	if (!rt0)
746 		rt0 = leaf;
747 
748 	/* Double check to make sure fn is not an intermediate node
749 	 * and fn->leaf does not point to its child's leaf
750 	 * (This might happen if all routes under fn are deleted from
751 	 * the tree and fib6_repair_tree() is called on the node.)
752 	 */
753 	key_plen = rt0->fib6_dst.plen;
754 #ifdef CONFIG_IPV6_SUBTREES
755 	if (rt0->fib6_src.plen)
756 		key_plen = rt0->fib6_src.plen;
757 #endif
758 	if (fn->fn_bit != key_plen)
759 		return net->ipv6.fib6_null_entry;
760 
761 	match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
762 			     &do_rr);
763 
764 	if (do_rr) {
765 		struct fib6_info *next = rcu_dereference(rt0->fib6_next);
766 
767 		/* no entries matched; do round-robin */
768 		if (!next || next->fib6_metric != rt0->fib6_metric)
769 			next = leaf;
770 
771 		if (next != rt0) {
772 			spin_lock_bh(&leaf->fib6_table->tb6_lock);
773 			/* make sure next is not being deleted from the tree */
774 			if (next->fib6_node)
775 				rcu_assign_pointer(fn->rr_ptr, next);
776 			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
777 		}
778 	}
779 
780 	return match ? match : net->ipv6.fib6_null_entry;
781 }
782 
783 static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
784 {
785 	return (rt->fib6_flags & RTF_NONEXTHOP) || rt->fib6_nh.fib_nh_gw_family;
786 }
787 
788 #ifdef CONFIG_IPV6_ROUTE_INFO
789 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
790 		  const struct in6_addr *gwaddr)
791 {
792 	struct net *net = dev_net(dev);
793 	struct route_info *rinfo = (struct route_info *) opt;
794 	struct in6_addr prefix_buf, *prefix;
795 	unsigned int pref;
796 	unsigned long lifetime;
797 	struct fib6_info *rt;
798 
799 	if (len < sizeof(struct route_info)) {
800 		return -EINVAL;
801 	}
802 
803 	/* Sanity check for prefix_len and length */
804 	if (rinfo->length > 3) {
805 		return -EINVAL;
806 	} else if (rinfo->prefix_len > 128) {
807 		return -EINVAL;
808 	} else if (rinfo->prefix_len > 64) {
809 		if (rinfo->length < 2) {
810 			return -EINVAL;
811 		}
812 	} else if (rinfo->prefix_len > 0) {
813 		if (rinfo->length < 1) {
814 			return -EINVAL;
815 		}
816 	}
817 
818 	pref = rinfo->route_pref;
819 	if (pref == ICMPV6_ROUTER_PREF_INVALID)
820 		return -EINVAL;
821 
822 	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
823 
824 	if (rinfo->length == 3)
825 		prefix = (struct in6_addr *)rinfo->prefix;
826 	else {
827 		/* ipv6_addr_prefix() is safe: prefix_len was validated above */
828 		ipv6_addr_prefix(&prefix_buf,
829 				 (struct in6_addr *)rinfo->prefix,
830 				 rinfo->prefix_len);
831 		prefix = &prefix_buf;
832 	}
833 
834 	if (rinfo->prefix_len == 0)
835 		rt = rt6_get_dflt_router(net, gwaddr, dev);
836 	else
837 		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
838 					gwaddr, dev);
839 
840 	if (rt && !lifetime) {
841 		ip6_del_rt(net, rt);
842 		rt = NULL;
843 	}
844 
845 	if (!rt && lifetime)
846 		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
847 					dev, pref);
848 	else if (rt)
849 		rt->fib6_flags = RTF_ROUTEINFO |
850 				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
851 
852 	if (rt) {
853 		if (!addrconf_finite_timeout(lifetime))
854 			fib6_clean_expires(rt);
855 		else
856 			fib6_set_expires(rt, jiffies + HZ * lifetime);
857 
858 		fib6_info_release(rt);
859 	}
860 	return 0;
861 }
862 #endif
863 
864 /*
865  *	Misc support functions
866  */
867 
868 /* called with rcu_read_lock() held */
869 static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
870 {
871 	struct net_device *dev = rt->fib6_nh.fib_nh_dev;
872 
873 	if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
874 		/* for copies of local routes, dst->dev needs to be the
875 		 * device itself if it is a master device, the master device
876 		 * if the device is enslaved, and the loopback device by default
877 		 */
878 		if (netif_is_l3_slave(dev) &&
879 		    !rt6_need_strict(&rt->fib6_dst.addr))
880 			dev = l3mdev_master_dev_rcu(dev);
881 		else if (!netif_is_l3_master(dev))
882 			dev = dev_net(dev)->loopback_dev;
883 		/* the last case is netif_is_l3_master(dev) being true, in
884 		 * which case we want the returned dev to be dev itself
885 		 */
886 	}
887 
888 	return dev;
889 }
890 
891 static const int fib6_prop[RTN_MAX + 1] = {
892 	[RTN_UNSPEC]	= 0,
893 	[RTN_UNICAST]	= 0,
894 	[RTN_LOCAL]	= 0,
895 	[RTN_BROADCAST]	= 0,
896 	[RTN_ANYCAST]	= 0,
897 	[RTN_MULTICAST]	= 0,
898 	[RTN_BLACKHOLE]	= -EINVAL,
899 	[RTN_UNREACHABLE] = -EHOSTUNREACH,
900 	[RTN_PROHIBIT]	= -EACCES,
901 	[RTN_THROW]	= -EAGAIN,
902 	[RTN_NAT]	= -EINVAL,
903 	[RTN_XRESOLVE]	= -EINVAL,
904 };
905 
906 static int ip6_rt_type_to_error(u8 fib6_type)
907 {
908 	return fib6_prop[fib6_type];
909 }
910 
911 static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
912 {
913 	unsigned short flags = 0;
914 
915 	if (rt->dst_nocount)
916 		flags |= DST_NOCOUNT;
917 	if (rt->dst_nopolicy)
918 		flags |= DST_NOPOLICY;
919 	if (rt->dst_host)
920 		flags |= DST_HOST;
921 
922 	return flags;
923 }
924 
925 static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
926 {
927 	rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);
928 
929 	switch (ort->fib6_type) {
930 	case RTN_BLACKHOLE:
931 		rt->dst.output = dst_discard_out;
932 		rt->dst.input = dst_discard;
933 		break;
934 	case RTN_PROHIBIT:
935 		rt->dst.output = ip6_pkt_prohibit_out;
936 		rt->dst.input = ip6_pkt_prohibit;
937 		break;
938 	case RTN_THROW:
939 	case RTN_UNREACHABLE:
940 	default:
941 		rt->dst.output = ip6_pkt_discard_out;
942 		rt->dst.input = ip6_pkt_discard;
943 		break;
944 	}
945 }
946 
947 static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
948 {
949 	if (ort->fib6_flags & RTF_REJECT) {
950 		ip6_rt_init_dst_reject(rt, ort);
951 		return;
952 	}
953 
954 	rt->dst.error = 0;
955 	rt->dst.output = ip6_output;
956 
957 	if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) {
958 		rt->dst.input = ip6_input;
959 	} else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
960 		rt->dst.input = ip6_mc_input;
961 	} else {
962 		rt->dst.input = ip6_forward;
963 	}
964 
965 	if (ort->fib6_nh.fib_nh_lws) {
966 		rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.fib_nh_lws);
967 		lwtunnel_set_redirect(&rt->dst);
968 	}
969 
970 	rt->dst.lastuse = jiffies;
971 }
972 
973 /* Caller must already hold reference to @from */
974 static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
975 {
976 	rt->rt6i_flags &= ~RTF_EXPIRES;
977 	rcu_assign_pointer(rt->from, from);
978 	ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
979 }
980 
981 /* Caller must already hold reference to @ort */
982 static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
983 {
984 	struct net_device *dev = fib6_info_nh_dev(ort);
985 
986 	ip6_rt_init_dst(rt, ort);
987 
988 	rt->rt6i_dst = ort->fib6_dst;
989 	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
990 	rt->rt6i_flags = ort->fib6_flags;
991 	if (ort->fib6_nh.fib_nh_gw_family) {
992 		rt->rt6i_gateway = ort->fib6_nh.fib_nh_gw6;
993 		rt->rt6i_flags |= RTF_GATEWAY;
994 	}
995 	rt6_set_from(rt, ort);
996 #ifdef CONFIG_IPV6_SUBTREES
997 	rt->rt6i_src = ort->fib6_src;
998 #endif
999 }
1000 
1001 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
1002 					struct in6_addr *saddr)
1003 {
1004 	struct fib6_node *pn, *sn;
1005 	while (1) {
1006 		if (fn->fn_flags & RTN_TL_ROOT)
1007 			return NULL;
1008 		pn = rcu_dereference(fn->parent);
1009 		sn = FIB6_SUBTREE(pn);
1010 		if (sn && sn != fn)
1011 			fn = fib6_node_lookup(sn, NULL, saddr);
1012 		else
1013 			fn = pn;
1014 		if (fn->fn_flags & RTN_RTINFO)
1015 			return fn;
1016 	}
1017 }
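/* Editor's annotation (not part of the original source): on lookup
 * failure the walk climbs towards the root.  At each parent it first
 * descends into the parent's source-address subtree, if one exists and
 * we did not just come from it, and it stops at the first node that
 * carries routes (RTN_RTINFO), returning NULL once the tree root
 * (RTN_TL_ROOT) is reached.
 */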
1018 
1019 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt)
1020 {
1021 	struct rt6_info *rt = *prt;
1022 
1023 	if (dst_hold_safe(&rt->dst))
1024 		return true;
1025 	if (net) {
1026 		rt = net->ipv6.ip6_null_entry;
1027 		dst_hold(&rt->dst);
1028 	} else {
1029 		rt = NULL;
1030 	}
1031 	*prt = rt;
1032 	return false;
1033 }
1034 
1035 /* called with rcu_read_lock() held */
1036 static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
1037 {
1038 	unsigned short flags = fib6_info_dst_flags(rt);
1039 	struct net_device *dev = rt->fib6_nh.fib_nh_dev;
1040 	struct rt6_info *nrt;
1041 
1042 	if (!fib6_info_hold_safe(rt))
1043 		goto fallback;
1044 
1045 	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
1046 	if (!nrt) {
1047 		fib6_info_release(rt);
1048 		goto fallback;
1049 	}
1050 
1051 	ip6_rt_copy_init(nrt, rt);
1052 	return nrt;
1053 
1054 fallback:
1055 	nrt = dev_net(dev)->ipv6.ip6_null_entry;
1056 	dst_hold(&nrt->dst);
1057 	return nrt;
1058 }
1059 
1060 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
1061 					     struct fib6_table *table,
1062 					     struct flowi6 *fl6,
1063 					     const struct sk_buff *skb,
1064 					     int flags)
1065 {
1066 	struct fib6_info *f6i;
1067 	struct fib6_node *fn;
1068 	struct rt6_info *rt;
1069 
1070 	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1071 		flags &= ~RT6_LOOKUP_F_IFACE;
1072 
1073 	rcu_read_lock();
1074 	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1075 restart:
1076 	f6i = rcu_dereference(fn->leaf);
1077 	if (!f6i)
1078 		f6i = net->ipv6.fib6_null_entry;
1079 	else
1080 		f6i = rt6_device_match(net, f6i, &fl6->saddr,
1081 				      fl6->flowi6_oif, flags);
1082 
1083 	if (f6i == net->ipv6.fib6_null_entry) {
1084 		fn = fib6_backtrack(fn, &fl6->saddr);
1085 		if (fn)
1086 			goto restart;
1087 
1088 		rt = net->ipv6.ip6_null_entry;
1089 		dst_hold(&rt->dst);
1090 		goto out;
1091 	}
1092 
1093 	if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
1094 		f6i = fib6_multipath_select(net, f6i, fl6, fl6->flowi6_oif, skb,
1095 					    flags);
1096 	/* Search through exception table */
1097 	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
1098 	if (rt) {
1099 		if (ip6_hold_safe(net, &rt))
1100 			dst_use_noref(&rt->dst, jiffies);
1101 	} else {
1102 		rt = ip6_create_rt_rcu(f6i);
1103 	}
1104 
1105 out:
1106 	trace_fib6_table_lookup(net, f6i, table, fl6);
1107 
1108 	rcu_read_unlock();
1109 
1110 	return rt;
1111 }
1112 
1113 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
1114 				   const struct sk_buff *skb, int flags)
1115 {
1116 	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
1117 }
1118 EXPORT_SYMBOL_GPL(ip6_route_lookup);
1119 
1120 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
1121 			    const struct in6_addr *saddr, int oif,
1122 			    const struct sk_buff *skb, int strict)
1123 {
1124 	struct flowi6 fl6 = {
1125 		.flowi6_oif = oif,
1126 		.daddr = *daddr,
1127 	};
1128 	struct dst_entry *dst;
1129 	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
1130 
1131 	if (saddr) {
1132 		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
1133 		flags |= RT6_LOOKUP_F_HAS_SADDR;
1134 	}
1135 
1136 	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
1137 	if (dst->error == 0)
1138 		return (struct rt6_info *) dst;
1139 
1140 	dst_release(dst);
1141 
1142 	return NULL;
1143 }
1144 EXPORT_SYMBOL(rt6_lookup);
1145 
1146 /* ip6_ins_rt is called with table->tb6_lock NOT held (it takes the
1147  * lock itself).  It takes a new route entry; if the addition fails
1148  * for any reason, the route is released.
1149  * Caller must hold a reference on the route before calling it.
1150  */
1151 
1152 static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
1153 			struct netlink_ext_ack *extack)
1154 {
1155 	int err;
1156 	struct fib6_table *table;
1157 
1158 	table = rt->fib6_table;
1159 	spin_lock_bh(&table->tb6_lock);
1160 	err = fib6_add(&table->tb6_root, rt, info, extack);
1161 	spin_unlock_bh(&table->tb6_lock);
1162 
1163 	return err;
1164 }
1165 
1166 int ip6_ins_rt(struct net *net, struct fib6_info *rt)
1167 {
1168 	struct nl_info info = {	.nl_net = net, };
1169 
1170 	return __ip6_ins_rt(rt, &info, NULL);
1171 }
1172 
1173 static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
1174 					   const struct in6_addr *daddr,
1175 					   const struct in6_addr *saddr)
1176 {
1177 	struct net_device *dev;
1178 	struct rt6_info *rt;
1179 
1180 	/*
1181 	 *	Clone the route.
1182 	 */
1183 
1184 	if (!fib6_info_hold_safe(ort))
1185 		return NULL;
1186 
1187 	dev = ip6_rt_get_dev_rcu(ort);
1188 	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
1189 	if (!rt) {
1190 		fib6_info_release(ort);
1191 		return NULL;
1192 	}
1193 
1194 	ip6_rt_copy_init(rt, ort);
1195 	rt->rt6i_flags |= RTF_CACHE;
1196 	rt->dst.flags |= DST_HOST;
1197 	rt->rt6i_dst.addr = *daddr;
1198 	rt->rt6i_dst.plen = 128;
1199 
1200 	if (!rt6_is_gw_or_nonexthop(ort)) {
1201 		if (ort->fib6_dst.plen != 128 &&
1202 		    ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
1203 			rt->rt6i_flags |= RTF_ANYCAST;
1204 #ifdef CONFIG_IPV6_SUBTREES
1205 		if (rt->rt6i_src.plen && saddr) {
1206 			rt->rt6i_src.addr = *saddr;
1207 			rt->rt6i_src.plen = 128;
1208 		}
1209 #endif
1210 	}
1211 
1212 	return rt;
1213 }
1214 
1215 static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
1216 {
1217 	unsigned short flags = fib6_info_dst_flags(rt);
1218 	struct net_device *dev;
1219 	struct rt6_info *pcpu_rt;
1220 
1221 	if (!fib6_info_hold_safe(rt))
1222 		return NULL;
1223 
1224 	rcu_read_lock();
1225 	dev = ip6_rt_get_dev_rcu(rt);
1226 	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
1227 	rcu_read_unlock();
1228 	if (!pcpu_rt) {
1229 		fib6_info_release(rt);
1230 		return NULL;
1231 	}
1232 	ip6_rt_copy_init(pcpu_rt, rt);
1233 	pcpu_rt->rt6i_flags |= RTF_PCPU;
1234 	return pcpu_rt;
1235 }
1236 
1237 /* It should be called with rcu_read_lock() acquired */
1238 static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
1239 {
1240 	struct rt6_info *pcpu_rt, **p;
1241 
1242 	p = this_cpu_ptr(rt->rt6i_pcpu);
1243 	pcpu_rt = *p;
1244 
1245 	if (pcpu_rt)
1246 		ip6_hold_safe(NULL, &pcpu_rt);
1247 
1248 	return pcpu_rt;
1249 }
1250 
1251 static struct rt6_info *rt6_make_pcpu_route(struct net *net,
1252 					    struct fib6_info *rt)
1253 {
1254 	struct rt6_info *pcpu_rt, *prev, **p;
1255 
1256 	pcpu_rt = ip6_rt_pcpu_alloc(rt);
1257 	if (!pcpu_rt) {
1258 		dst_hold(&net->ipv6.ip6_null_entry->dst);
1259 		return net->ipv6.ip6_null_entry;
1260 	}
1261 
1262 	dst_hold(&pcpu_rt->dst);
1263 	p = this_cpu_ptr(rt->rt6i_pcpu);
1264 	prev = cmpxchg(p, NULL, pcpu_rt);
1265 	BUG_ON(prev);
1266 
1267 	return pcpu_rt;
1268 }
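/* Editor's annotation (not part of the original source): the BUG_ON()
 * above holds because callers (see ip6_pol_route()) run under
 * rcu_read_lock() with BHs disabled, so only the local CPU can install
 * into its own rt6i_pcpu slot, and that slot was observed NULL by
 * rt6_get_pcpu_route() just before.
 */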
1269 
1270 /* exception hash table implementation
1271  */
1272 static DEFINE_SPINLOCK(rt6_exception_lock);
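/* Editor's annotation (not part of the original source): each fib6_info
 * can hang an array of FIB6_EXCEPTION_BUCKET_SIZE hlist buckets off its
 * rt6i_exception_bucket pointer.  Lookups run under rcu_read_lock();
 * every insertion and removal is serialized by the single global
 * rt6_exception_lock above.
 */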
1273 
1274 /* Remove rt6_ex from hash table and free the memory
1275  * Caller must hold rt6_exception_lock
1276  */
1277 static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1278 				 struct rt6_exception *rt6_ex)
1279 {
1280 	struct fib6_info *from;
1281 	struct net *net;
1282 
1283 	if (!bucket || !rt6_ex)
1284 		return;
1285 
1286 	net = dev_net(rt6_ex->rt6i->dst.dev);
1287 	net->ipv6.rt6_stats->fib_rt_cache--;
1288 
1289 	/* purge the exception completely to allow releasing the held resources:
1290 	 * a socket's [sk] dst cache may keep the dst around for an unlimited time
1291 	 */
1292 	from = rcu_dereference_protected(rt6_ex->rt6i->from,
1293 					 lockdep_is_held(&rt6_exception_lock));
1294 	rcu_assign_pointer(rt6_ex->rt6i->from, NULL);
1295 	fib6_info_release(from);
1296 	dst_dev_put(&rt6_ex->rt6i->dst);
1297 
1298 	hlist_del_rcu(&rt6_ex->hlist);
1299 	dst_release(&rt6_ex->rt6i->dst);
1300 	kfree_rcu(rt6_ex, rcu);
1301 	WARN_ON_ONCE(!bucket->depth);
1302 	bucket->depth--;
1303 }
1304 
1305 /* Remove oldest rt6_ex in bucket and free the memory
1306  * Caller must hold rt6_exception_lock
1307  */
1308 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1309 {
1310 	struct rt6_exception *rt6_ex, *oldest = NULL;
1311 
1312 	if (!bucket)
1313 		return;
1314 
1315 	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1316 		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1317 			oldest = rt6_ex;
1318 	}
1319 	rt6_remove_exception(bucket, oldest);
1320 }
1321 
1322 static u32 rt6_exception_hash(const struct in6_addr *dst,
1323 			      const struct in6_addr *src)
1324 {
1325 	static u32 seed __read_mostly;
1326 	u32 val;
1327 
1328 	net_get_random_once(&seed, sizeof(seed));
1329 	val = jhash(dst, sizeof(*dst), seed);
1330 
1331 #ifdef CONFIG_IPV6_SUBTREES
1332 	if (src)
1333 		val = jhash(src, sizeof(*src), val);
1334 #endif
1335 	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1336 }
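/* Editor's annotation (not part of the original source): hash_32()
 * folds the jhash value down to FIB6_EXCEPTION_BUCKET_SIZE_SHIFT bits,
 * so the return value is a bucket index in
 * [0, FIB6_EXCEPTION_BUCKET_SIZE) that callers add directly to the
 * bucket array base pointer.
 */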
1337 
1338 /* Helper function to find the cached rt in the hash table
1339  * and update bucket pointer to point to the bucket for this
1340  * (daddr, saddr) pair
1341  * Caller must hold rt6_exception_lock
1342  */
1343 static struct rt6_exception *
1344 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1345 			      const struct in6_addr *daddr,
1346 			      const struct in6_addr *saddr)
1347 {
1348 	struct rt6_exception *rt6_ex;
1349 	u32 hval;
1350 
1351 	if (!(*bucket) || !daddr)
1352 		return NULL;
1353 
1354 	hval = rt6_exception_hash(daddr, saddr);
1355 	*bucket += hval;
1356 
1357 	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1358 		struct rt6_info *rt6 = rt6_ex->rt6i;
1359 		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1360 
1361 #ifdef CONFIG_IPV6_SUBTREES
1362 		if (matched && saddr)
1363 			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1364 #endif
1365 		if (matched)
1366 			return rt6_ex;
1367 	}
1368 	return NULL;
1369 }
1370 
1371 /* Helper function to find the cached rt in the hash table
1372  * and update bucket pointer to point to the bucket for this
1373  * (daddr, saddr) pair
1374  * Caller must hold rcu_read_lock()
1375  */
1376 static struct rt6_exception *
1377 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1378 			 const struct in6_addr *daddr,
1379 			 const struct in6_addr *saddr)
1380 {
1381 	struct rt6_exception *rt6_ex;
1382 	u32 hval;
1383 
1384 	WARN_ON_ONCE(!rcu_read_lock_held());
1385 
1386 	if (!(*bucket) || !daddr)
1387 		return NULL;
1388 
1389 	hval = rt6_exception_hash(daddr, saddr);
1390 	*bucket += hval;
1391 
1392 	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1393 		struct rt6_info *rt6 = rt6_ex->rt6i;
1394 		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1395 
1396 #ifdef CONFIG_IPV6_SUBTREES
1397 		if (matched && saddr)
1398 			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1399 #endif
1400 		if (matched)
1401 			return rt6_ex;
1402 	}
1403 	return NULL;
1404 }
1405 
1406 static unsigned int fib6_mtu(const struct fib6_info *rt)
1407 {
1408 	unsigned int mtu;
1409 
1410 	if (rt->fib6_pmtu) {
1411 		mtu = rt->fib6_pmtu;
1412 	} else {
1413 		struct net_device *dev = fib6_info_nh_dev(rt);
1414 		struct inet6_dev *idev;
1415 
1416 		rcu_read_lock();
1417 		idev = __in6_dev_get(dev);
1418 		mtu = idev->cnf.mtu6;
1419 		rcu_read_unlock();
1420 	}
1421 
1422 	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1423 
1424 	return mtu - lwtunnel_headroom(rt->fib6_nh.fib_nh_lws, mtu);
1425 }
1426 
1427 static int rt6_insert_exception(struct rt6_info *nrt,
1428 				struct fib6_info *ort)
1429 {
1430 	struct net *net = dev_net(nrt->dst.dev);
1431 	struct rt6_exception_bucket *bucket;
1432 	struct in6_addr *src_key = NULL;
1433 	struct rt6_exception *rt6_ex;
1434 	int err = 0;
1435 
1436 	spin_lock_bh(&rt6_exception_lock);
1437 
1438 	if (ort->exception_bucket_flushed) {
1439 		err = -EINVAL;
1440 		goto out;
1441 	}
1442 
1443 	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
1444 					lockdep_is_held(&rt6_exception_lock));
1445 	if (!bucket) {
1446 		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
1447 				 GFP_ATOMIC);
1448 		if (!bucket) {
1449 			err = -ENOMEM;
1450 			goto out;
1451 		}
1452 		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
1453 	}
1454 
1455 #ifdef CONFIG_IPV6_SUBTREES
1456 	/* rt6i_src.plen != 0 indicates ort is in subtree
1457 	 * and exception table is indexed by a hash of
1458 	 * both rt6i_dst and rt6i_src.
1459 	 * Otherwise, the exception table is indexed by
1460 	 * a hash of only rt6i_dst.
1461 	 */
1462 	if (ort->fib6_src.plen)
1463 		src_key = &nrt->rt6i_src.addr;
1464 #endif
1465 	/* rt6_mtu_change() might lower mtu on ort.
1466 	 * Only insert this exception route if its mtu
1467 	 * is less than ort's mtu value.
1468 	 */
1469 	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
1470 		err = -EINVAL;
1471 		goto out;
1472 	}
1473 
1474 	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
1475 					       src_key);
1476 	if (rt6_ex)
1477 		rt6_remove_exception(bucket, rt6_ex);
1478 
1479 	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
1480 	if (!rt6_ex) {
1481 		err = -ENOMEM;
1482 		goto out;
1483 	}
1484 	rt6_ex->rt6i = nrt;
1485 	rt6_ex->stamp = jiffies;
1486 	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
1487 	bucket->depth++;
1488 	net->ipv6.rt6_stats->fib_rt_cache++;
1489 
1490 	if (bucket->depth > FIB6_MAX_DEPTH)
1491 		rt6_exception_remove_oldest(bucket);
1492 
1493 out:
1494 	spin_unlock_bh(&rt6_exception_lock);
1495 
1496 	/* Update fn->fn_sernum to invalidate all cached dsts */
1497 	if (!err) {
1498 		spin_lock_bh(&ort->fib6_table->tb6_lock);
1499 		fib6_update_sernum(net, ort);
1500 		spin_unlock_bh(&ort->fib6_table->tb6_lock);
1501 		fib6_force_start_gc(net);
1502 	}
1503 
1504 	return err;
1505 }
1506 
1507 void rt6_flush_exceptions(struct fib6_info *rt)
1508 {
1509 	struct rt6_exception_bucket *bucket;
1510 	struct rt6_exception *rt6_ex;
1511 	struct hlist_node *tmp;
1512 	int i;
1513 
1514 	spin_lock_bh(&rt6_exception_lock);
1515 	/* Prevent rt6_insert_exception() from recreating the bucket list */
1516 	rt->exception_bucket_flushed = 1;
1517 
1518 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1519 				    lockdep_is_held(&rt6_exception_lock));
1520 	if (!bucket)
1521 		goto out;
1522 
1523 	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1524 		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1525 			rt6_remove_exception(bucket, rt6_ex);
1526 		WARN_ON_ONCE(bucket->depth);
1527 		bucket++;
1528 	}
1529 
1530 out:
1531 	spin_unlock_bh(&rt6_exception_lock);
1532 }
1533 
1534 /* Find the cached rt in the hash table inside the passed-in rt.
1535  * Caller has to hold rcu_read_lock()
1536  */
1537 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
1538 					   struct in6_addr *daddr,
1539 					   struct in6_addr *saddr)
1540 {
1541 	struct rt6_exception_bucket *bucket;
1542 	struct in6_addr *src_key = NULL;
1543 	struct rt6_exception *rt6_ex;
1544 	struct rt6_info *res = NULL;
1545 
1546 	bucket = rcu_dereference(rt->rt6i_exception_bucket);
1547 
1548 #ifdef CONFIG_IPV6_SUBTREES
1549 	/* rt6i_src.plen != 0 indicates rt is in subtree
1550 	 * and exception table is indexed by a hash of
1551 	 * both rt6i_dst and rt6i_src.
1552 	 * Otherwise, the exception table is indexed by
1553 	 * a hash of only rt6i_dst.
1554 	 */
1555 	if (rt->fib6_src.plen)
1556 		src_key = saddr;
1557 #endif
1558 	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
1559 
1560 	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1561 		res = rt6_ex->rt6i;
1562 
1563 	return res;
1564 }
1565 
1566 /* Remove the passed in cached rt from the hash table that contains it */
1567 static int rt6_remove_exception_rt(struct rt6_info *rt)
1568 {
1569 	struct rt6_exception_bucket *bucket;
1570 	struct in6_addr *src_key = NULL;
1571 	struct rt6_exception *rt6_ex;
1572 	struct fib6_info *from;
1573 	int err;
1574 
1575 	from = rcu_dereference(rt->from);
1576 	if (!from ||
1577 	    !(rt->rt6i_flags & RTF_CACHE))
1578 		return -EINVAL;
1579 
1580 	if (!rcu_access_pointer(from->rt6i_exception_bucket))
1581 		return -ENOENT;
1582 
1583 	spin_lock_bh(&rt6_exception_lock);
1584 	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1585 				    lockdep_is_held(&rt6_exception_lock));
1586 #ifdef CONFIG_IPV6_SUBTREES
1587 	/* rt6i_src.plen != 0 indicates 'from' is in subtree
1588 	 * and exception table is indexed by a hash of
1589 	 * both rt6i_dst and rt6i_src.
1590 	 * Otherwise, the exception table is indexed by
1591 	 * a hash of only rt6i_dst.
1592 	 */
1593 	if (from->fib6_src.plen)
1594 		src_key = &rt->rt6i_src.addr;
1595 #endif
1596 	rt6_ex = __rt6_find_exception_spinlock(&bucket,
1597 					       &rt->rt6i_dst.addr,
1598 					       src_key);
1599 	if (rt6_ex) {
1600 		rt6_remove_exception(bucket, rt6_ex);
1601 		err = 0;
1602 	} else {
1603 		err = -ENOENT;
1604 	}
1605 
1606 	spin_unlock_bh(&rt6_exception_lock);
1607 	return err;
1608 }
1609 
1610 /* Find rt6_ex which contains the passed in rt cache and
1611  * refresh its stamp
1612  */
1613 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1614 {
1615 	struct rt6_exception_bucket *bucket;
1616 	struct in6_addr *src_key = NULL;
1617 	struct rt6_exception *rt6_ex;
1618 	struct fib6_info *from;
1619 
1620 	rcu_read_lock();
1621 	from = rcu_dereference(rt->from);
1622 	if (!from || !(rt->rt6i_flags & RTF_CACHE))
1623 		goto unlock;
1624 
1625 	bucket = rcu_dereference(from->rt6i_exception_bucket);
1626 
1627 #ifdef CONFIG_IPV6_SUBTREES
1628 	/* rt6i_src.plen != 0 indicates 'from' is in subtree
1629 	 * and exception table is indexed by a hash of
1630 	 * both rt6i_dst and rt6i_src.
1631 	 * Otherwise, the exception table is indexed by
1632 	 * a hash of only rt6i_dst.
1633 	 */
1634 	if (from->fib6_src.plen)
1635 		src_key = &rt->rt6i_src.addr;
1636 #endif
1637 	rt6_ex = __rt6_find_exception_rcu(&bucket,
1638 					  &rt->rt6i_dst.addr,
1639 					  src_key);
1640 	if (rt6_ex)
1641 		rt6_ex->stamp = jiffies;
1642 
1643 unlock:
1644 	rcu_read_unlock();
1645 }
1646 
1647 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1648 					 struct rt6_info *rt, int mtu)
1649 {
1650 	/* If the new MTU is lower than the route PMTU, this new MTU will be the
1651 	 * lowest MTU in the path: always allow updating the route PMTU to
1652 	 * reflect PMTU decreases.
1653 	 *
1654 	 * If the new MTU is higher, and the route PMTU is equal to the local
1655 	 * MTU, this means the old MTU is the lowest in the path, so allow
1656 	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
1657 	 * handle this.
1658 	 */
1659 
1660 	if (dst_mtu(&rt->dst) >= mtu)
1661 		return true;
1662 
1663 	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
1664 		return true;
1665 
1666 	return false;
1667 }
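/* Editor's annotation, illustrative only (values hypothetical): with a
 * link MTU (idev->cnf.mtu6) of 1500 and a route PMTU of 1400, lowering
 * to 1280 is allowed (first test), raising to 1500 is refused (some
 * other hop is the bottleneck); if the route PMTU equalled the link's
 * 1500, raising it would be allowed and left to PMTU discovery.
 */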
1668 
1669 static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
1670 				       struct fib6_info *rt, int mtu)
1671 {
1672 	struct rt6_exception_bucket *bucket;
1673 	struct rt6_exception *rt6_ex;
1674 	int i;
1675 
1676 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1677 					lockdep_is_held(&rt6_exception_lock));
1678 
1679 	if (!bucket)
1680 		return;
1681 
1682 	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1683 		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1684 			struct rt6_info *entry = rt6_ex->rt6i;
1685 
1686 			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
1687 			 * route), the metrics of its rt->from have already
1688 			 * been updated.
1689 			 */
1690 			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
1691 			    rt6_mtu_change_route_allowed(idev, entry, mtu))
1692 				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
1693 		}
1694 		bucket++;
1695 	}
1696 }
1697 
1698 #define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)
1699 
1700 static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
1701 					struct in6_addr *gateway)
1702 {
1703 	struct rt6_exception_bucket *bucket;
1704 	struct rt6_exception *rt6_ex;
1705 	struct hlist_node *tmp;
1706 	int i;
1707 
1708 	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1709 		return;
1710 
1711 	spin_lock_bh(&rt6_exception_lock);
1712 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1713 				     lockdep_is_held(&rt6_exception_lock));
1714 
1715 	if (bucket) {
1716 		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1717 			hlist_for_each_entry_safe(rt6_ex, tmp,
1718 						  &bucket->chain, hlist) {
1719 				struct rt6_info *entry = rt6_ex->rt6i;
1720 
1721 				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
1722 				    RTF_CACHE_GATEWAY &&
1723 				    ipv6_addr_equal(gateway,
1724 						    &entry->rt6i_gateway)) {
1725 					rt6_remove_exception(bucket, rt6_ex);
1726 				}
1727 			}
1728 			bucket++;
1729 		}
1730 	}
1731 
1732 	spin_unlock_bh(&rt6_exception_lock);
1733 }
1734 
1735 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
1736 				      struct rt6_exception *rt6_ex,
1737 				      struct fib6_gc_args *gc_args,
1738 				      unsigned long now)
1739 {
1740 	struct rt6_info *rt = rt6_ex->rt6i;
1741 
1742 	/* We prune and obsolete aged-out and non-gateway exceptions even if
1743 	 * others still hold references to them, so that those references can
1744 	 * be dropped on the next dst_check().
1745 	 * RTF_EXPIRES exceptions, e.g. pmtu-generated ones, are pruned once
1746 	 * expired, independently of their aging, as per RFC 8201 section 4
1747 	 */
1748 	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
1749 		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
1750 			RT6_TRACE("aging clone %p\n", rt);
1751 			rt6_remove_exception(bucket, rt6_ex);
1752 			return;
1753 		}
1754 	} else if (time_after(jiffies, rt->dst.expires)) {
1755 		RT6_TRACE("purging expired route %p\n", rt);
1756 		rt6_remove_exception(bucket, rt6_ex);
1757 		return;
1758 	}
1759 
1760 	if (rt->rt6i_flags & RTF_GATEWAY) {
1761 		struct neighbour *neigh;
1762 		__u8 neigh_flags = 0;
1763 
1764 		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
1765 		if (neigh)
1766 			neigh_flags = neigh->flags;
1767 
1768 		if (!(neigh_flags & NTF_ROUTER)) {
1769 			RT6_TRACE("purging route %p via non-router but gateway\n",
1770 				  rt);
1771 			rt6_remove_exception(bucket, rt6_ex);
1772 			return;
1773 		}
1774 	}
1775 
1776 	gc_args->more++;
1777 }
1778 
1779 void rt6_age_exceptions(struct fib6_info *rt,
1780 			struct fib6_gc_args *gc_args,
1781 			unsigned long now)
1782 {
1783 	struct rt6_exception_bucket *bucket;
1784 	struct rt6_exception *rt6_ex;
1785 	struct hlist_node *tmp;
1786 	int i;
1787 
1788 	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1789 		return;
1790 
1791 	rcu_read_lock_bh();
1792 	spin_lock(&rt6_exception_lock);
1793 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1794 				    lockdep_is_held(&rt6_exception_lock));
1795 
1796 	if (bucket) {
1797 		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1798 			hlist_for_each_entry_safe(rt6_ex, tmp,
1799 						  &bucket->chain, hlist) {
1800 				rt6_age_examine_exception(bucket, rt6_ex,
1801 							  gc_args, now);
1802 			}
1803 			bucket++;
1804 		}
1805 	}
1806 	spin_unlock(&rt6_exception_lock);
1807 	rcu_read_unlock_bh();
1808 }
1809 
1810 /* must be called with rcu lock held */
1811 struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
1812 				    int oif, struct flowi6 *fl6, int strict)
1813 {
1814 	struct fib6_node *fn, *saved_fn;
1815 	struct fib6_info *f6i;
1816 
1817 	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1818 	saved_fn = fn;
1819 
1820 	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1821 		oif = 0;
1822 
1823 redo_rt6_select:
1824 	f6i = rt6_select(net, fn, oif, strict);
1825 	if (f6i == net->ipv6.fib6_null_entry) {
1826 		fn = fib6_backtrack(fn, &fl6->saddr);
1827 		if (fn)
1828 			goto redo_rt6_select;
1829 		else if (strict & RT6_LOOKUP_F_REACHABLE) {
1830 			/* also consider unreachable route */
1831 			strict &= ~RT6_LOOKUP_F_REACHABLE;
1832 			fn = saved_fn;
1833 			goto redo_rt6_select;
1834 		}
1835 	}
1836 
1837 	trace_fib6_table_lookup(net, f6i, table, fl6);
1838 
1839 	return f6i;
1840 }
1841 
1842 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1843 			       int oif, struct flowi6 *fl6,
1844 			       const struct sk_buff *skb, int flags)
1845 {
1846 	struct fib6_info *f6i;
1847 	struct rt6_info *rt;
1848 	int strict = 0;
1849 
1850 	strict |= flags & RT6_LOOKUP_F_IFACE;
1851 	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1852 	if (net->ipv6.devconf_all->forwarding == 0)
1853 		strict |= RT6_LOOKUP_F_REACHABLE;
1854 
1855 	rcu_read_lock();
1856 
1857 	f6i = fib6_table_lookup(net, table, oif, fl6, strict);
1858 	if (f6i == net->ipv6.fib6_null_entry) {
1859 		rt = net->ipv6.ip6_null_entry;
1860 		rcu_read_unlock();
1861 		dst_hold(&rt->dst);
1862 		return rt;
1863 	}
1864 
1865 	if (f6i->fib6_nsiblings)
1866 		f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);
1867 
1868 	/* Search through exception table */
1869 	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
1870 	if (rt) {
1871 		if (ip6_hold_safe(net, &rt))
1872 			dst_use_noref(&rt->dst, jiffies);
1873 
1874 		rcu_read_unlock();
1875 		return rt;
1876 	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1877 			    !f6i->fib6_nh.fib_nh_gw_family)) {
1878 		/* Create an RTF_CACHE clone which will not be
1879 		 * owned by the fib6 tree.  It is for the special case where
1880 		 * the daddr in the skb during the neighbor look-up is different
1881 		 * from the fl6->daddr used to look up the route here.
1882 		 */
1883 		struct rt6_info *uncached_rt;
1884 
1885 		uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);
1886 
1887 		rcu_read_unlock();
1888 
1889 		if (uncached_rt) {
1890 			/* uncached_rt's refcnt was taken in ip6_rt_cache_alloc();
1891 			 * no need for another dst_hold().
1892 			 */
1893 			rt6_uncached_list_add(uncached_rt);
1894 			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
1895 		} else {
1896 			uncached_rt = net->ipv6.ip6_null_entry;
1897 			dst_hold(&uncached_rt->dst);
1898 		}
1899 
1900 		return uncached_rt;
1901 	} else {
1902 		/* Get a percpu copy */
1903 
1904 		struct rt6_info *pcpu_rt;
1905 
1906 		local_bh_disable();
1907 		pcpu_rt = rt6_get_pcpu_route(f6i);
1908 
1909 		if (!pcpu_rt)
1910 			pcpu_rt = rt6_make_pcpu_route(net, f6i);
1911 
1912 		local_bh_enable();
1913 		rcu_read_unlock();
1914 
1915 		return pcpu_rt;
1916 	}
1917 }
1918 EXPORT_SYMBOL_GPL(ip6_pol_route);
1919 
1920 static struct rt6_info *ip6_pol_route_input(struct net *net,
1921 					    struct fib6_table *table,
1922 					    struct flowi6 *fl6,
1923 					    const struct sk_buff *skb,
1924 					    int flags)
1925 {
1926 	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
1927 }
1928 
1929 struct dst_entry *ip6_route_input_lookup(struct net *net,
1930 					 struct net_device *dev,
1931 					 struct flowi6 *fl6,
1932 					 const struct sk_buff *skb,
1933 					 int flags)
1934 {
1935 	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1936 		flags |= RT6_LOOKUP_F_IFACE;
1937 
1938 	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
1939 }
1940 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1941 
1942 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1943 				  struct flow_keys *keys,
1944 				  struct flow_keys *flkeys)
1945 {
1946 	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1947 	const struct ipv6hdr *key_iph = outer_iph;
1948 	struct flow_keys *_flkeys = flkeys;
1949 	const struct ipv6hdr *inner_iph;
1950 	const struct icmp6hdr *icmph;
1951 	struct ipv6hdr _inner_iph;
1952 	struct icmp6hdr _icmph;
1953 
1954 	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1955 		goto out;
1956 
1957 	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
1958 				   sizeof(_icmph), &_icmph);
1959 	if (!icmph)
1960 		goto out;
1961 
1962 	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1963 	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1964 	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1965 	    icmph->icmp6_type != ICMPV6_PARAMPROB)
1966 		goto out;
1967 
1968 	inner_iph = skb_header_pointer(skb,
1969 				       skb_transport_offset(skb) + sizeof(*icmph),
1970 				       sizeof(_inner_iph), &_inner_iph);
1971 	if (!inner_iph)
1972 		goto out;
1973 
1974 	key_iph = inner_iph;
1975 	_flkeys = NULL;
1976 out:
1977 	if (_flkeys) {
1978 		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
1979 		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
1980 		keys->tags.flow_label = _flkeys->tags.flow_label;
1981 		keys->basic.ip_proto = _flkeys->basic.ip_proto;
1982 	} else {
1983 		keys->addrs.v6addrs.src = key_iph->saddr;
1984 		keys->addrs.v6addrs.dst = key_iph->daddr;
1985 		keys->tags.flow_label = ip6_flowlabel(key_iph);
1986 		keys->basic.ip_proto = key_iph->nexthdr;
1987 	}
1988 }
1989 
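/* Compute the multipath hash for a flow, according to the
 * net.ipv6.fib_multipath_hash_policy sysctl: policy 0 hashes on L3
 * keys only (addresses, flow label, protocol), policy 1 hashes on L3
 * addresses plus L4 ports, reusing a precomputed skb L4 hash when one
 * is present. The result is truncated to 31 bits.
 */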
1990 /* if skb is set it will be used and fl6 can be NULL */
1991 u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
1992 		       const struct sk_buff *skb, struct flow_keys *flkeys)
1993 {
1994 	struct flow_keys hash_keys;
1995 	u32 mhash;
1996 
1997 	switch (ip6_multipath_hash_policy(net)) {
1998 	case 0:
1999 		memset(&hash_keys, 0, sizeof(hash_keys));
2000 		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2001 		if (skb) {
2002 			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
2003 		} else {
2004 			hash_keys.addrs.v6addrs.src = fl6->saddr;
2005 			hash_keys.addrs.v6addrs.dst = fl6->daddr;
2006 			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
2007 			hash_keys.basic.ip_proto = fl6->flowi6_proto;
2008 		}
2009 		break;
2010 	case 1:
2011 		if (skb) {
2012 			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
2013 			struct flow_keys keys;
2014 
2015 			/* short-circuit if we already have L4 hash present */
2016 			if (skb->l4_hash)
2017 				return skb_get_hash_raw(skb) >> 1;
2018 
2019 			memset(&hash_keys, 0, sizeof(hash_keys));
2020 
2021 			if (!flkeys) {
2022 				skb_flow_dissect_flow_keys(skb, &keys, flag);
2023 				flkeys = &keys;
2024 			}
2025 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2026 			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
2027 			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
2028 			hash_keys.ports.src = flkeys->ports.src;
2029 			hash_keys.ports.dst = flkeys->ports.dst;
2030 			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2031 		} else {
2032 			memset(&hash_keys, 0, sizeof(hash_keys));
2033 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2034 			hash_keys.addrs.v6addrs.src = fl6->saddr;
2035 			hash_keys.addrs.v6addrs.dst = fl6->daddr;
2036 			hash_keys.ports.src = fl6->fl6_sport;
2037 			hash_keys.ports.dst = fl6->fl6_dport;
2038 			hash_keys.basic.ip_proto = fl6->flowi6_proto;
2039 		}
2040 		break;
2041 	}
2042 	mhash = flow_hash_from_keys(&hash_keys);
2043 
2044 	return mhash >> 1;
2045 }
2046 
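/* Resolve the route for an incoming packet: build a flowi6 from the
 * IPv6 header (plus any collected tunnel key), let the FIB rules do
 * an early flow dissection when they need it, and attach the
 * resulting dst to the skb. ICMPv6 packets get a multipath hash
 * computed up front so errors follow the flow they refer to.
 */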
2047 void ip6_route_input(struct sk_buff *skb)
2048 {
2049 	const struct ipv6hdr *iph = ipv6_hdr(skb);
2050 	struct net *net = dev_net(skb->dev);
2051 	int flags = RT6_LOOKUP_F_HAS_SADDR;
2052 	struct ip_tunnel_info *tun_info;
2053 	struct flowi6 fl6 = {
2054 		.flowi6_iif = skb->dev->ifindex,
2055 		.daddr = iph->daddr,
2056 		.saddr = iph->saddr,
2057 		.flowlabel = ip6_flowinfo(iph),
2058 		.flowi6_mark = skb->mark,
2059 		.flowi6_proto = iph->nexthdr,
2060 	};
2061 	struct flow_keys *flkeys = NULL, _flkeys;
2062 
2063 	tun_info = skb_tunnel_info(skb);
2064 	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2065 		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
2066 
2067 	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
2068 		flkeys = &_flkeys;
2069 
2070 	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
2071 		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
2072 	skb_dst_drop(skb);
2073 	skb_dst_set(skb,
2074 		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
2075 }
2076 
2077 static struct rt6_info *ip6_pol_route_output(struct net *net,
2078 					     struct fib6_table *table,
2079 					     struct flowi6 *fl6,
2080 					     const struct sk_buff *skb,
2081 					     int flags)
2082 {
2083 	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
2084 }
2085 
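/* Output route lookup. Link-local and multicast destinations are
 * first offered to an L3 master device; otherwise the lookup flags
 * are derived from the socket and flow: a strict interface match is
 * required for device-bound sockets, link-local destinations, or an
 * oif combined with an unspecified source address.
 */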
2086 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
2087 					 struct flowi6 *fl6, int flags)
2088 {
2089 	bool any_src;
2090 
2091 	if (ipv6_addr_type(&fl6->daddr) &
2092 	    (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
2093 		struct dst_entry *dst;
2094 
2095 		dst = l3mdev_link_scope_lookup(net, fl6);
2096 		if (dst)
2097 			return dst;
2098 	}
2099 
2100 	fl6->flowi6_iif = LOOPBACK_IFINDEX;
2101 
2102 	any_src = ipv6_addr_any(&fl6->saddr);
2103 	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
2104 	    (fl6->flowi6_oif && any_src))
2105 		flags |= RT6_LOOKUP_F_IFACE;
2106 
2107 	if (!any_src)
2108 		flags |= RT6_LOOKUP_F_HAS_SADDR;
2109 	else if (sk)
2110 		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
2111 
2112 	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
2113 }
2114 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
2115 
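/* Clone dst_orig into a standalone blackhole entry that silently
 * discards everything sent through it, while preserving the original
 * metrics, gateway and destination keys. The original dst is
 * released.
 */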
2116 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2117 {
2118 	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
2119 	struct net_device *loopback_dev = net->loopback_dev;
2120 	struct dst_entry *new = NULL;
2121 
2122 	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
2123 		       DST_OBSOLETE_DEAD, 0);
2124 	if (rt) {
2125 		rt6_info_init(rt);
2126 		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
2127 
2128 		new = &rt->dst;
2129 		new->__use = 1;
2130 		new->input = dst_discard;
2131 		new->output = dst_discard_out;
2132 
2133 		dst_copy_metrics(new, &ort->dst);
2134 
2135 		rt->rt6i_idev = in6_dev_get(loopback_dev);
2136 		rt->rt6i_gateway = ort->rt6i_gateway;
2137 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
2138 
2139 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
2140 #ifdef CONFIG_IPV6_SUBTREES
2141 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
2142 #endif
2143 	}
2144 
2145 	dst_release(dst_orig);
2146 	return new ? new : ERR_PTR(-ENOMEM);
2147 }
2148 
2149 /*
2150  *	Destination cache support functions
2151  */
2152 
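/* A fib6_info is still usable if its cookie matches and it has not
 * expired.
 */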
2153 static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2154 {
2155 	u32 rt_cookie = 0;
2156 
2157 	if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2158 		return false;
2159 
2160 	if (fib6_check_expired(f6i))
2161 		return false;
2162 
2163 	return true;
2164 }
2165 
2166 static struct dst_entry *rt6_check(struct rt6_info *rt,
2167 				   struct fib6_info *from,
2168 				   u32 cookie)
2169 {
2170 	u32 rt_cookie = 0;
2171 
2172 	if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
2173 	    rt_cookie != cookie)
2174 		return NULL;
2175 
2176 	if (rt6_check_expired(rt))
2177 		return NULL;
2178 
2179 	return &rt->dst;
2180 }
2181 
2182 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2183 					    struct fib6_info *from,
2184 					    u32 cookie)
2185 {
2186 	if (!__rt6_check_expired(rt) &&
2187 	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2188 	    fib6_check(from, cookie))
2189 		return &rt->dst;
2190 	else
2191 		return NULL;
2192 }
2193 
2194 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
2195 {
2196 	struct dst_entry *dst_ret;
2197 	struct fib6_info *from;
2198 	struct rt6_info *rt;
2199 
2200 	rt = container_of(dst, struct rt6_info, dst);
2201 
2202 	rcu_read_lock();
2203 
2204 	/* All IPv6 dsts are created with ->obsolete set to
2205 	 * DST_OBSOLETE_FORCE_CHK, which forces validation calls down
2206 	 * into this function on every use.
2207 	 */
2208 
2209 	from = rcu_dereference(rt->from);
2210 
2211 	if (from && (rt->rt6i_flags & RTF_PCPU ||
2212 	    unlikely(!list_empty(&rt->rt6i_uncached))))
2213 		dst_ret = rt6_dst_from_check(rt, from, cookie);
2214 	else
2215 		dst_ret = rt6_check(rt, from, cookie);
2216 
2217 	rcu_read_unlock();
2218 
2219 	return dst_ret;
2220 }
2221 
2222 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
2223 {
2224 	struct rt6_info *rt = (struct rt6_info *) dst;
2225 
2226 	if (rt) {
2227 		if (rt->rt6i_flags & RTF_CACHE) {
2228 			rcu_read_lock();
2229 			if (rt6_check_expired(rt)) {
2230 				rt6_remove_exception_rt(rt);
2231 				dst = NULL;
2232 			}
2233 			rcu_read_unlock();
2234 		} else {
2235 			dst_release(dst);
2236 			dst = NULL;
2237 		}
2238 	}
2239 	return dst;
2240 }
2241 
2242 static void ip6_link_failure(struct sk_buff *skb)
2243 {
2244 	struct rt6_info *rt;
2245 
2246 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2247 
2248 	rt = (struct rt6_info *) skb_dst(skb);
2249 	if (rt) {
2250 		rcu_read_lock();
2251 		if (rt->rt6i_flags & RTF_CACHE) {
2252 			rt6_remove_exception_rt(rt);
2253 		} else {
2254 			struct fib6_info *from;
2255 			struct fib6_node *fn;
2256 
2257 			from = rcu_dereference(rt->from);
2258 			if (from) {
2259 				fn = rcu_dereference(from->fib6_node);
2260 				if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2261 					fn->fn_sernum = -1;
2262 			}
2263 		}
2264 		rcu_read_unlock();
2265 	}
2266 }
2267 
2268 static void rt6_update_expires(struct rt6_info *rt0, int timeout)
2269 {
2270 	if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
2271 		struct fib6_info *from;
2272 
2273 		rcu_read_lock();
2274 		from = rcu_dereference(rt0->from);
2275 		if (from)
2276 			rt0->dst.expires = from->expires;
2277 		rcu_read_unlock();
2278 	}
2279 
2280 	dst_set_expires(&rt0->dst, timeout);
2281 	rt0->rt6i_flags |= RTF_EXPIRES;
2282 }
2283 
2284 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2285 {
2286 	struct net *net = dev_net(rt->dst.dev);
2287 
2288 	dst_metric_set(&rt->dst, RTAX_MTU, mtu);
2289 	rt->rt6i_flags |= RTF_MODIFIED;
2290 	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2291 }
2292 
2293 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2294 {
2295 	return !(rt->rt6i_flags & RTF_CACHE) &&
2296 		(rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from));
2297 }
2298 
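/* Update the path MTU towards a destination. The new MTU is clamped
 * to at least IPV6_MIN_MTU (1280) and only ever lowers the current
 * value. Routes that must not be modified in place instead get an
 * RTF_CACHE exception entry carrying the new MTU.
 */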
2299 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2300 				 const struct ipv6hdr *iph, u32 mtu)
2301 {
2302 	const struct in6_addr *daddr, *saddr;
2303 	struct rt6_info *rt6 = (struct rt6_info *)dst;
2304 
2305 	if (dst_metric_locked(dst, RTAX_MTU))
2306 		return;
2307 
2308 	if (iph) {
2309 		daddr = &iph->daddr;
2310 		saddr = &iph->saddr;
2311 	} else if (sk) {
2312 		daddr = &sk->sk_v6_daddr;
2313 		saddr = &inet6_sk(sk)->saddr;
2314 	} else {
2315 		daddr = NULL;
2316 		saddr = NULL;
2317 	}
2318 	dst_confirm_neigh(dst, daddr);
2319 	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2320 	if (mtu >= dst_mtu(dst))
2321 		return;
2322 
2323 	if (!rt6_cache_allowed_for_pmtu(rt6)) {
2324 		rt6_do_update_pmtu(rt6, mtu);
2325 		/* update rt6_ex->stamp for cache */
2326 		if (rt6->rt6i_flags & RTF_CACHE)
2327 			rt6_update_exception_stamp_rt(rt6);
2328 	} else if (daddr) {
2329 		struct fib6_info *from;
2330 		struct rt6_info *nrt6;
2331 
2332 		rcu_read_lock();
2333 		from = rcu_dereference(rt6->from);
2334 		nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
2335 		if (nrt6) {
2336 			rt6_do_update_pmtu(nrt6, mtu);
2337 			if (rt6_insert_exception(nrt6, from))
2338 				dst_release_immediate(&nrt6->dst);
2339 		}
2340 		rcu_read_unlock();
2341 	}
2342 }
2343 
2344 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2345 			       struct sk_buff *skb, u32 mtu)
2346 {
2347 	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2348 }
2349 
2350 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2351 		     int oif, u32 mark, kuid_t uid)
2352 {
2353 	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2354 	struct dst_entry *dst;
2355 	struct flowi6 fl6 = {
2356 		.flowi6_oif = oif,
2357 		.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark),
2358 		.daddr = iph->daddr,
2359 		.saddr = iph->saddr,
2360 		.flowlabel = ip6_flowinfo(iph),
2361 		.flowi6_uid = uid,
2362 	};
2363 
2364 	dst = ip6_route_output(net, NULL, &fl6);
2365 	if (!dst->error)
2366 		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2367 	dst_release(dst);
2368 }
2369 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2370 
2371 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2372 {
2373 	int oif = sk->sk_bound_dev_if;
2374 	struct dst_entry *dst;
2375 
2376 	if (!oif && skb->dev)
2377 		oif = l3mdev_master_ifindex(skb->dev);
2378 
2379 	ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid);
2380 
2381 	dst = __sk_dst_get(sk);
2382 	if (!dst || !dst->obsolete ||
2383 	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2384 		return;
2385 
2386 	bh_lock_sock(sk);
2387 	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2388 		ip6_datagram_dst_update(sk, false);
2389 	bh_unlock_sock(sk);
2390 }
2391 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2392 
2393 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
2394 			   const struct flowi6 *fl6)
2395 {
2396 #ifdef CONFIG_IPV6_SUBTREES
2397 	struct ipv6_pinfo *np = inet6_sk(sk);
2398 #endif
2399 
2400 	ip6_dst_store(sk, dst,
2401 		      ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
2402 		      &sk->sk_v6_daddr : NULL,
2403 #ifdef CONFIG_IPV6_SUBTREES
2404 		      ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
2405 		      &np->saddr :
2406 #endif
2407 		      NULL);
2408 }
2409 
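/* Decide whether a received redirect can be attributed to nexthop
 * 'nh'. On a gateway mismatch the exception table is consulted as
 * well, since a cached clone may already carry the redirected
 * gateway; a match found there is returned via *ret.
 */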
2410 static bool ip6_redirect_nh_match(struct fib6_info *f6i,
2411 				  struct fib6_nh *nh,
2412 				  struct flowi6 *fl6,
2413 				  const struct in6_addr *gw,
2414 				  struct rt6_info **ret)
2415 {
2416 	if (nh->fib_nh_flags & RTNH_F_DEAD || !nh->fib_nh_gw_family ||
2417 	    fl6->flowi6_oif != nh->fib_nh_dev->ifindex)
2418 		return false;
2419 
2420 	/* rt_cache's gateway might be different from its 'parent'
2421 	 * in the case of an IP redirect.
2422 	 * So we keep searching in the exception table if the gateway
2423 	 * is different.
2424 	 */
2425 	if (!ipv6_addr_equal(gw, &nh->fib_nh_gw6)) {
2426 		struct rt6_info *rt_cache;
2427 
2428 		rt_cache = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
2429 		if (rt_cache &&
2430 		    ipv6_addr_equal(gw, &rt_cache->rt6i_gateway)) {
2431 			*ret = rt_cache;
2432 			return true;
2433 		}
2434 		return false;
2435 	}
2436 	return true;
2437 }
2438 
2439 /* Handle redirects */
2440 struct ip6rd_flowi {
2441 	struct flowi6 fl6;
2442 	struct in6_addr gateway;
2443 };
2444 
2445 static struct rt6_info *__ip6_route_redirect(struct net *net,
2446 					     struct fib6_table *table,
2447 					     struct flowi6 *fl6,
2448 					     const struct sk_buff *skb,
2449 					     int flags)
2450 {
2451 	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2452 	struct rt6_info *ret = NULL;
2453 	struct fib6_info *rt;
2454 	struct fib6_node *fn;
2455 
2456 	/* Get the "current" route for this destination and
2457 	 * check if the redirect has come from the appropriate router.
2458 	 *
2459 	 * RFC 4861 specifies that redirects should only be
2460 	 * accepted if they come from the nexthop to the target.
2461 	 * Due to the way the routes are chosen, this notion
2462 	 * is a bit fuzzy and one might need to check all possible
2463 	 * routes.
2464 	 */
2465 
2466 	rcu_read_lock();
2467 	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2468 restart:
2469 	for_each_fib6_node_rt_rcu(fn) {
2470 		if (fib6_check_expired(rt))
2471 			continue;
2472 		if (rt->fib6_flags & RTF_REJECT)
2473 			break;
2474 		if (ip6_redirect_nh_match(rt, &rt->fib6_nh, fl6,
2475 					  &rdfl->gateway, &ret))
2476 			goto out;
2477 	}
2478 
2479 	if (!rt)
2480 		rt = net->ipv6.fib6_null_entry;
2481 	else if (rt->fib6_flags & RTF_REJECT) {
2482 		ret = net->ipv6.ip6_null_entry;
2483 		goto out;
2484 	}
2485 
2486 	if (rt == net->ipv6.fib6_null_entry) {
2487 		fn = fib6_backtrack(fn, &fl6->saddr);
2488 		if (fn)
2489 			goto restart;
2490 	}
2491 
2492 out:
2493 	if (ret)
2494 		ip6_hold_safe(net, &ret);
2495 	else
2496 		ret = ip6_create_rt_rcu(rt);
2497 
2498 	rcu_read_unlock();
2499 
2500 	trace_fib6_table_lookup(net, rt, table, fl6);
2501 	return ret;
2502 }
2503 
2504 static struct dst_entry *ip6_route_redirect(struct net *net,
2505 					    const struct flowi6 *fl6,
2506 					    const struct sk_buff *skb,
2507 					    const struct in6_addr *gateway)
2508 {
2509 	int flags = RT6_LOOKUP_F_HAS_SADDR;
2510 	struct ip6rd_flowi rdfl;
2511 
2512 	rdfl.fl6 = *fl6;
2513 	rdfl.gateway = *gateway;
2514 
2515 	return fib6_rule_lookup(net, &rdfl.fl6, skb,
2516 				flags, __ip6_route_redirect);
2517 }
2518 
2519 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2520 		  kuid_t uid)
2521 {
2522 	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2523 	struct dst_entry *dst;
2524 	struct flowi6 fl6 = {
2525 		.flowi6_iif = LOOPBACK_IFINDEX,
2526 		.flowi6_oif = oif,
2527 		.flowi6_mark = mark,
2528 		.daddr = iph->daddr,
2529 		.saddr = iph->saddr,
2530 		.flowlabel = ip6_flowinfo(iph),
2531 		.flowi6_uid = uid,
2532 	};
2533 
2534 	dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2535 	rt6_do_redirect(dst, NULL, skb);
2536 	dst_release(dst);
2537 }
2538 EXPORT_SYMBOL_GPL(ip6_redirect);
2539 
2540 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif)
2541 {
2542 	const struct ipv6hdr *iph = ipv6_hdr(skb);
2543 	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2544 	struct dst_entry *dst;
2545 	struct flowi6 fl6 = {
2546 		.flowi6_iif = LOOPBACK_IFINDEX,
2547 		.flowi6_oif = oif,
2548 		.daddr = msg->dest,
2549 		.saddr = iph->daddr,
2550 		.flowi6_uid = sock_net_uid(net, NULL),
2551 	};
2552 
2553 	dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2554 	rt6_do_redirect(dst, NULL, skb);
2555 	dst_release(dst);
2556 }
2557 
2558 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2559 {
2560 	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2561 		     sk->sk_uid);
2562 }
2563 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2564 
2565 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2566 {
2567 	struct net_device *dev = dst->dev;
2568 	unsigned int mtu = dst_mtu(dst);
2569 	struct net *net = dev_net(dev);
2570 
2571 	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2572 
2573 	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2574 		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2575 
2576 	/*
2577 	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2578 	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2579 	 * IPV6_MAXPLEN is also valid and means: "any MSS,
2580 	 * rely only on pmtu discovery"
2581 	 */
2582 	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2583 		mtu = IPV6_MAXPLEN;
2584 	return mtu;
2585 }
2586 
2587 static unsigned int ip6_mtu(const struct dst_entry *dst)
2588 {
2589 	struct inet6_dev *idev;
2590 	unsigned int mtu;
2591 
2592 	mtu = dst_metric_raw(dst, RTAX_MTU);
2593 	if (mtu)
2594 		goto out;
2595 
2596 	mtu = IPV6_MIN_MTU;
2597 
2598 	rcu_read_lock();
2599 	idev = __in6_dev_get(dst->dev);
2600 	if (idev)
2601 		mtu = idev->cnf.mtu6;
2602 	rcu_read_unlock();
2603 
2604 out:
2605 	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2606 
2607 	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2608 }
2609 
2610 /* MTU selection:
2611  * 1. mtu on route is locked - use it
2612  * 2. mtu from nexthop exception
2613  * 3. mtu from egress device
2614  *
2615  * based on ip6_dst_mtu_forward and exception logic of
2616  * rt6_find_cached_rt; called with rcu_read_lock
2617  */
2618 u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
2619 		      struct in6_addr *saddr)
2620 {
2621 	struct rt6_exception_bucket *bucket;
2622 	struct rt6_exception *rt6_ex;
2623 	struct in6_addr *src_key;
2624 	struct inet6_dev *idev;
2625 	u32 mtu = 0;
2626 
2627 	if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
2628 		mtu = f6i->fib6_pmtu;
2629 		if (mtu)
2630 			goto out;
2631 	}
2632 
2633 	src_key = NULL;
2634 #ifdef CONFIG_IPV6_SUBTREES
2635 	if (f6i->fib6_src.plen)
2636 		src_key = saddr;
2637 #endif
2638 
2639 	bucket = rcu_dereference(f6i->rt6i_exception_bucket);
2640 	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
2641 	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
2642 		mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU);
2643 
2644 	if (likely(!mtu)) {
2645 		struct net_device *dev = fib6_info_nh_dev(f6i);
2646 
2647 		mtu = IPV6_MIN_MTU;
2648 		idev = __in6_dev_get(dev);
2649 		if (idev && idev->cnf.mtu6 > mtu)
2650 			mtu = idev->cnf.mtu6;
2651 	}
2652 
2653 	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2654 out:
2655 	return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu);
2656 }
2657 
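/* Allocate a throw-away dst for an outgoing ICMPv6 message. The
 * entry is host-scoped (/128), never enters the FIB, and sits on the
 * uncached list so that device teardown can release it; the result is
 * passed through xfrm_lookup() before being returned.
 */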
2658 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2659 				  struct flowi6 *fl6)
2660 {
2661 	struct dst_entry *dst;
2662 	struct rt6_info *rt;
2663 	struct inet6_dev *idev = in6_dev_get(dev);
2664 	struct net *net = dev_net(dev);
2665 
2666 	if (unlikely(!idev))
2667 		return ERR_PTR(-ENODEV);
2668 
2669 	rt = ip6_dst_alloc(net, dev, 0);
2670 	if (unlikely(!rt)) {
2671 		in6_dev_put(idev);
2672 		dst = ERR_PTR(-ENOMEM);
2673 		goto out;
2674 	}
2675 
2676 	rt->dst.flags |= DST_HOST;
2677 	rt->dst.input = ip6_input;
2678 	rt->dst.output  = ip6_output;
2679 	rt->rt6i_gateway  = fl6->daddr;
2680 	rt->rt6i_dst.addr = fl6->daddr;
2681 	rt->rt6i_dst.plen = 128;
2682 	rt->rt6i_idev     = idev;
2683 	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2684 
2685 	/* Add this dst to the uncached list so that rt6_disable_ip() can
2686 	 * properly release the net_device.
2687 	 */
2688 	rt6_uncached_list_add(rt);
2689 	atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2690 
2691 	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2692 
2693 out:
2694 	return dst;
2695 }
2696 
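/* dst garbage collection, driven by dst_ops. A run is skipped while
 * the minimum GC interval has not yet elapsed and the entry count is
 * still below ip6_rt_max_size. ip6_rt_gc_expire is the adaptive
 * timeout handed to fib6_run_gc(); it is bumped on every forced run
 * and decays by the elasticity factor afterwards. Returns nonzero
 * while the table is still above ip6_rt_max_size.
 */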
2697 static int ip6_dst_gc(struct dst_ops *ops)
2698 {
2699 	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2700 	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2701 	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2702 	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2703 	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2704 	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2705 	int entries;
2706 
2707 	entries = dst_entries_get_fast(ops);
2708 	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2709 	    entries <= rt_max_size)
2710 		goto out;
2711 
2712 	net->ipv6.ip6_rt_gc_expire++;
2713 	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2714 	entries = dst_entries_get_slow(ops);
2715 	if (entries < ops->gc_thresh)
2716 		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout >> 1;
2717 out:
2718 	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire >> rt_elasticity;
2719 	return entries > rt_max_size;
2720 }
2721 
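/* Resolve a nexthop gateway in one specific FIB table. Returns NULL
 * if the table does not exist or if the lookup only hits the null
 * entry, letting the caller fall back to a full lookup.
 */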
2722 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2723 					    struct fib6_config *cfg,
2724 					    const struct in6_addr *gw_addr,
2725 					    u32 tbid, int flags)
2726 {
2727 	struct flowi6 fl6 = {
2728 		.flowi6_oif = cfg->fc_ifindex,
2729 		.daddr = *gw_addr,
2730 		.saddr = cfg->fc_prefsrc,
2731 	};
2732 	struct fib6_table *table;
2733 	struct rt6_info *rt;
2734 
2735 	table = fib6_get_table(net, tbid);
2736 	if (!table)
2737 		return NULL;
2738 
2739 	if (!ipv6_addr_any(&cfg->fc_prefsrc))
2740 		flags |= RT6_LOOKUP_F_HAS_SADDR;
2741 
2742 	flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2743 	rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2744 
2745 	/* if table lookup failed, fall back to full lookup */
2746 	if (rt == net->ipv6.ip6_null_entry) {
2747 		ip6_rt_put(rt);
2748 		rt = NULL;
2749 	}
2750 
2751 	return rt;
2752 }
2753 
2754 static int ip6_route_check_nh_onlink(struct net *net,
2755 				     struct fib6_config *cfg,
2756 				     const struct net_device *dev,
2757 				     struct netlink_ext_ack *extack)
2758 {
2759 	u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2760 	const struct in6_addr *gw_addr = &cfg->fc_gateway;
2761 	u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2762 	struct fib6_info *from;
2763 	struct rt6_info *grt;
2764 	int err;
2765 
2766 	err = 0;
2767 	grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2768 	if (grt) {
2769 		rcu_read_lock();
2770 		from = rcu_dereference(grt->from);
2771 		if (!grt->dst.error &&
2772 		    /* ignore match if it is the default route */
2773 		    from && !ipv6_addr_any(&from->fib6_dst.addr) &&
2774 		    (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2775 			NL_SET_ERR_MSG(extack,
2776 				       "Nexthop has invalid gateway or device mismatch");
2777 			err = -EINVAL;
2778 		}
2779 		rcu_read_unlock();
2780 
2781 		ip6_rt_put(grt);
2782 	}
2783 
2784 	return err;
2785 }
2786 
2787 static int ip6_route_check_nh(struct net *net,
2788 			      struct fib6_config *cfg,
2789 			      struct net_device **_dev,
2790 			      struct inet6_dev **idev)
2791 {
2792 	const struct in6_addr *gw_addr = &cfg->fc_gateway;
2793 	struct net_device *dev = _dev ? *_dev : NULL;
2794 	struct rt6_info *grt = NULL;
2795 	int err = -EHOSTUNREACH;
2796 
2797 	if (cfg->fc_table) {
2798 		int flags = RT6_LOOKUP_F_IFACE;
2799 
2800 		grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2801 					  cfg->fc_table, flags);
2802 		if (grt) {
2803 			if (grt->rt6i_flags & RTF_GATEWAY ||
2804 			    (dev && dev != grt->dst.dev)) {
2805 				ip6_rt_put(grt);
2806 				grt = NULL;
2807 			}
2808 		}
2809 	}
2810 
2811 	if (!grt)
2812 		grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
2813 
2814 	if (!grt)
2815 		goto out;
2816 
2817 	if (dev) {
2818 		if (dev != grt->dst.dev) {
2819 			ip6_rt_put(grt);
2820 			goto out;
2821 		}
2822 	} else {
2823 		*_dev = dev = grt->dst.dev;
2824 		*idev = grt->rt6i_idev;
2825 		dev_hold(dev);
2826 		in6_dev_hold(grt->rt6i_idev);
2827 	}
2828 
2829 	if (!(grt->rt6i_flags & RTF_GATEWAY))
2830 		err = 0;
2831 
2832 	ip6_rt_put(grt);
2833 
2834 out:
2835 	return err;
2836 }
2837 
2838 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
2839 			   struct net_device **_dev, struct inet6_dev **idev,
2840 			   struct netlink_ext_ack *extack)
2841 {
2842 	const struct in6_addr *gw_addr = &cfg->fc_gateway;
2843 	int gwa_type = ipv6_addr_type(gw_addr);
2844 	bool skip_dev = !(gwa_type & IPV6_ADDR_LINKLOCAL);
2845 	const struct net_device *dev = *_dev;
2846 	bool need_addr_check = !dev;
2847 	int err = -EINVAL;
2848 
2849 	/* If gw_addr is local we will fail to detect this in case the
2850 	 * address is still TENTATIVE (DAD in progress). rt6_lookup()
2851 	 * will return the already-added prefix route via the interface
2852 	 * that the prefix route was assigned to, which might be non-loopback.
2853 	 */
2854 	if (dev &&
2855 	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2856 		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2857 		goto out;
2858 	}
2859 
2860 	if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
2861 		/* IPv6 strictly inhibits using non-link-local
2862 		 * addresses as nexthop addresses.
2863 		 * Otherwise, a router will not be able to send redirects.
2864 		 * It is very good, but in some (rare!) circumstances
2865 		 * (SIT, PtP, NBMA NOARP links) it is handy to allow
2866 		 * some exceptions. --ANK
2867 		 * We allow IPv4-mapped nexthops to support RFC 4798-type
2868 		 * addressing.
2869 		 */
2870 		if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
2871 			NL_SET_ERR_MSG(extack, "Invalid gateway address");
2872 			goto out;
2873 		}
2874 
2875 		if (cfg->fc_flags & RTNH_F_ONLINK)
2876 			err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
2877 		else
2878 			err = ip6_route_check_nh(net, cfg, _dev, idev);
2879 
2880 		if (err)
2881 			goto out;
2882 	}
2883 
2884 	/* reload in case device was changed */
2885 	dev = *_dev;
2886 
2887 	err = -EINVAL;
2888 	if (!dev) {
2889 		NL_SET_ERR_MSG(extack, "Egress device not specified");
2890 		goto out;
2891 	} else if (dev->flags & IFF_LOOPBACK) {
2892 		NL_SET_ERR_MSG(extack,
2893 			       "Egress device can not be loopback device for this route");
2894 		goto out;
2895 	}
2896 
2897 	/* if we did not check gw_addr above, do so now that the
2898 	 * egress device has been resolved.
2899 	 */
2900 	if (need_addr_check &&
2901 	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2902 		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2903 		goto out;
2904 	}
2905 
2906 	err = 0;
2907 out:
2908 	return err;
2909 }
2910 
2911 static bool fib6_is_reject(u32 flags, struct net_device *dev, int addr_type)
2912 {
2913 	if ((flags & RTF_REJECT) ||
2914 	    (dev && (dev->flags & IFF_LOOPBACK) &&
2915 	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
2916 	     !(flags & RTF_LOCAL)))
2917 		return true;
2918 
2919 	return false;
2920 }
2921 
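/* Fill in a fib6_nh from a route config: resolve the egress device,
 * validate any gateway, set the ONLINK/LINKDOWN flags and initialize
 * lwtunnel state. Reject-type routes are pinned to the loopback
 * device. On error, all acquired references are dropped again.
 */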
2922 int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
2923 		 struct fib6_config *cfg, gfp_t gfp_flags,
2924 		 struct netlink_ext_ack *extack)
2925 {
2926 	struct net_device *dev = NULL;
2927 	struct inet6_dev *idev = NULL;
2928 	int addr_type;
2929 	int err;
2930 
2931 	fib6_nh->fib_nh_family = AF_INET6;
2932 
2933 	err = -ENODEV;
2934 	if (cfg->fc_ifindex) {
2935 		dev = dev_get_by_index(net, cfg->fc_ifindex);
2936 		if (!dev)
2937 			goto out;
2938 		idev = in6_dev_get(dev);
2939 		if (!idev)
2940 			goto out;
2941 	}
2942 
2943 	if (cfg->fc_flags & RTNH_F_ONLINK) {
2944 		if (!dev) {
2945 			NL_SET_ERR_MSG(extack,
2946 				       "Nexthop device required for onlink");
2947 			goto out;
2948 		}
2949 
2950 		if (!(dev->flags & IFF_UP)) {
2951 			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
2952 			err = -ENETDOWN;
2953 			goto out;
2954 		}
2955 
2956 		fib6_nh->fib_nh_flags |= RTNH_F_ONLINK;
2957 	}
2958 
2959 	fib6_nh->fib_nh_weight = 1;
2960 
2961 	/* We cannot add true routes via loopback here; they would
2962 	 * result in kernel looping. Promote them to reject routes.
2963 	 */
2964 	addr_type = ipv6_addr_type(&cfg->fc_dst);
2965 	if (fib6_is_reject(cfg->fc_flags, dev, addr_type)) {
2966 		/* hold loopback dev/idev if we haven't done so. */
2967 		if (dev != net->loopback_dev) {
2968 			if (dev) {
2969 				dev_put(dev);
2970 				in6_dev_put(idev);
2971 			}
2972 			dev = net->loopback_dev;
2973 			dev_hold(dev);
2974 			idev = in6_dev_get(dev);
2975 			if (!idev) {
2976 				err = -ENODEV;
2977 				goto out;
2978 			}
2979 		}
2980 		goto set_dev;
2981 	}
2982 
2983 	if (cfg->fc_flags & RTF_GATEWAY) {
2984 		err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
2985 		if (err)
2986 			goto out;
2987 
2988 		fib6_nh->fib_nh_gw6 = cfg->fc_gateway;
2989 		fib6_nh->fib_nh_gw_family = AF_INET6;
2990 	}
2991 
2992 	err = -ENODEV;
2993 	if (!dev)
2994 		goto out;
2995 
2996 	if (idev->cnf.disable_ipv6) {
2997 		NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
2998 		err = -EACCES;
2999 		goto out;
3000 	}
3001 
3002 	if (!(dev->flags & IFF_UP) && !cfg->fc_ignore_dev_down) {
3003 		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3004 		err = -ENETDOWN;
3005 		goto out;
3006 	}
3007 
3008 	if (!(cfg->fc_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
3009 	    !netif_carrier_ok(dev))
3010 		fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;
3011 
3012 	err = fib_nh_common_init(&fib6_nh->nh_common, cfg->fc_encap,
3013 				 cfg->fc_encap_type, cfg, gfp_flags, extack);
3014 	if (err)
3015 		goto out;
3016 set_dev:
3017 	fib6_nh->fib_nh_dev = dev;
3018 	fib6_nh->fib_nh_oif = dev->ifindex;
3019 	err = 0;
3020 out:
3021 	if (idev)
3022 		in6_dev_put(idev);
3023 
3024 	if (err) {
3025 		lwtstate_put(fib6_nh->fib_nh_lws);
3026 		fib6_nh->fib_nh_lws = NULL;
3027 		if (dev)
3028 			dev_put(dev);
3029 	}
3030 
3031 	return err;
3032 }
3033 
3034 void fib6_nh_release(struct fib6_nh *fib6_nh)
3035 {
3036 	fib_nh_common_release(&fib6_nh->nh_common);
3037 }
3038 
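/* Validate a fib6_config and build a fib6_info from it. The entry is
 * not linked into any table yet; callers insert it separately (see
 * ip6_route_add()). Returns an ERR_PTR on failure.
 */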
3039 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
3040 					      gfp_t gfp_flags,
3041 					      struct netlink_ext_ack *extack)
3042 {
3043 	struct net *net = cfg->fc_nlinfo.nl_net;
3044 	struct fib6_info *rt = NULL;
3045 	struct fib6_table *table;
3046 	int err = -EINVAL;
3047 	int addr_type;
3048 
3049 	/* RTF_PCPU is an internal flag; can not be set by userspace */
3050 	if (cfg->fc_flags & RTF_PCPU) {
3051 		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
3052 		goto out;
3053 	}
3054 
3055 	/* RTF_CACHE is an internal flag; can not be set by userspace */
3056 	if (cfg->fc_flags & RTF_CACHE) {
3057 		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
3058 		goto out;
3059 	}
3060 
3061 	if (cfg->fc_type > RTN_MAX) {
3062 		NL_SET_ERR_MSG(extack, "Invalid route type");
3063 		goto out;
3064 	}
3065 
3066 	if (cfg->fc_dst_len > 128) {
3067 		NL_SET_ERR_MSG(extack, "Invalid prefix length");
3068 		goto out;
3069 	}
3070 	if (cfg->fc_src_len > 128) {
3071 		NL_SET_ERR_MSG(extack, "Invalid source address length");
3072 		goto out;
3073 	}
3074 #ifndef CONFIG_IPV6_SUBTREES
3075 	if (cfg->fc_src_len) {
3076 		NL_SET_ERR_MSG(extack,
3077 			       "Specifying source address requires IPV6_SUBTREES to be enabled");
3078 		goto out;
3079 	}
3080 #endif
3081 
3082 	err = -ENOBUFS;
3083 	if (cfg->fc_nlinfo.nlh &&
3084 	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
3085 		table = fib6_get_table(net, cfg->fc_table);
3086 		if (!table) {
3087 			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
3088 			table = fib6_new_table(net, cfg->fc_table);
3089 		}
3090 	} else {
3091 		table = fib6_new_table(net, cfg->fc_table);
3092 	}
3093 
3094 	if (!table)
3095 		goto out;
3096 
3097 	err = -ENOMEM;
3098 	rt = fib6_info_alloc(gfp_flags);
3099 	if (!rt)
3100 		goto out;
3101 
3102 	rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len,
3103 					       extack);
3104 	if (IS_ERR(rt->fib6_metrics)) {
3105 		err = PTR_ERR(rt->fib6_metrics);
3106 		/* Do not leave garbage there. */
3107 		rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
3108 		goto out;
3109 	}
3110 
3111 	if (cfg->fc_flags & RTF_ADDRCONF)
3112 		rt->dst_nocount = true;
3113 
3114 	if (cfg->fc_flags & RTF_EXPIRES)
3115 		fib6_set_expires(rt, jiffies +
3116 				clock_t_to_jiffies(cfg->fc_expires));
3117 	else
3118 		fib6_clean_expires(rt);
3119 
3120 	if (cfg->fc_protocol == RTPROT_UNSPEC)
3121 		cfg->fc_protocol = RTPROT_BOOT;
3122 	rt->fib6_protocol = cfg->fc_protocol;
3123 
3124 	rt->fib6_table = table;
3125 	rt->fib6_metric = cfg->fc_metric;
3126 	rt->fib6_type = cfg->fc_type;
3127 	rt->fib6_flags = cfg->fc_flags & ~RTF_GATEWAY;
3128 
3129 	ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
3130 	rt->fib6_dst.plen = cfg->fc_dst_len;
3131 	if (rt->fib6_dst.plen == 128)
3132 		rt->dst_host = true;
3133 
3134 #ifdef CONFIG_IPV6_SUBTREES
3135 	ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
3136 	rt->fib6_src.plen = cfg->fc_src_len;
3137 #endif
3138 	err = fib6_nh_init(net, &rt->fib6_nh, cfg, gfp_flags, extack);
3139 	if (err)
3140 		goto out;
3141 
3142 	/* We cannot add true routes via loopback here; they would
3143 	 * result in kernel looping. Promote them to reject routes.
3144 	 */
3145 	addr_type = ipv6_addr_type(&cfg->fc_dst);
3146 	if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh.fib_nh_dev, addr_type))
3147 		rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP;
3148 
3149 	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
3150 		struct net_device *dev = fib6_info_nh_dev(rt);
3151 
3152 		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
3153 			NL_SET_ERR_MSG(extack, "Invalid source address");
3154 			err = -EINVAL;
3155 			goto out;
3156 		}
3157 		rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
3158 		rt->fib6_prefsrc.plen = 128;
3159 	} else
3160 		rt->fib6_prefsrc.plen = 0;
3161 
3162 	return rt;
3163 out:
3164 	fib6_info_release(rt);
3165 	return ERR_PTR(err);
3166 }
3167 
3168 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3169 		  struct netlink_ext_ack *extack)
3170 {
3171 	struct fib6_info *rt;
3172 	int err;
3173 
3174 	rt = ip6_route_info_create(cfg, gfp_flags, extack);
3175 	if (IS_ERR(rt))
3176 		return PTR_ERR(rt);
3177 
3178 	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3179 	fib6_info_release(rt);
3180 
3181 	return err;
3182 }
3183 
3184 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3185 {
3186 	struct net *net = info->nl_net;
3187 	struct fib6_table *table;
3188 	int err;
3189 
3190 	if (rt == net->ipv6.fib6_null_entry) {
3191 		err = -ENOENT;
3192 		goto out;
3193 	}
3194 
3195 	table = rt->fib6_table;
3196 	spin_lock_bh(&table->tb6_lock);
3197 	err = fib6_del(rt, info);
3198 	spin_unlock_bh(&table->tb6_lock);
3199 
3200 out:
3201 	fib6_info_release(rt);
3202 	return err;
3203 }
3204 
3205 int ip6_del_rt(struct net *net, struct fib6_info *rt)
3206 {
3207 	struct nl_info info = { .nl_net = net };
3208 
3209 	return __ip6_del_rt(rt, &info);
3210 }
3211 
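
/* Delete a multipath route and, when fc_delete_all_nh is set, all of
 * its siblings, preferably notifying userspace once with a single
 * RTM_DELROUTE message that carries every hop.
 */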
3212 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
3213 {
3214 	struct nl_info *info = &cfg->fc_nlinfo;
3215 	struct net *net = info->nl_net;
3216 	struct sk_buff *skb = NULL;
3217 	struct fib6_table *table;
3218 	int err = -ENOENT;
3219 
3220 	if (rt == net->ipv6.fib6_null_entry)
3221 		goto out_put;
3222 	table = rt->fib6_table;
3223 	spin_lock_bh(&table->tb6_lock);
3224 
3225 	if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
3226 		struct fib6_info *sibling, *next_sibling;
3227 
3228 		/* prefer to send a single notification with all hops */
3229 		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3230 		if (skb) {
3231 			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3232 
3233 			if (rt6_fill_node(net, skb, rt, NULL,
3234 					  NULL, NULL, 0, RTM_DELROUTE,
3235 					  info->portid, seq, 0) < 0) {
3236 				kfree_skb(skb);
3237 				skb = NULL;
3238 			} else
3239 				info->skip_notify = 1;
3240 		}
3241 
3242 		list_for_each_entry_safe(sibling, next_sibling,
3243 					 &rt->fib6_siblings,
3244 					 fib6_siblings) {
3245 			err = fib6_del(sibling, info);
3246 			if (err)
3247 				goto out_unlock;
3248 		}
3249 	}
3250 
3251 	err = fib6_del(rt, info);
3252 out_unlock:
3253 	spin_unlock_bh(&table->tb6_lock);
3254 out_put:
3255 	fib6_info_release(rt);
3256 
3257 	if (skb) {
3258 		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3259 			    info->nlh, gfp_any());
3260 	}
3261 	return err;
3262 }
3263 
3264 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3265 {
3266 	int rc = -ESRCH;
3267 
3268 	if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3269 		goto out;
3270 
3271 	if (cfg->fc_flags & RTF_GATEWAY &&
3272 	    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3273 		goto out;
3274 
3275 	rc = rt6_remove_exception_rt(rt);
3276 out:
3277 	return rc;
3278 }
3279 
3280 static int ip6_route_del(struct fib6_config *cfg,
3281 			 struct netlink_ext_ack *extack)
3282 {
3283 	struct rt6_info *rt_cache;
3284 	struct fib6_table *table;
3285 	struct fib6_info *rt;
3286 	struct fib6_node *fn;
3287 	int err = -ESRCH;
3288 
3289 	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3290 	if (!table) {
3291 		NL_SET_ERR_MSG(extack, "FIB table does not exist");
3292 		return err;
3293 	}
3294 
3295 	rcu_read_lock();
3296 
3297 	fn = fib6_locate(&table->tb6_root,
3298 			 &cfg->fc_dst, cfg->fc_dst_len,
3299 			 &cfg->fc_src, cfg->fc_src_len,
3300 			 !(cfg->fc_flags & RTF_CACHE));
3301 
3302 	if (fn) {
3303 		for_each_fib6_node_rt_rcu(fn) {
3304 			struct fib6_nh *nh;
3305 
3306 			if (cfg->fc_flags & RTF_CACHE) {
3307 				int rc;
3308 
3309 				rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
3310 							      &cfg->fc_src);
3311 				if (rt_cache) {
3312 					rc = ip6_del_cached_rt(rt_cache, cfg);
3313 					if (rc != -ESRCH) {
3314 						rcu_read_unlock();
3315 						return rc;
3316 					}
3317 				}
3318 				continue;
3319 			}
3320 
3321 			nh = &rt->fib6_nh;
3322 			if (cfg->fc_ifindex &&
3323 			    (!nh->fib_nh_dev ||
3324 			     nh->fib_nh_dev->ifindex != cfg->fc_ifindex))
3325 				continue;
3326 			if (cfg->fc_flags & RTF_GATEWAY &&
3327 			    !ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6))
3328 				continue;
3329 			if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
3330 				continue;
3331 			if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
3332 				continue;
3333 			if (!fib6_info_hold_safe(rt))
3334 				continue;
3335 			rcu_read_unlock();
3336 
3337 			/* if a gateway was specified, only delete the one hop */
3338 			if (cfg->fc_flags & RTF_GATEWAY)
3339 				return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3340 
3341 			return __ip6_del_rt_siblings(rt, cfg);
3342 		}
3343 	}
3344 	rcu_read_unlock();
3345 
3346 	return err;
3347 }
3348 
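/* Process a received ICMPv6 redirect: sanity-check the message and
 * its ND options, update the neighbour cache for the new first hop,
 * and install an RTF_CACHE exception route pointing at it.
 */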
3349 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3350 {
3351 	struct netevent_redirect netevent;
3352 	struct rt6_info *rt, *nrt = NULL;
3353 	struct ndisc_options ndopts;
3354 	struct inet6_dev *in6_dev;
3355 	struct neighbour *neigh;
3356 	struct fib6_info *from;
3357 	struct rd_msg *msg;
3358 	int optlen, on_link;
3359 	u8 *lladdr;
3360 
3361 	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3362 	optlen -= sizeof(*msg);
3363 
3364 	if (optlen < 0) {
3365 		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3366 		return;
3367 	}
3368 
3369 	msg = (struct rd_msg *)icmp6_hdr(skb);
3370 
3371 	if (ipv6_addr_is_multicast(&msg->dest)) {
3372 		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3373 		return;
3374 	}
3375 
3376 	on_link = 0;
3377 	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3378 		on_link = 1;
3379 	} else if (ipv6_addr_type(&msg->target) !=
3380 		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3381 		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3382 		return;
3383 	}
3384 
3385 	in6_dev = __in6_dev_get(skb->dev);
3386 	if (!in6_dev)
3387 		return;
3388 	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3389 		return;
3390 
3391 	/* RFC2461 8.1:
3392 	 *	The IP source address of the Redirect MUST be the same as the current
3393 	 *	first-hop router for the specified ICMP Destination Address.
3394 	 */
3395 
3396 	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3397 		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3398 		return;
3399 	}
3400 
3401 	lladdr = NULL;
3402 	if (ndopts.nd_opts_tgt_lladdr) {
3403 		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3404 					     skb->dev);
3405 		if (!lladdr) {
3406 			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3407 			return;
3408 		}
3409 	}
3410 
3411 	rt = (struct rt6_info *) dst;
3412 	if (rt->rt6i_flags & RTF_REJECT) {
3413 		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3414 		return;
3415 	}
3416 
3417 	/* Redirect received -> path was valid.
3418 	 * Look, redirects are sent only in response to data packets,
3419 	 * so this nexthop is apparently reachable. --ANK
3420 	 */
3421 	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3422 
3423 	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3424 	if (!neigh)
3425 		return;
3426 
3427 	/*
3428 	 *	We have finally decided to accept it.
3429 	 */
3430 
3431 	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3432 		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
3433 		     NEIGH_UPDATE_F_OVERRIDE|
3434 		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3435 				     NEIGH_UPDATE_F_ISROUTER)),
3436 		     NDISC_REDIRECT, &ndopts);
3437 
3438 	rcu_read_lock();
3439 	from = rcu_dereference(rt->from);
3440 	/* This fib6_info_hold() is safe here because we hold a reference
3441 	 * to rt, and rt already holds a reference to the fib6_info.
3442 	 */
3443 	fib6_info_hold(from);
3444 	rcu_read_unlock();
3445 
3446 	nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
3447 	if (!nrt)
3448 		goto out;
3449 
3450 	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3451 	if (on_link)
3452 		nrt->rt6i_flags &= ~RTF_GATEWAY;
3453 
3454 	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3455 
3456 	/* No need to remove rt from the exception table if rt is
3457 	 * a cached route, because rt6_insert_exception()
3458 	 * takes care of it.
3459 	 */
3460 	if (rt6_insert_exception(nrt, from)) {
3461 		dst_release_immediate(&nrt->dst);
3462 		goto out;
3463 	}
3464 
3465 	netevent.old = &rt->dst;
3466 	netevent.new = &nrt->dst;
3467 	netevent.daddr = &msg->dest;
3468 	netevent.neigh = neigh;
3469 	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3470 
3471 out:
3472 	fib6_info_release(from);
3473 	neigh_release(neigh);
3474 }
3475 
3476 #ifdef CONFIG_IPV6_ROUTE_INFO
3477 static struct fib6_info *rt6_get_route_info(struct net *net,
3478 					   const struct in6_addr *prefix, int prefixlen,
3479 					   const struct in6_addr *gwaddr,
3480 					   struct net_device *dev)
3481 {
3482 	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3483 	int ifindex = dev->ifindex;
3484 	struct fib6_node *fn;
3485 	struct fib6_info *rt = NULL;
3486 	struct fib6_table *table;
3487 
3488 	table = fib6_get_table(net, tb_id);
3489 	if (!table)
3490 		return NULL;
3491 
3492 	rcu_read_lock();
3493 	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3494 	if (!fn)
3495 		goto out;
3496 
3497 	for_each_fib6_node_rt_rcu(fn) {
3498 		if (rt->fib6_nh.fib_nh_dev->ifindex != ifindex)
3499 			continue;
3500 		if (!(rt->fib6_flags & RTF_ROUTEINFO) ||
3501 		    !rt->fib6_nh.fib_nh_gw_family)
3502 			continue;
3503 		if (!ipv6_addr_equal(&rt->fib6_nh.fib_nh_gw6, gwaddr))
3504 			continue;
3505 		if (!fib6_info_hold_safe(rt))
3506 			continue;
3507 		break;
3508 	}
3509 out:
3510 	rcu_read_unlock();
3511 	return rt;
3512 }
3513 
3514 static struct fib6_info *rt6_add_route_info(struct net *net,
3515 					   const struct in6_addr *prefix, int prefixlen,
3516 					   const struct in6_addr *gwaddr,
3517 					   struct net_device *dev,
3518 					   unsigned int pref)
3519 {
3520 	struct fib6_config cfg = {
3521 		.fc_metric	= IP6_RT_PRIO_USER,
3522 		.fc_ifindex	= dev->ifindex,
3523 		.fc_dst_len	= prefixlen,
3524 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3525 				  RTF_UP | RTF_PREF(pref),
3526 		.fc_protocol = RTPROT_RA,
3527 		.fc_type = RTN_UNICAST,
3528 		.fc_nlinfo.portid = 0,
3529 		.fc_nlinfo.nlh = NULL,
3530 		.fc_nlinfo.nl_net = net,
3531 	};
3532 
3533 	cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3534 	cfg.fc_dst = *prefix;
3535 	cfg.fc_gateway = *gwaddr;
3536 
3537 	/* We should treat it as a default route if prefix length is 0. */
3538 	if (!prefixlen)
3539 		cfg.fc_flags |= RTF_DEFAULT;
3540 
3541 	ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3542 
3543 	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3544 }
3545 #endif
3546 
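/* Find the RA-learned default route through 'dev' whose gateway is
 * 'addr'; such routes carry both RTF_ADDRCONF and RTF_DEFAULT.
 */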
3547 struct fib6_info *rt6_get_dflt_router(struct net *net,
3548 				     const struct in6_addr *addr,
3549 				     struct net_device *dev)
3550 {
3551 	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3552 	struct fib6_info *rt;
3553 	struct fib6_table *table;
3554 
3555 	table = fib6_get_table(net, tb_id);
3556 	if (!table)
3557 		return NULL;
3558 
3559 	rcu_read_lock();
3560 	for_each_fib6_node_rt_rcu(&table->tb6_root) {
3561 		struct fib6_nh *nh = &rt->fib6_nh;
3562 
3563 		if (dev == nh->fib_nh_dev &&
3564 		    ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3565 		    ipv6_addr_equal(&nh->fib_nh_gw6, addr))
3566 			break;
3567 	}
3568 	if (rt && !fib6_info_hold_safe(rt))
3569 		rt = NULL;
3570 	rcu_read_unlock();
3571 	return rt;
3572 }
3573 
3574 struct fib6_info *rt6_add_dflt_router(struct net *net,
3575 				     const struct in6_addr *gwaddr,
3576 				     struct net_device *dev,
3577 				     unsigned int pref)
3578 {
3579 	struct fib6_config cfg = {
3580 		.fc_table	= l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3581 		.fc_metric	= IP6_RT_PRIO_USER,
3582 		.fc_ifindex	= dev->ifindex,
3583 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3584 				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3585 		.fc_protocol = RTPROT_RA,
3586 		.fc_type = RTN_UNICAST,
3587 		.fc_nlinfo.portid = 0,
3588 		.fc_nlinfo.nlh = NULL,
3589 		.fc_nlinfo.nl_net = net,
3590 	};
3591 
3592 	cfg.fc_gateway = *gwaddr;
3593 
3594 	if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3595 		struct fib6_table *table;
3596 
3597 		table = fib6_get_table(dev_net(dev), cfg.fc_table);
3598 		if (table)
3599 			table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3600 	}
3601 
3602 	return rt6_get_dflt_router(net, gwaddr, dev);
3603 }
3604 
3605 static void __rt6_purge_dflt_routers(struct net *net,
3606 				     struct fib6_table *table)
3607 {
3608 	struct fib6_info *rt;
3609 
3610 restart:
3611 	rcu_read_lock();
3612 	for_each_fib6_node_rt_rcu(&table->tb6_root) {
3613 		struct net_device *dev = fib6_info_nh_dev(rt);
3614 		struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;
3615 
3616 		if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3617 		    (!idev || idev->cnf.accept_ra != 2) &&
3618 		    fib6_info_hold_safe(rt)) {
3619 			rcu_read_unlock();
3620 			ip6_del_rt(net, rt);
3621 			goto restart;
3622 		}
3623 	}
3624 	rcu_read_unlock();
3625 
3626 	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3627 }
3628 
3629 void rt6_purge_dflt_routers(struct net *net)
3630 {
3631 	struct fib6_table *table;
3632 	struct hlist_head *head;
3633 	unsigned int h;
3634 
3635 	rcu_read_lock();
3636 
3637 	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3638 		head = &net->ipv6.fib_table_hash[h];
3639 		hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3640 			if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3641 				__rt6_purge_dflt_routers(net, table);
3642 		}
3643 	}
3644 
3645 	rcu_read_unlock();
3646 }
3647 
3648 static void rtmsg_to_fib6_config(struct net *net,
3649 				 struct in6_rtmsg *rtmsg,
3650 				 struct fib6_config *cfg)
3651 {
3652 	*cfg = (struct fib6_config){
3653 		.fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3654 			 : RT6_TABLE_MAIN,
3655 		.fc_ifindex = rtmsg->rtmsg_ifindex,
3656 		.fc_metric = rtmsg->rtmsg_metric ? : IP6_RT_PRIO_USER,
3657 		.fc_expires = rtmsg->rtmsg_info,
3658 		.fc_dst_len = rtmsg->rtmsg_dst_len,
3659 		.fc_src_len = rtmsg->rtmsg_src_len,
3660 		.fc_flags = rtmsg->rtmsg_flags,
3661 		.fc_type = rtmsg->rtmsg_type,
3662 
3663 		.fc_nlinfo.nl_net = net,
3664 
3665 		.fc_dst = rtmsg->rtmsg_dst,
3666 		.fc_src = rtmsg->rtmsg_src,
3667 		.fc_gateway = rtmsg->rtmsg_gateway,
3668 	};
3669 }
3670 
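/* Legacy ioctl interface for SIOCADDRT/SIOCDELRT (used e.g. by the
 * old "route -A inet6" tool); modern userspace uses rtnetlink
 * instead.
 */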
3671 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3672 {
3673 	struct fib6_config cfg;
3674 	struct in6_rtmsg rtmsg;
3675 	int err;
3676 
3677 	switch (cmd) {
3678 	case SIOCADDRT:		/* Add a route */
3679 	case SIOCDELRT:		/* Delete a route */
3680 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3681 			return -EPERM;
3682 		err = copy_from_user(&rtmsg, arg,
3683 				     sizeof(struct in6_rtmsg));
3684 		if (err)
3685 			return -EFAULT;
3686 
3687 		rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3688 
3689 		rtnl_lock();
3690 		switch (cmd) {
3691 		case SIOCADDRT:
3692 			err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3693 			break;
3694 		case SIOCDELRT:
3695 			err = ip6_route_del(&cfg, NULL);
3696 			break;
3697 		default:
3698 			err = -EINVAL;
3699 		}
3700 		rtnl_unlock();
3701 
3702 		return err;
3703 	}
3704 
3705 	return -EINVAL;
3706 }
3707 
3708 /*
3709  *	Drop the packet on the floor
3710  */
3711 
3712 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3713 {
3714 	int type;
3715 	struct dst_entry *dst = skb_dst(skb);
3716 	switch (ipstats_mib_noroutes) {
3717 	case IPSTATS_MIB_INNOROUTES:
3718 		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3719 		if (type == IPV6_ADDR_ANY) {
3720 			IP6_INC_STATS(dev_net(dst->dev),
3721 				      __in6_dev_get_safely(skb->dev),
3722 				      IPSTATS_MIB_INADDRERRORS);
3723 			break;
3724 		}
3725 		/* FALLTHROUGH */
3726 	case IPSTATS_MIB_OUTNOROUTES:
3727 		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3728 			      ipstats_mib_noroutes);
3729 		break;
3730 	}
3731 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3732 	kfree_skb(skb);
3733 	return 0;
3734 }
3735 
3736 static int ip6_pkt_discard(struct sk_buff *skb)
3737 {
3738 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3739 }
3740 
3741 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3742 {
3743 	skb->dev = skb_dst(skb)->dev;
3744 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3745 }
3746 
3747 static int ip6_pkt_prohibit(struct sk_buff *skb)
3748 {
3749 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3750 }
3751 
3752 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3753 {
3754 	skb->dev = skb_dst(skb)->dev;
3755 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3756 }
3757 
3758 /*
3759  *	Allocate a dst for local (unicast / anycast) address.
3760  */
3761 
3762 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3763 				     struct inet6_dev *idev,
3764 				     const struct in6_addr *addr,
3765 				     bool anycast, gfp_t gfp_flags)
3766 {
3767 	struct fib6_config cfg = {
3768 		.fc_table = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL,
3769 		.fc_ifindex = idev->dev->ifindex,
3770 		.fc_flags = RTF_UP | RTF_ADDRCONF | RTF_NONEXTHOP,
3771 		.fc_dst = *addr,
3772 		.fc_dst_len = 128,
3773 		.fc_protocol = RTPROT_KERNEL,
3774 		.fc_nlinfo.nl_net = net,
3775 		.fc_ignore_dev_down = true,
3776 	};
3777 
3778 	if (anycast) {
3779 		cfg.fc_type = RTN_ANYCAST;
3780 		cfg.fc_flags |= RTF_ANYCAST;
3781 	} else {
3782 		cfg.fc_type = RTN_LOCAL;
3783 		cfg.fc_flags |= RTF_LOCAL;
3784 	}
3785 
3786 	return ip6_route_info_create(&cfg, gfp_flags, NULL);
3787 }
3788 
3789 /* remove deleted ip from prefsrc entries */
3790 struct arg_dev_net_ip {
3791 	struct net_device *dev;
3792 	struct net *net;
3793 	struct in6_addr *addr;
3794 };
3795 
3796 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3797 {
3798 	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3799 	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3800 	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3801 
3802 	if (((void *)rt->fib6_nh.fib_nh_dev == dev || !dev) &&
3803 	    rt != net->ipv6.fib6_null_entry &&
3804 	    ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3805 		spin_lock_bh(&rt6_exception_lock);
3806 		/* remove prefsrc entry */
3807 		rt->fib6_prefsrc.plen = 0;
3808 		spin_unlock_bh(&rt6_exception_lock);
3809 	}
3810 	return 0;
3811 }
3812 
3813 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3814 {
3815 	struct net *net = dev_net(ifp->idev->dev);
3816 	struct arg_dev_net_ip adni = {
3817 		.dev = ifp->idev->dev,
3818 		.net = net,
3819 		.addr = &ifp->addr,
3820 	};
3821 	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3822 }
3823 
3824 #define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT)
3825 
3826 /* Remove routers and update dst entries when a gateway turns into a host. */
3827 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3828 {
3829 	struct in6_addr *gateway = (struct in6_addr *)arg;
3830 
3831 	if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3832 	    rt->fib6_nh.fib_nh_gw_family &&
3833 	    ipv6_addr_equal(gateway, &rt->fib6_nh.fib_nh_gw6)) {
3834 		return -1;
3835 	}
3836 
3837 	/* Further clean up cached routes in the exception table.
3838 	 * This is needed because a cached route may have a different
3839 	 * gateway than its 'parent' in the case of an IP redirect.
3840 	 */
3841 	rt6_exceptions_clean_tohost(rt, gateway);
3842 
3843 	return 0;
3844 }
3845 
3846 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3847 {
3848 	fib6_clean_all(net, fib6_clean_tohost, gateway);
3849 }
3850 
3851 struct arg_netdev_event {
3852 	const struct net_device *dev;
3853 	union {
3854 		unsigned int nh_flags;
3855 		unsigned long event;
3856 	};
3857 };
3858 
3859 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
3860 {
3861 	struct fib6_info *iter;
3862 	struct fib6_node *fn;
3863 
3864 	fn = rcu_dereference_protected(rt->fib6_node,
3865 			lockdep_is_held(&rt->fib6_table->tb6_lock));
3866 	iter = rcu_dereference_protected(fn->leaf,
3867 			lockdep_is_held(&rt->fib6_table->tb6_lock));
3868 	while (iter) {
3869 		if (iter->fib6_metric == rt->fib6_metric &&
3870 		    rt6_qualify_for_ecmp(iter))
3871 			return iter;
3872 		iter = rcu_dereference_protected(iter->fib6_next,
3873 				lockdep_is_held(&rt->fib6_table->tb6_lock));
3874 	}
3875 
3876 	return NULL;
3877 }
3878 
3879 static bool rt6_is_dead(const struct fib6_info *rt)
3880 {
3881 	if (rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD ||
3882 	    (rt->fib6_nh.fib_nh_flags & RTNH_F_LINKDOWN &&
3883 	     ip6_ignore_linkdown(rt->fib6_nh.fib_nh_dev)))
3884 		return true;
3885 
3886 	return false;
3887 }
3888 
3889 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3890 {
3891 	struct fib6_info *iter;
3892 	int total = 0;
3893 
3894 	if (!rt6_is_dead(rt))
3895 		total += rt->fib6_nh.fib_nh_weight;
3896 
3897 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3898 		if (!rt6_is_dead(iter))
3899 			total += iter->fib6_nh.fib_nh_weight;
3900 	}
3901 
3902 	return total;
3903 }
3904 
3905 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
3906 {
3907 	int upper_bound = -1;
3908 
3909 	if (!rt6_is_dead(rt)) {
3910 		*weight += rt->fib6_nh.fib_nh_weight;
3911 		upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3912 						    total) - 1;
3913 	}
3914 	atomic_set(&rt->fib6_nh.fib_nh_upper_bound, upper_bound);
3915 }
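/* Worked example of the computation above (derived from the code, with
 * made-up weights): three live nexthops of weights 2, 1 and 1 give
 * total = 4, and the cumulative weights 2, 3, 4 yield upper bounds
 *	(2 << 31) / 4 - 1 = 0x3fffffff
 *	(3 << 31) / 4 - 1 = 0x5fffffff
 *	(4 << 31) / 4 - 1 = 0x7fffffff (INT_MAX)
 * so a 31-bit flow hash in [0, 0x3fffffff] selects the first nexthop,
 * and the last bound always covers the entire hash range.  A dead
 * nexthop keeps the -1 bound and is never selected.
 */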
3916 
3917 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3918 {
3919 	struct fib6_info *iter;
3920 	int weight = 0;
3921 
3922 	rt6_upper_bound_set(rt, &weight, total);
3923 
3924 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3925 		rt6_upper_bound_set(iter, &weight, total);
3926 }
3927 
3928 void rt6_multipath_rebalance(struct fib6_info *rt)
3929 {
3930 	struct fib6_info *first;
3931 	int total;
3932 
3933 	/* If the entire multipath route was marked for flushing,
3934 	 * there is no need to rebalance upon the removal of every
3935 	 * sibling route.
3936 	 */
3937 	if (!rt->fib6_nsiblings || rt->should_flush)
3938 		return;
3939 
3940 	/* During lookup routes are evaluated in order, so we need to
3941 	 * make sure upper bounds are assigned from the first sibling
3942 	 * onwards.
3943 	 */
3944 	first = rt6_multipath_first_sibling(rt);
3945 	if (WARN_ON_ONCE(!first))
3946 		return;
3947 
3948 	total = rt6_multipath_total_weight(first);
3949 	rt6_multipath_upper_bound_set(first, total);
3950 }
3951 
3952 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
3953 {
3954 	const struct arg_netdev_event *arg = p_arg;
3955 	struct net *net = dev_net(arg->dev);
3956 
3957 	if (rt != net->ipv6.fib6_null_entry &&
3958 	    rt->fib6_nh.fib_nh_dev == arg->dev) {
3959 		rt->fib6_nh.fib_nh_flags &= ~arg->nh_flags;
3960 		fib6_update_sernum_upto_root(net, rt);
3961 		rt6_multipath_rebalance(rt);
3962 	}
3963 
3964 	return 0;
3965 }
3966 
3967 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
3968 {
3969 	struct arg_netdev_event arg = {
3970 		.dev = dev,
3971 		{
3972 			.nh_flags = nh_flags,
3973 		},
3974 	};
3975 
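	/* A device whose RTNH_F_DEAD flag is being cleared and whose
	 * carrier is up can shed RTNH_F_LINKDOWN in the same pass.
	 */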
3976 	if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
3977 		arg.nh_flags |= RTNH_F_LINKDOWN;
3978 
3979 	fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
3980 }
3981 
3982 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
3983 				   const struct net_device *dev)
3984 {
3985 	struct fib6_info *iter;
3986 
3987 	if (rt->fib6_nh.fib_nh_dev == dev)
3988 		return true;
3989 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3990 		if (iter->fib6_nh.fib_nh_dev == dev)
3991 			return true;
3992 
3993 	return false;
3994 }
3995 
3996 static void rt6_multipath_flush(struct fib6_info *rt)
3997 {
3998 	struct fib6_info *iter;
3999 
4000 	rt->should_flush = 1;
4001 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4002 		iter->should_flush = 1;
4003 }
4004 
4005 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
4006 					     const struct net_device *down_dev)
4007 {
4008 	struct fib6_info *iter;
4009 	unsigned int dead = 0;
4010 
4011 	if (rt->fib6_nh.fib_nh_dev == down_dev ||
4012 	    rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
4013 		dead++;
4014 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4015 		if (iter->fib6_nh.fib_nh_dev == down_dev ||
4016 		    iter->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
4017 			dead++;
4018 
4019 	return dead;
4020 }
4021 
4022 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
4023 				       const struct net_device *dev,
4024 				       unsigned int nh_flags)
4025 {
4026 	struct fib6_info *iter;
4027 
4028 	if (rt->fib6_nh.fib_nh_dev == dev)
4029 		rt->fib6_nh.fib_nh_flags |= nh_flags;
4030 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4031 		if (iter->fib6_nh.fib_nh_dev == dev)
4032 			iter->fib6_nh.fib_nh_flags |= nh_flags;
4033 }
4034 
4035 /* Called with the write lock held for the table containing rt. */
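/* Return values below, as interpreted by the fib6 walker: 0 keeps the
 * route, -1 deletes it, and -2 (NETDEV_DOWN on a multipath route whose
 * nexthops were only marked dead) skips the remaining siblings.
 */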
4036 static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
4037 {
4038 	const struct arg_netdev_event *arg = p_arg;
4039 	const struct net_device *dev = arg->dev;
4040 	struct net *net = dev_net(dev);
4041 
4042 	if (rt == net->ipv6.fib6_null_entry)
4043 		return 0;
4044 
4045 	switch (arg->event) {
4046 	case NETDEV_UNREGISTER:
4047 		return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0;
4048 	case NETDEV_DOWN:
4049 		if (rt->should_flush)
4050 			return -1;
4051 		if (!rt->fib6_nsiblings)
4052 			return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0;
4053 		if (rt6_multipath_uses_dev(rt, dev)) {
4054 			unsigned int count;
4055 
4056 			count = rt6_multipath_dead_count(rt, dev);
4057 			if (rt->fib6_nsiblings + 1 == count) {
4058 				rt6_multipath_flush(rt);
4059 				return -1;
4060 			}
4061 			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
4062 						   RTNH_F_LINKDOWN);
4063 			fib6_update_sernum(net, rt);
4064 			rt6_multipath_rebalance(rt);
4065 		}
4066 		return -2;
4067 	case NETDEV_CHANGE:
4068 		if (rt->fib6_nh.fib_nh_dev != dev ||
4069 		    rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
4070 			break;
4071 		rt->fib6_nh.fib_nh_flags |= RTNH_F_LINKDOWN;
4072 		rt6_multipath_rebalance(rt);
4073 		break;
4074 	}
4075 
4076 	return 0;
4077 }
4078 
4079 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
4080 {
4081 	struct arg_netdev_event arg = {
4082 		.dev = dev,
4083 		{
4084 			.event = event,
4085 		},
4086 	};
4087 	struct net *net = dev_net(dev);
4088 
4089 	if (net->ipv6.sysctl.skip_notify_on_dev_down)
4090 		fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
4091 	else
4092 		fib6_clean_all(net, fib6_ifdown, &arg);
4093 }
4094 
4095 void rt6_disable_ip(struct net_device *dev, unsigned long event)
4096 {
4097 	rt6_sync_down_dev(dev, event);
4098 	rt6_uncached_list_flush_dev(dev_net(dev), dev);
4099 	neigh_ifdown(&nd_tbl, dev);
4100 }
4101 
4102 struct rt6_mtu_change_arg {
4103 	struct net_device *dev;
4104 	unsigned int mtu;
4105 };
4106 
4107 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
4108 {
4109 	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
4110 	struct inet6_dev *idev;
4111 
4112 	/* In IPv6, PMTU discovery is not optional, so the
4113 	 * RTAX_MTU lock cannot disable it.
4114 	 * We still use this lock to block changes
4115 	 * caused by addrconf/ndisc.
4116 	 */
4117 
4118 	idev = __in6_dev_get(arg->dev);
4119 	if (!idev)
4120 		return 0;
4121 
4122 	/* If the MTU is increased administratively, there is no way to
4123 	 * discover the IPv6 PMTU increase, so the PMTU must be updated
4124 	 * here.  RFC 1981 does not cover administrative MTU increases,
4125 	 * so updating on an increase is a MUST (e.g. for jumbo frames).
4126 	 */
4127 	if (rt->fib6_nh.fib_nh_dev == arg->dev &&
4128 	    !fib6_metric_locked(rt, RTAX_MTU)) {
4129 		u32 mtu = rt->fib6_pmtu;
4130 
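		/* Clamp down when the stored PMTU is not below the new MTU,
		 * or raise it when it was merely tracking the interface MTU
		 * (mtu6); a smaller, genuinely learned PMTU is preserved.
		 */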
4131 		if (mtu >= arg->mtu ||
4132 		    (mtu < arg->mtu && mtu == idev->cnf.mtu6))
4133 			fib6_metric_set(rt, RTAX_MTU, arg->mtu);
4134 
4135 		spin_lock_bh(&rt6_exception_lock);
4136 		rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
4137 		spin_unlock_bh(&rt6_exception_lock);
4138 	}
4139 	return 0;
4140 }
4141 
4142 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4143 {
4144 	struct rt6_mtu_change_arg arg = {
4145 		.dev = dev,
4146 		.mtu = mtu,
4147 	};
4148 
4149 	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
4150 }
4151 
4152 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
4153 	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
4154 	[RTA_PREFSRC]		= { .len = sizeof(struct in6_addr) },
4155 	[RTA_OIF]               = { .type = NLA_U32 },
4156 	[RTA_IIF]		= { .type = NLA_U32 },
4157 	[RTA_PRIORITY]          = { .type = NLA_U32 },
4158 	[RTA_METRICS]           = { .type = NLA_NESTED },
4159 	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
4160 	[RTA_PREF]              = { .type = NLA_U8 },
4161 	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
4162 	[RTA_ENCAP]		= { .type = NLA_NESTED },
4163 	[RTA_EXPIRES]		= { .type = NLA_U32 },
4164 	[RTA_UID]		= { .type = NLA_U32 },
4165 	[RTA_MARK]		= { .type = NLA_U32 },
4166 	[RTA_TABLE]		= { .type = NLA_U32 },
4167 	[RTA_IP_PROTO]		= { .type = NLA_U8 },
4168 	[RTA_SPORT]		= { .type = NLA_U16 },
4169 	[RTA_DPORT]		= { .type = NLA_U16 },
4170 };
4171 
4172 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
4173 			      struct fib6_config *cfg,
4174 			      struct netlink_ext_ack *extack)
4175 {
4176 	struct rtmsg *rtm;
4177 	struct nlattr *tb[RTA_MAX+1];
4178 	unsigned int pref;
4179 	int err;
4180 
4181 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4182 			  extack);
4183 	if (err < 0)
4184 		goto errout;
4185 
4186 	err = -EINVAL;
4187 	rtm = nlmsg_data(nlh);
4188 
4189 	*cfg = (struct fib6_config){
4190 		.fc_table = rtm->rtm_table,
4191 		.fc_dst_len = rtm->rtm_dst_len,
4192 		.fc_src_len = rtm->rtm_src_len,
4193 		.fc_flags = RTF_UP,
4194 		.fc_protocol = rtm->rtm_protocol,
4195 		.fc_type = rtm->rtm_type,
4196 
4197 		.fc_nlinfo.portid = NETLINK_CB(skb).portid,
4198 		.fc_nlinfo.nlh = nlh,
4199 		.fc_nlinfo.nl_net = sock_net(skb->sk),
4200 	};
4201 
4202 	if (rtm->rtm_type == RTN_UNREACHABLE ||
4203 	    rtm->rtm_type == RTN_BLACKHOLE ||
4204 	    rtm->rtm_type == RTN_PROHIBIT ||
4205 	    rtm->rtm_type == RTN_THROW)
4206 		cfg->fc_flags |= RTF_REJECT;
4207 
4208 	if (rtm->rtm_type == RTN_LOCAL)
4209 		cfg->fc_flags |= RTF_LOCAL;
4210 
4211 	if (rtm->rtm_flags & RTM_F_CLONED)
4212 		cfg->fc_flags |= RTF_CACHE;
4213 
4214 	cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4215 
4216 	if (tb[RTA_GATEWAY]) {
4217 		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4218 		cfg->fc_flags |= RTF_GATEWAY;
4219 	}
4220 	if (tb[RTA_VIA]) {
4221 		NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute");
4222 		goto errout;
4223 	}
4224 
4225 	if (tb[RTA_DST]) {
4226 		int plen = (rtm->rtm_dst_len + 7) >> 3;
4227 
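		/* Prefix bits rounded up to whole bytes: a /49, for
		 * example, needs (49 + 7) >> 3 == 7 bytes of address data.
		 */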
4228 		if (nla_len(tb[RTA_DST]) < plen)
4229 			goto errout;
4230 
4231 		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
4232 	}
4233 
4234 	if (tb[RTA_SRC]) {
4235 		int plen = (rtm->rtm_src_len + 7) >> 3;
4236 
4237 		if (nla_len(tb[RTA_SRC]) < plen)
4238 			goto errout;
4239 
4240 		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
4241 	}
4242 
4243 	if (tb[RTA_PREFSRC])
4244 		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4245 
4246 	if (tb[RTA_OIF])
4247 		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4248 
4249 	if (tb[RTA_PRIORITY])
4250 		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4251 
4252 	if (tb[RTA_METRICS]) {
4253 		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4254 		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
4255 	}
4256 
4257 	if (tb[RTA_TABLE])
4258 		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4259 
4260 	if (tb[RTA_MULTIPATH]) {
4261 		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4262 		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4263 
4264 		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4265 						     cfg->fc_mp_len, extack);
4266 		if (err < 0)
4267 			goto errout;
4268 	}
4269 
4270 	if (tb[RTA_PREF]) {
4271 		pref = nla_get_u8(tb[RTA_PREF]);
4272 		if (pref != ICMPV6_ROUTER_PREF_LOW &&
4273 		    pref != ICMPV6_ROUTER_PREF_HIGH)
4274 			pref = ICMPV6_ROUTER_PREF_MEDIUM;
4275 		cfg->fc_flags |= RTF_PREF(pref);
4276 	}
4277 
4278 	if (tb[RTA_ENCAP])
4279 		cfg->fc_encap = tb[RTA_ENCAP];
4280 
4281 	if (tb[RTA_ENCAP_TYPE]) {
4282 		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4283 
4284 		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4285 		if (err < 0)
4286 			goto errout;
4287 	}
4288 
4289 	if (tb[RTA_EXPIRES]) {
4290 		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4291 
4292 		if (addrconf_finite_timeout(timeout)) {
4293 			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4294 			cfg->fc_flags |= RTF_EXPIRES;
4295 		}
4296 	}
4297 
4298 	err = 0;
4299 errout:
4300 	return err;
4301 }
4302 
4303 struct rt6_nh {
4304 	struct fib6_info *fib6_info;
4305 	struct fib6_config r_cfg;
4306 	struct list_head next;
4307 };
4308 
4309 static int ip6_route_info_append(struct net *net,
4310 				 struct list_head *rt6_nh_list,
4311 				 struct fib6_info *rt,
4312 				 struct fib6_config *r_cfg)
4313 {
4314 	struct rt6_nh *nh;
4315 	int err = -EEXIST;
4316 
4317 	list_for_each_entry(nh, rt6_nh_list, next) {
4318 		/* check if fib6_info already exists */
4319 		if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4320 			return err;
4321 	}
4322 
4323 	nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4324 	if (!nh)
4325 		return -ENOMEM;
4326 	nh->fib6_info = rt;
4327 	memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4328 	list_add_tail(&nh->next, rt6_nh_list);
4329 
4330 	return 0;
4331 }
4332 
4333 static void ip6_route_mpath_notify(struct fib6_info *rt,
4334 				   struct fib6_info *rt_last,
4335 				   struct nl_info *info,
4336 				   __u16 nlflags)
4337 {
4338 	/* If this is an APPEND route, then rt points to the first route
4339 	 * inserted and rt_last points to the last route inserted. Userspace
4340 	 * wants a consistent dump of the route which starts at the first
4341 	 * nexthop. Since sibling routes are always added at the end of
4342 	 * the list, find the first sibling of the last route appended.
4343 	 */
4344 	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4345 		rt = list_first_entry(&rt_last->fib6_siblings,
4346 				      struct fib6_info,
4347 				      fib6_siblings);
4348 	}
4349 
4350 	if (rt)
4351 		inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4352 }
4353 
4354 static int ip6_route_multipath_add(struct fib6_config *cfg,
4355 				   struct netlink_ext_ack *extack)
4356 {
4357 	struct fib6_info *rt_notif = NULL, *rt_last = NULL;
4358 	struct nl_info *info = &cfg->fc_nlinfo;
4359 	struct fib6_config r_cfg;
4360 	struct rtnexthop *rtnh;
4361 	struct fib6_info *rt;
4362 	struct rt6_nh *err_nh;
4363 	struct rt6_nh *nh, *nh_safe;
4364 	__u16 nlflags;
4365 	int remaining;
4366 	int attrlen;
4367 	int err = 1;
4368 	int nhn = 0;
4369 	int replace = (cfg->fc_nlinfo.nlh &&
4370 		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4371 	LIST_HEAD(rt6_nh_list);
4372 
4373 	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4374 	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4375 		nlflags |= NLM_F_APPEND;
4376 
4377 	remaining = cfg->fc_mp_len;
4378 	rtnh = (struct rtnexthop *)cfg->fc_mp;
4379 
4380 	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
4381 	 * fib6_info structs per nexthop
4382 	 */
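	/* Illustrative request (addresses and devices are examples only):
	 *	ip -6 route add 2001:db8::/64 \
	 *		nexthop via fe80::1 dev eth0 \
	 *		nexthop via fe80::2 dev eth1
	 * arrives here with cfg->fc_mp pointing at two struct rtnexthop
	 * entries, each optionally followed by RTA_GATEWAY/RTA_ENCAP
	 * attributes that the loop below parses into per-nexthop configs.
	 */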
4383 	while (rtnh_ok(rtnh, remaining)) {
4384 		memcpy(&r_cfg, cfg, sizeof(*cfg));
4385 		if (rtnh->rtnh_ifindex)
4386 			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4387 
4388 		attrlen = rtnh_attrlen(rtnh);
4389 		if (attrlen > 0) {
4390 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4391 
4392 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4393 			if (nla) {
4394 				r_cfg.fc_gateway = nla_get_in6_addr(nla);
4395 				r_cfg.fc_flags |= RTF_GATEWAY;
4396 			}
4397 			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4398 			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4399 			if (nla)
4400 				r_cfg.fc_encap_type = nla_get_u16(nla);
4401 		}
4402 
4403 		r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4404 		rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
4405 		if (IS_ERR(rt)) {
4406 			err = PTR_ERR(rt);
4407 			rt = NULL;
4408 			goto cleanup;
4409 		}
4410 		if (!rt6_qualify_for_ecmp(rt)) {
4411 			err = -EINVAL;
4412 			NL_SET_ERR_MSG(extack,
4413 				       "Device-only routes cannot be added for IPv6 using the multipath API.");
4414 			fib6_info_release(rt);
4415 			goto cleanup;
4416 		}
4417 
4418 		rt->fib6_nh.fib_nh_weight = rtnh->rtnh_hops + 1;
4419 
4420 		err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
4421 					    rt, &r_cfg);
4422 		if (err) {
4423 			fib6_info_release(rt);
4424 			goto cleanup;
4425 		}
4426 
4427 		rtnh = rtnh_next(rtnh, &remaining);
4428 	}
4429 
4430 	/* For add and replace, send one notification with all nexthops.
4431 	 * Skip the notification in fib6_add_rt2node() and send one with
4432 	 * the full route when done.
4433 	 */
4434 	info->skip_notify = 1;
4435 
4436 	err_nh = NULL;
4437 	list_for_each_entry(nh, &rt6_nh_list, next) {
4438 		err = __ip6_ins_rt(nh->fib6_info, info, extack);
4439 		fib6_info_release(nh->fib6_info);
4440 
4441 		if (!err) {
4442 			/* save reference to last route successfully inserted */
4443 			rt_last = nh->fib6_info;
4444 
4445 			/* save reference to first route for notification */
4446 			if (!rt_notif)
4447 				rt_notif = nh->fib6_info;
4448 		}
4449 
4450 		/* nh->fib6_info is used or freed at this point, reset to NULL */
4451 		nh->fib6_info = NULL;
4452 		if (err) {
4453 			if (replace && nhn)
4454 				NL_SET_ERR_MSG_MOD(extack,
4455 						   "multipath route replace failed (check consistency of installed routes)");
4456 			err_nh = nh;
4457 			goto add_errout;
4458 		}
4459 
4460 		/* Because each route is added as if it were a single route,
4461 		 * we remove these flags after the first nexthop: if there is
4462 		 * a collision, we have already failed to add the first
4463 		 * nexthop: fib6_add_rt2node() has rejected it; when replacing,
4464 		 * the old nexthops have been replaced by the first new one,
4465 		 * and the rest should be appended to it.
4466 		 */
4467 		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4468 						     NLM_F_REPLACE);
4469 		nhn++;
4470 	}
4471 
4472 	/* success ... tell user about new route */
4473 	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4474 	goto cleanup;
4475 
4476 add_errout:
4477 	/* Send a notification for the routes that were added, so that
4478 	 * the delete notifications sent by ip6_route_del() are
4479 	 * coherent.
4480 	 */
4481 	if (rt_notif)
4482 		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4483 
4484 	/* Delete routes that were already added */
4485 	list_for_each_entry(nh, &rt6_nh_list, next) {
4486 		if (err_nh == nh)
4487 			break;
4488 		ip6_route_del(&nh->r_cfg, extack);
4489 	}
4490 
4491 cleanup:
4492 	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4493 		if (nh->fib6_info)
4494 			fib6_info_release(nh->fib6_info);
4495 		list_del(&nh->next);
4496 		kfree(nh);
4497 	}
4498 
4499 	return err;
4500 }
4501 
4502 static int ip6_route_multipath_del(struct fib6_config *cfg,
4503 				   struct netlink_ext_ack *extack)
4504 {
4505 	struct fib6_config r_cfg;
4506 	struct rtnexthop *rtnh;
4507 	int remaining;
4508 	int attrlen;
4509 	int err = 1, last_err = 0;
4510 
4511 	remaining = cfg->fc_mp_len;
4512 	rtnh = (struct rtnexthop *)cfg->fc_mp;
4513 
4514 	/* Parse a Multipath Entry */
4515 	while (rtnh_ok(rtnh, remaining)) {
4516 		memcpy(&r_cfg, cfg, sizeof(*cfg));
4517 		if (rtnh->rtnh_ifindex)
4518 			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4519 
4520 		attrlen = rtnh_attrlen(rtnh);
4521 		if (attrlen > 0) {
4522 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4523 
4524 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4525 			if (nla) {
4526 				nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4527 				r_cfg.fc_flags |= RTF_GATEWAY;
4528 			}
4529 		}
4530 		err = ip6_route_del(&r_cfg, extack);
4531 		if (err)
4532 			last_err = err;
4533 
4534 		rtnh = rtnh_next(rtnh, &remaining);
4535 	}
4536 
4537 	return last_err;
4538 }
4539 
4540 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4541 			      struct netlink_ext_ack *extack)
4542 {
4543 	struct fib6_config cfg;
4544 	int err;
4545 
4546 	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4547 	if (err < 0)
4548 		return err;
4549 
4550 	if (cfg.fc_mp)
4551 		return ip6_route_multipath_del(&cfg, extack);
4552 	else {
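		/* No RTA_MULTIPATH attribute: flag the delete so the whole
		 * sibling group of a multipath route is removed, not just
		 * one nexthop.
		 */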
4553 		cfg.fc_delete_all_nh = 1;
4554 		return ip6_route_del(&cfg, extack);
4555 	}
4556 }
4557 
4558 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4559 			      struct netlink_ext_ack *extack)
4560 {
4561 	struct fib6_config cfg;
4562 	int err;
4563 
4564 	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4565 	if (err < 0)
4566 		return err;
4567 
4568 	if (cfg.fc_metric == 0)
4569 		cfg.fc_metric = IP6_RT_PRIO_USER;
4570 
4571 	if (cfg.fc_mp)
4572 		return ip6_route_multipath_add(&cfg, extack);
4573 	else
4574 		return ip6_route_add(&cfg, GFP_KERNEL, extack);
4575 }
4576 
4577 static size_t rt6_nlmsg_size(struct fib6_info *rt)
4578 {
4579 	int nexthop_len = 0;
4580 
4581 	if (rt->fib6_nsiblings) {
4582 		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
4583 			    + NLA_ALIGN(sizeof(struct rtnexthop))
4584 			    + nla_total_size(16) /* RTA_GATEWAY */
4585 			    + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws);
4586 
4587 		nexthop_len *= rt->fib6_nsiblings;
4588 	}
4589 
4590 	return NLMSG_ALIGN(sizeof(struct rtmsg))
4591 	       + nla_total_size(16) /* RTA_SRC */
4592 	       + nla_total_size(16) /* RTA_DST */
4593 	       + nla_total_size(16) /* RTA_GATEWAY */
4594 	       + nla_total_size(16) /* RTA_PREFSRC */
4595 	       + nla_total_size(4) /* RTA_TABLE */
4596 	       + nla_total_size(4) /* RTA_IIF */
4597 	       + nla_total_size(4) /* RTA_OIF */
4598 	       + nla_total_size(4) /* RTA_PRIORITY */
4599 	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4600 	       + nla_total_size(sizeof(struct rta_cacheinfo))
4601 	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4602 	       + nla_total_size(1) /* RTA_PREF */
4603 	       + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws)
4604 	       + nexthop_len;
4605 }
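/* This is a worst-case estimate: every attribute is accounted for whether
 * or not it will actually be emitted, since an undersized notification skb
 * would make rt6_fill_node() fail with -EMSGSIZE (see inet6_rt_notify()).
 */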
4606 
4607 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
4608 			 struct fib6_info *rt, struct dst_entry *dst,
4609 			 struct in6_addr *dest, struct in6_addr *src,
4610 			 int iif, int type, u32 portid, u32 seq,
4611 			 unsigned int flags)
4612 {
4613 	struct rt6_info *rt6 = (struct rt6_info *)dst;
4614 	struct rt6key *rt6_dst, *rt6_src;
4615 	u32 *pmetrics, table, rt6_flags;
4616 	struct nlmsghdr *nlh;
4617 	struct rtmsg *rtm;
4618 	long expires = 0;
4619 
4620 	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4621 	if (!nlh)
4622 		return -EMSGSIZE;
4623 
4624 	if (rt6) {
4625 		rt6_dst = &rt6->rt6i_dst;
4626 		rt6_src = &rt6->rt6i_src;
4627 		rt6_flags = rt6->rt6i_flags;
4628 	} else {
4629 		rt6_dst = &rt->fib6_dst;
4630 		rt6_src = &rt->fib6_src;
4631 		rt6_flags = rt->fib6_flags;
4632 	}
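	/* A non-NULL dst means a specific (possibly cached) rt6_info is
	 * being dumped, so its keys and flags take precedence; otherwise
	 * the FIB entry itself is reported.
	 */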
4633 
4634 	rtm = nlmsg_data(nlh);
4635 	rtm->rtm_family = AF_INET6;
4636 	rtm->rtm_dst_len = rt6_dst->plen;
4637 	rtm->rtm_src_len = rt6_src->plen;
4638 	rtm->rtm_tos = 0;
4639 	if (rt->fib6_table)
4640 		table = rt->fib6_table->tb6_id;
4641 	else
4642 		table = RT6_TABLE_UNSPEC;
4643 	rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT;
4644 	if (nla_put_u32(skb, RTA_TABLE, table))
4645 		goto nla_put_failure;
4646 
4647 	rtm->rtm_type = rt->fib6_type;
4648 	rtm->rtm_flags = 0;
4649 	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4650 	rtm->rtm_protocol = rt->fib6_protocol;
4651 
4652 	if (rt6_flags & RTF_CACHE)
4653 		rtm->rtm_flags |= RTM_F_CLONED;
4654 
4655 	if (dest) {
4656 		if (nla_put_in6_addr(skb, RTA_DST, dest))
4657 			goto nla_put_failure;
4658 		rtm->rtm_dst_len = 128;
4659 	} else if (rtm->rtm_dst_len)
4660 		if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
4661 			goto nla_put_failure;
4662 #ifdef CONFIG_IPV6_SUBTREES
4663 	if (src) {
4664 		if (nla_put_in6_addr(skb, RTA_SRC, src))
4665 			goto nla_put_failure;
4666 		rtm->rtm_src_len = 128;
4667 	} else if (rtm->rtm_src_len &&
4668 		   nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
4669 		goto nla_put_failure;
4670 #endif
4671 	if (iif) {
4672 #ifdef CONFIG_IPV6_MROUTE
4673 		if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
4674 			int err = ip6mr_get_route(net, skb, rtm, portid);
4675 
4676 			if (err == 0)
4677 				return 0;
4678 			if (err < 0)
4679 				goto nla_put_failure;
4680 		} else
4681 #endif
4682 			if (nla_put_u32(skb, RTA_IIF, iif))
4683 				goto nla_put_failure;
4684 	} else if (dest) {
4685 		struct in6_addr saddr_buf;
4686 		if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
4687 		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4688 			goto nla_put_failure;
4689 	}
4690 
4691 	if (rt->fib6_prefsrc.plen) {
4692 		struct in6_addr saddr_buf;
4693 		saddr_buf = rt->fib6_prefsrc.addr;
4694 		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4695 			goto nla_put_failure;
4696 	}
4697 
4698 	pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
4699 	if (rtnetlink_put_metrics(skb, pmetrics) < 0)
4700 		goto nla_put_failure;
4701 
4702 	if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
4703 		goto nla_put_failure;
4704 
4705 	/* For multipath routes, walk the siblings list and add
4706 	 * each as a nexthop within RTA_MULTIPATH.
4707 	 */
4708 	if (rt6) {
4709 		if (rt6_flags & RTF_GATEWAY &&
4710 		    nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
4711 			goto nla_put_failure;
4712 
4713 		if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
4714 			goto nla_put_failure;
4715 	} else if (rt->fib6_nsiblings) {
4716 		struct fib6_info *sibling, *next_sibling;
4717 		struct nlattr *mp;
4718 
4719 		mp = nla_nest_start(skb, RTA_MULTIPATH);
4720 		if (!mp)
4721 			goto nla_put_failure;
4722 
4723 		if (fib_add_nexthop(skb, &rt->fib6_nh.nh_common,
4724 				    rt->fib6_nh.fib_nh_weight) < 0)
4725 			goto nla_put_failure;
4726 
4727 		list_for_each_entry_safe(sibling, next_sibling,
4728 					 &rt->fib6_siblings, fib6_siblings) {
4729 			if (fib_add_nexthop(skb, &sibling->fib6_nh.nh_common,
4730 					    sibling->fib6_nh.fib_nh_weight) < 0)
4731 				goto nla_put_failure;
4732 		}
4733 
4734 		nla_nest_end(skb, mp);
4735 	} else {
4736 		if (fib_nexthop_info(skb, &rt->fib6_nh.nh_common,
4737 				     &rtm->rtm_flags, false) < 0)
4738 			goto nla_put_failure;
4739 	}
4740 
4741 	if (rt6_flags & RTF_EXPIRES) {
4742 		expires = dst ? dst->expires : rt->expires;
4743 		expires -= jiffies;
4744 	}
4745 
4746 	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
4747 		goto nla_put_failure;
4748 
4749 	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
4750 		goto nla_put_failure;
4751 
4753 	nlmsg_end(skb, nlh);
4754 	return 0;
4755 
4756 nla_put_failure:
4757 	nlmsg_cancel(skb, nlh);
4758 	return -EMSGSIZE;
4759 }
4760 
4761 static bool fib6_info_uses_dev(const struct fib6_info *f6i,
4762 			       const struct net_device *dev)
4763 {
4764 	if (f6i->fib6_nh.fib_nh_dev == dev)
4765 		return true;
4766 
4767 	if (f6i->fib6_nsiblings) {
4768 		struct fib6_info *sibling, *next_sibling;
4769 
4770 		list_for_each_entry_safe(sibling, next_sibling,
4771 					 &f6i->fib6_siblings, fib6_siblings) {
4772 			if (sibling->fib6_nh.fib_nh_dev == dev)
4773 				return true;
4774 		}
4775 	}
4776 
4777 	return false;
4778 }
4779 
4780 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4781 {
4782 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4783 	struct fib_dump_filter *filter = &arg->filter;
4784 	unsigned int flags = NLM_F_MULTI;
4785 	struct net *net = arg->net;
4786 
4787 	if (rt == net->ipv6.fib6_null_entry)
4788 		return 0;
4789 
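	/* A positive return tells the walker to skip this route and keep
	 * going; a negative one (e.g. -EMSGSIZE from rt6_fill_node())
	 * suspends the dump so it can resume on the next recvmsg().
	 */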
4790 	if ((filter->flags & RTM_F_PREFIX) &&
4791 	    !(rt->fib6_flags & RTF_PREFIX_RT)) {
4792 		/* success since this is not a prefix route */
4793 		return 1;
4794 	}
4795 	if (filter->filter_set) {
4796 		if ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
4797 		    (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
4798 		    (filter->protocol && rt->fib6_protocol != filter->protocol)) {
4799 			return 1;
4800 		}
4801 		flags |= NLM_F_DUMP_FILTERED;
4802 	}
4803 
4804 	return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4805 			     RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4806 			     arg->cb->nlh->nlmsg_seq, flags);
4807 }
4808 
4809 static int inet6_rtm_valid_getroute_req(struct sk_buff *skb,
4810 					const struct nlmsghdr *nlh,
4811 					struct nlattr **tb,
4812 					struct netlink_ext_ack *extack)
4813 {
4814 	struct rtmsg *rtm;
4815 	int i, err;
4816 
4817 	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
4818 		NL_SET_ERR_MSG_MOD(extack,
4819 				   "Invalid header for get route request");
4820 		return -EINVAL;
4821 	}
4822 
4823 	if (!netlink_strict_get_check(skb))
4824 		return nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX,
4825 				   rtm_ipv6_policy, extack);
4826 
4827 	rtm = nlmsg_data(nlh);
4828 	if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) ||
4829 	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) ||
4830 	    rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope ||
4831 	    rtm->rtm_type) {
4832 		NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request");
4833 		return -EINVAL;
4834 	}
4835 	if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) {
4836 		NL_SET_ERR_MSG_MOD(extack,
4837 				   "Invalid flags for get route request");
4838 		return -EINVAL;
4839 	}
4840 
4841 	err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
4842 				 rtm_ipv6_policy, extack);
4843 	if (err)
4844 		return err;
4845 
4846 	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
4847 	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
4848 		NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6");
4849 		return -EINVAL;
4850 	}
4851 
4852 	for (i = 0; i <= RTA_MAX; i++) {
4853 		if (!tb[i])
4854 			continue;
4855 
4856 		switch (i) {
4857 		case RTA_SRC:
4858 		case RTA_DST:
4859 		case RTA_IIF:
4860 		case RTA_OIF:
4861 		case RTA_MARK:
4862 		case RTA_UID:
4863 		case RTA_SPORT:
4864 		case RTA_DPORT:
4865 		case RTA_IP_PROTO:
4866 			break;
4867 		default:
4868 			NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request");
4869 			return -EINVAL;
4870 		}
4871 	}
4872 
4873 	return 0;
4874 }
4875 
4876 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4877 			      struct netlink_ext_ack *extack)
4878 {
4879 	struct net *net = sock_net(in_skb->sk);
4880 	struct nlattr *tb[RTA_MAX+1];
4881 	int err, iif = 0, oif = 0;
4882 	struct fib6_info *from;
4883 	struct dst_entry *dst;
4884 	struct rt6_info *rt;
4885 	struct sk_buff *skb;
4886 	struct rtmsg *rtm;
4887 	struct flowi6 fl6 = {};
4888 	bool fibmatch;
4889 
4890 	err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
4891 	if (err < 0)
4892 		goto errout;
4893 
4894 	err = -EINVAL;
4895 	rtm = nlmsg_data(nlh);
4896 	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4897 	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4898 
4899 	if (tb[RTA_SRC]) {
4900 		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4901 			goto errout;
4902 
4903 		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4904 	}
4905 
4906 	if (tb[RTA_DST]) {
4907 		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4908 			goto errout;
4909 
4910 		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4911 	}
4912 
4913 	if (tb[RTA_IIF])
4914 		iif = nla_get_u32(tb[RTA_IIF]);
4915 
4916 	if (tb[RTA_OIF])
4917 		oif = nla_get_u32(tb[RTA_OIF]);
4918 
4919 	if (tb[RTA_MARK])
4920 		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4921 
4922 	if (tb[RTA_UID])
4923 		fl6.flowi6_uid = make_kuid(current_user_ns(),
4924 					   nla_get_u32(tb[RTA_UID]));
4925 	else
4926 		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4927 
4928 	if (tb[RTA_SPORT])
4929 		fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);
4930 
4931 	if (tb[RTA_DPORT])
4932 		fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);
4933 
4934 	if (tb[RTA_IP_PROTO]) {
4935 		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
4936 						  &fl6.flowi6_proto, AF_INET6,
4937 						  extack);
4938 		if (err)
4939 			goto errout;
4940 	}
4941 
4942 	if (iif) {
4943 		struct net_device *dev;
4944 		int flags = 0;
4945 
4946 		rcu_read_lock();
4947 
4948 		dev = dev_get_by_index_rcu(net, iif);
4949 		if (!dev) {
4950 			rcu_read_unlock();
4951 			err = -ENODEV;
4952 			goto errout;
4953 		}
4954 
4955 		fl6.flowi6_iif = iif;
4956 
4957 		if (!ipv6_addr_any(&fl6.saddr))
4958 			flags |= RT6_LOOKUP_F_HAS_SADDR;
4959 
4960 		dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
4961 
4962 		rcu_read_unlock();
4963 	} else {
4964 		fl6.flowi6_oif = oif;
4965 
4966 		dst = ip6_route_output(net, NULL, &fl6);
4967 	}
4968 
4969 
4971 	if (rt->dst.error) {
4972 		err = rt->dst.error;
4973 		ip6_rt_put(rt);
4974 		goto errout;
4975 	}
4976 
4977 	if (rt == net->ipv6.ip6_null_entry) {
4978 		err = rt->dst.error;
4979 		ip6_rt_put(rt);
4980 		goto errout;
4981 	}
4982 
4983 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
4984 	if (!skb) {
4985 		ip6_rt_put(rt);
4986 		err = -ENOBUFS;
4987 		goto errout;
4988 	}
4989 
4990 	skb_dst_set(skb, &rt->dst);
4991 
4992 	rcu_read_lock();
4993 	from = rcu_dereference(rt->from);
4994 
4995 	if (fibmatch)
4996 		err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
4997 				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4998 				    nlh->nlmsg_seq, 0);
4999 	else
5000 		err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
5001 				    &fl6.saddr, iif, RTM_NEWROUTE,
5002 				    NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
5003 				    0);
5004 	rcu_read_unlock();
5005 
5006 	if (err < 0) {
5007 		kfree_skb(skb);
5008 		goto errout;
5009 	}
5010 
5011 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
5012 errout:
5013 	return err;
5014 }
5015 
5016 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
5017 		     unsigned int nlm_flags)
5018 {
5019 	struct sk_buff *skb;
5020 	struct net *net = info->nl_net;
5021 	u32 seq;
5022 	int err;
5023 
5024 	err = -ENOBUFS;
5025 	seq = info->nlh ? info->nlh->nlmsg_seq : 0;
5026 
5027 	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
5028 	if (!skb)
5029 		goto errout;
5030 
5031 	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
5032 			    event, info->portid, seq, nlm_flags);
5033 	if (err < 0) {
5034 		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
5035 		WARN_ON(err == -EMSGSIZE);
5036 		kfree_skb(skb);
5037 		goto errout;
5038 	}
5039 	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
5040 		    info->nlh, gfp_any());
5041 	return;
5042 errout:
5043 	if (err < 0)
5044 		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
5045 }
5046 
5047 static int ip6_route_dev_notify(struct notifier_block *this,
5048 				unsigned long event, void *ptr)
5049 {
5050 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
5051 	struct net *net = dev_net(dev);
5052 
5053 	if (!(dev->flags & IFF_LOOPBACK))
5054 		return NOTIFY_OK;
5055 
5056 	if (event == NETDEV_REGISTER) {
5057 		net->ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = dev;
5058 		net->ipv6.ip6_null_entry->dst.dev = dev;
5059 		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
5060 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5061 		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
5062 		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
5063 		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
5064 		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
5065 #endif
5066 	 } else if (event == NETDEV_UNREGISTER &&
5067 		    dev->reg_state != NETREG_UNREGISTERED) {
5068 		/* NETDEV_UNREGISTER can be fired multiple times by
5069 		 * netdev_wait_allrefs(). Make sure we only call this once.
5070 		 */
5071 		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
5072 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5073 		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
5074 		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
5075 #endif
5076 	}
5077 
5078 	return NOTIFY_OK;
5079 }
5080 
5081 /*
5082  *	/proc
5083  */
5084 
5085 #ifdef CONFIG_PROC_FS
5086 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
5087 {
5088 	struct net *net = (struct net *)seq->private;
5089 	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
5090 		   net->ipv6.rt6_stats->fib_nodes,
5091 		   net->ipv6.rt6_stats->fib_route_nodes,
5092 		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
5093 		   net->ipv6.rt6_stats->fib_rt_entries,
5094 		   net->ipv6.rt6_stats->fib_rt_cache,
5095 		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
5096 		   net->ipv6.rt6_stats->fib_discarded_routes);
5097 
5098 	return 0;
5099 }
5100 #endif	/* CONFIG_PROC_FS */
5101 
5102 #ifdef CONFIG_SYSCTL
5103 
5104 static
5105 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
5106 			      void __user *buffer, size_t *lenp, loff_t *ppos)
5107 {
5108 	struct net *net;
5109 	int delay;
5110 	int ret;
5111 	if (!write)
5112 		return -EINVAL;
5113 
5114 	net = (struct net *)ctl->extra1;
5115 	delay = net->ipv6.sysctl.flush_delay;
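	/* delay is sampled before proc_dointvec() stores the new value,
	 * so the garbage-collection run below uses the previously
	 * configured delay.
	 */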
5116 	ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
5117 	if (ret)
5118 		return ret;
5119 
5120 	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
5121 	return 0;
5122 }
5123 
5124 static int zero;
5125 static int one = 1;
5126 
5127 static struct ctl_table ipv6_route_table_template[] = {
5128 	{
5129 		.procname	=	"flush",
5130 		.data		=	&init_net.ipv6.sysctl.flush_delay,
5131 		.maxlen		=	sizeof(int),
5132 		.mode		=	0200,
5133 		.proc_handler	=	ipv6_sysctl_rtcache_flush
5134 	},
5135 	{
5136 		.procname	=	"gc_thresh",
5137 		.data		=	&ip6_dst_ops_template.gc_thresh,
5138 		.maxlen		=	sizeof(int),
5139 		.mode		=	0644,
5140 		.proc_handler	=	proc_dointvec,
5141 	},
5142 	{
5143 		.procname	=	"max_size",
5144 		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
5145 		.maxlen		=	sizeof(int),
5146 		.mode		=	0644,
5147 		.proc_handler	=	proc_dointvec,
5148 	},
5149 	{
5150 		.procname	=	"gc_min_interval",
5151 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5152 		.maxlen		=	sizeof(int),
5153 		.mode		=	0644,
5154 		.proc_handler	=	proc_dointvec_jiffies,
5155 	},
5156 	{
5157 		.procname	=	"gc_timeout",
5158 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
5159 		.maxlen		=	sizeof(int),
5160 		.mode		=	0644,
5161 		.proc_handler	=	proc_dointvec_jiffies,
5162 	},
5163 	{
5164 		.procname	=	"gc_interval",
5165 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
5166 		.maxlen		=	sizeof(int),
5167 		.mode		=	0644,
5168 		.proc_handler	=	proc_dointvec_jiffies,
5169 	},
5170 	{
5171 		.procname	=	"gc_elasticity",
5172 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
5173 		.maxlen		=	sizeof(int),
5174 		.mode		=	0644,
5175 		.proc_handler	=	proc_dointvec,
5176 	},
5177 	{
5178 		.procname	=	"mtu_expires",
5179 		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
5180 		.maxlen		=	sizeof(int),
5181 		.mode		=	0644,
5182 		.proc_handler	=	proc_dointvec_jiffies,
5183 	},
5184 	{
5185 		.procname	=	"min_adv_mss",
5186 		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
5187 		.maxlen		=	sizeof(int),
5188 		.mode		=	0644,
5189 		.proc_handler	=	proc_dointvec,
5190 	},
5191 	{
5192 		.procname	=	"gc_min_interval_ms",
5193 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5194 		.maxlen		=	sizeof(int),
5195 		.mode		=	0644,
5196 		.proc_handler	=	proc_dointvec_ms_jiffies,
5197 	},
5198 	{
5199 		.procname	=	"skip_notify_on_dev_down",
5200 		.data		=	&init_net.ipv6.sysctl.skip_notify_on_dev_down,
5201 		.maxlen		=	sizeof(int),
5202 		.mode		=	0644,
5203 		.proc_handler	=	proc_dointvec,
5204 		.extra1		=	&zero,
5205 		.extra2		=	&one,
5206 	},
5207 	{ }
5208 };
5209 
5210 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
5211 {
5212 	struct ctl_table *table;
5213 
5214 	table = kmemdup(ipv6_route_table_template,
5215 			sizeof(ipv6_route_table_template),
5216 			GFP_KERNEL);
5217 
5218 	if (table) {
5219 		table[0].data = &net->ipv6.sysctl.flush_delay;
5220 		table[0].extra1 = net;
5221 		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
5222 		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
5223 		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5224 		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
5225 		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
5226 		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
5227 		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
5228 		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
5229 		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5230 		table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;
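		/* The indices above must stay in sync with the entry
		 * order in ipv6_route_table_template.
		 */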
5231 
5232 		/* Don't export sysctls to unprivileged users */
5233 		if (net->user_ns != &init_user_ns)
5234 			table[0].procname = NULL;
5235 	}
5236 
5237 	return table;
5238 }
5239 #endif
5240 
5241 static int __net_init ip6_route_net_init(struct net *net)
5242 {
5243 	int ret = -ENOMEM;
5244 
5245 	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
5246 	       sizeof(net->ipv6.ip6_dst_ops));
5247 
5248 	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
5249 		goto out_ip6_dst_ops;
5250 
5251 	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
5252 					    sizeof(*net->ipv6.fib6_null_entry),
5253 					    GFP_KERNEL);
5254 	if (!net->ipv6.fib6_null_entry)
5255 		goto out_ip6_dst_entries;
5256 
5257 	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
5258 					   sizeof(*net->ipv6.ip6_null_entry),
5259 					   GFP_KERNEL);
5260 	if (!net->ipv6.ip6_null_entry)
5261 		goto out_fib6_null_entry;
5262 	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5263 	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
5264 			 ip6_template_metrics, true);
5265 
5266 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5267 	net->ipv6.fib6_has_custom_rules = false;
5268 	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
5269 					       sizeof(*net->ipv6.ip6_prohibit_entry),
5270 					       GFP_KERNEL);
5271 	if (!net->ipv6.ip6_prohibit_entry)
5272 		goto out_ip6_null_entry;
5273 	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5274 	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
5275 			 ip6_template_metrics, true);
5276 
5277 	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
5278 					       sizeof(*net->ipv6.ip6_blk_hole_entry),
5279 					       GFP_KERNEL);
5280 	if (!net->ipv6.ip6_blk_hole_entry)
5281 		goto out_ip6_prohibit_entry;
5282 	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5283 	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
5284 			 ip6_template_metrics, true);
5285 #endif
5286 
5287 	net->ipv6.sysctl.flush_delay = 0;
5288 	net->ipv6.sysctl.ip6_rt_max_size = 4096;
5289 	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
5290 	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
5291 	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
5292 	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
5293 	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
5294 	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
5295 	net->ipv6.sysctl.skip_notify_on_dev_down = 0;
5296 
5297 	net->ipv6.ip6_rt_gc_expire = 30*HZ;
5298 
5299 	ret = 0;
5300 out:
5301 	return ret;
5302 
5303 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5304 out_ip6_prohibit_entry:
5305 	kfree(net->ipv6.ip6_prohibit_entry);
5306 out_ip6_null_entry:
5307 	kfree(net->ipv6.ip6_null_entry);
5308 #endif
5309 out_fib6_null_entry:
5310 	kfree(net->ipv6.fib6_null_entry);
5311 out_ip6_dst_entries:
5312 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5313 out_ip6_dst_ops:
5314 	goto out;
5315 }
5316 
5317 static void __net_exit ip6_route_net_exit(struct net *net)
5318 {
5319 	kfree(net->ipv6.fib6_null_entry);
5320 	kfree(net->ipv6.ip6_null_entry);
5321 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5322 	kfree(net->ipv6.ip6_prohibit_entry);
5323 	kfree(net->ipv6.ip6_blk_hole_entry);
5324 #endif
5325 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5326 }
5327 
5328 static int __net_init ip6_route_net_init_late(struct net *net)
5329 {
5330 #ifdef CONFIG_PROC_FS
5331 	proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
5332 			sizeof(struct ipv6_route_iter));
5333 	proc_create_net_single("rt6_stats", 0444, net->proc_net,
5334 			rt6_stats_seq_show, NULL);
5335 #endif
5336 	return 0;
5337 }
5338 
5339 static void __net_exit ip6_route_net_exit_late(struct net *net)
5340 {
5341 #ifdef CONFIG_PROC_FS
5342 	remove_proc_entry("ipv6_route", net->proc_net);
5343 	remove_proc_entry("rt6_stats", net->proc_net);
5344 #endif
5345 }
5346 
5347 static struct pernet_operations ip6_route_net_ops = {
5348 	.init = ip6_route_net_init,
5349 	.exit = ip6_route_net_exit,
5350 };
5351 
5352 static int __net_init ipv6_inetpeer_init(struct net *net)
5353 {
5354 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5355 
5356 	if (!bp)
5357 		return -ENOMEM;
5358 	inet_peer_base_init(bp);
5359 	net->ipv6.peers = bp;
5360 	return 0;
5361 }
5362 
5363 static void __net_exit ipv6_inetpeer_exit(struct net *net)
5364 {
5365 	struct inet_peer_base *bp = net->ipv6.peers;
5366 
5367 	net->ipv6.peers = NULL;
5368 	inetpeer_invalidate_tree(bp);
5369 	kfree(bp);
5370 }
5371 
5372 static struct pernet_operations ipv6_inetpeer_ops = {
5373 	.init	=	ipv6_inetpeer_init,
5374 	.exit	=	ipv6_inetpeer_exit,
5375 };
5376 
5377 static struct pernet_operations ip6_route_net_late_ops = {
5378 	.init = ip6_route_net_init_late,
5379 	.exit = ip6_route_net_exit_late,
5380 };
5381 
5382 static struct notifier_block ip6_route_dev_notifier = {
5383 	.notifier_call = ip6_route_dev_notify,
5384 	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
5385 };
5386 
5387 void __init ip6_route_init_special_entries(void)
5388 {
5389 	/* Registration of the loopback device is done before this portion
5390 	 * of code, so the loopback reference in rt6_info will not be taken;
5391 	 * do it manually for init_net */
5392 	init_net.ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = init_net.loopback_dev;
5393 	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
5394 	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5395 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5396 	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
5397 	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5398 	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
5399 	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5400 #endif
5401 }
5402 
5403 int __init ip6_route_init(void)
5404 {
5405 	int ret;
5406 	int cpu;
5407 
5408 	ret = -ENOMEM;
5409 	ip6_dst_ops_template.kmem_cachep =
5410 		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
5411 				  SLAB_HWCACHE_ALIGN, NULL);
5412 	if (!ip6_dst_ops_template.kmem_cachep)
5413 		goto out;
5414 
5415 	ret = dst_entries_init(&ip6_dst_blackhole_ops);
5416 	if (ret)
5417 		goto out_kmem_cache;
5418 
5419 	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
5420 	if (ret)
5421 		goto out_dst_entries;
5422 
5423 	ret = register_pernet_subsys(&ip6_route_net_ops);
5424 	if (ret)
5425 		goto out_register_inetpeer;
5426 
5427 	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
5428 
5429 	ret = fib6_init();
5430 	if (ret)
5431 		goto out_register_subsys;
5432 
5433 	ret = xfrm6_init();
5434 	if (ret)
5435 		goto out_fib6_init;
5436 
5437 	ret = fib6_rules_init();
5438 	if (ret)
5439 		goto xfrm6_init;
5440 
5441 	ret = register_pernet_subsys(&ip6_route_net_late_ops);
5442 	if (ret)
5443 		goto fib6_rules_init;
5444 
5445 	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
5446 				   inet6_rtm_newroute, NULL, 0);
5447 	if (ret < 0)
5448 		goto out_register_late_subsys;
5449 
5450 	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
5451 				   inet6_rtm_delroute, NULL, 0);
5452 	if (ret < 0)
5453 		goto out_register_late_subsys;
5454 
5455 	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
5456 				   inet6_rtm_getroute, NULL,
5457 				   RTNL_FLAG_DOIT_UNLOCKED);
5458 	if (ret < 0)
5459 		goto out_register_late_subsys;
5460 
5461 	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
5462 	if (ret)
5463 		goto out_register_late_subsys;
5464 
5465 	for_each_possible_cpu(cpu) {
5466 		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
5467 
5468 		INIT_LIST_HEAD(&ul->head);
5469 		spin_lock_init(&ul->lock);
5470 	}
5471 
5472 out:
5473 	return ret;
5474 
5475 out_register_late_subsys:
5476 	rtnl_unregister_all(PF_INET6);
5477 	unregister_pernet_subsys(&ip6_route_net_late_ops);
5478 fib6_rules_init:
5479 	fib6_rules_cleanup();
5480 xfrm6_init:
5481 	xfrm6_fini();
5482 out_fib6_init:
5483 	fib6_gc_cleanup();
5484 out_register_subsys:
5485 	unregister_pernet_subsys(&ip6_route_net_ops);
5486 out_register_inetpeer:
5487 	unregister_pernet_subsys(&ipv6_inetpeer_ops);
5488 out_dst_entries:
5489 	dst_entries_destroy(&ip6_dst_blackhole_ops);
5490 out_kmem_cache:
5491 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5492 	goto out;
5493 }
5494 
5495 void ip6_route_cleanup(void)
5496 {
5497 	unregister_netdevice_notifier(&ip6_route_dev_notifier);
5498 	unregister_pernet_subsys(&ip6_route_net_late_ops);
5499 	fib6_rules_cleanup();
5500 	xfrm6_fini();
5501 	fib6_gc_cleanup();
5502 	unregister_pernet_subsys(&ipv6_inetpeer_ops);
5503 	unregister_pernet_subsys(&ip6_route_net_ops);
5504 	dst_entries_destroy(&ip6_dst_blackhole_ops);
5505 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5506 }
5507