xref: /openbmc/linux/net/ipv6/route.c (revision 8cb08174)
1 /*
2  *	Linux INET6 implementation
3  *	FIB front-end.
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13 
14 /*	Changes:
15  *
16  *	YOSHIFUJI Hideaki @USAGI
17  *		reworked default router selection.
18  *		- respect outgoing interface
19  *		- select from (probably) reachable routers (i.e.
20  *		routers in REACHABLE, STALE, DELAY or PROBE states).
21  *		- always select the same router if it is (probably)
22  *		reachable.  otherwise, round-robin the list.
23  *	Ville Nuorvala
24  *		Fixed routing subtrees.
25  */
26 
27 #define pr_fmt(fmt) "IPv6: " fmt
28 
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/rtnh.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <net/ip.h>
67 #include <linux/uaccess.h>
68 
69 #ifdef CONFIG_SYSCTL
70 #include <linux/sysctl.h>
71 #endif
72 
73 static int ip6_rt_type_to_error(u8 fib6_type);
74 
75 #define CREATE_TRACE_POINTS
76 #include <trace/events/fib6.h>
77 EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
78 #undef CREATE_TRACE_POINTS
79 
80 enum rt6_nud_state {
81 	RT6_NUD_FAIL_HARD = -3,
82 	RT6_NUD_FAIL_PROBE = -2,
83 	RT6_NUD_FAIL_DO_RR = -1,
84 	RT6_NUD_SUCCEED = 1
85 };
86 
87 static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
88 static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
89 static unsigned int	 ip6_mtu(const struct dst_entry *dst);
90 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
91 static void		ip6_dst_destroy(struct dst_entry *);
92 static void		ip6_dst_ifdown(struct dst_entry *,
93 				       struct net_device *dev, int how);
94 static int		 ip6_dst_gc(struct dst_ops *ops);
95 
96 static int		ip6_pkt_discard(struct sk_buff *skb);
97 static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
98 static int		ip6_pkt_prohibit(struct sk_buff *skb);
99 static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
100 static void		ip6_link_failure(struct sk_buff *skb);
101 static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
102 					   struct sk_buff *skb, u32 mtu);
103 static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
104 					struct sk_buff *skb);
105 static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
106 			   int strict);
107 static size_t rt6_nlmsg_size(struct fib6_info *rt);
108 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
109 			 struct fib6_info *rt, struct dst_entry *dst,
110 			 struct in6_addr *dest, struct in6_addr *src,
111 			 int iif, int type, u32 portid, u32 seq,
112 			 unsigned int flags);
113 static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
114 					   struct in6_addr *daddr,
115 					   struct in6_addr *saddr);
116 
117 #ifdef CONFIG_IPV6_ROUTE_INFO
118 static struct fib6_info *rt6_add_route_info(struct net *net,
119 					   const struct in6_addr *prefix, int prefixlen,
120 					   const struct in6_addr *gwaddr,
121 					   struct net_device *dev,
122 					   unsigned int pref);
123 static struct fib6_info *rt6_get_route_info(struct net *net,
124 					   const struct in6_addr *prefix, int prefixlen,
125 					   const struct in6_addr *gwaddr,
126 					   struct net_device *dev);
127 #endif
128 
129 struct uncached_list {
130 	spinlock_t		lock;
131 	struct list_head	head;
132 };
133 
134 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
135 
/* Add a dst-only (not fib-owned) rt6_info to this CPU's uncached list so
 * rt6_uncached_list_flush_dev() can retarget it when its device goes away.
 * Remembers the per-cpu list in rt->rt6i_uncached_list so removal uses the
 * same lock.  NOTE(review): the fib_rt_uncache counter is decremented in
 * rt6_uncached_list_del() but not incremented here — presumably callers
 * bump it; verify at call sites.
 */
void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}
146 
/* Unlink @rt from the uncached list it was added to, if any, and drop the
 * per-netns uncached-route statistic.  Safe to call on entries that were
 * never added (list_empty() check).
 */
void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		/* Use the list recorded at add time, not this CPU's list */
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}
159 
/* Device teardown: walk every CPU's uncached list and repoint any entry
 * using @dev (either via its idev or its dst.dev) at the loopback device,
 * so the uncached dsts never hold the last reference to a vanishing device.
 * No-op when @dev is the loopback device itself.
 */
static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			/* swap idev reference over to loopback */
			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			/* hold loopback before releasing the old device */
			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}
191 
192 static inline const void *choose_neigh_daddr(const struct in6_addr *p,
193 					     struct sk_buff *skb,
194 					     const void *daddr)
195 {
196 	if (!ipv6_addr_any(p))
197 		return (const void *) p;
198 	else if (skb)
199 		return &ipv6_hdr(skb)->daddr;
200 	return daddr;
201 }
202 
/* Look up (or create) the neighbour entry for the next hop of a route.
 * The lookup key is chosen by choose_neigh_daddr(): gateway, then skb
 * destination, then @daddr.  Returns NULL when neigh_create() fails —
 * callers must treat NULL as "no neighbour", the error code is dropped.
 */
struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
				   struct net_device *dev,
				   struct sk_buff *skb,
				   const void *daddr)
{
	struct neighbour *n;

	daddr = choose_neigh_daddr(gw, skb, daddr);
	n = __ipv6_neigh_lookup(dev, daddr);
	if (n)
		return n;

	n = neigh_create(&nd_tbl, daddr, dev);
	return IS_ERR(n) ? NULL : n;
}
218 
/* dst_ops->neigh_lookup hook: forward to ip6_neigh_lookup() using the
 * route's cached gateway and output device.
 */
static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
					      struct sk_buff *skb,
					      const void *daddr)
{
	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);

	return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
}
227 
/* dst_ops->confirm_neigh hook: mark the next-hop neighbour as recently
 * confirmed.  Skips devices that do no neighbour resolution (NOARP,
 * loopback) and multicast destinations, which have no unicast neighbour.
 */
static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}
242 
/* dst_ops template for regular IPv6 routes; copied into each netns
 * (net->ipv6.ip6_dst_ops) at namespace init.
 */
static struct dst_ops ip6_dst_ops_template = {
	.family			=	AF_INET6,
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
	.default_advmss		=	ip6_default_advmss,
	.mtu			=	ip6_mtu,
	.cow_metrics		=	dst_cow_metrics_generic,
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
	.redirect		=	rt6_do_redirect,
	.local_out		=	__ip6_local_out,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
	.confirm_neigh		=	ip6_confirm_neigh,
};
261 
262 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
263 {
264 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
265 
266 	return mtu ? : dst->dev->mtu;
267 }
268 
/* Blackhole dsts never react to PMTU updates; intentionally empty. */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}

/* Blackhole dsts ignore ICMPv6 redirects; intentionally empty. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}
278 
/* dst_ops for blackhole routes (e.g. xfrm dst copies): no gc, no pmtu
 * update, no redirect handling — packets are absorbed silently.
 */
static struct dst_ops ip6_dst_blackhole_ops = {
	.family			=	AF_INET6,
	.destroy		=	ip6_dst_destroy,
	.check			=	ip6_dst_check,
	.mtu			=	ip6_blackhole_mtu,
	.default_advmss		=	ip6_default_advmss,
	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
	.redirect		=	ip6_rt_blackhole_redirect,
	.cow_metrics		=	dst_cow_metrics_generic,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
};
290 
/* Template metrics for special routes: all zero (defaults). */
static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};

/* fib6 "null entry" template: the unreachable catch-all installed per
 * netns; worst possible metric so anything else wins.
 */
static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol  = RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= REFCOUNT_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};

/* dst-level null entry: discards traffic with -ENETUNREACH. */
static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

/* "prohibit" entry: reject with -EACCES (admin prohibited). */
static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

/* "blackhole" entry: drop silently, no ICMP errors generated. */
static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#endif
343 
/* Zero the rt6_info-specific tail of a freshly allocated entry.  dst_alloc()
 * already initialized the embedded dst_entry, so clear only the bytes after
 * it (hence the dst + 1 trick), then set up the uncached-list hook.
 */
static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}
351 
/* Allocate a dst-level rt6_info with the netns ip6_dst_ops, holding an
 * initial reference and marked DST_OBSOLETE_FORCE_CHK so ip6_dst_check()
 * always runs.  Returns NULL on allocation failure.
 */
struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);
367 
/* dst_ops->destroy hook: release everything an rt6_info pins — metrics,
 * uncached-list membership, the inet6_dev reference, and the fib6_info it
 * was cloned from.  The ->from pointer is cleared under RCU so concurrent
 * readers either see the old fib6_info (still referenced) or NULL.
 */
static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from;
	struct inet6_dev *idev;

	ip_dst_metrics_put(dst);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	rcu_assign_pointer(rt->from, NULL);
	fib6_info_release(from);
	rcu_read_unlock();
}
389 
/* dst_ops->ifdown hook: the route's device is going down, so migrate the
 * idev reference over to the loopback device (keeping the dst usable)
 * unless it already points there.
 */
static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}
406 
407 static bool __rt6_check_expired(const struct rt6_info *rt)
408 {
409 	if (rt->rt6i_flags & RTF_EXPIRES)
410 		return time_after(jiffies, rt->dst.expires);
411 	else
412 		return false;
413 }
414 
/* Full expiry check: either the dst itself carries an expiry (RTF_EXPIRES)
 * that has passed, or — for clones of a fib6_info — the dst is stale
 * (obsolete changed) or the originating fib entry has expired.
 * Uses rcu_dereference(), so the caller must hold rcu_read_lock().
 */
static bool rt6_check_expired(const struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(from);
	}
	return false;
}
430 
/* Multipath nexthop selection.  Each sibling owns a hash-space interval
 * bounded by fib_nh_upper_bound; the flow hash picks the sibling whose
 * upper bound it does not exceed.  Always fills res->f6i and res->nh.
 * Called under rcu_read_lock().
 */
void fib6_select_path(const struct net *net, struct fib6_result *res,
		      struct flowi6 *fl6, int oif, bool have_oif_match,
		      const struct sk_buff *skb, int strict)
{
	struct fib6_info *sibling, *next_sibling;
	struct fib6_info *match = res->f6i;

	/* single path, or oif already matched a specific sibling */
	if (!match->fib6_nsiblings || have_oif_match)
		goto out;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	/* hash falls into the first (leader) nexthop's interval */
	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.fib_nh_upper_bound))
		goto out;

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
		const struct fib6_nh *nh = &sibling->fib6_nh;
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		/* unusable nexthop: fall back to the leader */
		if (rt6_score_route(nh, sibling->fib6_flags, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

out:
	res->f6i = match;
	res->nh = &match->fib6_nh;
}
468 
469 /*
470  *	Route lookup. rcu_read_lock() should be held.
471  */
472 
/* Does this nexthop satisfy the device constraints of a lookup?
 * With an @oif, the nexthop device must match it; without one, the
 * source address must be assigned on the nexthop device (strictly,
 * when RT6_LOOKUP_F_IFACE).  Dead nexthops never match.
 */
static bool __rt6_device_match(struct net *net, const struct fib6_nh *nh,
			       const struct in6_addr *saddr, int oif, int flags)
{
	const struct net_device *dev;

	if (nh->fib_nh_flags & RTNH_F_DEAD)
		return false;

	dev = nh->fib_nh_dev;
	if (oif) {
		if (dev->ifindex == oif)
			return true;
	} else {
		if (ipv6_chk_addr(net, saddr, dev,
				  flags & RT6_LOOKUP_F_IFACE))
			return true;
	}

	return false;
}
493 
/* Walk the fib6_info chain starting at res->f6i and fix res up to the
 * first entry whose nexthop matches @oif/@saddr.  Falls back to the
 * netns null entry when a strict interface match fails or the original
 * nexthop is dead.  Fills res->nh/fib6_type/fib6_flags on every path.
 * Called under rcu_read_lock().
 */
static void rt6_device_match(struct net *net, struct fib6_result *res,
			     const struct in6_addr *saddr, int oif, int flags)
{
	struct fib6_info *f6i = res->f6i;
	struct fib6_info *spf6i;
	struct fib6_nh *nh;

	/* unconstrained lookup: any live nexthop will do */
	if (!oif && ipv6_addr_any(saddr)) {
		nh = &f6i->fib6_nh;
		if (!(nh->fib_nh_flags & RTNH_F_DEAD))
			goto out;
	}

	for (spf6i = f6i; spf6i; spf6i = rcu_dereference(spf6i->fib6_next)) {
		nh = &spf6i->fib6_nh;
		if (__rt6_device_match(net, nh, saddr, oif, flags)) {
			res->f6i = spf6i;
			goto out;
		}
	}

	/* strict oif requested but nothing matched: unreachable */
	if (oif && flags & RT6_LOOKUP_F_IFACE) {
		res->f6i = net->ipv6.fib6_null_entry;
		nh = &res->f6i->fib6_nh;
		goto out;
	}

	nh = &f6i->fib6_nh;
	if (nh->fib_nh_flags & RTNH_F_DEAD) {
		res->f6i = net->ipv6.fib6_null_entry;
		nh = &res->f6i->fib6_nh;
	}
out:
	res->nh = nh;
	res->fib6_type = res->f6i->fib6_type;
	res->fib6_flags = res->f6i->fib6_flags;
}
531 
#ifdef CONFIG_IPV6_ROUTER_PREF
/* Deferred neighbour-solicitation probe: carries the target address and a
 * held device reference into the workqueue (see rt6_probe_deferred()).
 */
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};
538 
/* Workqueue side of rt6_probe(): send a unicast-triggered neighbour
 * solicitation to the target router, then drop the device reference taken
 * by rt6_probe() and free the work item.
 */
static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}
550 
551 static void rt6_probe(struct fib6_nh *fib6_nh)
552 {
553 	struct __rt6_probe_work *work = NULL;
554 	const struct in6_addr *nh_gw;
555 	struct neighbour *neigh;
556 	struct net_device *dev;
557 	struct inet6_dev *idev;
558 
559 	/*
560 	 * Okay, this does not seem to be appropriate
561 	 * for now, however, we need to check if it
562 	 * is really so; aka Router Reachability Probing.
563 	 *
564 	 * Router Reachability Probe MUST be rate-limited
565 	 * to no more than one per minute.
566 	 */
567 	if (fib6_nh->fib_nh_gw_family)
568 		return;
569 
570 	nh_gw = &fib6_nh->fib_nh_gw6;
571 	dev = fib6_nh->fib_nh_dev;
572 	rcu_read_lock_bh();
573 	idev = __in6_dev_get(dev);
574 	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
575 	if (neigh) {
576 		if (neigh->nud_state & NUD_VALID)
577 			goto out;
578 
579 		write_lock(&neigh->lock);
580 		if (!(neigh->nud_state & NUD_VALID) &&
581 		    time_after(jiffies,
582 			       neigh->updated + idev->cnf.rtr_probe_interval)) {
583 			work = kmalloc(sizeof(*work), GFP_ATOMIC);
584 			if (work)
585 				__neigh_set_probe_once(neigh);
586 		}
587 		write_unlock(&neigh->lock);
588 	} else if (time_after(jiffies, fib6_nh->last_probe +
589 				       idev->cnf.rtr_probe_interval)) {
590 		work = kmalloc(sizeof(*work), GFP_ATOMIC);
591 	}
592 
593 	if (work) {
594 		fib6_nh->last_probe = jiffies;
595 		INIT_WORK(&work->work, rt6_probe_deferred);
596 		work->target = *nh_gw;
597 		dev_hold(dev);
598 		work->dev = dev;
599 		schedule_work(&work->work);
600 	}
601 
602 out:
603 	rcu_read_unlock_bh();
604 }
#else
/* Without router-preference support, router probing is a no-op. */
static inline void rt6_probe(struct fib6_nh *fib6_nh)
{
}
#endif
610 
611 /*
612  * Default Router Selection (RFC 2461 6.3.6)
613  */
/* Classify the reachability of a gateway nexthop's neighbour entry.
 * VALID (or, with router-pref, anything not FAILED) counts as success;
 * FAILED triggers a probe; a missing entry either succeeds (router-pref,
 * optimistically) or requests round-robin fallback.
 */
static enum rt6_nud_state rt6_check_neigh(const struct fib6_nh *fib6_nh)
{
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(fib6_nh->fib_nh_dev,
					  &fib6_nh->fib_nh_gw6);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}
641 
/* Score a nexthop for default-router selection: +2 for an interface
 * match, plus the decoded router preference (RFC 4191) shifted above it.
 * Returns a negative RT6_NUD_* value when the route is unusable
 * (interface mismatch under strict lookup, or unreachable neighbour
 * when RT6_LOOKUP_F_REACHABLE is requested).
 */
static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
			   int strict)
{
	int m = 0;

	if (!oif || nh->fib_nh_dev->ifindex == oif)
		m = 2;

	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(fib6_flags)) << 2;
#endif
	/* reachability only matters for gateway nexthops */
	if ((strict & RT6_LOOKUP_F_REACHABLE) &&
	    !(fib6_flags & RTF_NONEXTHOP) && nh->fib_nh_gw_family) {
		int n = rt6_check_neigh(nh);
		if (n < 0)
			return n;
	}
	return m;
}
663 
/* Evaluate one nexthop against the best seen so far (*mpri).  Returns
 * true when this nexthop becomes the new best match; *do_rr is set when
 * the best match only won by round-robin fallback.  Dead or (unignored)
 * link-down nexthops are skipped, and reachable-mode lookups trigger a
 * router probe on every candidate.
 */
static bool find_match(struct fib6_nh *nh, u32 fib6_flags,
		       int oif, int strict, int *mpri, bool *do_rr)
{
	bool match_do_rr = false;
	bool rc = false;
	int m;

	if (nh->fib_nh_flags & RTNH_F_DEAD)
		goto out;

	if (ip6_ignore_linkdown(nh->fib_nh_dev) &&
	    nh->fib_nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	m = rt6_score_route(nh, fib6_flags, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(nh);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		rc = true;
	}
out:
	return rc;
}
699 
/* Scan fib6_info entries from @f6i_start (stopping at @nomatch) for the
 * best-scoring non-expired nexthop, updating @res/@mpri/@do_rr via
 * find_match().  When @cont is given, stop at the first entry whose
 * metric differs from @metric and hand it back for a later pass.
 * Called under rcu_read_lock().
 */
static void __find_rr_leaf(struct fib6_info *f6i_start,
			   struct fib6_info *nomatch, u32 metric,
			   struct fib6_result *res, struct fib6_info **cont,
			   int oif, int strict, bool *do_rr, int *mpri)
{
	struct fib6_info *f6i;

	for (f6i = f6i_start;
	     f6i && f6i != nomatch;
	     f6i = rcu_dereference(f6i->fib6_next)) {
		struct fib6_nh *nh;

		if (cont && f6i->fib6_metric != metric) {
			*cont = f6i;
			return;
		}

		if (fib6_check_expired(f6i))
			continue;

		nh = &f6i->fib6_nh;
		if (find_match(nh, f6i->fib6_flags, oif, strict, mpri, do_rr)) {
			res->f6i = f6i;
			res->nh = nh;
			res->fib6_flags = f6i->fib6_flags;
			res->fib6_type = f6i->fib6_type;
		}
	}
}
729 
/* Round-robin leaf search: scan from the rr cursor (@rr_head) to the end
 * of the same-metric group, then wrap from @leaf back up to the cursor.
 * Only if both passes fail is the remainder (@cont, the next metric
 * group) considered.
 */
static void find_rr_leaf(struct fib6_node *fn, struct fib6_info *leaf,
			 struct fib6_info *rr_head, int oif, int strict,
			 bool *do_rr, struct fib6_result *res)
{
	u32 metric = rr_head->fib6_metric;
	struct fib6_info *cont = NULL;
	int mpri = -1;

	__find_rr_leaf(rr_head, NULL, metric, res, &cont,
		       oif, strict, do_rr, &mpri);

	__find_rr_leaf(leaf, rr_head, metric, res, &cont,
		       oif, strict, do_rr, &mpri);

	if (res->f6i || !cont)
		return;

	__find_rr_leaf(cont, NULL, metric, res, NULL,
		       oif, strict, do_rr, &mpri);
}
750 
/* Select a route from fib node @fn (default-router selection with
 * round-robin).  On do_rr, advance the node's rr cursor under tb6_lock
 * so the next lookup starts from the following same-metric entry.
 * Guarantees res->f6i is set, falling back to the netns null entry.
 * Called under rcu_read_lock().
 */
static void rt6_select(struct net *net, struct fib6_node *fn, int oif,
		       struct fib6_result *res, int strict)
{
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *rt0;
	bool do_rr = false;
	int key_plen;

	/* make sure this function or its helpers sets f6i */
	res->f6i = NULL;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		goto out;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not points to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		goto out;

	find_rr_leaf(fn, leaf, rt0, oif, strict, &do_rr, res);
	if (do_rr) {
		struct fib6_info *next = rcu_dereference(rt0->fib6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->fib6_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->fib6_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
		}
	}

out:
	if (!res->f6i) {
		res->f6i = net->ipv6.fib6_null_entry;
		res->nh = &res->f6i->fib6_nh;
		res->fib6_flags = res->f6i->fib6_flags;
		res->fib6_type = res->f6i->fib6_type;
	}
}
807 
808 static bool rt6_is_gw_or_nonexthop(const struct fib6_result *res)
809 {
810 	return (res->f6i->fib6_flags & RTF_NONEXTHOP) ||
811 	       res->nh->fib_nh_gw_family;
812 }
813 
814 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Process a Route Information Option from a Router Advertisement
 * (RFC 4191).  Validates the option length against the prefix length,
 * then adds, refreshes, or (on zero lifetime) deletes the corresponding
 * RTF_ROUTEINFO route.  @opt points at the raw option, @len is its size
 * in bytes.  Returns 0 on success or -EINVAL on a malformed option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		/* >64-bit prefix needs at least 16 prefix bytes (length 2) */
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		/* non-zero prefix needs at least 8 prefix bytes (length 1) */
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	/* zero-length prefix means a default route */
	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	/* zero lifetime withdraws an existing route */
	if (rt && !lifetime) {
		ip6_del_rt(net, rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
		else
			fib6_set_expires(rt, jiffies + HZ * lifetime);

		fib6_info_release(rt);
	}
	return 0;
}
888 #endif
889 
890 /*
891  *	Misc support functions
892  */
893 
/* Resolve the dst device for a lookup result; called with rcu_read_lock
 * held.  For local/anycast routes the copy must not point at an enslaved
 * device: prefer the l3mdev master, else fall back to loopback.
 */
static struct net_device *ip6_rt_get_dev_rcu(const struct fib6_result *res)
{
	struct net_device *dev = res->nh->fib_nh_dev;

	if (res->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
		/* for copies of local routes, dst->dev needs to be the
		 * device if it is a master device, the master device if
		 * device is enslaved, and the loopback as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&res->f6i->fib6_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* last case is netif_is_l3_master(dev) is true in which
		 * case we want dev returned to be dev
		 */
	}

	return dev;
}
916 
/* Map each RTN_* route type to the dst error it should report; 0 means
 * the type delivers packets normally.
 */
static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]	= 0,
	[RTN_UNICAST]	= 0,
	[RTN_LOCAL]	= 0,
	[RTN_BROADCAST]	= 0,
	[RTN_ANYCAST]	= 0,
	[RTN_MULTICAST]	= 0,
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_NAT]	= -EINVAL,
	[RTN_XRESOLVE]	= -EINVAL,
};
931 
/* Table lookup into fib6_prop[]; caller must pass a valid RTN_* value. */
static int ip6_rt_type_to_error(u8 fib6_type)
{
	return fib6_prop[fib6_type];
}
936 
937 static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
938 {
939 	unsigned short flags = 0;
940 
941 	if (rt->dst_nocount)
942 		flags |= DST_NOCOUNT;
943 	if (rt->dst_nopolicy)
944 		flags |= DST_NOPOLICY;
945 	if (rt->dst_host)
946 		flags |= DST_HOST;
947 
948 	return flags;
949 }
950 
/* Wire up input/output handlers and dst.error for a reject route:
 * blackhole drops silently, prohibit answers with adm-prohibited ICMP,
 * everything else (throw/unreachable/default) discards with ICMP
 * unreachable semantics.
 */
static void ip6_rt_init_dst_reject(struct rt6_info *rt, u8 fib6_type)
{
	rt->dst.error = ip6_rt_type_to_error(fib6_type);

	switch (fib6_type) {
	case RTN_BLACKHOLE:
		rt->dst.output = dst_discard_out;
		rt->dst.input = dst_discard;
		break;
	case RTN_PROHIBIT:
		rt->dst.output = ip6_pkt_prohibit_out;
		rt->dst.input = ip6_pkt_prohibit;
		break;
	case RTN_THROW:
	case RTN_UNREACHABLE:
	default:
		rt->dst.output = ip6_pkt_discard_out;
		rt->dst.input = ip6_pkt_discard;
		break;
	}
}
972 
/* Initialize a cloned rt6_info's dst handlers from the lookup result:
 * reject routes get their error handlers, local/anycast deliver locally,
 * multicast goes through ip6_mc_input, everything else forwards.  Also
 * attaches any lightweight-tunnel state from the nexthop.
 */
static void ip6_rt_init_dst(struct rt6_info *rt, const struct fib6_result *res)
{
	struct fib6_info *f6i = res->f6i;

	if (res->fib6_flags & RTF_REJECT) {
		ip6_rt_init_dst_reject(rt, res->fib6_type);
		return;
	}

	rt->dst.error = 0;
	rt->dst.output = ip6_output;

	if (res->fib6_type == RTN_LOCAL || res->fib6_type == RTN_ANYCAST) {
		rt->dst.input = ip6_input;
	} else if (ipv6_addr_type(&f6i->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
		rt->dst.input = ip6_mc_input;
	} else {
		rt->dst.input = ip6_forward;
	}

	if (res->nh->fib_nh_lws) {
		rt->dst.lwtstate = lwtstate_get(res->nh->fib_nh_lws);
		lwtunnel_set_redirect(&rt->dst);
	}

	rt->dst.lastuse = jiffies;
}
1000 
/* Link a cloned rt6_info back to its originating fib6_info and share its
 * metrics.  Caller must already hold a reference to @from; expiry is
 * cleared because the clone tracks the fib entry's expiry instead.
 */
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
	rt->rt6i_flags &= ~RTF_EXPIRES;
	rcu_assign_pointer(rt->from, from);
	ip_dst_metrics_init(&rt->dst, from->fib6_metrics);
}
1008 
/* Populate a freshly allocated rt6_info from a lookup result: dst
 * handlers, destination/source prefixes, idev reference, flags, gateway,
 * and the back-pointer to the fib6_info.  Caller must already hold a
 * reference to res->f6i (consumed via rt6_set_from()).
 */
static void ip6_rt_copy_init(struct rt6_info *rt, const struct fib6_result *res)
{
	const struct fib6_nh *nh = res->nh;
	const struct net_device *dev = nh->fib_nh_dev;
	struct fib6_info *f6i = res->f6i;

	ip6_rt_init_dst(rt, res);

	rt->rt6i_dst = f6i->fib6_dst;
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
	rt->rt6i_flags = res->fib6_flags;
	if (nh->fib_nh_gw_family) {
		rt->rt6i_gateway = nh->fib_nh_gw6;
		rt->rt6i_flags |= RTF_GATEWAY;
	}
	rt6_set_from(rt, f6i);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = f6i->fib6_src;
#endif
}
1030 
/* Walk back up the fib trie from @fn looking for the next node carrying
 * route info, descending into source-address subtrees on the way.
 * Returns NULL once the table root is reached.  Caller holds
 * rcu_read_lock().
 */
static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_node_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}
1048 
1049 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt)
1050 {
1051 	struct rt6_info *rt = *prt;
1052 
1053 	if (dst_hold_safe(&rt->dst))
1054 		return true;
1055 	if (net) {
1056 		rt = net->ipv6.ip6_null_entry;
1057 		dst_hold(&rt->dst);
1058 	} else {
1059 		rt = NULL;
1060 	}
1061 	*prt = rt;
1062 	return false;
1063 }
1064 
/* Clone a lookup result into a new held rt6_info; called with
 * rcu_read_lock held.  If the fib6_info is going away or allocation
 * fails, fall back to the netns null entry (with a fresh hold) so the
 * caller always gets a usable dst.
 */
static struct rt6_info *ip6_create_rt_rcu(const struct fib6_result *res)
{
	struct net_device *dev = res->nh->fib_nh_dev;
	struct fib6_info *f6i = res->f6i;
	unsigned short flags;
	struct rt6_info *nrt;

	if (!fib6_info_hold_safe(f6i))
		goto fallback;

	flags = fib6_info_dst_flags(f6i);
	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
	if (!nrt) {
		fib6_info_release(f6i);
		goto fallback;
	}

	/* reference on f6i is handed over to the clone */
	ip6_rt_copy_init(nrt, res);
	return nrt;

fallback:
	nrt = dev_net(dev)->ipv6.ip6_null_entry;
	dst_hold(&nrt->dst);
	return nrt;
}
1091 
/* Policy-routing lookup in one table: find the fib node for the flow,
 * backtracking toward the root while only the null entry matches, apply
 * device matching and multipath selection, then return either a cached
 * exception route or a fresh clone.  Always returns a held dst (the
 * null entry on failure).
 */
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct fib6_result res = {};
	struct fib6_node *fn;
	struct rt6_info *rt;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	res.f6i = rcu_dereference(fn->leaf);
	if (!res.f6i)
		res.f6i = net->ipv6.fib6_null_entry;
	else
		rt6_device_match(net, &res, &fl6->saddr, fl6->flowi6_oif,
				 flags);

	if (res.f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;

		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
		goto out;
	}

	fib6_select_path(net, &res, fl6, fl6->flowi6_oif,
			 fl6->flowi6_oif != 0, skb, flags);

	/* Search through exception table */
	rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt))
			dst_use_noref(&rt->dst, jiffies);
	} else {
		rt = ip6_create_rt_rcu(&res);
	}

out:
	trace_fib6_table_lookup(net, &res, table, fl6);

	rcu_read_unlock();

	return rt;
}
1144 
/* Public lookup entry point: run @fl6 through the fib rules engine
 * using the plain (non-caching) lookup callback.
 */
struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);
1151 
1152 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
1153 			    const struct in6_addr *saddr, int oif,
1154 			    const struct sk_buff *skb, int strict)
1155 {
1156 	struct flowi6 fl6 = {
1157 		.flowi6_oif = oif,
1158 		.daddr = *daddr,
1159 	};
1160 	struct dst_entry *dst;
1161 	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
1162 
1163 	if (saddr) {
1164 		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
1165 		flags |= RT6_LOOKUP_F_HAS_SADDR;
1166 	}
1167 
1168 	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
1169 	if (dst->error == 0)
1170 		return (struct rt6_info *) dst;
1171 
1172 	dst_release(dst);
1173 
1174 	return NULL;
1175 }
1176 EXPORT_SYMBOL(rt6_lookup);
1177 
/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason,
 * the route is released.
 * Caller must hold dst before calling it.
 */
1183 
1184 static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
1185 			struct netlink_ext_ack *extack)
1186 {
1187 	int err;
1188 	struct fib6_table *table;
1189 
1190 	table = rt->fib6_table;
1191 	spin_lock_bh(&table->tb6_lock);
1192 	err = fib6_add(&table->tb6_root, rt, info, extack);
1193 	spin_unlock_bh(&table->tb6_lock);
1194 
1195 	return err;
1196 }
1197 
/* Insert @rt into its table with default netlink info (no extack). */
int ip6_ins_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = {	.nl_net = net, };

	return __ip6_ins_rt(rt, &info, NULL);
}
1204 
/* Allocate an RTF_CACHE clone of the fib entry in @res, keyed to the
 * exact @daddr (and @saddr under subtrees).  Returns NULL if the fib
 * entry is going away or allocation fails.
 */
static struct rt6_info *ip6_rt_cache_alloc(const struct fib6_result *res,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct fib6_info *f6i = res->f6i;
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	/* pin f6i; fails if its refcount already dropped to zero */
	if (!fib6_info_hold_safe(f6i))
		return NULL;

	dev = ip6_rt_get_dev_rcu(res);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	if (!rt) {
		fib6_info_release(f6i);
		return NULL;
	}

	ip6_rt_copy_init(rt, res);
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	/* the clone is a /128 host route for this exact destination */
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(res)) {
		if (f6i->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&f6i->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}
1247 
/* Allocate a per-cpu (RTF_PCPU) dst copy of the fib entry in @res.
 * Returns NULL if the fib entry is going away or allocation fails.
 */
static struct rt6_info *ip6_rt_pcpu_alloc(const struct fib6_result *res)
{
	struct fib6_info *f6i = res->f6i;
	unsigned short flags = fib6_info_dst_flags(f6i);
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	/* pin f6i; fails if its refcount already dropped to zero */
	if (!fib6_info_hold_safe(f6i))
		return NULL;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(res);
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
	rcu_read_unlock();
	if (!pcpu_rt) {
		fib6_info_release(f6i);
		return NULL;
	}
	ip6_rt_copy_init(pcpu_rt, res);
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}
1270 
1271 /* It should be called with rcu_read_lock() acquired */
1272 static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res)
1273 {
1274 	struct rt6_info *pcpu_rt, **p;
1275 
1276 	p = this_cpu_ptr(res->f6i->rt6i_pcpu);
1277 	pcpu_rt = *p;
1278 
1279 	if (pcpu_rt)
1280 		ip6_hold_safe(NULL, &pcpu_rt);
1281 
1282 	return pcpu_rt;
1283 }
1284 
/* Allocate and publish this CPU's per-cpu dst for @res.  On allocation
 * failure the per-net null entry is returned with a reference held.
 */
static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    const struct fib6_result *res)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(res);
	if (!pcpu_rt) {
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	/* extra hold for the reference kept in the per-cpu slot */
	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(res->f6i->rt6i_pcpu);
	/* slot must be empty: the caller only gets here after
	 * rt6_get_pcpu_route() found nothing on this CPU
	 */
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	return pcpu_rt;
}
1303 
1304 /* exception hash table implementation
1305  */
1306 static DEFINE_SPINLOCK(rt6_exception_lock);
1307 
/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct fib6_info *from;
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	net->ipv6.rt6_stats->fib_rt_cache--;

	/* purge completely the exception to allow releasing the held resources:
	 * some [sk] cache may keep the dst around for unlimited time
	 */
	from = rcu_dereference_protected(rt6_ex->rt6i->from,
					 lockdep_is_held(&rt6_exception_lock));
	rcu_assign_pointer(rt6_ex->rt6i->from, NULL);
	fib6_info_release(from);
	dst_dev_put(&rt6_ex->rt6i->dst);

	hlist_del_rcu(&rt6_ex->hlist);
	/* drop the reference held by the hash table entry */
	dst_release(&rt6_ex->rt6i->dst);
	/* rt6_ex may still be reached by RCU readers; free after grace period */
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
}
1338 
1339 /* Remove oldest rt6_ex in bucket and free the memory
1340  * Caller must hold rt6_exception_lock
1341  */
1342 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1343 {
1344 	struct rt6_exception *rt6_ex, *oldest = NULL;
1345 
1346 	if (!bucket)
1347 		return;
1348 
1349 	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1350 		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1351 			oldest = rt6_ex;
1352 	}
1353 	rt6_remove_exception(bucket, oldest);
1354 }
1355 
1356 static u32 rt6_exception_hash(const struct in6_addr *dst,
1357 			      const struct in6_addr *src)
1358 {
1359 	static u32 seed __read_mostly;
1360 	u32 val;
1361 
1362 	net_get_random_once(&seed, sizeof(seed));
1363 	val = jhash(dst, sizeof(*dst), seed);
1364 
1365 #ifdef CONFIG_IPV6_SUBTREES
1366 	if (src)
1367 		val = jhash(src, sizeof(*src), val);
1368 #endif
1369 	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1370 }
1371 
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	/* advance caller's pointer from the bucket array base to the
	 * bucket selected by the hash (side effect relied on by callers)
	 */
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
1404 
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	/* advance caller's pointer from the bucket array base to the
	 * bucket selected by the hash (side effect relied on by callers)
	 */
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
1439 
1440 static unsigned int fib6_mtu(const struct fib6_result *res)
1441 {
1442 	const struct fib6_nh *nh = res->nh;
1443 	unsigned int mtu;
1444 
1445 	if (res->f6i->fib6_pmtu) {
1446 		mtu = res->f6i->fib6_pmtu;
1447 	} else {
1448 		struct net_device *dev = nh->fib_nh_dev;
1449 		struct inet6_dev *idev;
1450 
1451 		rcu_read_lock();
1452 		idev = __in6_dev_get(dev);
1453 		mtu = idev->cnf.mtu6;
1454 		rcu_read_unlock();
1455 	}
1456 
1457 	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1458 
1459 	return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
1460 }
1461 
/* Insert the RTF_CACHE route @nrt into the exception table of the fib
 * entry in @res, allocating the bucket array on first use.  An existing
 * exception for the same key is replaced, and the oldest entry is
 * evicted when a bucket grows past FIB6_MAX_DEPTH.  On success the fib
 * node's sernum is bumped so stale cached dsts get invalidated.
 * Returns 0 on success, negative errno otherwise.
 */
static int rt6_insert_exception(struct rt6_info *nrt,
				const struct fib6_result *res)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *f6i = res->f6i;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	/* rt6_flush_exceptions() set this; f6i is being deleted, so do
	 * not recreate the bucket list
	 */
	if (f6i->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(f6i->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* fib6_src.plen != 0 indicates f6i is in subtree
	 * and exception table is indexed by a hash of
	 * both fib6_dst and fib6_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only fib6_dst.
	 */
	if (f6i->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif
	/* rt6_mtu_change() might lower mtu on f6i.
	 * Only insert this exception route if its mtu
	 * is less than f6i's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(res)) {
		err = -EINVAL;
		goto out;
	}

	/* replace any existing exception for the same (daddr, saddr) */
	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&f6i->fib6_table->tb6_lock);
		fib6_update_sernum(net, f6i);
		spin_unlock_bh(&f6i->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}
1542 
/* Remove every exception route cached on @rt and mark the entry so
 * rt6_insert_exception() cannot repopulate it (called when the fib
 * entry is going away).
 */
void rt6_flush_exceptions(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() to recreate the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		/* every removal decremented depth; it must be zero now */
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}
1569 
/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 *
 * Returns the cached route (no reference taken) when a non-expired
 * exception exists for (daddr, saddr), NULL otherwise.
 */
static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *ret = NULL;

	bucket = rcu_dereference(res->f6i->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* fib6_src.plen != 0 indicates f6i is in subtree
	 * and exception table is indexed by a hash of
	 * both fib6_dst and fib6_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only fib6_dst.
	 */
	if (res->f6i->fib6_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		ret = rt6_ex->rt6i;

	return ret;
}
1601 
/* Remove the passed in cached rt from the hash table that contains it */
/* Returns 0 when found and removed, -EINVAL when @rt is not a cache
 * route (or its fib entry is gone), -ENOENT when no matching exception
 * exists.
 */
static int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;
	int err;

	from = rcu_dereference(rt->from);
	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}
1645 
/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	/* only cache routes linked to a live fib entry can be refreshed */
	if (!from || !(rt->rt6i_flags & RTF_CACHE))
		goto unlock;

	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	/* touching the timestamp keeps the entry from being aged out */
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

unlock:
	rcu_read_unlock();
}
1682 
1683 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1684 					 struct rt6_info *rt, int mtu)
1685 {
1686 	/* If the new MTU is lower than the route PMTU, this new MTU will be the
1687 	 * lowest MTU in the path: always allow updating the route PMTU to
1688 	 * reflect PMTU decreases.
1689 	 *
1690 	 * If the new MTU is higher, and the route PMTU is equal to the local
1691 	 * MTU, this means the old MTU is the lowest in the path, so allow
1692 	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
1693 	 * handle this.
1694 	 */
1695 
1696 	if (dst_mtu(&rt->dst) >= mtu)
1697 		return true;
1698 
1699 	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
1700 		return true;
1701 
1702 	return false;
1703 }
1704 
/* Propagate a device MTU change to every exception route cached on
 * @rt, when rt6_mtu_change_route_allowed() permits it.
 * Caller must hold rt6_exception_lock.
 */
static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct fib6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
		}
		bucket++;
	}
}
1733 
1734 #define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)
1735 
/* Drop all cached gateway exceptions on @rt whose gateway matches
 * @gateway (used when a router becomes unreachable).
 */
static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	/* cheap lockless check before taking the exception lock */
	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				     lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				/* match only routes that are both cached
				 * and gatewayed, via the given gateway
				 */
				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}
1770 
/* GC helper: decide whether one exception entry should be removed now
 * (aged out, expired, or pointing at a gateway that is no longer a
 * router); otherwise count it in gc_args->more so GC keeps running.
 * Caller must hold rt6_exception_lock.
 */
static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non gateway exceptions
	 * even if others have still references to them, so that on next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
			neigh_flags = neigh->flags;

		/* gateway stopped advertising itself as a router */
		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}
1814 
/* Walk all exception buckets of @rt and age out stale entries via
 * rt6_age_examine_exception().  Takes rcu_read_lock_bh() because
 * __ipv6_neigh_lookup_noref() in the examine path requires it.
 */
void rt6_age_exceptions(struct fib6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	/* cheap lockless check before taking the locks */
	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}
1845 
/* must be called with rcu lock held */
/* Core table lookup: find the best fib entry for @fl6 in @table and
 * fill @res.  Backtracks toward less specific prefixes when nothing
 * matches, and retries once without RT6_LOOKUP_F_REACHABLE so an
 * unreachable route is still better than none.  Always returns 0.
 */
int fib6_table_lookup(struct net *net, struct fib6_table *table, int oif,
		      struct flowi6 *fl6, struct fib6_result *res, int strict)
{
	struct fib6_node *fn, *saved_fn;

	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	/* VRF/l3mdev already chose the device; don't force oif match */
	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	rt6_select(net, fn, oif, res, strict);
	if (res->f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	trace_fib6_table_lookup(net, res, table, fl6);

	return 0;
}
1876 
/* Full policy-routing lookup used by the input and output paths.
 * Resolves @fl6 in @table and returns a referenced dst, preferring in
 * order: a cached exception route, an uncached RTF_CACHE clone (for
 * the FLOWI_FLAG_KNOWN_NH no-gateway case), or a per-cpu route copy.
 */
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_result res = {};
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	/* hosts (non-forwarding) prefer reachable routers per RFC 4861 */
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	fib6_table_lookup(net, table, oif, fl6, &res, strict);
	if (res.f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		rcu_read_unlock();
		dst_hold(&rt->dst);
		return rt;
	}

	fib6_select_path(net, &res, fl6, oif, false, skb, strict);

	/* Search through exception table */
	rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt))
			dst_use_noref(&rt->dst, jiffies);

		rcu_read_unlock();
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !res.nh->fib_nh_gw_family)) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */
		struct rt6_info *uncached_rt;

		uncached_rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL);

		rcu_read_unlock();

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		return uncached_rt;
	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		/* bh disabled so the per-cpu slot isn't raced by softirq */
		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(&res);

		if (!pcpu_rt)
			pcpu_rt = rt6_make_pcpu_route(net, &res);

		local_bh_enable();
		rcu_read_unlock();

		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);
1953 
/* Policy lookup callback for the input path: use the incoming
 * interface (iif) as the oif constraint.
 */
static struct rt6_info *ip6_pol_route_input(struct net *net,
					    struct fib6_table *table,
					    struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
}
1962 
1963 struct dst_entry *ip6_route_input_lookup(struct net *net,
1964 					 struct net_device *dev,
1965 					 struct flowi6 *fl6,
1966 					 const struct sk_buff *skb,
1967 					 int flags)
1968 {
1969 	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1970 		flags |= RT6_LOOKUP_F_IFACE;
1971 
1972 	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
1973 }
1974 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1975 
/* Fill @keys with the L3 fields used for multipath hashing.  For ICMPv6
 * error messages the hash is computed over the embedded (offending)
 * inner header, so errors follow the same path as the original flow;
 * otherwise pre-dissected @flkeys are used when available.
 */
static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys,
				  struct flow_keys *flkeys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	struct flow_keys *_flkeys = flkeys;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;
	struct icmp6hdr _icmph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
				   sizeof(_icmph), &_icmph);
	if (!icmph)
		goto out;

	/* only error messages embed the original packet's header */
	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	/* hash on the inner header; pre-dissected keys describe the
	 * outer packet and no longer apply
	 */
	key_iph = inner_iph;
	_flkeys = NULL;
out:
	if (_flkeys) {
		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
		keys->tags.flow_label = _flkeys->tags.flow_label;
		keys->basic.ip_proto = _flkeys->basic.ip_proto;
	} else {
		keys->addrs.v6addrs.src = key_iph->saddr;
		keys->addrs.v6addrs.dst = key_iph->daddr;
		keys->tags.flow_label = ip6_flowlabel(key_iph);
		keys->basic.ip_proto = key_iph->nexthdr;
	}
}
2023 
2024 /* if skb is set it will be used and fl6 can be NULL */
2025 u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
2026 		       const struct sk_buff *skb, struct flow_keys *flkeys)
2027 {
2028 	struct flow_keys hash_keys;
2029 	u32 mhash;
2030 
2031 	switch (ip6_multipath_hash_policy(net)) {
2032 	case 0:
2033 		memset(&hash_keys, 0, sizeof(hash_keys));
2034 		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2035 		if (skb) {
2036 			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
2037 		} else {
2038 			hash_keys.addrs.v6addrs.src = fl6->saddr;
2039 			hash_keys.addrs.v6addrs.dst = fl6->daddr;
2040 			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
2041 			hash_keys.basic.ip_proto = fl6->flowi6_proto;
2042 		}
2043 		break;
2044 	case 1:
2045 		if (skb) {
2046 			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
2047 			struct flow_keys keys;
2048 
2049 			/* short-circuit if we already have L4 hash present */
2050 			if (skb->l4_hash)
2051 				return skb_get_hash_raw(skb) >> 1;
2052 
2053 			memset(&hash_keys, 0, sizeof(hash_keys));
2054 
2055                         if (!flkeys) {
2056 				skb_flow_dissect_flow_keys(skb, &keys, flag);
2057 				flkeys = &keys;
2058 			}
2059 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2060 			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
2061 			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
2062 			hash_keys.ports.src = flkeys->ports.src;
2063 			hash_keys.ports.dst = flkeys->ports.dst;
2064 			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2065 		} else {
2066 			memset(&hash_keys, 0, sizeof(hash_keys));
2067 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2068 			hash_keys.addrs.v6addrs.src = fl6->saddr;
2069 			hash_keys.addrs.v6addrs.dst = fl6->daddr;
2070 			hash_keys.ports.src = fl6->fl6_sport;
2071 			hash_keys.ports.dst = fl6->fl6_dport;
2072 			hash_keys.basic.ip_proto = fl6->flowi6_proto;
2073 		}
2074 		break;
2075 	}
2076 	mhash = flow_hash_from_keys(&hash_keys);
2077 
2078 	return mhash >> 1;
2079 }
2080 
/* Input-path entry point: build a flow from the packet headers, run
 * the route lookup, and attach the resulting dst to @skb.
 */
void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};
	struct flow_keys *flkeys = NULL, _flkeys;

	/* collected-metadata tunnels route on the tunnel id */
	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;

	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
		flkeys = &_flkeys;

	/* ICMPv6 errors must hash like the flow they refer to so they
	 * follow the same multipath leg
	 */
	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
	skb_dst_drop(skb);
	skb_dst_set(skb,
		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
}
2110 
/* Policy lookup callback for the output path: use the requested
 * outgoing interface (oif) as the constraint.
 */
static struct rt6_info *ip6_pol_route_output(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
}
2119 
/* Output-path route lookup.  Handles l3mdev link-scope destinations
 * first, then translates socket/flow state into lookup flags before
 * running the fib rules engine.
 */
struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
					 struct flowi6 *fl6, int flags)
{
	bool any_src;

	/* link-local/multicast on a VRF slave is scoped to the l3mdev */
	if (ipv6_addr_type(&fl6->daddr) &
	    (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
		struct dst_entry *dst;

		dst = l3mdev_link_scope_lookup(net, fl6);
		if (dst)
			return dst;
	}

	fl6->flowi6_iif = LOOPBACK_IFINDEX;

	any_src = ipv6_addr_any(&fl6->saddr);
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
	    (fl6->flowi6_oif && any_src))
		flags |= RT6_LOOKUP_F_IFACE;

	if (!any_src)
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	else if (sk)
		/* saddr unspecified: apply the socket's source
		 * address-selection preferences
		 */
		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);

	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);
2149 
/* Replace @dst_orig with a blackhole dst that silently discards all
 * traffic while keeping the original's metrics and addressing (used
 * e.g. while an xfrm state is being resolved).  Consumes a reference
 * on @dst_orig.  Returns the new dst or ERR_PTR(-ENOMEM).
 */
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct net_device *loopback_dev = net->loopback_dev;
	struct dst_entry *new = NULL;

	/* DST_OBSOLETE_DEAD makes every dst_check() fail, forcing
	 * callers to relookup once the blackhole period is over
	 */
	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
		       DST_OBSOLETE_DEAD, 0);
	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);

		new = &rt->dst;
		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);

		rt->rt6i_idev = in6_dev_get(loopback_dev);
		rt->rt6i_gateway = ort->rt6i_gateway;
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
	}

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}
2182 
2183 /*
2184  *	Destination cache support functions
2185  */
2186 
2187 static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2188 {
2189 	u32 rt_cookie = 0;
2190 
2191 	if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2192 		return false;
2193 
2194 	if (fib6_check_expired(f6i))
2195 		return false;
2196 
2197 	return true;
2198 }
2199 
2200 static struct dst_entry *rt6_check(struct rt6_info *rt,
2201 				   struct fib6_info *from,
2202 				   u32 cookie)
2203 {
2204 	u32 rt_cookie = 0;
2205 
2206 	if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
2207 	    rt_cookie != cookie)
2208 		return NULL;
2209 
2210 	if (rt6_check_expired(rt))
2211 		return NULL;
2212 
2213 	return &rt->dst;
2214 }
2215 
2216 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2217 					    struct fib6_info *from,
2218 					    u32 cookie)
2219 {
2220 	if (!__rt6_check_expired(rt) &&
2221 	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2222 	    fib6_check(from, cookie))
2223 		return &rt->dst;
2224 	else
2225 		return NULL;
2226 }
2227 
/* dst_ops->check handler: revalidate a cached IPv6 dst against
 * @cookie.  Returns @dst if still valid, NULL if the caller must
 * perform a fresh route lookup.
 */
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct dst_entry *dst_ret;
	struct fib6_info *from;
	struct rt6_info *rt;

	rt = container_of(dst, struct rt6_info, dst);

	rcu_read_lock();

	/* All IPV6 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 */

	from = rcu_dereference(rt->from);

	/* Per-cpu clones and uncached entries are validated against the
	 * fib6 entry they came from; everything else goes through the
	 * plain cookie check.
	 */
	if (from && (rt->rt6i_flags & RTF_PCPU ||
	    unlikely(!list_empty(&rt->rt6i_uncached))))
		dst_ret = rt6_dst_from_check(rt, from, cookie);
	else
		dst_ret = rt6_check(rt, from, cookie);

	rcu_read_unlock();

	return dst_ret;
}
2255 
/* dst_ops->negative_advice handler: a socket reports its cached dst
 * as unusable.  Returns the dst the caller should keep, or NULL when
 * it was dropped/invalidated.
 */
static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *) dst;

	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			/* Exception-table entries are only removed once
			 * they have actually expired.
			 */
			rcu_read_lock();
			if (rt6_check_expired(rt)) {
				rt6_remove_exception_rt(rt);
				dst = NULL;
			}
			rcu_read_unlock();
		} else {
			/* Drop the socket's reference so the next send
			 * does a fresh lookup.
			 */
			dst_release(dst);
			dst = NULL;
		}
	}
	return dst;
}
2275 
/* dst_ops->link_failure handler: delivery over this route failed.
 * Send an address-unreachable ICMPv6 error back to the sender and
 * invalidate the route that was used.
 */
static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
	if (rt) {
		rcu_read_lock();
		if (rt->rt6i_flags & RTF_CACHE) {
			rt6_remove_exception_rt(rt);
		} else {
			struct fib6_info *from;
			struct fib6_node *fn;

			/* For a default route, force the fib node's serial
			 * number out of sync so cached dsts fail validation.
			 * NOTE(review): plain write to fn_sernum may race
			 * with lockless readers — confirm whether a
			 * WRITE_ONCE() annotation is warranted here.
			 */
			from = rcu_dereference(rt->from);
			if (from) {
				fn = rcu_dereference(from->fib6_node);
				if (fn && (rt->rt6i_flags & RTF_DEFAULT))
					fn->fn_sernum = -1;
			}
		}
		rcu_read_unlock();
	}
}
2301 
/* Arm the expiry timer on @rt0 for @timeout jiffies from now.  If the
 * dst did not already carry RTF_EXPIRES, first inherit the expiry of
 * the fib6 entry it was cloned from.
 */
static void rt6_update_expires(struct rt6_info *rt0, int timeout)
{
	if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
		struct fib6_info *from;

		rcu_read_lock();
		from = rcu_dereference(rt0->from);
		if (from)
			rt0->dst.expires = from->expires;
		rcu_read_unlock();
	}

	dst_set_expires(&rt0->dst, timeout);
	rt0->rt6i_flags |= RTF_EXPIRES;
}
2317 
/* Record a new path MTU on @rt and schedule it to expire after the
 * ip6_rt_mtu_expires sysctl interval.
 */
static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
{
	struct net *net = dev_net(rt->dst.dev);

	dst_metric_set(&rt->dst, RTAX_MTU, mtu);
	rt->rt6i_flags |= RTF_MODIFIED;
	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
}
2326 
2327 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2328 {
2329 	return !(rt->rt6i_flags & RTF_CACHE) &&
2330 		(rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from));
2331 }
2332 
/* Core path-MTU update: record @mtu against the route used for
 * @iph/@sk.  When the dst tracks a fib6 origin, the reduced MTU is
 * confined to this destination by inserting a cloned route into the
 * exception table.
 */
static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu)
{
	const struct in6_addr *daddr, *saddr;
	struct rt6_info *rt6 = (struct rt6_info *)dst;

	/* Respect a locked (administratively fixed) MTU metric. */
	if (dst_metric_locked(dst, RTAX_MTU))
		return;

	/* Key addresses come from the packet header when available,
	 * else from the socket; with neither, no exception can be keyed.
	 */
	if (iph) {
		daddr = &iph->daddr;
		saddr = &iph->saddr;
	} else if (sk) {
		daddr = &sk->sk_v6_daddr;
		saddr = &inet6_sk(sk)->saddr;
	} else {
		daddr = NULL;
		saddr = NULL;
	}
	dst_confirm_neigh(dst, daddr);
	/* Clamp to the IPv6 minimum and only ever shrink the path MTU. */
	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
	if (mtu >= dst_mtu(dst))
		return;

	if (!rt6_cache_allowed_for_pmtu(rt6)) {
		rt6_do_update_pmtu(rt6, mtu);
		/* update rt6_ex->stamp for cache */
		if (rt6->rt6i_flags & RTF_CACHE)
			rt6_update_exception_stamp_rt(rt6);
	} else if (daddr) {
		struct fib6_result res = {};
		struct rt6_info *nrt6;

		rcu_read_lock();
		res.f6i = rcu_dereference(rt6->from);
		if (!res.f6i) {
			rcu_read_unlock();
			return;
		}
		res.nh = &res.f6i->fib6_nh;
		res.fib6_flags = res.f6i->fib6_flags;
		res.fib6_type = res.f6i->fib6_type;

		/* Clone a cache route for daddr/saddr and insert it into
		 * the exception table; on failure drop the clone at once.
		 */
		nrt6 = ip6_rt_cache_alloc(&res, daddr, saddr);
		if (nrt6) {
			rt6_do_update_pmtu(nrt6, mtu);
			if (rt6_insert_exception(nrt6, &res))
				dst_release_immediate(&nrt6->dst);
		}
		rcu_read_unlock();
	}
}
2385 
2386 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2387 			       struct sk_buff *skb, u32 mtu)
2388 {
2389 	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2390 }
2391 
/* Update the path MTU towards the destination of the packet carried
 * in @skb, performing a full route lookup keyed on the packet's
 * addresses and flow label.  @mtu is in network byte order.
 */
void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
		     int oif, u32 mark, kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark),
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_uid = uid,
	};

	dst = ip6_route_output(net, NULL, &fl6);
	if (!dst->error)
		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2412 
/* Socket-context PMTU update: like ip6_update_pmtu() but takes oif,
 * mark and uid from @sk, then refreshes the socket's cached dst when
 * it no longer passes ->check().
 */
void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
	int oif = sk->sk_bound_dev_if;
	struct dst_entry *dst;

	/* Unbound socket: fall back to the L3 master of the ingress dev */
	if (!oif && skb->dev)
		oif = l3mdev_master_ifindex(skb->dev);

	ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid);

	dst = __sk_dst_get(sk);
	if (!dst || !dst->obsolete ||
	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
		return;

	/* Cached dst is stale: re-resolve, but only if the socket is not
	 * currently owned by user context.
	 */
	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		ip6_datagram_dst_update(sk, false);
	bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2434 
/* Store @dst on @sk via ip6_dst_store().  The destination (and, with
 * CONFIG_IPV6_SUBTREES, the source) address pointer is only forwarded
 * when the flow's address matches the socket's.
 */
void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
			   const struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_SUBTREES
	struct ipv6_pinfo *np = inet6_sk(sk);
#endif

	/* note: without subtrees the saddr argument collapses to NULL */
	ip6_dst_store(sk, dst,
		      ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
		      &sk->sk_v6_daddr : NULL,
#ifdef CONFIG_IPV6_SUBTREES
		      ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
		      &np->saddr :
#endif
		      NULL);
}
2451 
/* Check whether nexthop @res->nh can be the router that sent a
 * redirect with gateway @gw for flow @fl6.  On a match found through
 * the exception table, *@ret is set to the cached route (no reference
 * is taken here; the caller holds it).
 */
static bool ip6_redirect_nh_match(const struct fib6_result *res,
				  struct flowi6 *fl6,
				  const struct in6_addr *gw,
				  struct rt6_info **ret)
{
	const struct fib6_nh *nh = res->nh;

	/* nexthop must be alive, have a gateway and match the flow's oif */
	if (nh->fib_nh_flags & RTNH_F_DEAD || !nh->fib_nh_gw_family ||
	    fl6->flowi6_oif != nh->fib_nh_dev->ifindex)
		return false;

	/* rt_cache's gateway might be different from its 'parent'
	 * in the case of an ip redirect.
	 * So we keep searching in the exception table if the gateway
	 * is different.
	 */
	if (!ipv6_addr_equal(gw, &nh->fib_nh_gw6)) {
		struct rt6_info *rt_cache;

		rt_cache = rt6_find_cached_rt(res, &fl6->daddr, &fl6->saddr);
		if (rt_cache &&
		    ipv6_addr_equal(gw, &rt_cache->rt6i_gateway)) {
			*ret = rt_cache;
			return true;
		}
		return false;
	}
	return true;
}
2481 
/* Handle redirects */
struct ip6rd_flowi {
	struct flowi6 fl6;		/* must be first: __ip6_route_redirect
					 * casts the flowi6 pointer back to
					 * this struct
					 */
	struct in6_addr gateway;	/* gateway from the received redirect */
};
2487 
/* fib6_rule_lookup() handler used when processing a received redirect:
 * find the route the redirect applies to, accepting it only from a
 * plausible nexthop router (see RFC 4861 note below).
 */
static struct rt6_info *__ip6_route_redirect(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
	struct rt6_info *ret = NULL;
	struct fib6_result res = {};
	struct fib6_info *rt;	/* cursor assigned by for_each_fib6_node_rt_rcu */
	struct fib6_node *fn;

	/* Get the "current" route for this destination and
	 * check if the redirect has come from appropriate router.
	 *
	 * RFC 4861 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routes.
	 */

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	for_each_fib6_node_rt_rcu(fn) {
		res.f6i = rt;
		res.nh = &rt->fib6_nh;

		if (fib6_check_expired(rt))
			continue;
		if (rt->fib6_flags & RTF_REJECT)
			break;
		if (ip6_redirect_nh_match(&res, fl6, &rdfl->gateway, &ret))
			goto out;
	}

	/* rt is NULL when the iteration found no candidate at all */
	if (!rt)
		rt = net->ipv6.fib6_null_entry;
	else if (rt->fib6_flags & RTF_REJECT) {
		ret = net->ipv6.ip6_null_entry;
		goto out;
	}

	if (rt == net->ipv6.fib6_null_entry) {
		/* retry from a less specific fib node */
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

	res.f6i = rt;
	res.nh = &rt->fib6_nh;
out:
	if (ret) {
		ip6_hold_safe(net, &ret);
	} else {
		/* no exact match: hand back an rcu clone of the best
		 * candidate (possibly the null entry)
		 */
		res.fib6_flags = res.f6i->fib6_flags;
		res.fib6_type = res.f6i->fib6_type;
		ret = ip6_create_rt_rcu(&res);
	}

	rcu_read_unlock();

	trace_fib6_table_lookup(net, &res, table, fl6);
	return ret;
};
2554 
2555 static struct dst_entry *ip6_route_redirect(struct net *net,
2556 					    const struct flowi6 *fl6,
2557 					    const struct sk_buff *skb,
2558 					    const struct in6_addr *gateway)
2559 {
2560 	int flags = RT6_LOOKUP_F_HAS_SADDR;
2561 	struct ip6rd_flowi rdfl;
2562 
2563 	rdfl.fl6 = *fl6;
2564 	rdfl.gateway = *gateway;
2565 
2566 	return fib6_rule_lookup(net, &rdfl.fl6, skb,
2567 				flags, __ip6_route_redirect);
2568 }
2569 
/* Process a received ICMPv6 redirect for the packet in @skb: look up
 * the affected route (keyed on the embedded IPv6 header) and apply
 * the redirect via rt6_do_redirect().
 */
void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
		  kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6 = {
		.flowi6_iif = LOOPBACK_IFINDEX,
		.flowi6_oif = oif,
		.flowi6_mark = mark,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_uid = uid,
	};

	dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_redirect);
2590 
/* Variant of ip6_redirect() for redirects whose payload does not carry
 * the offending packet header: the lookup is keyed on the rd_msg
 * destination instead.
 */
void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
	struct dst_entry *dst;
	struct flowi6 fl6 = {
		.flowi6_iif = LOOPBACK_IFINDEX,
		.flowi6_oif = oif,
		.daddr = msg->dest,
		/* NOTE(review): saddr is deliberately iph->daddr — the
		 * address the redirect was sent to; confirm against
		 * rt6_do_redirect's expectations.
		 */
		.saddr = iph->daddr,
		.flowi6_uid = sock_net_uid(net, NULL),
	};

	dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}
2608 
2609 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2610 {
2611 	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2612 		     sk->sk_uid);
2613 }
2614 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2615 
2616 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2617 {
2618 	struct net_device *dev = dst->dev;
2619 	unsigned int mtu = dst_mtu(dst);
2620 	struct net *net = dev_net(dev);
2621 
2622 	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2623 
2624 	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2625 		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2626 
2627 	/*
2628 	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2629 	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2630 	 * IPV6_MAXPLEN is also valid and means: "any MSS,
2631 	 * rely only on pmtu discovery"
2632 	 */
2633 	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2634 		mtu = IPV6_MAXPLEN;
2635 	return mtu;
2636 }
2637 
2638 static unsigned int ip6_mtu(const struct dst_entry *dst)
2639 {
2640 	struct inet6_dev *idev;
2641 	unsigned int mtu;
2642 
2643 	mtu = dst_metric_raw(dst, RTAX_MTU);
2644 	if (mtu)
2645 		goto out;
2646 
2647 	mtu = IPV6_MIN_MTU;
2648 
2649 	rcu_read_lock();
2650 	idev = __in6_dev_get(dst->dev);
2651 	if (idev)
2652 		mtu = idev->cnf.mtu6;
2653 	rcu_read_unlock();
2654 
2655 out:
2656 	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2657 
2658 	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2659 }
2660 
/* MTU selection:
 * 1. mtu on route is locked - use it
 * 2. mtu from nexthop exception
 * 3. mtu from egress device
 *
 * based on ip6_dst_mtu_forward and exception logic of
 * rt6_find_cached_rt; called with rcu_read_lock
 */
u32 ip6_mtu_from_fib6(const struct fib6_result *res,
		      const struct in6_addr *daddr,
		      const struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	const struct fib6_nh *nh = res->nh;
	struct fib6_info *f6i = res->f6i;
	const struct in6_addr *src_key;
	struct rt6_exception *rt6_ex;
	struct inet6_dev *idev;
	u32 mtu = 0;

	/* Case 1: administratively locked MTU on the route itself. */
	if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
		mtu = f6i->fib6_pmtu;
		if (mtu)
			goto out;
	}

	/* Case 2: per-destination exception entry (keyed on saddr too
	 * when the route has a source prefix).
	 */
	src_key = NULL;
#ifdef CONFIG_IPV6_SUBTREES
	if (f6i->fib6_src.plen)
		src_key = saddr;
#endif

	bucket = rcu_dereference(f6i->rt6i_exception_bucket);
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU);

	/* Case 3: egress device MTU, floored at IPV6_MIN_MTU. */
	if (likely(!mtu)) {
		struct net_device *dev = nh->fib_nh_dev;

		mtu = IPV6_MIN_MTU;
		idev = __in6_dev_get(dev);
		if (idev && idev->cnf.mtu6 > mtu)
			mtu = idev->cnf.mtu6;
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
out:
	return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
}
2711 
/* Allocate a standalone (uncached) dst for sending an ICMPv6 packet
 * described by @fl6 out of @dev.  Returns the dst (possibly wrapped
 * by xfrm_lookup) or an ERR_PTR; the caller owns the reference.
 */
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
				  struct flowi6 *fl6)
{
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct inet6_dev *idev = in6_dev_get(dev);
	struct net *net = dev_net(dev);

	if (unlikely(!idev))
		return ERR_PTR(-ENODEV);

	rt = ip6_dst_alloc(net, dev, 0);
	if (unlikely(!rt)) {
		in6_dev_put(idev);
		dst = ERR_PTR(-ENOMEM);
		goto out;
	}

	rt->dst.flags |= DST_HOST;
	rt->dst.input = ip6_input;
	rt->dst.output  = ip6_output;
	rt->rt6i_gateway  = fl6->daddr;
	rt->rt6i_dst.addr = fl6->daddr;
	rt->rt6i_dst.plen = 128;
	rt->rt6i_idev     = idev;	/* ownership of the idev ref moves here */
	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);

	/* Add this dst into uncached_list so that rt6_disable_ip() can
	 * do proper release of the net_device
	 */
	rt6_uncached_list_add(rt);
	atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);

	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);

out:
	return dst;
}
2750 
/* dst_ops->gc handler: run the fib6 garbage collector when the dst
 * entry count exceeds ip6_rt_max_size and the minimum GC interval has
 * elapsed.  Returns nonzero while still over the limit.
 */
static int ip6_dst_gc(struct dst_ops *ops)
{
	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
	int entries;

	entries = dst_entries_get_fast(ops);
	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
	    entries <= rt_max_size)
		goto out;

	/* gc_expire grows each unsuccessful round (more aggressive GC)
	 * and is reset once the count drops below the threshold.
	 * NOTE(review): plain increment of a shared counter — confirm
	 * whether concurrent gc invocations need an atomic here.
	 */
	net->ipv6.ip6_rt_gc_expire++;
	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
	entries = dst_entries_get_slow(ops);
	if (entries < ops->gc_thresh)
		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
out:
	/* decay gc_expire by the elasticity factor */
	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
	return entries > rt_max_size;
}
2775 
/* Look up gateway @gw_addr in table @tbid only (bypassing fib rules),
 * for nexthop validation.  Returns a held rt6_info, or NULL when the
 * table does not exist or the lookup fell through to the null entry.
 */
static struct rt6_info *ip6_nh_lookup_table(struct net *net,
					    struct fib6_config *cfg,
					    const struct in6_addr *gw_addr,
					    u32 tbid, int flags)
{
	struct flowi6 fl6 = {
		.flowi6_oif = cfg->fc_ifindex,
		.daddr = *gw_addr,
		.saddr = cfg->fc_prefsrc,
	};
	struct fib6_table *table;
	struct rt6_info *rt;

	table = fib6_get_table(net, tbid);
	if (!table)
		return NULL;

	if (!ipv6_addr_any(&cfg->fc_prefsrc))
		flags |= RT6_LOOKUP_F_HAS_SADDR;

	flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
	rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);

	/* if table lookup failed, fall back to full lookup */
	if (rt == net->ipv6.ip6_null_entry) {
		ip6_rt_put(rt);
		rt = NULL;
	}

	return rt;
}
2807 
/* Validate the gateway of a route configured with RTNH_F_ONLINK: the
 * gateway must not resolve (other than via the default route) to a
 * local/anycast/reject route or to a device different from @dev.
 * Returns 0 when acceptable, -EINVAL otherwise (with extack set).
 */
static int ip6_route_check_nh_onlink(struct net *net,
				     struct fib6_config *cfg,
				     const struct net_device *dev,
				     struct netlink_ext_ack *extack)
{
	u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
	struct fib6_info *from;
	struct rt6_info *grt;
	int err;

	err = 0;
	grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
	if (grt) {
		rcu_read_lock();
		from = rcu_dereference(grt->from);
		if (!grt->dst.error &&
		    /* ignore match if it is the default route */
		    from && !ipv6_addr_any(&from->fib6_dst.addr) &&
		    (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop has invalid gateway or device mismatch");
			err = -EINVAL;
		}
		rcu_read_unlock();

		ip6_rt_put(grt);
	}

	return err;
}
2840 
/* Resolve and validate the nexthop gateway for a route without
 * RTNH_F_ONLINK.  The gateway must resolve to a directly connected
 * route (no RTF_GATEWAY) on the given device; when no device was
 * specified, *_dev/*idev are filled in from the lookup with
 * references held.  Returns 0 on success, -EHOSTUNREACH otherwise.
 */
static int ip6_route_check_nh(struct net *net,
			      struct fib6_config *cfg,
			      struct net_device **_dev,
			      struct inet6_dev **idev)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	struct net_device *dev = _dev ? *_dev : NULL;
	struct rt6_info *grt = NULL;
	int err = -EHOSTUNREACH;

	/* first try the route's own table, if one was given */
	if (cfg->fc_table) {
		int flags = RT6_LOOKUP_F_IFACE;

		grt = ip6_nh_lookup_table(net, cfg, gw_addr,
					  cfg->fc_table, flags);
		if (grt) {
			/* reject a table match that itself needs a gateway
			 * or sits on a different device
			 */
			if (grt->rt6i_flags & RTF_GATEWAY ||
			    (dev && dev != grt->dst.dev)) {
				ip6_rt_put(grt);
				grt = NULL;
			}
		}
	}

	if (!grt)
		grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);

	if (!grt)
		goto out;

	if (dev) {
		if (dev != grt->dst.dev) {
			ip6_rt_put(grt);
			goto out;
		}
	} else {
		/* adopt device/idev from the resolved route */
		*_dev = dev = grt->dst.dev;
		*idev = grt->rt6i_idev;
		dev_hold(dev);
		in6_dev_hold(grt->rt6i_idev);
	}

	if (!(grt->rt6i_flags & RTF_GATEWAY))
		err = 0;

	ip6_rt_put(grt);

out:
	return err;
}
2891 
/* Validate the gateway of a new route and resolve its egress device:
 * the gateway must not be a local address and, unless it is link-local
 * unicast, must pass the onlink or nexthop reachability checks.  May
 * update *_dev/*idev (with references held) via ip6_route_check_nh().
 */
static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
			   struct net_device **_dev, struct inet6_dev **idev,
			   struct netlink_ext_ack *extack)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	int gwa_type = ipv6_addr_type(gw_addr);
	bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
	const struct net_device *dev = *_dev;
	bool need_addr_check = !dev;
	int err = -EINVAL;

	/* if gw_addr is local we will fail to detect this in case
	 * address is still TENTATIVE (DAD in progress). rt6_lookup()
	 * will return already-added prefix route via interface that
	 * prefix route was assigned to, which might be non-loopback.
	 */
	if (dev &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
		/* IPv6 strictly inhibits using not link-local
		 * addresses as nexthop address.
		 * Otherwise, router will not able to send redirects.
		 * It is very good, but in some (rare!) circumstances
		 * (SIT, PtP, NBMA NOARP links) it is handy to allow
		 * some exceptions. --ANK
		 * We allow IPv4-mapped nexthops to support RFC4798-type
		 * addressing
		 */
		if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
			NL_SET_ERR_MSG(extack, "Invalid gateway address");
			goto out;
		}

		if (cfg->fc_flags & RTNH_F_ONLINK)
			err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
		else
			err = ip6_route_check_nh(net, cfg, _dev, idev);

		if (err)
			goto out;
	}

	/* reload in case device was changed */
	dev = *_dev;

	err = -EINVAL;
	if (!dev) {
		NL_SET_ERR_MSG(extack, "Egress device not specified");
		goto out;
	} else if (dev->flags & IFF_LOOPBACK) {
		NL_SET_ERR_MSG(extack,
			       "Egress device can not be loopback device for this route");
		goto out;
	}

	/* if we did not check gw_addr above, do so now that the
	 * egress device has been resolved.
	 */
	if (need_addr_check &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	err = 0;
out:
	return err;
}
2964 
2965 static bool fib6_is_reject(u32 flags, struct net_device *dev, int addr_type)
2966 {
2967 	if ((flags & RTF_REJECT) ||
2968 	    (dev && (dev->flags & IFF_LOOPBACK) &&
2969 	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
2970 	     !(flags & RTF_LOCAL)))
2971 		return true;
2972 
2973 	return false;
2974 }
2975 
/* Initialize @fib6_nh from route config @cfg: resolve and hold the
 * egress device/idev, validate the gateway if any, set nexthop flags
 * and lwtunnel encap state.  On success the device reference is
 * transferred to @fib6_nh; on error everything taken here is released.
 */
int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
		 struct fib6_config *cfg, gfp_t gfp_flags,
		 struct netlink_ext_ack *extack)
{
	struct net_device *dev = NULL;
	struct inet6_dev *idev = NULL;
	int addr_type;
	int err;

	fib6_nh->fib_nh_family = AF_INET6;

	err = -ENODEV;
	if (cfg->fc_ifindex) {
		dev = dev_get_by_index(net, cfg->fc_ifindex);
		if (!dev)
			goto out;
		idev = in6_dev_get(dev);
		if (!idev)
			goto out;
	}

	if (cfg->fc_flags & RTNH_F_ONLINK) {
		/* onlink is meaningless without an explicit device */
		if (!dev) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop device required for onlink");
			goto out;
		}

		if (!(dev->flags & IFF_UP)) {
			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
			err = -ENETDOWN;
			goto out;
		}

		fib6_nh->fib_nh_flags |= RTNH_F_ONLINK;
	}

	fib6_nh->fib_nh_weight = 1;

	/* We cannot add true routes via loopback here,
	 * they would result in kernel looping; promote them to reject routes
	 */
	addr_type = ipv6_addr_type(&cfg->fc_dst);
	if (fib6_is_reject(cfg->fc_flags, dev, addr_type)) {
		/* hold loopback dev/idev if we haven't done so. */
		if (dev != net->loopback_dev) {
			if (dev) {
				dev_put(dev);
				in6_dev_put(idev);
			}
			dev = net->loopback_dev;
			dev_hold(dev);
			idev = in6_dev_get(dev);
			if (!idev) {
				err = -ENODEV;
				goto out;
			}
		}
		goto set_dev;
	}

	if (cfg->fc_flags & RTF_GATEWAY) {
		/* may replace dev/idev with the gateway's egress device */
		err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
		if (err)
			goto out;

		fib6_nh->fib_nh_gw6 = cfg->fc_gateway;
		fib6_nh->fib_nh_gw_family = AF_INET6;
	}

	err = -ENODEV;
	if (!dev)
		goto out;

	if (idev->cnf.disable_ipv6) {
		NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
		err = -EACCES;
		goto out;
	}

	if (!(dev->flags & IFF_UP) && !cfg->fc_ignore_dev_down) {
		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
		err = -ENETDOWN;
		goto out;
	}

	if (!(cfg->fc_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
	    !netif_carrier_ok(dev))
		fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;

	err = fib_nh_common_init(&fib6_nh->nh_common, cfg->fc_encap,
				 cfg->fc_encap_type, cfg, gfp_flags, extack);
	if (err)
		goto out;
set_dev:
	fib6_nh->fib_nh_dev = dev;
	fib6_nh->fib_nh_oif = dev->ifindex;
	err = 0;
out:
	if (idev)
		in6_dev_put(idev);

	if (err) {
		lwtstate_put(fib6_nh->fib_nh_lws);
		fib6_nh->fib_nh_lws = NULL;
		if (dev)
			dev_put(dev);
	}

	return err;
}
3087 
/* Release the resources held by @fib6_nh (device and lwtunnel state
 * taken in fib6_nh_init() via fib_nh_common_init()).
 */
void fib6_nh_release(struct fib6_nh *fib6_nh)
{
	fib_nh_common_release(&fib6_nh->nh_common);
}
3092 
/* Build a fib6_info from userspace route config @cfg: validate the
 * config, pick (or create) the target table, allocate the entry and
 * initialize its nexthop.  Returns the new entry (not yet inserted)
 * or an ERR_PTR.
 */
static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
					      gfp_t gfp_flags,
					      struct netlink_ext_ack *extack)
{
	struct net *net = cfg->fc_nlinfo.nl_net;
	struct fib6_info *rt = NULL;
	struct fib6_table *table;
	int err = -EINVAL;
	int addr_type;

	/* RTF_PCPU is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_PCPU) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
		goto out;
	}

	/* RTF_CACHE is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_CACHE) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
		goto out;
	}

	if (cfg->fc_type > RTN_MAX) {
		NL_SET_ERR_MSG(extack, "Invalid route type");
		goto out;
	}

	if (cfg->fc_dst_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid prefix length");
		goto out;
	}
	if (cfg->fc_src_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid source address length");
		goto out;
	}
#ifndef CONFIG_IPV6_SUBTREES
	if (cfg->fc_src_len) {
		NL_SET_ERR_MSG(extack,
			       "Specifying source address requires IPV6_SUBTREES to be enabled");
		goto out;
	}
#endif

	err = -ENOBUFS;
	/* without NLM_F_CREATE only look up an existing table; creating
	 * one anyway is tolerated with a warning
	 */
	if (cfg->fc_nlinfo.nlh &&
	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
		table = fib6_get_table(net, cfg->fc_table);
		if (!table) {
			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
			table = fib6_new_table(net, cfg->fc_table);
		}
	} else {
		table = fib6_new_table(net, cfg->fc_table);
	}

	if (!table)
		goto out;

	err = -ENOMEM;
	rt = fib6_info_alloc(gfp_flags);
	if (!rt)
		goto out;

	rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len,
					       extack);
	if (IS_ERR(rt->fib6_metrics)) {
		err = PTR_ERR(rt->fib6_metrics);
		/* Do not leave garbage there. */
		rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
		goto out;
	}

	if (cfg->fc_flags & RTF_ADDRCONF)
		rt->dst_nocount = true;

	if (cfg->fc_flags & RTF_EXPIRES)
		fib6_set_expires(rt, jiffies +
				clock_t_to_jiffies(cfg->fc_expires));
	else
		fib6_clean_expires(rt);

	if (cfg->fc_protocol == RTPROT_UNSPEC)
		cfg->fc_protocol = RTPROT_BOOT;
	rt->fib6_protocol = cfg->fc_protocol;

	rt->fib6_table = table;
	rt->fib6_metric = cfg->fc_metric;
	rt->fib6_type = cfg->fc_type;
	/* RTF_GATEWAY is derived from the nexthop, not taken verbatim */
	rt->fib6_flags = cfg->fc_flags & ~RTF_GATEWAY;

	ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
	rt->fib6_dst.plen = cfg->fc_dst_len;
	if (rt->fib6_dst.plen == 128)
		rt->dst_host = true;

#ifdef CONFIG_IPV6_SUBTREES
	ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
	rt->fib6_src.plen = cfg->fc_src_len;
#endif
	err = fib6_nh_init(net, &rt->fib6_nh, cfg, gfp_flags, extack);
	if (err)
		goto out;

	/* We cannot add true routes via loopback here,
	 * they would result in kernel looping; promote them to reject routes
	 */
	addr_type = ipv6_addr_type(&cfg->fc_dst);
	if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh.fib_nh_dev, addr_type))
		rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP;

	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
		struct net_device *dev = fib6_info_nh_dev(rt);

		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
			NL_SET_ERR_MSG(extack, "Invalid source address");
			err = -EINVAL;
			goto out;
		}
		rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
		rt->fib6_prefsrc.plen = 128;
	} else
		rt->fib6_prefsrc.plen = 0;

	return rt;
out:
	fib6_info_release(rt);
	return ERR_PTR(err);
}
3221 
3222 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3223 		  struct netlink_ext_ack *extack)
3224 {
3225 	struct fib6_info *rt;
3226 	int err;
3227 
3228 	rt = ip6_route_info_create(cfg, gfp_flags, extack);
3229 	if (IS_ERR(rt))
3230 		return PTR_ERR(rt);
3231 
3232 	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3233 	fib6_info_release(rt);
3234 
3235 	return err;
3236 }
3237 
/* Delete @rt from its table under tb6_lock.  Always consumes the
 * caller's reference on @rt, including on error.
 */
static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
{
	struct net *net = info->nl_net;
	struct fib6_table *table;
	int err;

	/* the null entry is never deletable */
	if (rt == net->ipv6.fib6_null_entry) {
		err = -ENOENT;
		goto out;
	}

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_del(rt, info);
	spin_unlock_bh(&table->tb6_lock);

out:
	fib6_info_release(rt);
	return err;
}
3258 
3259 int ip6_del_rt(struct net *net, struct fib6_info *rt)
3260 {
3261 	struct nl_info info = { .nl_net = net };
3262 
3263 	return __ip6_del_rt(rt, &info);
3264 }
3265 
/* Delete a route and, when cfg->fc_delete_all_nh is set, all of its
 * multipath sibling nexthops in one operation.
 *
 * A single RTM_DELROUTE notification covering every hop is built up
 * front (best effort); when that succeeds, the per-hop notifications
 * from fib6_del() are suppressed via info->skip_notify.
 *
 * Consumes the caller's reference on @rt. Returns 0 on success or a
 * negative errno.
 */
static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
{
	struct nl_info *info = &cfg->fc_nlinfo;
	struct net *net = info->nl_net;
	struct sk_buff *skb = NULL;
	struct fib6_table *table;
	int err = -ENOENT;

	/* the null entry is never linked into a table */
	if (rt == net->ipv6.fib6_null_entry)
		goto out_put;
	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);

	if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
		struct fib6_info *sibling, *next_sibling;

		/* prefer to send a single notification with all hops */
		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
		if (skb) {
			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;

			/* fill failure falls back to per-hop notifications */
			if (rt6_fill_node(net, skb, rt, NULL,
					  NULL, NULL, 0, RTM_DELROUTE,
					  info->portid, seq, 0) < 0) {
				kfree_skb(skb);
				skb = NULL;
			} else
				info->skip_notify = 1;
		}

		/* _safe variant: fib6_del() unlinks each sibling */
		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings,
					 fib6_siblings) {
			err = fib6_del(sibling, info);
			if (err)
				goto out_unlock;
		}
	}

	err = fib6_del(rt, info);
out_unlock:
	spin_unlock_bh(&table->tb6_lock);
out_put:
	fib6_info_release(rt);

	/* send the combined notification outside the table lock */
	if (skb) {
		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
			    info->nlh, gfp_any());
	}
	return err;
}
3317 
3318 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3319 {
3320 	int rc = -ESRCH;
3321 
3322 	if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3323 		goto out;
3324 
3325 	if (cfg->fc_flags & RTF_GATEWAY &&
3326 	    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3327 		goto out;
3328 
3329 	rc = rt6_remove_exception_rt(rt);
3330 out:
3331 	return rc;
3332 }
3333 
/* Delete the route matching the constraints in @cfg (prefix, device,
 * gateway, metric, protocol). With RTF_CACHE set, only a matching
 * cached exception route is removed instead of a FIB entry.
 *
 * Returns 0 on success, -ESRCH when nothing matched.
 */
static int ip6_route_del(struct fib6_config *cfg,
			 struct netlink_ext_ack *extack)
{
	struct rt6_info *rt_cache;
	struct fib6_table *table;
	struct fib6_info *rt;
	struct fib6_node *fn;
	int err = -ESRCH;

	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
	if (!table) {
		NL_SET_ERR_MSG(extack, "FIB table does not exist");
		return err;
	}

	rcu_read_lock();

	/* cached routes hang off a covering FIB entry, so relax exact
	 * matching when RTF_CACHE is requested
	 */
	fn = fib6_locate(&table->tb6_root,
			 &cfg->fc_dst, cfg->fc_dst_len,
			 &cfg->fc_src, cfg->fc_src_len,
			 !(cfg->fc_flags & RTF_CACHE));

	if (fn) {
		/* macro iterates 'rt' over the node's route list */
		for_each_fib6_node_rt_rcu(fn) {
			struct fib6_nh *nh;

			if (cfg->fc_flags & RTF_CACHE) {
				struct fib6_result res = {
					.f6i = rt,
				};
				int rc;

				rt_cache = rt6_find_cached_rt(&res,
							      &cfg->fc_dst,
							      &cfg->fc_src);
				if (rt_cache) {
					rc = ip6_del_cached_rt(rt_cache, cfg);
					/* -ESRCH: keep looking at other entries */
					if (rc != -ESRCH) {
						rcu_read_unlock();
						return rc;
					}
				}
				continue;
			}

			nh = &rt->fib6_nh;
			if (cfg->fc_ifindex &&
			    (!nh->fib_nh_dev ||
			     nh->fib_nh_dev->ifindex != cfg->fc_ifindex))
				continue;
			if (cfg->fc_flags & RTF_GATEWAY &&
			    !ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6))
				continue;
			if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
				continue;
			if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
				continue;
			/* skip entries whose refcount already hit zero */
			if (!fib6_info_hold_safe(rt))
				continue;
			rcu_read_unlock();

			/* if gateway was specified only delete the one hop */
			if (cfg->fc_flags & RTF_GATEWAY)
				return __ip6_del_rt(rt, &cfg->fc_nlinfo);

			return __ip6_del_rt_siblings(rt, cfg);
		}
	}
	rcu_read_unlock();

	return err;
}
3406 
/* Process an ICMPv6 Redirect received for the flow cached in @dst:
 * validate the message (RFC 4861 section 8 style checks), update the
 * neighbour entry for the new first hop, and install a cached route
 * (exception entry) pointing at it.
 */
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct netevent_redirect netevent;
	struct rt6_info *rt, *nrt = NULL;
	struct fib6_result res = {};
	struct ndisc_options ndopts;
	struct inet6_dev *in6_dev;
	struct neighbour *neigh;
	struct rd_msg *msg;
	int optlen, on_link;
	u8 *lladdr;

	/* length of the ND options that follow the fixed rd_msg header */
	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
	optlen -= sizeof(*msg);

	if (optlen < 0) {
		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
		return;
	}

	msg = (struct rd_msg *)icmp6_hdr(skb);

	if (ipv6_addr_is_multicast(&msg->dest)) {
		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
		return;
	}

	/* target == dest means the destination itself is on-link;
	 * otherwise the target must be a link-local unicast router
	 */
	on_link = 0;
	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
		on_link = 1;
	} else if (ipv6_addr_type(&msg->target) !=
		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
		return;
	}

	in6_dev = __in6_dev_get(skb->dev);
	if (!in6_dev)
		return;
	/* forwarding interfaces and those configured to ignore
	 * redirects drop the message
	 */
	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
		return;

	/* RFC2461 8.1:
	 *	The IP source address of the Redirect MUST be the same as the current
	 *	first-hop router for the specified ICMP Destination Address.
	 */

	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
		return;
	}

	lladdr = NULL;
	if (ndopts.nd_opts_tgt_lladdr) {
		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
					     skb->dev);
		if (!lladdr) {
			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
			return;
		}
	}

	rt = (struct rt6_info *) dst;
	if (rt->rt6i_flags & RTF_REJECT) {
		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
		return;
	}

	/* Redirect received -> path was valid.
	 * Look, redirects are sent only in response to data packets,
	 * so that this nexthop apparently is reachable. --ANK
	 */
	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);

	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
	if (!neigh)
		return;

	/*
	 *	We have finally decided to accept it.
	 */

	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
		     NEIGH_UPDATE_F_OVERRIDE|
		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
				     NEIGH_UPDATE_F_ISROUTER)),
		     NDISC_REDIRECT, &ndopts);

	rcu_read_lock();
	res.f6i = rcu_dereference(rt->from);
	/* This fib6_info_hold() is safe here because we hold reference to rt
	 * and rt already holds reference to fib6_info.
	 */
	fib6_info_hold(res.f6i);
	rcu_read_unlock();

	res.nh = &res.f6i->fib6_nh;
	res.fib6_flags = res.f6i->fib6_flags;
	res.fib6_type = res.f6i->fib6_type;
	nrt = ip6_rt_cache_alloc(&res, &msg->dest, NULL);
	if (!nrt)
		goto out;

	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
	if (on_link)
		nrt->rt6i_flags &= ~RTF_GATEWAY;

	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;

	/* No need to remove rt from the exception table if rt is
	 * a cached route because rt6_insert_exception() will
	 * takes care of it
	 */
	if (rt6_insert_exception(nrt, &res)) {
		dst_release_immediate(&nrt->dst);
		goto out;
	}

	netevent.old = &rt->dst;
	netevent.new = &nrt->dst;
	netevent.daddr = &msg->dest;
	netevent.neigh = neigh;
	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);

out:
	fib6_info_release(res.f6i);
	neigh_release(neigh);
}
3536 
3537 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Look up an RA-learned (RTF_ROUTEINFO) route for @prefix/@prefixlen
 * that goes through gateway @gwaddr on @dev.
 *
 * Returns the fib6_info with a reference held, or NULL when no such
 * route exists (or the table is missing).
 */
static struct fib6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev)
{
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
	int ifindex = dev->ifindex;
	struct fib6_node *fn;
	struct fib6_info *rt = NULL;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
	if (!fn)
		goto out;

	/* macro iterates 'rt' over the node's route list */
	for_each_fib6_node_rt_rcu(fn) {
		if (rt->fib6_nh.fib_nh_dev->ifindex != ifindex)
			continue;
		if (!(rt->fib6_flags & RTF_ROUTEINFO) ||
		    !rt->fib6_nh.fib_nh_gw_family)
			continue;
		if (!ipv6_addr_equal(&rt->fib6_nh.fib_nh_gw6, gwaddr))
			continue;
		/* skip entries whose refcount already hit zero */
		if (!fib6_info_hold_safe(rt))
			continue;
		break;
	}
out:
	rcu_read_unlock();
	return rt;
}
3574 
3575 static struct fib6_info *rt6_add_route_info(struct net *net,
3576 					   const struct in6_addr *prefix, int prefixlen,
3577 					   const struct in6_addr *gwaddr,
3578 					   struct net_device *dev,
3579 					   unsigned int pref)
3580 {
3581 	struct fib6_config cfg = {
3582 		.fc_metric	= IP6_RT_PRIO_USER,
3583 		.fc_ifindex	= dev->ifindex,
3584 		.fc_dst_len	= prefixlen,
3585 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3586 				  RTF_UP | RTF_PREF(pref),
3587 		.fc_protocol = RTPROT_RA,
3588 		.fc_type = RTN_UNICAST,
3589 		.fc_nlinfo.portid = 0,
3590 		.fc_nlinfo.nlh = NULL,
3591 		.fc_nlinfo.nl_net = net,
3592 	};
3593 
3594 	cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3595 	cfg.fc_dst = *prefix;
3596 	cfg.fc_gateway = *gwaddr;
3597 
3598 	/* We should treat it as a default route if prefix length is 0. */
3599 	if (!prefixlen)
3600 		cfg.fc_flags |= RTF_DEFAULT;
3601 
3602 	ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3603 
3604 	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3605 }
3606 #endif
3607 
/* Find the RA-learned default route through router @addr on @dev.
 *
 * Returns the fib6_info with a reference held, or NULL when absent.
 */
struct fib6_info *rt6_get_dflt_router(struct net *net,
				     const struct in6_addr *addr,
				     struct net_device *dev)
{
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
	struct fib6_info *rt;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	/* macro iterates 'rt' over the root node's route list */
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		struct fib6_nh *nh = &rt->fib6_nh;

		/* match: same device, RA default route, same gateway */
		if (dev == nh->fib_nh_dev &&
		    ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
		    ipv6_addr_equal(&nh->fib_nh_gw6, addr))
			break;
	}
	/* refcount may already be zero; treat such entries as gone */
	if (rt && !fib6_info_hold_safe(rt))
		rt = NULL;
	rcu_read_unlock();
	return rt;
}
3634 
/* Install a default route learned from a Router Advertisement via
 * gateway @gwaddr on @dev, flag the table as holding a default
 * router, and return a referenced fib6_info for the route.
 */
struct fib6_info *rt6_add_dflt_router(struct net *net,
				     const struct in6_addr *gwaddr,
				     struct net_device *dev,
				     unsigned int pref)
{
	struct fib6_config cfg = {
		.fc_table	= l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
		.fc_metric	= IP6_RT_PRIO_USER,
		.fc_ifindex	= dev->ifindex,
		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
		.fc_protocol = RTPROT_RA,
		.fc_type = RTN_UNICAST,
		.fc_nlinfo.portid = 0,
		.fc_nlinfo.nlh = NULL,
		.fc_nlinfo.nl_net = net,
	};

	cfg.fc_gateway = *gwaddr;

	if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
		struct fib6_table *table;

		/* NOTE(review): dev_net(dev) is presumably identical to
		 * @net here — confirm; using @net directly would be
		 * clearer and consistent with the rest of the function.
		 */
		table = fib6_get_table(dev_net(dev), cfg.fc_table);
		if (table)
			table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
	}

	return rt6_get_dflt_router(net, gwaddr, dev);
}
3665 
/* Remove every RA-learned default route from @table, except on
 * interfaces with accept_ra == 2 (accept RAs even when forwarding).
 *
 * Deleting requires dropping the RCU read lock, so the walk restarts
 * from the top after each removal.
 */
static void __rt6_purge_dflt_routers(struct net *net,
				     struct fib6_table *table)
{
	struct fib6_info *rt;

restart:
	rcu_read_lock();
	/* macro iterates 'rt' over the root node's route list */
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;

		if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
		    (!idev || idev->cnf.accept_ra != 2) &&
		    fib6_info_hold_safe(rt)) {
			rcu_read_unlock();
			/* consumes the reference taken just above */
			ip6_del_rt(net, rt);
			goto restart;
		}
	}
	rcu_read_unlock();

	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
}
3689 
3690 void rt6_purge_dflt_routers(struct net *net)
3691 {
3692 	struct fib6_table *table;
3693 	struct hlist_head *head;
3694 	unsigned int h;
3695 
3696 	rcu_read_lock();
3697 
3698 	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3699 		head = &net->ipv6.fib_table_hash[h];
3700 		hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3701 			if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3702 				__rt6_purge_dflt_routers(net, table);
3703 		}
3704 	}
3705 
3706 	rcu_read_unlock();
3707 }
3708 
3709 static void rtmsg_to_fib6_config(struct net *net,
3710 				 struct in6_rtmsg *rtmsg,
3711 				 struct fib6_config *cfg)
3712 {
3713 	*cfg = (struct fib6_config){
3714 		.fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3715 			 : RT6_TABLE_MAIN,
3716 		.fc_ifindex = rtmsg->rtmsg_ifindex,
3717 		.fc_metric = rtmsg->rtmsg_metric ? : IP6_RT_PRIO_USER,
3718 		.fc_expires = rtmsg->rtmsg_info,
3719 		.fc_dst_len = rtmsg->rtmsg_dst_len,
3720 		.fc_src_len = rtmsg->rtmsg_src_len,
3721 		.fc_flags = rtmsg->rtmsg_flags,
3722 		.fc_type = rtmsg->rtmsg_type,
3723 
3724 		.fc_nlinfo.nl_net = net,
3725 
3726 		.fc_dst = rtmsg->rtmsg_dst,
3727 		.fc_src = rtmsg->rtmsg_src,
3728 		.fc_gateway = rtmsg->rtmsg_gateway,
3729 	};
3730 }
3731 
3732 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3733 {
3734 	struct fib6_config cfg;
3735 	struct in6_rtmsg rtmsg;
3736 	int err;
3737 
3738 	switch (cmd) {
3739 	case SIOCADDRT:		/* Add a route */
3740 	case SIOCDELRT:		/* Delete a route */
3741 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3742 			return -EPERM;
3743 		err = copy_from_user(&rtmsg, arg,
3744 				     sizeof(struct in6_rtmsg));
3745 		if (err)
3746 			return -EFAULT;
3747 
3748 		rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3749 
3750 		rtnl_lock();
3751 		switch (cmd) {
3752 		case SIOCADDRT:
3753 			err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3754 			break;
3755 		case SIOCDELRT:
3756 			err = ip6_route_del(&cfg, NULL);
3757 			break;
3758 		default:
3759 			err = -EINVAL;
3760 		}
3761 		rtnl_unlock();
3762 
3763 		return err;
3764 	}
3765 
3766 	return -EINVAL;
3767 }
3768 
3769 /*
3770  *	Drop the packet on the floor
3771  */
3772 
/* Common handler for reject-type routes: bump the appropriate SNMP
 * counter, send an ICMPv6 Destination Unreachable with @code, and
 * drop the packet.
 */
static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
{
	int type;
	struct dst_entry *dst = skb_dst(skb);
	switch (ipstats_mib_noroutes) {
	case IPSTATS_MIB_INNOROUTES:
		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
		/* unspecified destination counts as an address error,
		 * not a no-route
		 */
		if (type == IPV6_ADDR_ANY) {
			IP6_INC_STATS(dev_net(dst->dev),
				      __in6_dev_get_safely(skb->dev),
				      IPSTATS_MIB_INADDRERRORS);
			break;
		}
		/* FALLTHROUGH */
	case IPSTATS_MIB_OUTNOROUTES:
		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
			      ipstats_mib_noroutes);
		break;
	}
	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
	kfree_skb(skb);
	return 0;
}
3796 
/* dst input handler for blackhole/no-route: drop with "no route". */
static int ip6_pkt_discard(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}
3801 
/* dst output handler for blackhole/no-route: drop with "no route". */
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
}
3807 
/* dst input handler for prohibit routes: drop, admin prohibited. */
static int ip6_pkt_prohibit(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}
3812 
/* dst output handler for prohibit routes: drop, admin prohibited. */
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
}
3818 
3819 /*
3820  *	Allocate a dst for local (unicast / anycast) address.
3821  */
3822 
3823 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3824 				     struct inet6_dev *idev,
3825 				     const struct in6_addr *addr,
3826 				     bool anycast, gfp_t gfp_flags)
3827 {
3828 	struct fib6_config cfg = {
3829 		.fc_table = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL,
3830 		.fc_ifindex = idev->dev->ifindex,
3831 		.fc_flags = RTF_UP | RTF_ADDRCONF | RTF_NONEXTHOP,
3832 		.fc_dst = *addr,
3833 		.fc_dst_len = 128,
3834 		.fc_protocol = RTPROT_KERNEL,
3835 		.fc_nlinfo.nl_net = net,
3836 		.fc_ignore_dev_down = true,
3837 	};
3838 
3839 	if (anycast) {
3840 		cfg.fc_type = RTN_ANYCAST;
3841 		cfg.fc_flags |= RTF_ANYCAST;
3842 	} else {
3843 		cfg.fc_type = RTN_LOCAL;
3844 		cfg.fc_flags |= RTF_LOCAL;
3845 	}
3846 
3847 	return ip6_route_info_create(&cfg, gfp_flags, NULL);
3848 }
3849 
/* Argument bundle for fib6_remove_prefsrc(): remove a deleted
 * address from prefsrc entries.
 */
struct arg_dev_net_ip {
	struct net_device *dev;	/* restrict to this device; NULL matches any */
	struct net *net;	/* namespace being cleaned */
	struct in6_addr *addr;	/* the address that was removed */
};
3856 
3857 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3858 {
3859 	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3860 	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3861 	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3862 
3863 	if (((void *)rt->fib6_nh.fib_nh_dev == dev || !dev) &&
3864 	    rt != net->ipv6.fib6_null_entry &&
3865 	    ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3866 		spin_lock_bh(&rt6_exception_lock);
3867 		/* remove prefsrc entry */
3868 		rt->fib6_prefsrc.plen = 0;
3869 		spin_unlock_bh(&rt6_exception_lock);
3870 	}
3871 	return 0;
3872 }
3873 
3874 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3875 {
3876 	struct net *net = dev_net(ifp->idev->dev);
3877 	struct arg_dev_net_ip adni = {
3878 		.dev = ifp->idev->dev,
3879 		.net = net,
3880 		.addr = &ifp->addr,
3881 	};
3882 	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3883 }
3884 
3885 #define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT)
3886 
/* Remove routers and update dst entries when gateway turn into host. */
static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
{
	struct in6_addr *gateway = (struct in6_addr *)arg;

	/* nonzero return asks the fib6 walker to remove this route */
	if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
	    rt->fib6_nh.fib_nh_gw_family &&
	    ipv6_addr_equal(gateway, &rt->fib6_nh.fib_nh_gw6)) {
		return -1;
	}

	/* Further clean up cached routes in exception table.
	 * This is needed because cached route may have a different
	 * gateway than its 'parent' in the case of an ip redirect.
	 */
	rt6_exceptions_clean_tohost(rt, gateway);

	return 0;
}
3906 
/* Purge RA-router routes (and stale cached routes) that point at
 * @gateway, which is no longer acting as a router.
 */
void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
{
	fib6_clean_all(net, fib6_clean_tohost, gateway);
}
3911 
/* Argument bundle for the fib6_ifup()/fib6_ifdown() callbacks: the
 * device affected plus either nexthop flags (sync-up) or the netdev
 * event (sync-down) — only one member of the union is meaningful per
 * caller.
 */
struct arg_netdev_event {
	const struct net_device *dev;
	union {
		unsigned char nh_flags;
		unsigned long event;
	};
};
3919 
/* Find the first route in @rt's fib6 node that belongs to the same
 * ECMP group (same metric, qualifies for multipath). Routes in a node
 * are walked in lookup order, so this is the group's leading sibling.
 *
 * Must be called with the table lock held (enforced via lockdep).
 * Returns NULL if no qualifying sibling is found.
 */
static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
{
	struct fib6_info *iter;
	struct fib6_node *fn;

	fn = rcu_dereference_protected(rt->fib6_node,
			lockdep_is_held(&rt->fib6_table->tb6_lock));
	iter = rcu_dereference_protected(fn->leaf,
			lockdep_is_held(&rt->fib6_table->tb6_lock));
	while (iter) {
		if (iter->fib6_metric == rt->fib6_metric &&
		    rt6_qualify_for_ecmp(iter))
			return iter;
		iter = rcu_dereference_protected(iter->fib6_next,
				lockdep_is_held(&rt->fib6_table->tb6_lock));
	}

	return NULL;
}
3939 
3940 static bool rt6_is_dead(const struct fib6_info *rt)
3941 {
3942 	if (rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD ||
3943 	    (rt->fib6_nh.fib_nh_flags & RTNH_F_LINKDOWN &&
3944 	     ip6_ignore_linkdown(rt->fib6_nh.fib_nh_dev)))
3945 		return true;
3946 
3947 	return false;
3948 }
3949 
3950 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3951 {
3952 	struct fib6_info *iter;
3953 	int total = 0;
3954 
3955 	if (!rt6_is_dead(rt))
3956 		total += rt->fib6_nh.fib_nh_weight;
3957 
3958 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3959 		if (!rt6_is_dead(iter))
3960 			total += iter->fib6_nh.fib_nh_weight;
3961 	}
3962 
3963 	return total;
3964 }
3965 
3966 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
3967 {
3968 	int upper_bound = -1;
3969 
3970 	if (!rt6_is_dead(rt)) {
3971 		*weight += rt->fib6_nh.fib_nh_weight;
3972 		upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3973 						    total) - 1;
3974 	}
3975 	atomic_set(&rt->fib6_nh.fib_nh_upper_bound, upper_bound);
3976 }
3977 
3978 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3979 {
3980 	struct fib6_info *iter;
3981 	int weight = 0;
3982 
3983 	rt6_upper_bound_set(rt, &weight, total);
3984 
3985 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3986 		rt6_upper_bound_set(iter, &weight, total);
3987 }
3988 
/* Recompute the ECMP hash upper bounds of the multipath group @rt
 * belongs to, so traffic is redistributed after a nexthop changes
 * state. No-op for non-multipath routes or groups being flushed.
 */
void rt6_multipath_rebalance(struct fib6_info *rt)
{
	struct fib6_info *first;
	int total;

	/* In case the entire multipath route was marked for flushing,
	 * then there is no need to rebalance upon the removal of every
	 * sibling route.
	 */
	if (!rt->fib6_nsiblings || rt->should_flush)
		return;

	/* During lookup routes are evaluated in order, so we need to
	 * make sure upper bounds are assigned from the first sibling
	 * onwards.
	 */
	first = rt6_multipath_first_sibling(rt);
	if (WARN_ON_ONCE(!first))
		return;

	total = rt6_multipath_total_weight(first);
	rt6_multipath_upper_bound_set(first, total);
}
4012 
4013 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
4014 {
4015 	const struct arg_netdev_event *arg = p_arg;
4016 	struct net *net = dev_net(arg->dev);
4017 
4018 	if (rt != net->ipv6.fib6_null_entry &&
4019 	    rt->fib6_nh.fib_nh_dev == arg->dev) {
4020 		rt->fib6_nh.fib_nh_flags &= ~arg->nh_flags;
4021 		fib6_update_sernum_upto_root(net, rt);
4022 		rt6_multipath_rebalance(rt);
4023 	}
4024 
4025 	return 0;
4026 }
4027 
/* Clear @nh_flags on all routes through @dev after the device comes
 * up.
 */
void rt6_sync_up(struct net_device *dev, unsigned char nh_flags)
{
	struct arg_netdev_event arg = {
		.dev = dev,
		{
			.nh_flags = nh_flags,
		},
	};

	/* a device revived without carrier also sheds LINKDOWN */
	if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
		arg.nh_flags |= RTNH_F_LINKDOWN;

	fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
}
4042 
4043 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
4044 				   const struct net_device *dev)
4045 {
4046 	struct fib6_info *iter;
4047 
4048 	if (rt->fib6_nh.fib_nh_dev == dev)
4049 		return true;
4050 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4051 		if (iter->fib6_nh.fib_nh_dev == dev)
4052 			return true;
4053 
4054 	return false;
4055 }
4056 
4057 static void rt6_multipath_flush(struct fib6_info *rt)
4058 {
4059 	struct fib6_info *iter;
4060 
4061 	rt->should_flush = 1;
4062 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4063 		iter->should_flush = 1;
4064 }
4065 
4066 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
4067 					     const struct net_device *down_dev)
4068 {
4069 	struct fib6_info *iter;
4070 	unsigned int dead = 0;
4071 
4072 	if (rt->fib6_nh.fib_nh_dev == down_dev ||
4073 	    rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
4074 		dead++;
4075 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4076 		if (iter->fib6_nh.fib_nh_dev == down_dev ||
4077 		    iter->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
4078 			dead++;
4079 
4080 	return dead;
4081 }
4082 
4083 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
4084 				       const struct net_device *dev,
4085 				       unsigned char nh_flags)
4086 {
4087 	struct fib6_info *iter;
4088 
4089 	if (rt->fib6_nh.fib_nh_dev == dev)
4090 		rt->fib6_nh.fib_nh_flags |= nh_flags;
4091 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4092 		if (iter->fib6_nh.fib_nh_dev == dev)
4093 			iter->fib6_nh.fib_nh_flags |= nh_flags;
4094 }
4095 
4096 /* called with write lock held for table with rt */
4097 static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
4098 {
4099 	const struct arg_netdev_event *arg = p_arg;
4100 	const struct net_device *dev = arg->dev;
4101 	struct net *net = dev_net(dev);
4102 
4103 	if (rt == net->ipv6.fib6_null_entry)
4104 		return 0;
4105 
4106 	switch (arg->event) {
4107 	case NETDEV_UNREGISTER:
4108 		return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0;
4109 	case NETDEV_DOWN:
4110 		if (rt->should_flush)
4111 			return -1;
4112 		if (!rt->fib6_nsiblings)
4113 			return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0;
4114 		if (rt6_multipath_uses_dev(rt, dev)) {
4115 			unsigned int count;
4116 
4117 			count = rt6_multipath_dead_count(rt, dev);
4118 			if (rt->fib6_nsiblings + 1 == count) {
4119 				rt6_multipath_flush(rt);
4120 				return -1;
4121 			}
4122 			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
4123 						   RTNH_F_LINKDOWN);
4124 			fib6_update_sernum(net, rt);
4125 			rt6_multipath_rebalance(rt);
4126 		}
4127 		return -2;
4128 	case NETDEV_CHANGE:
4129 		if (rt->fib6_nh.fib_nh_dev != dev ||
4130 		    rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
4131 			break;
4132 		rt->fib6_nh.fib_nh_flags |= RTNH_F_LINKDOWN;
4133 		rt6_multipath_rebalance(rt);
4134 		break;
4135 	}
4136 
4137 	return 0;
4138 }
4139 
4140 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
4141 {
4142 	struct arg_netdev_event arg = {
4143 		.dev = dev,
4144 		{
4145 			.event = event,
4146 		},
4147 	};
4148 	struct net *net = dev_net(dev);
4149 
4150 	if (net->ipv6.sysctl.skip_notify_on_dev_down)
4151 		fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
4152 	else
4153 		fib6_clean_all(net, fib6_ifdown, &arg);
4154 }
4155 
/* Tear down IPv6 state for @dev: sync routes for @event, flush the
 * uncached route list, and drop its ND neighbour entries.
 */
void rt6_disable_ip(struct net_device *dev, unsigned long event)
{
	rt6_sync_down_dev(dev, event);
	rt6_uncached_list_flush_dev(dev_net(dev), dev);
	neigh_ifdown(&nd_tbl, dev);
}
4162 
/* Argument bundle for rt6_mtu_change_route(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* the new MTU */
};
4167 
/* fib6_clean_all() callback for rt6_mtu_change(): update the cached
 * route MTU (and exception-route PMTUs) of routes through arg->dev
 * after a device MTU change. Always returns 0 (keep the route).
 */
static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
{
	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
	struct inet6_dev *idev;

	/* In IPv6 pmtu discovery is not optional,
	   so that RTAX_MTU lock cannot disable it.
	   We still use this lock to block changes
	   caused by addrconf/ndisc.
	*/

	idev = __in6_dev_get(arg->dev);
	if (!idev)
		return 0;

	/* For administrative MTU increase, there is no way to discover
	   IPv6 PMTU increase, so PMTU increase should be updated here.
	   Since RFC 1981 doesn't include administrative MTU increase
	   update PMTU increase is a MUST. (i.e. jumbo frame)
	 */
	if (rt->fib6_nh.fib_nh_dev == arg->dev &&
	    !fib6_metric_locked(rt, RTAX_MTU)) {
		u32 mtu = rt->fib6_pmtu;

		/* shrink to the new device MTU, or grow a PMTU that
		 * was tracking the old device MTU
		 */
		if (mtu >= arg->mtu ||
		    (mtu < arg->mtu && mtu == idev->cnf.mtu6))
			fib6_metric_set(rt, RTAX_MTU, arg->mtu);

		spin_lock_bh(&rt6_exception_lock);
		rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
		spin_unlock_bh(&rt6_exception_lock);
	}
	return 0;
}
4202 
4203 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4204 {
4205 	struct rt6_mtu_change_arg arg = {
4206 		.dev = dev,
4207 		.mtu = mtu,
4208 	};
4209 
4210 	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
4211 }
4212 
/* Netlink attribute validation policy for RTM_{NEW,DEL,GET}ROUTE
 * messages handled by rtm_to_fib6_config() below.
 */
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
	[RTA_PREFSRC]		= { .len = sizeof(struct in6_addr) },
	[RTA_OIF]               = { .type = NLA_U32 },
	[RTA_IIF]		= { .type = NLA_U32 },
	[RTA_PRIORITY]          = { .type = NLA_U32 },
	[RTA_METRICS]           = { .type = NLA_NESTED },
	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
	[RTA_PREF]              = { .type = NLA_U8 },
	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
	[RTA_ENCAP]		= { .type = NLA_NESTED },
	[RTA_EXPIRES]		= { .type = NLA_U32 },
	[RTA_UID]		= { .type = NLA_U32 },
	[RTA_MARK]		= { .type = NLA_U32 },
	[RTA_TABLE]		= { .type = NLA_U32 },
	[RTA_IP_PROTO]		= { .type = NLA_U8 },
	[RTA_SPORT]		= { .type = NLA_U16 },
	[RTA_DPORT]		= { .type = NLA_U16 },
};
4232 
/* Parse an RTM_NEWROUTE/RTM_DELROUTE netlink message into a
 * fib6_config.
 *
 * Note: fc_mx/fc_mp point into the received skb; @cfg is only valid
 * while @skb is.
 *
 * Returns 0 on success or a negative errno (with extack set by the
 * attribute parser where applicable).
 */
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct fib6_config *cfg,
			      struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	unsigned int pref;
	int err;

	err = nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
				     rtm_ipv6_policy, extack);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	rtm = nlmsg_data(nlh);

	*cfg = (struct fib6_config){
		.fc_table = rtm->rtm_table,
		.fc_dst_len = rtm->rtm_dst_len,
		.fc_src_len = rtm->rtm_src_len,
		.fc_flags = RTF_UP,
		.fc_protocol = rtm->rtm_protocol,
		.fc_type = rtm->rtm_type,

		.fc_nlinfo.portid = NETLINK_CB(skb).portid,
		.fc_nlinfo.nlh = nlh,
		.fc_nlinfo.nl_net = sock_net(skb->sk),
	};

	/* reject-type routes all map to RTF_REJECT; the specific
	 * rtm_type stays in fc_type
	 */
	if (rtm->rtm_type == RTN_UNREACHABLE ||
	    rtm->rtm_type == RTN_BLACKHOLE ||
	    rtm->rtm_type == RTN_PROHIBIT ||
	    rtm->rtm_type == RTN_THROW)
		cfg->fc_flags |= RTF_REJECT;

	if (rtm->rtm_type == RTN_LOCAL)
		cfg->fc_flags |= RTF_LOCAL;

	if (rtm->rtm_flags & RTM_F_CLONED)
		cfg->fc_flags |= RTF_CACHE;

	cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);

	if (tb[RTA_GATEWAY]) {
		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
		cfg->fc_flags |= RTF_GATEWAY;
	}
	if (tb[RTA_VIA]) {
		NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute");
		goto errout;
	}

	if (tb[RTA_DST]) {
		/* addresses may be truncated to the prefix length */
		int plen = (rtm->rtm_dst_len + 7) >> 3;

		if (nla_len(tb[RTA_DST]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
	}

	if (tb[RTA_SRC]) {
		int plen = (rtm->rtm_src_len + 7) >> 3;

		if (nla_len(tb[RTA_SRC]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
	}

	if (tb[RTA_PREFSRC])
		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);

	if (tb[RTA_OIF])
		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_PRIORITY])
		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

	if (tb[RTA_METRICS]) {
		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
	}

	if (tb[RTA_TABLE])
		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

	if (tb[RTA_MULTIPATH]) {
		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);

		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
						     cfg->fc_mp_len, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_PREF]) {
		/* unknown preferences fall back to medium */
		pref = nla_get_u8(tb[RTA_PREF]);
		if (pref != ICMPV6_ROUTER_PREF_LOW &&
		    pref != ICMPV6_ROUTER_PREF_HIGH)
			pref = ICMPV6_ROUTER_PREF_MEDIUM;
		cfg->fc_flags |= RTF_PREF(pref);
	}

	if (tb[RTA_ENCAP])
		cfg->fc_encap = tb[RTA_ENCAP];

	if (tb[RTA_ENCAP_TYPE]) {
		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);

		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_EXPIRES]) {
		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);

		if (addrconf_finite_timeout(timeout)) {
			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
			cfg->fc_flags |= RTF_EXPIRES;
		}
	}

	err = 0;
errout:
	return err;
}
4363 
/* One pending nexthop parsed out of an RTA_MULTIPATH request; collected on
 * a local list by ip6_route_multipath_add() before insertion.
 */
struct rt6_nh {
	struct fib6_info *fib6_info;	/* route created for this nexthop */
	struct fib6_config r_cfg;	/* per-nexthop copy of the request config */
	struct list_head next;		/* linkage on the local rt6_nh_list */
};
4369 
4370 static int ip6_route_info_append(struct net *net,
4371 				 struct list_head *rt6_nh_list,
4372 				 struct fib6_info *rt,
4373 				 struct fib6_config *r_cfg)
4374 {
4375 	struct rt6_nh *nh;
4376 	int err = -EEXIST;
4377 
4378 	list_for_each_entry(nh, rt6_nh_list, next) {
4379 		/* check if fib6_info already exists */
4380 		if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4381 			return err;
4382 	}
4383 
4384 	nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4385 	if (!nh)
4386 		return -ENOMEM;
4387 	nh->fib6_info = rt;
4388 	memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4389 	list_add_tail(&nh->next, rt6_nh_list);
4390 
4391 	return 0;
4392 }
4393 
4394 static void ip6_route_mpath_notify(struct fib6_info *rt,
4395 				   struct fib6_info *rt_last,
4396 				   struct nl_info *info,
4397 				   __u16 nlflags)
4398 {
4399 	/* if this is an APPEND route, then rt points to the first route
4400 	 * inserted and rt_last points to last route inserted. Userspace
4401 	 * wants a consistent dump of the route which starts at the first
4402 	 * nexthop. Since sibling routes are always added at the end of
4403 	 * the list, find the first sibling of the last route appended
4404 	 */
4405 	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4406 		rt = list_first_entry(&rt_last->fib6_siblings,
4407 				      struct fib6_info,
4408 				      fib6_siblings);
4409 	}
4410 
4411 	if (rt)
4412 		inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4413 }
4414 
/* RTM_NEWROUTE handler for requests carrying RTA_MULTIPATH: build one
 * fib6_info per nexthop, insert them all, then send a single notification
 * covering the whole multipath route.  If a later nexthop fails to insert,
 * the ones already added are deleted again so the operation appears
 * atomic to userspace.
 */
static int ip6_route_multipath_add(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct fib6_info *rt_notif = NULL, *rt_last = NULL;
	struct nl_info *info = &cfg->fc_nlinfo;
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	struct fib6_info *rt;
	struct rt6_nh *err_nh;
	struct rt6_nh *nh, *nh_safe;
	__u16 nlflags;
	int remaining;
	int attrlen;
	int err = 1;
	int nhn = 0;
	int replace = (cfg->fc_nlinfo.nlh &&
		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
	LIST_HEAD(rt6_nh_list);

	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
		nlflags |= NLM_F_APPEND;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
	 * fib6_info structs per nexthop
	 */
	while (rtnh_ok(rtnh, remaining)) {
		/* start from the request-wide config, then apply the
		 * per-nexthop ifindex/gateway/encap overrides
		 */
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				r_cfg.fc_gateway = nla_get_in6_addr(nla);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
			if (nla)
				r_cfg.fc_encap_type = nla_get_u16(nla);
		}

		r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
		rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
			rt = NULL;
			goto cleanup;
		}
		if (!rt6_qualify_for_ecmp(rt)) {
			err = -EINVAL;
			NL_SET_ERR_MSG(extack,
				       "Device only routes can not be added for IPv6 using the multipath API.");
			fib6_info_release(rt);
			goto cleanup;
		}

		/* ECMP weight is rtnh_hops + 1 */
		rt->fib6_nh.fib_nh_weight = rtnh->rtnh_hops + 1;

		err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
					    rt, &r_cfg);
		if (err) {
			fib6_info_release(rt);
			goto cleanup;
		}

		rtnh = rtnh_next(rtnh, &remaining);
	}

	/* for add and replace send one notification with all nexthops.
	 * Skip the notification in fib6_add_rt2node and send one with
	 * the full route when done
	 */
	info->skip_notify = 1;

	err_nh = NULL;
	list_for_each_entry(nh, &rt6_nh_list, next) {
		err = __ip6_ins_rt(nh->fib6_info, info, extack);
		fib6_info_release(nh->fib6_info);

		if (!err) {
			/* save reference to last route successfully inserted */
			rt_last = nh->fib6_info;

			/* save reference to first route for notification */
			if (!rt_notif)
				rt_notif = nh->fib6_info;
		}

		/* nh->fib6_info is used or freed at this point, reset to NULL*/
		nh->fib6_info = NULL;
		if (err) {
			if (replace && nhn)
				NL_SET_ERR_MSG_MOD(extack,
						   "multipath route replace failed (check consistency of installed routes)");
			err_nh = nh;
			goto add_errout;
		}

		/* Because each route is added like a single route we remove
		 * these flags after the first nexthop: if there is a collision,
		 * we have already failed to add the first nexthop:
		 * fib6_add_rt2node() has rejected it; when replacing, old
		 * nexthops have been replaced by first new, the rest should
		 * be added to it.
		 */
		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
						     NLM_F_REPLACE);
		nhn++;
	}

	/* success ... tell user about new route */
	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
	goto cleanup;

add_errout:
	/* send notification for routes that were added so that
	 * the delete notifications sent by ip6_route_del are
	 * coherent
	 */
	if (rt_notif)
		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);

	/* Delete routes that were already added */
	list_for_each_entry(nh, &rt6_nh_list, next) {
		if (err_nh == nh)
			break;
		ip6_route_del(&nh->r_cfg, extack);
	}

cleanup:
	/* release any fib6_info not yet handed to the FIB and free the list */
	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
		if (nh->fib6_info)
			fib6_info_release(nh->fib6_info);
		list_del(&nh->next);
		kfree(nh);
	}

	return err;
}
4562 
4563 static int ip6_route_multipath_del(struct fib6_config *cfg,
4564 				   struct netlink_ext_ack *extack)
4565 {
4566 	struct fib6_config r_cfg;
4567 	struct rtnexthop *rtnh;
4568 	int remaining;
4569 	int attrlen;
4570 	int err = 1, last_err = 0;
4571 
4572 	remaining = cfg->fc_mp_len;
4573 	rtnh = (struct rtnexthop *)cfg->fc_mp;
4574 
4575 	/* Parse a Multipath Entry */
4576 	while (rtnh_ok(rtnh, remaining)) {
4577 		memcpy(&r_cfg, cfg, sizeof(*cfg));
4578 		if (rtnh->rtnh_ifindex)
4579 			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4580 
4581 		attrlen = rtnh_attrlen(rtnh);
4582 		if (attrlen > 0) {
4583 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4584 
4585 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4586 			if (nla) {
4587 				nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4588 				r_cfg.fc_flags |= RTF_GATEWAY;
4589 			}
4590 		}
4591 		err = ip6_route_del(&r_cfg, extack);
4592 		if (err)
4593 			last_err = err;
4594 
4595 		rtnh = rtnh_next(rtnh, &remaining);
4596 	}
4597 
4598 	return last_err;
4599 }
4600 
4601 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4602 			      struct netlink_ext_ack *extack)
4603 {
4604 	struct fib6_config cfg;
4605 	int err;
4606 
4607 	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4608 	if (err < 0)
4609 		return err;
4610 
4611 	if (cfg.fc_mp)
4612 		return ip6_route_multipath_del(&cfg, extack);
4613 	else {
4614 		cfg.fc_delete_all_nh = 1;
4615 		return ip6_route_del(&cfg, extack);
4616 	}
4617 }
4618 
4619 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4620 			      struct netlink_ext_ack *extack)
4621 {
4622 	struct fib6_config cfg;
4623 	int err;
4624 
4625 	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4626 	if (err < 0)
4627 		return err;
4628 
4629 	if (cfg.fc_metric == 0)
4630 		cfg.fc_metric = IP6_RT_PRIO_USER;
4631 
4632 	if (cfg.fc_mp)
4633 		return ip6_route_multipath_add(&cfg, extack);
4634 	else
4635 		return ip6_route_add(&cfg, GFP_KERNEL, extack);
4636 }
4637 
4638 static size_t rt6_nlmsg_size(struct fib6_info *rt)
4639 {
4640 	int nexthop_len = 0;
4641 
4642 	if (rt->fib6_nsiblings) {
4643 		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
4644 			    + NLA_ALIGN(sizeof(struct rtnexthop))
4645 			    + nla_total_size(16) /* RTA_GATEWAY */
4646 			    + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws);
4647 
4648 		nexthop_len *= rt->fib6_nsiblings;
4649 	}
4650 
4651 	return NLMSG_ALIGN(sizeof(struct rtmsg))
4652 	       + nla_total_size(16) /* RTA_SRC */
4653 	       + nla_total_size(16) /* RTA_DST */
4654 	       + nla_total_size(16) /* RTA_GATEWAY */
4655 	       + nla_total_size(16) /* RTA_PREFSRC */
4656 	       + nla_total_size(4) /* RTA_TABLE */
4657 	       + nla_total_size(4) /* RTA_IIF */
4658 	       + nla_total_size(4) /* RTA_OIF */
4659 	       + nla_total_size(4) /* RTA_PRIORITY */
4660 	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4661 	       + nla_total_size(sizeof(struct rta_cacheinfo))
4662 	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4663 	       + nla_total_size(1) /* RTA_PREF */
4664 	       + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws)
4665 	       + nexthop_len;
4666 }
4667 
/* Fill an RTM_NEWROUTE/RTM_DELROUTE netlink message for @rt.
 *
 * @dst:  optional dst_entry (a cached clone); when set, the destination,
 *        source and flags are taken from the rt6_info instead of the FIB
 *        entry
 * @dest: exact destination address to report (route-get path), or NULL
 * @src:  exact source address to report (route-get path), or NULL
 * @iif:  input interface index for input routes, 0 otherwise
 *
 * Returns 0 on success, -EMSGSIZE when the skb has no room (the partial
 * message is cancelled).
 */
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags)
{
	struct rt6_info *rt6 = (struct rt6_info *)dst;
	struct rt6key *rt6_dst, *rt6_src;
	u32 *pmetrics, table, rt6_flags;
	struct nlmsghdr *nlh;
	struct rtmsg *rtm;
	long expires = 0;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	/* prefer the cached clone's keys and flags when a dst was supplied */
	if (rt6) {
		rt6_dst = &rt6->rt6i_dst;
		rt6_src = &rt6->rt6i_src;
		rt6_flags = rt6->rt6i_flags;
	} else {
		rt6_dst = &rt->fib6_dst;
		rt6_src = &rt->fib6_src;
		rt6_flags = rt->fib6_flags;
	}

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt6_dst->plen;
	rtm->rtm_src_len = rt6_src->plen;
	rtm->rtm_tos = 0;
	if (rt->fib6_table)
		table = rt->fib6_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	/* legacy 8-bit field; tables >= 256 are reported via RTA_TABLE */
	rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT;
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;

	rtm->rtm_type = rt->fib6_type;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->fib6_protocol;

	if (rt6_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	/* an explicit @dest/@src (route-get) is reported as a full /128 */
	if (dest) {
		if (nla_put_in6_addr(skb, RTA_DST, dest))
			goto nla_put_failure;
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
			goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		if (nla_put_in6_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
		goto nla_put_failure;
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		/* multicast destinations are resolved through the mroute
		 * cache; 0 means the message was filled there
		 */
		if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
			int err = ip6mr_get_route(net, skb, rtm, portid);

			if (err == 0)
				return 0;
			if (err < 0)
				goto nla_put_failure;
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
	} else if (dest) {
		struct in6_addr saddr_buf;
		if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	if (rt->fib6_prefsrc.plen) {
		struct in6_addr saddr_buf;
		saddr_buf = rt->fib6_prefsrc.addr;
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
	if (rtnetlink_put_metrics(skb, pmetrics) < 0)
		goto nla_put_failure;

	if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
		goto nla_put_failure;

	/* For multipath routes, walk the siblings list and add
	 * each as a nexthop within RTA_MULTIPATH.
	 */
	if (rt6) {
		if (rt6_flags & RTF_GATEWAY &&
		    nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
			goto nla_put_failure;

		if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
			goto nla_put_failure;
	} else if (rt->fib6_nsiblings) {
		struct fib6_info *sibling, *next_sibling;
		struct nlattr *mp;

		mp = nla_nest_start_noflag(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		if (fib_add_nexthop(skb, &rt->fib6_nh.nh_common,
				    rt->fib6_nh.fib_nh_weight) < 0)
			goto nla_put_failure;

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings, fib6_siblings) {
			if (fib_add_nexthop(skb, &sibling->fib6_nh.nh_common,
					    sibling->fib6_nh.fib_nh_weight) < 0)
				goto nla_put_failure;
		}

		nla_nest_end(skb, mp);
	} else {
		unsigned char nh_flags = 0;

		if (fib_nexthop_info(skb, &rt->fib6_nh.nh_common,
				     &nh_flags, false) < 0)
			goto nla_put_failure;

		rtm->rtm_flags |= nh_flags;
	}

	/* expiry is reported relative to now */
	if (rt6_flags & RTF_EXPIRES) {
		expires = dst ? dst->expires : rt->expires;
		expires -= jiffies;
	}

	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
		goto nla_put_failure;

	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
		goto nla_put_failure;


	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
4825 
4826 static bool fib6_info_uses_dev(const struct fib6_info *f6i,
4827 			       const struct net_device *dev)
4828 {
4829 	if (f6i->fib6_nh.fib_nh_dev == dev)
4830 		return true;
4831 
4832 	if (f6i->fib6_nsiblings) {
4833 		struct fib6_info *sibling, *next_sibling;
4834 
4835 		list_for_each_entry_safe(sibling, next_sibling,
4836 					 &f6i->fib6_siblings, fib6_siblings) {
4837 			if (sibling->fib6_nh.fib_nh_dev == dev)
4838 				return true;
4839 		}
4840 	}
4841 
4842 	return false;
4843 }
4844 
4845 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4846 {
4847 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4848 	struct fib_dump_filter *filter = &arg->filter;
4849 	unsigned int flags = NLM_F_MULTI;
4850 	struct net *net = arg->net;
4851 
4852 	if (rt == net->ipv6.fib6_null_entry)
4853 		return 0;
4854 
4855 	if ((filter->flags & RTM_F_PREFIX) &&
4856 	    !(rt->fib6_flags & RTF_PREFIX_RT)) {
4857 		/* success since this is not a prefix route */
4858 		return 1;
4859 	}
4860 	if (filter->filter_set) {
4861 		if ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
4862 		    (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
4863 		    (filter->protocol && rt->fib6_protocol != filter->protocol)) {
4864 			return 1;
4865 		}
4866 		flags |= NLM_F_DUMP_FILTERED;
4867 	}
4868 
4869 	return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4870 			     RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4871 			     arg->cb->nlh->nlmsg_seq, flags);
4872 }
4873 
/* Validate an RTM_GETROUTE request header and attributes.
 *
 * Legacy (non-strict) sockets get the permissive deprecated parse.  For
 * strict sockets, header fields must be zero (src/dst prefix length may
 * only be 0 or 128), only RTM_F_FIB_MATCH is allowed in rtm_flags, and
 * only the attributes meaningful for a route lookup may be present.
 *
 * Returns 0 on success and fills @tb; -EINVAL (with extack message) or a
 * parse error otherwise.
 */
static int inet6_rtm_valid_getroute_req(struct sk_buff *skb,
					const struct nlmsghdr *nlh,
					struct nlattr **tb,
					struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	int i, err;

	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
		NL_SET_ERR_MSG_MOD(extack,
				   "Invalid header for get route request");
		return -EINVAL;
	}

	/* legacy userspace: no strict checking, permissive parse */
	if (!netlink_strict_get_check(skb))
		return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
					      rtm_ipv6_policy, extack);

	rtm = nlmsg_data(nlh);
	if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) ||
	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) ||
	    rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope ||
	    rtm->rtm_type) {
		NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request");
		return -EINVAL;
	}
	if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) {
		NL_SET_ERR_MSG_MOD(extack,
				   "Invalid flags for get route request");
		return -EINVAL;
	}

	err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
					    rtm_ipv6_policy, extack);
	if (err)
		return err;

	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
		NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6");
		return -EINVAL;
	}

	/* reject any attribute that a route lookup does not use */
	for (i = 0; i <= RTA_MAX; i++) {
		if (!tb[i])
			continue;

		switch (i) {
		case RTA_SRC:
		case RTA_DST:
		case RTA_IIF:
		case RTA_OIF:
		case RTA_MARK:
		case RTA_UID:
		case RTA_SPORT:
		case RTA_DPORT:
		case RTA_IP_PROTO:
			break;
		default:
			NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request");
			return -EINVAL;
		}
	}

	return 0;
}
4940 
/* RTM_GETROUTE handler: run a route lookup described by the request (an
 * input lookup when RTA_IIF is given, otherwise an output lookup) and
 * unicast the resulting route back to the requester.  With RTM_F_FIB_MATCH
 * the matched FIB entry is reported instead of the lookup result.
 */
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	int err, iif = 0, oif = 0;
	struct fib6_info *from;
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi6 fl6 = {};
	bool fibmatch;

	err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	rtm = nlmsg_data(nlh);
	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);

	/* build the flow key from the request attributes */
	if (tb[RTA_SRC]) {
		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
			goto errout;

		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
	}

	if (tb[RTA_DST]) {
		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
			goto errout;

		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
	}

	if (tb[RTA_IIF])
		iif = nla_get_u32(tb[RTA_IIF]);

	if (tb[RTA_OIF])
		oif = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_MARK])
		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);

	if (tb[RTA_UID])
		fl6.flowi6_uid = make_kuid(current_user_ns(),
					   nla_get_u32(tb[RTA_UID]));
	else
		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();

	if (tb[RTA_SPORT])
		fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);

	if (tb[RTA_DPORT])
		fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);

	if (tb[RTA_IP_PROTO]) {
		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
						  &fl6.flowi6_proto, AF_INET6,
						  extack);
		if (err)
			goto errout;
	}

	if (iif) {
		/* input lookup: resolve iif under RCU */
		struct net_device *dev;
		int flags = 0;

		rcu_read_lock();

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			rcu_read_unlock();
			err = -ENODEV;
			goto errout;
		}

		fl6.flowi6_iif = iif;

		if (!ipv6_addr_any(&fl6.saddr))
			flags |= RT6_LOOKUP_F_HAS_SADDR;

		dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);

		rcu_read_unlock();
	} else {
		fl6.flowi6_oif = oif;

		dst = ip6_route_output(net, NULL, &fl6);
	}


	rt = container_of(dst, struct rt6_info, dst);
	if (rt->dst.error) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	/* the null entry means no route matched */
	if (rt == net->ipv6.ip6_null_entry) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb) {
		ip6_rt_put(rt);
		err = -ENOBUFS;
		goto errout;
	}

	/* the skb now owns the dst reference */
	skb_dst_set(skb, &rt->dst);

	/* rt->from (the FIB entry) is RCU protected */
	rcu_read_lock();
	from = rcu_dereference(rt->from);

	if (fibmatch)
		err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, 0);
	else
		err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
				    &fl6.saddr, iif, RTM_NEWROUTE,
				    NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
				    0);
	rcu_read_unlock();

	if (err < 0) {
		kfree_skb(skb);
		goto errout;
	}

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;
}
5080 
5081 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
5082 		     unsigned int nlm_flags)
5083 {
5084 	struct sk_buff *skb;
5085 	struct net *net = info->nl_net;
5086 	u32 seq;
5087 	int err;
5088 
5089 	err = -ENOBUFS;
5090 	seq = info->nlh ? info->nlh->nlmsg_seq : 0;
5091 
5092 	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
5093 	if (!skb)
5094 		goto errout;
5095 
5096 	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
5097 			    event, info->portid, seq, nlm_flags);
5098 	if (err < 0) {
5099 		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
5100 		WARN_ON(err == -EMSGSIZE);
5101 		kfree_skb(skb);
5102 		goto errout;
5103 	}
5104 	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
5105 		    info->nlh, gfp_any());
5106 	return;
5107 errout:
5108 	if (err < 0)
5109 		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
5110 }
5111 
/* netdevice notifier: attach the per-netns template routes (the null
 * entry, plus prohibit/blackhole entries with multiple tables) to the
 * loopback device when it registers, and drop their idev references on
 * unregister.  Non-loopback devices are ignored.
 */
static int ip6_route_dev_notify(struct notifier_block *this,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	if (!(dev->flags & IFF_LOOPBACK))
		return NOTIFY_OK;

	if (event == NETDEV_REGISTER) {
		net->ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = dev;
		net->ipv6.ip6_null_entry->dst.dev = dev;
		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
	 } else if (event == NETDEV_UNREGISTER &&
		    dev->reg_state != NETREG_UNREGISTERED) {
		/* NETDEV_UNREGISTER could be fired for multiple times by
		 * netdev_wait_allrefs(). Make sure we only call this once.
		 */
		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
#endif
	}

	return NOTIFY_OK;
}
5145 
5146 /*
5147  *	/proc
5148  */
5149 
5150 #ifdef CONFIG_PROC_FS
5151 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
5152 {
5153 	struct net *net = (struct net *)seq->private;
5154 	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
5155 		   net->ipv6.rt6_stats->fib_nodes,
5156 		   net->ipv6.rt6_stats->fib_route_nodes,
5157 		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
5158 		   net->ipv6.rt6_stats->fib_rt_entries,
5159 		   net->ipv6.rt6_stats->fib_rt_cache,
5160 		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
5161 		   net->ipv6.rt6_stats->fib_discarded_routes);
5162 
5163 	return 0;
5164 }
5165 #endif	/* CONFIG_PROC_FS */
5166 
5167 #ifdef CONFIG_SYSCTL
5168 
/* sysctl handler for net.ipv6.route.flush: any write triggers a fib6
 * garbage-collection run.  The file is write-only; reads fail -EINVAL.
 */
static
int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
			      void __user *buffer, size_t *lenp, loff_t *ppos)
{
	struct net *net;
	int delay;
	int ret;
	if (!write)
		return -EINVAL;

	net = (struct net *)ctl->extra1;
	/* NOTE(review): delay is sampled before proc_dointvec() stores the
	 * newly written value, so the GC run below uses the previously
	 * stored flush_delay — confirm this ordering is intentional.
	 */
	delay = net->ipv6.sysctl.flush_delay;
	ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
	if (ret)
		return ret;

	/* delay <= 0: flush now; delay > 0: force GC with that timeout */
	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
	return 0;
}
5188 
/* extra1/extra2 range bounds (0..1) for boolean-style sysctl entries */
static int zero;
static int one = 1;
5191 
/* Template for the per-netns net.ipv6.route sysctl table.  Entry order
 * matters: ipv6_route_sysctl_init() rewrites each .data pointer by array
 * index when cloning this template for a new namespace.
 */
static struct ctl_table ipv6_route_table_template[] = {
	{
		.procname	=	"flush",
		.data		=	&init_net.ipv6.sysctl.flush_delay,
		.maxlen		=	sizeof(int),
		.mode		=	0200,
		.proc_handler	=	ipv6_sysctl_rtcache_flush
	},
	{
		.procname	=	"gc_thresh",
		.data		=	&ip6_dst_ops_template.gc_thresh,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"max_size",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_min_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_timeout",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_elasticity",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"mtu_expires",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"min_adv_mss",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_min_interval_ms",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_ms_jiffies,
	},
	{
		.procname	=	"skip_notify_on_dev_down",
		.data		=	&init_net.ipv6.sysctl.skip_notify_on_dev_down,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
		.extra1		=	&zero,
		.extra2		=	&one,
	},
	{ }
};
5274 
/* Clone the route sysctl template for a new netns and redirect each
 * entry's .data to the namespace-private variable.  The numeric indices
 * must stay in sync with the entry order of ipv6_route_table_template.
 * Returns NULL on allocation failure; the caller owns the returned table.
 */
struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);

	if (table) {
		table[0].data = &net->ipv6.sysctl.flush_delay;
		table[0].extra1 = net;
		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			table[0].procname = NULL;
	}

	return table;
}
5304 #endif
5305 
/* Per-netns init of IPv6 routing: clone the dst_ops template, allocate
 * the null (and, with multiple tables, prohibit/blackhole) template
 * routes, and set the routing sysctl defaults.  Unwinds with gotos on
 * allocation failure.
 */
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
					    sizeof(*net->ipv6.fib6_null_entry),
					    GFP_KERNEL);
	if (!net->ipv6.fib6_null_entry)
		goto out_ip6_dst_entries;

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_fib6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	/* default values for the net.ipv6.route sysctls */
	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
	net->ipv6.sysctl.skip_notify_on_dev_down = 0;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_fib6_null_entry:
	kfree(net->ipv6.fib6_null_entry);
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}
5381 
static void __net_exit ip6_route_net_exit(struct net *net)
{
	/* Per-netns teardown: release the template route entries that
	 * ip6_route_net_init() created with kmemdup().  They are plain
	 * heap copies (never handed to the dst refcounting machinery for
	 * release), so a bare kfree() is the correct way to free them.
	 */
	kfree(net->ipv6.fib6_null_entry);
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	/* Undo the dst_entries counter init for this netns' dst_ops. */
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}
5392 
5393 static int __net_init ip6_route_net_init_late(struct net *net)
5394 {
5395 #ifdef CONFIG_PROC_FS
5396 	proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
5397 			sizeof(struct ipv6_route_iter));
5398 	proc_create_net_single("rt6_stats", 0444, net->proc_net,
5399 			rt6_stats_seq_show, NULL);
5400 #endif
5401 	return 0;
5402 }
5403 
static void __net_exit ip6_route_net_exit_late(struct net *net)
{
	/* Remove the proc entries registered by ip6_route_net_init_late(). */
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}
5411 
/* Core per-netns hooks: allocate/free the per-namespace template routes,
 * dst_ops state and routing sysctl defaults.
 */
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};
5416 
5417 static int __net_init ipv6_inetpeer_init(struct net *net)
5418 {
5419 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5420 
5421 	if (!bp)
5422 		return -ENOMEM;
5423 	inet_peer_base_init(bp);
5424 	net->ipv6.peers = bp;
5425 	return 0;
5426 }
5427 
static void __net_exit ipv6_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv6.peers;

	/* Clear the per-netns pointer first, then drop all peer entries
	 * before freeing the base itself.
	 */
	net->ipv6.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}
5436 
/* Per-netns hooks for the IPv6 inetpeer cache. */
static struct pernet_operations ipv6_inetpeer_ops = {
	.init	=	ipv6_inetpeer_init,
	.exit	=	ipv6_inetpeer_exit,
};
5441 
/* Late per-netns hooks: /proc entries, registered after the core routing
 * state so the seq handlers never observe a half-initialized namespace.
 */
static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};
5446 
/* Netdevice event notifier for route maintenance.  Runs at a lower
 * priority than addrconf (ADDRCONF_NOTIFY_PRIORITY - 10) so address
 * configuration has already processed the event when we see it.
 */
static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};
5451 
void __init ip6_route_init_special_entries(void)
{
	/* Registering of the loopback is done before this portion of code,
	 * the loopback reference in rt6_info will not be taken, do it
	 * manually for init_net */
	/* Point init_net's template routes at the loopback device.
	 * in6_dev_get() takes a reference on loopback's inet6_dev for
	 * each template that stores it in rt6i_idev.
	 */
	init_net.ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #endif
}
5467 
/* Module init for the IPv6 routing core.
 *
 * Sets up the rt6_info slab cache, dst entry counters, pernet subsystems
 * (inetpeer, core routing, late /proc), FIB, xfrm and policy-rule layers,
 * the rtnetlink route handlers, the netdevice notifier and the per-CPU
 * uncached-route lists.  Returns 0 on success or a negative errno; on
 * failure everything already set up is unwound via the goto chain below,
 * in exact reverse order of initialization.
 */
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	/* Blackhole dsts share the same slab cache as regular rt6_infos. */
	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	/* rtnetlink handlers for RTM_NEWROUTE/DELROUTE/GETROUTE.  A partial
	 * failure is undone by rtnl_unregister_all() in the error path.
	 */
	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
				   inet6_rtm_newroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
				   inet6_rtm_delroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
				   inet6_rtm_getroute, NULL,
				   RTNL_FLAG_DOIT_UNLOCKED);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	/* Per-CPU lists of uncached (dst-only) routes. */
	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

	/* Error unwind: strict reverse order of the init sequence above. */
out_register_late_subsys:
	rtnl_unregister_all(PF_INET6);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}
5559 
/* Tear down everything ip6_route_init() set up, in exact reverse order
 * of initialization.
 */
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}
5572