xref: /openbmc/linux/net/ipv6/route.c (revision ccf7a31f)
1 /*
2  *	Linux INET6 implementation
3  *	FIB front-end.
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13 
14 /*	Changes:
15  *
16  *	YOSHIFUJI Hideaki @USAGI
17  *		reworked default router selection.
18  *		- respect outgoing interface
19  *		- select from (probably) reachable routers (i.e.
20  *		routers in REACHABLE, STALE, DELAY or PROBE states).
21  *		- always select the same router if it is (probably)
22  *		reachable.  otherwise, round-robin the list.
23  *	Ville Nuorvala
24  *		Fixed routing subtrees.
25  */
26 
27 #define pr_fmt(fmt) "IPv6: " fmt
28 
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/rtnh.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <net/ip.h>
67 #include <linux/uaccess.h>
68 
69 #ifdef CONFIG_SYSCTL
70 #include <linux/sysctl.h>
71 #endif
72 
73 static int ip6_rt_type_to_error(u8 fib6_type);
74 
75 #define CREATE_TRACE_POINTS
76 #include <trace/events/fib6.h>
77 EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
78 #undef CREATE_TRACE_POINTS
79 
80 enum rt6_nud_state {
81 	RT6_NUD_FAIL_HARD = -3,
82 	RT6_NUD_FAIL_PROBE = -2,
83 	RT6_NUD_FAIL_DO_RR = -1,
84 	RT6_NUD_SUCCEED = 1
85 };
86 
87 static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
88 static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
89 static unsigned int	 ip6_mtu(const struct dst_entry *dst);
90 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
91 static void		ip6_dst_destroy(struct dst_entry *);
92 static void		ip6_dst_ifdown(struct dst_entry *,
93 				       struct net_device *dev, int how);
94 static int		 ip6_dst_gc(struct dst_ops *ops);
95 
96 static int		ip6_pkt_discard(struct sk_buff *skb);
97 static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
98 static int		ip6_pkt_prohibit(struct sk_buff *skb);
99 static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
100 static void		ip6_link_failure(struct sk_buff *skb);
101 static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
102 					   struct sk_buff *skb, u32 mtu);
103 static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
104 					struct sk_buff *skb);
105 static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
106 			   int strict);
107 static size_t rt6_nlmsg_size(struct fib6_info *rt);
108 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
109 			 struct fib6_info *rt, struct dst_entry *dst,
110 			 struct in6_addr *dest, struct in6_addr *src,
111 			 int iif, int type, u32 portid, u32 seq,
112 			 unsigned int flags);
113 static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
114 					   const struct in6_addr *daddr,
115 					   const struct in6_addr *saddr);
116 
117 #ifdef CONFIG_IPV6_ROUTE_INFO
118 static struct fib6_info *rt6_add_route_info(struct net *net,
119 					   const struct in6_addr *prefix, int prefixlen,
120 					   const struct in6_addr *gwaddr,
121 					   struct net_device *dev,
122 					   unsigned int pref);
123 static struct fib6_info *rt6_get_route_info(struct net *net,
124 					   const struct in6_addr *prefix, int prefixlen,
125 					   const struct in6_addr *gwaddr,
126 					   struct net_device *dev);
127 #endif
128 
129 struct uncached_list {
130 	spinlock_t		lock;
131 	struct list_head	head;
132 };
133 
134 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
135 
136 void rt6_uncached_list_add(struct rt6_info *rt)
137 {
138 	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
139 
140 	rt->rt6i_uncached_list = ul;
141 
142 	spin_lock_bh(&ul->lock);
143 	list_add_tail(&rt->rt6i_uncached, &ul->head);
144 	spin_unlock_bh(&ul->lock);
145 }
146 
147 void rt6_uncached_list_del(struct rt6_info *rt)
148 {
149 	if (!list_empty(&rt->rt6i_uncached)) {
150 		struct uncached_list *ul = rt->rt6i_uncached_list;
151 		struct net *net = dev_net(rt->dst.dev);
152 
153 		spin_lock_bh(&ul->lock);
154 		list_del(&rt->rt6i_uncached);
155 		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
156 		spin_unlock_bh(&ul->lock);
157 	}
158 }
159 
160 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
161 {
162 	struct net_device *loopback_dev = net->loopback_dev;
163 	int cpu;
164 
165 	if (dev == loopback_dev)
166 		return;
167 
168 	for_each_possible_cpu(cpu) {
169 		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
170 		struct rt6_info *rt;
171 
172 		spin_lock_bh(&ul->lock);
173 		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
174 			struct inet6_dev *rt_idev = rt->rt6i_idev;
175 			struct net_device *rt_dev = rt->dst.dev;
176 
177 			if (rt_idev->dev == dev) {
178 				rt->rt6i_idev = in6_dev_get(loopback_dev);
179 				in6_dev_put(rt_idev);
180 			}
181 
182 			if (rt_dev == dev) {
183 				rt->dst.dev = loopback_dev;
184 				dev_hold(rt->dst.dev);
185 				dev_put(rt_dev);
186 			}
187 		}
188 		spin_unlock_bh(&ul->lock);
189 	}
190 }
191 
192 static inline const void *choose_neigh_daddr(const struct in6_addr *p,
193 					     struct sk_buff *skb,
194 					     const void *daddr)
195 {
196 	if (!ipv6_addr_any(p))
197 		return (const void *) p;
198 	else if (skb)
199 		return &ipv6_hdr(skb)->daddr;
200 	return daddr;
201 }
202 
203 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
204 				   struct net_device *dev,
205 				   struct sk_buff *skb,
206 				   const void *daddr)
207 {
208 	struct neighbour *n;
209 
210 	daddr = choose_neigh_daddr(gw, skb, daddr);
211 	n = __ipv6_neigh_lookup(dev, daddr);
212 	if (n)
213 		return n;
214 
215 	n = neigh_create(&nd_tbl, daddr, dev);
216 	return IS_ERR(n) ? NULL : n;
217 }
218 
219 static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
220 					      struct sk_buff *skb,
221 					      const void *daddr)
222 {
223 	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);
224 
225 	return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
226 }
227 
228 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
229 {
230 	struct net_device *dev = dst->dev;
231 	struct rt6_info *rt = (struct rt6_info *)dst;
232 
233 	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
234 	if (!daddr)
235 		return;
236 	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
237 		return;
238 	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
239 		return;
240 	__ipv6_confirm_neigh(dev, daddr);
241 }
242 
243 static struct dst_ops ip6_dst_ops_template = {
244 	.family			=	AF_INET6,
245 	.gc			=	ip6_dst_gc,
246 	.gc_thresh		=	1024,
247 	.check			=	ip6_dst_check,
248 	.default_advmss		=	ip6_default_advmss,
249 	.mtu			=	ip6_mtu,
250 	.cow_metrics		=	dst_cow_metrics_generic,
251 	.destroy		=	ip6_dst_destroy,
252 	.ifdown			=	ip6_dst_ifdown,
253 	.negative_advice	=	ip6_negative_advice,
254 	.link_failure		=	ip6_link_failure,
255 	.update_pmtu		=	ip6_rt_update_pmtu,
256 	.redirect		=	rt6_do_redirect,
257 	.local_out		=	__ip6_local_out,
258 	.neigh_lookup		=	ip6_dst_neigh_lookup,
259 	.confirm_neigh		=	ip6_confirm_neigh,
260 };
261 
262 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
263 {
264 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
265 
266 	return mtu ? : dst->dev->mtu;
267 }
268 
269 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
270 					 struct sk_buff *skb, u32 mtu)
271 {
272 }
273 
274 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
275 				      struct sk_buff *skb)
276 {
277 }
278 
279 static struct dst_ops ip6_dst_blackhole_ops = {
280 	.family			=	AF_INET6,
281 	.destroy		=	ip6_dst_destroy,
282 	.check			=	ip6_dst_check,
283 	.mtu			=	ip6_blackhole_mtu,
284 	.default_advmss		=	ip6_default_advmss,
285 	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
286 	.redirect		=	ip6_rt_blackhole_redirect,
287 	.cow_metrics		=	dst_cow_metrics_generic,
288 	.neigh_lookup		=	ip6_dst_neigh_lookup,
289 };
290 
291 static const u32 ip6_template_metrics[RTAX_MAX] = {
292 	[RTAX_HOPLIMIT - 1] = 0,
293 };
294 
295 static const struct fib6_info fib6_null_entry_template = {
296 	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
297 	.fib6_protocol  = RTPROT_KERNEL,
298 	.fib6_metric	= ~(u32)0,
299 	.fib6_ref	= REFCOUNT_INIT(1),
300 	.fib6_type	= RTN_UNREACHABLE,
301 	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
302 };
303 
304 static const struct rt6_info ip6_null_entry_template = {
305 	.dst = {
306 		.__refcnt	= ATOMIC_INIT(1),
307 		.__use		= 1,
308 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
309 		.error		= -ENETUNREACH,
310 		.input		= ip6_pkt_discard,
311 		.output		= ip6_pkt_discard_out,
312 	},
313 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
314 };
315 
316 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
317 
318 static const struct rt6_info ip6_prohibit_entry_template = {
319 	.dst = {
320 		.__refcnt	= ATOMIC_INIT(1),
321 		.__use		= 1,
322 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
323 		.error		= -EACCES,
324 		.input		= ip6_pkt_prohibit,
325 		.output		= ip6_pkt_prohibit_out,
326 	},
327 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
328 };
329 
330 static const struct rt6_info ip6_blk_hole_entry_template = {
331 	.dst = {
332 		.__refcnt	= ATOMIC_INIT(1),
333 		.__use		= 1,
334 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
335 		.error		= -EINVAL,
336 		.input		= dst_discard,
337 		.output		= dst_discard_out,
338 	},
339 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
340 };
341 
342 #endif
343 
344 static void rt6_info_init(struct rt6_info *rt)
345 {
346 	struct dst_entry *dst = &rt->dst;
347 
348 	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
349 	INIT_LIST_HEAD(&rt->rt6i_uncached);
350 }
351 
352 /* allocate dst with ip6_dst_ops */
353 struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
354 			       int flags)
355 {
356 	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
357 					1, DST_OBSOLETE_FORCE_CHK, flags);
358 
359 	if (rt) {
360 		rt6_info_init(rt);
361 		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
362 	}
363 
364 	return rt;
365 }
366 EXPORT_SYMBOL(ip6_dst_alloc);
367 
368 static void ip6_dst_destroy(struct dst_entry *dst)
369 {
370 	struct rt6_info *rt = (struct rt6_info *)dst;
371 	struct fib6_info *from;
372 	struct inet6_dev *idev;
373 
374 	ip_dst_metrics_put(dst);
375 	rt6_uncached_list_del(rt);
376 
377 	idev = rt->rt6i_idev;
378 	if (idev) {
379 		rt->rt6i_idev = NULL;
380 		in6_dev_put(idev);
381 	}
382 
383 	from = xchg((__force struct fib6_info **)&rt->from, NULL);
384 	fib6_info_release(from);
385 }
386 
387 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
388 			   int how)
389 {
390 	struct rt6_info *rt = (struct rt6_info *)dst;
391 	struct inet6_dev *idev = rt->rt6i_idev;
392 	struct net_device *loopback_dev =
393 		dev_net(dev)->loopback_dev;
394 
395 	if (idev && idev->dev != loopback_dev) {
396 		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
397 		if (loopback_idev) {
398 			rt->rt6i_idev = loopback_idev;
399 			in6_dev_put(idev);
400 		}
401 	}
402 }
403 
404 static bool __rt6_check_expired(const struct rt6_info *rt)
405 {
406 	if (rt->rt6i_flags & RTF_EXPIRES)
407 		return time_after(jiffies, rt->dst.expires);
408 	else
409 		return false;
410 }
411 
412 static bool rt6_check_expired(const struct rt6_info *rt)
413 {
414 	struct fib6_info *from;
415 
416 	from = rcu_dereference(rt->from);
417 
418 	if (rt->rt6i_flags & RTF_EXPIRES) {
419 		if (time_after(jiffies, rt->dst.expires))
420 			return true;
421 	} else if (from) {
422 		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
423 			fib6_check_expired(from);
424 	}
425 	return false;
426 }
427 
428 void fib6_select_path(const struct net *net, struct fib6_result *res,
429 		      struct flowi6 *fl6, int oif, bool have_oif_match,
430 		      const struct sk_buff *skb, int strict)
431 {
432 	struct fib6_info *sibling, *next_sibling;
433 	struct fib6_info *match = res->f6i;
434 
435 	if (!match->fib6_nsiblings || have_oif_match)
436 		goto out;
437 
438 	/* We might have already computed the hash for ICMPv6 errors. In such
439 	 * case it will always be non-zero. Otherwise now is the time to do it.
440 	 */
441 	if (!fl6->mp_hash)
442 		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
443 
444 	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.fib_nh_upper_bound))
445 		goto out;
446 
447 	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
448 				 fib6_siblings) {
449 		const struct fib6_nh *nh = &sibling->fib6_nh;
450 		int nh_upper_bound;
451 
452 		nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound);
453 		if (fl6->mp_hash > nh_upper_bound)
454 			continue;
455 		if (rt6_score_route(nh, sibling->fib6_flags, oif, strict) < 0)
456 			break;
457 		match = sibling;
458 		break;
459 	}
460 
461 out:
462 	res->f6i = match;
463 	res->nh = &match->fib6_nh;
464 }
465 
466 /*
467  *	Route lookup. rcu_read_lock() should be held.
468  */
469 
470 static bool __rt6_device_match(struct net *net, const struct fib6_nh *nh,
471 			       const struct in6_addr *saddr, int oif, int flags)
472 {
473 	const struct net_device *dev;
474 
475 	if (nh->fib_nh_flags & RTNH_F_DEAD)
476 		return false;
477 
478 	dev = nh->fib_nh_dev;
479 	if (oif) {
480 		if (dev->ifindex == oif)
481 			return true;
482 	} else {
483 		if (ipv6_chk_addr(net, saddr, dev,
484 				  flags & RT6_LOOKUP_F_IFACE))
485 			return true;
486 	}
487 
488 	return false;
489 }
490 
491 static void rt6_device_match(struct net *net, struct fib6_result *res,
492 			     const struct in6_addr *saddr, int oif, int flags)
493 {
494 	struct fib6_info *f6i = res->f6i;
495 	struct fib6_info *spf6i;
496 	struct fib6_nh *nh;
497 
498 	if (!oif && ipv6_addr_any(saddr)) {
499 		nh = &f6i->fib6_nh;
500 		if (!(nh->fib_nh_flags & RTNH_F_DEAD))
501 			goto out;
502 	}
503 
504 	for (spf6i = f6i; spf6i; spf6i = rcu_dereference(spf6i->fib6_next)) {
505 		nh = &spf6i->fib6_nh;
506 		if (__rt6_device_match(net, nh, saddr, oif, flags)) {
507 			res->f6i = spf6i;
508 			goto out;
509 		}
510 	}
511 
512 	if (oif && flags & RT6_LOOKUP_F_IFACE) {
513 		res->f6i = net->ipv6.fib6_null_entry;
514 		nh = &res->f6i->fib6_nh;
515 		goto out;
516 	}
517 
518 	nh = &f6i->fib6_nh;
519 	if (nh->fib_nh_flags & RTNH_F_DEAD) {
520 		res->f6i = net->ipv6.fib6_null_entry;
521 		nh = &res->f6i->fib6_nh;
522 	}
523 out:
524 	res->nh = nh;
525 	res->fib6_type = res->f6i->fib6_type;
526 	res->fib6_flags = res->f6i->fib6_flags;
527 }
528 
529 #ifdef CONFIG_IPV6_ROUTER_PREF
530 struct __rt6_probe_work {
531 	struct work_struct work;
532 	struct in6_addr target;
533 	struct net_device *dev;
534 };
535 
536 static void rt6_probe_deferred(struct work_struct *w)
537 {
538 	struct in6_addr mcaddr;
539 	struct __rt6_probe_work *work =
540 		container_of(w, struct __rt6_probe_work, work);
541 
542 	addrconf_addr_solict_mult(&work->target, &mcaddr);
543 	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
544 	dev_put(work->dev);
545 	kfree(work);
546 }
547 
548 static void rt6_probe(struct fib6_nh *fib6_nh)
549 {
550 	struct __rt6_probe_work *work = NULL;
551 	const struct in6_addr *nh_gw;
552 	struct neighbour *neigh;
553 	struct net_device *dev;
554 	struct inet6_dev *idev;
555 
556 	/*
557 	 * Okay, this does not seem to be appropriate
558 	 * for now, however, we need to check if it
559 	 * is really so; aka Router Reachability Probing.
560 	 *
561 	 * Router Reachability Probe MUST be rate-limited
562 	 * to no more than one per minute.
563 	 */
564 	if (fib6_nh->fib_nh_gw_family)
565 		return;
566 
567 	nh_gw = &fib6_nh->fib_nh_gw6;
568 	dev = fib6_nh->fib_nh_dev;
569 	rcu_read_lock_bh();
570 	idev = __in6_dev_get(dev);
571 	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
572 	if (neigh) {
573 		if (neigh->nud_state & NUD_VALID)
574 			goto out;
575 
576 		write_lock(&neigh->lock);
577 		if (!(neigh->nud_state & NUD_VALID) &&
578 		    time_after(jiffies,
579 			       neigh->updated + idev->cnf.rtr_probe_interval)) {
580 			work = kmalloc(sizeof(*work), GFP_ATOMIC);
581 			if (work)
582 				__neigh_set_probe_once(neigh);
583 		}
584 		write_unlock(&neigh->lock);
585 	} else if (time_after(jiffies, fib6_nh->last_probe +
586 				       idev->cnf.rtr_probe_interval)) {
587 		work = kmalloc(sizeof(*work), GFP_ATOMIC);
588 	}
589 
590 	if (work) {
591 		fib6_nh->last_probe = jiffies;
592 		INIT_WORK(&work->work, rt6_probe_deferred);
593 		work->target = *nh_gw;
594 		dev_hold(dev);
595 		work->dev = dev;
596 		schedule_work(&work->work);
597 	}
598 
599 out:
600 	rcu_read_unlock_bh();
601 }
602 #else
603 static inline void rt6_probe(struct fib6_nh *fib6_nh)
604 {
605 }
606 #endif
607 
608 /*
609  * Default Router Selection (RFC 2461 6.3.6)
610  */
611 static enum rt6_nud_state rt6_check_neigh(const struct fib6_nh *fib6_nh)
612 {
613 	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
614 	struct neighbour *neigh;
615 
616 	rcu_read_lock_bh();
617 	neigh = __ipv6_neigh_lookup_noref(fib6_nh->fib_nh_dev,
618 					  &fib6_nh->fib_nh_gw6);
619 	if (neigh) {
620 		read_lock(&neigh->lock);
621 		if (neigh->nud_state & NUD_VALID)
622 			ret = RT6_NUD_SUCCEED;
623 #ifdef CONFIG_IPV6_ROUTER_PREF
624 		else if (!(neigh->nud_state & NUD_FAILED))
625 			ret = RT6_NUD_SUCCEED;
626 		else
627 			ret = RT6_NUD_FAIL_PROBE;
628 #endif
629 		read_unlock(&neigh->lock);
630 	} else {
631 		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
632 		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
633 	}
634 	rcu_read_unlock_bh();
635 
636 	return ret;
637 }
638 
639 static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
640 			   int strict)
641 {
642 	int m = 0;
643 
644 	if (!oif || nh->fib_nh_dev->ifindex == oif)
645 		m = 2;
646 
647 	if (!m && (strict & RT6_LOOKUP_F_IFACE))
648 		return RT6_NUD_FAIL_HARD;
649 #ifdef CONFIG_IPV6_ROUTER_PREF
650 	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(fib6_flags)) << 2;
651 #endif
652 	if ((strict & RT6_LOOKUP_F_REACHABLE) &&
653 	    !(fib6_flags & RTF_NONEXTHOP) && nh->fib_nh_gw_family) {
654 		int n = rt6_check_neigh(nh);
655 		if (n < 0)
656 			return n;
657 	}
658 	return m;
659 }
660 
661 static bool find_match(struct fib6_nh *nh, u32 fib6_flags,
662 		       int oif, int strict, int *mpri, bool *do_rr)
663 {
664 	bool match_do_rr = false;
665 	bool rc = false;
666 	int m;
667 
668 	if (nh->fib_nh_flags & RTNH_F_DEAD)
669 		goto out;
670 
671 	if (ip6_ignore_linkdown(nh->fib_nh_dev) &&
672 	    nh->fib_nh_flags & RTNH_F_LINKDOWN &&
673 	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
674 		goto out;
675 
676 	m = rt6_score_route(nh, fib6_flags, oif, strict);
677 	if (m == RT6_NUD_FAIL_DO_RR) {
678 		match_do_rr = true;
679 		m = 0; /* lowest valid score */
680 	} else if (m == RT6_NUD_FAIL_HARD) {
681 		goto out;
682 	}
683 
684 	if (strict & RT6_LOOKUP_F_REACHABLE)
685 		rt6_probe(nh);
686 
687 	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
688 	if (m > *mpri) {
689 		*do_rr = match_do_rr;
690 		*mpri = m;
691 		rc = true;
692 	}
693 out:
694 	return rc;
695 }
696 
697 static void __find_rr_leaf(struct fib6_info *f6i_start,
698 			   struct fib6_info *nomatch, u32 metric,
699 			   struct fib6_result *res, struct fib6_info **cont,
700 			   int oif, int strict, bool *do_rr, int *mpri)
701 {
702 	struct fib6_info *f6i;
703 
704 	for (f6i = f6i_start;
705 	     f6i && f6i != nomatch;
706 	     f6i = rcu_dereference(f6i->fib6_next)) {
707 		struct fib6_nh *nh;
708 
709 		if (cont && f6i->fib6_metric != metric) {
710 			*cont = f6i;
711 			return;
712 		}
713 
714 		if (fib6_check_expired(f6i))
715 			continue;
716 
717 		nh = &f6i->fib6_nh;
718 		if (find_match(nh, f6i->fib6_flags, oif, strict, mpri, do_rr)) {
719 			res->f6i = f6i;
720 			res->nh = nh;
721 			res->fib6_flags = f6i->fib6_flags;
722 			res->fib6_type = f6i->fib6_type;
723 		}
724 	}
725 }
726 
727 static void find_rr_leaf(struct fib6_node *fn, struct fib6_info *leaf,
728 			 struct fib6_info *rr_head, int oif, int strict,
729 			 bool *do_rr, struct fib6_result *res)
730 {
731 	u32 metric = rr_head->fib6_metric;
732 	struct fib6_info *cont = NULL;
733 	int mpri = -1;
734 
735 	__find_rr_leaf(rr_head, NULL, metric, res, &cont,
736 		       oif, strict, do_rr, &mpri);
737 
738 	__find_rr_leaf(leaf, rr_head, metric, res, &cont,
739 		       oif, strict, do_rr, &mpri);
740 
741 	if (res->f6i || !cont)
742 		return;
743 
744 	__find_rr_leaf(cont, NULL, metric, res, NULL,
745 		       oif, strict, do_rr, &mpri);
746 }
747 
748 static void rt6_select(struct net *net, struct fib6_node *fn, int oif,
749 		       struct fib6_result *res, int strict)
750 {
751 	struct fib6_info *leaf = rcu_dereference(fn->leaf);
752 	struct fib6_info *rt0;
753 	bool do_rr = false;
754 	int key_plen;
755 
756 	/* make sure this function or its helpers sets f6i */
757 	res->f6i = NULL;
758 
759 	if (!leaf || leaf == net->ipv6.fib6_null_entry)
760 		goto out;
761 
762 	rt0 = rcu_dereference(fn->rr_ptr);
763 	if (!rt0)
764 		rt0 = leaf;
765 
766 	/* Double check to make sure fn is not an intermediate node
767 	 * and fn->leaf does not points to its child's leaf
768 	 * (This might happen if all routes under fn are deleted from
769 	 * the tree and fib6_repair_tree() is called on the node.)
770 	 */
771 	key_plen = rt0->fib6_dst.plen;
772 #ifdef CONFIG_IPV6_SUBTREES
773 	if (rt0->fib6_src.plen)
774 		key_plen = rt0->fib6_src.plen;
775 #endif
776 	if (fn->fn_bit != key_plen)
777 		goto out;
778 
779 	find_rr_leaf(fn, leaf, rt0, oif, strict, &do_rr, res);
780 	if (do_rr) {
781 		struct fib6_info *next = rcu_dereference(rt0->fib6_next);
782 
783 		/* no entries matched; do round-robin */
784 		if (!next || next->fib6_metric != rt0->fib6_metric)
785 			next = leaf;
786 
787 		if (next != rt0) {
788 			spin_lock_bh(&leaf->fib6_table->tb6_lock);
789 			/* make sure next is not being deleted from the tree */
790 			if (next->fib6_node)
791 				rcu_assign_pointer(fn->rr_ptr, next);
792 			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
793 		}
794 	}
795 
796 out:
797 	if (!res->f6i) {
798 		res->f6i = net->ipv6.fib6_null_entry;
799 		res->nh = &res->f6i->fib6_nh;
800 		res->fib6_flags = res->f6i->fib6_flags;
801 		res->fib6_type = res->f6i->fib6_type;
802 	}
803 }
804 
805 static bool rt6_is_gw_or_nonexthop(const struct fib6_result *res)
806 {
807 	return (res->f6i->fib6_flags & RTF_NONEXTHOP) ||
808 	       res->nh->fib_nh_gw_family;
809 }
810 
811 #ifdef CONFIG_IPV6_ROUTE_INFO
812 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
813 		  const struct in6_addr *gwaddr)
814 {
815 	struct net *net = dev_net(dev);
816 	struct route_info *rinfo = (struct route_info *) opt;
817 	struct in6_addr prefix_buf, *prefix;
818 	unsigned int pref;
819 	unsigned long lifetime;
820 	struct fib6_info *rt;
821 
822 	if (len < sizeof(struct route_info)) {
823 		return -EINVAL;
824 	}
825 
826 	/* Sanity check for prefix_len and length */
827 	if (rinfo->length > 3) {
828 		return -EINVAL;
829 	} else if (rinfo->prefix_len > 128) {
830 		return -EINVAL;
831 	} else if (rinfo->prefix_len > 64) {
832 		if (rinfo->length < 2) {
833 			return -EINVAL;
834 		}
835 	} else if (rinfo->prefix_len > 0) {
836 		if (rinfo->length < 1) {
837 			return -EINVAL;
838 		}
839 	}
840 
841 	pref = rinfo->route_pref;
842 	if (pref == ICMPV6_ROUTER_PREF_INVALID)
843 		return -EINVAL;
844 
845 	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
846 
847 	if (rinfo->length == 3)
848 		prefix = (struct in6_addr *)rinfo->prefix;
849 	else {
850 		/* this function is safe */
851 		ipv6_addr_prefix(&prefix_buf,
852 				 (struct in6_addr *)rinfo->prefix,
853 				 rinfo->prefix_len);
854 		prefix = &prefix_buf;
855 	}
856 
857 	if (rinfo->prefix_len == 0)
858 		rt = rt6_get_dflt_router(net, gwaddr, dev);
859 	else
860 		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
861 					gwaddr, dev);
862 
863 	if (rt && !lifetime) {
864 		ip6_del_rt(net, rt);
865 		rt = NULL;
866 	}
867 
868 	if (!rt && lifetime)
869 		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
870 					dev, pref);
871 	else if (rt)
872 		rt->fib6_flags = RTF_ROUTEINFO |
873 				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
874 
875 	if (rt) {
876 		if (!addrconf_finite_timeout(lifetime))
877 			fib6_clean_expires(rt);
878 		else
879 			fib6_set_expires(rt, jiffies + HZ * lifetime);
880 
881 		fib6_info_release(rt);
882 	}
883 	return 0;
884 }
885 #endif
886 
887 /*
888  *	Misc support functions
889  */
890 
891 /* called with rcu_lock held */
892 static struct net_device *ip6_rt_get_dev_rcu(const struct fib6_result *res)
893 {
894 	struct net_device *dev = res->nh->fib_nh_dev;
895 
896 	if (res->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
897 		/* for copies of local routes, dst->dev needs to be the
898 		 * device if it is a master device, the master device if
899 		 * device is enslaved, and the loopback as the default
900 		 */
901 		if (netif_is_l3_slave(dev) &&
902 		    !rt6_need_strict(&res->f6i->fib6_dst.addr))
903 			dev = l3mdev_master_dev_rcu(dev);
904 		else if (!netif_is_l3_master(dev))
905 			dev = dev_net(dev)->loopback_dev;
906 		/* last case is netif_is_l3_master(dev) is true in which
907 		 * case we want dev returned to be dev
908 		 */
909 	}
910 
911 	return dev;
912 }
913 
914 static const int fib6_prop[RTN_MAX + 1] = {
915 	[RTN_UNSPEC]	= 0,
916 	[RTN_UNICAST]	= 0,
917 	[RTN_LOCAL]	= 0,
918 	[RTN_BROADCAST]	= 0,
919 	[RTN_ANYCAST]	= 0,
920 	[RTN_MULTICAST]	= 0,
921 	[RTN_BLACKHOLE]	= -EINVAL,
922 	[RTN_UNREACHABLE] = -EHOSTUNREACH,
923 	[RTN_PROHIBIT]	= -EACCES,
924 	[RTN_THROW]	= -EAGAIN,
925 	[RTN_NAT]	= -EINVAL,
926 	[RTN_XRESOLVE]	= -EINVAL,
927 };
928 
929 static int ip6_rt_type_to_error(u8 fib6_type)
930 {
931 	return fib6_prop[fib6_type];
932 }
933 
934 static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
935 {
936 	unsigned short flags = 0;
937 
938 	if (rt->dst_nocount)
939 		flags |= DST_NOCOUNT;
940 	if (rt->dst_nopolicy)
941 		flags |= DST_NOPOLICY;
942 	if (rt->dst_host)
943 		flags |= DST_HOST;
944 
945 	return flags;
946 }
947 
948 static void ip6_rt_init_dst_reject(struct rt6_info *rt, u8 fib6_type)
949 {
950 	rt->dst.error = ip6_rt_type_to_error(fib6_type);
951 
952 	switch (fib6_type) {
953 	case RTN_BLACKHOLE:
954 		rt->dst.output = dst_discard_out;
955 		rt->dst.input = dst_discard;
956 		break;
957 	case RTN_PROHIBIT:
958 		rt->dst.output = ip6_pkt_prohibit_out;
959 		rt->dst.input = ip6_pkt_prohibit;
960 		break;
961 	case RTN_THROW:
962 	case RTN_UNREACHABLE:
963 	default:
964 		rt->dst.output = ip6_pkt_discard_out;
965 		rt->dst.input = ip6_pkt_discard;
966 		break;
967 	}
968 }
969 
970 static void ip6_rt_init_dst(struct rt6_info *rt, const struct fib6_result *res)
971 {
972 	struct fib6_info *f6i = res->f6i;
973 
974 	if (res->fib6_flags & RTF_REJECT) {
975 		ip6_rt_init_dst_reject(rt, res->fib6_type);
976 		return;
977 	}
978 
979 	rt->dst.error = 0;
980 	rt->dst.output = ip6_output;
981 
982 	if (res->fib6_type == RTN_LOCAL || res->fib6_type == RTN_ANYCAST) {
983 		rt->dst.input = ip6_input;
984 	} else if (ipv6_addr_type(&f6i->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
985 		rt->dst.input = ip6_mc_input;
986 	} else {
987 		rt->dst.input = ip6_forward;
988 	}
989 
990 	if (res->nh->fib_nh_lws) {
991 		rt->dst.lwtstate = lwtstate_get(res->nh->fib_nh_lws);
992 		lwtunnel_set_redirect(&rt->dst);
993 	}
994 
995 	rt->dst.lastuse = jiffies;
996 }
997 
998 /* Caller must already hold reference to @from */
999 static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
1000 {
1001 	rt->rt6i_flags &= ~RTF_EXPIRES;
1002 	rcu_assign_pointer(rt->from, from);
1003 	ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
1004 }
1005 
1006 /* Caller must already hold reference to f6i in result */
1007 static void ip6_rt_copy_init(struct rt6_info *rt, const struct fib6_result *res)
1008 {
1009 	const struct fib6_nh *nh = res->nh;
1010 	const struct net_device *dev = nh->fib_nh_dev;
1011 	struct fib6_info *f6i = res->f6i;
1012 
1013 	ip6_rt_init_dst(rt, res);
1014 
1015 	rt->rt6i_dst = f6i->fib6_dst;
1016 	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
1017 	rt->rt6i_flags = res->fib6_flags;
1018 	if (nh->fib_nh_gw_family) {
1019 		rt->rt6i_gateway = nh->fib_nh_gw6;
1020 		rt->rt6i_flags |= RTF_GATEWAY;
1021 	}
1022 	rt6_set_from(rt, f6i);
1023 #ifdef CONFIG_IPV6_SUBTREES
1024 	rt->rt6i_src = f6i->fib6_src;
1025 #endif
1026 }
1027 
1028 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
1029 					struct in6_addr *saddr)
1030 {
1031 	struct fib6_node *pn, *sn;
1032 	while (1) {
1033 		if (fn->fn_flags & RTN_TL_ROOT)
1034 			return NULL;
1035 		pn = rcu_dereference(fn->parent);
1036 		sn = FIB6_SUBTREE(pn);
1037 		if (sn && sn != fn)
1038 			fn = fib6_node_lookup(sn, NULL, saddr);
1039 		else
1040 			fn = pn;
1041 		if (fn->fn_flags & RTN_RTINFO)
1042 			return fn;
1043 	}
1044 }
1045 
1046 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt)
1047 {
1048 	struct rt6_info *rt = *prt;
1049 
1050 	if (dst_hold_safe(&rt->dst))
1051 		return true;
1052 	if (net) {
1053 		rt = net->ipv6.ip6_null_entry;
1054 		dst_hold(&rt->dst);
1055 	} else {
1056 		rt = NULL;
1057 	}
1058 	*prt = rt;
1059 	return false;
1060 }
1061 
1062 /* called with rcu_lock held */
1063 static struct rt6_info *ip6_create_rt_rcu(const struct fib6_result *res)
1064 {
1065 	struct net_device *dev = res->nh->fib_nh_dev;
1066 	struct fib6_info *f6i = res->f6i;
1067 	unsigned short flags;
1068 	struct rt6_info *nrt;
1069 
1070 	if (!fib6_info_hold_safe(f6i))
1071 		goto fallback;
1072 
1073 	flags = fib6_info_dst_flags(f6i);
1074 	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
1075 	if (!nrt) {
1076 		fib6_info_release(f6i);
1077 		goto fallback;
1078 	}
1079 
1080 	ip6_rt_copy_init(nrt, res);
1081 	return nrt;
1082 
1083 fallback:
1084 	nrt = dev_net(dev)->ipv6.ip6_null_entry;
1085 	dst_hold(&nrt->dst);
1086 	return nrt;
1087 }
1088 
1089 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
1090 					     struct fib6_table *table,
1091 					     struct flowi6 *fl6,
1092 					     const struct sk_buff *skb,
1093 					     int flags)
1094 {
1095 	struct fib6_result res = {};
1096 	struct fib6_node *fn;
1097 	struct rt6_info *rt;
1098 
1099 	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1100 		flags &= ~RT6_LOOKUP_F_IFACE;
1101 
1102 	rcu_read_lock();
1103 	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1104 restart:
1105 	res.f6i = rcu_dereference(fn->leaf);
1106 	if (!res.f6i)
1107 		res.f6i = net->ipv6.fib6_null_entry;
1108 	else
1109 		rt6_device_match(net, &res, &fl6->saddr, fl6->flowi6_oif,
1110 				 flags);
1111 
1112 	if (res.f6i == net->ipv6.fib6_null_entry) {
1113 		fn = fib6_backtrack(fn, &fl6->saddr);
1114 		if (fn)
1115 			goto restart;
1116 
1117 		rt = net->ipv6.ip6_null_entry;
1118 		dst_hold(&rt->dst);
1119 		goto out;
1120 	}
1121 
1122 	fib6_select_path(net, &res, fl6, fl6->flowi6_oif,
1123 			 fl6->flowi6_oif != 0, skb, flags);
1124 
1125 	/* Search through exception table */
1126 	rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
1127 	if (rt) {
1128 		if (ip6_hold_safe(net, &rt))
1129 			dst_use_noref(&rt->dst, jiffies);
1130 	} else {
1131 		rt = ip6_create_rt_rcu(&res);
1132 	}
1133 
1134 out:
1135 	trace_fib6_table_lookup(net, &res, table, fl6);
1136 
1137 	rcu_read_unlock();
1138 
1139 	return rt;
1140 }
1141 
1142 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
1143 				   const struct sk_buff *skb, int flags)
1144 {
1145 	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
1146 }
1147 EXPORT_SYMBOL_GPL(ip6_route_lookup);
1148 
1149 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
1150 			    const struct in6_addr *saddr, int oif,
1151 			    const struct sk_buff *skb, int strict)
1152 {
1153 	struct flowi6 fl6 = {
1154 		.flowi6_oif = oif,
1155 		.daddr = *daddr,
1156 	};
1157 	struct dst_entry *dst;
1158 	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
1159 
1160 	if (saddr) {
1161 		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
1162 		flags |= RT6_LOOKUP_F_HAS_SADDR;
1163 	}
1164 
1165 	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
1166 	if (dst->error == 0)
1167 		return (struct rt6_info *) dst;
1168 
1169 	dst_release(dst);
1170 
1171 	return NULL;
1172 }
1173 EXPORT_SYMBOL(rt6_lookup);
1174 
1175 /* ip6_ins_rt is called with FREE table->tb6_lock.
1176  * It takes new route entry, the addition fails by any reason the
1177  * route is released.
1178  * Caller must hold dst before calling it.
1179  */
1180 
1181 static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
1182 			struct netlink_ext_ack *extack)
1183 {
1184 	int err;
1185 	struct fib6_table *table;
1186 
1187 	table = rt->fib6_table;
1188 	spin_lock_bh(&table->tb6_lock);
1189 	err = fib6_add(&table->tb6_root, rt, info, extack);
1190 	spin_unlock_bh(&table->tb6_lock);
1191 
1192 	return err;
1193 }
1194 
1195 int ip6_ins_rt(struct net *net, struct fib6_info *rt)
1196 {
1197 	struct nl_info info = {	.nl_net = net, };
1198 
1199 	return __ip6_ins_rt(rt, &info, NULL);
1200 }
1201 
1202 static struct rt6_info *ip6_rt_cache_alloc(const struct fib6_result *res,
1203 					   const struct in6_addr *daddr,
1204 					   const struct in6_addr *saddr)
1205 {
1206 	struct fib6_info *f6i = res->f6i;
1207 	struct net_device *dev;
1208 	struct rt6_info *rt;
1209 
1210 	/*
1211 	 *	Clone the route.
1212 	 */
1213 
1214 	if (!fib6_info_hold_safe(f6i))
1215 		return NULL;
1216 
1217 	dev = ip6_rt_get_dev_rcu(res);
1218 	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
1219 	if (!rt) {
1220 		fib6_info_release(f6i);
1221 		return NULL;
1222 	}
1223 
1224 	ip6_rt_copy_init(rt, res);
1225 	rt->rt6i_flags |= RTF_CACHE;
1226 	rt->dst.flags |= DST_HOST;
1227 	rt->rt6i_dst.addr = *daddr;
1228 	rt->rt6i_dst.plen = 128;
1229 
1230 	if (!rt6_is_gw_or_nonexthop(res)) {
1231 		if (f6i->fib6_dst.plen != 128 &&
1232 		    ipv6_addr_equal(&f6i->fib6_dst.addr, daddr))
1233 			rt->rt6i_flags |= RTF_ANYCAST;
1234 #ifdef CONFIG_IPV6_SUBTREES
1235 		if (rt->rt6i_src.plen && saddr) {
1236 			rt->rt6i_src.addr = *saddr;
1237 			rt->rt6i_src.plen = 128;
1238 		}
1239 #endif
1240 	}
1241 
1242 	return rt;
1243 }
1244 
1245 static struct rt6_info *ip6_rt_pcpu_alloc(const struct fib6_result *res)
1246 {
1247 	struct fib6_info *f6i = res->f6i;
1248 	unsigned short flags = fib6_info_dst_flags(f6i);
1249 	struct net_device *dev;
1250 	struct rt6_info *pcpu_rt;
1251 
1252 	if (!fib6_info_hold_safe(f6i))
1253 		return NULL;
1254 
1255 	rcu_read_lock();
1256 	dev = ip6_rt_get_dev_rcu(res);
1257 	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
1258 	rcu_read_unlock();
1259 	if (!pcpu_rt) {
1260 		fib6_info_release(f6i);
1261 		return NULL;
1262 	}
1263 	ip6_rt_copy_init(pcpu_rt, res);
1264 	pcpu_rt->rt6i_flags |= RTF_PCPU;
1265 	return pcpu_rt;
1266 }
1267 
1268 /* It should be called with rcu_read_lock() acquired */
1269 static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res)
1270 {
1271 	struct rt6_info *pcpu_rt, **p;
1272 
1273 	p = this_cpu_ptr(res->f6i->rt6i_pcpu);
1274 	pcpu_rt = *p;
1275 
1276 	if (pcpu_rt)
1277 		ip6_hold_safe(NULL, &pcpu_rt);
1278 
1279 	return pcpu_rt;
1280 }
1281 
1282 static struct rt6_info *rt6_make_pcpu_route(struct net *net,
1283 					    const struct fib6_result *res)
1284 {
1285 	struct rt6_info *pcpu_rt, *prev, **p;
1286 
1287 	pcpu_rt = ip6_rt_pcpu_alloc(res);
1288 	if (!pcpu_rt) {
1289 		dst_hold(&net->ipv6.ip6_null_entry->dst);
1290 		return net->ipv6.ip6_null_entry;
1291 	}
1292 
1293 	dst_hold(&pcpu_rt->dst);
1294 	p = this_cpu_ptr(res->f6i->rt6i_pcpu);
1295 	prev = cmpxchg(p, NULL, pcpu_rt);
1296 	BUG_ON(prev);
1297 
1298 	if (res->f6i->fib6_destroying) {
1299 		struct fib6_info *from;
1300 
1301 		from = xchg((__force struct fib6_info **)&pcpu_rt->from, NULL);
1302 		fib6_info_release(from);
1303 	}
1304 
1305 	return pcpu_rt;
1306 }
1307 
1308 /* exception hash table implementation
1309  */
1310 static DEFINE_SPINLOCK(rt6_exception_lock);
1311 
1312 /* Remove rt6_ex from hash table and free the memory
1313  * Caller must hold rt6_exception_lock
1314  */
1315 static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1316 				 struct rt6_exception *rt6_ex)
1317 {
1318 	struct fib6_info *from;
1319 	struct net *net;
1320 
1321 	if (!bucket || !rt6_ex)
1322 		return;
1323 
1324 	net = dev_net(rt6_ex->rt6i->dst.dev);
1325 	net->ipv6.rt6_stats->fib_rt_cache--;
1326 
1327 	/* purge completely the exception to allow releasing the held resources:
1328 	 * some [sk] cache may keep the dst around for unlimited time
1329 	 */
1330 	from = xchg((__force struct fib6_info **)&rt6_ex->rt6i->from, NULL);
1331 	fib6_info_release(from);
1332 	dst_dev_put(&rt6_ex->rt6i->dst);
1333 
1334 	hlist_del_rcu(&rt6_ex->hlist);
1335 	dst_release(&rt6_ex->rt6i->dst);
1336 	kfree_rcu(rt6_ex, rcu);
1337 	WARN_ON_ONCE(!bucket->depth);
1338 	bucket->depth--;
1339 }
1340 
1341 /* Remove oldest rt6_ex in bucket and free the memory
1342  * Caller must hold rt6_exception_lock
1343  */
1344 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1345 {
1346 	struct rt6_exception *rt6_ex, *oldest = NULL;
1347 
1348 	if (!bucket)
1349 		return;
1350 
1351 	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1352 		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1353 			oldest = rt6_ex;
1354 	}
1355 	rt6_remove_exception(bucket, oldest);
1356 }
1357 
1358 static u32 rt6_exception_hash(const struct in6_addr *dst,
1359 			      const struct in6_addr *src)
1360 {
1361 	static u32 seed __read_mostly;
1362 	u32 val;
1363 
1364 	net_get_random_once(&seed, sizeof(seed));
1365 	val = jhash(dst, sizeof(*dst), seed);
1366 
1367 #ifdef CONFIG_IPV6_SUBTREES
1368 	if (src)
1369 		val = jhash(src, sizeof(*src), val);
1370 #endif
1371 	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1372 }
1373 
1374 /* Helper function to find the cached rt in the hash table
1375  * and update bucket pointer to point to the bucket for this
1376  * (daddr, saddr) pair
1377  * Caller must hold rt6_exception_lock
1378  */
1379 static struct rt6_exception *
1380 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1381 			      const struct in6_addr *daddr,
1382 			      const struct in6_addr *saddr)
1383 {
1384 	struct rt6_exception *rt6_ex;
1385 	u32 hval;
1386 
1387 	if (!(*bucket) || !daddr)
1388 		return NULL;
1389 
1390 	hval = rt6_exception_hash(daddr, saddr);
1391 	*bucket += hval;
1392 
1393 	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1394 		struct rt6_info *rt6 = rt6_ex->rt6i;
1395 		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1396 
1397 #ifdef CONFIG_IPV6_SUBTREES
1398 		if (matched && saddr)
1399 			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1400 #endif
1401 		if (matched)
1402 			return rt6_ex;
1403 	}
1404 	return NULL;
1405 }
1406 
1407 /* Helper function to find the cached rt in the hash table
1408  * and update bucket pointer to point to the bucket for this
1409  * (daddr, saddr) pair
1410  * Caller must hold rcu_read_lock()
1411  */
1412 static struct rt6_exception *
1413 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1414 			 const struct in6_addr *daddr,
1415 			 const struct in6_addr *saddr)
1416 {
1417 	struct rt6_exception *rt6_ex;
1418 	u32 hval;
1419 
1420 	WARN_ON_ONCE(!rcu_read_lock_held());
1421 
1422 	if (!(*bucket) || !daddr)
1423 		return NULL;
1424 
1425 	hval = rt6_exception_hash(daddr, saddr);
1426 	*bucket += hval;
1427 
1428 	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1429 		struct rt6_info *rt6 = rt6_ex->rt6i;
1430 		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1431 
1432 #ifdef CONFIG_IPV6_SUBTREES
1433 		if (matched && saddr)
1434 			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1435 #endif
1436 		if (matched)
1437 			return rt6_ex;
1438 	}
1439 	return NULL;
1440 }
1441 
1442 static unsigned int fib6_mtu(const struct fib6_result *res)
1443 {
1444 	const struct fib6_nh *nh = res->nh;
1445 	unsigned int mtu;
1446 
1447 	if (res->f6i->fib6_pmtu) {
1448 		mtu = res->f6i->fib6_pmtu;
1449 	} else {
1450 		struct net_device *dev = nh->fib_nh_dev;
1451 		struct inet6_dev *idev;
1452 
1453 		rcu_read_lock();
1454 		idev = __in6_dev_get(dev);
1455 		mtu = idev->cnf.mtu6;
1456 		rcu_read_unlock();
1457 	}
1458 
1459 	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1460 
1461 	return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
1462 }
1463 
1464 static int rt6_insert_exception(struct rt6_info *nrt,
1465 				const struct fib6_result *res)
1466 {
1467 	struct net *net = dev_net(nrt->dst.dev);
1468 	struct rt6_exception_bucket *bucket;
1469 	struct in6_addr *src_key = NULL;
1470 	struct rt6_exception *rt6_ex;
1471 	struct fib6_info *f6i = res->f6i;
1472 	int err = 0;
1473 
1474 	spin_lock_bh(&rt6_exception_lock);
1475 
1476 	if (f6i->exception_bucket_flushed) {
1477 		err = -EINVAL;
1478 		goto out;
1479 	}
1480 
1481 	bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket,
1482 					lockdep_is_held(&rt6_exception_lock));
1483 	if (!bucket) {
1484 		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
1485 				 GFP_ATOMIC);
1486 		if (!bucket) {
1487 			err = -ENOMEM;
1488 			goto out;
1489 		}
1490 		rcu_assign_pointer(f6i->rt6i_exception_bucket, bucket);
1491 	}
1492 
1493 #ifdef CONFIG_IPV6_SUBTREES
1494 	/* fib6_src.plen != 0 indicates f6i is in subtree
1495 	 * and exception table is indexed by a hash of
1496 	 * both fib6_dst and fib6_src.
1497 	 * Otherwise, the exception table is indexed by
1498 	 * a hash of only fib6_dst.
1499 	 */
1500 	if (f6i->fib6_src.plen)
1501 		src_key = &nrt->rt6i_src.addr;
1502 #endif
1503 	/* rt6_mtu_change() might lower mtu on f6i.
1504 	 * Only insert this exception route if its mtu
1505 	 * is less than f6i's mtu value.
1506 	 */
1507 	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(res)) {
1508 		err = -EINVAL;
1509 		goto out;
1510 	}
1511 
1512 	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
1513 					       src_key);
1514 	if (rt6_ex)
1515 		rt6_remove_exception(bucket, rt6_ex);
1516 
1517 	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
1518 	if (!rt6_ex) {
1519 		err = -ENOMEM;
1520 		goto out;
1521 	}
1522 	rt6_ex->rt6i = nrt;
1523 	rt6_ex->stamp = jiffies;
1524 	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
1525 	bucket->depth++;
1526 	net->ipv6.rt6_stats->fib_rt_cache++;
1527 
1528 	if (bucket->depth > FIB6_MAX_DEPTH)
1529 		rt6_exception_remove_oldest(bucket);
1530 
1531 out:
1532 	spin_unlock_bh(&rt6_exception_lock);
1533 
1534 	/* Update fn->fn_sernum to invalidate all cached dst */
1535 	if (!err) {
1536 		spin_lock_bh(&f6i->fib6_table->tb6_lock);
1537 		fib6_update_sernum(net, f6i);
1538 		spin_unlock_bh(&f6i->fib6_table->tb6_lock);
1539 		fib6_force_start_gc(net);
1540 	}
1541 
1542 	return err;
1543 }
1544 
1545 void rt6_flush_exceptions(struct fib6_info *rt)
1546 {
1547 	struct rt6_exception_bucket *bucket;
1548 	struct rt6_exception *rt6_ex;
1549 	struct hlist_node *tmp;
1550 	int i;
1551 
1552 	spin_lock_bh(&rt6_exception_lock);
1553 	/* Prevent rt6_insert_exception() to recreate the bucket list */
1554 	rt->exception_bucket_flushed = 1;
1555 
1556 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1557 				    lockdep_is_held(&rt6_exception_lock));
1558 	if (!bucket)
1559 		goto out;
1560 
1561 	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1562 		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1563 			rt6_remove_exception(bucket, rt6_ex);
1564 		WARN_ON_ONCE(bucket->depth);
1565 		bucket++;
1566 	}
1567 
1568 out:
1569 	spin_unlock_bh(&rt6_exception_lock);
1570 }
1571 
1572 /* Find cached rt in the hash table inside passed in rt
1573  * Caller has to hold rcu_read_lock()
1574  */
1575 static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
1576 					   const struct in6_addr *daddr,
1577 					   const struct in6_addr *saddr)
1578 {
1579 	const struct in6_addr *src_key = NULL;
1580 	struct rt6_exception_bucket *bucket;
1581 	struct rt6_exception *rt6_ex;
1582 	struct rt6_info *ret = NULL;
1583 
1584 #ifdef CONFIG_IPV6_SUBTREES
1585 	/* fib6i_src.plen != 0 indicates f6i is in subtree
1586 	 * and exception table is indexed by a hash of
1587 	 * both fib6_dst and fib6_src.
1588 	 * However, the src addr used to create the hash
1589 	 * might not be exactly the passed in saddr which
1590 	 * is a /128 addr from the flow.
1591 	 * So we need to use f6i->fib6_src to redo lookup
1592 	 * if the passed in saddr does not find anything.
1593 	 * (See the logic in ip6_rt_cache_alloc() on how
1594 	 * rt->rt6i_src is updated.)
1595 	 */
1596 	if (res->f6i->fib6_src.plen)
1597 		src_key = saddr;
1598 find_ex:
1599 #endif
1600 	bucket = rcu_dereference(res->f6i->rt6i_exception_bucket);
1601 	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
1602 
1603 	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1604 		ret = rt6_ex->rt6i;
1605 
1606 #ifdef CONFIG_IPV6_SUBTREES
1607 	/* Use fib6_src as src_key and redo lookup */
1608 	if (!ret && src_key && src_key != &res->f6i->fib6_src.addr) {
1609 		src_key = &res->f6i->fib6_src.addr;
1610 		goto find_ex;
1611 	}
1612 #endif
1613 
1614 	return ret;
1615 }
1616 
1617 /* Remove the passed in cached rt from the hash table that contains it */
1618 static int rt6_remove_exception_rt(struct rt6_info *rt)
1619 {
1620 	struct rt6_exception_bucket *bucket;
1621 	struct in6_addr *src_key = NULL;
1622 	struct rt6_exception *rt6_ex;
1623 	struct fib6_info *from;
1624 	int err;
1625 
1626 	from = rcu_dereference(rt->from);
1627 	if (!from ||
1628 	    !(rt->rt6i_flags & RTF_CACHE))
1629 		return -EINVAL;
1630 
1631 	if (!rcu_access_pointer(from->rt6i_exception_bucket))
1632 		return -ENOENT;
1633 
1634 	spin_lock_bh(&rt6_exception_lock);
1635 	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1636 				    lockdep_is_held(&rt6_exception_lock));
1637 #ifdef CONFIG_IPV6_SUBTREES
1638 	/* rt6i_src.plen != 0 indicates 'from' is in subtree
1639 	 * and exception table is indexed by a hash of
1640 	 * both rt6i_dst and rt6i_src.
1641 	 * Otherwise, the exception table is indexed by
1642 	 * a hash of only rt6i_dst.
1643 	 */
1644 	if (from->fib6_src.plen)
1645 		src_key = &rt->rt6i_src.addr;
1646 #endif
1647 	rt6_ex = __rt6_find_exception_spinlock(&bucket,
1648 					       &rt->rt6i_dst.addr,
1649 					       src_key);
1650 	if (rt6_ex) {
1651 		rt6_remove_exception(bucket, rt6_ex);
1652 		err = 0;
1653 	} else {
1654 		err = -ENOENT;
1655 	}
1656 
1657 	spin_unlock_bh(&rt6_exception_lock);
1658 	return err;
1659 }
1660 
1661 /* Find rt6_ex which contains the passed in rt cache and
1662  * refresh its stamp
1663  */
1664 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1665 {
1666 	struct rt6_exception_bucket *bucket;
1667 	struct in6_addr *src_key = NULL;
1668 	struct rt6_exception *rt6_ex;
1669 	struct fib6_info *from;
1670 
1671 	rcu_read_lock();
1672 	from = rcu_dereference(rt->from);
1673 	if (!from || !(rt->rt6i_flags & RTF_CACHE))
1674 		goto unlock;
1675 
1676 	bucket = rcu_dereference(from->rt6i_exception_bucket);
1677 
1678 #ifdef CONFIG_IPV6_SUBTREES
1679 	/* rt6i_src.plen != 0 indicates 'from' is in subtree
1680 	 * and exception table is indexed by a hash of
1681 	 * both rt6i_dst and rt6i_src.
1682 	 * Otherwise, the exception table is indexed by
1683 	 * a hash of only rt6i_dst.
1684 	 */
1685 	if (from->fib6_src.plen)
1686 		src_key = &rt->rt6i_src.addr;
1687 #endif
1688 	rt6_ex = __rt6_find_exception_rcu(&bucket,
1689 					  &rt->rt6i_dst.addr,
1690 					  src_key);
1691 	if (rt6_ex)
1692 		rt6_ex->stamp = jiffies;
1693 
1694 unlock:
1695 	rcu_read_unlock();
1696 }
1697 
1698 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1699 					 struct rt6_info *rt, int mtu)
1700 {
1701 	/* If the new MTU is lower than the route PMTU, this new MTU will be the
1702 	 * lowest MTU in the path: always allow updating the route PMTU to
1703 	 * reflect PMTU decreases.
1704 	 *
1705 	 * If the new MTU is higher, and the route PMTU is equal to the local
1706 	 * MTU, this means the old MTU is the lowest in the path, so allow
1707 	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
1708 	 * handle this.
1709 	 */
1710 
1711 	if (dst_mtu(&rt->dst) >= mtu)
1712 		return true;
1713 
1714 	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
1715 		return true;
1716 
1717 	return false;
1718 }
1719 
1720 static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
1721 				       struct fib6_info *rt, int mtu)
1722 {
1723 	struct rt6_exception_bucket *bucket;
1724 	struct rt6_exception *rt6_ex;
1725 	int i;
1726 
1727 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1728 					lockdep_is_held(&rt6_exception_lock));
1729 
1730 	if (!bucket)
1731 		return;
1732 
1733 	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1734 		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1735 			struct rt6_info *entry = rt6_ex->rt6i;
1736 
1737 			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
1738 			 * route), the metrics of its rt->from have already
1739 			 * been updated.
1740 			 */
1741 			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
1742 			    rt6_mtu_change_route_allowed(idev, entry, mtu))
1743 				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
1744 		}
1745 		bucket++;
1746 	}
1747 }
1748 
1749 #define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)
1750 
1751 static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
1752 					struct in6_addr *gateway)
1753 {
1754 	struct rt6_exception_bucket *bucket;
1755 	struct rt6_exception *rt6_ex;
1756 	struct hlist_node *tmp;
1757 	int i;
1758 
1759 	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1760 		return;
1761 
1762 	spin_lock_bh(&rt6_exception_lock);
1763 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1764 				     lockdep_is_held(&rt6_exception_lock));
1765 
1766 	if (bucket) {
1767 		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1768 			hlist_for_each_entry_safe(rt6_ex, tmp,
1769 						  &bucket->chain, hlist) {
1770 				struct rt6_info *entry = rt6_ex->rt6i;
1771 
1772 				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
1773 				    RTF_CACHE_GATEWAY &&
1774 				    ipv6_addr_equal(gateway,
1775 						    &entry->rt6i_gateway)) {
1776 					rt6_remove_exception(bucket, rt6_ex);
1777 				}
1778 			}
1779 			bucket++;
1780 		}
1781 	}
1782 
1783 	spin_unlock_bh(&rt6_exception_lock);
1784 }
1785 
1786 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
1787 				      struct rt6_exception *rt6_ex,
1788 				      struct fib6_gc_args *gc_args,
1789 				      unsigned long now)
1790 {
1791 	struct rt6_info *rt = rt6_ex->rt6i;
1792 
1793 	/* we are pruning and obsoleting aged-out and non gateway exceptions
1794 	 * even if others have still references to them, so that on next
1795 	 * dst_check() such references can be dropped.
1796 	 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
1797 	 * expired, independently from their aging, as per RFC 8201 section 4
1798 	 */
1799 	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
1800 		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
1801 			RT6_TRACE("aging clone %p\n", rt);
1802 			rt6_remove_exception(bucket, rt6_ex);
1803 			return;
1804 		}
1805 	} else if (time_after(jiffies, rt->dst.expires)) {
1806 		RT6_TRACE("purging expired route %p\n", rt);
1807 		rt6_remove_exception(bucket, rt6_ex);
1808 		return;
1809 	}
1810 
1811 	if (rt->rt6i_flags & RTF_GATEWAY) {
1812 		struct neighbour *neigh;
1813 		__u8 neigh_flags = 0;
1814 
1815 		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
1816 		if (neigh)
1817 			neigh_flags = neigh->flags;
1818 
1819 		if (!(neigh_flags & NTF_ROUTER)) {
1820 			RT6_TRACE("purging route %p via non-router but gateway\n",
1821 				  rt);
1822 			rt6_remove_exception(bucket, rt6_ex);
1823 			return;
1824 		}
1825 	}
1826 
1827 	gc_args->more++;
1828 }
1829 
1830 void rt6_age_exceptions(struct fib6_info *rt,
1831 			struct fib6_gc_args *gc_args,
1832 			unsigned long now)
1833 {
1834 	struct rt6_exception_bucket *bucket;
1835 	struct rt6_exception *rt6_ex;
1836 	struct hlist_node *tmp;
1837 	int i;
1838 
1839 	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1840 		return;
1841 
1842 	rcu_read_lock_bh();
1843 	spin_lock(&rt6_exception_lock);
1844 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1845 				    lockdep_is_held(&rt6_exception_lock));
1846 
1847 	if (bucket) {
1848 		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1849 			hlist_for_each_entry_safe(rt6_ex, tmp,
1850 						  &bucket->chain, hlist) {
1851 				rt6_age_examine_exception(bucket, rt6_ex,
1852 							  gc_args, now);
1853 			}
1854 			bucket++;
1855 		}
1856 	}
1857 	spin_unlock(&rt6_exception_lock);
1858 	rcu_read_unlock_bh();
1859 }
1860 
1861 /* must be called with rcu lock held */
1862 int fib6_table_lookup(struct net *net, struct fib6_table *table, int oif,
1863 		      struct flowi6 *fl6, struct fib6_result *res, int strict)
1864 {
1865 	struct fib6_node *fn, *saved_fn;
1866 
1867 	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1868 	saved_fn = fn;
1869 
1870 	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1871 		oif = 0;
1872 
1873 redo_rt6_select:
1874 	rt6_select(net, fn, oif, res, strict);
1875 	if (res->f6i == net->ipv6.fib6_null_entry) {
1876 		fn = fib6_backtrack(fn, &fl6->saddr);
1877 		if (fn)
1878 			goto redo_rt6_select;
1879 		else if (strict & RT6_LOOKUP_F_REACHABLE) {
1880 			/* also consider unreachable route */
1881 			strict &= ~RT6_LOOKUP_F_REACHABLE;
1882 			fn = saved_fn;
1883 			goto redo_rt6_select;
1884 		}
1885 	}
1886 
1887 	trace_fib6_table_lookup(net, res, table, fl6);
1888 
1889 	return 0;
1890 }
1891 
1892 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1893 			       int oif, struct flowi6 *fl6,
1894 			       const struct sk_buff *skb, int flags)
1895 {
1896 	struct fib6_result res = {};
1897 	struct rt6_info *rt;
1898 	int strict = 0;
1899 
1900 	strict |= flags & RT6_LOOKUP_F_IFACE;
1901 	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1902 	if (net->ipv6.devconf_all->forwarding == 0)
1903 		strict |= RT6_LOOKUP_F_REACHABLE;
1904 
1905 	rcu_read_lock();
1906 
1907 	fib6_table_lookup(net, table, oif, fl6, &res, strict);
1908 	if (res.f6i == net->ipv6.fib6_null_entry) {
1909 		rt = net->ipv6.ip6_null_entry;
1910 		rcu_read_unlock();
1911 		dst_hold(&rt->dst);
1912 		return rt;
1913 	}
1914 
1915 	fib6_select_path(net, &res, fl6, oif, false, skb, strict);
1916 
1917 	/*Search through exception table */
1918 	rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
1919 	if (rt) {
1920 		if (ip6_hold_safe(net, &rt))
1921 			dst_use_noref(&rt->dst, jiffies);
1922 
1923 		rcu_read_unlock();
1924 		return rt;
1925 	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1926 			    !res.nh->fib_nh_gw_family)) {
1927 		/* Create a RTF_CACHE clone which will not be
1928 		 * owned by the fib6 tree.  It is for the special case where
1929 		 * the daddr in the skb during the neighbor look-up is different
1930 		 * from the fl6->daddr used to look-up route here.
1931 		 */
1932 		struct rt6_info *uncached_rt;
1933 
1934 		uncached_rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL);
1935 
1936 		rcu_read_unlock();
1937 
1938 		if (uncached_rt) {
1939 			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1940 			 * No need for another dst_hold()
1941 			 */
1942 			rt6_uncached_list_add(uncached_rt);
1943 			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
1944 		} else {
1945 			uncached_rt = net->ipv6.ip6_null_entry;
1946 			dst_hold(&uncached_rt->dst);
1947 		}
1948 
1949 		return uncached_rt;
1950 	} else {
1951 		/* Get a percpu copy */
1952 
1953 		struct rt6_info *pcpu_rt;
1954 
1955 		local_bh_disable();
1956 		pcpu_rt = rt6_get_pcpu_route(&res);
1957 
1958 		if (!pcpu_rt)
1959 			pcpu_rt = rt6_make_pcpu_route(net, &res);
1960 
1961 		local_bh_enable();
1962 		rcu_read_unlock();
1963 
1964 		return pcpu_rt;
1965 	}
1966 }
1967 EXPORT_SYMBOL_GPL(ip6_pol_route);
1968 
1969 static struct rt6_info *ip6_pol_route_input(struct net *net,
1970 					    struct fib6_table *table,
1971 					    struct flowi6 *fl6,
1972 					    const struct sk_buff *skb,
1973 					    int flags)
1974 {
1975 	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
1976 }
1977 
1978 struct dst_entry *ip6_route_input_lookup(struct net *net,
1979 					 struct net_device *dev,
1980 					 struct flowi6 *fl6,
1981 					 const struct sk_buff *skb,
1982 					 int flags)
1983 {
1984 	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1985 		flags |= RT6_LOOKUP_F_IFACE;
1986 
1987 	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
1988 }
1989 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1990 
1991 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1992 				  struct flow_keys *keys,
1993 				  struct flow_keys *flkeys)
1994 {
1995 	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1996 	const struct ipv6hdr *key_iph = outer_iph;
1997 	struct flow_keys *_flkeys = flkeys;
1998 	const struct ipv6hdr *inner_iph;
1999 	const struct icmp6hdr *icmph;
2000 	struct ipv6hdr _inner_iph;
2001 	struct icmp6hdr _icmph;
2002 
2003 	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
2004 		goto out;
2005 
2006 	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
2007 				   sizeof(_icmph), &_icmph);
2008 	if (!icmph)
2009 		goto out;
2010 
2011 	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
2012 	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
2013 	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
2014 	    icmph->icmp6_type != ICMPV6_PARAMPROB)
2015 		goto out;
2016 
2017 	inner_iph = skb_header_pointer(skb,
2018 				       skb_transport_offset(skb) + sizeof(*icmph),
2019 				       sizeof(_inner_iph), &_inner_iph);
2020 	if (!inner_iph)
2021 		goto out;
2022 
2023 	key_iph = inner_iph;
2024 	_flkeys = NULL;
2025 out:
2026 	if (_flkeys) {
2027 		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
2028 		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
2029 		keys->tags.flow_label = _flkeys->tags.flow_label;
2030 		keys->basic.ip_proto = _flkeys->basic.ip_proto;
2031 	} else {
2032 		keys->addrs.v6addrs.src = key_iph->saddr;
2033 		keys->addrs.v6addrs.dst = key_iph->daddr;
2034 		keys->tags.flow_label = ip6_flowlabel(key_iph);
2035 		keys->basic.ip_proto = key_iph->nexthdr;
2036 	}
2037 }
2038 
2039 /* if skb is set it will be used and fl6 can be NULL */
2040 u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
2041 		       const struct sk_buff *skb, struct flow_keys *flkeys)
2042 {
2043 	struct flow_keys hash_keys;
2044 	u32 mhash;
2045 
2046 	switch (ip6_multipath_hash_policy(net)) {
2047 	case 0:
2048 		memset(&hash_keys, 0, sizeof(hash_keys));
2049 		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2050 		if (skb) {
2051 			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
2052 		} else {
2053 			hash_keys.addrs.v6addrs.src = fl6->saddr;
2054 			hash_keys.addrs.v6addrs.dst = fl6->daddr;
2055 			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
2056 			hash_keys.basic.ip_proto = fl6->flowi6_proto;
2057 		}
2058 		break;
2059 	case 1:
2060 		if (skb) {
2061 			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
2062 			struct flow_keys keys;
2063 
2064 			/* short-circuit if we already have L4 hash present */
2065 			if (skb->l4_hash)
2066 				return skb_get_hash_raw(skb) >> 1;
2067 
2068 			memset(&hash_keys, 0, sizeof(hash_keys));
2069 
2070                         if (!flkeys) {
2071 				skb_flow_dissect_flow_keys(skb, &keys, flag);
2072 				flkeys = &keys;
2073 			}
2074 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2075 			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
2076 			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
2077 			hash_keys.ports.src = flkeys->ports.src;
2078 			hash_keys.ports.dst = flkeys->ports.dst;
2079 			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2080 		} else {
2081 			memset(&hash_keys, 0, sizeof(hash_keys));
2082 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2083 			hash_keys.addrs.v6addrs.src = fl6->saddr;
2084 			hash_keys.addrs.v6addrs.dst = fl6->daddr;
2085 			hash_keys.ports.src = fl6->fl6_sport;
2086 			hash_keys.ports.dst = fl6->fl6_dport;
2087 			hash_keys.basic.ip_proto = fl6->flowi6_proto;
2088 		}
2089 		break;
2090 	}
2091 	mhash = flow_hash_from_keys(&hash_keys);
2092 
2093 	return mhash >> 1;
2094 }
2095 
2096 void ip6_route_input(struct sk_buff *skb)
2097 {
2098 	const struct ipv6hdr *iph = ipv6_hdr(skb);
2099 	struct net *net = dev_net(skb->dev);
2100 	int flags = RT6_LOOKUP_F_HAS_SADDR;
2101 	struct ip_tunnel_info *tun_info;
2102 	struct flowi6 fl6 = {
2103 		.flowi6_iif = skb->dev->ifindex,
2104 		.daddr = iph->daddr,
2105 		.saddr = iph->saddr,
2106 		.flowlabel = ip6_flowinfo(iph),
2107 		.flowi6_mark = skb->mark,
2108 		.flowi6_proto = iph->nexthdr,
2109 	};
2110 	struct flow_keys *flkeys = NULL, _flkeys;
2111 
2112 	tun_info = skb_tunnel_info(skb);
2113 	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2114 		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
2115 
2116 	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
2117 		flkeys = &_flkeys;
2118 
2119 	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
2120 		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
2121 	skb_dst_drop(skb);
2122 	skb_dst_set(skb,
2123 		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
2124 }
2125 
2126 static struct rt6_info *ip6_pol_route_output(struct net *net,
2127 					     struct fib6_table *table,
2128 					     struct flowi6 *fl6,
2129 					     const struct sk_buff *skb,
2130 					     int flags)
2131 {
2132 	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
2133 }
2134 
2135 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
2136 					 struct flowi6 *fl6, int flags)
2137 {
2138 	bool any_src;
2139 
2140 	if (ipv6_addr_type(&fl6->daddr) &
2141 	    (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
2142 		struct dst_entry *dst;
2143 
2144 		dst = l3mdev_link_scope_lookup(net, fl6);
2145 		if (dst)
2146 			return dst;
2147 	}
2148 
2149 	fl6->flowi6_iif = LOOPBACK_IFINDEX;
2150 
2151 	any_src = ipv6_addr_any(&fl6->saddr);
2152 	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
2153 	    (fl6->flowi6_oif && any_src))
2154 		flags |= RT6_LOOKUP_F_IFACE;
2155 
2156 	if (!any_src)
2157 		flags |= RT6_LOOKUP_F_HAS_SADDR;
2158 	else if (sk)
2159 		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
2160 
2161 	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
2162 }
2163 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
2164 
2165 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2166 {
2167 	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
2168 	struct net_device *loopback_dev = net->loopback_dev;
2169 	struct dst_entry *new = NULL;
2170 
2171 	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
2172 		       DST_OBSOLETE_DEAD, 0);
2173 	if (rt) {
2174 		rt6_info_init(rt);
2175 		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
2176 
2177 		new = &rt->dst;
2178 		new->__use = 1;
2179 		new->input = dst_discard;
2180 		new->output = dst_discard_out;
2181 
2182 		dst_copy_metrics(new, &ort->dst);
2183 
2184 		rt->rt6i_idev = in6_dev_get(loopback_dev);
2185 		rt->rt6i_gateway = ort->rt6i_gateway;
2186 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
2187 
2188 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
2189 #ifdef CONFIG_IPV6_SUBTREES
2190 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
2191 #endif
2192 	}
2193 
2194 	dst_release(dst_orig);
2195 	return new ? new : ERR_PTR(-ENOMEM);
2196 }
2197 
2198 /*
2199  *	Destination cache support functions
2200  */
2201 
2202 static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2203 {
2204 	u32 rt_cookie = 0;
2205 
2206 	if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2207 		return false;
2208 
2209 	if (fib6_check_expired(f6i))
2210 		return false;
2211 
2212 	return true;
2213 }
2214 
2215 static struct dst_entry *rt6_check(struct rt6_info *rt,
2216 				   struct fib6_info *from,
2217 				   u32 cookie)
2218 {
2219 	u32 rt_cookie = 0;
2220 
2221 	if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
2222 	    rt_cookie != cookie)
2223 		return NULL;
2224 
2225 	if (rt6_check_expired(rt))
2226 		return NULL;
2227 
2228 	return &rt->dst;
2229 }
2230 
2231 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2232 					    struct fib6_info *from,
2233 					    u32 cookie)
2234 {
2235 	if (!__rt6_check_expired(rt) &&
2236 	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2237 	    fib6_check(from, cookie))
2238 		return &rt->dst;
2239 	else
2240 		return NULL;
2241 }
2242 
2243 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
2244 {
2245 	struct dst_entry *dst_ret;
2246 	struct fib6_info *from;
2247 	struct rt6_info *rt;
2248 
2249 	rt = container_of(dst, struct rt6_info, dst);
2250 
2251 	rcu_read_lock();
2252 
2253 	/* All IPV6 dsts are created with ->obsolete set to the value
2254 	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
2255 	 * into this function always.
2256 	 */
2257 
2258 	from = rcu_dereference(rt->from);
2259 
2260 	if (from && (rt->rt6i_flags & RTF_PCPU ||
2261 	    unlikely(!list_empty(&rt->rt6i_uncached))))
2262 		dst_ret = rt6_dst_from_check(rt, from, cookie);
2263 	else
2264 		dst_ret = rt6_check(rt, from, cookie);
2265 
2266 	rcu_read_unlock();
2267 
2268 	return dst_ret;
2269 }
2270 
2271 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
2272 {
2273 	struct rt6_info *rt = (struct rt6_info *) dst;
2274 
2275 	if (rt) {
2276 		if (rt->rt6i_flags & RTF_CACHE) {
2277 			rcu_read_lock();
2278 			if (rt6_check_expired(rt)) {
2279 				rt6_remove_exception_rt(rt);
2280 				dst = NULL;
2281 			}
2282 			rcu_read_unlock();
2283 		} else {
2284 			dst_release(dst);
2285 			dst = NULL;
2286 		}
2287 	}
2288 	return dst;
2289 }
2290 
2291 static void ip6_link_failure(struct sk_buff *skb)
2292 {
2293 	struct rt6_info *rt;
2294 
2295 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2296 
2297 	rt = (struct rt6_info *) skb_dst(skb);
2298 	if (rt) {
2299 		rcu_read_lock();
2300 		if (rt->rt6i_flags & RTF_CACHE) {
2301 			rt6_remove_exception_rt(rt);
2302 		} else {
2303 			struct fib6_info *from;
2304 			struct fib6_node *fn;
2305 
2306 			from = rcu_dereference(rt->from);
2307 			if (from) {
2308 				fn = rcu_dereference(from->fib6_node);
2309 				if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2310 					fn->fn_sernum = -1;
2311 			}
2312 		}
2313 		rcu_read_unlock();
2314 	}
2315 }
2316 
2317 static void rt6_update_expires(struct rt6_info *rt0, int timeout)
2318 {
2319 	if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
2320 		struct fib6_info *from;
2321 
2322 		rcu_read_lock();
2323 		from = rcu_dereference(rt0->from);
2324 		if (from)
2325 			rt0->dst.expires = from->expires;
2326 		rcu_read_unlock();
2327 	}
2328 
2329 	dst_set_expires(&rt0->dst, timeout);
2330 	rt0->rt6i_flags |= RTF_EXPIRES;
2331 }
2332 
2333 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2334 {
2335 	struct net *net = dev_net(rt->dst.dev);
2336 
2337 	dst_metric_set(&rt->dst, RTAX_MTU, mtu);
2338 	rt->rt6i_flags |= RTF_MODIFIED;
2339 	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2340 }
2341 
2342 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2343 {
2344 	return !(rt->rt6i_flags & RTF_CACHE) &&
2345 		(rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from));
2346 }
2347 
2348 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2349 				 const struct ipv6hdr *iph, u32 mtu)
2350 {
2351 	const struct in6_addr *daddr, *saddr;
2352 	struct rt6_info *rt6 = (struct rt6_info *)dst;
2353 
2354 	if (dst_metric_locked(dst, RTAX_MTU))
2355 		return;
2356 
2357 	if (iph) {
2358 		daddr = &iph->daddr;
2359 		saddr = &iph->saddr;
2360 	} else if (sk) {
2361 		daddr = &sk->sk_v6_daddr;
2362 		saddr = &inet6_sk(sk)->saddr;
2363 	} else {
2364 		daddr = NULL;
2365 		saddr = NULL;
2366 	}
2367 	dst_confirm_neigh(dst, daddr);
2368 	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2369 	if (mtu >= dst_mtu(dst))
2370 		return;
2371 
2372 	if (!rt6_cache_allowed_for_pmtu(rt6)) {
2373 		rt6_do_update_pmtu(rt6, mtu);
2374 		/* update rt6_ex->stamp for cache */
2375 		if (rt6->rt6i_flags & RTF_CACHE)
2376 			rt6_update_exception_stamp_rt(rt6);
2377 	} else if (daddr) {
2378 		struct fib6_result res = {};
2379 		struct rt6_info *nrt6;
2380 
2381 		rcu_read_lock();
2382 		res.f6i = rcu_dereference(rt6->from);
2383 		if (!res.f6i) {
2384 			rcu_read_unlock();
2385 			return;
2386 		}
2387 		res.nh = &res.f6i->fib6_nh;
2388 		res.fib6_flags = res.f6i->fib6_flags;
2389 		res.fib6_type = res.f6i->fib6_type;
2390 
2391 		nrt6 = ip6_rt_cache_alloc(&res, daddr, saddr);
2392 		if (nrt6) {
2393 			rt6_do_update_pmtu(nrt6, mtu);
2394 			if (rt6_insert_exception(nrt6, &res))
2395 				dst_release_immediate(&nrt6->dst);
2396 		}
2397 		rcu_read_unlock();
2398 	}
2399 }
2400 
2401 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2402 			       struct sk_buff *skb, u32 mtu)
2403 {
2404 	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2405 }
2406 
2407 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2408 		     int oif, u32 mark, kuid_t uid)
2409 {
2410 	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2411 	struct dst_entry *dst;
2412 	struct flowi6 fl6 = {
2413 		.flowi6_oif = oif,
2414 		.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark),
2415 		.daddr = iph->daddr,
2416 		.saddr = iph->saddr,
2417 		.flowlabel = ip6_flowinfo(iph),
2418 		.flowi6_uid = uid,
2419 	};
2420 
2421 	dst = ip6_route_output(net, NULL, &fl6);
2422 	if (!dst->error)
2423 		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2424 	dst_release(dst);
2425 }
2426 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2427 
2428 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2429 {
2430 	int oif = sk->sk_bound_dev_if;
2431 	struct dst_entry *dst;
2432 
2433 	if (!oif && skb->dev)
2434 		oif = l3mdev_master_ifindex(skb->dev);
2435 
2436 	ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid);
2437 
2438 	dst = __sk_dst_get(sk);
2439 	if (!dst || !dst->obsolete ||
2440 	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2441 		return;
2442 
2443 	bh_lock_sock(sk);
2444 	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2445 		ip6_datagram_dst_update(sk, false);
2446 	bh_unlock_sock(sk);
2447 }
2448 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2449 
2450 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
2451 			   const struct flowi6 *fl6)
2452 {
2453 #ifdef CONFIG_IPV6_SUBTREES
2454 	struct ipv6_pinfo *np = inet6_sk(sk);
2455 #endif
2456 
2457 	ip6_dst_store(sk, dst,
2458 		      ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
2459 		      &sk->sk_v6_daddr : NULL,
2460 #ifdef CONFIG_IPV6_SUBTREES
2461 		      ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
2462 		      &np->saddr :
2463 #endif
2464 		      NULL);
2465 }
2466 
2467 static bool ip6_redirect_nh_match(const struct fib6_result *res,
2468 				  struct flowi6 *fl6,
2469 				  const struct in6_addr *gw,
2470 				  struct rt6_info **ret)
2471 {
2472 	const struct fib6_nh *nh = res->nh;
2473 
2474 	if (nh->fib_nh_flags & RTNH_F_DEAD || !nh->fib_nh_gw_family ||
2475 	    fl6->flowi6_oif != nh->fib_nh_dev->ifindex)
2476 		return false;
2477 
2478 	/* rt_cache's gateway might be different from its 'parent'
2479 	 * in the case of an ip redirect.
2480 	 * So we keep searching in the exception table if the gateway
2481 	 * is different.
2482 	 */
2483 	if (!ipv6_addr_equal(gw, &nh->fib_nh_gw6)) {
2484 		struct rt6_info *rt_cache;
2485 
2486 		rt_cache = rt6_find_cached_rt(res, &fl6->daddr, &fl6->saddr);
2487 		if (rt_cache &&
2488 		    ipv6_addr_equal(gw, &rt_cache->rt6i_gateway)) {
2489 			*ret = rt_cache;
2490 			return true;
2491 		}
2492 		return false;
2493 	}
2494 	return true;
2495 }
2496 
2497 /* Handle redirects */
2498 struct ip6rd_flowi {
2499 	struct flowi6 fl6;
2500 	struct in6_addr gateway;
2501 };
2502 
2503 static struct rt6_info *__ip6_route_redirect(struct net *net,
2504 					     struct fib6_table *table,
2505 					     struct flowi6 *fl6,
2506 					     const struct sk_buff *skb,
2507 					     int flags)
2508 {
2509 	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2510 	struct rt6_info *ret = NULL;
2511 	struct fib6_result res = {};
2512 	struct fib6_info *rt;
2513 	struct fib6_node *fn;
2514 
2515 	/* Get the "current" route for this destination and
2516 	 * check if the redirect has come from appropriate router.
2517 	 *
2518 	 * RFC 4861 specifies that redirects should only be
2519 	 * accepted if they come from the nexthop to the target.
2520 	 * Due to the way the routes are chosen, this notion
2521 	 * is a bit fuzzy and one might need to check all possible
2522 	 * routes.
2523 	 */
2524 
2525 	rcu_read_lock();
2526 	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2527 restart:
2528 	for_each_fib6_node_rt_rcu(fn) {
2529 		res.f6i = rt;
2530 		res.nh = &rt->fib6_nh;
2531 
2532 		if (fib6_check_expired(rt))
2533 			continue;
2534 		if (rt->fib6_flags & RTF_REJECT)
2535 			break;
2536 		if (ip6_redirect_nh_match(&res, fl6, &rdfl->gateway, &ret))
2537 			goto out;
2538 	}
2539 
2540 	if (!rt)
2541 		rt = net->ipv6.fib6_null_entry;
2542 	else if (rt->fib6_flags & RTF_REJECT) {
2543 		ret = net->ipv6.ip6_null_entry;
2544 		goto out;
2545 	}
2546 
2547 	if (rt == net->ipv6.fib6_null_entry) {
2548 		fn = fib6_backtrack(fn, &fl6->saddr);
2549 		if (fn)
2550 			goto restart;
2551 	}
2552 
2553 	res.f6i = rt;
2554 	res.nh = &rt->fib6_nh;
2555 out:
2556 	if (ret) {
2557 		ip6_hold_safe(net, &ret);
2558 	} else {
2559 		res.fib6_flags = res.f6i->fib6_flags;
2560 		res.fib6_type = res.f6i->fib6_type;
2561 		ret = ip6_create_rt_rcu(&res);
2562 	}
2563 
2564 	rcu_read_unlock();
2565 
2566 	trace_fib6_table_lookup(net, &res, table, fl6);
2567 	return ret;
2568 };
2569 
2570 static struct dst_entry *ip6_route_redirect(struct net *net,
2571 					    const struct flowi6 *fl6,
2572 					    const struct sk_buff *skb,
2573 					    const struct in6_addr *gateway)
2574 {
2575 	int flags = RT6_LOOKUP_F_HAS_SADDR;
2576 	struct ip6rd_flowi rdfl;
2577 
2578 	rdfl.fl6 = *fl6;
2579 	rdfl.gateway = *gateway;
2580 
2581 	return fib6_rule_lookup(net, &rdfl.fl6, skb,
2582 				flags, __ip6_route_redirect);
2583 }
2584 
2585 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2586 		  kuid_t uid)
2587 {
2588 	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2589 	struct dst_entry *dst;
2590 	struct flowi6 fl6 = {
2591 		.flowi6_iif = LOOPBACK_IFINDEX,
2592 		.flowi6_oif = oif,
2593 		.flowi6_mark = mark,
2594 		.daddr = iph->daddr,
2595 		.saddr = iph->saddr,
2596 		.flowlabel = ip6_flowinfo(iph),
2597 		.flowi6_uid = uid,
2598 	};
2599 
2600 	dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2601 	rt6_do_redirect(dst, NULL, skb);
2602 	dst_release(dst);
2603 }
2604 EXPORT_SYMBOL_GPL(ip6_redirect);
2605 
2606 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif)
2607 {
2608 	const struct ipv6hdr *iph = ipv6_hdr(skb);
2609 	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2610 	struct dst_entry *dst;
2611 	struct flowi6 fl6 = {
2612 		.flowi6_iif = LOOPBACK_IFINDEX,
2613 		.flowi6_oif = oif,
2614 		.daddr = msg->dest,
2615 		.saddr = iph->daddr,
2616 		.flowi6_uid = sock_net_uid(net, NULL),
2617 	};
2618 
2619 	dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2620 	rt6_do_redirect(dst, NULL, skb);
2621 	dst_release(dst);
2622 }
2623 
2624 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2625 {
2626 	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2627 		     sk->sk_uid);
2628 }
2629 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2630 
2631 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2632 {
2633 	struct net_device *dev = dst->dev;
2634 	unsigned int mtu = dst_mtu(dst);
2635 	struct net *net = dev_net(dev);
2636 
2637 	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2638 
2639 	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2640 		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2641 
2642 	/*
2643 	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2644 	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2645 	 * IPV6_MAXPLEN is also valid and means: "any MSS,
2646 	 * rely only on pmtu discovery"
2647 	 */
2648 	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2649 		mtu = IPV6_MAXPLEN;
2650 	return mtu;
2651 }
2652 
2653 static unsigned int ip6_mtu(const struct dst_entry *dst)
2654 {
2655 	struct inet6_dev *idev;
2656 	unsigned int mtu;
2657 
2658 	mtu = dst_metric_raw(dst, RTAX_MTU);
2659 	if (mtu)
2660 		goto out;
2661 
2662 	mtu = IPV6_MIN_MTU;
2663 
2664 	rcu_read_lock();
2665 	idev = __in6_dev_get(dst->dev);
2666 	if (idev)
2667 		mtu = idev->cnf.mtu6;
2668 	rcu_read_unlock();
2669 
2670 out:
2671 	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2672 
2673 	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2674 }
2675 
2676 /* MTU selection:
2677  * 1. mtu on route is locked - use it
2678  * 2. mtu from nexthop exception
2679  * 3. mtu from egress device
2680  *
2681  * based on ip6_dst_mtu_forward and exception logic of
2682  * rt6_find_cached_rt; called with rcu_read_lock
2683  */
2684 u32 ip6_mtu_from_fib6(const struct fib6_result *res,
2685 		      const struct in6_addr *daddr,
2686 		      const struct in6_addr *saddr)
2687 {
2688 	const struct fib6_nh *nh = res->nh;
2689 	struct fib6_info *f6i = res->f6i;
2690 	struct inet6_dev *idev;
2691 	struct rt6_info *rt;
2692 	u32 mtu = 0;
2693 
2694 	if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
2695 		mtu = f6i->fib6_pmtu;
2696 		if (mtu)
2697 			goto out;
2698 	}
2699 
2700 	rt = rt6_find_cached_rt(res, daddr, saddr);
2701 	if (unlikely(rt)) {
2702 		mtu = dst_metric_raw(&rt->dst, RTAX_MTU);
2703 	} else {
2704 		struct net_device *dev = nh->fib_nh_dev;
2705 
2706 		mtu = IPV6_MIN_MTU;
2707 		idev = __in6_dev_get(dev);
2708 		if (idev && idev->cnf.mtu6 > mtu)
2709 			mtu = idev->cnf.mtu6;
2710 	}
2711 
2712 	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2713 out:
2714 	return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
2715 }
2716 
2717 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2718 				  struct flowi6 *fl6)
2719 {
2720 	struct dst_entry *dst;
2721 	struct rt6_info *rt;
2722 	struct inet6_dev *idev = in6_dev_get(dev);
2723 	struct net *net = dev_net(dev);
2724 
2725 	if (unlikely(!idev))
2726 		return ERR_PTR(-ENODEV);
2727 
2728 	rt = ip6_dst_alloc(net, dev, 0);
2729 	if (unlikely(!rt)) {
2730 		in6_dev_put(idev);
2731 		dst = ERR_PTR(-ENOMEM);
2732 		goto out;
2733 	}
2734 
2735 	rt->dst.flags |= DST_HOST;
2736 	rt->dst.input = ip6_input;
2737 	rt->dst.output  = ip6_output;
2738 	rt->rt6i_gateway  = fl6->daddr;
2739 	rt->rt6i_dst.addr = fl6->daddr;
2740 	rt->rt6i_dst.plen = 128;
2741 	rt->rt6i_idev     = idev;
2742 	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2743 
2744 	/* Add this dst into uncached_list so that rt6_disable_ip() can
2745 	 * do proper release of the net_device
2746 	 */
2747 	rt6_uncached_list_add(rt);
2748 	atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2749 
2750 	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2751 
2752 out:
2753 	return dst;
2754 }
2755 
2756 static int ip6_dst_gc(struct dst_ops *ops)
2757 {
2758 	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2759 	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2760 	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2761 	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2762 	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2763 	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2764 	int entries;
2765 
2766 	entries = dst_entries_get_fast(ops);
2767 	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2768 	    entries <= rt_max_size)
2769 		goto out;
2770 
2771 	net->ipv6.ip6_rt_gc_expire++;
2772 	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2773 	entries = dst_entries_get_slow(ops);
2774 	if (entries < ops->gc_thresh)
2775 		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2776 out:
2777 	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2778 	return entries > rt_max_size;
2779 }
2780 
2781 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2782 					    struct fib6_config *cfg,
2783 					    const struct in6_addr *gw_addr,
2784 					    u32 tbid, int flags)
2785 {
2786 	struct flowi6 fl6 = {
2787 		.flowi6_oif = cfg->fc_ifindex,
2788 		.daddr = *gw_addr,
2789 		.saddr = cfg->fc_prefsrc,
2790 	};
2791 	struct fib6_table *table;
2792 	struct rt6_info *rt;
2793 
2794 	table = fib6_get_table(net, tbid);
2795 	if (!table)
2796 		return NULL;
2797 
2798 	if (!ipv6_addr_any(&cfg->fc_prefsrc))
2799 		flags |= RT6_LOOKUP_F_HAS_SADDR;
2800 
2801 	flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2802 	rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2803 
2804 	/* if table lookup failed, fall back to full lookup */
2805 	if (rt == net->ipv6.ip6_null_entry) {
2806 		ip6_rt_put(rt);
2807 		rt = NULL;
2808 	}
2809 
2810 	return rt;
2811 }
2812 
2813 static int ip6_route_check_nh_onlink(struct net *net,
2814 				     struct fib6_config *cfg,
2815 				     const struct net_device *dev,
2816 				     struct netlink_ext_ack *extack)
2817 {
2818 	u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2819 	const struct in6_addr *gw_addr = &cfg->fc_gateway;
2820 	u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2821 	struct fib6_info *from;
2822 	struct rt6_info *grt;
2823 	int err;
2824 
2825 	err = 0;
2826 	grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2827 	if (grt) {
2828 		rcu_read_lock();
2829 		from = rcu_dereference(grt->from);
2830 		if (!grt->dst.error &&
2831 		    /* ignore match if it is the default route */
2832 		    from && !ipv6_addr_any(&from->fib6_dst.addr) &&
2833 		    (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2834 			NL_SET_ERR_MSG(extack,
2835 				       "Nexthop has invalid gateway or device mismatch");
2836 			err = -EINVAL;
2837 		}
2838 		rcu_read_unlock();
2839 
2840 		ip6_rt_put(grt);
2841 	}
2842 
2843 	return err;
2844 }
2845 
2846 static int ip6_route_check_nh(struct net *net,
2847 			      struct fib6_config *cfg,
2848 			      struct net_device **_dev,
2849 			      struct inet6_dev **idev)
2850 {
2851 	const struct in6_addr *gw_addr = &cfg->fc_gateway;
2852 	struct net_device *dev = _dev ? *_dev : NULL;
2853 	struct rt6_info *grt = NULL;
2854 	int err = -EHOSTUNREACH;
2855 
2856 	if (cfg->fc_table) {
2857 		int flags = RT6_LOOKUP_F_IFACE;
2858 
2859 		grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2860 					  cfg->fc_table, flags);
2861 		if (grt) {
2862 			if (grt->rt6i_flags & RTF_GATEWAY ||
2863 			    (dev && dev != grt->dst.dev)) {
2864 				ip6_rt_put(grt);
2865 				grt = NULL;
2866 			}
2867 		}
2868 	}
2869 
2870 	if (!grt)
2871 		grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
2872 
2873 	if (!grt)
2874 		goto out;
2875 
2876 	if (dev) {
2877 		if (dev != grt->dst.dev) {
2878 			ip6_rt_put(grt);
2879 			goto out;
2880 		}
2881 	} else {
2882 		*_dev = dev = grt->dst.dev;
2883 		*idev = grt->rt6i_idev;
2884 		dev_hold(dev);
2885 		in6_dev_hold(grt->rt6i_idev);
2886 	}
2887 
2888 	if (!(grt->rt6i_flags & RTF_GATEWAY))
2889 		err = 0;
2890 
2891 	ip6_rt_put(grt);
2892 
2893 out:
2894 	return err;
2895 }
2896 
2897 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
2898 			   struct net_device **_dev, struct inet6_dev **idev,
2899 			   struct netlink_ext_ack *extack)
2900 {
2901 	const struct in6_addr *gw_addr = &cfg->fc_gateway;
2902 	int gwa_type = ipv6_addr_type(gw_addr);
2903 	bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
2904 	const struct net_device *dev = *_dev;
2905 	bool need_addr_check = !dev;
2906 	int err = -EINVAL;
2907 
2908 	/* if gw_addr is local we will fail to detect this in case
2909 	 * address is still TENTATIVE (DAD in progress). rt6_lookup()
2910 	 * will return already-added prefix route via interface that
2911 	 * prefix route was assigned to, which might be non-loopback.
2912 	 */
2913 	if (dev &&
2914 	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2915 		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2916 		goto out;
2917 	}
2918 
2919 	if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
2920 		/* IPv6 strictly inhibits using not link-local
2921 		 * addresses as nexthop address.
2922 		 * Otherwise, router will not able to send redirects.
2923 		 * It is very good, but in some (rare!) circumstances
2924 		 * (SIT, PtP, NBMA NOARP links) it is handy to allow
2925 		 * some exceptions. --ANK
2926 		 * We allow IPv4-mapped nexthops to support RFC4798-type
2927 		 * addressing
2928 		 */
2929 		if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
2930 			NL_SET_ERR_MSG(extack, "Invalid gateway address");
2931 			goto out;
2932 		}
2933 
2934 		if (cfg->fc_flags & RTNH_F_ONLINK)
2935 			err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
2936 		else
2937 			err = ip6_route_check_nh(net, cfg, _dev, idev);
2938 
2939 		if (err)
2940 			goto out;
2941 	}
2942 
2943 	/* reload in case device was changed */
2944 	dev = *_dev;
2945 
2946 	err = -EINVAL;
2947 	if (!dev) {
2948 		NL_SET_ERR_MSG(extack, "Egress device not specified");
2949 		goto out;
2950 	} else if (dev->flags & IFF_LOOPBACK) {
2951 		NL_SET_ERR_MSG(extack,
2952 			       "Egress device can not be loopback device for this route");
2953 		goto out;
2954 	}
2955 
2956 	/* if we did not check gw_addr above, do so now that the
2957 	 * egress device has been resolved.
2958 	 */
2959 	if (need_addr_check &&
2960 	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2961 		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2962 		goto out;
2963 	}
2964 
2965 	err = 0;
2966 out:
2967 	return err;
2968 }
2969 
2970 static bool fib6_is_reject(u32 flags, struct net_device *dev, int addr_type)
2971 {
2972 	if ((flags & RTF_REJECT) ||
2973 	    (dev && (dev->flags & IFF_LOOPBACK) &&
2974 	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
2975 	     !(flags & RTF_LOCAL)))
2976 		return true;
2977 
2978 	return false;
2979 }
2980 
2981 int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
2982 		 struct fib6_config *cfg, gfp_t gfp_flags,
2983 		 struct netlink_ext_ack *extack)
2984 {
2985 	struct net_device *dev = NULL;
2986 	struct inet6_dev *idev = NULL;
2987 	int addr_type;
2988 	int err;
2989 
2990 	fib6_nh->fib_nh_family = AF_INET6;
2991 
2992 	err = -ENODEV;
2993 	if (cfg->fc_ifindex) {
2994 		dev = dev_get_by_index(net, cfg->fc_ifindex);
2995 		if (!dev)
2996 			goto out;
2997 		idev = in6_dev_get(dev);
2998 		if (!idev)
2999 			goto out;
3000 	}
3001 
3002 	if (cfg->fc_flags & RTNH_F_ONLINK) {
3003 		if (!dev) {
3004 			NL_SET_ERR_MSG(extack,
3005 				       "Nexthop device required for onlink");
3006 			goto out;
3007 		}
3008 
3009 		if (!(dev->flags & IFF_UP)) {
3010 			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3011 			err = -ENETDOWN;
3012 			goto out;
3013 		}
3014 
3015 		fib6_nh->fib_nh_flags |= RTNH_F_ONLINK;
3016 	}
3017 
3018 	fib6_nh->fib_nh_weight = 1;
3019 
3020 	/* We cannot add true routes via loopback here,
3021 	 * they would result in kernel looping; promote them to reject routes
3022 	 */
3023 	addr_type = ipv6_addr_type(&cfg->fc_dst);
3024 	if (fib6_is_reject(cfg->fc_flags, dev, addr_type)) {
3025 		/* hold loopback dev/idev if we haven't done so. */
3026 		if (dev != net->loopback_dev) {
3027 			if (dev) {
3028 				dev_put(dev);
3029 				in6_dev_put(idev);
3030 			}
3031 			dev = net->loopback_dev;
3032 			dev_hold(dev);
3033 			idev = in6_dev_get(dev);
3034 			if (!idev) {
3035 				err = -ENODEV;
3036 				goto out;
3037 			}
3038 		}
3039 		goto set_dev;
3040 	}
3041 
3042 	if (cfg->fc_flags & RTF_GATEWAY) {
3043 		err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
3044 		if (err)
3045 			goto out;
3046 
3047 		fib6_nh->fib_nh_gw6 = cfg->fc_gateway;
3048 		fib6_nh->fib_nh_gw_family = AF_INET6;
3049 	}
3050 
3051 	err = -ENODEV;
3052 	if (!dev)
3053 		goto out;
3054 
3055 	if (idev->cnf.disable_ipv6) {
3056 		NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
3057 		err = -EACCES;
3058 		goto out;
3059 	}
3060 
3061 	if (!(dev->flags & IFF_UP) && !cfg->fc_ignore_dev_down) {
3062 		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3063 		err = -ENETDOWN;
3064 		goto out;
3065 	}
3066 
3067 	if (!(cfg->fc_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
3068 	    !netif_carrier_ok(dev))
3069 		fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;
3070 
3071 	err = fib_nh_common_init(&fib6_nh->nh_common, cfg->fc_encap,
3072 				 cfg->fc_encap_type, cfg, gfp_flags, extack);
3073 	if (err)
3074 		goto out;
3075 set_dev:
3076 	fib6_nh->fib_nh_dev = dev;
3077 	fib6_nh->fib_nh_oif = dev->ifindex;
3078 	err = 0;
3079 out:
3080 	if (idev)
3081 		in6_dev_put(idev);
3082 
3083 	if (err) {
3084 		lwtstate_put(fib6_nh->fib_nh_lws);
3085 		fib6_nh->fib_nh_lws = NULL;
3086 		if (dev)
3087 			dev_put(dev);
3088 	}
3089 
3090 	return err;
3091 }
3092 
3093 void fib6_nh_release(struct fib6_nh *fib6_nh)
3094 {
3095 	fib_nh_common_release(&fib6_nh->nh_common);
3096 }
3097 
3098 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
3099 					      gfp_t gfp_flags,
3100 					      struct netlink_ext_ack *extack)
3101 {
3102 	struct net *net = cfg->fc_nlinfo.nl_net;
3103 	struct fib6_info *rt = NULL;
3104 	struct fib6_table *table;
3105 	int err = -EINVAL;
3106 	int addr_type;
3107 
3108 	/* RTF_PCPU is an internal flag; can not be set by userspace */
3109 	if (cfg->fc_flags & RTF_PCPU) {
3110 		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
3111 		goto out;
3112 	}
3113 
3114 	/* RTF_CACHE is an internal flag; can not be set by userspace */
3115 	if (cfg->fc_flags & RTF_CACHE) {
3116 		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
3117 		goto out;
3118 	}
3119 
3120 	if (cfg->fc_type > RTN_MAX) {
3121 		NL_SET_ERR_MSG(extack, "Invalid route type");
3122 		goto out;
3123 	}
3124 
3125 	if (cfg->fc_dst_len > 128) {
3126 		NL_SET_ERR_MSG(extack, "Invalid prefix length");
3127 		goto out;
3128 	}
3129 	if (cfg->fc_src_len > 128) {
3130 		NL_SET_ERR_MSG(extack, "Invalid source address length");
3131 		goto out;
3132 	}
3133 #ifndef CONFIG_IPV6_SUBTREES
3134 	if (cfg->fc_src_len) {
3135 		NL_SET_ERR_MSG(extack,
3136 			       "Specifying source address requires IPV6_SUBTREES to be enabled");
3137 		goto out;
3138 	}
3139 #endif
3140 
3141 	err = -ENOBUFS;
3142 	if (cfg->fc_nlinfo.nlh &&
3143 	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
3144 		table = fib6_get_table(net, cfg->fc_table);
3145 		if (!table) {
3146 			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
3147 			table = fib6_new_table(net, cfg->fc_table);
3148 		}
3149 	} else {
3150 		table = fib6_new_table(net, cfg->fc_table);
3151 	}
3152 
3153 	if (!table)
3154 		goto out;
3155 
3156 	err = -ENOMEM;
3157 	rt = fib6_info_alloc(gfp_flags);
3158 	if (!rt)
3159 		goto out;
3160 
3161 	rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len,
3162 					       extack);
3163 	if (IS_ERR(rt->fib6_metrics)) {
3164 		err = PTR_ERR(rt->fib6_metrics);
3165 		/* Do not leave garbage there. */
3166 		rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
3167 		goto out;
3168 	}
3169 
3170 	if (cfg->fc_flags & RTF_ADDRCONF)
3171 		rt->dst_nocount = true;
3172 
3173 	if (cfg->fc_flags & RTF_EXPIRES)
3174 		fib6_set_expires(rt, jiffies +
3175 				clock_t_to_jiffies(cfg->fc_expires));
3176 	else
3177 		fib6_clean_expires(rt);
3178 
3179 	if (cfg->fc_protocol == RTPROT_UNSPEC)
3180 		cfg->fc_protocol = RTPROT_BOOT;
3181 	rt->fib6_protocol = cfg->fc_protocol;
3182 
3183 	rt->fib6_table = table;
3184 	rt->fib6_metric = cfg->fc_metric;
3185 	rt->fib6_type = cfg->fc_type;
3186 	rt->fib6_flags = cfg->fc_flags & ~RTF_GATEWAY;
3187 
3188 	ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
3189 	rt->fib6_dst.plen = cfg->fc_dst_len;
3190 	if (rt->fib6_dst.plen == 128)
3191 		rt->dst_host = true;
3192 
3193 #ifdef CONFIG_IPV6_SUBTREES
3194 	ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
3195 	rt->fib6_src.plen = cfg->fc_src_len;
3196 #endif
3197 	err = fib6_nh_init(net, &rt->fib6_nh, cfg, gfp_flags, extack);
3198 	if (err)
3199 		goto out;
3200 
3201 	/* We cannot add true routes via loopback here,
3202 	 * they would result in kernel looping; promote them to reject routes
3203 	 */
3204 	addr_type = ipv6_addr_type(&cfg->fc_dst);
3205 	if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh.fib_nh_dev, addr_type))
3206 		rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP;
3207 
3208 	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
3209 		struct net_device *dev = fib6_info_nh_dev(rt);
3210 
3211 		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
3212 			NL_SET_ERR_MSG(extack, "Invalid source address");
3213 			err = -EINVAL;
3214 			goto out;
3215 		}
3216 		rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
3217 		rt->fib6_prefsrc.plen = 128;
3218 	} else
3219 		rt->fib6_prefsrc.plen = 0;
3220 
3221 	return rt;
3222 out:
3223 	fib6_info_release(rt);
3224 	return ERR_PTR(err);
3225 }
3226 
3227 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3228 		  struct netlink_ext_ack *extack)
3229 {
3230 	struct fib6_info *rt;
3231 	int err;
3232 
3233 	rt = ip6_route_info_create(cfg, gfp_flags, extack);
3234 	if (IS_ERR(rt))
3235 		return PTR_ERR(rt);
3236 
3237 	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3238 	fib6_info_release(rt);
3239 
3240 	return err;
3241 }
3242 
3243 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3244 {
3245 	struct net *net = info->nl_net;
3246 	struct fib6_table *table;
3247 	int err;
3248 
3249 	if (rt == net->ipv6.fib6_null_entry) {
3250 		err = -ENOENT;
3251 		goto out;
3252 	}
3253 
3254 	table = rt->fib6_table;
3255 	spin_lock_bh(&table->tb6_lock);
3256 	err = fib6_del(rt, info);
3257 	spin_unlock_bh(&table->tb6_lock);
3258 
3259 out:
3260 	fib6_info_release(rt);
3261 	return err;
3262 }
3263 
3264 int ip6_del_rt(struct net *net, struct fib6_info *rt)
3265 {
3266 	struct nl_info info = { .nl_net = net };
3267 
3268 	return __ip6_del_rt(rt, &info);
3269 }
3270 
3271 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
3272 {
3273 	struct nl_info *info = &cfg->fc_nlinfo;
3274 	struct net *net = info->nl_net;
3275 	struct sk_buff *skb = NULL;
3276 	struct fib6_table *table;
3277 	int err = -ENOENT;
3278 
3279 	if (rt == net->ipv6.fib6_null_entry)
3280 		goto out_put;
3281 	table = rt->fib6_table;
3282 	spin_lock_bh(&table->tb6_lock);
3283 
3284 	if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
3285 		struct fib6_info *sibling, *next_sibling;
3286 
3287 		/* prefer to send a single notification with all hops */
3288 		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3289 		if (skb) {
3290 			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3291 
3292 			if (rt6_fill_node(net, skb, rt, NULL,
3293 					  NULL, NULL, 0, RTM_DELROUTE,
3294 					  info->portid, seq, 0) < 0) {
3295 				kfree_skb(skb);
3296 				skb = NULL;
3297 			} else
3298 				info->skip_notify = 1;
3299 		}
3300 
3301 		list_for_each_entry_safe(sibling, next_sibling,
3302 					 &rt->fib6_siblings,
3303 					 fib6_siblings) {
3304 			err = fib6_del(sibling, info);
3305 			if (err)
3306 				goto out_unlock;
3307 		}
3308 	}
3309 
3310 	err = fib6_del(rt, info);
3311 out_unlock:
3312 	spin_unlock_bh(&table->tb6_lock);
3313 out_put:
3314 	fib6_info_release(rt);
3315 
3316 	if (skb) {
3317 		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3318 			    info->nlh, gfp_any());
3319 	}
3320 	return err;
3321 }
3322 
3323 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3324 {
3325 	int rc = -ESRCH;
3326 
3327 	if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3328 		goto out;
3329 
3330 	if (cfg->fc_flags & RTF_GATEWAY &&
3331 	    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3332 		goto out;
3333 
3334 	rc = rt6_remove_exception_rt(rt);
3335 out:
3336 	return rc;
3337 }
3338 
3339 static int ip6_route_del(struct fib6_config *cfg,
3340 			 struct netlink_ext_ack *extack)
3341 {
3342 	struct rt6_info *rt_cache;
3343 	struct fib6_table *table;
3344 	struct fib6_info *rt;
3345 	struct fib6_node *fn;
3346 	int err = -ESRCH;
3347 
3348 	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3349 	if (!table) {
3350 		NL_SET_ERR_MSG(extack, "FIB table does not exist");
3351 		return err;
3352 	}
3353 
3354 	rcu_read_lock();
3355 
3356 	fn = fib6_locate(&table->tb6_root,
3357 			 &cfg->fc_dst, cfg->fc_dst_len,
3358 			 &cfg->fc_src, cfg->fc_src_len,
3359 			 !(cfg->fc_flags & RTF_CACHE));
3360 
3361 	if (fn) {
3362 		for_each_fib6_node_rt_rcu(fn) {
3363 			struct fib6_nh *nh;
3364 
3365 			if (cfg->fc_flags & RTF_CACHE) {
3366 				struct fib6_result res = {
3367 					.f6i = rt,
3368 				};
3369 				int rc;
3370 
3371 				rt_cache = rt6_find_cached_rt(&res,
3372 							      &cfg->fc_dst,
3373 							      &cfg->fc_src);
3374 				if (rt_cache) {
3375 					rc = ip6_del_cached_rt(rt_cache, cfg);
3376 					if (rc != -ESRCH) {
3377 						rcu_read_unlock();
3378 						return rc;
3379 					}
3380 				}
3381 				continue;
3382 			}
3383 
3384 			nh = &rt->fib6_nh;
3385 			if (cfg->fc_ifindex &&
3386 			    (!nh->fib_nh_dev ||
3387 			     nh->fib_nh_dev->ifindex != cfg->fc_ifindex))
3388 				continue;
3389 			if (cfg->fc_flags & RTF_GATEWAY &&
3390 			    !ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6))
3391 				continue;
3392 			if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
3393 				continue;
3394 			if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
3395 				continue;
3396 			if (!fib6_info_hold_safe(rt))
3397 				continue;
3398 			rcu_read_unlock();
3399 
3400 			/* if gateway was specified only delete the one hop */
3401 			if (cfg->fc_flags & RTF_GATEWAY)
3402 				return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3403 
3404 			return __ip6_del_rt_siblings(rt, cfg);
3405 		}
3406 	}
3407 	rcu_read_unlock();
3408 
3409 	return err;
3410 }
3411 
3412 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3413 {
3414 	struct netevent_redirect netevent;
3415 	struct rt6_info *rt, *nrt = NULL;
3416 	struct fib6_result res = {};
3417 	struct ndisc_options ndopts;
3418 	struct inet6_dev *in6_dev;
3419 	struct neighbour *neigh;
3420 	struct rd_msg *msg;
3421 	int optlen, on_link;
3422 	u8 *lladdr;
3423 
3424 	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3425 	optlen -= sizeof(*msg);
3426 
3427 	if (optlen < 0) {
3428 		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3429 		return;
3430 	}
3431 
3432 	msg = (struct rd_msg *)icmp6_hdr(skb);
3433 
3434 	if (ipv6_addr_is_multicast(&msg->dest)) {
3435 		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3436 		return;
3437 	}
3438 
3439 	on_link = 0;
3440 	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3441 		on_link = 1;
3442 	} else if (ipv6_addr_type(&msg->target) !=
3443 		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3444 		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3445 		return;
3446 	}
3447 
3448 	in6_dev = __in6_dev_get(skb->dev);
3449 	if (!in6_dev)
3450 		return;
3451 	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3452 		return;
3453 
3454 	/* RFC2461 8.1:
3455 	 *	The IP source address of the Redirect MUST be the same as the current
3456 	 *	first-hop router for the specified ICMP Destination Address.
3457 	 */
3458 
3459 	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3460 		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3461 		return;
3462 	}
3463 
3464 	lladdr = NULL;
3465 	if (ndopts.nd_opts_tgt_lladdr) {
3466 		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3467 					     skb->dev);
3468 		if (!lladdr) {
3469 			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3470 			return;
3471 		}
3472 	}
3473 
3474 	rt = (struct rt6_info *) dst;
3475 	if (rt->rt6i_flags & RTF_REJECT) {
3476 		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3477 		return;
3478 	}
3479 
3480 	/* Redirect received -> path was valid.
3481 	 * Look, redirects are sent only in response to data packets,
3482 	 * so that this nexthop apparently is reachable. --ANK
3483 	 */
3484 	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3485 
3486 	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3487 	if (!neigh)
3488 		return;
3489 
3490 	/*
3491 	 *	We have finally decided to accept it.
3492 	 */
3493 
3494 	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3495 		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
3496 		     NEIGH_UPDATE_F_OVERRIDE|
3497 		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3498 				     NEIGH_UPDATE_F_ISROUTER)),
3499 		     NDISC_REDIRECT, &ndopts);
3500 
3501 	rcu_read_lock();
3502 	res.f6i = rcu_dereference(rt->from);
3503 	if (!res.f6i)
3504 		goto out;
3505 
3506 	res.nh = &res.f6i->fib6_nh;
3507 	res.fib6_flags = res.f6i->fib6_flags;
3508 	res.fib6_type = res.f6i->fib6_type;
3509 	nrt = ip6_rt_cache_alloc(&res, &msg->dest, NULL);
3510 	if (!nrt)
3511 		goto out;
3512 
3513 	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3514 	if (on_link)
3515 		nrt->rt6i_flags &= ~RTF_GATEWAY;
3516 
3517 	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3518 
3519 	/* rt6_insert_exception() will take care of duplicated exceptions */
3520 	if (rt6_insert_exception(nrt, &res)) {
3521 		dst_release_immediate(&nrt->dst);
3522 		goto out;
3523 	}
3524 
3525 	netevent.old = &rt->dst;
3526 	netevent.new = &nrt->dst;
3527 	netevent.daddr = &msg->dest;
3528 	netevent.neigh = neigh;
3529 	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3530 
3531 out:
3532 	rcu_read_unlock();
3533 	neigh_release(neigh);
3534 }
3535 
3536 #ifdef CONFIG_IPV6_ROUTE_INFO
3537 static struct fib6_info *rt6_get_route_info(struct net *net,
3538 					   const struct in6_addr *prefix, int prefixlen,
3539 					   const struct in6_addr *gwaddr,
3540 					   struct net_device *dev)
3541 {
3542 	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3543 	int ifindex = dev->ifindex;
3544 	struct fib6_node *fn;
3545 	struct fib6_info *rt = NULL;
3546 	struct fib6_table *table;
3547 
3548 	table = fib6_get_table(net, tb_id);
3549 	if (!table)
3550 		return NULL;
3551 
3552 	rcu_read_lock();
3553 	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3554 	if (!fn)
3555 		goto out;
3556 
3557 	for_each_fib6_node_rt_rcu(fn) {
3558 		if (rt->fib6_nh.fib_nh_dev->ifindex != ifindex)
3559 			continue;
3560 		if (!(rt->fib6_flags & RTF_ROUTEINFO) ||
3561 		    !rt->fib6_nh.fib_nh_gw_family)
3562 			continue;
3563 		if (!ipv6_addr_equal(&rt->fib6_nh.fib_nh_gw6, gwaddr))
3564 			continue;
3565 		if (!fib6_info_hold_safe(rt))
3566 			continue;
3567 		break;
3568 	}
3569 out:
3570 	rcu_read_unlock();
3571 	return rt;
3572 }
3573 
3574 static struct fib6_info *rt6_add_route_info(struct net *net,
3575 					   const struct in6_addr *prefix, int prefixlen,
3576 					   const struct in6_addr *gwaddr,
3577 					   struct net_device *dev,
3578 					   unsigned int pref)
3579 {
3580 	struct fib6_config cfg = {
3581 		.fc_metric	= IP6_RT_PRIO_USER,
3582 		.fc_ifindex	= dev->ifindex,
3583 		.fc_dst_len	= prefixlen,
3584 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3585 				  RTF_UP | RTF_PREF(pref),
3586 		.fc_protocol = RTPROT_RA,
3587 		.fc_type = RTN_UNICAST,
3588 		.fc_nlinfo.portid = 0,
3589 		.fc_nlinfo.nlh = NULL,
3590 		.fc_nlinfo.nl_net = net,
3591 	};
3592 
3593 	cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3594 	cfg.fc_dst = *prefix;
3595 	cfg.fc_gateway = *gwaddr;
3596 
3597 	/* We should treat it as a default route if prefix length is 0. */
3598 	if (!prefixlen)
3599 		cfg.fc_flags |= RTF_DEFAULT;
3600 
3601 	ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3602 
3603 	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3604 }
3605 #endif
3606 
3607 struct fib6_info *rt6_get_dflt_router(struct net *net,
3608 				     const struct in6_addr *addr,
3609 				     struct net_device *dev)
3610 {
3611 	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3612 	struct fib6_info *rt;
3613 	struct fib6_table *table;
3614 
3615 	table = fib6_get_table(net, tb_id);
3616 	if (!table)
3617 		return NULL;
3618 
3619 	rcu_read_lock();
3620 	for_each_fib6_node_rt_rcu(&table->tb6_root) {
3621 		struct fib6_nh *nh = &rt->fib6_nh;
3622 
3623 		if (dev == nh->fib_nh_dev &&
3624 		    ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3625 		    ipv6_addr_equal(&nh->fib_nh_gw6, addr))
3626 			break;
3627 	}
3628 	if (rt && !fib6_info_hold_safe(rt))
3629 		rt = NULL;
3630 	rcu_read_unlock();
3631 	return rt;
3632 }
3633 
3634 struct fib6_info *rt6_add_dflt_router(struct net *net,
3635 				     const struct in6_addr *gwaddr,
3636 				     struct net_device *dev,
3637 				     unsigned int pref)
3638 {
3639 	struct fib6_config cfg = {
3640 		.fc_table	= l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3641 		.fc_metric	= IP6_RT_PRIO_USER,
3642 		.fc_ifindex	= dev->ifindex,
3643 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3644 				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3645 		.fc_protocol = RTPROT_RA,
3646 		.fc_type = RTN_UNICAST,
3647 		.fc_nlinfo.portid = 0,
3648 		.fc_nlinfo.nlh = NULL,
3649 		.fc_nlinfo.nl_net = net,
3650 	};
3651 
3652 	cfg.fc_gateway = *gwaddr;
3653 
3654 	if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3655 		struct fib6_table *table;
3656 
3657 		table = fib6_get_table(dev_net(dev), cfg.fc_table);
3658 		if (table)
3659 			table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3660 	}
3661 
3662 	return rt6_get_dflt_router(net, gwaddr, dev);
3663 }
3664 
3665 static void __rt6_purge_dflt_routers(struct net *net,
3666 				     struct fib6_table *table)
3667 {
3668 	struct fib6_info *rt;
3669 
3670 restart:
3671 	rcu_read_lock();
3672 	for_each_fib6_node_rt_rcu(&table->tb6_root) {
3673 		struct net_device *dev = fib6_info_nh_dev(rt);
3674 		struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;
3675 
3676 		if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3677 		    (!idev || idev->cnf.accept_ra != 2) &&
3678 		    fib6_info_hold_safe(rt)) {
3679 			rcu_read_unlock();
3680 			ip6_del_rt(net, rt);
3681 			goto restart;
3682 		}
3683 	}
3684 	rcu_read_unlock();
3685 
3686 	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3687 }
3688 
3689 void rt6_purge_dflt_routers(struct net *net)
3690 {
3691 	struct fib6_table *table;
3692 	struct hlist_head *head;
3693 	unsigned int h;
3694 
3695 	rcu_read_lock();
3696 
3697 	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3698 		head = &net->ipv6.fib_table_hash[h];
3699 		hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3700 			if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3701 				__rt6_purge_dflt_routers(net, table);
3702 		}
3703 	}
3704 
3705 	rcu_read_unlock();
3706 }
3707 
3708 static void rtmsg_to_fib6_config(struct net *net,
3709 				 struct in6_rtmsg *rtmsg,
3710 				 struct fib6_config *cfg)
3711 {
3712 	*cfg = (struct fib6_config){
3713 		.fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3714 			 : RT6_TABLE_MAIN,
3715 		.fc_ifindex = rtmsg->rtmsg_ifindex,
3716 		.fc_metric = rtmsg->rtmsg_metric ? : IP6_RT_PRIO_USER,
3717 		.fc_expires = rtmsg->rtmsg_info,
3718 		.fc_dst_len = rtmsg->rtmsg_dst_len,
3719 		.fc_src_len = rtmsg->rtmsg_src_len,
3720 		.fc_flags = rtmsg->rtmsg_flags,
3721 		.fc_type = rtmsg->rtmsg_type,
3722 
3723 		.fc_nlinfo.nl_net = net,
3724 
3725 		.fc_dst = rtmsg->rtmsg_dst,
3726 		.fc_src = rtmsg->rtmsg_src,
3727 		.fc_gateway = rtmsg->rtmsg_gateway,
3728 	};
3729 }
3730 
3731 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3732 {
3733 	struct fib6_config cfg;
3734 	struct in6_rtmsg rtmsg;
3735 	int err;
3736 
3737 	switch (cmd) {
3738 	case SIOCADDRT:		/* Add a route */
3739 	case SIOCDELRT:		/* Delete a route */
3740 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3741 			return -EPERM;
3742 		err = copy_from_user(&rtmsg, arg,
3743 				     sizeof(struct in6_rtmsg));
3744 		if (err)
3745 			return -EFAULT;
3746 
3747 		rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3748 
3749 		rtnl_lock();
3750 		switch (cmd) {
3751 		case SIOCADDRT:
3752 			err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3753 			break;
3754 		case SIOCDELRT:
3755 			err = ip6_route_del(&cfg, NULL);
3756 			break;
3757 		default:
3758 			err = -EINVAL;
3759 		}
3760 		rtnl_unlock();
3761 
3762 		return err;
3763 	}
3764 
3765 	return -EINVAL;
3766 }
3767 
3768 /*
3769  *	Drop the packet on the floor
3770  */
3771 
3772 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3773 {
3774 	struct dst_entry *dst = skb_dst(skb);
3775 	struct net *net = dev_net(dst->dev);
3776 	struct inet6_dev *idev;
3777 	int type;
3778 
3779 	if (netif_is_l3_master(skb->dev) &&
3780 	    dst->dev == net->loopback_dev)
3781 		idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
3782 	else
3783 		idev = ip6_dst_idev(dst);
3784 
3785 	switch (ipstats_mib_noroutes) {
3786 	case IPSTATS_MIB_INNOROUTES:
3787 		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3788 		if (type == IPV6_ADDR_ANY) {
3789 			IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
3790 			break;
3791 		}
3792 		/* FALLTHROUGH */
3793 	case IPSTATS_MIB_OUTNOROUTES:
3794 		IP6_INC_STATS(net, idev, ipstats_mib_noroutes);
3795 		break;
3796 	}
3797 
3798 	/* Start over by dropping the dst for l3mdev case */
3799 	if (netif_is_l3_master(skb->dev))
3800 		skb_dst_drop(skb);
3801 
3802 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3803 	kfree_skb(skb);
3804 	return 0;
3805 }
3806 
3807 static int ip6_pkt_discard(struct sk_buff *skb)
3808 {
3809 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3810 }
3811 
3812 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3813 {
3814 	skb->dev = skb_dst(skb)->dev;
3815 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3816 }
3817 
3818 static int ip6_pkt_prohibit(struct sk_buff *skb)
3819 {
3820 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3821 }
3822 
3823 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3824 {
3825 	skb->dev = skb_dst(skb)->dev;
3826 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3827 }
3828 
3829 /*
3830  *	Allocate a dst for local (unicast / anycast) address.
3831  */
3832 
3833 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3834 				     struct inet6_dev *idev,
3835 				     const struct in6_addr *addr,
3836 				     bool anycast, gfp_t gfp_flags)
3837 {
3838 	struct fib6_config cfg = {
3839 		.fc_table = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL,
3840 		.fc_ifindex = idev->dev->ifindex,
3841 		.fc_flags = RTF_UP | RTF_ADDRCONF | RTF_NONEXTHOP,
3842 		.fc_dst = *addr,
3843 		.fc_dst_len = 128,
3844 		.fc_protocol = RTPROT_KERNEL,
3845 		.fc_nlinfo.nl_net = net,
3846 		.fc_ignore_dev_down = true,
3847 	};
3848 
3849 	if (anycast) {
3850 		cfg.fc_type = RTN_ANYCAST;
3851 		cfg.fc_flags |= RTF_ANYCAST;
3852 	} else {
3853 		cfg.fc_type = RTN_LOCAL;
3854 		cfg.fc_flags |= RTF_LOCAL;
3855 	}
3856 
3857 	return ip6_route_info_create(&cfg, gfp_flags, NULL);
3858 }
3859 
3860 /* Remove a deleted IP address from prefsrc entries. */
3861 struct arg_dev_net_ip {
3862 	struct net_device *dev;
3863 	struct net *net;
3864 	struct in6_addr *addr;
3865 };
3866 
3867 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3868 {
3869 	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3870 	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3871 	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3872 
3873 	if (((void *)rt->fib6_nh.fib_nh_dev == dev || !dev) &&
3874 	    rt != net->ipv6.fib6_null_entry &&
3875 	    ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3876 		spin_lock_bh(&rt6_exception_lock);
3877 		/* remove prefsrc entry */
3878 		rt->fib6_prefsrc.plen = 0;
3879 		spin_unlock_bh(&rt6_exception_lock);
3880 	}
3881 	return 0;
3882 }
3883 
3884 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3885 {
3886 	struct net *net = dev_net(ifp->idev->dev);
3887 	struct arg_dev_net_ip adni = {
3888 		.dev = ifp->idev->dev,
3889 		.net = net,
3890 		.addr = &ifp->addr,
3891 	};
3892 	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3893 }
3894 
3895 #define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT)
3896 
3897 /* Remove routers and update dst entries when a gateway turns into a host. */
3898 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3899 {
3900 	struct in6_addr *gateway = (struct in6_addr *)arg;
3901 
3902 	if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3903 	    rt->fib6_nh.fib_nh_gw_family &&
3904 	    ipv6_addr_equal(gateway, &rt->fib6_nh.fib_nh_gw6)) {
3905 		return -1;
3906 	}
3907 
3908 	/* Further clean up cached routes in the exception table.
3909 	 * This is needed because a cached route may have a different
3910 	 * gateway than its 'parent' in the case of an IP redirect.
3911 	 */
3912 	rt6_exceptions_clean_tohost(rt, gateway);
3913 
3914 	return 0;
3915 }
3916 
3917 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3918 {
3919 	fib6_clean_all(net, fib6_clean_tohost, gateway);
3920 }
3921 
3922 struct arg_netdev_event {
3923 	const struct net_device *dev;
3924 	union {
3925 		unsigned char nh_flags;
3926 		unsigned long event;
3927 	};
3928 };
3929 
3930 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
3931 {
3932 	struct fib6_info *iter;
3933 	struct fib6_node *fn;
3934 
3935 	fn = rcu_dereference_protected(rt->fib6_node,
3936 			lockdep_is_held(&rt->fib6_table->tb6_lock));
3937 	iter = rcu_dereference_protected(fn->leaf,
3938 			lockdep_is_held(&rt->fib6_table->tb6_lock));
3939 	while (iter) {
3940 		if (iter->fib6_metric == rt->fib6_metric &&
3941 		    rt6_qualify_for_ecmp(iter))
3942 			return iter;
3943 		iter = rcu_dereference_protected(iter->fib6_next,
3944 				lockdep_is_held(&rt->fib6_table->tb6_lock));
3945 	}
3946 
3947 	return NULL;
3948 }
3949 
3950 static bool rt6_is_dead(const struct fib6_info *rt)
3951 {
3952 	if (rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD ||
3953 	    (rt->fib6_nh.fib_nh_flags & RTNH_F_LINKDOWN &&
3954 	     ip6_ignore_linkdown(rt->fib6_nh.fib_nh_dev)))
3955 		return true;
3956 
3957 	return false;
3958 }
3959 
3960 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3961 {
3962 	struct fib6_info *iter;
3963 	int total = 0;
3964 
3965 	if (!rt6_is_dead(rt))
3966 		total += rt->fib6_nh.fib_nh_weight;
3967 
3968 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3969 		if (!rt6_is_dead(iter))
3970 			total += iter->fib6_nh.fib_nh_weight;
3971 	}
3972 
3973 	return total;
3974 }
3975 
3976 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
3977 {
3978 	int upper_bound = -1;
3979 
3980 	if (!rt6_is_dead(rt)) {
3981 		*weight += rt->fib6_nh.fib_nh_weight;
3982 		upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3983 						    total) - 1;
3984 	}
3985 	atomic_set(&rt->fib6_nh.fib_nh_upper_bound, upper_bound);
3986 }
3987 
3988 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3989 {
3990 	struct fib6_info *iter;
3991 	int weight = 0;
3992 
3993 	rt6_upper_bound_set(rt, &weight, total);
3994 
3995 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3996 		rt6_upper_bound_set(iter, &weight, total);
3997 }
3998 
3999 void rt6_multipath_rebalance(struct fib6_info *rt)
4000 {
4001 	struct fib6_info *first;
4002 	int total;
4003 
4004 	/* In case the entire multipath route was marked for flushing,
4005 	 * then there is no need to rebalance upon the removal of every
4006 	 * sibling route.
4007 	 */
4008 	if (!rt->fib6_nsiblings || rt->should_flush)
4009 		return;
4010 
4011 	/* During lookup routes are evaluated in order, so we need to
4012 	 * make sure upper bounds are assigned from the first sibling
4013 	 * onwards.
4014 	 */
4015 	first = rt6_multipath_first_sibling(rt);
4016 	if (WARN_ON_ONCE(!first))
4017 		return;
4018 
4019 	total = rt6_multipath_total_weight(first);
4020 	rt6_multipath_upper_bound_set(first, total);
4021 }
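/* A worked sketch of the weighting math above (not part of this kernel
 * file): rt6_upper_bound_set() turns cumulative nexthop weights into
 * thresholds in the 31-bit multipath hash space, and a lookup selects
 * the first nexthop whose threshold is >= the flow hash. Dead nexthops
 * keep a bound of -1 and can never match. The weights and the hash
 * value below are illustrative.
 */
#include <stdint.h>
#include <stdio.h>

/* Same rounding as the kernel's DIV_ROUND_CLOSEST_ULL() for positive args. */
static uint64_t div_round_closest_ull(uint64_t x, uint64_t d)
{
	return (x + d / 2) / d;
}

int main(void)
{
	int weights[3] = { 1, 2, 3 };	/* example fib_nh_weight values */
	int total = 6;			/* as rt6_multipath_total_weight() */
	int32_t hash = 0x2aaaaaaa;	/* example 31-bit flow hash */
	int cumulative = 0;
	int i;

	for (i = 0; i < 3; i++) {
		int32_t bound;

		cumulative += weights[i];
		bound = (int32_t)div_round_closest_ull(
				(uint64_t)cumulative << 31, total) - 1;
		printf("nexthop %d: upper_bound %d\n", i, bound);
		if (hash <= bound) {
			/* with these values, nexthop 1 is selected */
			printf("hash 0x%x -> nexthop %d\n", hash, i);
			break;
		}
	}
	return 0;
}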
4022 
4023 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
4024 {
4025 	const struct arg_netdev_event *arg = p_arg;
4026 	struct net *net = dev_net(arg->dev);
4027 
4028 	if (rt != net->ipv6.fib6_null_entry &&
4029 	    rt->fib6_nh.fib_nh_dev == arg->dev) {
4030 		rt->fib6_nh.fib_nh_flags &= ~arg->nh_flags;
4031 		fib6_update_sernum_upto_root(net, rt);
4032 		rt6_multipath_rebalance(rt);
4033 	}
4034 
4035 	return 0;
4036 }
4037 
4038 void rt6_sync_up(struct net_device *dev, unsigned char nh_flags)
4039 {
4040 	struct arg_netdev_event arg = {
4041 		.dev = dev,
4042 		{
4043 			.nh_flags = nh_flags,
4044 		},
4045 	};
4046 
4047 	if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
4048 		arg.nh_flags |= RTNH_F_LINKDOWN;
4049 
4050 	fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
4051 }
4052 
4053 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
4054 				   const struct net_device *dev)
4055 {
4056 	struct fib6_info *iter;
4057 
4058 	if (rt->fib6_nh.fib_nh_dev == dev)
4059 		return true;
4060 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4061 		if (iter->fib6_nh.fib_nh_dev == dev)
4062 			return true;
4063 
4064 	return false;
4065 }
4066 
4067 static void rt6_multipath_flush(struct fib6_info *rt)
4068 {
4069 	struct fib6_info *iter;
4070 
4071 	rt->should_flush = 1;
4072 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4073 		iter->should_flush = 1;
4074 }
4075 
4076 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
4077 					     const struct net_device *down_dev)
4078 {
4079 	struct fib6_info *iter;
4080 	unsigned int dead = 0;
4081 
4082 	if (rt->fib6_nh.fib_nh_dev == down_dev ||
4083 	    rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
4084 		dead++;
4085 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4086 		if (iter->fib6_nh.fib_nh_dev == down_dev ||
4087 		    iter->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
4088 			dead++;
4089 
4090 	return dead;
4091 }
4092 
4093 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
4094 				       const struct net_device *dev,
4095 				       unsigned char nh_flags)
4096 {
4097 	struct fib6_info *iter;
4098 
4099 	if (rt->fib6_nh.fib_nh_dev == dev)
4100 		rt->fib6_nh.fib_nh_flags |= nh_flags;
4101 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4102 		if (iter->fib6_nh.fib_nh_dev == dev)
4103 			iter->fib6_nh.fib_nh_flags |= nh_flags;
4104 }
4105 
4106 /* called with write lock held for table with rt */
4107 static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
4108 {
4109 	const struct arg_netdev_event *arg = p_arg;
4110 	const struct net_device *dev = arg->dev;
4111 	struct net *net = dev_net(dev);
4112 
4113 	if (rt == net->ipv6.fib6_null_entry)
4114 		return 0;
4115 
4116 	switch (arg->event) {
4117 	case NETDEV_UNREGISTER:
4118 		return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0;
4119 	case NETDEV_DOWN:
4120 		if (rt->should_flush)
4121 			return -1;
4122 		if (!rt->fib6_nsiblings)
4123 			return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0;
4124 		if (rt6_multipath_uses_dev(rt, dev)) {
4125 			unsigned int count;
4126 
4127 			count = rt6_multipath_dead_count(rt, dev);
4128 			if (rt->fib6_nsiblings + 1 == count) {
4129 				rt6_multipath_flush(rt);
4130 				return -1;
4131 			}
4132 			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
4133 						   RTNH_F_LINKDOWN);
4134 			fib6_update_sernum(net, rt);
4135 			rt6_multipath_rebalance(rt);
4136 		}
4137 		return -2;
4138 	case NETDEV_CHANGE:
4139 		if (rt->fib6_nh.fib_nh_dev != dev ||
4140 		    rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
4141 			break;
4142 		rt->fib6_nh.fib_nh_flags |= RTNH_F_LINKDOWN;
4143 		rt6_multipath_rebalance(rt);
4144 		break;
4145 	}
4146 
4147 	return 0;
4148 }
4149 
4150 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
4151 {
4152 	struct arg_netdev_event arg = {
4153 		.dev = dev,
4154 		{
4155 			.event = event,
4156 		},
4157 	};
4158 	struct net *net = dev_net(dev);
4159 
4160 	if (net->ipv6.sysctl.skip_notify_on_dev_down)
4161 		fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
4162 	else
4163 		fib6_clean_all(net, fib6_ifdown, &arg);
4164 }
4165 
4166 void rt6_disable_ip(struct net_device *dev, unsigned long event)
4167 {
4168 	rt6_sync_down_dev(dev, event);
4169 	rt6_uncached_list_flush_dev(dev_net(dev), dev);
4170 	neigh_ifdown(&nd_tbl, dev);
4171 }
4172 
4173 struct rt6_mtu_change_arg {
4174 	struct net_device *dev;
4175 	unsigned int mtu;
4176 };
4177 
4178 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
4179 {
4180 	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
4181 	struct inet6_dev *idev;
4182 
4183 	/* In IPv6, PMTU discovery is not optional,
4184 	   so the RTAX_MTU lock cannot disable it.
4185 	   We still use this lock to block changes
4186 	   caused by addrconf/ndisc.
4187 	*/
4188 
4189 	idev = __in6_dev_get(arg->dev);
4190 	if (!idev)
4191 		return 0;
4192 
4193 	/* For an administrative MTU increase, there is no way to discover
4194 	   the IPv6 PMTU increase, so the PMTU should be updated here.
4195 	   Since RFC 1981 doesn't include administrative MTU increases,
4196 	   updating for a PMTU increase is a MUST. (e.g. jumbo frames)
4197 	 */
4198 	if (rt->fib6_nh.fib_nh_dev == arg->dev &&
4199 	    !fib6_metric_locked(rt, RTAX_MTU)) {
4200 		u32 mtu = rt->fib6_pmtu;
4201 
4202 		if (mtu >= arg->mtu ||
4203 		    (mtu < arg->mtu && mtu == idev->cnf.mtu6))
4204 			fib6_metric_set(rt, RTAX_MTU, arg->mtu);
4205 
4206 		spin_lock_bh(&rt6_exception_lock);
4207 		rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
4208 		spin_unlock_bh(&rt6_exception_lock);
4209 	}
4210 	return 0;
4211 }
4212 
4213 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4214 {
4215 	struct rt6_mtu_change_arg arg = {
4216 		.dev = dev,
4217 		.mtu = mtu,
4218 	};
4219 
4220 	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
4221 }
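/* Illustrative userspace sketch (not part of this kernel file) of an
 * administrative MTU change that ends up in rt6_mtu_change() above:
 * setting the MTU with SIOCSIFMTU raises NETDEV_CHANGEMTU, which
 * addrconf's netdev notifier translates into a call to this function.
 * "eth0" and 9000 are example values (e.g. enabling jumbo frames).
 */
#include <net/if.h>		/* struct ifreq, IFNAMSIZ */
#include <string.h>
#include <sys/ioctl.h>		/* SIOCSIFMTU */
#include <sys/socket.h>
#include <unistd.h>

int set_mtu_example(void)
{
	struct ifreq ifr;
	int fd = socket(AF_INET6, SOCK_DGRAM, 0);

	if (fd < 0)
		return -1;

	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);
	ifr.ifr_mtu = 9000;

	if (ioctl(fd, SIOCSIFMTU, &ifr) < 0) {
		close(fd);
		return -1;
	}
	close(fd);
	return 0;
}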
4222 
4223 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
4224 	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
4225 	[RTA_PREFSRC]		= { .len = sizeof(struct in6_addr) },
4226 	[RTA_OIF]               = { .type = NLA_U32 },
4227 	[RTA_IIF]		= { .type = NLA_U32 },
4228 	[RTA_PRIORITY]          = { .type = NLA_U32 },
4229 	[RTA_METRICS]           = { .type = NLA_NESTED },
4230 	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
4231 	[RTA_PREF]              = { .type = NLA_U8 },
4232 	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
4233 	[RTA_ENCAP]		= { .type = NLA_NESTED },
4234 	[RTA_EXPIRES]		= { .type = NLA_U32 },
4235 	[RTA_UID]		= { .type = NLA_U32 },
4236 	[RTA_MARK]		= { .type = NLA_U32 },
4237 	[RTA_TABLE]		= { .type = NLA_U32 },
4238 	[RTA_IP_PROTO]		= { .type = NLA_U8 },
4239 	[RTA_SPORT]		= { .type = NLA_U16 },
4240 	[RTA_DPORT]		= { .type = NLA_U16 },
4241 };
4242 
4243 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
4244 			      struct fib6_config *cfg,
4245 			      struct netlink_ext_ack *extack)
4246 {
4247 	struct rtmsg *rtm;
4248 	struct nlattr *tb[RTA_MAX+1];
4249 	unsigned int pref;
4250 	int err;
4251 
4252 	err = nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
4253 				     rtm_ipv6_policy, extack);
4254 	if (err < 0)
4255 		goto errout;
4256 
4257 	err = -EINVAL;
4258 	rtm = nlmsg_data(nlh);
4259 
4260 	*cfg = (struct fib6_config){
4261 		.fc_table = rtm->rtm_table,
4262 		.fc_dst_len = rtm->rtm_dst_len,
4263 		.fc_src_len = rtm->rtm_src_len,
4264 		.fc_flags = RTF_UP,
4265 		.fc_protocol = rtm->rtm_protocol,
4266 		.fc_type = rtm->rtm_type,
4267 
4268 		.fc_nlinfo.portid = NETLINK_CB(skb).portid,
4269 		.fc_nlinfo.nlh = nlh,
4270 		.fc_nlinfo.nl_net = sock_net(skb->sk),
4271 	};
4272 
4273 	if (rtm->rtm_type == RTN_UNREACHABLE ||
4274 	    rtm->rtm_type == RTN_BLACKHOLE ||
4275 	    rtm->rtm_type == RTN_PROHIBIT ||
4276 	    rtm->rtm_type == RTN_THROW)
4277 		cfg->fc_flags |= RTF_REJECT;
4278 
4279 	if (rtm->rtm_type == RTN_LOCAL)
4280 		cfg->fc_flags |= RTF_LOCAL;
4281 
4282 	if (rtm->rtm_flags & RTM_F_CLONED)
4283 		cfg->fc_flags |= RTF_CACHE;
4284 
4285 	cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4286 
4287 	if (tb[RTA_GATEWAY]) {
4288 		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4289 		cfg->fc_flags |= RTF_GATEWAY;
4290 	}
4291 	if (tb[RTA_VIA]) {
4292 		NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute");
4293 		goto errout;
4294 	}
4295 
4296 	if (tb[RTA_DST]) {
4297 		int plen = (rtm->rtm_dst_len + 7) >> 3;
4298 
4299 		if (nla_len(tb[RTA_DST]) < plen)
4300 			goto errout;
4301 
4302 		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
4303 	}
4304 
4305 	if (tb[RTA_SRC]) {
4306 		int plen = (rtm->rtm_src_len + 7) >> 3;
4307 
4308 		if (nla_len(tb[RTA_SRC]) < plen)
4309 			goto errout;
4310 
4311 		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
4312 	}
4313 
4314 	if (tb[RTA_PREFSRC])
4315 		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4316 
4317 	if (tb[RTA_OIF])
4318 		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4319 
4320 	if (tb[RTA_PRIORITY])
4321 		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4322 
4323 	if (tb[RTA_METRICS]) {
4324 		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4325 		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
4326 	}
4327 
4328 	if (tb[RTA_TABLE])
4329 		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4330 
4331 	if (tb[RTA_MULTIPATH]) {
4332 		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4333 		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4334 
4335 		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4336 						     cfg->fc_mp_len, extack);
4337 		if (err < 0)
4338 			goto errout;
4339 	}
4340 
4341 	if (tb[RTA_PREF]) {
4342 		pref = nla_get_u8(tb[RTA_PREF]);
4343 		if (pref != ICMPV6_ROUTER_PREF_LOW &&
4344 		    pref != ICMPV6_ROUTER_PREF_HIGH)
4345 			pref = ICMPV6_ROUTER_PREF_MEDIUM;
4346 		cfg->fc_flags |= RTF_PREF(pref);
4347 	}
4348 
4349 	if (tb[RTA_ENCAP])
4350 		cfg->fc_encap = tb[RTA_ENCAP];
4351 
4352 	if (tb[RTA_ENCAP_TYPE]) {
4353 		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4354 
4355 		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4356 		if (err < 0)
4357 			goto errout;
4358 	}
4359 
4360 	if (tb[RTA_EXPIRES]) {
4361 		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4362 
4363 		if (addrconf_finite_timeout(timeout)) {
4364 			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4365 			cfg->fc_flags |= RTF_EXPIRES;
4366 		}
4367 	}
4368 
4369 	err = 0;
4370 errout:
4371 	return err;
4372 }
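/* Illustrative userspace sketch (not part of this kernel file) of the
 * preferred netlink path into inet6_rtm_newroute()/rtm_to_fib6_config()
 * above: an RTM_NEWROUTE request carrying RTA_DST and RTA_OIF for an
 * example prefix. A production client (iproute2 does this) would also
 * set NLM_F_ACK and read back the acknowledgment.
 */
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <arpa/inet.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int newroute_example(int oif)
{
	struct {
		struct nlmsghdr nlh;
		struct rtmsg rtm;
		char attrs[64];
	} req;
	struct rtattr *rta;
	struct in6_addr dst;
	int fd;

	fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	if (fd < 0)
		return -1;

	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg));
	req.nlh.nlmsg_type = RTM_NEWROUTE;
	req.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
	req.rtm.rtm_family = AF_INET6;
	req.rtm.rtm_dst_len = 64;
	req.rtm.rtm_table = RT_TABLE_MAIN;
	req.rtm.rtm_protocol = RTPROT_STATIC;
	req.rtm.rtm_scope = RT_SCOPE_UNIVERSE;
	req.rtm.rtm_type = RTN_UNICAST;

	/* RTA_DST: 2001:db8:2::/64 (example prefix) */
	inet_pton(AF_INET6, "2001:db8:2::", &dst);
	rta = (struct rtattr *)((char *)&req + NLMSG_ALIGN(req.nlh.nlmsg_len));
	rta->rta_type = RTA_DST;
	rta->rta_len = RTA_LENGTH(sizeof(dst));
	memcpy(RTA_DATA(rta), &dst, sizeof(dst));
	req.nlh.nlmsg_len = NLMSG_ALIGN(req.nlh.nlmsg_len) + rta->rta_len;

	/* RTA_OIF: output device index */
	rta = (struct rtattr *)((char *)&req + NLMSG_ALIGN(req.nlh.nlmsg_len));
	rta->rta_type = RTA_OIF;
	rta->rta_len = RTA_LENGTH(sizeof(int));
	memcpy(RTA_DATA(rta), &oif, sizeof(int));
	req.nlh.nlmsg_len = NLMSG_ALIGN(req.nlh.nlmsg_len) + rta->rta_len;

	if (send(fd, &req, req.nlh.nlmsg_len, 0) < 0) {
		close(fd);
		return -1;
	}
	close(fd);
	return 0;
}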
4373 
4374 struct rt6_nh {
4375 	struct fib6_info *fib6_info;
4376 	struct fib6_config r_cfg;
4377 	struct list_head next;
4378 };
4379 
4380 static int ip6_route_info_append(struct net *net,
4381 				 struct list_head *rt6_nh_list,
4382 				 struct fib6_info *rt,
4383 				 struct fib6_config *r_cfg)
4384 {
4385 	struct rt6_nh *nh;
4386 	int err = -EEXIST;
4387 
4388 	list_for_each_entry(nh, rt6_nh_list, next) {
4389 		/* check if fib6_info already exists */
4390 		if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4391 			return err;
4392 	}
4393 
4394 	nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4395 	if (!nh)
4396 		return -ENOMEM;
4397 	nh->fib6_info = rt;
4398 	memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4399 	list_add_tail(&nh->next, rt6_nh_list);
4400 
4401 	return 0;
4402 }
4403 
4404 static void ip6_route_mpath_notify(struct fib6_info *rt,
4405 				   struct fib6_info *rt_last,
4406 				   struct nl_info *info,
4407 				   __u16 nlflags)
4408 {
4409 	/* If this is an APPEND route, then rt points to the first route
4410 	 * inserted and rt_last points to the last route inserted. Userspace
4411 	 * wants a consistent dump of the route which starts at the first
4412 	 * nexthop. Since sibling routes are always added at the end of
4413 	 * the list, find the first sibling of the last route appended.
4414 	 */
4415 	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4416 		rt = list_first_entry(&rt_last->fib6_siblings,
4417 				      struct fib6_info,
4418 				      fib6_siblings);
4419 	}
4420 
4421 	if (rt)
4422 		inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4423 }
4424 
4425 static int ip6_route_multipath_add(struct fib6_config *cfg,
4426 				   struct netlink_ext_ack *extack)
4427 {
4428 	struct fib6_info *rt_notif = NULL, *rt_last = NULL;
4429 	struct nl_info *info = &cfg->fc_nlinfo;
4430 	struct fib6_config r_cfg;
4431 	struct rtnexthop *rtnh;
4432 	struct fib6_info *rt;
4433 	struct rt6_nh *err_nh;
4434 	struct rt6_nh *nh, *nh_safe;
4435 	__u16 nlflags;
4436 	int remaining;
4437 	int attrlen;
4438 	int err = 1;
4439 	int nhn = 0;
4440 	int replace = (cfg->fc_nlinfo.nlh &&
4441 		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4442 	LIST_HEAD(rt6_nh_list);
4443 
4444 	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4445 	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4446 		nlflags |= NLM_F_APPEND;
4447 
4448 	remaining = cfg->fc_mp_len;
4449 	rtnh = (struct rtnexthop *)cfg->fc_mp;
4450 
4451 	/* Parse a Multipath Entry and build a list (rt6_nh_list) with
4452 	 * one fib6_info struct per nexthop.
4453 	 */
4454 	while (rtnh_ok(rtnh, remaining)) {
4455 		memcpy(&r_cfg, cfg, sizeof(*cfg));
4456 		if (rtnh->rtnh_ifindex)
4457 			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4458 
4459 		attrlen = rtnh_attrlen(rtnh);
4460 		if (attrlen > 0) {
4461 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4462 
4463 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4464 			if (nla) {
4465 				r_cfg.fc_gateway = nla_get_in6_addr(nla);
4466 				r_cfg.fc_flags |= RTF_GATEWAY;
4467 			}
4468 			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4469 			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4470 			if (nla)
4471 				r_cfg.fc_encap_type = nla_get_u16(nla);
4472 		}
4473 
4474 		r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4475 		rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
4476 		if (IS_ERR(rt)) {
4477 			err = PTR_ERR(rt);
4478 			rt = NULL;
4479 			goto cleanup;
4480 		}
4481 		if (!rt6_qualify_for_ecmp(rt)) {
4482 			err = -EINVAL;
4483 			NL_SET_ERR_MSG(extack,
4483 				       "Device-only routes cannot be added for IPv6 using the multipath API.");
4485 			fib6_info_release(rt);
4486 			goto cleanup;
4487 		}
4488 
4489 		rt->fib6_nh.fib_nh_weight = rtnh->rtnh_hops + 1;
4490 
4491 		err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
4492 					    rt, &r_cfg);
4493 		if (err) {
4494 			fib6_info_release(rt);
4495 			goto cleanup;
4496 		}
4497 
4498 		rtnh = rtnh_next(rtnh, &remaining);
4499 	}
4500 
4501 	/* For add and replace, send one notification with all nexthops:
4502 	 * skip the notification in fib6_add_rt2node and send one with
4503 	 * the full route when done.
4504 	 */
4505 	info->skip_notify = 1;
4506 
4507 	err_nh = NULL;
4508 	list_for_each_entry(nh, &rt6_nh_list, next) {
4509 		err = __ip6_ins_rt(nh->fib6_info, info, extack);
4510 		fib6_info_release(nh->fib6_info);
4511 
4512 		if (!err) {
4513 			/* save reference to last route successfully inserted */
4514 			rt_last = nh->fib6_info;
4515 
4516 			/* save reference to first route for notification */
4517 			if (!rt_notif)
4518 				rt_notif = nh->fib6_info;
4519 		}
4520 
4521 		/* nh->fib6_info is used or freed at this point, reset to NULL */
4522 		nh->fib6_info = NULL;
4523 		if (err) {
4524 			if (replace && nhn)
4525 				NL_SET_ERR_MSG_MOD(extack,
4526 						   "multipath route replace failed (check consistency of installed routes)");
4527 			err_nh = nh;
4528 			goto add_errout;
4529 		}
4530 
4531 		/* Because each route is added like a single route, we remove
4532 		 * these flags after the first nexthop. If there is a collision,
4533 		 * we have already failed to add the first nexthop:
4534 		 * fib6_add_rt2node() has rejected it. When replacing, the old
4535 		 * nexthops have been replaced by the first new one; the rest
4536 		 * should be appended to it.
4537 		 */
4538 		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4539 						     NLM_F_REPLACE);
4540 		nhn++;
4541 	}
4542 
4543 	/* success ... tell user about new route */
4544 	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4545 	goto cleanup;
4546 
4547 add_errout:
4548 	/* Send a notification for the routes that were added so that
4549 	 * the delete notifications sent by ip6_route_del are
4550 	 * coherent.
4551 	 */
4552 	if (rt_notif)
4553 		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4554 
4555 	/* Delete routes that were already added */
4556 	list_for_each_entry(nh, &rt6_nh_list, next) {
4557 		if (err_nh == nh)
4558 			break;
4559 		ip6_route_del(&nh->r_cfg, extack);
4560 	}
4561 
4562 cleanup:
4563 	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4564 		if (nh->fib6_info)
4565 			fib6_info_release(nh->fib6_info);
4566 		list_del(&nh->next);
4567 		kfree(nh);
4568 	}
4569 
4570 	return err;
4571 }
4572 
4573 static int ip6_route_multipath_del(struct fib6_config *cfg,
4574 				   struct netlink_ext_ack *extack)
4575 {
4576 	struct fib6_config r_cfg;
4577 	struct rtnexthop *rtnh;
4578 	int remaining;
4579 	int attrlen;
4580 	int err = 1, last_err = 0;
4581 
4582 	remaining = cfg->fc_mp_len;
4583 	rtnh = (struct rtnexthop *)cfg->fc_mp;
4584 
4585 	/* Parse a Multipath Entry */
4586 	while (rtnh_ok(rtnh, remaining)) {
4587 		memcpy(&r_cfg, cfg, sizeof(*cfg));
4588 		if (rtnh->rtnh_ifindex)
4589 			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4590 
4591 		attrlen = rtnh_attrlen(rtnh);
4592 		if (attrlen > 0) {
4593 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4594 
4595 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4596 			if (nla) {
4597 				nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4598 				r_cfg.fc_flags |= RTF_GATEWAY;
4599 			}
4600 		}
4601 		err = ip6_route_del(&r_cfg, extack);
4602 		if (err)
4603 			last_err = err;
4604 
4605 		rtnh = rtnh_next(rtnh, &remaining);
4606 	}
4607 
4608 	return last_err;
4609 }
4610 
4611 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4612 			      struct netlink_ext_ack *extack)
4613 {
4614 	struct fib6_config cfg;
4615 	int err;
4616 
4617 	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4618 	if (err < 0)
4619 		return err;
4620 
4621 	if (cfg.fc_mp)
4622 		return ip6_route_multipath_del(&cfg, extack);
4623 	else {
4624 		cfg.fc_delete_all_nh = 1;
4625 		return ip6_route_del(&cfg, extack);
4626 	}
4627 }
4628 
4629 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4630 			      struct netlink_ext_ack *extack)
4631 {
4632 	struct fib6_config cfg;
4633 	int err;
4634 
4635 	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4636 	if (err < 0)
4637 		return err;
4638 
4639 	if (cfg.fc_metric == 0)
4640 		cfg.fc_metric = IP6_RT_PRIO_USER;
4641 
4642 	if (cfg.fc_mp)
4643 		return ip6_route_multipath_add(&cfg, extack);
4644 	else
4645 		return ip6_route_add(&cfg, GFP_KERNEL, extack);
4646 }
4647 
4648 static size_t rt6_nlmsg_size(struct fib6_info *rt)
4649 {
4650 	int nexthop_len = 0;
4651 
4652 	if (rt->fib6_nsiblings) {
4653 		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
4654 			    + NLA_ALIGN(sizeof(struct rtnexthop))
4655 			    + nla_total_size(16) /* RTA_GATEWAY */
4656 			    + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws);
4657 
4658 		nexthop_len *= rt->fib6_nsiblings;
4659 	}
4660 
4661 	return NLMSG_ALIGN(sizeof(struct rtmsg))
4662 	       + nla_total_size(16) /* RTA_SRC */
4663 	       + nla_total_size(16) /* RTA_DST */
4664 	       + nla_total_size(16) /* RTA_GATEWAY */
4665 	       + nla_total_size(16) /* RTA_PREFSRC */
4666 	       + nla_total_size(4) /* RTA_TABLE */
4667 	       + nla_total_size(4) /* RTA_IIF */
4668 	       + nla_total_size(4) /* RTA_OIF */
4669 	       + nla_total_size(4) /* RTA_PRIORITY */
4670 	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4671 	       + nla_total_size(sizeof(struct rta_cacheinfo))
4672 	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4673 	       + nla_total_size(1) /* RTA_PREF */
4674 	       + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws)
4675 	       + nexthop_len;
4676 }
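/* A worked example of the sizing above, assuming a route with two
 * siblings and no lwtunnel encap: nla_total_size(0) is 4 bytes,
 * NLA_ALIGN(sizeof(struct rtnexthop)) is 8 and nla_total_size(16) is
 * 20, so each RTA_MULTIPATH nexthop accounts for 4 + 8 + 20 = 32 bytes
 * and nexthop_len = 2 * 32 = 64, on top of the fixed per-route
 * attribute budget in the return statement above.
 */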
4677 
4678 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
4679 			 struct fib6_info *rt, struct dst_entry *dst,
4680 			 struct in6_addr *dest, struct in6_addr *src,
4681 			 int iif, int type, u32 portid, u32 seq,
4682 			 unsigned int flags)
4683 {
4684 	struct rt6_info *rt6 = (struct rt6_info *)dst;
4685 	struct rt6key *rt6_dst, *rt6_src;
4686 	u32 *pmetrics, table, rt6_flags;
4687 	struct nlmsghdr *nlh;
4688 	struct rtmsg *rtm;
4689 	long expires = 0;
4690 
4691 	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4692 	if (!nlh)
4693 		return -EMSGSIZE;
4694 
4695 	if (rt6) {
4696 		rt6_dst = &rt6->rt6i_dst;
4697 		rt6_src = &rt6->rt6i_src;
4698 		rt6_flags = rt6->rt6i_flags;
4699 	} else {
4700 		rt6_dst = &rt->fib6_dst;
4701 		rt6_src = &rt->fib6_src;
4702 		rt6_flags = rt->fib6_flags;
4703 	}
4704 
4705 	rtm = nlmsg_data(nlh);
4706 	rtm->rtm_family = AF_INET6;
4707 	rtm->rtm_dst_len = rt6_dst->plen;
4708 	rtm->rtm_src_len = rt6_src->plen;
4709 	rtm->rtm_tos = 0;
4710 	if (rt->fib6_table)
4711 		table = rt->fib6_table->tb6_id;
4712 	else
4713 		table = RT6_TABLE_UNSPEC;
4714 	rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT;
4715 	if (nla_put_u32(skb, RTA_TABLE, table))
4716 		goto nla_put_failure;
4717 
4718 	rtm->rtm_type = rt->fib6_type;
4719 	rtm->rtm_flags = 0;
4720 	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4721 	rtm->rtm_protocol = rt->fib6_protocol;
4722 
4723 	if (rt6_flags & RTF_CACHE)
4724 		rtm->rtm_flags |= RTM_F_CLONED;
4725 
4726 	if (dest) {
4727 		if (nla_put_in6_addr(skb, RTA_DST, dest))
4728 			goto nla_put_failure;
4729 		rtm->rtm_dst_len = 128;
4730 	} else if (rtm->rtm_dst_len)
4731 		if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
4732 			goto nla_put_failure;
4733 #ifdef CONFIG_IPV6_SUBTREES
4734 	if (src) {
4735 		if (nla_put_in6_addr(skb, RTA_SRC, src))
4736 			goto nla_put_failure;
4737 		rtm->rtm_src_len = 128;
4738 	} else if (rtm->rtm_src_len &&
4739 		   nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
4740 		goto nla_put_failure;
4741 #endif
4742 	if (iif) {
4743 #ifdef CONFIG_IPV6_MROUTE
4744 		if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
4745 			int err = ip6mr_get_route(net, skb, rtm, portid);
4746 
4747 			if (err == 0)
4748 				return 0;
4749 			if (err < 0)
4750 				goto nla_put_failure;
4751 		} else
4752 #endif
4753 			if (nla_put_u32(skb, RTA_IIF, iif))
4754 				goto nla_put_failure;
4755 	} else if (dest) {
4756 		struct in6_addr saddr_buf;
4757 		if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
4758 		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4759 			goto nla_put_failure;
4760 	}
4761 
4762 	if (rt->fib6_prefsrc.plen) {
4763 		struct in6_addr saddr_buf;
4764 		saddr_buf = rt->fib6_prefsrc.addr;
4765 		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4766 			goto nla_put_failure;
4767 	}
4768 
4769 	pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
4770 	if (rtnetlink_put_metrics(skb, pmetrics) < 0)
4771 		goto nla_put_failure;
4772 
4773 	if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
4774 		goto nla_put_failure;
4775 
4776 	/* For multipath routes, walk the siblings list and add
4777 	 * each as a nexthop within RTA_MULTIPATH.
4778 	 */
4779 	if (rt6) {
4780 		if (rt6_flags & RTF_GATEWAY &&
4781 		    nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
4782 			goto nla_put_failure;
4783 
4784 		if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
4785 			goto nla_put_failure;
4786 	} else if (rt->fib6_nsiblings) {
4787 		struct fib6_info *sibling, *next_sibling;
4788 		struct nlattr *mp;
4789 
4790 		mp = nla_nest_start_noflag(skb, RTA_MULTIPATH);
4791 		if (!mp)
4792 			goto nla_put_failure;
4793 
4794 		if (fib_add_nexthop(skb, &rt->fib6_nh.nh_common,
4795 				    rt->fib6_nh.fib_nh_weight) < 0)
4796 			goto nla_put_failure;
4797 
4798 		list_for_each_entry_safe(sibling, next_sibling,
4799 					 &rt->fib6_siblings, fib6_siblings) {
4800 			if (fib_add_nexthop(skb, &sibling->fib6_nh.nh_common,
4801 					    sibling->fib6_nh.fib_nh_weight) < 0)
4802 				goto nla_put_failure;
4803 		}
4804 
4805 		nla_nest_end(skb, mp);
4806 	} else {
4807 		unsigned char nh_flags = 0;
4808 
4809 		if (fib_nexthop_info(skb, &rt->fib6_nh.nh_common,
4810 				     &nh_flags, false) < 0)
4811 			goto nla_put_failure;
4812 
4813 		rtm->rtm_flags |= nh_flags;
4814 	}
4815 
4816 	if (rt6_flags & RTF_EXPIRES) {
4817 		expires = dst ? dst->expires : rt->expires;
4818 		expires -= jiffies;
4819 	}
4820 
4821 	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
4822 		goto nla_put_failure;
4823 
4824 	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
4825 		goto nla_put_failure;
4826 
4828 	nlmsg_end(skb, nlh);
4829 	return 0;
4830 
4831 nla_put_failure:
4832 	nlmsg_cancel(skb, nlh);
4833 	return -EMSGSIZE;
4834 }
4835 
4836 static bool fib6_info_uses_dev(const struct fib6_info *f6i,
4837 			       const struct net_device *dev)
4838 {
4839 	if (f6i->fib6_nh.fib_nh_dev == dev)
4840 		return true;
4841 
4842 	if (f6i->fib6_nsiblings) {
4843 		struct fib6_info *sibling, *next_sibling;
4844 
4845 		list_for_each_entry_safe(sibling, next_sibling,
4846 					 &f6i->fib6_siblings, fib6_siblings) {
4847 			if (sibling->fib6_nh.fib_nh_dev == dev)
4848 				return true;
4849 		}
4850 	}
4851 
4852 	return false;
4853 }
4854 
4855 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4856 {
4857 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4858 	struct fib_dump_filter *filter = &arg->filter;
4859 	unsigned int flags = NLM_F_MULTI;
4860 	struct net *net = arg->net;
4861 
4862 	if (rt == net->ipv6.fib6_null_entry)
4863 		return 0;
4864 
4865 	if ((filter->flags & RTM_F_PREFIX) &&
4866 	    !(rt->fib6_flags & RTF_PREFIX_RT)) {
4867 		/* success since this is not a prefix route */
4868 		return 1;
4869 	}
4870 	if (filter->filter_set) {
4871 		if ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
4872 		    (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
4873 		    (filter->protocol && rt->fib6_protocol != filter->protocol)) {
4874 			return 1;
4875 		}
4876 		flags |= NLM_F_DUMP_FILTERED;
4877 	}
4878 
4879 	return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4880 			     RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4881 			     arg->cb->nlh->nlmsg_seq, flags);
4882 }
4883 
4884 static int inet6_rtm_valid_getroute_req(struct sk_buff *skb,
4885 					const struct nlmsghdr *nlh,
4886 					struct nlattr **tb,
4887 					struct netlink_ext_ack *extack)
4888 {
4889 	struct rtmsg *rtm;
4890 	int i, err;
4891 
4892 	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
4893 		NL_SET_ERR_MSG_MOD(extack,
4894 				   "Invalid header for get route request");
4895 		return -EINVAL;
4896 	}
4897 
4898 	if (!netlink_strict_get_check(skb))
4899 		return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
4900 					      rtm_ipv6_policy, extack);
4901 
4902 	rtm = nlmsg_data(nlh);
4903 	if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) ||
4904 	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) ||
4905 	    rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope ||
4906 	    rtm->rtm_type) {
4907 		NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request");
4908 		return -EINVAL;
4909 	}
4910 	if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) {
4911 		NL_SET_ERR_MSG_MOD(extack,
4912 				   "Invalid flags for get route request");
4913 		return -EINVAL;
4914 	}
4915 
4916 	err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
4917 					    rtm_ipv6_policy, extack);
4918 	if (err)
4919 		return err;
4920 
4921 	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
4922 	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
4923 		NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6");
4924 		return -EINVAL;
4925 	}
4926 
4927 	for (i = 0; i <= RTA_MAX; i++) {
4928 		if (!tb[i])
4929 			continue;
4930 
4931 		switch (i) {
4932 		case RTA_SRC:
4933 		case RTA_DST:
4934 		case RTA_IIF:
4935 		case RTA_OIF:
4936 		case RTA_MARK:
4937 		case RTA_UID:
4938 		case RTA_SPORT:
4939 		case RTA_DPORT:
4940 		case RTA_IP_PROTO:
4941 			break;
4942 		default:
4943 			NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request");
4944 			return -EINVAL;
4945 		}
4946 	}
4947 
4948 	return 0;
4949 }
4950 
4951 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4952 			      struct netlink_ext_ack *extack)
4953 {
4954 	struct net *net = sock_net(in_skb->sk);
4955 	struct nlattr *tb[RTA_MAX+1];
4956 	int err, iif = 0, oif = 0;
4957 	struct fib6_info *from;
4958 	struct dst_entry *dst;
4959 	struct rt6_info *rt;
4960 	struct sk_buff *skb;
4961 	struct rtmsg *rtm;
4962 	struct flowi6 fl6 = {};
4963 	bool fibmatch;
4964 
4965 	err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
4966 	if (err < 0)
4967 		goto errout;
4968 
4969 	err = -EINVAL;
4970 	rtm = nlmsg_data(nlh);
4971 	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4972 	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4973 
4974 	if (tb[RTA_SRC]) {
4975 		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4976 			goto errout;
4977 
4978 		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4979 	}
4980 
4981 	if (tb[RTA_DST]) {
4982 		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4983 			goto errout;
4984 
4985 		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4986 	}
4987 
4988 	if (tb[RTA_IIF])
4989 		iif = nla_get_u32(tb[RTA_IIF]);
4990 
4991 	if (tb[RTA_OIF])
4992 		oif = nla_get_u32(tb[RTA_OIF]);
4993 
4994 	if (tb[RTA_MARK])
4995 		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4996 
4997 	if (tb[RTA_UID])
4998 		fl6.flowi6_uid = make_kuid(current_user_ns(),
4999 					   nla_get_u32(tb[RTA_UID]));
5000 	else
5001 		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
5002 
5003 	if (tb[RTA_SPORT])
5004 		fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);
5005 
5006 	if (tb[RTA_DPORT])
5007 		fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);
5008 
5009 	if (tb[RTA_IP_PROTO]) {
5010 		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
5011 						  &fl6.flowi6_proto, AF_INET6,
5012 						  extack);
5013 		if (err)
5014 			goto errout;
5015 	}
5016 
5017 	if (iif) {
5018 		struct net_device *dev;
5019 		int flags = 0;
5020 
5021 		rcu_read_lock();
5022 
5023 		dev = dev_get_by_index_rcu(net, iif);
5024 		if (!dev) {
5025 			rcu_read_unlock();
5026 			err = -ENODEV;
5027 			goto errout;
5028 		}
5029 
5030 		fl6.flowi6_iif = iif;
5031 
5032 		if (!ipv6_addr_any(&fl6.saddr))
5033 			flags |= RT6_LOOKUP_F_HAS_SADDR;
5034 
5035 		dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
5036 
5037 		rcu_read_unlock();
5038 	} else {
5039 		fl6.flowi6_oif = oif;
5040 
5041 		dst = ip6_route_output(net, NULL, &fl6);
5042 	}
5043 
5045 	rt = container_of(dst, struct rt6_info, dst);
5046 	if (rt->dst.error) {
5047 		err = rt->dst.error;
5048 		ip6_rt_put(rt);
5049 		goto errout;
5050 	}
5051 
5052 	if (rt == net->ipv6.ip6_null_entry) {
5053 		err = rt->dst.error;
5054 		ip6_rt_put(rt);
5055 		goto errout;
5056 	}
5057 
5058 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
5059 	if (!skb) {
5060 		ip6_rt_put(rt);
5061 		err = -ENOBUFS;
5062 		goto errout;
5063 	}
5064 
5065 	skb_dst_set(skb, &rt->dst);
5066 
5067 	rcu_read_lock();
5068 	from = rcu_dereference(rt->from);
5069 	if (from) {
5070 		if (fibmatch)
5071 			err = rt6_fill_node(net, skb, from, NULL, NULL, NULL,
5072 					    iif, RTM_NEWROUTE,
5073 					    NETLINK_CB(in_skb).portid,
5074 					    nlh->nlmsg_seq, 0);
5075 		else
5076 			err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
5077 					    &fl6.saddr, iif, RTM_NEWROUTE,
5078 					    NETLINK_CB(in_skb).portid,
5079 					    nlh->nlmsg_seq, 0);
5080 	} else {
5081 		err = -ENETUNREACH;
5082 	}
5083 	rcu_read_unlock();
5084 
5085 	if (err < 0) {
5086 		kfree_skb(skb);
5087 		goto errout;
5088 	}
5089 
5090 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
5091 errout:
5092 	return err;
5093 }
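/* Illustrative userspace sketch (not part of this kernel file) of an
 * RTM_GETROUTE query of the kind validated by
 * inet6_rtm_valid_getroute_req() above: RTA_DST must carry a full /128
 * destination, and the kernel replies with the RTM_NEWROUTE message
 * that rt6_fill_node() built. The address is an example value.
 */
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <arpa/inet.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int getroute_example(void)
{
	struct {
		struct nlmsghdr nlh;
		struct rtmsg rtm;
		char attrs[32];
	} req;
	char reply[4096];
	struct rtattr *rta;
	struct in6_addr dst;
	int fd;

	fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	if (fd < 0)
		return -1;

	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg));
	req.nlh.nlmsg_type = RTM_GETROUTE;
	req.nlh.nlmsg_flags = NLM_F_REQUEST;
	req.rtm.rtm_family = AF_INET6;
	req.rtm.rtm_dst_len = 128;	/* strict validation wants /128 */

	inet_pton(AF_INET6, "2001:db8::1", &dst);
	rta = (struct rtattr *)((char *)&req + NLMSG_ALIGN(req.nlh.nlmsg_len));
	rta->rta_type = RTA_DST;
	rta->rta_len = RTA_LENGTH(sizeof(dst));
	memcpy(RTA_DATA(rta), &dst, sizeof(dst));
	req.nlh.nlmsg_len = NLMSG_ALIGN(req.nlh.nlmsg_len) + rta->rta_len;

	if (send(fd, &req, req.nlh.nlmsg_len, 0) < 0 ||
	    recv(fd, reply, sizeof(reply), 0) < 0) {
		close(fd);
		return -1;
	}
	/* reply now holds an RTM_NEWROUTE (or a netlink error) to parse */
	close(fd);
	return 0;
}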
5094 
5095 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
5096 		     unsigned int nlm_flags)
5097 {
5098 	struct sk_buff *skb;
5099 	struct net *net = info->nl_net;
5100 	u32 seq;
5101 	int err;
5102 
5103 	err = -ENOBUFS;
5104 	seq = info->nlh ? info->nlh->nlmsg_seq : 0;
5105 
5106 	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
5107 	if (!skb)
5108 		goto errout;
5109 
5110 	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
5111 			    event, info->portid, seq, nlm_flags);
5112 	if (err < 0) {
5113 		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
5114 		WARN_ON(err == -EMSGSIZE);
5115 		kfree_skb(skb);
5116 		goto errout;
5117 	}
5118 	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
5119 		    info->nlh, gfp_any());
5120 	return;
5121 errout:
5122 	if (err < 0)
5123 		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
5124 }
5125 
5126 static int ip6_route_dev_notify(struct notifier_block *this,
5127 				unsigned long event, void *ptr)
5128 {
5129 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
5130 	struct net *net = dev_net(dev);
5131 
5132 	if (!(dev->flags & IFF_LOOPBACK))
5133 		return NOTIFY_OK;
5134 
5135 	if (event == NETDEV_REGISTER) {
5136 		net->ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = dev;
5137 		net->ipv6.ip6_null_entry->dst.dev = dev;
5138 		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
5139 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5140 		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
5141 		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
5142 		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
5143 		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
5144 #endif
5145 	} else if (event == NETDEV_UNREGISTER &&
5146 		    dev->reg_state != NETREG_UNREGISTERED) {
5147 		/* NETDEV_UNREGISTER can be fired multiple times by
5148 		 * netdev_wait_allrefs(). Make sure we only call this once.
5149 		 */
5150 		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
5151 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5152 		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
5153 		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
5154 #endif
5155 	}
5156 
5157 	return NOTIFY_OK;
5158 }
5159 
5160 /*
5161  *	/proc
5162  */
5163 
5164 #ifdef CONFIG_PROC_FS
5165 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
5166 {
5167 	struct net *net = (struct net *)seq->private;
5168 	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
5169 		   net->ipv6.rt6_stats->fib_nodes,
5170 		   net->ipv6.rt6_stats->fib_route_nodes,
5171 		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
5172 		   net->ipv6.rt6_stats->fib_rt_entries,
5173 		   net->ipv6.rt6_stats->fib_rt_cache,
5174 		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
5175 		   net->ipv6.rt6_stats->fib_discarded_routes);
5176 
5177 	return 0;
5178 }
5179 #endif	/* CONFIG_PROC_FS */
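/* Illustrative userspace sketch (not part of this kernel file): reading
 * the seven hex fields that rt6_stats_seq_show() above prints to
 * /proc/net/rt6_stats, in the order they are emitted.
 */
#include <stdio.h>

int main(void)
{
	unsigned int nodes, route_nodes, rt_alloc, rt_entries,
		     rt_cache, dst_entries, discarded;
	FILE *f = fopen("/proc/net/rt6_stats", "r");

	if (!f)
		return 1;
	if (fscanf(f, "%x %x %x %x %x %x %x", &nodes, &route_nodes,
		   &rt_alloc, &rt_entries, &rt_cache, &dst_entries,
		   &discarded) == 7)
		printf("fib nodes: %u, route entries: %u, discarded: %u\n",
		       nodes, rt_entries, discarded);
	fclose(f);
	return 0;
}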
5180 
5181 #ifdef CONFIG_SYSCTL
5182 
5183 static int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
5185 			      void __user *buffer, size_t *lenp, loff_t *ppos)
5186 {
5187 	struct net *net;
5188 	int delay;
5189 	int ret;
5190 	if (!write)
5191 		return -EINVAL;
5192 
5193 	net = (struct net *)ctl->extra1;
5194 	delay = net->ipv6.sysctl.flush_delay;
5195 	ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
5196 	if (ret)
5197 		return ret;
5198 
5199 	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
5200 	return 0;
5201 }
5202 
5203 static int zero;
5204 static int one = 1;
5205 
5206 static struct ctl_table ipv6_route_table_template[] = {
5207 	{
5208 		.procname	=	"flush",
5209 		.data		=	&init_net.ipv6.sysctl.flush_delay,
5210 		.maxlen		=	sizeof(int),
5211 		.mode		=	0200,
5212 		.proc_handler	=	ipv6_sysctl_rtcache_flush
5213 	},
5214 	{
5215 		.procname	=	"gc_thresh",
5216 		.data		=	&ip6_dst_ops_template.gc_thresh,
5217 		.maxlen		=	sizeof(int),
5218 		.mode		=	0644,
5219 		.proc_handler	=	proc_dointvec,
5220 	},
5221 	{
5222 		.procname	=	"max_size",
5223 		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
5224 		.maxlen		=	sizeof(int),
5225 		.mode		=	0644,
5226 		.proc_handler	=	proc_dointvec,
5227 	},
5228 	{
5229 		.procname	=	"gc_min_interval",
5230 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5231 		.maxlen		=	sizeof(int),
5232 		.mode		=	0644,
5233 		.proc_handler	=	proc_dointvec_jiffies,
5234 	},
5235 	{
5236 		.procname	=	"gc_timeout",
5237 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
5238 		.maxlen		=	sizeof(int),
5239 		.mode		=	0644,
5240 		.proc_handler	=	proc_dointvec_jiffies,
5241 	},
5242 	{
5243 		.procname	=	"gc_interval",
5244 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
5245 		.maxlen		=	sizeof(int),
5246 		.mode		=	0644,
5247 		.proc_handler	=	proc_dointvec_jiffies,
5248 	},
5249 	{
5250 		.procname	=	"gc_elasticity",
5251 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
5252 		.maxlen		=	sizeof(int),
5253 		.mode		=	0644,
5254 		.proc_handler	=	proc_dointvec,
5255 	},
5256 	{
5257 		.procname	=	"mtu_expires",
5258 		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
5259 		.maxlen		=	sizeof(int),
5260 		.mode		=	0644,
5261 		.proc_handler	=	proc_dointvec_jiffies,
5262 	},
5263 	{
5264 		.procname	=	"min_adv_mss",
5265 		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
5266 		.maxlen		=	sizeof(int),
5267 		.mode		=	0644,
5268 		.proc_handler	=	proc_dointvec,
5269 	},
5270 	{
5271 		.procname	=	"gc_min_interval_ms",
5272 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5273 		.maxlen		=	sizeof(int),
5274 		.mode		=	0644,
5275 		.proc_handler	=	proc_dointvec_ms_jiffies,
5276 	},
5277 	{
5278 		.procname	=	"skip_notify_on_dev_down",
5279 		.data		=	&init_net.ipv6.sysctl.skip_notify_on_dev_down,
5280 		.maxlen		=	sizeof(int),
5281 		.mode		=	0644,
5282 		.proc_handler	=	proc_dointvec,
5283 		.extra1		=	&zero,
5284 		.extra2		=	&one,
5285 	},
5286 	{ }
5287 };
5288 
5289 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
5290 {
5291 	struct ctl_table *table;
5292 
5293 	table = kmemdup(ipv6_route_table_template,
5294 			sizeof(ipv6_route_table_template),
5295 			GFP_KERNEL);
5296 
5297 	if (table) {
5298 		table[0].data = &net->ipv6.sysctl.flush_delay;
5299 		table[0].extra1 = net;
5300 		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
5301 		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
5302 		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5303 		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
5304 		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
5305 		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
5306 		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
5307 		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
5308 		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5309 		table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;
5310 
5311 		/* Don't export sysctls to unprivileged users */
5312 		if (net->user_ns != &init_user_ns)
5313 			table[0].procname = NULL;
5314 	}
5315 
5316 	return table;
5317 }
5318 #endif
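/* Illustrative userspace sketch (not part of this kernel file): the
 * table above is exposed under /proc/sys/net/ipv6/route/ in each netns.
 * Writing to the write-only "flush" entry runs
 * ipv6_sysctl_rtcache_flush(), which hands the delay value to
 * fib6_run_gc(); writing "0" requests an immediate run.
 */
#include <fcntl.h>
#include <unistd.h>

int flush_route_cache_example(void)
{
	int fd = open("/proc/sys/net/ipv6/route/flush", O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, "0\n", 2) < 0) {
		close(fd);
		return -1;
	}
	close(fd);
	return 0;
}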
5319 
5320 static int __net_init ip6_route_net_init(struct net *net)
5321 {
5322 	int ret = -ENOMEM;
5323 
5324 	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
5325 	       sizeof(net->ipv6.ip6_dst_ops));
5326 
5327 	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
5328 		goto out_ip6_dst_ops;
5329 
5330 	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
5331 					    sizeof(*net->ipv6.fib6_null_entry),
5332 					    GFP_KERNEL);
5333 	if (!net->ipv6.fib6_null_entry)
5334 		goto out_ip6_dst_entries;
5335 
5336 	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
5337 					   sizeof(*net->ipv6.ip6_null_entry),
5338 					   GFP_KERNEL);
5339 	if (!net->ipv6.ip6_null_entry)
5340 		goto out_fib6_null_entry;
5341 	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5342 	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
5343 			 ip6_template_metrics, true);
5344 
5345 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5346 	net->ipv6.fib6_has_custom_rules = false;
5347 	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
5348 					       sizeof(*net->ipv6.ip6_prohibit_entry),
5349 					       GFP_KERNEL);
5350 	if (!net->ipv6.ip6_prohibit_entry)
5351 		goto out_ip6_null_entry;
5352 	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5353 	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
5354 			 ip6_template_metrics, true);
5355 
5356 	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
5357 					       sizeof(*net->ipv6.ip6_blk_hole_entry),
5358 					       GFP_KERNEL);
5359 	if (!net->ipv6.ip6_blk_hole_entry)
5360 		goto out_ip6_prohibit_entry;
5361 	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5362 	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
5363 			 ip6_template_metrics, true);
5364 #endif
5365 
5366 	net->ipv6.sysctl.flush_delay = 0;
5367 	net->ipv6.sysctl.ip6_rt_max_size = 4096;
5368 	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
5369 	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
5370 	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
5371 	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
5372 	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
5373 	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
5374 	net->ipv6.sysctl.skip_notify_on_dev_down = 0;
5375 
5376 	net->ipv6.ip6_rt_gc_expire = 30*HZ;
5377 
5378 	ret = 0;
5379 out:
5380 	return ret;
5381 
5382 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5383 out_ip6_prohibit_entry:
5384 	kfree(net->ipv6.ip6_prohibit_entry);
5385 out_ip6_null_entry:
5386 	kfree(net->ipv6.ip6_null_entry);
5387 #endif
5388 out_fib6_null_entry:
5389 	kfree(net->ipv6.fib6_null_entry);
5390 out_ip6_dst_entries:
5391 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5392 out_ip6_dst_ops:
5393 	goto out;
5394 }
5395 
5396 static void __net_exit ip6_route_net_exit(struct net *net)
5397 {
5398 	kfree(net->ipv6.fib6_null_entry);
5399 	kfree(net->ipv6.ip6_null_entry);
5400 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5401 	kfree(net->ipv6.ip6_prohibit_entry);
5402 	kfree(net->ipv6.ip6_blk_hole_entry);
5403 #endif
5404 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5405 }
5406 
5407 static int __net_init ip6_route_net_init_late(struct net *net)
5408 {
5409 #ifdef CONFIG_PROC_FS
5410 	proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
5411 			sizeof(struct ipv6_route_iter));
5412 	proc_create_net_single("rt6_stats", 0444, net->proc_net,
5413 			rt6_stats_seq_show, NULL);
5414 #endif
5415 	return 0;
5416 }
5417 
5418 static void __net_exit ip6_route_net_exit_late(struct net *net)
5419 {
5420 #ifdef CONFIG_PROC_FS
5421 	remove_proc_entry("ipv6_route", net->proc_net);
5422 	remove_proc_entry("rt6_stats", net->proc_net);
5423 #endif
5424 }
5425 
5426 static struct pernet_operations ip6_route_net_ops = {
5427 	.init = ip6_route_net_init,
5428 	.exit = ip6_route_net_exit,
5429 };
5430 
5431 static int __net_init ipv6_inetpeer_init(struct net *net)
5432 {
5433 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5434 
5435 	if (!bp)
5436 		return -ENOMEM;
5437 	inet_peer_base_init(bp);
5438 	net->ipv6.peers = bp;
5439 	return 0;
5440 }
5441 
5442 static void __net_exit ipv6_inetpeer_exit(struct net *net)
5443 {
5444 	struct inet_peer_base *bp = net->ipv6.peers;
5445 
5446 	net->ipv6.peers = NULL;
5447 	inetpeer_invalidate_tree(bp);
5448 	kfree(bp);
5449 }
5450 
5451 static struct pernet_operations ipv6_inetpeer_ops = {
5452 	.init	=	ipv6_inetpeer_init,
5453 	.exit	=	ipv6_inetpeer_exit,
5454 };
5455 
5456 static struct pernet_operations ip6_route_net_late_ops = {
5457 	.init = ip6_route_net_init_late,
5458 	.exit = ip6_route_net_exit_late,
5459 };
5460 
5461 static struct notifier_block ip6_route_dev_notifier = {
5462 	.notifier_call = ip6_route_dev_notify,
5463 	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
5464 };
5465 
5466 void __init ip6_route_init_special_entries(void)
5467 {
5468 	/* Registration of the loopback device is done before this portion
5469 	 * of code, so the loopback reference in rt6_info will not have been
5470 	 * taken; do it manually for init_net. */
5471 	init_net.ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = init_net.loopback_dev;
5472 	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
5473 	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5474 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5475 	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
5476 	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5477 	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
5478 	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5479 #endif
5480 }
5481 
5482 int __init ip6_route_init(void)
5483 {
5484 	int ret;
5485 	int cpu;
5486 
5487 	ret = -ENOMEM;
5488 	ip6_dst_ops_template.kmem_cachep =
5489 		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
5490 				  SLAB_HWCACHE_ALIGN, NULL);
5491 	if (!ip6_dst_ops_template.kmem_cachep)
5492 		goto out;
5493 
5494 	ret = dst_entries_init(&ip6_dst_blackhole_ops);
5495 	if (ret)
5496 		goto out_kmem_cache;
5497 
5498 	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
5499 	if (ret)
5500 		goto out_dst_entries;
5501 
5502 	ret = register_pernet_subsys(&ip6_route_net_ops);
5503 	if (ret)
5504 		goto out_register_inetpeer;
5505 
5506 	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
5507 
5508 	ret = fib6_init();
5509 	if (ret)
5510 		goto out_register_subsys;
5511 
5512 	ret = xfrm6_init();
5513 	if (ret)
5514 		goto out_fib6_init;
5515 
5516 	ret = fib6_rules_init();
5517 	if (ret)
5518 		goto xfrm6_init;
5519 
5520 	ret = register_pernet_subsys(&ip6_route_net_late_ops);
5521 	if (ret)
5522 		goto fib6_rules_init;
5523 
5524 	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
5525 				   inet6_rtm_newroute, NULL, 0);
5526 	if (ret < 0)
5527 		goto out_register_late_subsys;
5528 
5529 	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
5530 				   inet6_rtm_delroute, NULL, 0);
5531 	if (ret < 0)
5532 		goto out_register_late_subsys;
5533 
5534 	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
5535 				   inet6_rtm_getroute, NULL,
5536 				   RTNL_FLAG_DOIT_UNLOCKED);
5537 	if (ret < 0)
5538 		goto out_register_late_subsys;
5539 
5540 	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
5541 	if (ret)
5542 		goto out_register_late_subsys;
5543 
5544 	for_each_possible_cpu(cpu) {
5545 		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
5546 
5547 		INIT_LIST_HEAD(&ul->head);
5548 		spin_lock_init(&ul->lock);
5549 	}
5550 
5551 out:
5552 	return ret;
5553 
5554 out_register_late_subsys:
5555 	rtnl_unregister_all(PF_INET6);
5556 	unregister_pernet_subsys(&ip6_route_net_late_ops);
5557 fib6_rules_init:
5558 	fib6_rules_cleanup();
5559 xfrm6_init:
5560 	xfrm6_fini();
5561 out_fib6_init:
5562 	fib6_gc_cleanup();
5563 out_register_subsys:
5564 	unregister_pernet_subsys(&ip6_route_net_ops);
5565 out_register_inetpeer:
5566 	unregister_pernet_subsys(&ipv6_inetpeer_ops);
5567 out_dst_entries:
5568 	dst_entries_destroy(&ip6_dst_blackhole_ops);
5569 out_kmem_cache:
5570 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5571 	goto out;
5572 }
5573 
5574 void ip6_route_cleanup(void)
5575 {
5576 	unregister_netdevice_notifier(&ip6_route_dev_notifier);
5577 	unregister_pernet_subsys(&ip6_route_net_late_ops);
5578 	fib6_rules_cleanup();
5579 	xfrm6_fini();
5580 	fib6_gc_cleanup();
5581 	unregister_pernet_subsys(&ipv6_inetpeer_ops);
5582 	unregister_pernet_subsys(&ip6_route_net_ops);
5583 	dst_entries_destroy(&ip6_dst_blackhole_ops);
5584 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5585 }
5586