1 /*
2  *	Linux INET6 implementation
3  *	FIB front-end.
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13 
14 /*	Changes:
15  *
16  *	YOSHIFUJI Hideaki @USAGI
17  *		reworked default router selection.
18  *		- respect outgoing interface
19  *		- select from (probably) reachable routers (i.e.
20  *		routers in REACHABLE, STALE, DELAY or PROBE states).
21  *		- always select the same router if it is (probably)
22  *		reachable.  otherwise, round-robin the list.
23  *	Ville Nuorvala
24  *		Fixed routing subtrees.
25  */
26 
27 #define pr_fmt(fmt) "IPv6: " fmt
28 
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/rtnh.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <net/ip.h>
67 #include <linux/uaccess.h>
68 
69 #ifdef CONFIG_SYSCTL
70 #include <linux/sysctl.h>
71 #endif
72 
73 static int ip6_rt_type_to_error(u8 fib6_type);
74 
75 #define CREATE_TRACE_POINTS
76 #include <trace/events/fib6.h>
77 EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
78 #undef CREATE_TRACE_POINTS
79 
80 enum rt6_nud_state {
81 	RT6_NUD_FAIL_HARD = -3,
82 	RT6_NUD_FAIL_PROBE = -2,
83 	RT6_NUD_FAIL_DO_RR = -1,
84 	RT6_NUD_SUCCEED = 1
85 };
86 
87 static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
88 static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
89 static unsigned int	 ip6_mtu(const struct dst_entry *dst);
90 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
91 static void		ip6_dst_destroy(struct dst_entry *);
92 static void		ip6_dst_ifdown(struct dst_entry *,
93 				       struct net_device *dev, int how);
94 static int		 ip6_dst_gc(struct dst_ops *ops);
95 
96 static int		ip6_pkt_discard(struct sk_buff *skb);
97 static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
98 static int		ip6_pkt_prohibit(struct sk_buff *skb);
99 static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
100 static void		ip6_link_failure(struct sk_buff *skb);
101 static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
102 					   struct sk_buff *skb, u32 mtu);
103 static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
104 					struct sk_buff *skb);
105 static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
106 			   int strict);
107 static size_t rt6_nlmsg_size(struct fib6_info *rt);
108 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
109 			 struct fib6_info *rt, struct dst_entry *dst,
110 			 struct in6_addr *dest, struct in6_addr *src,
111 			 int iif, int type, u32 portid, u32 seq,
112 			 unsigned int flags);
113 static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
114 					   struct in6_addr *daddr,
115 					   struct in6_addr *saddr);
116 
117 #ifdef CONFIG_IPV6_ROUTE_INFO
118 static struct fib6_info *rt6_add_route_info(struct net *net,
119 					   const struct in6_addr *prefix, int prefixlen,
120 					   const struct in6_addr *gwaddr,
121 					   struct net_device *dev,
122 					   unsigned int pref);
123 static struct fib6_info *rt6_get_route_info(struct net *net,
124 					   const struct in6_addr *prefix, int prefixlen,
125 					   const struct in6_addr *gwaddr,
126 					   struct net_device *dev);
127 #endif
128 
129 struct uncached_list {
130 	spinlock_t		lock;
131 	struct list_head	head;
132 };
133 
134 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
135 
136 void rt6_uncached_list_add(struct rt6_info *rt)
137 {
138 	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
139 
140 	rt->rt6i_uncached_list = ul;
141 
142 	spin_lock_bh(&ul->lock);
143 	list_add_tail(&rt->rt6i_uncached, &ul->head);
144 	spin_unlock_bh(&ul->lock);
145 }
146 
147 void rt6_uncached_list_del(struct rt6_info *rt)
148 {
149 	if (!list_empty(&rt->rt6i_uncached)) {
150 		struct uncached_list *ul = rt->rt6i_uncached_list;
151 		struct net *net = dev_net(rt->dst.dev);
152 
153 		spin_lock_bh(&ul->lock);
154 		list_del(&rt->rt6i_uncached);
155 		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
156 		spin_unlock_bh(&ul->lock);
157 	}
158 }
159 
160 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
161 {
162 	struct net_device *loopback_dev = net->loopback_dev;
163 	int cpu;
164 
165 	if (dev == loopback_dev)
166 		return;
167 
168 	for_each_possible_cpu(cpu) {
169 		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
170 		struct rt6_info *rt;
171 
172 		spin_lock_bh(&ul->lock);
173 		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
174 			struct inet6_dev *rt_idev = rt->rt6i_idev;
175 			struct net_device *rt_dev = rt->dst.dev;
176 
177 			if (rt_idev->dev == dev) {
178 				rt->rt6i_idev = in6_dev_get(loopback_dev);
179 				in6_dev_put(rt_idev);
180 			}
181 
182 			if (rt_dev == dev) {
183 				rt->dst.dev = loopback_dev;
184 				dev_hold(rt->dst.dev);
185 				dev_put(rt_dev);
186 			}
187 		}
188 		spin_unlock_bh(&ul->lock);
189 	}
190 }
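/*
 * Why re-point instead of free: other code may still hold references to
 * these uncached dsts, so on device removal they are re-bound to the
 * namespace's loopback device, which is guaranteed to outlive them. The
 * early return above skips the rebinding when loopback itself is being
 * torn down with the namespace.
 */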
191 
192 static inline const void *choose_neigh_daddr(const struct in6_addr *p,
193 					     struct sk_buff *skb,
194 					     const void *daddr)
195 {
196 	if (!ipv6_addr_any(p))
197 		return (const void *) p;
198 	else if (skb)
199 		return &ipv6_hdr(skb)->daddr;
200 	return daddr;
201 }
202 
203 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
204 				   struct net_device *dev,
205 				   struct sk_buff *skb,
206 				   const void *daddr)
207 {
208 	struct neighbour *n;
209 
210 	daddr = choose_neigh_daddr(gw, skb, daddr);
211 	n = __ipv6_neigh_lookup(dev, daddr);
212 	if (n)
213 		return n;
214 
215 	n = neigh_create(&nd_tbl, daddr, dev);
216 	return IS_ERR(n) ? NULL : n;
217 }
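/*
 * Illustrative sketch (hypothetical helper, not in the original source):
 * resolving the neighbour for a route. For gatewayed routes rt6i_gateway
 * holds the next hop; for on-link routes it is the unspecified address
 * and choose_neigh_daddr() falls back to the skb's destination, so @skb
 * is assumed to carry an IPv6 header here.
 */
static __maybe_unused struct neighbour *
example_route_neigh(struct rt6_info *rt, struct sk_buff *skb)
{
	/* returns NULL if neigh_create() fails */
	return ip6_neigh_lookup(&rt->rt6i_gateway, rt->dst.dev, skb, NULL);
}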
218 
219 static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
220 					      struct sk_buff *skb,
221 					      const void *daddr)
222 {
223 	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);
224 
225 	return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
226 }
227 
228 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
229 {
230 	struct net_device *dev = dst->dev;
231 	struct rt6_info *rt = (struct rt6_info *)dst;
232 
233 	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
234 	if (!daddr)
235 		return;
236 	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
237 		return;
238 	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
239 		return;
240 	__ipv6_confirm_neigh(dev, daddr);
241 }
242 
243 static struct dst_ops ip6_dst_ops_template = {
244 	.family			=	AF_INET6,
245 	.gc			=	ip6_dst_gc,
246 	.gc_thresh		=	1024,
247 	.check			=	ip6_dst_check,
248 	.default_advmss		=	ip6_default_advmss,
249 	.mtu			=	ip6_mtu,
250 	.cow_metrics		=	dst_cow_metrics_generic,
251 	.destroy		=	ip6_dst_destroy,
252 	.ifdown			=	ip6_dst_ifdown,
253 	.negative_advice	=	ip6_negative_advice,
254 	.link_failure		=	ip6_link_failure,
255 	.update_pmtu		=	ip6_rt_update_pmtu,
256 	.redirect		=	rt6_do_redirect,
257 	.local_out		=	__ip6_local_out,
258 	.neigh_lookup		=	ip6_dst_neigh_lookup,
259 	.confirm_neigh		=	ip6_confirm_neigh,
260 };
261 
262 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
263 {
264 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
265 
266 	return mtu ? : dst->dev->mtu;
267 }
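/*
 * The GNU "?:" shorthand above, mtu ? : dst->dev->mtu, expands to
 * mtu ? mtu : dst->dev->mtu: report the cached RTAX_MTU metric when one
 * is set, otherwise fall back to the device MTU.
 */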
268 
269 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
270 					 struct sk_buff *skb, u32 mtu)
271 {
272 }
273 
274 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
275 				      struct sk_buff *skb)
276 {
277 }
278 
279 static struct dst_ops ip6_dst_blackhole_ops = {
280 	.family			=	AF_INET6,
281 	.destroy		=	ip6_dst_destroy,
282 	.check			=	ip6_dst_check,
283 	.mtu			=	ip6_blackhole_mtu,
284 	.default_advmss		=	ip6_default_advmss,
285 	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
286 	.redirect		=	ip6_rt_blackhole_redirect,
287 	.cow_metrics		=	dst_cow_metrics_generic,
288 	.neigh_lookup		=	ip6_dst_neigh_lookup,
289 };
290 
291 static const u32 ip6_template_metrics[RTAX_MAX] = {
292 	[RTAX_HOPLIMIT - 1] = 0,
293 };
294 
295 static const struct fib6_info fib6_null_entry_template = {
296 	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
297 	.fib6_protocol  = RTPROT_KERNEL,
298 	.fib6_metric	= ~(u32)0,
299 	.fib6_ref	= REFCOUNT_INIT(1),
300 	.fib6_type	= RTN_UNREACHABLE,
301 	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
302 };
303 
304 static const struct rt6_info ip6_null_entry_template = {
305 	.dst = {
306 		.__refcnt	= ATOMIC_INIT(1),
307 		.__use		= 1,
308 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
309 		.error		= -ENETUNREACH,
310 		.input		= ip6_pkt_discard,
311 		.output		= ip6_pkt_discard_out,
312 	},
313 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
314 };
315 
316 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
317 
318 static const struct rt6_info ip6_prohibit_entry_template = {
319 	.dst = {
320 		.__refcnt	= ATOMIC_INIT(1),
321 		.__use		= 1,
322 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
323 		.error		= -EACCES,
324 		.input		= ip6_pkt_prohibit,
325 		.output		= ip6_pkt_prohibit_out,
326 	},
327 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
328 };
329 
330 static const struct rt6_info ip6_blk_hole_entry_template = {
331 	.dst = {
332 		.__refcnt	= ATOMIC_INIT(1),
333 		.__use		= 1,
334 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
335 		.error		= -EINVAL,
336 		.input		= dst_discard,
337 		.output		= dst_discard_out,
338 	},
339 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
340 };
341 
342 #endif
343 
344 static void rt6_info_init(struct rt6_info *rt)
345 {
346 	struct dst_entry *dst = &rt->dst;
347 
348 	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
349 	INIT_LIST_HEAD(&rt->rt6i_uncached);
350 }
351 
352 /* allocate dst with ip6_dst_ops */
353 struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
354 			       int flags)
355 {
356 	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
357 					1, DST_OBSOLETE_FORCE_CHK, flags);
358 
359 	if (rt) {
360 		rt6_info_init(rt);
361 		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
362 	}
363 
364 	return rt;
365 }
366 EXPORT_SYMBOL(ip6_dst_alloc);
367 
368 static void ip6_dst_destroy(struct dst_entry *dst)
369 {
370 	struct rt6_info *rt = (struct rt6_info *)dst;
371 	struct fib6_info *from;
372 	struct inet6_dev *idev;
373 
374 	ip_dst_metrics_put(dst);
375 	rt6_uncached_list_del(rt);
376 
377 	idev = rt->rt6i_idev;
378 	if (idev) {
379 		rt->rt6i_idev = NULL;
380 		in6_dev_put(idev);
381 	}
382 
383 	from = xchg((__force struct fib6_info **)&rt->from, NULL);
384 	fib6_info_release(from);
385 }
386 
387 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
388 			   int how)
389 {
390 	struct rt6_info *rt = (struct rt6_info *)dst;
391 	struct inet6_dev *idev = rt->rt6i_idev;
392 	struct net_device *loopback_dev =
393 		dev_net(dev)->loopback_dev;
394 
395 	if (idev && idev->dev != loopback_dev) {
396 		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
397 		if (loopback_idev) {
398 			rt->rt6i_idev = loopback_idev;
399 			in6_dev_put(idev);
400 		}
401 	}
402 }
403 
404 static bool __rt6_check_expired(const struct rt6_info *rt)
405 {
406 	if (rt->rt6i_flags & RTF_EXPIRES)
407 		return time_after(jiffies, rt->dst.expires);
408 	else
409 		return false;
410 }
411 
412 static bool rt6_check_expired(const struct rt6_info *rt)
413 {
414 	struct fib6_info *from;
415 
416 	from = rcu_dereference(rt->from);
417 
418 	if (rt->rt6i_flags & RTF_EXPIRES) {
419 		if (time_after(jiffies, rt->dst.expires))
420 			return true;
421 	} else if (from) {
422 		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
423 			fib6_check_expired(from);
424 	}
425 	return false;
426 }
427 
428 void fib6_select_path(const struct net *net, struct fib6_result *res,
429 		      struct flowi6 *fl6, int oif, bool have_oif_match,
430 		      const struct sk_buff *skb, int strict)
431 {
432 	struct fib6_info *sibling, *next_sibling;
433 	struct fib6_info *match = res->f6i;
434 
435 	if (!match->fib6_nsiblings || have_oif_match)
436 		goto out;
437 
438 	/* We might have already computed the hash for ICMPv6 errors. In such
439 	 * a case it will always be non-zero. Otherwise now is the time to do it.
440 	 */
441 	if (!fl6->mp_hash)
442 		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
443 
444 	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.fib_nh_upper_bound))
445 		goto out;
446 
447 	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
448 				 fib6_siblings) {
449 		const struct fib6_nh *nh = &sibling->fib6_nh;
450 		int nh_upper_bound;
451 
452 		nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound);
453 		if (fl6->mp_hash > nh_upper_bound)
454 			continue;
455 		if (rt6_score_route(nh, sibling->fib6_flags, oif, strict) < 0)
456 			break;
457 		match = sibling;
458 		break;
459 	}
460 
461 out:
462 	res->f6i = match;
463 	res->nh = &match->fib6_nh;
464 }
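/*
 * Sketch of the hash-threshold selection performed above (hypothetical
 * helper, not in the original source). Every sibling nexthop is assigned
 * an upper bound that partitions the hash space in proportion to its
 * weight; a flow takes the first nexthop whose bound covers its hash, so
 * packets of one flow stay on one path.
 */
static int __maybe_unused example_hash_threshold(u32 mp_hash,
						 const int *upper_bounds,
						 int num_paths)
{
	int i;

	for (i = 0; i < num_paths; i++) {
		if ((int)mp_hash <= upper_bounds[i])
			return i;	/* first bound covering the hash */
	}
	return num_paths - 1;
}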
465 
466 /*
467  *	Route lookup. rcu_read_lock() should be held.
468  */
469 
470 static bool __rt6_device_match(struct net *net, const struct fib6_nh *nh,
471 			       const struct in6_addr *saddr, int oif, int flags)
472 {
473 	const struct net_device *dev;
474 
475 	if (nh->fib_nh_flags & RTNH_F_DEAD)
476 		return false;
477 
478 	dev = nh->fib_nh_dev;
479 	if (oif) {
480 		if (dev->ifindex == oif)
481 			return true;
482 	} else {
483 		if (ipv6_chk_addr(net, saddr, dev,
484 				  flags & RT6_LOOKUP_F_IFACE))
485 			return true;
486 	}
487 
488 	return false;
489 }
490 
491 static void rt6_device_match(struct net *net, struct fib6_result *res,
492 			     const struct in6_addr *saddr, int oif, int flags)
493 {
494 	struct fib6_info *f6i = res->f6i;
495 	struct fib6_info *spf6i;
496 	struct fib6_nh *nh;
497 
498 	if (!oif && ipv6_addr_any(saddr)) {
499 		nh = &f6i->fib6_nh;
500 		if (!(nh->fib_nh_flags & RTNH_F_DEAD))
501 			goto out;
502 	}
503 
504 	for (spf6i = f6i; spf6i; spf6i = rcu_dereference(spf6i->fib6_next)) {
505 		nh = &spf6i->fib6_nh;
506 		if (__rt6_device_match(net, nh, saddr, oif, flags)) {
507 			res->f6i = spf6i;
508 			goto out;
509 		}
510 	}
511 
512 	if (oif && flags & RT6_LOOKUP_F_IFACE) {
513 		res->f6i = net->ipv6.fib6_null_entry;
514 		nh = &res->f6i->fib6_nh;
515 		goto out;
516 	}
517 
518 	nh = &f6i->fib6_nh;
519 	if (nh->fib_nh_flags & RTNH_F_DEAD) {
520 		res->f6i = net->ipv6.fib6_null_entry;
521 		nh = &res->f6i->fib6_nh;
522 	}
523 out:
524 	res->nh = nh;
525 	res->fib6_type = res->f6i->fib6_type;
526 	res->fib6_flags = res->f6i->fib6_flags;
527 }
528 
529 #ifdef CONFIG_IPV6_ROUTER_PREF
530 struct __rt6_probe_work {
531 	struct work_struct work;
532 	struct in6_addr target;
533 	struct net_device *dev;
534 };
535 
536 static void rt6_probe_deferred(struct work_struct *w)
537 {
538 	struct in6_addr mcaddr;
539 	struct __rt6_probe_work *work =
540 		container_of(w, struct __rt6_probe_work, work);
541 
542 	addrconf_addr_solict_mult(&work->target, &mcaddr);
543 	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
544 	dev_put(work->dev);
545 	kfree(work);
546 }
547 
548 static void rt6_probe(struct fib6_nh *fib6_nh)
549 {
550 	struct __rt6_probe_work *work = NULL;
551 	const struct in6_addr *nh_gw;
552 	struct neighbour *neigh;
553 	struct net_device *dev;
554 	struct inet6_dev *idev;
555 
556 	/*
557 	 * Okay, this does not seem to be appropriate
558 	 * for now, however, we need to check if it
559 	 * is really so; aka Router Reachability Probing.
560 	 *
561 	 * Router Reachability Probe MUST be rate-limited
562 	 * to no more than one per minute.
563 	 */
564 	if (fib6_nh->fib_nh_gw_family)
565 		return;
566 
567 	nh_gw = &fib6_nh->fib_nh_gw6;
568 	dev = fib6_nh->fib_nh_dev;
569 	rcu_read_lock_bh();
570 	idev = __in6_dev_get(dev);
571 	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
572 	if (neigh) {
573 		if (neigh->nud_state & NUD_VALID)
574 			goto out;
575 
576 		write_lock(&neigh->lock);
577 		if (!(neigh->nud_state & NUD_VALID) &&
578 		    time_after(jiffies,
579 			       neigh->updated + idev->cnf.rtr_probe_interval)) {
580 			work = kmalloc(sizeof(*work), GFP_ATOMIC);
581 			if (work)
582 				__neigh_set_probe_once(neigh);
583 		}
584 		write_unlock(&neigh->lock);
585 	} else if (time_after(jiffies, fib6_nh->last_probe +
586 				       idev->cnf.rtr_probe_interval)) {
587 		work = kmalloc(sizeof(*work), GFP_ATOMIC);
588 	}
589 
590 	if (work) {
591 		fib6_nh->last_probe = jiffies;
592 		INIT_WORK(&work->work, rt6_probe_deferred);
593 		work->target = *nh_gw;
594 		dev_hold(dev);
595 		work->dev = dev;
596 		schedule_work(&work->work);
597 	}
598 
599 out:
600 	rcu_read_unlock_bh();
601 }
602 #else
603 static inline void rt6_probe(struct fib6_nh *fib6_nh)
604 {
605 }
606 #endif
607 
608 /*
609  * Default Router Selection (RFC 2461 6.3.6)
610  */
611 static enum rt6_nud_state rt6_check_neigh(const struct fib6_nh *fib6_nh)
612 {
613 	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
614 	struct neighbour *neigh;
615 
616 	rcu_read_lock_bh();
617 	neigh = __ipv6_neigh_lookup_noref(fib6_nh->fib_nh_dev,
618 					  &fib6_nh->fib_nh_gw6);
619 	if (neigh) {
620 		read_lock(&neigh->lock);
621 		if (neigh->nud_state & NUD_VALID)
622 			ret = RT6_NUD_SUCCEED;
623 #ifdef CONFIG_IPV6_ROUTER_PREF
624 		else if (!(neigh->nud_state & NUD_FAILED))
625 			ret = RT6_NUD_SUCCEED;
626 		else
627 			ret = RT6_NUD_FAIL_PROBE;
628 #endif
629 		read_unlock(&neigh->lock);
630 	} else {
631 		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
632 		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
633 	}
634 	rcu_read_unlock_bh();
635 
636 	return ret;
637 }
638 
639 static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
640 			   int strict)
641 {
642 	int m = 0;
643 
644 	if (!oif || nh->fib_nh_dev->ifindex == oif)
645 		m = 2;
646 
647 	if (!m && (strict & RT6_LOOKUP_F_IFACE))
648 		return RT6_NUD_FAIL_HARD;
649 #ifdef CONFIG_IPV6_ROUTER_PREF
650 	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(fib6_flags)) << 2;
651 #endif
652 	if ((strict & RT6_LOOKUP_F_REACHABLE) &&
653 	    !(fib6_flags & RTF_NONEXTHOP) && nh->fib_nh_gw_family) {
654 		int n = rt6_check_neigh(nh);
655 		if (n < 0)
656 			return n;
657 	}
658 	return m;
659 }
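/*
 * Score layout used above: value 2 marks an outgoing-interface match,
 * and with CONFIG_IPV6_ROUTER_PREF the decoded 2-bit router preference
 * from the RA is shifted into the bits above it, so a preference
 * difference outweighs an interface match. Negative returns are the
 * rt6_nud_state failure codes consumed by find_match() below.
 */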
660 
661 static bool find_match(struct fib6_nh *nh, u32 fib6_flags,
662 		       int oif, int strict, int *mpri, bool *do_rr)
663 {
664 	bool match_do_rr = false;
665 	bool rc = false;
666 	int m;
667 
668 	if (nh->fib_nh_flags & RTNH_F_DEAD)
669 		goto out;
670 
671 	if (ip6_ignore_linkdown(nh->fib_nh_dev) &&
672 	    nh->fib_nh_flags & RTNH_F_LINKDOWN &&
673 	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
674 		goto out;
675 
676 	m = rt6_score_route(nh, fib6_flags, oif, strict);
677 	if (m == RT6_NUD_FAIL_DO_RR) {
678 		match_do_rr = true;
679 		m = 0; /* lowest valid score */
680 	} else if (m == RT6_NUD_FAIL_HARD) {
681 		goto out;
682 	}
683 
684 	if (strict & RT6_LOOKUP_F_REACHABLE)
685 		rt6_probe(nh);
686 
687 	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
688 	if (m > *mpri) {
689 		*do_rr = match_do_rr;
690 		*mpri = m;
691 		rc = true;
692 	}
693 out:
694 	return rc;
695 }
696 
697 static void __find_rr_leaf(struct fib6_info *f6i_start,
698 			   struct fib6_info *nomatch, u32 metric,
699 			   struct fib6_result *res, struct fib6_info **cont,
700 			   int oif, int strict, bool *do_rr, int *mpri)
701 {
702 	struct fib6_info *f6i;
703 
704 	for (f6i = f6i_start;
705 	     f6i && f6i != nomatch;
706 	     f6i = rcu_dereference(f6i->fib6_next)) {
707 		struct fib6_nh *nh;
708 
709 		if (cont && f6i->fib6_metric != metric) {
710 			*cont = f6i;
711 			return;
712 		}
713 
714 		if (fib6_check_expired(f6i))
715 			continue;
716 
717 		nh = &f6i->fib6_nh;
718 		if (find_match(nh, f6i->fib6_flags, oif, strict, mpri, do_rr)) {
719 			res->f6i = f6i;
720 			res->nh = nh;
721 			res->fib6_flags = f6i->fib6_flags;
722 			res->fib6_type = f6i->fib6_type;
723 		}
724 	}
725 }
726 
727 static void find_rr_leaf(struct fib6_node *fn, struct fib6_info *leaf,
728 			 struct fib6_info *rr_head, int oif, int strict,
729 			 bool *do_rr, struct fib6_result *res)
730 {
731 	u32 metric = rr_head->fib6_metric;
732 	struct fib6_info *cont = NULL;
733 	int mpri = -1;
734 
735 	__find_rr_leaf(rr_head, NULL, metric, res, &cont,
736 		       oif, strict, do_rr, &mpri);
737 
738 	__find_rr_leaf(leaf, rr_head, metric, res, &cont,
739 		       oif, strict, do_rr, &mpri);
740 
741 	if (res->f6i || !cont)
742 		return;
743 
744 	__find_rr_leaf(cont, NULL, metric, res, NULL,
745 		       oif, strict, do_rr, &mpri);
746 }
747 
748 static void rt6_select(struct net *net, struct fib6_node *fn, int oif,
749 		       struct fib6_result *res, int strict)
750 {
751 	struct fib6_info *leaf = rcu_dereference(fn->leaf);
752 	struct fib6_info *rt0;
753 	bool do_rr = false;
754 	int key_plen;
755 
756 	/* make sure this function or its helpers set f6i */
757 	res->f6i = NULL;
758 
759 	if (!leaf || leaf == net->ipv6.fib6_null_entry)
760 		goto out;
761 
762 	rt0 = rcu_dereference(fn->rr_ptr);
763 	if (!rt0)
764 		rt0 = leaf;
765 
766 	/* Double check to make sure fn is not an intermediate node
767 	 * and fn->leaf does not point to its child's leaf
768 	 * (This might happen if all routes under fn are deleted from
769 	 * the tree and fib6_repair_tree() is called on the node.)
770 	 */
771 	key_plen = rt0->fib6_dst.plen;
772 #ifdef CONFIG_IPV6_SUBTREES
773 	if (rt0->fib6_src.plen)
774 		key_plen = rt0->fib6_src.plen;
775 #endif
776 	if (fn->fn_bit != key_plen)
777 		goto out;
778 
779 	find_rr_leaf(fn, leaf, rt0, oif, strict, &do_rr, res);
780 	if (do_rr) {
781 		struct fib6_info *next = rcu_dereference(rt0->fib6_next);
782 
783 		/* no entries matched; do round-robin */
784 		if (!next || next->fib6_metric != rt0->fib6_metric)
785 			next = leaf;
786 
787 		if (next != rt0) {
788 			spin_lock_bh(&leaf->fib6_table->tb6_lock);
789 			/* make sure next is not being deleted from the tree */
790 			if (next->fib6_node)
791 				rcu_assign_pointer(fn->rr_ptr, next);
792 			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
793 		}
794 	}
795 
796 out:
797 	if (!res->f6i) {
798 		res->f6i = net->ipv6.fib6_null_entry;
799 		res->nh = &res->f6i->fib6_nh;
800 		res->fib6_flags = res->f6i->fib6_flags;
801 		res->fib6_type = res->f6i->fib6_type;
802 	}
803 }
804 
805 static bool rt6_is_gw_or_nonexthop(const struct fib6_result *res)
806 {
807 	return (res->f6i->fib6_flags & RTF_NONEXTHOP) ||
808 	       res->nh->fib_nh_gw_family;
809 }
810 
811 #ifdef CONFIG_IPV6_ROUTE_INFO
812 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
813 		  const struct in6_addr *gwaddr)
814 {
815 	struct net *net = dev_net(dev);
816 	struct route_info *rinfo = (struct route_info *) opt;
817 	struct in6_addr prefix_buf, *prefix;
818 	unsigned int pref;
819 	unsigned long lifetime;
820 	struct fib6_info *rt;
821 
822 	if (len < sizeof(struct route_info)) {
823 		return -EINVAL;
824 	}
825 
826 	/* Sanity check for prefix_len and length */
827 	if (rinfo->length > 3) {
828 		return -EINVAL;
829 	} else if (rinfo->prefix_len > 128) {
830 		return -EINVAL;
831 	} else if (rinfo->prefix_len > 64) {
832 		if (rinfo->length < 2) {
833 			return -EINVAL;
834 		}
835 	} else if (rinfo->prefix_len > 0) {
836 		if (rinfo->length < 1) {
837 			return -EINVAL;
838 		}
839 	}
840 
841 	pref = rinfo->route_pref;
842 	if (pref == ICMPV6_ROUTER_PREF_INVALID)
843 		return -EINVAL;
844 
845 	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
846 
847 	if (rinfo->length == 3)
848 		prefix = (struct in6_addr *)rinfo->prefix;
849 	else {
850 		/* safe: ipv6_addr_prefix() copies prefix_len bits and zeroes the rest */
851 		ipv6_addr_prefix(&prefix_buf,
852 				 (struct in6_addr *)rinfo->prefix,
853 				 rinfo->prefix_len);
854 		prefix = &prefix_buf;
855 	}
856 
857 	if (rinfo->prefix_len == 0)
858 		rt = rt6_get_dflt_router(net, gwaddr, dev);
859 	else
860 		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
861 					gwaddr, dev);
862 
863 	if (rt && !lifetime) {
864 		ip6_del_rt(net, rt);
865 		rt = NULL;
866 	}
867 
868 	if (!rt && lifetime)
869 		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
870 					dev, pref);
871 	else if (rt)
872 		rt->fib6_flags = RTF_ROUTEINFO |
873 				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
874 
875 	if (rt) {
876 		if (!addrconf_finite_timeout(lifetime))
877 			fib6_clean_expires(rt);
878 		else
879 			fib6_set_expires(rt, jiffies + HZ * lifetime);
880 
881 		fib6_info_release(rt);
882 	}
883 	return 0;
884 }
885 #endif
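/*
 * Note on the length checks in rt6_route_rcv() above: per RFC 4191 the
 * Route Information option length is in units of 8 octets, so length 1
 * carries no prefix bytes, length 2 carries 8 (64 bits) and length 3
 * carries the full 16. Only a length-3 option is used verbatim; shorter
 * ones are expanded through ipv6_addr_prefix().
 */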
886 
887 /*
888  *	Misc support functions
889  */
890 
891 /* called with rcu_lock held */
892 static struct net_device *ip6_rt_get_dev_rcu(const struct fib6_result *res)
893 {
894 	struct net_device *dev = res->nh->fib_nh_dev;
895 
896 	if (res->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
897 		/* for copies of local routes, dst->dev needs to be the
898 		 * device if it is a master device, the master device if
899 		 * device is enslaved, and the loopback as the default
900 		 */
901 		if (netif_is_l3_slave(dev) &&
902 		    !rt6_need_strict(&res->f6i->fib6_dst.addr))
903 			dev = l3mdev_master_dev_rcu(dev);
904 		else if (!netif_is_l3_master(dev))
905 			dev = dev_net(dev)->loopback_dev;
906 		/* last case is netif_is_l3_master(dev) is true in which
907 		 * case we want dev returned to be dev
908 		 */
909 	}
910 
911 	return dev;
912 }
913 
914 static const int fib6_prop[RTN_MAX + 1] = {
915 	[RTN_UNSPEC]	= 0,
916 	[RTN_UNICAST]	= 0,
917 	[RTN_LOCAL]	= 0,
918 	[RTN_BROADCAST]	= 0,
919 	[RTN_ANYCAST]	= 0,
920 	[RTN_MULTICAST]	= 0,
921 	[RTN_BLACKHOLE]	= -EINVAL,
922 	[RTN_UNREACHABLE] = -EHOSTUNREACH,
923 	[RTN_PROHIBIT]	= -EACCES,
924 	[RTN_THROW]	= -EAGAIN,
925 	[RTN_NAT]	= -EINVAL,
926 	[RTN_XRESOLVE]	= -EINVAL,
927 };
928 
929 static int ip6_rt_type_to_error(u8 fib6_type)
930 {
931 	return fib6_prop[fib6_type];
932 }
933 
934 static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
935 {
936 	unsigned short flags = 0;
937 
938 	if (rt->dst_nocount)
939 		flags |= DST_NOCOUNT;
940 	if (rt->dst_nopolicy)
941 		flags |= DST_NOPOLICY;
942 	if (rt->dst_host)
943 		flags |= DST_HOST;
944 
945 	return flags;
946 }
947 
948 static void ip6_rt_init_dst_reject(struct rt6_info *rt, u8 fib6_type)
949 {
950 	rt->dst.error = ip6_rt_type_to_error(fib6_type);
951 
952 	switch (fib6_type) {
953 	case RTN_BLACKHOLE:
954 		rt->dst.output = dst_discard_out;
955 		rt->dst.input = dst_discard;
956 		break;
957 	case RTN_PROHIBIT:
958 		rt->dst.output = ip6_pkt_prohibit_out;
959 		rt->dst.input = ip6_pkt_prohibit;
960 		break;
961 	case RTN_THROW:
962 	case RTN_UNREACHABLE:
963 	default:
964 		rt->dst.output = ip6_pkt_discard_out;
965 		rt->dst.input = ip6_pkt_discard;
966 		break;
967 	}
968 }
969 
970 static void ip6_rt_init_dst(struct rt6_info *rt, const struct fib6_result *res)
971 {
972 	struct fib6_info *f6i = res->f6i;
973 
974 	if (res->fib6_flags & RTF_REJECT) {
975 		ip6_rt_init_dst_reject(rt, res->fib6_type);
976 		return;
977 	}
978 
979 	rt->dst.error = 0;
980 	rt->dst.output = ip6_output;
981 
982 	if (res->fib6_type == RTN_LOCAL || res->fib6_type == RTN_ANYCAST) {
983 		rt->dst.input = ip6_input;
984 	} else if (ipv6_addr_type(&f6i->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
985 		rt->dst.input = ip6_mc_input;
986 	} else {
987 		rt->dst.input = ip6_forward;
988 	}
989 
990 	if (res->nh->fib_nh_lws) {
991 		rt->dst.lwtstate = lwtstate_get(res->nh->fib_nh_lws);
992 		lwtunnel_set_redirect(&rt->dst);
993 	}
994 
995 	rt->dst.lastuse = jiffies;
996 }
997 
998 /* Caller must already hold reference to @from */
999 static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
1000 {
1001 	rt->rt6i_flags &= ~RTF_EXPIRES;
1002 	rcu_assign_pointer(rt->from, from);
1003 	ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
1004 }
1005 
1006 /* Caller must already hold reference to f6i in result */
1007 static void ip6_rt_copy_init(struct rt6_info *rt, const struct fib6_result *res)
1008 {
1009 	const struct fib6_nh *nh = res->nh;
1010 	const struct net_device *dev = nh->fib_nh_dev;
1011 	struct fib6_info *f6i = res->f6i;
1012 
1013 	ip6_rt_init_dst(rt, res);
1014 
1015 	rt->rt6i_dst = f6i->fib6_dst;
1016 	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
1017 	rt->rt6i_flags = res->fib6_flags;
1018 	if (nh->fib_nh_gw_family) {
1019 		rt->rt6i_gateway = nh->fib_nh_gw6;
1020 		rt->rt6i_flags |= RTF_GATEWAY;
1021 	}
1022 	rt6_set_from(rt, f6i);
1023 #ifdef CONFIG_IPV6_SUBTREES
1024 	rt->rt6i_src = f6i->fib6_src;
1025 #endif
1026 }
1027 
1028 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
1029 					struct in6_addr *saddr)
1030 {
1031 	struct fib6_node *pn, *sn;
1032 	while (1) {
1033 		if (fn->fn_flags & RTN_TL_ROOT)
1034 			return NULL;
1035 		pn = rcu_dereference(fn->parent);
1036 		sn = FIB6_SUBTREE(pn);
1037 		if (sn && sn != fn)
1038 			fn = fib6_node_lookup(sn, NULL, saddr);
1039 		else
1040 			fn = pn;
1041 		if (fn->fn_flags & RTN_RTINFO)
1042 			return fn;
1043 	}
1044 }
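/*
 * Backtracking walks from a failed node toward the root. When
 * CONFIG_IPV6_SUBTREES is enabled, a parent that roots a source-address
 * subtree is searched with @saddr before the climb continues; RTN_RTINFO
 * marks a node that actually carries routes, and reaching RTN_TL_ROOT
 * means the lookup is exhausted.
 */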
1045 
1046 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt)
1047 {
1048 	struct rt6_info *rt = *prt;
1049 
1050 	if (dst_hold_safe(&rt->dst))
1051 		return true;
1052 	if (net) {
1053 		rt = net->ipv6.ip6_null_entry;
1054 		dst_hold(&rt->dst);
1055 	} else {
1056 		rt = NULL;
1057 	}
1058 	*prt = rt;
1059 	return false;
1060 }
1061 
1062 /* called with rcu_lock held */
1063 static struct rt6_info *ip6_create_rt_rcu(const struct fib6_result *res)
1064 {
1065 	struct net_device *dev = res->nh->fib_nh_dev;
1066 	struct fib6_info *f6i = res->f6i;
1067 	unsigned short flags;
1068 	struct rt6_info *nrt;
1069 
1070 	if (!fib6_info_hold_safe(f6i))
1071 		goto fallback;
1072 
1073 	flags = fib6_info_dst_flags(f6i);
1074 	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
1075 	if (!nrt) {
1076 		fib6_info_release(f6i);
1077 		goto fallback;
1078 	}
1079 
1080 	ip6_rt_copy_init(nrt, res);
1081 	return nrt;
1082 
1083 fallback:
1084 	nrt = dev_net(dev)->ipv6.ip6_null_entry;
1085 	dst_hold(&nrt->dst);
1086 	return nrt;
1087 }
1088 
1089 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
1090 					     struct fib6_table *table,
1091 					     struct flowi6 *fl6,
1092 					     const struct sk_buff *skb,
1093 					     int flags)
1094 {
1095 	struct fib6_result res = {};
1096 	struct fib6_node *fn;
1097 	struct rt6_info *rt;
1098 
1099 	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1100 		flags &= ~RT6_LOOKUP_F_IFACE;
1101 
1102 	rcu_read_lock();
1103 	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1104 restart:
1105 	res.f6i = rcu_dereference(fn->leaf);
1106 	if (!res.f6i)
1107 		res.f6i = net->ipv6.fib6_null_entry;
1108 	else
1109 		rt6_device_match(net, &res, &fl6->saddr, fl6->flowi6_oif,
1110 				 flags);
1111 
1112 	if (res.f6i == net->ipv6.fib6_null_entry) {
1113 		fn = fib6_backtrack(fn, &fl6->saddr);
1114 		if (fn)
1115 			goto restart;
1116 
1117 		rt = net->ipv6.ip6_null_entry;
1118 		dst_hold(&rt->dst);
1119 		goto out;
1120 	}
1121 
1122 	fib6_select_path(net, &res, fl6, fl6->flowi6_oif,
1123 			 fl6->flowi6_oif != 0, skb, flags);
1124 
1125 	/* Search through exception table */
1126 	rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
1127 	if (rt) {
1128 		if (ip6_hold_safe(net, &rt))
1129 			dst_use_noref(&rt->dst, jiffies);
1130 	} else {
1131 		rt = ip6_create_rt_rcu(&res);
1132 	}
1133 
1134 out:
1135 	trace_fib6_table_lookup(net, &res, table, fl6);
1136 
1137 	rcu_read_unlock();
1138 
1139 	return rt;
1140 }
1141 
1142 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
1143 				   const struct sk_buff *skb, int flags)
1144 {
1145 	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
1146 }
1147 EXPORT_SYMBOL_GPL(ip6_route_lookup);
1148 
1149 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
1150 			    const struct in6_addr *saddr, int oif,
1151 			    const struct sk_buff *skb, int strict)
1152 {
1153 	struct flowi6 fl6 = {
1154 		.flowi6_oif = oif,
1155 		.daddr = *daddr,
1156 	};
1157 	struct dst_entry *dst;
1158 	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
1159 
1160 	if (saddr) {
1161 		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
1162 		flags |= RT6_LOOKUP_F_HAS_SADDR;
1163 	}
1164 
1165 	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
1166 	if (dst->error == 0)
1167 		return (struct rt6_info *) dst;
1168 
1169 	dst_release(dst);
1170 
1171 	return NULL;
1172 }
1173 EXPORT_SYMBOL(rt6_lookup);
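/*
 * Illustrative use of rt6_lookup() (hypothetical helper, not in the
 * original source): the returned entry carries a reference that the
 * caller must drop with ip6_rt_put().
 */
static __maybe_unused bool example_dst_is_routable(struct net *net,
						   const struct in6_addr *daddr)
{
	struct rt6_info *rt = rt6_lookup(net, daddr, NULL, 0, NULL, 0);

	if (!rt)
		return false;
	ip6_rt_put(rt);	/* drop the reference taken by rt6_lookup() */
	return true;
}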
1174 
1175 /* ip6_ins_rt is called with table->tb6_lock NOT held.
1176  * It takes a new route entry; if the addition fails for any reason,
1177  * the route is released.
1178  * The caller must hold a dst reference before calling it.
1179  */
1180 
1181 static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
1182 			struct netlink_ext_ack *extack)
1183 {
1184 	int err;
1185 	struct fib6_table *table;
1186 
1187 	table = rt->fib6_table;
1188 	spin_lock_bh(&table->tb6_lock);
1189 	err = fib6_add(&table->tb6_root, rt, info, extack);
1190 	spin_unlock_bh(&table->tb6_lock);
1191 
1192 	return err;
1193 }
1194 
1195 int ip6_ins_rt(struct net *net, struct fib6_info *rt)
1196 {
1197 	struct nl_info info = {	.nl_net = net, };
1198 
1199 	return __ip6_ins_rt(rt, &info, NULL);
1200 }
1201 
1202 static struct rt6_info *ip6_rt_cache_alloc(const struct fib6_result *res,
1203 					   const struct in6_addr *daddr,
1204 					   const struct in6_addr *saddr)
1205 {
1206 	struct fib6_info *f6i = res->f6i;
1207 	struct net_device *dev;
1208 	struct rt6_info *rt;
1209 
1210 	/*
1211 	 *	Clone the route.
1212 	 */
1213 
1214 	if (!fib6_info_hold_safe(f6i))
1215 		return NULL;
1216 
1217 	dev = ip6_rt_get_dev_rcu(res);
1218 	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
1219 	if (!rt) {
1220 		fib6_info_release(f6i);
1221 		return NULL;
1222 	}
1223 
1224 	ip6_rt_copy_init(rt, res);
1225 	rt->rt6i_flags |= RTF_CACHE;
1226 	rt->dst.flags |= DST_HOST;
1227 	rt->rt6i_dst.addr = *daddr;
1228 	rt->rt6i_dst.plen = 128;
1229 
1230 	if (!rt6_is_gw_or_nonexthop(res)) {
1231 		if (f6i->fib6_dst.plen != 128 &&
1232 		    ipv6_addr_equal(&f6i->fib6_dst.addr, daddr))
1233 			rt->rt6i_flags |= RTF_ANYCAST;
1234 #ifdef CONFIG_IPV6_SUBTREES
1235 		if (rt->rt6i_src.plen && saddr) {
1236 			rt->rt6i_src.addr = *saddr;
1237 			rt->rt6i_src.plen = 128;
1238 		}
1239 #endif
1240 	}
1241 
1242 	return rt;
1243 }
1244 
1245 static struct rt6_info *ip6_rt_pcpu_alloc(const struct fib6_result *res)
1246 {
1247 	struct fib6_info *f6i = res->f6i;
1248 	unsigned short flags = fib6_info_dst_flags(f6i);
1249 	struct net_device *dev;
1250 	struct rt6_info *pcpu_rt;
1251 
1252 	if (!fib6_info_hold_safe(f6i))
1253 		return NULL;
1254 
1255 	rcu_read_lock();
1256 	dev = ip6_rt_get_dev_rcu(res);
1257 	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
1258 	rcu_read_unlock();
1259 	if (!pcpu_rt) {
1260 		fib6_info_release(f6i);
1261 		return NULL;
1262 	}
1263 	ip6_rt_copy_init(pcpu_rt, res);
1264 	pcpu_rt->rt6i_flags |= RTF_PCPU;
1265 	return pcpu_rt;
1266 }
1267 
1268 /* It should be called with rcu_read_lock() acquired */
1269 static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res)
1270 {
1271 	struct rt6_info *pcpu_rt, **p;
1272 
1273 	p = this_cpu_ptr(res->f6i->rt6i_pcpu);
1274 	pcpu_rt = *p;
1275 
1276 	if (pcpu_rt)
1277 		ip6_hold_safe(NULL, &pcpu_rt);
1278 
1279 	return pcpu_rt;
1280 }
1281 
1282 static struct rt6_info *rt6_make_pcpu_route(struct net *net,
1283 					    const struct fib6_result *res)
1284 {
1285 	struct rt6_info *pcpu_rt, *prev, **p;
1286 
1287 	pcpu_rt = ip6_rt_pcpu_alloc(res);
1288 	if (!pcpu_rt) {
1289 		dst_hold(&net->ipv6.ip6_null_entry->dst);
1290 		return net->ipv6.ip6_null_entry;
1291 	}
1292 
1293 	dst_hold(&pcpu_rt->dst);
1294 	p = this_cpu_ptr(res->f6i->rt6i_pcpu);
1295 	prev = cmpxchg(p, NULL, pcpu_rt);
1296 	BUG_ON(prev);
1297 
1298 	return pcpu_rt;
1299 }
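/*
 * The BUG_ON(prev) above holds because callers run with bottom halves
 * disabled (see ip6_pol_route()), so nothing else on this CPU can
 * populate the per-cpu slot between the failed rt6_get_pcpu_route() and
 * the cmpxchg().
 */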
1300 
1301 /* exception hash table implementation
1302  */
1303 static DEFINE_SPINLOCK(rt6_exception_lock);
1304 
1305 /* Remove rt6_ex from hash table and free the memory
1306  * Caller must hold rt6_exception_lock
1307  */
1308 static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1309 				 struct rt6_exception *rt6_ex)
1310 {
1311 	struct fib6_info *from;
1312 	struct net *net;
1313 
1314 	if (!bucket || !rt6_ex)
1315 		return;
1316 
1317 	net = dev_net(rt6_ex->rt6i->dst.dev);
1318 	net->ipv6.rt6_stats->fib_rt_cache--;
1319 
1320 	/* Completely purge the exception so that its held resources are released:
1321 	 * a socket's dst cache may otherwise keep the dst around indefinitely
1322 	 */
1323 	from = xchg((__force struct fib6_info **)&rt6_ex->rt6i->from, NULL);
1324 	fib6_info_release(from);
1325 	dst_dev_put(&rt6_ex->rt6i->dst);
1326 
1327 	hlist_del_rcu(&rt6_ex->hlist);
1328 	dst_release(&rt6_ex->rt6i->dst);
1329 	kfree_rcu(rt6_ex, rcu);
1330 	WARN_ON_ONCE(!bucket->depth);
1331 	bucket->depth--;
1332 }
1333 
1334 /* Remove oldest rt6_ex in bucket and free the memory
1335  * Caller must hold rt6_exception_lock
1336  */
1337 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1338 {
1339 	struct rt6_exception *rt6_ex, *oldest = NULL;
1340 
1341 	if (!bucket)
1342 		return;
1343 
1344 	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1345 		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1346 			oldest = rt6_ex;
1347 	}
1348 	rt6_remove_exception(bucket, oldest);
1349 }
1350 
1351 static u32 rt6_exception_hash(const struct in6_addr *dst,
1352 			      const struct in6_addr *src)
1353 {
1354 	static u32 seed __read_mostly;
1355 	u32 val;
1356 
1357 	net_get_random_once(&seed, sizeof(seed));
1358 	val = jhash(dst, sizeof(*dst), seed);
1359 
1360 #ifdef CONFIG_IPV6_SUBTREES
1361 	if (src)
1362 		val = jhash(src, sizeof(*src), val);
1363 #endif
1364 	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1365 }
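/*
 * Each fib6_info thus hashes its exceptions into an array of
 * (1 << FIB6_EXCEPTION_BUCKET_SIZE_SHIFT) hlist buckets: jhash over the
 * destination (plus the source when subtrees are enabled), seeded once
 * at boot, then folded to a bucket index by hash_32().
 */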
1366 
1367 /* Helper function to find the cached rt in the hash table
1368  * and update bucket pointer to point to the bucket for this
1369  * (daddr, saddr) pair
1370  * Caller must hold rt6_exception_lock
1371  */
1372 static struct rt6_exception *
1373 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1374 			      const struct in6_addr *daddr,
1375 			      const struct in6_addr *saddr)
1376 {
1377 	struct rt6_exception *rt6_ex;
1378 	u32 hval;
1379 
1380 	if (!(*bucket) || !daddr)
1381 		return NULL;
1382 
1383 	hval = rt6_exception_hash(daddr, saddr);
1384 	*bucket += hval;
1385 
1386 	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1387 		struct rt6_info *rt6 = rt6_ex->rt6i;
1388 		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1389 
1390 #ifdef CONFIG_IPV6_SUBTREES
1391 		if (matched && saddr)
1392 			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1393 #endif
1394 		if (matched)
1395 			return rt6_ex;
1396 	}
1397 	return NULL;
1398 }
1399 
1400 /* Helper function to find the cached rt in the hash table
1401  * and update bucket pointer to point to the bucket for this
1402  * (daddr, saddr) pair
1403  * Caller must hold rcu_read_lock()
1404  */
1405 static struct rt6_exception *
1406 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1407 			 const struct in6_addr *daddr,
1408 			 const struct in6_addr *saddr)
1409 {
1410 	struct rt6_exception *rt6_ex;
1411 	u32 hval;
1412 
1413 	WARN_ON_ONCE(!rcu_read_lock_held());
1414 
1415 	if (!(*bucket) || !daddr)
1416 		return NULL;
1417 
1418 	hval = rt6_exception_hash(daddr, saddr);
1419 	*bucket += hval;
1420 
1421 	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1422 		struct rt6_info *rt6 = rt6_ex->rt6i;
1423 		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1424 
1425 #ifdef CONFIG_IPV6_SUBTREES
1426 		if (matched && saddr)
1427 			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1428 #endif
1429 		if (matched)
1430 			return rt6_ex;
1431 	}
1432 	return NULL;
1433 }
1434 
1435 static unsigned int fib6_mtu(const struct fib6_result *res)
1436 {
1437 	const struct fib6_nh *nh = res->nh;
1438 	unsigned int mtu;
1439 
1440 	if (res->f6i->fib6_pmtu) {
1441 		mtu = res->f6i->fib6_pmtu;
1442 	} else {
1443 		struct net_device *dev = nh->fib_nh_dev;
1444 		struct inet6_dev *idev;
1445 
1446 		rcu_read_lock();
1447 		idev = __in6_dev_get(dev);
1448 		mtu = idev->cnf.mtu6;
1449 		rcu_read_unlock();
1450 	}
1451 
1452 	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1453 
1454 	return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
1455 }
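/*
 * MTU resolution order above: an explicit per-route PMTU wins, otherwise
 * the egress device's IPv6 MTU (idev->cnf.mtu6) applies; the result is
 * clamped to IP6_MAX_MTU and reduced by any lightweight-tunnel
 * encapsulation headroom, so e.g. a 1500-byte device MTU with an 8-byte
 * encap leaves 1492 for the inner packet.
 */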
1456 
1457 static int rt6_insert_exception(struct rt6_info *nrt,
1458 				const struct fib6_result *res)
1459 {
1460 	struct net *net = dev_net(nrt->dst.dev);
1461 	struct rt6_exception_bucket *bucket;
1462 	struct in6_addr *src_key = NULL;
1463 	struct rt6_exception *rt6_ex;
1464 	struct fib6_info *f6i = res->f6i;
1465 	int err = 0;
1466 
1467 	spin_lock_bh(&rt6_exception_lock);
1468 
1469 	if (f6i->exception_bucket_flushed) {
1470 		err = -EINVAL;
1471 		goto out;
1472 	}
1473 
1474 	bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket,
1475 					lockdep_is_held(&rt6_exception_lock));
1476 	if (!bucket) {
1477 		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
1478 				 GFP_ATOMIC);
1479 		if (!bucket) {
1480 			err = -ENOMEM;
1481 			goto out;
1482 		}
1483 		rcu_assign_pointer(f6i->rt6i_exception_bucket, bucket);
1484 	}
1485 
1486 #ifdef CONFIG_IPV6_SUBTREES
1487 	/* fib6_src.plen != 0 indicates f6i is in subtree
1488 	 * and exception table is indexed by a hash of
1489 	 * both fib6_dst and fib6_src.
1490 	 * Otherwise, the exception table is indexed by
1491 	 * a hash of only fib6_dst.
1492 	 */
1493 	if (f6i->fib6_src.plen)
1494 		src_key = &nrt->rt6i_src.addr;
1495 #endif
1496 	/* rt6_mtu_change() might lower mtu on f6i.
1497 	 * Only insert this exception route if its mtu
1498 	 * is less than f6i's mtu value.
1499 	 */
1500 	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(res)) {
1501 		err = -EINVAL;
1502 		goto out;
1503 	}
1504 
1505 	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
1506 					       src_key);
1507 	if (rt6_ex)
1508 		rt6_remove_exception(bucket, rt6_ex);
1509 
1510 	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
1511 	if (!rt6_ex) {
1512 		err = -ENOMEM;
1513 		goto out;
1514 	}
1515 	rt6_ex->rt6i = nrt;
1516 	rt6_ex->stamp = jiffies;
1517 	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
1518 	bucket->depth++;
1519 	net->ipv6.rt6_stats->fib_rt_cache++;
1520 
1521 	if (bucket->depth > FIB6_MAX_DEPTH)
1522 		rt6_exception_remove_oldest(bucket);
1523 
1524 out:
1525 	spin_unlock_bh(&rt6_exception_lock);
1526 
1527 	/* Update fn->fn_sernum to invalidate all cached dst */
1528 	if (!err) {
1529 		spin_lock_bh(&f6i->fib6_table->tb6_lock);
1530 		fib6_update_sernum(net, f6i);
1531 		spin_unlock_bh(&f6i->fib6_table->tb6_lock);
1532 		fib6_force_start_gc(net);
1533 	}
1534 
1535 	return err;
1536 }
1537 
1538 void rt6_flush_exceptions(struct fib6_info *rt)
1539 {
1540 	struct rt6_exception_bucket *bucket;
1541 	struct rt6_exception *rt6_ex;
1542 	struct hlist_node *tmp;
1543 	int i;
1544 
1545 	spin_lock_bh(&rt6_exception_lock);
1546 	/* Prevent rt6_insert_exception() from recreating the bucket list */
1547 	rt->exception_bucket_flushed = 1;
1548 
1549 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1550 				    lockdep_is_held(&rt6_exception_lock));
1551 	if (!bucket)
1552 		goto out;
1553 
1554 	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1555 		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1556 			rt6_remove_exception(bucket, rt6_ex);
1557 		WARN_ON_ONCE(bucket->depth);
1558 		bucket++;
1559 	}
1560 
1561 out:
1562 	spin_unlock_bh(&rt6_exception_lock);
1563 }
1564 
1565 /* Find the cached rt in the hash table of the passed-in fib6_result
1566  * Caller has to hold rcu_read_lock()
1567  */
1568 static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
1569 					   struct in6_addr *daddr,
1570 					   struct in6_addr *saddr)
1571 {
1572 	struct rt6_exception_bucket *bucket;
1573 	struct in6_addr *src_key = NULL;
1574 	struct rt6_exception *rt6_ex;
1575 	struct rt6_info *ret = NULL;
1576 
1577 	bucket = rcu_dereference(res->f6i->rt6i_exception_bucket);
1578 
1579 #ifdef CONFIG_IPV6_SUBTREES
1580 	/* fib6_src.plen != 0 indicates f6i is in subtree
1581 	 * and exception table is indexed by a hash of
1582 	 * both fib6_dst and fib6_src.
1583 	 * Otherwise, the exception table is indexed by
1584 	 * a hash of only fib6_dst.
1585 	 */
1586 	if (res->f6i->fib6_src.plen)
1587 		src_key = saddr;
1588 #endif
1589 	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
1590 
1591 	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1592 		ret = rt6_ex->rt6i;
1593 
1594 	return ret;
1595 }
1596 
1597 /* Remove the passed in cached rt from the hash table that contains it */
1598 static int rt6_remove_exception_rt(struct rt6_info *rt)
1599 {
1600 	struct rt6_exception_bucket *bucket;
1601 	struct in6_addr *src_key = NULL;
1602 	struct rt6_exception *rt6_ex;
1603 	struct fib6_info *from;
1604 	int err;
1605 
1606 	from = rcu_dereference(rt->from);
1607 	if (!from ||
1608 	    !(rt->rt6i_flags & RTF_CACHE))
1609 		return -EINVAL;
1610 
1611 	if (!rcu_access_pointer(from->rt6i_exception_bucket))
1612 		return -ENOENT;
1613 
1614 	spin_lock_bh(&rt6_exception_lock);
1615 	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1616 				    lockdep_is_held(&rt6_exception_lock));
1617 #ifdef CONFIG_IPV6_SUBTREES
1618 	/* rt6i_src.plen != 0 indicates 'from' is in subtree
1619 	 * and exception table is indexed by a hash of
1620 	 * both rt6i_dst and rt6i_src.
1621 	 * Otherwise, the exception table is indexed by
1622 	 * a hash of only rt6i_dst.
1623 	 */
1624 	if (from->fib6_src.plen)
1625 		src_key = &rt->rt6i_src.addr;
1626 #endif
1627 	rt6_ex = __rt6_find_exception_spinlock(&bucket,
1628 					       &rt->rt6i_dst.addr,
1629 					       src_key);
1630 	if (rt6_ex) {
1631 		rt6_remove_exception(bucket, rt6_ex);
1632 		err = 0;
1633 	} else {
1634 		err = -ENOENT;
1635 	}
1636 
1637 	spin_unlock_bh(&rt6_exception_lock);
1638 	return err;
1639 }
1640 
1641 /* Find rt6_ex which contains the passed in rt cache and
1642  * refresh its stamp
1643  */
1644 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1645 {
1646 	struct rt6_exception_bucket *bucket;
1647 	struct in6_addr *src_key = NULL;
1648 	struct rt6_exception *rt6_ex;
1649 	struct fib6_info *from;
1650 
1651 	rcu_read_lock();
1652 	from = rcu_dereference(rt->from);
1653 	if (!from || !(rt->rt6i_flags & RTF_CACHE))
1654 		goto unlock;
1655 
1656 	bucket = rcu_dereference(from->rt6i_exception_bucket);
1657 
1658 #ifdef CONFIG_IPV6_SUBTREES
1659 	/* rt6i_src.plen != 0 indicates 'from' is in subtree
1660 	 * and exception table is indexed by a hash of
1661 	 * both rt6i_dst and rt6i_src.
1662 	 * Otherwise, the exception table is indexed by
1663 	 * a hash of only rt6i_dst.
1664 	 */
1665 	if (from->fib6_src.plen)
1666 		src_key = &rt->rt6i_src.addr;
1667 #endif
1668 	rt6_ex = __rt6_find_exception_rcu(&bucket,
1669 					  &rt->rt6i_dst.addr,
1670 					  src_key);
1671 	if (rt6_ex)
1672 		rt6_ex->stamp = jiffies;
1673 
1674 unlock:
1675 	rcu_read_unlock();
1676 }
1677 
1678 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1679 					 struct rt6_info *rt, int mtu)
1680 {
1681 	/* If the new MTU is lower than the route PMTU, this new MTU will be the
1682 	 * lowest MTU in the path: always allow updating the route PMTU to
1683 	 * reflect PMTU decreases.
1684 	 *
1685 	 * If the new MTU is higher, and the route PMTU is equal to the local
1686 	 * MTU, this means the old MTU is the lowest in the path, so allow
1687 	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
1688 	 * handle this.
1689 	 */
1690 
1691 	if (dst_mtu(&rt->dst) >= mtu)
1692 		return true;
1693 
1694 	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
1695 		return true;
1696 
1697 	return false;
1698 }
1699 
1700 static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
1701 				       struct fib6_info *rt, int mtu)
1702 {
1703 	struct rt6_exception_bucket *bucket;
1704 	struct rt6_exception *rt6_ex;
1705 	int i;
1706 
1707 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1708 					lockdep_is_held(&rt6_exception_lock));
1709 
1710 	if (!bucket)
1711 		return;
1712 
1713 	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1714 		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1715 			struct rt6_info *entry = rt6_ex->rt6i;
1716 
1717 			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
1718 			 * route), the metrics of its rt->from have already
1719 			 * been updated.
1720 			 */
1721 			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
1722 			    rt6_mtu_change_route_allowed(idev, entry, mtu))
1723 				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
1724 		}
1725 		bucket++;
1726 	}
1727 }
1728 
1729 #define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)
1730 
1731 static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
1732 					struct in6_addr *gateway)
1733 {
1734 	struct rt6_exception_bucket *bucket;
1735 	struct rt6_exception *rt6_ex;
1736 	struct hlist_node *tmp;
1737 	int i;
1738 
1739 	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1740 		return;
1741 
1742 	spin_lock_bh(&rt6_exception_lock);
1743 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1744 				     lockdep_is_held(&rt6_exception_lock));
1745 
1746 	if (bucket) {
1747 		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1748 			hlist_for_each_entry_safe(rt6_ex, tmp,
1749 						  &bucket->chain, hlist) {
1750 				struct rt6_info *entry = rt6_ex->rt6i;
1751 
1752 				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
1753 				    RTF_CACHE_GATEWAY &&
1754 				    ipv6_addr_equal(gateway,
1755 						    &entry->rt6i_gateway)) {
1756 					rt6_remove_exception(bucket, rt6_ex);
1757 				}
1758 			}
1759 			bucket++;
1760 		}
1761 	}
1762 
1763 	spin_unlock_bh(&rt6_exception_lock);
1764 }
1765 
1766 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
1767 				      struct rt6_exception *rt6_ex,
1768 				      struct fib6_gc_args *gc_args,
1769 				      unsigned long now)
1770 {
1771 	struct rt6_info *rt = rt6_ex->rt6i;
1772 
1773 	/* We prune and obsolete aged-out and non-gateway exceptions even if
1774 	 * others still hold references to them, so that such references can
1775 	 * be dropped on the next dst_check().
1776 	 * RTF_EXPIRES exceptions (e.g. pmtu-generated ones) are pruned once
1777 	 * expired, independently of their age, as per RFC 8201 section 4.
1778 	 */
1779 	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
1780 		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
1781 			RT6_TRACE("aging clone %p\n", rt);
1782 			rt6_remove_exception(bucket, rt6_ex);
1783 			return;
1784 		}
1785 	} else if (time_after(jiffies, rt->dst.expires)) {
1786 		RT6_TRACE("purging expired route %p\n", rt);
1787 		rt6_remove_exception(bucket, rt6_ex);
1788 		return;
1789 	}
1790 
1791 	if (rt->rt6i_flags & RTF_GATEWAY) {
1792 		struct neighbour *neigh;
1793 		__u8 neigh_flags = 0;
1794 
1795 		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
1796 		if (neigh)
1797 			neigh_flags = neigh->flags;
1798 
1799 		if (!(neigh_flags & NTF_ROUTER)) {
1800 			RT6_TRACE("purging route %p via non-router but gateway\n",
1801 				  rt);
1802 			rt6_remove_exception(bucket, rt6_ex);
1803 			return;
1804 		}
1805 	}
1806 
1807 	gc_args->more++;
1808 }
1809 
1810 void rt6_age_exceptions(struct fib6_info *rt,
1811 			struct fib6_gc_args *gc_args,
1812 			unsigned long now)
1813 {
1814 	struct rt6_exception_bucket *bucket;
1815 	struct rt6_exception *rt6_ex;
1816 	struct hlist_node *tmp;
1817 	int i;
1818 
1819 	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1820 		return;
1821 
1822 	rcu_read_lock_bh();
1823 	spin_lock(&rt6_exception_lock);
1824 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1825 				    lockdep_is_held(&rt6_exception_lock));
1826 
1827 	if (bucket) {
1828 		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1829 			hlist_for_each_entry_safe(rt6_ex, tmp,
1830 						  &bucket->chain, hlist) {
1831 				rt6_age_examine_exception(bucket, rt6_ex,
1832 							  gc_args, now);
1833 			}
1834 			bucket++;
1835 		}
1836 	}
1837 	spin_unlock(&rt6_exception_lock);
1838 	rcu_read_unlock_bh();
1839 }
1840 
1841 /* must be called with rcu lock held */
1842 int fib6_table_lookup(struct net *net, struct fib6_table *table, int oif,
1843 		      struct flowi6 *fl6, struct fib6_result *res, int strict)
1844 {
1845 	struct fib6_node *fn, *saved_fn;
1846 
1847 	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1848 	saved_fn = fn;
1849 
1850 	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1851 		oif = 0;
1852 
1853 redo_rt6_select:
1854 	rt6_select(net, fn, oif, res, strict);
1855 	if (res->f6i == net->ipv6.fib6_null_entry) {
1856 		fn = fib6_backtrack(fn, &fl6->saddr);
1857 		if (fn)
1858 			goto redo_rt6_select;
1859 		else if (strict & RT6_LOOKUP_F_REACHABLE) {
1860 			/* also consider unreachable route */
1861 			strict &= ~RT6_LOOKUP_F_REACHABLE;
1862 			fn = saved_fn;
1863 			goto redo_rt6_select;
1864 		}
1865 	}
1866 
1867 	trace_fib6_table_lookup(net, res, table, fl6);
1868 
1869 	return 0;
1870 }
1871 
1872 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1873 			       int oif, struct flowi6 *fl6,
1874 			       const struct sk_buff *skb, int flags)
1875 {
1876 	struct fib6_result res = {};
1877 	struct rt6_info *rt;
1878 	int strict = 0;
1879 
1880 	strict |= flags & RT6_LOOKUP_F_IFACE;
1881 	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1882 	if (net->ipv6.devconf_all->forwarding == 0)
1883 		strict |= RT6_LOOKUP_F_REACHABLE;
1884 
1885 	rcu_read_lock();
1886 
1887 	fib6_table_lookup(net, table, oif, fl6, &res, strict);
1888 	if (res.f6i == net->ipv6.fib6_null_entry) {
1889 		rt = net->ipv6.ip6_null_entry;
1890 		rcu_read_unlock();
1891 		dst_hold(&rt->dst);
1892 		return rt;
1893 	}
1894 
1895 	fib6_select_path(net, &res, fl6, oif, false, skb, strict);
1896 
1897 	/* Search through exception table */
1898 	rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
1899 	if (rt) {
1900 		if (ip6_hold_safe(net, &rt))
1901 			dst_use_noref(&rt->dst, jiffies);
1902 
1903 		rcu_read_unlock();
1904 		return rt;
1905 	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1906 			    !res.nh->fib_nh_gw_family)) {
1907 		/* Create a RTF_CACHE clone which will not be
1908 		 * owned by the fib6 tree.  It is for the special case where
1909 		 * the daddr in the skb during the neighbor look-up is different
1910 		 * from the fl6->daddr used to look-up route here.
1911 		 */
1912 		struct rt6_info *uncached_rt;
1913 
1914 		uncached_rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL);
1915 
1916 		rcu_read_unlock();
1917 
1918 		if (uncached_rt) {
1919 			/* uncached_rt's refcnt is taken during ip6_rt_cache_alloc(),
1920 			 * so no need for another dst_hold()
1921 			 */
1922 			rt6_uncached_list_add(uncached_rt);
1923 			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
1924 		} else {
1925 			uncached_rt = net->ipv6.ip6_null_entry;
1926 			dst_hold(&uncached_rt->dst);
1927 		}
1928 
1929 		return uncached_rt;
1930 	} else {
1931 		/* Get a percpu copy */
1932 
1933 		struct rt6_info *pcpu_rt;
1934 
1935 		local_bh_disable();
1936 		pcpu_rt = rt6_get_pcpu_route(&res);
1937 
1938 		if (!pcpu_rt)
1939 			pcpu_rt = rt6_make_pcpu_route(net, &res);
1940 
1941 		local_bh_enable();
1942 		rcu_read_unlock();
1943 
1944 		return pcpu_rt;
1945 	}
1946 }
1947 EXPORT_SYMBOL_GPL(ip6_pol_route);
1948 
1949 static struct rt6_info *ip6_pol_route_input(struct net *net,
1950 					    struct fib6_table *table,
1951 					    struct flowi6 *fl6,
1952 					    const struct sk_buff *skb,
1953 					    int flags)
1954 {
1955 	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
1956 }
1957 
1958 struct dst_entry *ip6_route_input_lookup(struct net *net,
1959 					 struct net_device *dev,
1960 					 struct flowi6 *fl6,
1961 					 const struct sk_buff *skb,
1962 					 int flags)
1963 {
1964 	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1965 		flags |= RT6_LOOKUP_F_IFACE;
1966 
1967 	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
1968 }
1969 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1970 
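/* For ICMPv6 errors, derive the hash keys from the embedded
 * (offending) packet header instead of the outer one, so that error
 * messages follow the same multipath route as the flow they concern.
 */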
1971 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1972 				  struct flow_keys *keys,
1973 				  struct flow_keys *flkeys)
1974 {
1975 	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1976 	const struct ipv6hdr *key_iph = outer_iph;
1977 	struct flow_keys *_flkeys = flkeys;
1978 	const struct ipv6hdr *inner_iph;
1979 	const struct icmp6hdr *icmph;
1980 	struct ipv6hdr _inner_iph;
1981 	struct icmp6hdr _icmph;
1982 
1983 	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1984 		goto out;
1985 
1986 	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
1987 				   sizeof(_icmph), &_icmph);
1988 	if (!icmph)
1989 		goto out;
1990 
1991 	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1992 	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1993 	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1994 	    icmph->icmp6_type != ICMPV6_PARAMPROB)
1995 		goto out;
1996 
1997 	inner_iph = skb_header_pointer(skb,
1998 				       skb_transport_offset(skb) + sizeof(*icmph),
1999 				       sizeof(_inner_iph), &_inner_iph);
2000 	if (!inner_iph)
2001 		goto out;
2002 
2003 	key_iph = inner_iph;
2004 	_flkeys = NULL;
2005 out:
2006 	if (_flkeys) {
2007 		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
2008 		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
2009 		keys->tags.flow_label = _flkeys->tags.flow_label;
2010 		keys->basic.ip_proto = _flkeys->basic.ip_proto;
2011 	} else {
2012 		keys->addrs.v6addrs.src = key_iph->saddr;
2013 		keys->addrs.v6addrs.dst = key_iph->daddr;
2014 		keys->tags.flow_label = ip6_flowlabel(key_iph);
2015 		keys->basic.ip_proto = key_iph->nexthdr;
2016 	}
2017 }
2018 
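/* Multipath hash computation.  Policy 0 (the default) hashes on L3
 * keys only: source/destination address, flow label and protocol.
 * Policy 1 hashes on L4 keys: addresses plus transport ports and
 * protocol.  The result is shifted right by one so it fits in 31 bits,
 * matching the 31-bit nexthop upper bounds used for path selection.
 */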
2019 /* if skb is set it will be used and fl6 can be NULL */
2020 u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
2021 		       const struct sk_buff *skb, struct flow_keys *flkeys)
2022 {
2023 	struct flow_keys hash_keys;
2024 	u32 mhash;
2025 
2026 	switch (ip6_multipath_hash_policy(net)) {
2027 	case 0:
2028 		memset(&hash_keys, 0, sizeof(hash_keys));
2029 		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2030 		if (skb) {
2031 			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
2032 		} else {
2033 			hash_keys.addrs.v6addrs.src = fl6->saddr;
2034 			hash_keys.addrs.v6addrs.dst = fl6->daddr;
2035 			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
2036 			hash_keys.basic.ip_proto = fl6->flowi6_proto;
2037 		}
2038 		break;
2039 	case 1:
2040 		if (skb) {
2041 			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
2042 			struct flow_keys keys;
2043 
2044 			/* short-circuit if we already have an L4 hash present */
2045 			if (skb->l4_hash)
2046 				return skb_get_hash_raw(skb) >> 1;
2047 
2048 			memset(&hash_keys, 0, sizeof(hash_keys));
2049 
2050 			if (!flkeys) {
2051 				skb_flow_dissect_flow_keys(skb, &keys, flag);
2052 				flkeys = &keys;
2053 			}
2054 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2055 			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
2056 			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
2057 			hash_keys.ports.src = flkeys->ports.src;
2058 			hash_keys.ports.dst = flkeys->ports.dst;
2059 			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2060 		} else {
2061 			memset(&hash_keys, 0, sizeof(hash_keys));
2062 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2063 			hash_keys.addrs.v6addrs.src = fl6->saddr;
2064 			hash_keys.addrs.v6addrs.dst = fl6->daddr;
2065 			hash_keys.ports.src = fl6->fl6_sport;
2066 			hash_keys.ports.dst = fl6->fl6_dport;
2067 			hash_keys.basic.ip_proto = fl6->flowi6_proto;
2068 		}
2069 		break;
2070 	}
2071 	mhash = flow_hash_from_keys(&hash_keys);
2072 
2073 	return mhash >> 1;
2074 }
2075 
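/* Input route lookup entry point: build a flowi6 from the packet
 * (including tunnel metadata, if any), compute a multipath hash for
 * ICMPv6 so errors stick to one path, and attach the resulting dst
 * to the skb.
 */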
2076 void ip6_route_input(struct sk_buff *skb)
2077 {
2078 	const struct ipv6hdr *iph = ipv6_hdr(skb);
2079 	struct net *net = dev_net(skb->dev);
2080 	int flags = RT6_LOOKUP_F_HAS_SADDR;
2081 	struct ip_tunnel_info *tun_info;
2082 	struct flowi6 fl6 = {
2083 		.flowi6_iif = skb->dev->ifindex,
2084 		.daddr = iph->daddr,
2085 		.saddr = iph->saddr,
2086 		.flowlabel = ip6_flowinfo(iph),
2087 		.flowi6_mark = skb->mark,
2088 		.flowi6_proto = iph->nexthdr,
2089 	};
2090 	struct flow_keys *flkeys = NULL, _flkeys;
2091 
2092 	tun_info = skb_tunnel_info(skb);
2093 	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2094 		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
2095 
2096 	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
2097 		flkeys = &_flkeys;
2098 
2099 	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
2100 		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
2101 	skb_dst_drop(skb);
2102 	skb_dst_set(skb,
2103 		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
2104 }
2105 
2106 static struct rt6_info *ip6_pol_route_output(struct net *net,
2107 					     struct fib6_table *table,
2108 					     struct flowi6 *fl6,
2109 					     const struct sk_buff *skb,
2110 					     int flags)
2111 {
2112 	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
2113 }
2114 
2115 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
2116 					 struct flowi6 *fl6, int flags)
2117 {
2118 	bool any_src;
2119 
2120 	if (ipv6_addr_type(&fl6->daddr) &
2121 	    (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
2122 		struct dst_entry *dst;
2123 
2124 		dst = l3mdev_link_scope_lookup(net, fl6);
2125 		if (dst)
2126 			return dst;
2127 	}
2128 
2129 	fl6->flowi6_iif = LOOPBACK_IFINDEX;
2130 
2131 	any_src = ipv6_addr_any(&fl6->saddr);
2132 	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
2133 	    (fl6->flowi6_oif && any_src))
2134 		flags |= RT6_LOOKUP_F_IFACE;
2135 
2136 	if (!any_src)
2137 		flags |= RT6_LOOKUP_F_HAS_SADDR;
2138 	else if (sk)
2139 		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
2140 
2141 	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
2142 }
2143 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
2144 
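/* Replace dst_orig with a blackhole dst that keeps the original's
 * metrics and keys but discards everything sent through it; used by
 * the xfrm code, e.g. while IPsec state resolution is still pending.
 */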
2145 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2146 {
2147 	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
2148 	struct net_device *loopback_dev = net->loopback_dev;
2149 	struct dst_entry *new = NULL;
2150 
2151 	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
2152 		       DST_OBSOLETE_DEAD, 0);
2153 	if (rt) {
2154 		rt6_info_init(rt);
2155 		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
2156 
2157 		new = &rt->dst;
2158 		new->__use = 1;
2159 		new->input = dst_discard;
2160 		new->output = dst_discard_out;
2161 
2162 		dst_copy_metrics(new, &ort->dst);
2163 
2164 		rt->rt6i_idev = in6_dev_get(loopback_dev);
2165 		rt->rt6i_gateway = ort->rt6i_gateway;
2166 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
2167 
2168 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
2169 #ifdef CONFIG_IPV6_SUBTREES
2170 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
2171 #endif
2172 	}
2173 
2174 	dst_release(dst_orig);
2175 	return new ? new : ERR_PTR(-ENOMEM);
2176 }
2177 
2178 /*
2179  *	Destination cache support functions
2180  */
2181 
2182 static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2183 {
2184 	u32 rt_cookie = 0;
2185 
2186 	if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2187 		return false;
2188 
2189 	if (fib6_check_expired(f6i))
2190 		return false;
2191 
2192 	return true;
2193 }
2194 
2195 static struct dst_entry *rt6_check(struct rt6_info *rt,
2196 				   struct fib6_info *from,
2197 				   u32 cookie)
2198 {
2199 	u32 rt_cookie = 0;
2200 
2201 	if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
2202 	    rt_cookie != cookie)
2203 		return NULL;
2204 
2205 	if (rt6_check_expired(rt))
2206 		return NULL;
2207 
2208 	return &rt->dst;
2209 }
2210 
2211 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2212 					    struct fib6_info *from,
2213 					    u32 cookie)
2214 {
2215 	if (!__rt6_check_expired(rt) &&
2216 	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2217 	    fib6_check(from, cookie))
2218 		return &rt->dst;
2219 	else
2220 		return NULL;
2221 }
2222 
2223 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
2224 {
2225 	struct dst_entry *dst_ret;
2226 	struct fib6_info *from;
2227 	struct rt6_info *rt;
2228 
2229 	rt = container_of(dst, struct rt6_info, dst);
2230 
2231 	rcu_read_lock();
2232 
2233 	/* All IPv6 dsts are created with ->obsolete set to the value
2234 	 * DST_OBSOLETE_FORCE_CHK, which forces validation calls down
2235 	 * into this function on every use.
2236 	 */
2237 
2238 	from = rcu_dereference(rt->from);
2239 
2240 	if (from && (rt->rt6i_flags & RTF_PCPU ||
2241 	    unlikely(!list_empty(&rt->rt6i_uncached))))
2242 		dst_ret = rt6_dst_from_check(rt, from, cookie);
2243 	else
2244 		dst_ret = rt6_check(rt, from, cookie);
2245 
2246 	rcu_read_unlock();
2247 
2248 	return dst_ret;
2249 }
2250 
2251 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
2252 {
2253 	struct rt6_info *rt = (struct rt6_info *) dst;
2254 
2255 	if (rt) {
2256 		if (rt->rt6i_flags & RTF_CACHE) {
2257 			rcu_read_lock();
2258 			if (rt6_check_expired(rt)) {
2259 				rt6_remove_exception_rt(rt);
2260 				dst = NULL;
2261 			}
2262 			rcu_read_unlock();
2263 		} else {
2264 			dst_release(dst);
2265 			dst = NULL;
2266 		}
2267 	}
2268 	return dst;
2269 }
2270 
2271 static void ip6_link_failure(struct sk_buff *skb)
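/* Called on output path failure: report unreachability to the sender
 * and either drop the cached exception or invalidate the fib node's
 * sernum so default-route users re-run the lookup.
 */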
2272 {
2273 	struct rt6_info *rt;
2274 
2275 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2276 
2277 	rt = (struct rt6_info *) skb_dst(skb);
2278 	if (rt) {
2279 		rcu_read_lock();
2280 		if (rt->rt6i_flags & RTF_CACHE) {
2281 			rt6_remove_exception_rt(rt);
2282 		} else {
2283 			struct fib6_info *from;
2284 			struct fib6_node *fn;
2285 
2286 			from = rcu_dereference(rt->from);
2287 			if (from) {
2288 				fn = rcu_dereference(from->fib6_node);
2289 				if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2290 					fn->fn_sernum = -1;
2291 			}
2292 		}
2293 		rcu_read_unlock();
2294 	}
2295 }
2296 
2297 static void rt6_update_expires(struct rt6_info *rt0, int timeout)
2298 {
2299 	if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
2300 		struct fib6_info *from;
2301 
2302 		rcu_read_lock();
2303 		from = rcu_dereference(rt0->from);
2304 		if (from)
2305 			rt0->dst.expires = from->expires;
2306 		rcu_read_unlock();
2307 	}
2308 
2309 	dst_set_expires(&rt0->dst, timeout);
2310 	rt0->rt6i_flags |= RTF_EXPIRES;
2311 }
2312 
2313 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2314 {
2315 	struct net *net = dev_net(rt->dst.dev);
2316 
2317 	dst_metric_set(&rt->dst, RTAX_MTU, mtu);
2318 	rt->rt6i_flags |= RTF_MODIFIED;
2319 	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2320 }
2321 
2322 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2323 {
2324 	return !(rt->rt6i_flags & RTF_CACHE) &&
2325 		(rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from));
2326 }
2327 
2328 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2329 				 const struct ipv6hdr *iph, u32 mtu)
2330 {
2331 	const struct in6_addr *daddr, *saddr;
2332 	struct rt6_info *rt6 = (struct rt6_info *)dst;
2333 
2334 	if (dst_metric_locked(dst, RTAX_MTU))
2335 		return;
2336 
2337 	if (iph) {
2338 		daddr = &iph->daddr;
2339 		saddr = &iph->saddr;
2340 	} else if (sk) {
2341 		daddr = &sk->sk_v6_daddr;
2342 		saddr = &inet6_sk(sk)->saddr;
2343 	} else {
2344 		daddr = NULL;
2345 		saddr = NULL;
2346 	}
2347 	dst_confirm_neigh(dst, daddr);
2348 	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2349 	if (mtu >= dst_mtu(dst))
2350 		return;
2351 
2352 	if (!rt6_cache_allowed_for_pmtu(rt6)) {
2353 		rt6_do_update_pmtu(rt6, mtu);
2354 		/* update rt6_ex->stamp for cache */
2355 		if (rt6->rt6i_flags & RTF_CACHE)
2356 			rt6_update_exception_stamp_rt(rt6);
2357 	} else if (daddr) {
2358 		struct fib6_result res = {};
2359 		struct rt6_info *nrt6;
2360 
2361 		rcu_read_lock();
2362 		res.f6i = rcu_dereference(rt6->from);
2363 		if (!res.f6i) {
2364 			rcu_read_unlock();
2365 			return;
2366 		}
2367 		res.nh = &res.f6i->fib6_nh;
2368 		res.fib6_flags = res.f6i->fib6_flags;
2369 		res.fib6_type = res.f6i->fib6_type;
2370 
2371 		nrt6 = ip6_rt_cache_alloc(&res, daddr, saddr);
2372 		if (nrt6) {
2373 			rt6_do_update_pmtu(nrt6, mtu);
2374 			if (rt6_insert_exception(nrt6, &res))
2375 				dst_release_immediate(&nrt6->dst);
2376 		}
2377 		rcu_read_unlock();
2378 	}
2379 }
2380 
2381 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2382 			       struct sk_buff *skb, u32 mtu)
2383 {
2384 	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2385 }
2386 
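/* Route-independent PMTU update keyed off the packet that triggered an
 * ICMPV6_PKT_TOOBIG error.  A minimal, hypothetical caller sketch
 * (tunnel drivers follow this pattern; "mtu_from_icmp" is an assumed
 * local holding the __be32 MTU reported in the error):
 *
 *	ip6_update_pmtu(skb, dev_net(skb->dev), mtu_from_icmp,
 *			0, 0, sock_net_uid(dev_net(skb->dev), NULL));
 */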
2387 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2388 		     int oif, u32 mark, kuid_t uid)
2389 {
2390 	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2391 	struct dst_entry *dst;
2392 	struct flowi6 fl6 = {
2393 		.flowi6_oif = oif,
2394 		.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark),
2395 		.daddr = iph->daddr,
2396 		.saddr = iph->saddr,
2397 		.flowlabel = ip6_flowinfo(iph),
2398 		.flowi6_uid = uid,
2399 	};
2400 
2401 	dst = ip6_route_output(net, NULL, &fl6);
2402 	if (!dst->error)
2403 		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2404 	dst_release(dst);
2405 }
2406 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2407 
2408 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2409 {
2410 	int oif = sk->sk_bound_dev_if;
2411 	struct dst_entry *dst;
2412 
2413 	if (!oif && skb->dev)
2414 		oif = l3mdev_master_ifindex(skb->dev);
2415 
2416 	ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid);
2417 
2418 	dst = __sk_dst_get(sk);
2419 	if (!dst || !dst->obsolete ||
2420 	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2421 		return;
2422 
2423 	bh_lock_sock(sk);
2424 	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2425 		ip6_datagram_dst_update(sk, false);
2426 	bh_unlock_sock(sk);
2427 }
2428 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2429 
2430 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
2431 			   const struct flowi6 *fl6)
2432 {
2433 #ifdef CONFIG_IPV6_SUBTREES
2434 	struct ipv6_pinfo *np = inet6_sk(sk);
2435 #endif
2436 
2437 	ip6_dst_store(sk, dst,
2438 		      ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
2439 		      &sk->sk_v6_daddr : NULL,
2440 #ifdef CONFIG_IPV6_SUBTREES
2441 		      ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
2442 		      &np->saddr :
2443 #endif
2444 		      NULL);
2445 }
2446 
2447 static bool ip6_redirect_nh_match(const struct fib6_result *res,
2448 				  struct flowi6 *fl6,
2449 				  const struct in6_addr *gw,
2450 				  struct rt6_info **ret)
2451 {
2452 	const struct fib6_nh *nh = res->nh;
2453 
2454 	if (nh->fib_nh_flags & RTNH_F_DEAD || !nh->fib_nh_gw_family ||
2455 	    fl6->flowi6_oif != nh->fib_nh_dev->ifindex)
2456 		return false;
2457 
2458 	/* rt_cache's gateway might be different from its 'parent'
2459 	 * in the case of an IP redirect.
2460 	 * So we keep searching in the exception table if the gateway
2461 	 * is different.
2462 	 */
2463 	if (!ipv6_addr_equal(gw, &nh->fib_nh_gw6)) {
2464 		struct rt6_info *rt_cache;
2465 
2466 		rt_cache = rt6_find_cached_rt(res, &fl6->daddr, &fl6->saddr);
2467 		if (rt_cache &&
2468 		    ipv6_addr_equal(gw, &rt_cache->rt6i_gateway)) {
2469 			*ret = rt_cache;
2470 			return true;
2471 		}
2472 		return false;
2473 	}
2474 	return true;
2475 }
2476 
2477 /* Handle redirects */
2478 struct ip6rd_flowi {
2479 	struct flowi6 fl6;
2480 	struct in6_addr gateway;
2481 };
2482 
2483 static struct rt6_info *__ip6_route_redirect(struct net *net,
2484 					     struct fib6_table *table,
2485 					     struct flowi6 *fl6,
2486 					     const struct sk_buff *skb,
2487 					     int flags)
2488 {
2489 	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2490 	struct rt6_info *ret = NULL;
2491 	struct fib6_result res = {};
2492 	struct fib6_info *rt;
2493 	struct fib6_node *fn;
2494 
2495 	/* Get the "current" route for this destination and
2496 	 * check if the redirect has come from the appropriate router.
2497 	 *
2498 	 * RFC 4861 specifies that redirects should only be
2499 	 * accepted if they come from the nexthop to the target.
2500 	 * Due to the way the routes are chosen, this notion
2501 	 * is a bit fuzzy and one might need to check all possible
2502 	 * routes.
2503 	 */
2504 
2505 	rcu_read_lock();
2506 	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2507 restart:
2508 	for_each_fib6_node_rt_rcu(fn) {
2509 		res.f6i = rt;
2510 		res.nh = &rt->fib6_nh;
2511 
2512 		if (fib6_check_expired(rt))
2513 			continue;
2514 		if (rt->fib6_flags & RTF_REJECT)
2515 			break;
2516 		if (ip6_redirect_nh_match(&res, fl6, &rdfl->gateway, &ret))
2517 			goto out;
2518 	}
2519 
2520 	if (!rt)
2521 		rt = net->ipv6.fib6_null_entry;
2522 	else if (rt->fib6_flags & RTF_REJECT) {
2523 		ret = net->ipv6.ip6_null_entry;
2524 		goto out;
2525 	}
2526 
2527 	if (rt == net->ipv6.fib6_null_entry) {
2528 		fn = fib6_backtrack(fn, &fl6->saddr);
2529 		if (fn)
2530 			goto restart;
2531 	}
2532 
2533 	res.f6i = rt;
2534 	res.nh = &rt->fib6_nh;
2535 out:
2536 	if (ret) {
2537 		ip6_hold_safe(net, &ret);
2538 	} else {
2539 		res.fib6_flags = res.f6i->fib6_flags;
2540 		res.fib6_type = res.f6i->fib6_type;
2541 		ret = ip6_create_rt_rcu(&res);
2542 	}
2543 
2544 	rcu_read_unlock();
2545 
2546 	trace_fib6_table_lookup(net, &res, table, fl6);
2547 	return ret;
2548 }
2549 
2550 static struct dst_entry *ip6_route_redirect(struct net *net,
2551 					    const struct flowi6 *fl6,
2552 					    const struct sk_buff *skb,
2553 					    const struct in6_addr *gateway)
2554 {
2555 	int flags = RT6_LOOKUP_F_HAS_SADDR;
2556 	struct ip6rd_flowi rdfl;
2557 
2558 	rdfl.fl6 = *fl6;
2559 	rdfl.gateway = *gateway;
2560 
2561 	return fib6_rule_lookup(net, &rdfl.fl6, skb,
2562 				flags, __ip6_route_redirect);
2563 }
2564 
2565 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2566 		  kuid_t uid)
2567 {
2568 	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2569 	struct dst_entry *dst;
2570 	struct flowi6 fl6 = {
2571 		.flowi6_iif = LOOPBACK_IFINDEX,
2572 		.flowi6_oif = oif,
2573 		.flowi6_mark = mark,
2574 		.daddr = iph->daddr,
2575 		.saddr = iph->saddr,
2576 		.flowlabel = ip6_flowinfo(iph),
2577 		.flowi6_uid = uid,
2578 	};
2579 
2580 	dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2581 	rt6_do_redirect(dst, NULL, skb);
2582 	dst_release(dst);
2583 }
2584 EXPORT_SYMBOL_GPL(ip6_redirect);
2585 
2586 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif)
2587 {
2588 	const struct ipv6hdr *iph = ipv6_hdr(skb);
2589 	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2590 	struct dst_entry *dst;
2591 	struct flowi6 fl6 = {
2592 		.flowi6_iif = LOOPBACK_IFINDEX,
2593 		.flowi6_oif = oif,
2594 		.daddr = msg->dest,
2595 		.saddr = iph->daddr,
2596 		.flowi6_uid = sock_net_uid(net, NULL),
2597 	};
2598 
2599 	dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2600 	rt6_do_redirect(dst, NULL, skb);
2601 	dst_release(dst);
2602 }
2603 
2604 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2605 {
2606 	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2607 		     sk->sk_uid);
2608 }
2609 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2610 
2611 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2612 {
2613 	struct net_device *dev = dst->dev;
2614 	unsigned int mtu = dst_mtu(dst);
2615 	struct net *net = dev_net(dev);
2616 
2617 	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2618 
2619 	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2620 		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2621 
2622 	/*
2623 	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2624 	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2625 	 * IPV6_MAXPLEN is also valid and means: "any MSS,
2626 	 * rely only on pmtu discovery"
2627 	 */
2628 	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2629 		mtu = IPV6_MAXPLEN;
2630 	return mtu;
2631 }
2632 
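/* Effective MTU of a dst: the RTAX_MTU metric when set, otherwise the
 * device's mtu6, capped at IP6_MAX_MTU and reduced by any lwtunnel
 * encapsulation headroom.
 */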
2633 static unsigned int ip6_mtu(const struct dst_entry *dst)
2634 {
2635 	struct inet6_dev *idev;
2636 	unsigned int mtu;
2637 
2638 	mtu = dst_metric_raw(dst, RTAX_MTU);
2639 	if (mtu)
2640 		goto out;
2641 
2642 	mtu = IPV6_MIN_MTU;
2643 
2644 	rcu_read_lock();
2645 	idev = __in6_dev_get(dst->dev);
2646 	if (idev)
2647 		mtu = idev->cnf.mtu6;
2648 	rcu_read_unlock();
2649 
2650 out:
2651 	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2652 
2653 	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2654 }
2655 
2656 /* MTU selection:
2657  * 1. mtu on route is locked - use it
2658  * 2. mtu from nexthop exception
2659  * 3. mtu from egress device
2660  *
2661  * based on ip6_dst_mtu_forward and exception logic of
2662  * rt6_find_cached_rt; called with rcu_read_lock
2663  */
2664 u32 ip6_mtu_from_fib6(const struct fib6_result *res,
2665 		      const struct in6_addr *daddr,
2666 		      const struct in6_addr *saddr)
2667 {
2668 	struct rt6_exception_bucket *bucket;
2669 	const struct fib6_nh *nh = res->nh;
2670 	struct fib6_info *f6i = res->f6i;
2671 	const struct in6_addr *src_key;
2672 	struct rt6_exception *rt6_ex;
2673 	struct inet6_dev *idev;
2674 	u32 mtu = 0;
2675 
2676 	if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
2677 		mtu = f6i->fib6_pmtu;
2678 		if (mtu)
2679 			goto out;
2680 	}
2681 
2682 	src_key = NULL;
2683 #ifdef CONFIG_IPV6_SUBTREES
2684 	if (f6i->fib6_src.plen)
2685 		src_key = saddr;
2686 #endif
2687 
2688 	bucket = rcu_dereference(f6i->rt6i_exception_bucket);
2689 	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
2690 	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
2691 		mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU);
2692 
2693 	if (likely(!mtu)) {
2694 		struct net_device *dev = nh->fib_nh_dev;
2695 
2696 		mtu = IPV6_MIN_MTU;
2697 		idev = __in6_dev_get(dev);
2698 		if (idev && idev->cnf.mtu6 > mtu)
2699 			mtu = idev->cnf.mtu6;
2700 	}
2701 
2702 	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2703 out:
2704 	return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
2705 }
2706 
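/* Allocate a standalone (uncached) dst for ICMPv6/ND packets that are
 * sent without a matching fib entry, e.g. neighbour discovery probes.
 */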
2707 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2708 				  struct flowi6 *fl6)
2709 {
2710 	struct dst_entry *dst;
2711 	struct rt6_info *rt;
2712 	struct inet6_dev *idev = in6_dev_get(dev);
2713 	struct net *net = dev_net(dev);
2714 
2715 	if (unlikely(!idev))
2716 		return ERR_PTR(-ENODEV);
2717 
2718 	rt = ip6_dst_alloc(net, dev, 0);
2719 	if (unlikely(!rt)) {
2720 		in6_dev_put(idev);
2721 		dst = ERR_PTR(-ENOMEM);
2722 		goto out;
2723 	}
2724 
2725 	rt->dst.flags |= DST_HOST;
2726 	rt->dst.input = ip6_input;
2727 	rt->dst.output  = ip6_output;
2728 	rt->rt6i_gateway  = fl6->daddr;
2729 	rt->rt6i_dst.addr = fl6->daddr;
2730 	rt->rt6i_dst.plen = 128;
2731 	rt->rt6i_idev     = idev;
2732 	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2733 
2734 	/* Add this dst into uncached_list so that rt6_disable_ip() can
2735 	 * properly release the net_device
2736 	 */
2737 	rt6_uncached_list_add(rt);
2738 	atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2739 
2740 	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2741 
2742 out:
2743 	return dst;
2744 }
2745 
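/* dst garbage collection.  fib6_run_gc() only runs when the entry
 * count exceeds ip6_rt_max_size or the minimum GC interval has
 * elapsed.  ip6_rt_gc_expire ramps up while the table stays large and
 * decays by 1/2^elasticity on every call; with the default elasticity
 * of 9 each pass trims roughly 0.2% of it.
 */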
2746 static int ip6_dst_gc(struct dst_ops *ops)
2747 {
2748 	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2749 	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2750 	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2751 	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2752 	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2753 	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2754 	int entries;
2755 
2756 	entries = dst_entries_get_fast(ops);
2757 	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2758 	    entries <= rt_max_size)
2759 		goto out;
2760 
2761 	net->ipv6.ip6_rt_gc_expire++;
2762 	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2763 	entries = dst_entries_get_slow(ops);
2764 	if (entries < ops->gc_thresh)
2765 		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2766 out:
2767 	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2768 	return entries > rt_max_size;
2769 }
2770 
2771 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2772 					    struct fib6_config *cfg,
2773 					    const struct in6_addr *gw_addr,
2774 					    u32 tbid, int flags)
2775 {
2776 	struct flowi6 fl6 = {
2777 		.flowi6_oif = cfg->fc_ifindex,
2778 		.daddr = *gw_addr,
2779 		.saddr = cfg->fc_prefsrc,
2780 	};
2781 	struct fib6_table *table;
2782 	struct rt6_info *rt;
2783 
2784 	table = fib6_get_table(net, tbid);
2785 	if (!table)
2786 		return NULL;
2787 
2788 	if (!ipv6_addr_any(&cfg->fc_prefsrc))
2789 		flags |= RT6_LOOKUP_F_HAS_SADDR;
2790 
2791 	flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2792 	rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2793 
2794 	/* if table lookup failed, fall back to full lookup */
2795 	if (rt == net->ipv6.ip6_null_entry) {
2796 		ip6_rt_put(rt);
2797 		rt = NULL;
2798 	}
2799 
2800 	return rt;
2801 }
2802 
2803 static int ip6_route_check_nh_onlink(struct net *net,
2804 				     struct fib6_config *cfg,
2805 				     const struct net_device *dev,
2806 				     struct netlink_ext_ack *extack)
2807 {
2808 	u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2809 	const struct in6_addr *gw_addr = &cfg->fc_gateway;
2810 	u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2811 	struct fib6_info *from;
2812 	struct rt6_info *grt;
2813 	int err;
2814 
2815 	err = 0;
2816 	grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2817 	if (grt) {
2818 		rcu_read_lock();
2819 		from = rcu_dereference(grt->from);
2820 		if (!grt->dst.error &&
2821 		    /* ignore match if it is the default route */
2822 		    from && !ipv6_addr_any(&from->fib6_dst.addr) &&
2823 		    (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2824 			NL_SET_ERR_MSG(extack,
2825 				       "Nexthop has invalid gateway or device mismatch");
2826 			err = -EINVAL;
2827 		}
2828 		rcu_read_unlock();
2829 
2830 		ip6_rt_put(grt);
2831 	}
2832 
2833 	return err;
2834 }
2835 
2836 static int ip6_route_check_nh(struct net *net,
2837 			      struct fib6_config *cfg,
2838 			      struct net_device **_dev,
2839 			      struct inet6_dev **idev)
2840 {
2841 	const struct in6_addr *gw_addr = &cfg->fc_gateway;
2842 	struct net_device *dev = _dev ? *_dev : NULL;
2843 	struct rt6_info *grt = NULL;
2844 	int err = -EHOSTUNREACH;
2845 
2846 	if (cfg->fc_table) {
2847 		int flags = RT6_LOOKUP_F_IFACE;
2848 
2849 		grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2850 					  cfg->fc_table, flags);
2851 		if (grt) {
2852 			if (grt->rt6i_flags & RTF_GATEWAY ||
2853 			    (dev && dev != grt->dst.dev)) {
2854 				ip6_rt_put(grt);
2855 				grt = NULL;
2856 			}
2857 		}
2858 	}
2859 
2860 	if (!grt)
2861 		grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
2862 
2863 	if (!grt)
2864 		goto out;
2865 
2866 	if (dev) {
2867 		if (dev != grt->dst.dev) {
2868 			ip6_rt_put(grt);
2869 			goto out;
2870 		}
2871 	} else {
2872 		*_dev = dev = grt->dst.dev;
2873 		*idev = grt->rt6i_idev;
2874 		dev_hold(dev);
2875 		in6_dev_hold(grt->rt6i_idev);
2876 	}
2877 
2878 	if (!(grt->rt6i_flags & RTF_GATEWAY))
2879 		err = 0;
2880 
2881 	ip6_rt_put(grt);
2882 
2883 out:
2884 	return err;
2885 }
2886 
2887 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
2888 			   struct net_device **_dev, struct inet6_dev **idev,
2889 			   struct netlink_ext_ack *extack)
2890 {
2891 	const struct in6_addr *gw_addr = &cfg->fc_gateway;
2892 	int gwa_type = ipv6_addr_type(gw_addr);
2893 	bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
2894 	const struct net_device *dev = *_dev;
2895 	bool need_addr_check = !dev;
2896 	int err = -EINVAL;
2897 
2898 	/* if gw_addr is local we will fail to detect this in case the
2899 	 * address is still TENTATIVE (DAD in progress). rt6_lookup()
2900 	 * will return the already-added prefix route via the interface
2901 	 * that the prefix route was assigned to, which might be non-loopback.
2902 	 */
2903 	if (dev &&
2904 	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2905 		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2906 		goto out;
2907 	}
2908 
2909 	if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
2910 		/* IPv6 strictly inhibits using non-link-local
2911 		 * addresses as nexthop addresses.
2912 		 * Otherwise, the router will not be able to send redirects.
2913 		 * That is a good rule, but in some (rare!) circumstances
2914 		 * (SIT, PtP, NBMA NOARP links) it is handy to allow
2915 		 * some exceptions. --ANK
2916 		 * We allow IPv4-mapped nexthops to support RFC4798-type
2917 		 * addressing
2918 		 */
2919 		if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
2920 			NL_SET_ERR_MSG(extack, "Invalid gateway address");
2921 			goto out;
2922 		}
2923 
2924 		if (cfg->fc_flags & RTNH_F_ONLINK)
2925 			err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
2926 		else
2927 			err = ip6_route_check_nh(net, cfg, _dev, idev);
2928 
2929 		if (err)
2930 			goto out;
2931 	}
2932 
2933 	/* reload in case device was changed */
2934 	dev = *_dev;
2935 
2936 	err = -EINVAL;
2937 	if (!dev) {
2938 		NL_SET_ERR_MSG(extack, "Egress device not specified");
2939 		goto out;
2940 	} else if (dev->flags & IFF_LOOPBACK) {
2941 		NL_SET_ERR_MSG(extack,
2942 			       "Egress device can not be loopback device for this route");
2943 		goto out;
2944 	}
2945 
2946 	/* if we did not check gw_addr above, do so now that the
2947 	 * egress device has been resolved.
2948 	 */
2949 	if (need_addr_check &&
2950 	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2951 		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2952 		goto out;
2953 	}
2954 
2955 	err = 0;
2956 out:
2957 	return err;
2958 }
2959 
2960 static bool fib6_is_reject(u32 flags, struct net_device *dev, int addr_type)
2961 {
2962 	if ((flags & RTF_REJECT) ||
2963 	    (dev && (dev->flags & IFF_LOOPBACK) &&
2964 	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
2965 	     !(flags & RTF_LOCAL)))
2966 		return true;
2967 
2968 	return false;
2969 }
2970 
2971 int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
2972 		 struct fib6_config *cfg, gfp_t gfp_flags,
2973 		 struct netlink_ext_ack *extack)
2974 {
2975 	struct net_device *dev = NULL;
2976 	struct inet6_dev *idev = NULL;
2977 	int addr_type;
2978 	int err;
2979 
2980 	fib6_nh->fib_nh_family = AF_INET6;
2981 
2982 	err = -ENODEV;
2983 	if (cfg->fc_ifindex) {
2984 		dev = dev_get_by_index(net, cfg->fc_ifindex);
2985 		if (!dev)
2986 			goto out;
2987 		idev = in6_dev_get(dev);
2988 		if (!idev)
2989 			goto out;
2990 	}
2991 
2992 	if (cfg->fc_flags & RTNH_F_ONLINK) {
2993 		if (!dev) {
2994 			NL_SET_ERR_MSG(extack,
2995 				       "Nexthop device required for onlink");
2996 			goto out;
2997 		}
2998 
2999 		if (!(dev->flags & IFF_UP)) {
3000 			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3001 			err = -ENETDOWN;
3002 			goto out;
3003 		}
3004 
3005 		fib6_nh->fib_nh_flags |= RTNH_F_ONLINK;
3006 	}
3007 
3008 	fib6_nh->fib_nh_weight = 1;
3009 
3010 	/* We cannot add true routes via loopback here, as
3011 	 * they would result in kernel looping; promote them to reject routes
3012 	 */
3013 	addr_type = ipv6_addr_type(&cfg->fc_dst);
3014 	if (fib6_is_reject(cfg->fc_flags, dev, addr_type)) {
3015 		/* hold loopback dev/idev if we haven't done so. */
3016 		if (dev != net->loopback_dev) {
3017 			if (dev) {
3018 				dev_put(dev);
3019 				in6_dev_put(idev);
3020 			}
3021 			dev = net->loopback_dev;
3022 			dev_hold(dev);
3023 			idev = in6_dev_get(dev);
3024 			if (!idev) {
3025 				err = -ENODEV;
3026 				goto out;
3027 			}
3028 		}
3029 		goto set_dev;
3030 	}
3031 
3032 	if (cfg->fc_flags & RTF_GATEWAY) {
3033 		err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
3034 		if (err)
3035 			goto out;
3036 
3037 		fib6_nh->fib_nh_gw6 = cfg->fc_gateway;
3038 		fib6_nh->fib_nh_gw_family = AF_INET6;
3039 	}
3040 
3041 	err = -ENODEV;
3042 	if (!dev)
3043 		goto out;
3044 
3045 	if (idev->cnf.disable_ipv6) {
3046 		NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
3047 		err = -EACCES;
3048 		goto out;
3049 	}
3050 
3051 	if (!(dev->flags & IFF_UP) && !cfg->fc_ignore_dev_down) {
3052 		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3053 		err = -ENETDOWN;
3054 		goto out;
3055 	}
3056 
3057 	if (!(cfg->fc_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
3058 	    !netif_carrier_ok(dev))
3059 		fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;
3060 
3061 	err = fib_nh_common_init(&fib6_nh->nh_common, cfg->fc_encap,
3062 				 cfg->fc_encap_type, cfg, gfp_flags, extack);
3063 	if (err)
3064 		goto out;
3065 set_dev:
3066 	fib6_nh->fib_nh_dev = dev;
3067 	fib6_nh->fib_nh_oif = dev->ifindex;
3068 	err = 0;
3069 out:
3070 	if (idev)
3071 		in6_dev_put(idev);
3072 
3073 	if (err) {
3074 		lwtstate_put(fib6_nh->fib_nh_lws);
3075 		fib6_nh->fib_nh_lws = NULL;
3076 		if (dev)
3077 			dev_put(dev);
3078 	}
3079 
3080 	return err;
3081 }
3082 
3083 void fib6_nh_release(struct fib6_nh *fib6_nh)
3084 {
3085 	fib_nh_common_release(&fib6_nh->nh_common);
3086 }
3087 
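/* Build a fib6_info from a fib6_config: reject internal-only flags
 * (RTF_PCPU, RTF_CACHE), validate prefix lengths, resolve or create
 * the FIB table, set up metrics and the nexthop, promote loopback
 * routes to reject routes, and validate any preferred source address.
 */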
3088 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
3089 					      gfp_t gfp_flags,
3090 					      struct netlink_ext_ack *extack)
3091 {
3092 	struct net *net = cfg->fc_nlinfo.nl_net;
3093 	struct fib6_info *rt = NULL;
3094 	struct fib6_table *table;
3095 	int err = -EINVAL;
3096 	int addr_type;
3097 
3098 	/* RTF_PCPU is an internal flag; can not be set by userspace */
3099 	if (cfg->fc_flags & RTF_PCPU) {
3100 		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
3101 		goto out;
3102 	}
3103 
3104 	/* RTF_CACHE is an internal flag; can not be set by userspace */
3105 	if (cfg->fc_flags & RTF_CACHE) {
3106 		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
3107 		goto out;
3108 	}
3109 
3110 	if (cfg->fc_type > RTN_MAX) {
3111 		NL_SET_ERR_MSG(extack, "Invalid route type");
3112 		goto out;
3113 	}
3114 
3115 	if (cfg->fc_dst_len > 128) {
3116 		NL_SET_ERR_MSG(extack, "Invalid prefix length");
3117 		goto out;
3118 	}
3119 	if (cfg->fc_src_len > 128) {
3120 		NL_SET_ERR_MSG(extack, "Invalid source address length");
3121 		goto out;
3122 	}
3123 #ifndef CONFIG_IPV6_SUBTREES
3124 	if (cfg->fc_src_len) {
3125 		NL_SET_ERR_MSG(extack,
3126 			       "Specifying source address requires IPV6_SUBTREES to be enabled");
3127 		goto out;
3128 	}
3129 #endif
3130 
3131 	err = -ENOBUFS;
3132 	if (cfg->fc_nlinfo.nlh &&
3133 	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
3134 		table = fib6_get_table(net, cfg->fc_table);
3135 		if (!table) {
3136 			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
3137 			table = fib6_new_table(net, cfg->fc_table);
3138 		}
3139 	} else {
3140 		table = fib6_new_table(net, cfg->fc_table);
3141 	}
3142 
3143 	if (!table)
3144 		goto out;
3145 
3146 	err = -ENOMEM;
3147 	rt = fib6_info_alloc(gfp_flags);
3148 	if (!rt)
3149 		goto out;
3150 
3151 	rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len,
3152 					       extack);
3153 	if (IS_ERR(rt->fib6_metrics)) {
3154 		err = PTR_ERR(rt->fib6_metrics);
3155 		/* Do not leave garbage there. */
3156 		rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
3157 		goto out;
3158 	}
3159 
3160 	if (cfg->fc_flags & RTF_ADDRCONF)
3161 		rt->dst_nocount = true;
3162 
3163 	if (cfg->fc_flags & RTF_EXPIRES)
3164 		fib6_set_expires(rt, jiffies +
3165 				clock_t_to_jiffies(cfg->fc_expires));
3166 	else
3167 		fib6_clean_expires(rt);
3168 
3169 	if (cfg->fc_protocol == RTPROT_UNSPEC)
3170 		cfg->fc_protocol = RTPROT_BOOT;
3171 	rt->fib6_protocol = cfg->fc_protocol;
3172 
3173 	rt->fib6_table = table;
3174 	rt->fib6_metric = cfg->fc_metric;
3175 	rt->fib6_type = cfg->fc_type;
3176 	rt->fib6_flags = cfg->fc_flags & ~RTF_GATEWAY;
3177 
3178 	ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
3179 	rt->fib6_dst.plen = cfg->fc_dst_len;
3180 	if (rt->fib6_dst.plen == 128)
3181 		rt->dst_host = true;
3182 
3183 #ifdef CONFIG_IPV6_SUBTREES
3184 	ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
3185 	rt->fib6_src.plen = cfg->fc_src_len;
3186 #endif
3187 	err = fib6_nh_init(net, &rt->fib6_nh, cfg, gfp_flags, extack);
3188 	if (err)
3189 		goto out;
3190 
3191 	/* We cannot add true routes via loopback here, as
3192 	 * they would result in kernel looping; promote them to reject routes
3193 	 */
3194 	addr_type = ipv6_addr_type(&cfg->fc_dst);
3195 	if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh.fib_nh_dev, addr_type))
3196 		rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP;
3197 
3198 	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
3199 		struct net_device *dev = fib6_info_nh_dev(rt);
3200 
3201 		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
3202 			NL_SET_ERR_MSG(extack, "Invalid source address");
3203 			err = -EINVAL;
3204 			goto out;
3205 		}
3206 		rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
3207 		rt->fib6_prefsrc.plen = 128;
3208 	} else
3209 		rt->fib6_prefsrc.plen = 0;
3210 
3211 	return rt;
3212 out:
3213 	fib6_info_release(rt);
3214 	return ERR_PTR(err);
3215 }
3216 
3217 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3218 		  struct netlink_ext_ack *extack)
3219 {
3220 	struct fib6_info *rt;
3221 	int err;
3222 
3223 	rt = ip6_route_info_create(cfg, gfp_flags, extack);
3224 	if (IS_ERR(rt))
3225 		return PTR_ERR(rt);
3226 
3227 	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3228 	fib6_info_release(rt);
3229 
3230 	return err;
3231 }
3232 
3233 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3234 {
3235 	struct net *net = info->nl_net;
3236 	struct fib6_table *table;
3237 	int err;
3238 
3239 	if (rt == net->ipv6.fib6_null_entry) {
3240 		err = -ENOENT;
3241 		goto out;
3242 	}
3243 
3244 	table = rt->fib6_table;
3245 	spin_lock_bh(&table->tb6_lock);
3246 	err = fib6_del(rt, info);
3247 	spin_unlock_bh(&table->tb6_lock);
3248 
3249 out:
3250 	fib6_info_release(rt);
3251 	return err;
3252 }
3253 
3254 int ip6_del_rt(struct net *net, struct fib6_info *rt)
3255 {
3256 	struct nl_info info = { .nl_net = net };
3257 
3258 	return __ip6_del_rt(rt, &info);
3259 }
3260 
3261 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
3262 {
3263 	struct nl_info *info = &cfg->fc_nlinfo;
3264 	struct net *net = info->nl_net;
3265 	struct sk_buff *skb = NULL;
3266 	struct fib6_table *table;
3267 	int err = -ENOENT;
3268 
3269 	if (rt == net->ipv6.fib6_null_entry)
3270 		goto out_put;
3271 	table = rt->fib6_table;
3272 	spin_lock_bh(&table->tb6_lock);
3273 
3274 	if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
3275 		struct fib6_info *sibling, *next_sibling;
3276 
3277 		/* prefer to send a single notification with all hops */
3278 		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3279 		if (skb) {
3280 			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3281 
3282 			if (rt6_fill_node(net, skb, rt, NULL,
3283 					  NULL, NULL, 0, RTM_DELROUTE,
3284 					  info->portid, seq, 0) < 0) {
3285 				kfree_skb(skb);
3286 				skb = NULL;
3287 			} else
3288 				info->skip_notify = 1;
3289 		}
3290 
3291 		list_for_each_entry_safe(sibling, next_sibling,
3292 					 &rt->fib6_siblings,
3293 					 fib6_siblings) {
3294 			err = fib6_del(sibling, info);
3295 			if (err)
3296 				goto out_unlock;
3297 		}
3298 	}
3299 
3300 	err = fib6_del(rt, info);
3301 out_unlock:
3302 	spin_unlock_bh(&table->tb6_lock);
3303 out_put:
3304 	fib6_info_release(rt);
3305 
3306 	if (skb) {
3307 		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3308 			    info->nlh, gfp_any());
3309 	}
3310 	return err;
3311 }
3312 
3313 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3314 {
3315 	int rc = -ESRCH;
3316 
3317 	if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3318 		goto out;
3319 
3320 	if (cfg->fc_flags & RTF_GATEWAY &&
3321 	    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3322 		goto out;
3323 
3324 	rc = rt6_remove_exception_rt(rt);
3325 out:
3326 	return rc;
3327 }
3328 
3329 static int ip6_route_del(struct fib6_config *cfg,
3330 			 struct netlink_ext_ack *extack)
3331 {
3332 	struct rt6_info *rt_cache;
3333 	struct fib6_table *table;
3334 	struct fib6_info *rt;
3335 	struct fib6_node *fn;
3336 	int err = -ESRCH;
3337 
3338 	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3339 	if (!table) {
3340 		NL_SET_ERR_MSG(extack, "FIB table does not exist");
3341 		return err;
3342 	}
3343 
3344 	rcu_read_lock();
3345 
3346 	fn = fib6_locate(&table->tb6_root,
3347 			 &cfg->fc_dst, cfg->fc_dst_len,
3348 			 &cfg->fc_src, cfg->fc_src_len,
3349 			 !(cfg->fc_flags & RTF_CACHE));
3350 
3351 	if (fn) {
3352 		for_each_fib6_node_rt_rcu(fn) {
3353 			struct fib6_nh *nh;
3354 
3355 			if (cfg->fc_flags & RTF_CACHE) {
3356 				struct fib6_result res = {
3357 					.f6i = rt,
3358 				};
3359 				int rc;
3360 
3361 				rt_cache = rt6_find_cached_rt(&res,
3362 							      &cfg->fc_dst,
3363 							      &cfg->fc_src);
3364 				if (rt_cache) {
3365 					rc = ip6_del_cached_rt(rt_cache, cfg);
3366 					if (rc != -ESRCH) {
3367 						rcu_read_unlock();
3368 						return rc;
3369 					}
3370 				}
3371 				continue;
3372 			}
3373 
3374 			nh = &rt->fib6_nh;
3375 			if (cfg->fc_ifindex &&
3376 			    (!nh->fib_nh_dev ||
3377 			     nh->fib_nh_dev->ifindex != cfg->fc_ifindex))
3378 				continue;
3379 			if (cfg->fc_flags & RTF_GATEWAY &&
3380 			    !ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6))
3381 				continue;
3382 			if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
3383 				continue;
3384 			if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
3385 				continue;
3386 			if (!fib6_info_hold_safe(rt))
3387 				continue;
3388 			rcu_read_unlock();
3389 
3390 			/* if gateway was specified only delete the one hop */
3391 			if (cfg->fc_flags & RTF_GATEWAY)
3392 				return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3393 
3394 			return __ip6_del_rt_siblings(rt, cfg);
3395 		}
3396 	}
3397 	rcu_read_unlock();
3398 
3399 	return err;
3400 }
3401 
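/* Process an ICMPv6 Redirect: validate it per RFC 4861, update the
 * neighbour cache entry for the new first hop, and install the new
 * path as an RTF_CACHE exception on the route the redirect applies to.
 */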
3402 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3403 {
3404 	struct netevent_redirect netevent;
3405 	struct rt6_info *rt, *nrt = NULL;
3406 	struct fib6_result res = {};
3407 	struct ndisc_options ndopts;
3408 	struct inet6_dev *in6_dev;
3409 	struct neighbour *neigh;
3410 	struct rd_msg *msg;
3411 	int optlen, on_link;
3412 	u8 *lladdr;
3413 
3414 	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3415 	optlen -= sizeof(*msg);
3416 
3417 	if (optlen < 0) {
3418 		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3419 		return;
3420 	}
3421 
3422 	msg = (struct rd_msg *)icmp6_hdr(skb);
3423 
3424 	if (ipv6_addr_is_multicast(&msg->dest)) {
3425 		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3426 		return;
3427 	}
3428 
3429 	on_link = 0;
3430 	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3431 		on_link = 1;
3432 	} else if (ipv6_addr_type(&msg->target) !=
3433 		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3434 		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3435 		return;
3436 	}
3437 
3438 	in6_dev = __in6_dev_get(skb->dev);
3439 	if (!in6_dev)
3440 		return;
3441 	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3442 		return;
3443 
3444 	/* RFC2461 8.1:
3445 	 *	The IP source address of the Redirect MUST be the same as the current
3446 	 *	first-hop router for the specified ICMP Destination Address.
3447 	 */
3448 
3449 	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3450 		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3451 		return;
3452 	}
3453 
3454 	lladdr = NULL;
3455 	if (ndopts.nd_opts_tgt_lladdr) {
3456 		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3457 					     skb->dev);
3458 		if (!lladdr) {
3459 			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3460 			return;
3461 		}
3462 	}
3463 
3464 	rt = (struct rt6_info *) dst;
3465 	if (rt->rt6i_flags & RTF_REJECT) {
3466 		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3467 		return;
3468 	}
3469 
3470 	/* Redirect received -> path was valid.
3471 	 * Look, redirects are sent only in response to data packets,
3472 	 * so that this nexthop apparently is reachable. --ANK
3473 	 */
3474 	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3475 
3476 	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3477 	if (!neigh)
3478 		return;
3479 
3480 	/*
3481 	 *	We have finally decided to accept it.
3482 	 */
3483 
3484 	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3485 		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
3486 		     NEIGH_UPDATE_F_OVERRIDE|
3487 		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3488 				     NEIGH_UPDATE_F_ISROUTER)),
3489 		     NDISC_REDIRECT, &ndopts);
3490 
3491 	rcu_read_lock();
3492 	res.f6i = rcu_dereference(rt->from);
3493 	if (!res.f6i)
3494 		goto out;
3495 
3496 	res.nh = &res.f6i->fib6_nh;
3497 	res.fib6_flags = res.f6i->fib6_flags;
3498 	res.fib6_type = res.f6i->fib6_type;
3499 	nrt = ip6_rt_cache_alloc(&res, &msg->dest, NULL);
3500 	if (!nrt)
3501 		goto out;
3502 
3503 	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3504 	if (on_link)
3505 		nrt->rt6i_flags &= ~RTF_GATEWAY;
3506 
3507 	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3508 
3509 	/* rt6_insert_exception() will take care of duplicated exceptions */
3510 	if (rt6_insert_exception(nrt, &res)) {
3511 		dst_release_immediate(&nrt->dst);
3512 		goto out;
3513 	}
3514 
3515 	netevent.old = &rt->dst;
3516 	netevent.new = &nrt->dst;
3517 	netevent.daddr = &msg->dest;
3518 	netevent.neigh = neigh;
3519 	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3520 
3521 out:
3522 	rcu_read_unlock();
3523 	neigh_release(neigh);
3524 }
3525 
3526 #ifdef CONFIG_IPV6_ROUTE_INFO
3527 static struct fib6_info *rt6_get_route_info(struct net *net,
3528 					   const struct in6_addr *prefix, int prefixlen,
3529 					   const struct in6_addr *gwaddr,
3530 					   struct net_device *dev)
3531 {
3532 	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3533 	int ifindex = dev->ifindex;
3534 	struct fib6_node *fn;
3535 	struct fib6_info *rt = NULL;
3536 	struct fib6_table *table;
3537 
3538 	table = fib6_get_table(net, tb_id);
3539 	if (!table)
3540 		return NULL;
3541 
3542 	rcu_read_lock();
3543 	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3544 	if (!fn)
3545 		goto out;
3546 
3547 	for_each_fib6_node_rt_rcu(fn) {
3548 		if (rt->fib6_nh.fib_nh_dev->ifindex != ifindex)
3549 			continue;
3550 		if (!(rt->fib6_flags & RTF_ROUTEINFO) ||
3551 		    !rt->fib6_nh.fib_nh_gw_family)
3552 			continue;
3553 		if (!ipv6_addr_equal(&rt->fib6_nh.fib_nh_gw6, gwaddr))
3554 			continue;
3555 		if (!fib6_info_hold_safe(rt))
3556 			continue;
3557 		break;
3558 	}
3559 out:
3560 	rcu_read_unlock();
3561 	return rt;
3562 }
3563 
3564 static struct fib6_info *rt6_add_route_info(struct net *net,
3565 					   const struct in6_addr *prefix, int prefixlen,
3566 					   const struct in6_addr *gwaddr,
3567 					   struct net_device *dev,
3568 					   unsigned int pref)
3569 {
3570 	struct fib6_config cfg = {
3571 		.fc_metric	= IP6_RT_PRIO_USER,
3572 		.fc_ifindex	= dev->ifindex,
3573 		.fc_dst_len	= prefixlen,
3574 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3575 				  RTF_UP | RTF_PREF(pref),
3576 		.fc_protocol = RTPROT_RA,
3577 		.fc_type = RTN_UNICAST,
3578 		.fc_nlinfo.portid = 0,
3579 		.fc_nlinfo.nlh = NULL,
3580 		.fc_nlinfo.nl_net = net,
3581 	};
3582 
3583 	cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3584 	cfg.fc_dst = *prefix;
3585 	cfg.fc_gateway = *gwaddr;
3586 
3587 	/* We should treat it as a default route if prefix length is 0. */
3588 	if (!prefixlen)
3589 		cfg.fc_flags |= RTF_DEFAULT;
3590 
3591 	ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3592 
3593 	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3594 }
3595 #endif
3596 
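/* Default routers learned from Router Advertisements are installed
 * with RTF_ADDRCONF | RTF_DEFAULT.  The helpers below look such
 * entries up, add them, and purge them again (skipping interfaces
 * with accept_ra == 2, which keep their RA routes).
 */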
3597 struct fib6_info *rt6_get_dflt_router(struct net *net,
3598 				     const struct in6_addr *addr,
3599 				     struct net_device *dev)
3600 {
3601 	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3602 	struct fib6_info *rt;
3603 	struct fib6_table *table;
3604 
3605 	table = fib6_get_table(net, tb_id);
3606 	if (!table)
3607 		return NULL;
3608 
3609 	rcu_read_lock();
3610 	for_each_fib6_node_rt_rcu(&table->tb6_root) {
3611 		struct fib6_nh *nh = &rt->fib6_nh;
3612 
3613 		if (dev == nh->fib_nh_dev &&
3614 		    ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3615 		    ipv6_addr_equal(&nh->fib_nh_gw6, addr))
3616 			break;
3617 	}
3618 	if (rt && !fib6_info_hold_safe(rt))
3619 		rt = NULL;
3620 	rcu_read_unlock();
3621 	return rt;
3622 }
3623 
3624 struct fib6_info *rt6_add_dflt_router(struct net *net,
3625 				     const struct in6_addr *gwaddr,
3626 				     struct net_device *dev,
3627 				     unsigned int pref)
3628 {
3629 	struct fib6_config cfg = {
3630 		.fc_table	= l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3631 		.fc_metric	= IP6_RT_PRIO_USER,
3632 		.fc_ifindex	= dev->ifindex,
3633 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3634 				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3635 		.fc_protocol = RTPROT_RA,
3636 		.fc_type = RTN_UNICAST,
3637 		.fc_nlinfo.portid = 0,
3638 		.fc_nlinfo.nlh = NULL,
3639 		.fc_nlinfo.nl_net = net,
3640 	};
3641 
3642 	cfg.fc_gateway = *gwaddr;
3643 
3644 	if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3645 		struct fib6_table *table;
3646 
3647 		table = fib6_get_table(dev_net(dev), cfg.fc_table);
3648 		if (table)
3649 			table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3650 	}
3651 
3652 	return rt6_get_dflt_router(net, gwaddr, dev);
3653 }
3654 
3655 static void __rt6_purge_dflt_routers(struct net *net,
3656 				     struct fib6_table *table)
3657 {
3658 	struct fib6_info *rt;
3659 
3660 restart:
3661 	rcu_read_lock();
3662 	for_each_fib6_node_rt_rcu(&table->tb6_root) {
3663 		struct net_device *dev = fib6_info_nh_dev(rt);
3664 		struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;
3665 
3666 		if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3667 		    (!idev || idev->cnf.accept_ra != 2) &&
3668 		    fib6_info_hold_safe(rt)) {
3669 			rcu_read_unlock();
3670 			ip6_del_rt(net, rt);
3671 			goto restart;
3672 		}
3673 	}
3674 	rcu_read_unlock();
3675 
3676 	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3677 }
3678 
3679 void rt6_purge_dflt_routers(struct net *net)
3680 {
3681 	struct fib6_table *table;
3682 	struct hlist_head *head;
3683 	unsigned int h;
3684 
3685 	rcu_read_lock();
3686 
3687 	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3688 		head = &net->ipv6.fib_table_hash[h];
3689 		hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3690 			if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3691 				__rt6_purge_dflt_routers(net, table);
3692 		}
3693 	}
3694 
3695 	rcu_read_unlock();
3696 }
3697 
3698 static void rtmsg_to_fib6_config(struct net *net,
3699 				 struct in6_rtmsg *rtmsg,
3700 				 struct fib6_config *cfg)
3701 {
3702 	*cfg = (struct fib6_config){
3703 		.fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3704 			 : RT6_TABLE_MAIN,
3705 		.fc_ifindex = rtmsg->rtmsg_ifindex,
3706 		.fc_metric = rtmsg->rtmsg_metric ? : IP6_RT_PRIO_USER,
3707 		.fc_expires = rtmsg->rtmsg_info,
3708 		.fc_dst_len = rtmsg->rtmsg_dst_len,
3709 		.fc_src_len = rtmsg->rtmsg_src_len,
3710 		.fc_flags = rtmsg->rtmsg_flags,
3711 		.fc_type = rtmsg->rtmsg_type,
3712 
3713 		.fc_nlinfo.nl_net = net,
3714 
3715 		.fc_dst = rtmsg->rtmsg_dst,
3716 		.fc_src = rtmsg->rtmsg_src,
3717 		.fc_gateway = rtmsg->rtmsg_gateway,
3718 	};
3719 }
3720 
3721 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3722 {
3723 	struct fib6_config cfg;
3724 	struct in6_rtmsg rtmsg;
3725 	int err;
3726 
3727 	switch (cmd) {
3728 	case SIOCADDRT:		/* Add a route */
3729 	case SIOCDELRT:		/* Delete a route */
3730 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3731 			return -EPERM;
3732 		err = copy_from_user(&rtmsg, arg,
3733 				     sizeof(struct in6_rtmsg));
3734 		if (err)
3735 			return -EFAULT;
3736 
3737 		rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3738 
3739 		rtnl_lock();
3740 		switch (cmd) {
3741 		case SIOCADDRT:
3742 			err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3743 			break;
3744 		case SIOCDELRT:
3745 			err = ip6_route_del(&cfg, NULL);
3746 			break;
3747 		default:
3748 			err = -EINVAL;
3749 		}
3750 		rtnl_unlock();
3751 
3752 		return err;
3753 	}
3754 
3755 	return -EINVAL;
3756 }
3757 
3758 /*
3759  *	Drop the packet on the floor
3760  */
3761 
3762 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3763 {
3764 	struct dst_entry *dst = skb_dst(skb);
3765 	struct net *net = dev_net(dst->dev);
3766 	struct inet6_dev *idev;
3767 	int type;
3768 
3769 	if (netif_is_l3_master(skb->dev) &&
3770 	    dst->dev == net->loopback_dev)
3771 		idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
3772 	else
3773 		idev = ip6_dst_idev(dst);
3774 
3775 	switch (ipstats_mib_noroutes) {
3776 	case IPSTATS_MIB_INNOROUTES:
3777 		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3778 		if (type == IPV6_ADDR_ANY) {
3779 			IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
3780 			break;
3781 		}
3782 		/* FALLTHROUGH */
3783 	case IPSTATS_MIB_OUTNOROUTES:
3784 		IP6_INC_STATS(net, idev, ipstats_mib_noroutes);
3785 		break;
3786 	}
3787 
3788 	/* Start over by dropping the dst for l3mdev case */
3789 	if (netif_is_l3_master(skb->dev))
3790 		skb_dst_drop(skb);
3791 
3792 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3793 	kfree_skb(skb);
3794 	return 0;
3795 }
3796 
3797 static int ip6_pkt_discard(struct sk_buff *skb)
3798 {
3799 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3800 }
3801 
3802 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3803 {
3804 	skb->dev = skb_dst(skb)->dev;
3805 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3806 }
3807 
3808 static int ip6_pkt_prohibit(struct sk_buff *skb)
3809 {
3810 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3811 }
3812 
3813 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3814 {
3815 	skb->dev = skb_dst(skb)->dev;
3816 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3817 }
3818 
3819 /*
3820  *	Allocate a dst for local (unicast / anycast) address.
3821  */
3822 
3823 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3824 				     struct inet6_dev *idev,
3825 				     const struct in6_addr *addr,
3826 				     bool anycast, gfp_t gfp_flags)
3827 {
3828 	struct fib6_config cfg = {
3829 		.fc_table = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL,
3830 		.fc_ifindex = idev->dev->ifindex,
3831 		.fc_flags = RTF_UP | RTF_ADDRCONF | RTF_NONEXTHOP,
3832 		.fc_dst = *addr,
3833 		.fc_dst_len = 128,
3834 		.fc_protocol = RTPROT_KERNEL,
3835 		.fc_nlinfo.nl_net = net,
3836 		.fc_ignore_dev_down = true,
3837 	};
3838 
3839 	if (anycast) {
3840 		cfg.fc_type = RTN_ANYCAST;
3841 		cfg.fc_flags |= RTF_ANYCAST;
3842 	} else {
3843 		cfg.fc_type = RTN_LOCAL;
3844 		cfg.fc_flags |= RTF_LOCAL;
3845 	}
3846 
3847 	return ip6_route_info_create(&cfg, gfp_flags, NULL);
3848 }
3849 
3850 /* remove deleted ip from prefsrc entries */
3851 struct arg_dev_net_ip {
3852 	struct net_device *dev;
3853 	struct net *net;
3854 	struct in6_addr *addr;
3855 };
3856 
3857 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3858 {
3859 	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3860 	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3861 	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3862 
3863 	if (((void *)rt->fib6_nh.fib_nh_dev == dev || !dev) &&
3864 	    rt != net->ipv6.fib6_null_entry &&
3865 	    ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3866 		spin_lock_bh(&rt6_exception_lock);
3867 		/* remove prefsrc entry */
3868 		rt->fib6_prefsrc.plen = 0;
3869 		spin_unlock_bh(&rt6_exception_lock);
3870 	}
3871 	return 0;
3872 }
3873 
3874 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3875 {
3876 	struct net *net = dev_net(ifp->idev->dev);
3877 	struct arg_dev_net_ip adni = {
3878 		.dev = ifp->idev->dev,
3879 		.net = net,
3880 		.addr = &ifp->addr,
3881 	};
3882 	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3883 }
3884 
3885 #define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT)
3886 
3887 /* Remove routers and update dst entries when a gateway turns into a host. */
3888 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3889 {
3890 	struct in6_addr *gateway = (struct in6_addr *)arg;
3891 
3892 	if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3893 	    rt->fib6_nh.fib_nh_gw_family &&
3894 	    ipv6_addr_equal(gateway, &rt->fib6_nh.fib_nh_gw6)) {
3895 		return -1;
3896 	}
3897 
3898 	/* Further clean up cached routes in the exception table.
3899 	 * This is needed because a cached route may have a different
3900 	 * gateway than its 'parent' in the case of an IP redirect.
3901 	 */
3902 	rt6_exceptions_clean_tohost(rt, gateway);
3903 
3904 	return 0;
3905 }
3906 
3907 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3908 {
3909 	fib6_clean_all(net, fib6_clean_tohost, gateway);
3910 }
3911 
3912 struct arg_netdev_event {
3913 	const struct net_device *dev;
3914 	union {
3915 		unsigned char nh_flags;
3916 		unsigned long event;
3917 	};
3918 };
3919 
3920 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
3921 {
3922 	struct fib6_info *iter;
3923 	struct fib6_node *fn;
3924 
3925 	fn = rcu_dereference_protected(rt->fib6_node,
3926 			lockdep_is_held(&rt->fib6_table->tb6_lock));
3927 	iter = rcu_dereference_protected(fn->leaf,
3928 			lockdep_is_held(&rt->fib6_table->tb6_lock));
3929 	while (iter) {
3930 		if (iter->fib6_metric == rt->fib6_metric &&
3931 		    rt6_qualify_for_ecmp(iter))
3932 			return iter;
3933 		iter = rcu_dereference_protected(iter->fib6_next,
3934 				lockdep_is_held(&rt->fib6_table->tb6_lock));
3935 	}
3936 
3937 	return NULL;
3938 }
3939 
3940 static bool rt6_is_dead(const struct fib6_info *rt)
3941 {
3942 	if (rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD ||
3943 	    (rt->fib6_nh.fib_nh_flags & RTNH_F_LINKDOWN &&
3944 	     ip6_ignore_linkdown(rt->fib6_nh.fib_nh_dev)))
3945 		return true;
3946 
3947 	return false;
3948 }
3949 
3950 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3951 {
3952 	struct fib6_info *iter;
3953 	int total = 0;
3954 
3955 	if (!rt6_is_dead(rt))
3956 		total += rt->fib6_nh.fib_nh_weight;
3957 
3958 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3959 		if (!rt6_is_dead(iter))
3960 			total += iter->fib6_nh.fib_nh_weight;
3961 	}
3962 
3963 	return total;
3964 }
3965 
3966 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
3967 {
3968 	int upper_bound = -1;
3969 
3970 	if (!rt6_is_dead(rt)) {
3971 		*weight += rt->fib6_nh.fib_nh_weight;
3972 		upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3973 						    total) - 1;
3974 	}
3975 	atomic_set(&rt->fib6_nh.fib_nh_upper_bound, upper_bound);
3976 }
3977 
3978 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3979 {
3980 	struct fib6_info *iter;
3981 	int weight = 0;
3982 
3983 	rt6_upper_bound_set(rt, &weight, total);
3984 
3985 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3986 		rt6_upper_bound_set(iter, &weight, total);
3987 }
3988 
3989 void rt6_multipath_rebalance(struct fib6_info *rt)
3990 {
3991 	struct fib6_info *first;
3992 	int total;
3993 
3994 	/* In case the entire multipath route was marked for flushing,
3995 	 * then there is no need to rebalance upon the removal of every
3996 	 * sibling route.
3997 	 */
3998 	if (!rt->fib6_nsiblings || rt->should_flush)
3999 		return;
4000 
4001 	/* During lookup routes are evaluated in order, so we need to
4002 	 * make sure upper bounds are assigned from the first sibling
4003 	 * onwards.
4004 	 */
4005 	first = rt6_multipath_first_sibling(rt);
4006 	if (WARN_ON_ONCE(!first))
4007 		return;
4008 
4009 	total = rt6_multipath_total_weight(first);
4010 	rt6_multipath_upper_bound_set(first, total);
4011 }
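
/*
 * Worked example of the upper-bound computation above (a minimal
 * user-space sketch, not part of the kernel build): with two live
 * nexthops of weights 1 and 3, total = 4, the cumulative bounds come
 * out as round((1 << 31) / 4) - 1 = 0x1fffffff and
 * round((4 << 31) / 4) - 1 = 0x7fffffff, so a 31-bit flow hash lands
 * on the first nexthop with probability 1/4 and on the second with
 * probability 3/4.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

/* Stand-alone mirror of rt6_upper_bound_set()'s arithmetic;
 * DIV_ROUND_CLOSEST_ULL() is round-to-nearest division. */
static int32_t upper_bound(int cumulative_weight, int total)
{
	uint64_t num = (uint64_t)cumulative_weight << 31;

	return (int32_t)((num + total / 2) / total) - 1;
}

int main(void)
{
	int weights[] = { 1, 3 }, total = 4, cum = 0;

	for (int i = 0; i < 2; i++) {
		cum += weights[i];
		printf("nh%d upper bound = 0x%08x\n", i,
		       (unsigned int)upper_bound(cum, total));
	}
	return 0;
}
#endif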
4012 
4013 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
4014 {
4015 	const struct arg_netdev_event *arg = p_arg;
4016 	struct net *net = dev_net(arg->dev);
4017 
4018 	if (rt != net->ipv6.fib6_null_entry &&
4019 	    rt->fib6_nh.fib_nh_dev == arg->dev) {
4020 		rt->fib6_nh.fib_nh_flags &= ~arg->nh_flags;
4021 		fib6_update_sernum_upto_root(net, rt);
4022 		rt6_multipath_rebalance(rt);
4023 	}
4024 
4025 	return 0;
4026 }
4027 
4028 void rt6_sync_up(struct net_device *dev, unsigned char nh_flags)
4029 {
4030 	struct arg_netdev_event arg = {
4031 		.dev = dev,
4032 		{
4033 			.nh_flags = nh_flags,
4034 		},
4035 	};
4036 
4037 	if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
4038 		arg.nh_flags |= RTNH_F_LINKDOWN;
4039 
4040 	fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
4041 }
4042 
4043 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
4044 				   const struct net_device *dev)
4045 {
4046 	struct fib6_info *iter;
4047 
4048 	if (rt->fib6_nh.fib_nh_dev == dev)
4049 		return true;
4050 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4051 		if (iter->fib6_nh.fib_nh_dev == dev)
4052 			return true;
4053 
4054 	return false;
4055 }
4056 
4057 static void rt6_multipath_flush(struct fib6_info *rt)
4058 {
4059 	struct fib6_info *iter;
4060 
4061 	rt->should_flush = 1;
4062 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4063 		iter->should_flush = 1;
4064 }
4065 
4066 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
4067 					     const struct net_device *down_dev)
4068 {
4069 	struct fib6_info *iter;
4070 	unsigned int dead = 0;
4071 
4072 	if (rt->fib6_nh.fib_nh_dev == down_dev ||
4073 	    rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
4074 		dead++;
4075 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4076 		if (iter->fib6_nh.fib_nh_dev == down_dev ||
4077 		    iter->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
4078 			dead++;
4079 
4080 	return dead;
4081 }
4082 
4083 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
4084 				       const struct net_device *dev,
4085 				       unsigned char nh_flags)
4086 {
4087 	struct fib6_info *iter;
4088 
4089 	if (rt->fib6_nh.fib_nh_dev == dev)
4090 		rt->fib6_nh.fib_nh_flags |= nh_flags;
4091 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4092 		if (iter->fib6_nh.fib_nh_dev == dev)
4093 			iter->fib6_nh.fib_nh_flags |= nh_flags;
4094 }
4095 
4096 /* Called with the write lock held for the table containing rt. */
4097 static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
4098 {
4099 	const struct arg_netdev_event *arg = p_arg;
4100 	const struct net_device *dev = arg->dev;
4101 	struct net *net = dev_net(dev);
4102 
4103 	if (rt == net->ipv6.fib6_null_entry)
4104 		return 0;
4105 
4106 	switch (arg->event) {
4107 	case NETDEV_UNREGISTER:
4108 		return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0;
4109 	case NETDEV_DOWN:
4110 		if (rt->should_flush)
4111 			return -1;
4112 		if (!rt->fib6_nsiblings)
4113 			return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0;
4114 		if (rt6_multipath_uses_dev(rt, dev)) {
4115 			unsigned int count;
4116 
4117 			count = rt6_multipath_dead_count(rt, dev);
4118 			if (rt->fib6_nsiblings + 1 == count) {
4119 				rt6_multipath_flush(rt);
4120 				return -1;
4121 			}
4122 			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
4123 						   RTNH_F_LINKDOWN);
4124 			fib6_update_sernum(net, rt);
4125 			rt6_multipath_rebalance(rt);
4126 		}
4127 		return -2;
4128 	case NETDEV_CHANGE:
4129 		if (rt->fib6_nh.fib_nh_dev != dev ||
4130 		    rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
4131 			break;
4132 		rt->fib6_nh.fib_nh_flags |= RTNH_F_LINKDOWN;
4133 		rt6_multipath_rebalance(rt);
4134 		break;
4135 	}
4136 
4137 	return 0;
4138 }
4139 
4140 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
4141 {
4142 	struct arg_netdev_event arg = {
4143 		.dev = dev,
4144 		{
4145 			.event = event,
4146 		},
4147 	};
4148 	struct net *net = dev_net(dev);
4149 
4150 	if (net->ipv6.sysctl.skip_notify_on_dev_down)
4151 		fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
4152 	else
4153 		fib6_clean_all(net, fib6_ifdown, &arg);
4154 }
4155 
4156 void rt6_disable_ip(struct net_device *dev, unsigned long event)
4157 {
4158 	rt6_sync_down_dev(dev, event);
4159 	rt6_uncached_list_flush_dev(dev_net(dev), dev);
4160 	neigh_ifdown(&nd_tbl, dev);
4161 }
4162 
4163 struct rt6_mtu_change_arg {
4164 	struct net_device *dev;
4165 	unsigned int mtu;
4166 };
4167 
4168 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
4169 {
4170 	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
4171 	struct inet6_dev *idev;
4172 
4173 	/* In IPv6, PMTU discovery is not optional,
4174 	   so the RTAX_MTU lock cannot disable it.
4175 	   We still use this lock to block changes
4176 	   caused by addrconf/ndisc.
4177 	*/
4178 
4179 	idev = __in6_dev_get(arg->dev);
4180 	if (!idev)
4181 		return 0;
4182 
4183 	/* An administrative MTU increase cannot be discovered by
4184 	   IPv6 PMTU discovery, so the PMTU must be updated here.
4185 	   Since RFC 1981 doesn't cover administrative MTU increases,
4186 	   updating the PMTU on an increase is a MUST (e.g. jumbo frames).
4187 	 */
4188 	if (rt->fib6_nh.fib_nh_dev == arg->dev &&
4189 	    !fib6_metric_locked(rt, RTAX_MTU)) {
4190 		u32 mtu = rt->fib6_pmtu;
4191 
4192 		if (mtu >= arg->mtu ||
4193 		    (mtu < arg->mtu && mtu == idev->cnf.mtu6))
4194 			fib6_metric_set(rt, RTAX_MTU, arg->mtu);
4195 
4196 		spin_lock_bh(&rt6_exception_lock);
4197 		rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
4198 		spin_unlock_bh(&rt6_exception_lock);
4199 	}
4200 	return 0;
4201 }
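
/*
 * Decision sketch for the RTAX_MTU update above (illustrative only):
 * the metric is rewritten when the stored PMTU would exceed the new
 * device MTU (a decrease must be honoured immediately), or when the
 * stored PMTU simply tracked the old device MTU (so an administrative
 * increase propagates).  A PMTU that was lowered below the old device
 * MTU by a genuine Packet-Too-Big is left alone.
 */
#if 0
#include <stdbool.h>

/* Hypothetical stand-alone mirror of the condition in
 * rt6_mtu_change_route(); the names are illustrative. */
static bool should_update_mtu(unsigned int stored, unsigned int new_mtu,
			      unsigned int old_dev_mtu)
{
	return stored >= new_mtu ||
	       (stored < new_mtu && stored == old_dev_mtu);
}
#endif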
4202 
4203 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4204 {
4205 	struct rt6_mtu_change_arg arg = {
4206 		.dev = dev,
4207 		.mtu = mtu,
4208 	};
4209 
4210 	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
4211 }
4212 
4213 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
4214 	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
4215 	[RTA_PREFSRC]		= { .len = sizeof(struct in6_addr) },
4216 	[RTA_OIF]               = { .type = NLA_U32 },
4217 	[RTA_IIF]		= { .type = NLA_U32 },
4218 	[RTA_PRIORITY]          = { .type = NLA_U32 },
4219 	[RTA_METRICS]           = { .type = NLA_NESTED },
4220 	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
4221 	[RTA_PREF]              = { .type = NLA_U8 },
4222 	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
4223 	[RTA_ENCAP]		= { .type = NLA_NESTED },
4224 	[RTA_EXPIRES]		= { .type = NLA_U32 },
4225 	[RTA_UID]		= { .type = NLA_U32 },
4226 	[RTA_MARK]		= { .type = NLA_U32 },
4227 	[RTA_TABLE]		= { .type = NLA_U32 },
4228 	[RTA_IP_PROTO]		= { .type = NLA_U8 },
4229 	[RTA_SPORT]		= { .type = NLA_U16 },
4230 	[RTA_DPORT]		= { .type = NLA_U16 },
4231 };
4232 
4233 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
4234 			      struct fib6_config *cfg,
4235 			      struct netlink_ext_ack *extack)
4236 {
4237 	struct rtmsg *rtm;
4238 	struct nlattr *tb[RTA_MAX+1];
4239 	unsigned int pref;
4240 	int err;
4241 
4242 	err = nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
4243 				     rtm_ipv6_policy, extack);
4244 	if (err < 0)
4245 		goto errout;
4246 
4247 	err = -EINVAL;
4248 	rtm = nlmsg_data(nlh);
4249 
4250 	*cfg = (struct fib6_config){
4251 		.fc_table = rtm->rtm_table,
4252 		.fc_dst_len = rtm->rtm_dst_len,
4253 		.fc_src_len = rtm->rtm_src_len,
4254 		.fc_flags = RTF_UP,
4255 		.fc_protocol = rtm->rtm_protocol,
4256 		.fc_type = rtm->rtm_type,
4257 
4258 		.fc_nlinfo.portid = NETLINK_CB(skb).portid,
4259 		.fc_nlinfo.nlh = nlh,
4260 		.fc_nlinfo.nl_net = sock_net(skb->sk),
4261 	};
4262 
4263 	if (rtm->rtm_type == RTN_UNREACHABLE ||
4264 	    rtm->rtm_type == RTN_BLACKHOLE ||
4265 	    rtm->rtm_type == RTN_PROHIBIT ||
4266 	    rtm->rtm_type == RTN_THROW)
4267 		cfg->fc_flags |= RTF_REJECT;
4268 
4269 	if (rtm->rtm_type == RTN_LOCAL)
4270 		cfg->fc_flags |= RTF_LOCAL;
4271 
4272 	if (rtm->rtm_flags & RTM_F_CLONED)
4273 		cfg->fc_flags |= RTF_CACHE;
4274 
4275 	cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4276 
4277 	if (tb[RTA_GATEWAY]) {
4278 		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4279 		cfg->fc_flags |= RTF_GATEWAY;
4280 	}
4281 	if (tb[RTA_VIA]) {
4282 		NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute");
4283 		goto errout;
4284 	}
4285 
4286 	if (tb[RTA_DST]) {
4287 		int plen = (rtm->rtm_dst_len + 7) >> 3;
4288 
4289 		if (nla_len(tb[RTA_DST]) < plen)
4290 			goto errout;
4291 
4292 		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
4293 	}
4294 
4295 	if (tb[RTA_SRC]) {
4296 		int plen = (rtm->rtm_src_len + 7) >> 3;
4297 
4298 		if (nla_len(tb[RTA_SRC]) < plen)
4299 			goto errout;
4300 
4301 		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
4302 	}
4303 
4304 	if (tb[RTA_PREFSRC])
4305 		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4306 
4307 	if (tb[RTA_OIF])
4308 		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4309 
4310 	if (tb[RTA_PRIORITY])
4311 		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4312 
4313 	if (tb[RTA_METRICS]) {
4314 		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4315 		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
4316 	}
4317 
4318 	if (tb[RTA_TABLE])
4319 		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4320 
4321 	if (tb[RTA_MULTIPATH]) {
4322 		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4323 		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4324 
4325 		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4326 						     cfg->fc_mp_len, extack);
4327 		if (err < 0)
4328 			goto errout;
4329 	}
4330 
4331 	if (tb[RTA_PREF]) {
4332 		pref = nla_get_u8(tb[RTA_PREF]);
4333 		if (pref != ICMPV6_ROUTER_PREF_LOW &&
4334 		    pref != ICMPV6_ROUTER_PREF_HIGH)
4335 			pref = ICMPV6_ROUTER_PREF_MEDIUM;
4336 		cfg->fc_flags |= RTF_PREF(pref);
4337 	}
4338 
4339 	if (tb[RTA_ENCAP])
4340 		cfg->fc_encap = tb[RTA_ENCAP];
4341 
4342 	if (tb[RTA_ENCAP_TYPE]) {
4343 		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4344 
4345 		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4346 		if (err < 0)
4347 			goto errout;
4348 	}
4349 
4350 	if (tb[RTA_EXPIRES]) {
4351 		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4352 
4353 		if (addrconf_finite_timeout(timeout)) {
4354 			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4355 			cfg->fc_flags |= RTF_EXPIRES;
4356 		}
4357 	}
4358 
4359 	err = 0;
4360 errout:
4361 	return err;
4362 }
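
/*
 * User-space counterpart sketch for the parser above (illustrative
 * only, assuming the standard uapi headers; the prefix, prefix length
 * and ifindex are hypothetical, and reading back the NLM_F_ACK reply
 * is elided): build an RTM_NEWROUTE request carrying RTA_DST and
 * RTA_OIF, the attributes rtm_to_fib6_config() maps to fc_dst and
 * fc_ifindex.
 */
#if 0
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <arpa/inet.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

static int add_v6_route(const char *dst, unsigned char plen, int oif)
{
	struct {
		struct nlmsghdr nh;
		struct rtmsg rtm;
		char attrs[64];
	} req = {0};
	struct sockaddr_nl kernel = { .nl_family = AF_NETLINK };
	struct rtattr *rta;
	int fd, ret;

	req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(req.rtm));
	req.nh.nlmsg_type = RTM_NEWROUTE;
	req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL | NLM_F_ACK;
	req.rtm.rtm_family = AF_INET6;
	req.rtm.rtm_dst_len = plen;
	req.rtm.rtm_table = RT_TABLE_MAIN;
	req.rtm.rtm_protocol = RTPROT_STATIC;
	req.rtm.rtm_scope = RT_SCOPE_UNIVERSE;
	req.rtm.rtm_type = RTN_UNICAST;

	/* RTA_DST: the 128-bit destination prefix */
	rta = (struct rtattr *)((char *)&req + NLMSG_ALIGN(req.nh.nlmsg_len));
	rta->rta_type = RTA_DST;
	rta->rta_len = RTA_LENGTH(16);
	if (inet_pton(AF_INET6, dst, RTA_DATA(rta)) != 1)
		return -1;
	req.nh.nlmsg_len = NLMSG_ALIGN(req.nh.nlmsg_len) + RTA_ALIGN(rta->rta_len);

	/* RTA_OIF: the output interface index */
	rta = (struct rtattr *)((char *)&req + NLMSG_ALIGN(req.nh.nlmsg_len));
	rta->rta_type = RTA_OIF;
	rta->rta_len = RTA_LENGTH(sizeof(int));
	memcpy(RTA_DATA(rta), &oif, sizeof(int));
	req.nh.nlmsg_len = NLMSG_ALIGN(req.nh.nlmsg_len) + RTA_ALIGN(rta->rta_len);

	fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	if (fd < 0)
		return -1;
	ret = sendto(fd, &req, req.nh.nlmsg_len, 0,
		     (struct sockaddr *)&kernel, sizeof(kernel)) < 0 ? -1 : 0;
	close(fd);
	return ret;
}
#endif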
4363 
4364 struct rt6_nh {
4365 	struct fib6_info *fib6_info;
4366 	struct fib6_config r_cfg;
4367 	struct list_head next;
4368 };
4369 
4370 static int ip6_route_info_append(struct net *net,
4371 				 struct list_head *rt6_nh_list,
4372 				 struct fib6_info *rt,
4373 				 struct fib6_config *r_cfg)
4374 {
4375 	struct rt6_nh *nh;
4376 	int err = -EEXIST;
4377 
4378 	list_for_each_entry(nh, rt6_nh_list, next) {
4379 		/* check if fib6_info already exists */
4380 		if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4381 			return err;
4382 	}
4383 
4384 	nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4385 	if (!nh)
4386 		return -ENOMEM;
4387 	nh->fib6_info = rt;
4388 	memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4389 	list_add_tail(&nh->next, rt6_nh_list);
4390 
4391 	return 0;
4392 }
4393 
4394 static void ip6_route_mpath_notify(struct fib6_info *rt,
4395 				   struct fib6_info *rt_last,
4396 				   struct nl_info *info,
4397 				   __u16 nlflags)
4398 {
4399 	/* If this is an APPEND route, rt points to the first route
4400 	 * inserted and rt_last to the last. Userspace wants a consistent
4401 	 * dump of the route that starts at the first nexthop. Since
4402 	 * sibling routes are always added at the end of the list, find
4403 	 * the first sibling of the last route appended.
4404 	 */
4405 	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4406 		rt = list_first_entry(&rt_last->fib6_siblings,
4407 				      struct fib6_info,
4408 				      fib6_siblings);
4409 	}
4410 
4411 	if (rt)
4412 		inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4413 }
4414 
4415 static int ip6_route_multipath_add(struct fib6_config *cfg,
4416 				   struct netlink_ext_ack *extack)
4417 {
4418 	struct fib6_info *rt_notif = NULL, *rt_last = NULL;
4419 	struct nl_info *info = &cfg->fc_nlinfo;
4420 	struct fib6_config r_cfg;
4421 	struct rtnexthop *rtnh;
4422 	struct fib6_info *rt;
4423 	struct rt6_nh *err_nh;
4424 	struct rt6_nh *nh, *nh_safe;
4425 	__u16 nlflags;
4426 	int remaining;
4427 	int attrlen;
4428 	int err = 1;
4429 	int nhn = 0;
4430 	int replace = (cfg->fc_nlinfo.nlh &&
4431 		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4432 	LIST_HEAD(rt6_nh_list);
4433 
4434 	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4435 	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4436 		nlflags |= NLM_F_APPEND;
4437 
4438 	remaining = cfg->fc_mp_len;
4439 	rtnh = (struct rtnexthop *)cfg->fc_mp;
4440 
4441 	/* Parse the multipath entries and build a list (rt6_nh_list)
4442 	 * with one fib6_info struct per nexthop.
4443 	 */
4444 	while (rtnh_ok(rtnh, remaining)) {
4445 		memcpy(&r_cfg, cfg, sizeof(*cfg));
4446 		if (rtnh->rtnh_ifindex)
4447 			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4448 
4449 		attrlen = rtnh_attrlen(rtnh);
4450 		if (attrlen > 0) {
4451 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4452 
4453 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4454 			if (nla) {
4455 				r_cfg.fc_gateway = nla_get_in6_addr(nla);
4456 				r_cfg.fc_flags |= RTF_GATEWAY;
4457 			}
4458 			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4459 			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4460 			if (nla)
4461 				r_cfg.fc_encap_type = nla_get_u16(nla);
4462 		}
4463 
4464 		r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4465 		rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
4466 		if (IS_ERR(rt)) {
4467 			err = PTR_ERR(rt);
4468 			rt = NULL;
4469 			goto cleanup;
4470 		}
4471 		if (!rt6_qualify_for_ecmp(rt)) {
4472 			err = -EINVAL;
4473 			NL_SET_ERR_MSG(extack,
4474 				       "Device only routes can not be added for IPv6 using the multipath API.");
4475 			fib6_info_release(rt);
4476 			goto cleanup;
4477 		}
4478 
4479 		rt->fib6_nh.fib_nh_weight = rtnh->rtnh_hops + 1;
4480 
4481 		err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
4482 					    rt, &r_cfg);
4483 		if (err) {
4484 			fib6_info_release(rt);
4485 			goto cleanup;
4486 		}
4487 
4488 		rtnh = rtnh_next(rtnh, &remaining);
4489 	}
4490 
4491 	/* For add and replace, send one notification with all nexthops:
4492 	 * skip the per-route notification in fib6_add_rt2node() and send
4493 	 * one with the full route when done.
4494 	 */
4495 	info->skip_notify = 1;
4496 
4497 	err_nh = NULL;
4498 	list_for_each_entry(nh, &rt6_nh_list, next) {
4499 		err = __ip6_ins_rt(nh->fib6_info, info, extack);
4500 		fib6_info_release(nh->fib6_info);
4501 
4502 		if (!err) {
4503 			/* save a reference to the last route successfully inserted */
4504 			rt_last = nh->fib6_info;
4505 
4506 			/* save reference to first route for notification */
4507 			if (!rt_notif)
4508 				rt_notif = nh->fib6_info;
4509 		}
4510 
4511 		/* nh->fib6_info has been used or freed at this point; reset it to NULL. */
4512 		nh->fib6_info = NULL;
4513 		if (err) {
4514 			if (replace && nhn)
4515 				NL_SET_ERR_MSG_MOD(extack,
4516 						   "multipath route replace failed (check consistency of installed routes)");
4517 			err_nh = nh;
4518 			goto add_errout;
4519 		}
4520 
4521 		/* Because each route is added like a single route, we remove
4522 		 * these flags after the first nexthop. If there is a
4523 		 * collision, we have already failed to add the first nexthop:
4524 		 * fib6_add_rt2node() has rejected it. When replacing, the
4525 		 * old nexthops have been replaced by the first new one; the
4526 		 * rest should be appended to it.
4527 		 */
4528 		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4529 						     NLM_F_REPLACE);
4530 		nhn++;
4531 	}
4532 
4533 	/* success ... tell user about new route */
4534 	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4535 	goto cleanup;
4536 
4537 add_errout:
4538 	/* Send a notification for the routes that were added, so that
4539 	 * the delete notifications sent by ip6_route_del() are
4540 	 * coherent.
4541 	 */
4542 	if (rt_notif)
4543 		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4544 
4545 	/* Delete routes that were already added */
4546 	list_for_each_entry(nh, &rt6_nh_list, next) {
4547 		if (err_nh == nh)
4548 			break;
4549 		ip6_route_del(&nh->r_cfg, extack);
4550 	}
4551 
4552 cleanup:
4553 	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4554 		if (nh->fib6_info)
4555 			fib6_info_release(nh->fib6_info);
4556 		list_del(&nh->next);
4557 		kfree(nh);
4558 	}
4559 
4560 	return err;
4561 }
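
/*
 * Layout sketch of the RTA_MULTIPATH payload the function above walks
 * (a minimal user-space reader using the uapi helpers; illustrative,
 * not part of this file): the attribute carries a packed array of
 * struct rtnexthop records, each optionally followed by its own nested
 * attributes such as RTA_GATEWAY, and RTNH_OK()/RTNH_NEXT() step
 * through it the same way rtnh_ok()/rtnh_next() do above.
 */
#if 0
#include <linux/rtnetlink.h>
#include <stdio.h>

static void dump_multipath(struct rtattr *mp)
{
	struct rtnexthop *rtnh = RTA_DATA(mp);
	int remaining = RTA_PAYLOAD(mp);

	while (RTNH_OK(rtnh, remaining)) {
		/* rtnh_hops stores weight - 1, mirroring the kernel */
		printf("nexthop: ifindex=%d weight=%d\n",
		       rtnh->rtnh_ifindex, rtnh->rtnh_hops + 1);
		remaining -= RTNH_ALIGN(rtnh->rtnh_len);
		rtnh = RTNH_NEXT(rtnh);
	}
}
#endif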
4562 
4563 static int ip6_route_multipath_del(struct fib6_config *cfg,
4564 				   struct netlink_ext_ack *extack)
4565 {
4566 	struct fib6_config r_cfg;
4567 	struct rtnexthop *rtnh;
4568 	int remaining;
4569 	int attrlen;
4570 	int err = 1, last_err = 0;
4571 
4572 	remaining = cfg->fc_mp_len;
4573 	rtnh = (struct rtnexthop *)cfg->fc_mp;
4574 
4575 	/* Parse a Multipath Entry */
4576 	while (rtnh_ok(rtnh, remaining)) {
4577 		memcpy(&r_cfg, cfg, sizeof(*cfg));
4578 		if (rtnh->rtnh_ifindex)
4579 			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4580 
4581 		attrlen = rtnh_attrlen(rtnh);
4582 		if (attrlen > 0) {
4583 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4584 
4585 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4586 			if (nla) {
4587 				nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4588 				r_cfg.fc_flags |= RTF_GATEWAY;
4589 			}
4590 		}
4591 		err = ip6_route_del(&r_cfg, extack);
4592 		if (err)
4593 			last_err = err;
4594 
4595 		rtnh = rtnh_next(rtnh, &remaining);
4596 	}
4597 
4598 	return last_err;
4599 }
4600 
4601 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4602 			      struct netlink_ext_ack *extack)
4603 {
4604 	struct fib6_config cfg;
4605 	int err;
4606 
4607 	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4608 	if (err < 0)
4609 		return err;
4610 
4611 	if (cfg.fc_mp)
4612 		return ip6_route_multipath_del(&cfg, extack);
4613 	else {
4614 		cfg.fc_delete_all_nh = 1;
4615 		return ip6_route_del(&cfg, extack);
4616 	}
4617 }
4618 
4619 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4620 			      struct netlink_ext_ack *extack)
4621 {
4622 	struct fib6_config cfg;
4623 	int err;
4624 
4625 	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4626 	if (err < 0)
4627 		return err;
4628 
4629 	if (cfg.fc_metric == 0)
4630 		cfg.fc_metric = IP6_RT_PRIO_USER;
4631 
4632 	if (cfg.fc_mp)
4633 		return ip6_route_multipath_add(&cfg, extack);
4634 	else
4635 		return ip6_route_add(&cfg, GFP_KERNEL, extack);
4636 }
4637 
4638 static size_t rt6_nlmsg_size(struct fib6_info *rt)
4639 {
4640 	int nexthop_len = 0;
4641 
4642 	if (rt->fib6_nsiblings) {
4643 		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
4644 			    + NLA_ALIGN(sizeof(struct rtnexthop))
4645 			    + nla_total_size(16) /* RTA_GATEWAY */
4646 			    + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws);
4647 
4648 		nexthop_len *= rt->fib6_nsiblings;
4649 	}
4650 
4651 	return NLMSG_ALIGN(sizeof(struct rtmsg))
4652 	       + nla_total_size(16) /* RTA_SRC */
4653 	       + nla_total_size(16) /* RTA_DST */
4654 	       + nla_total_size(16) /* RTA_GATEWAY */
4655 	       + nla_total_size(16) /* RTA_PREFSRC */
4656 	       + nla_total_size(4) /* RTA_TABLE */
4657 	       + nla_total_size(4) /* RTA_IIF */
4658 	       + nla_total_size(4) /* RTA_OIF */
4659 	       + nla_total_size(4) /* RTA_PRIORITY */
4660 	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4661 	       + nla_total_size(sizeof(struct rta_cacheinfo))
4662 	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4663 	       + nla_total_size(1) /* RTA_PREF */
4664 	       + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws)
4665 	       + nexthop_len;
4666 }
4667 
4668 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
4669 			 struct fib6_info *rt, struct dst_entry *dst,
4670 			 struct in6_addr *dest, struct in6_addr *src,
4671 			 int iif, int type, u32 portid, u32 seq,
4672 			 unsigned int flags)
4673 {
4674 	struct rt6_info *rt6 = (struct rt6_info *)dst;
4675 	struct rt6key *rt6_dst, *rt6_src;
4676 	u32 *pmetrics, table, rt6_flags;
4677 	struct nlmsghdr *nlh;
4678 	struct rtmsg *rtm;
4679 	long expires = 0;
4680 
4681 	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4682 	if (!nlh)
4683 		return -EMSGSIZE;
4684 
4685 	if (rt6) {
4686 		rt6_dst = &rt6->rt6i_dst;
4687 		rt6_src = &rt6->rt6i_src;
4688 		rt6_flags = rt6->rt6i_flags;
4689 	} else {
4690 		rt6_dst = &rt->fib6_dst;
4691 		rt6_src = &rt->fib6_src;
4692 		rt6_flags = rt->fib6_flags;
4693 	}
4694 
4695 	rtm = nlmsg_data(nlh);
4696 	rtm->rtm_family = AF_INET6;
4697 	rtm->rtm_dst_len = rt6_dst->plen;
4698 	rtm->rtm_src_len = rt6_src->plen;
4699 	rtm->rtm_tos = 0;
4700 	if (rt->fib6_table)
4701 		table = rt->fib6_table->tb6_id;
4702 	else
4703 		table = RT6_TABLE_UNSPEC;
4704 	rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT;
4705 	if (nla_put_u32(skb, RTA_TABLE, table))
4706 		goto nla_put_failure;
4707 
4708 	rtm->rtm_type = rt->fib6_type;
4709 	rtm->rtm_flags = 0;
4710 	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4711 	rtm->rtm_protocol = rt->fib6_protocol;
4712 
4713 	if (rt6_flags & RTF_CACHE)
4714 		rtm->rtm_flags |= RTM_F_CLONED;
4715 
4716 	if (dest) {
4717 		if (nla_put_in6_addr(skb, RTA_DST, dest))
4718 			goto nla_put_failure;
4719 		rtm->rtm_dst_len = 128;
4720 	} else if (rtm->rtm_dst_len)
4721 		if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
4722 			goto nla_put_failure;
4723 #ifdef CONFIG_IPV6_SUBTREES
4724 	if (src) {
4725 		if (nla_put_in6_addr(skb, RTA_SRC, src))
4726 			goto nla_put_failure;
4727 		rtm->rtm_src_len = 128;
4728 	} else if (rtm->rtm_src_len &&
4729 		   nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
4730 		goto nla_put_failure;
4731 #endif
4732 	if (iif) {
4733 #ifdef CONFIG_IPV6_MROUTE
4734 		if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
4735 			int err = ip6mr_get_route(net, skb, rtm, portid);
4736 
4737 			if (err == 0)
4738 				return 0;
4739 			if (err < 0)
4740 				goto nla_put_failure;
4741 		} else
4742 #endif
4743 			if (nla_put_u32(skb, RTA_IIF, iif))
4744 				goto nla_put_failure;
4745 	} else if (dest) {
4746 		struct in6_addr saddr_buf;
4747 		if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
4748 		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4749 			goto nla_put_failure;
4750 	}
4751 
4752 	if (rt->fib6_prefsrc.plen) {
4753 		struct in6_addr saddr_buf;
4754 		saddr_buf = rt->fib6_prefsrc.addr;
4755 		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4756 			goto nla_put_failure;
4757 	}
4758 
4759 	pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
4760 	if (rtnetlink_put_metrics(skb, pmetrics) < 0)
4761 		goto nla_put_failure;
4762 
4763 	if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
4764 		goto nla_put_failure;
4765 
4766 	/* For multipath routes, walk the siblings list and add
4767 	 * each as a nexthop within RTA_MULTIPATH.
4768 	 */
4769 	if (rt6) {
4770 		if (rt6_flags & RTF_GATEWAY &&
4771 		    nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
4772 			goto nla_put_failure;
4773 
4774 		if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
4775 			goto nla_put_failure;
4776 	} else if (rt->fib6_nsiblings) {
4777 		struct fib6_info *sibling, *next_sibling;
4778 		struct nlattr *mp;
4779 
4780 		mp = nla_nest_start_noflag(skb, RTA_MULTIPATH);
4781 		if (!mp)
4782 			goto nla_put_failure;
4783 
4784 		if (fib_add_nexthop(skb, &rt->fib6_nh.nh_common,
4785 				    rt->fib6_nh.fib_nh_weight) < 0)
4786 			goto nla_put_failure;
4787 
4788 		list_for_each_entry_safe(sibling, next_sibling,
4789 					 &rt->fib6_siblings, fib6_siblings) {
4790 			if (fib_add_nexthop(skb, &sibling->fib6_nh.nh_common,
4791 					    sibling->fib6_nh.fib_nh_weight) < 0)
4792 				goto nla_put_failure;
4793 		}
4794 
4795 		nla_nest_end(skb, mp);
4796 	} else {
4797 		unsigned char nh_flags = 0;
4798 
4799 		if (fib_nexthop_info(skb, &rt->fib6_nh.nh_common,
4800 				     &nh_flags, false) < 0)
4801 			goto nla_put_failure;
4802 
4803 		rtm->rtm_flags |= nh_flags;
4804 	}
4805 
4806 	if (rt6_flags & RTF_EXPIRES) {
4807 		expires = dst ? dst->expires : rt->expires;
4808 		expires -= jiffies;
4809 	}
4810 
4811 	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
4812 		goto nla_put_failure;
4813 
4814 	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
4815 		goto nla_put_failure;
4816 
4818 	nlmsg_end(skb, nlh);
4819 	return 0;
4820 
4821 nla_put_failure:
4822 	nlmsg_cancel(skb, nlh);
4823 	return -EMSGSIZE;
4824 }
4825 
4826 static bool fib6_info_uses_dev(const struct fib6_info *f6i,
4827 			       const struct net_device *dev)
4828 {
4829 	if (f6i->fib6_nh.fib_nh_dev == dev)
4830 		return true;
4831 
4832 	if (f6i->fib6_nsiblings) {
4833 		struct fib6_info *sibling, *next_sibling;
4834 
4835 		list_for_each_entry_safe(sibling, next_sibling,
4836 					 &f6i->fib6_siblings, fib6_siblings) {
4837 			if (sibling->fib6_nh.fib_nh_dev == dev)
4838 				return true;
4839 		}
4840 	}
4841 
4842 	return false;
4843 }
4844 
4845 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4846 {
4847 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4848 	struct fib_dump_filter *filter = &arg->filter;
4849 	unsigned int flags = NLM_F_MULTI;
4850 	struct net *net = arg->net;
4851 
4852 	if (rt == net->ipv6.fib6_null_entry)
4853 		return 0;
4854 
4855 	if ((filter->flags & RTM_F_PREFIX) &&
4856 	    !(rt->fib6_flags & RTF_PREFIX_RT)) {
4857 		/* success since this is not a prefix route */
4858 		return 1;
4859 	}
4860 	if (filter->filter_set) {
4861 		if ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
4862 		    (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
4863 		    (filter->protocol && rt->fib6_protocol != filter->protocol)) {
4864 			return 1;
4865 		}
4866 		flags |= NLM_F_DUMP_FILTERED;
4867 	}
4868 
4869 	return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4870 			     RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4871 			     arg->cb->nlh->nlmsg_seq, flags);
4872 }
4873 
4874 static int inet6_rtm_valid_getroute_req(struct sk_buff *skb,
4875 					const struct nlmsghdr *nlh,
4876 					struct nlattr **tb,
4877 					struct netlink_ext_ack *extack)
4878 {
4879 	struct rtmsg *rtm;
4880 	int i, err;
4881 
4882 	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
4883 		NL_SET_ERR_MSG_MOD(extack,
4884 				   "Invalid header for get route request");
4885 		return -EINVAL;
4886 	}
4887 
4888 	if (!netlink_strict_get_check(skb))
4889 		return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
4890 					      rtm_ipv6_policy, extack);
4891 
4892 	rtm = nlmsg_data(nlh);
4893 	if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) ||
4894 	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) ||
4895 	    rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope ||
4896 	    rtm->rtm_type) {
4897 		NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request");
4898 		return -EINVAL;
4899 	}
4900 	if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) {
4901 		NL_SET_ERR_MSG_MOD(extack,
4902 				   "Invalid flags for get route request");
4903 		return -EINVAL;
4904 	}
4905 
4906 	err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
4907 					    rtm_ipv6_policy, extack);
4908 	if (err)
4909 		return err;
4910 
4911 	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
4912 	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
4913 		NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6");
4914 		return -EINVAL;
4915 	}
4916 
4917 	for (i = 0; i <= RTA_MAX; i++) {
4918 		if (!tb[i])
4919 			continue;
4920 
4921 		switch (i) {
4922 		case RTA_SRC:
4923 		case RTA_DST:
4924 		case RTA_IIF:
4925 		case RTA_OIF:
4926 		case RTA_MARK:
4927 		case RTA_UID:
4928 		case RTA_SPORT:
4929 		case RTA_DPORT:
4930 		case RTA_IP_PROTO:
4931 			break;
4932 		default:
4933 			NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request");
4934 			return -EINVAL;
4935 		}
4936 	}
4937 
4938 	return 0;
4939 }
4940 
4941 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4942 			      struct netlink_ext_ack *extack)
4943 {
4944 	struct net *net = sock_net(in_skb->sk);
4945 	struct nlattr *tb[RTA_MAX+1];
4946 	int err, iif = 0, oif = 0;
4947 	struct fib6_info *from;
4948 	struct dst_entry *dst;
4949 	struct rt6_info *rt;
4950 	struct sk_buff *skb;
4951 	struct rtmsg *rtm;
4952 	struct flowi6 fl6 = {};
4953 	bool fibmatch;
4954 
4955 	err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
4956 	if (err < 0)
4957 		goto errout;
4958 
4959 	err = -EINVAL;
4960 	rtm = nlmsg_data(nlh);
4961 	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4962 	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4963 
4964 	if (tb[RTA_SRC]) {
4965 		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4966 			goto errout;
4967 
4968 		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4969 	}
4970 
4971 	if (tb[RTA_DST]) {
4972 		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4973 			goto errout;
4974 
4975 		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4976 	}
4977 
4978 	if (tb[RTA_IIF])
4979 		iif = nla_get_u32(tb[RTA_IIF]);
4980 
4981 	if (tb[RTA_OIF])
4982 		oif = nla_get_u32(tb[RTA_OIF]);
4983 
4984 	if (tb[RTA_MARK])
4985 		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4986 
4987 	if (tb[RTA_UID])
4988 		fl6.flowi6_uid = make_kuid(current_user_ns(),
4989 					   nla_get_u32(tb[RTA_UID]));
4990 	else
4991 		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4992 
4993 	if (tb[RTA_SPORT])
4994 		fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);
4995 
4996 	if (tb[RTA_DPORT])
4997 		fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);
4998 
4999 	if (tb[RTA_IP_PROTO]) {
5000 		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
5001 						  &fl6.flowi6_proto, AF_INET6,
5002 						  extack);
5003 		if (err)
5004 			goto errout;
5005 	}
5006 
5007 	if (iif) {
5008 		struct net_device *dev;
5009 		int flags = 0;
5010 
5011 		rcu_read_lock();
5012 
5013 		dev = dev_get_by_index_rcu(net, iif);
5014 		if (!dev) {
5015 			rcu_read_unlock();
5016 			err = -ENODEV;
5017 			goto errout;
5018 		}
5019 
5020 		fl6.flowi6_iif = iif;
5021 
5022 		if (!ipv6_addr_any(&fl6.saddr))
5023 			flags |= RT6_LOOKUP_F_HAS_SADDR;
5024 
5025 		dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
5026 
5027 		rcu_read_unlock();
5028 	} else {
5029 		fl6.flowi6_oif = oif;
5030 
5031 		dst = ip6_route_output(net, NULL, &fl6);
5032 	}
5033 
5035 	rt = container_of(dst, struct rt6_info, dst);
5036 	if (rt->dst.error) {
5037 		err = rt->dst.error;
5038 		ip6_rt_put(rt);
5039 		goto errout;
5040 	}
5041 
5042 	if (rt == net->ipv6.ip6_null_entry) {
5043 		err = rt->dst.error;
5044 		ip6_rt_put(rt);
5045 		goto errout;
5046 	}
5047 
5048 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
5049 	if (!skb) {
5050 		ip6_rt_put(rt);
5051 		err = -ENOBUFS;
5052 		goto errout;
5053 	}
5054 
5055 	skb_dst_set(skb, &rt->dst);
5056 
5057 	rcu_read_lock();
5058 	from = rcu_dereference(rt->from);
5059 	if (from) {
5060 		if (fibmatch)
5061 			err = rt6_fill_node(net, skb, from, NULL, NULL, NULL,
5062 					    iif, RTM_NEWROUTE,
5063 					    NETLINK_CB(in_skb).portid,
5064 					    nlh->nlmsg_seq, 0);
5065 		else
5066 			err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
5067 					    &fl6.saddr, iif, RTM_NEWROUTE,
5068 					    NETLINK_CB(in_skb).portid,
5069 					    nlh->nlmsg_seq, 0);
5070 	} else {
5071 		err = -ENETUNREACH;
5072 	}
5073 	rcu_read_unlock();
5074 
5075 	if (err < 0) {
5076 		kfree_skb(skb);
5077 		goto errout;
5078 	}
5079 
5080 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
5081 errout:
5082 	return err;
5083 }
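
/*
 * Usage sketch for the handler above (the commands are the iproute2
 * equivalents, shown for illustration with a hypothetical address): a
 * plain lookup resolves the flow and reports the (possibly cloned)
 * result, while RTM_F_FIB_MATCH asks for the FIB entry that matched
 * instead.
 *
 *	ip -6 route get 2001:db8::1
 *	ip -6 route get fibmatch 2001:db8::1
 *
 * The second form corresponds to fibmatch == true, which makes
 * rt6_fill_node() dump "from" (the fib6_info) rather than the dst.
 */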
5084 
5085 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
5086 		     unsigned int nlm_flags)
5087 {
5088 	struct sk_buff *skb;
5089 	struct net *net = info->nl_net;
5090 	u32 seq;
5091 	int err;
5092 
5093 	err = -ENOBUFS;
5094 	seq = info->nlh ? info->nlh->nlmsg_seq : 0;
5095 
5096 	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
5097 	if (!skb)
5098 		goto errout;
5099 
5100 	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
5101 			    event, info->portid, seq, nlm_flags);
5102 	if (err < 0) {
5103 		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
5104 		WARN_ON(err == -EMSGSIZE);
5105 		kfree_skb(skb);
5106 		goto errout;
5107 	}
5108 	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
5109 		    info->nlh, gfp_any());
5110 	return;
5111 errout:
5112 	if (err < 0)
5113 		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
5114 }
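
/*
 * Listener sketch for the notifications sent above (user-space,
 * illustrative): binding an AF_NETLINK socket with RTMGRP_IPV6_ROUTE
 * in nl_groups receives the RTM_NEWROUTE/RTM_DELROUTE messages that
 * rtnl_notify() multicasts to RTNLGRP_IPV6_ROUTE.
 */
#if 0
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

static int open_route_monitor(void)
{
	struct sockaddr_nl sa;
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

	if (fd < 0)
		return -1;

	memset(&sa, 0, sizeof(sa));
	sa.nl_family = AF_NETLINK;
	sa.nl_groups = RTMGRP_IPV6_ROUTE;	/* legacy bitmask group */
	if (bind(fd, (struct sockaddr *)&sa, sizeof(sa)) < 0) {
		close(fd);
		return -1;
	}
	return fd;	/* recv() now yields RTM_NEWROUTE/RTM_DELROUTE */
}
#endif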
5115 
5116 static int ip6_route_dev_notify(struct notifier_block *this,
5117 				unsigned long event, void *ptr)
5118 {
5119 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
5120 	struct net *net = dev_net(dev);
5121 
5122 	if (!(dev->flags & IFF_LOOPBACK))
5123 		return NOTIFY_OK;
5124 
5125 	if (event == NETDEV_REGISTER) {
5126 		net->ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = dev;
5127 		net->ipv6.ip6_null_entry->dst.dev = dev;
5128 		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
5129 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5130 		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
5131 		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
5132 		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
5133 		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
5134 #endif
5135 	} else if (event == NETDEV_UNREGISTER &&
5136 		    dev->reg_state != NETREG_UNREGISTERED) {
5137 		/* NETDEV_UNREGISTER can be fired multiple times by
5138 		 * netdev_wait_allrefs(). Make sure we only do this once.
5139 		 */
5140 		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
5141 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5142 		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
5143 		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
5144 #endif
5145 	}
5146 
5147 	return NOTIFY_OK;
5148 }
5149 
5150 /*
5151  *	/proc
5152  */
5153 
5154 #ifdef CONFIG_PROC_FS
5155 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
5156 {
5157 	struct net *net = (struct net *)seq->private;

5158 	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
5159 		   net->ipv6.rt6_stats->fib_nodes,
5160 		   net->ipv6.rt6_stats->fib_route_nodes,
5161 		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
5162 		   net->ipv6.rt6_stats->fib_rt_entries,
5163 		   net->ipv6.rt6_stats->fib_rt_cache,
5164 		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
5165 		   net->ipv6.rt6_stats->fib_discarded_routes);
5166 
5167 	return 0;
5168 }
5169 #endif	/* CONFIG_PROC_FS */
5170 
5171 #ifdef CONFIG_SYSCTL
5172 
5173 static
5174 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
5175 			      void __user *buffer, size_t *lenp, loff_t *ppos)
5176 {
5177 	struct net *net;
5178 	int delay;
5179 	int ret;

5180 	if (!write)
5181 		return -EINVAL;
5182 
5183 	net = (struct net *)ctl->extra1;
5184 	delay = net->ipv6.sysctl.flush_delay;
5185 	ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
5186 	if (ret)
5187 		return ret;
5188 
5189 	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
5190 	return 0;
5191 }
5192 
5193 static int zero;
5194 static int one = 1;
5195 
5196 static struct ctl_table ipv6_route_table_template[] = {
5197 	{
5198 		.procname	=	"flush",
5199 		.data		=	&init_net.ipv6.sysctl.flush_delay,
5200 		.maxlen		=	sizeof(int),
5201 		.mode		=	0200,
5202 		.proc_handler	=	ipv6_sysctl_rtcache_flush
5203 	},
5204 	{
5205 		.procname	=	"gc_thresh",
5206 		.data		=	&ip6_dst_ops_template.gc_thresh,
5207 		.maxlen		=	sizeof(int),
5208 		.mode		=	0644,
5209 		.proc_handler	=	proc_dointvec,
5210 	},
5211 	{
5212 		.procname	=	"max_size",
5213 		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
5214 		.maxlen		=	sizeof(int),
5215 		.mode		=	0644,
5216 		.proc_handler	=	proc_dointvec,
5217 	},
5218 	{
5219 		.procname	=	"gc_min_interval",
5220 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5221 		.maxlen		=	sizeof(int),
5222 		.mode		=	0644,
5223 		.proc_handler	=	proc_dointvec_jiffies,
5224 	},
5225 	{
5226 		.procname	=	"gc_timeout",
5227 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
5228 		.maxlen		=	sizeof(int),
5229 		.mode		=	0644,
5230 		.proc_handler	=	proc_dointvec_jiffies,
5231 	},
5232 	{
5233 		.procname	=	"gc_interval",
5234 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
5235 		.maxlen		=	sizeof(int),
5236 		.mode		=	0644,
5237 		.proc_handler	=	proc_dointvec_jiffies,
5238 	},
5239 	{
5240 		.procname	=	"gc_elasticity",
5241 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
5242 		.maxlen		=	sizeof(int),
5243 		.mode		=	0644,
5244 		.proc_handler	=	proc_dointvec,
5245 	},
5246 	{
5247 		.procname	=	"mtu_expires",
5248 		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
5249 		.maxlen		=	sizeof(int),
5250 		.mode		=	0644,
5251 		.proc_handler	=	proc_dointvec_jiffies,
5252 	},
5253 	{
5254 		.procname	=	"min_adv_mss",
5255 		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
5256 		.maxlen		=	sizeof(int),
5257 		.mode		=	0644,
5258 		.proc_handler	=	proc_dointvec,
5259 	},
5260 	{
5261 		.procname	=	"gc_min_interval_ms",
5262 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5263 		.maxlen		=	sizeof(int),
5264 		.mode		=	0644,
5265 		.proc_handler	=	proc_dointvec_ms_jiffies,
5266 	},
5267 	{
5268 		.procname	=	"skip_notify_on_dev_down",
5269 		.data		=	&init_net.ipv6.sysctl.skip_notify_on_dev_down,
5270 		.maxlen		=	sizeof(int),
5271 		.mode		=	0644,
5272 		.proc_handler	=	proc_dointvec,
5273 		.extra1		=	&zero,
5274 		.extra2		=	&one,
5275 	},
5276 	{ }
5277 };
5278 
5279 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
5280 {
5281 	struct ctl_table *table;
5282 
5283 	table = kmemdup(ipv6_route_table_template,
5284 			sizeof(ipv6_route_table_template),
5285 			GFP_KERNEL);
5286 
5287 	if (table) {
5288 		table[0].data = &net->ipv6.sysctl.flush_delay;
5289 		table[0].extra1 = net;
5290 		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
5291 		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
5292 		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5293 		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
5294 		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
5295 		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
5296 		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
5297 		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
5298 		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5299 		table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;
5300 
5301 		/* Don't export sysctls to unprivileged users */
5302 		if (net->user_ns != &init_user_ns)
5303 			table[0].procname = NULL;
5304 	}
5305 
5306 	return table;
5307 }
5308 #endif
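
/*
 * The template above surfaces under /proc/sys/net/ipv6/route/.  A
 * minimal user-space sketch (illustrative; only basic error handling)
 * that pokes the write-only "flush" entry, which lands in
 * ipv6_sysctl_rtcache_flush() and runs fib6_run_gc():
 */
#if 0
#include <fcntl.h>
#include <unistd.h>

static int flush_ipv6_routes(void)
{
	int fd = open("/proc/sys/net/ipv6/route/flush", O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, "1", 1) != 1) {
		close(fd);
		return -1;
	}
	return close(fd);
}
#endif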
5309 
5310 static int __net_init ip6_route_net_init(struct net *net)
5311 {
5312 	int ret = -ENOMEM;
5313 
5314 	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
5315 	       sizeof(net->ipv6.ip6_dst_ops));
5316 
5317 	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
5318 		goto out_ip6_dst_ops;
5319 
5320 	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
5321 					    sizeof(*net->ipv6.fib6_null_entry),
5322 					    GFP_KERNEL);
5323 	if (!net->ipv6.fib6_null_entry)
5324 		goto out_ip6_dst_entries;
5325 
5326 	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
5327 					   sizeof(*net->ipv6.ip6_null_entry),
5328 					   GFP_KERNEL);
5329 	if (!net->ipv6.ip6_null_entry)
5330 		goto out_fib6_null_entry;
5331 	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5332 	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
5333 			 ip6_template_metrics, true);
5334 
5335 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5336 	net->ipv6.fib6_has_custom_rules = false;
5337 	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
5338 					       sizeof(*net->ipv6.ip6_prohibit_entry),
5339 					       GFP_KERNEL);
5340 	if (!net->ipv6.ip6_prohibit_entry)
5341 		goto out_ip6_null_entry;
5342 	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5343 	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
5344 			 ip6_template_metrics, true);
5345 
5346 	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
5347 					       sizeof(*net->ipv6.ip6_blk_hole_entry),
5348 					       GFP_KERNEL);
5349 	if (!net->ipv6.ip6_blk_hole_entry)
5350 		goto out_ip6_prohibit_entry;
5351 	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5352 	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
5353 			 ip6_template_metrics, true);
5354 #endif
5355 
5356 	net->ipv6.sysctl.flush_delay = 0;
5357 	net->ipv6.sysctl.ip6_rt_max_size = 4096;
5358 	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
5359 	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
5360 	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
5361 	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
5362 	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
5363 	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
5364 	net->ipv6.sysctl.skip_notify_on_dev_down = 0;
5365 
5366 	net->ipv6.ip6_rt_gc_expire = 30*HZ;
5367 
5368 	ret = 0;
5369 out:
5370 	return ret;
5371 
5372 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5373 out_ip6_prohibit_entry:
5374 	kfree(net->ipv6.ip6_prohibit_entry);
5375 out_ip6_null_entry:
5376 	kfree(net->ipv6.ip6_null_entry);
5377 #endif
5378 out_fib6_null_entry:
5379 	kfree(net->ipv6.fib6_null_entry);
5380 out_ip6_dst_entries:
5381 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5382 out_ip6_dst_ops:
5383 	goto out;
5384 }
5385 
5386 static void __net_exit ip6_route_net_exit(struct net *net)
5387 {
5388 	kfree(net->ipv6.fib6_null_entry);
5389 	kfree(net->ipv6.ip6_null_entry);
5390 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5391 	kfree(net->ipv6.ip6_prohibit_entry);
5392 	kfree(net->ipv6.ip6_blk_hole_entry);
5393 #endif
5394 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5395 }
5396 
5397 static int __net_init ip6_route_net_init_late(struct net *net)
5398 {
5399 #ifdef CONFIG_PROC_FS
5400 	proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
5401 			sizeof(struct ipv6_route_iter));
5402 	proc_create_net_single("rt6_stats", 0444, net->proc_net,
5403 			rt6_stats_seq_show, NULL);
5404 #endif
5405 	return 0;
5406 }
5407 
5408 static void __net_exit ip6_route_net_exit_late(struct net *net)
5409 {
5410 #ifdef CONFIG_PROC_FS
5411 	remove_proc_entry("ipv6_route", net->proc_net);
5412 	remove_proc_entry("rt6_stats", net->proc_net);
5413 #endif
5414 }
5415 
5416 static struct pernet_operations ip6_route_net_ops = {
5417 	.init = ip6_route_net_init,
5418 	.exit = ip6_route_net_exit,
5419 };
5420 
5421 static int __net_init ipv6_inetpeer_init(struct net *net)
5422 {
5423 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5424 
5425 	if (!bp)
5426 		return -ENOMEM;
5427 	inet_peer_base_init(bp);
5428 	net->ipv6.peers = bp;
5429 	return 0;
5430 }
5431 
5432 static void __net_exit ipv6_inetpeer_exit(struct net *net)
5433 {
5434 	struct inet_peer_base *bp = net->ipv6.peers;
5435 
5436 	net->ipv6.peers = NULL;
5437 	inetpeer_invalidate_tree(bp);
5438 	kfree(bp);
5439 }
5440 
5441 static struct pernet_operations ipv6_inetpeer_ops = {
5442 	.init	=	ipv6_inetpeer_init,
5443 	.exit	=	ipv6_inetpeer_exit,
5444 };
5445 
5446 static struct pernet_operations ip6_route_net_late_ops = {
5447 	.init = ip6_route_net_init_late,
5448 	.exit = ip6_route_net_exit_late,
5449 };
5450 
5451 static struct notifier_block ip6_route_dev_notifier = {
5452 	.notifier_call = ip6_route_dev_notify,
5453 	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
5454 };
5455 
5456 void __init ip6_route_init_special_entries(void)
5457 {
5458 	/* The loopback device is registered before this runs, so the
5459 	 * loopback reference in rt6_info is not taken automatically;
5460 	 * take it manually for init_net. */
5461 	init_net.ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = init_net.loopback_dev;
5462 	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
5463 	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5464 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5465 	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
5466 	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5467 	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
5468 	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5469 #endif
5470 }
5471 
5472 int __init ip6_route_init(void)
5473 {
5474 	int ret;
5475 	int cpu;
5476 
5477 	ret = -ENOMEM;
5478 	ip6_dst_ops_template.kmem_cachep =
5479 		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
5480 				  SLAB_HWCACHE_ALIGN, NULL);
5481 	if (!ip6_dst_ops_template.kmem_cachep)
5482 		goto out;
5483 
5484 	ret = dst_entries_init(&ip6_dst_blackhole_ops);
5485 	if (ret)
5486 		goto out_kmem_cache;
5487 
5488 	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
5489 	if (ret)
5490 		goto out_dst_entries;
5491 
5492 	ret = register_pernet_subsys(&ip6_route_net_ops);
5493 	if (ret)
5494 		goto out_register_inetpeer;
5495 
5496 	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
5497 
5498 	ret = fib6_init();
5499 	if (ret)
5500 		goto out_register_subsys;
5501 
5502 	ret = xfrm6_init();
5503 	if (ret)
5504 		goto out_fib6_init;
5505 
5506 	ret = fib6_rules_init();
5507 	if (ret)
5508 		goto xfrm6_init;
5509 
5510 	ret = register_pernet_subsys(&ip6_route_net_late_ops);
5511 	if (ret)
5512 		goto fib6_rules_init;
5513 
5514 	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
5515 				   inet6_rtm_newroute, NULL, 0);
5516 	if (ret < 0)
5517 		goto out_register_late_subsys;
5518 
5519 	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
5520 				   inet6_rtm_delroute, NULL, 0);
5521 	if (ret < 0)
5522 		goto out_register_late_subsys;
5523 
5524 	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
5525 				   inet6_rtm_getroute, NULL,
5526 				   RTNL_FLAG_DOIT_UNLOCKED);
5527 	if (ret < 0)
5528 		goto out_register_late_subsys;
5529 
5530 	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
5531 	if (ret)
5532 		goto out_register_late_subsys;
5533 
5534 	for_each_possible_cpu(cpu) {
5535 		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
5536 
5537 		INIT_LIST_HEAD(&ul->head);
5538 		spin_lock_init(&ul->lock);
5539 	}
5540 
5541 out:
5542 	return ret;
5543 
5544 out_register_late_subsys:
5545 	rtnl_unregister_all(PF_INET6);
5546 	unregister_pernet_subsys(&ip6_route_net_late_ops);
5547 fib6_rules_init:
5548 	fib6_rules_cleanup();
5549 xfrm6_init:
5550 	xfrm6_fini();
5551 out_fib6_init:
5552 	fib6_gc_cleanup();
5553 out_register_subsys:
5554 	unregister_pernet_subsys(&ip6_route_net_ops);
5555 out_register_inetpeer:
5556 	unregister_pernet_subsys(&ipv6_inetpeer_ops);
5557 out_dst_entries:
5558 	dst_entries_destroy(&ip6_dst_blackhole_ops);
5559 out_kmem_cache:
5560 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5561 	goto out;
5562 }
5563 
5564 void ip6_route_cleanup(void)
5565 {
5566 	unregister_netdevice_notifier(&ip6_route_dev_notifier);
5567 	unregister_pernet_subsys(&ip6_route_net_late_ops);
5568 	fib6_rules_cleanup();
5569 	xfrm6_fini();
5570 	fib6_gc_cleanup();
5571 	unregister_pernet_subsys(&ipv6_inetpeer_ops);
5572 	unregister_pernet_subsys(&ip6_route_net_ops);
5573 	dst_entries_destroy(&ip6_dst_blackhole_ops);
5574 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5575 }
5576