xref: /openbmc/linux/net/ipv6/route.c (revision b7019ac5)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  *	Linux INET6 implementation
4  *	FIB front-end.
5  *
6  *	Authors:
7  *	Pedro Roque		<roque@di.fc.ul.pt>
8  */
9 
10 /*	Changes:
11  *
12  *	YOSHIFUJI Hideaki @USAGI
13  *		reworked default router selection.
14  *		- respect outgoing interface
15  *		- select from (probably) reachable routers (i.e.
16  *		routers in REACHABLE, STALE, DELAY or PROBE states).
17  *		- always select the same router if it is (probably)
18  *		reachable.  otherwise, round-robin the list.
19  *	Ville Nuorvala
20  *		Fixed routing subtrees.
21  */
22 
23 #define pr_fmt(fmt) "IPv6: " fmt
24 
25 #include <linux/capability.h>
26 #include <linux/errno.h>
27 #include <linux/export.h>
28 #include <linux/types.h>
29 #include <linux/times.h>
30 #include <linux/socket.h>
31 #include <linux/sockios.h>
32 #include <linux/net.h>
33 #include <linux/route.h>
34 #include <linux/netdevice.h>
35 #include <linux/in6.h>
36 #include <linux/mroute6.h>
37 #include <linux/init.h>
38 #include <linux/if_arp.h>
39 #include <linux/proc_fs.h>
40 #include <linux/seq_file.h>
41 #include <linux/nsproxy.h>
42 #include <linux/slab.h>
43 #include <linux/jhash.h>
44 #include <net/net_namespace.h>
45 #include <net/snmp.h>
46 #include <net/ipv6.h>
47 #include <net/ip6_fib.h>
48 #include <net/ip6_route.h>
49 #include <net/ndisc.h>
50 #include <net/addrconf.h>
51 #include <net/tcp.h>
52 #include <linux/rtnetlink.h>
53 #include <net/dst.h>
54 #include <net/dst_metadata.h>
55 #include <net/xfrm.h>
56 #include <net/netevent.h>
57 #include <net/netlink.h>
58 #include <net/rtnh.h>
59 #include <net/lwtunnel.h>
60 #include <net/ip_tunnels.h>
61 #include <net/l3mdev.h>
62 #include <net/ip.h>
63 #include <linux/uaccess.h>
64 
65 #ifdef CONFIG_SYSCTL
66 #include <linux/sysctl.h>
67 #endif
68 
69 static int ip6_rt_type_to_error(u8 fib6_type);
70 
71 #define CREATE_TRACE_POINTS
72 #include <trace/events/fib6.h>
73 EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
74 #undef CREATE_TRACE_POINTS
75 
76 enum rt6_nud_state {
77 	RT6_NUD_FAIL_HARD = -3,
78 	RT6_NUD_FAIL_PROBE = -2,
79 	RT6_NUD_FAIL_DO_RR = -1,
80 	RT6_NUD_SUCCEED = 1
81 };
82 
83 static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
84 static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
85 static unsigned int	 ip6_mtu(const struct dst_entry *dst);
86 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
87 static void		ip6_dst_destroy(struct dst_entry *);
88 static void		ip6_dst_ifdown(struct dst_entry *,
89 				       struct net_device *dev, int how);
90 static int		 ip6_dst_gc(struct dst_ops *ops);
91 
92 static int		ip6_pkt_discard(struct sk_buff *skb);
93 static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
94 static int		ip6_pkt_prohibit(struct sk_buff *skb);
95 static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
96 static void		ip6_link_failure(struct sk_buff *skb);
97 static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
98 					   struct sk_buff *skb, u32 mtu);
99 static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
100 					struct sk_buff *skb);
101 static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
102 			   int strict);
103 static size_t rt6_nlmsg_size(struct fib6_info *rt);
104 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
105 			 struct fib6_info *rt, struct dst_entry *dst,
106 			 struct in6_addr *dest, struct in6_addr *src,
107 			 int iif, int type, u32 portid, u32 seq,
108 			 unsigned int flags);
109 static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
110 					   const struct in6_addr *daddr,
111 					   const struct in6_addr *saddr);
112 
113 #ifdef CONFIG_IPV6_ROUTE_INFO
114 static struct fib6_info *rt6_add_route_info(struct net *net,
115 					   const struct in6_addr *prefix, int prefixlen,
116 					   const struct in6_addr *gwaddr,
117 					   struct net_device *dev,
118 					   unsigned int pref);
119 static struct fib6_info *rt6_get_route_info(struct net *net,
120 					   const struct in6_addr *prefix, int prefixlen,
121 					   const struct in6_addr *gwaddr,
122 					   struct net_device *dev);
123 #endif
124 
125 struct uncached_list {
126 	spinlock_t		lock;
127 	struct list_head	head;
128 };
129 
130 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
131 
132 void rt6_uncached_list_add(struct rt6_info *rt)
133 {
134 	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
135 
136 	rt->rt6i_uncached_list = ul;
137 
138 	spin_lock_bh(&ul->lock);
139 	list_add_tail(&rt->rt6i_uncached, &ul->head);
140 	spin_unlock_bh(&ul->lock);
141 }
142 
143 void rt6_uncached_list_del(struct rt6_info *rt)
144 {
145 	if (!list_empty(&rt->rt6i_uncached)) {
146 		struct uncached_list *ul = rt->rt6i_uncached_list;
147 		struct net *net = dev_net(rt->dst.dev);
148 
149 		spin_lock_bh(&ul->lock);
150 		list_del(&rt->rt6i_uncached);
151 		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
152 		spin_unlock_bh(&ul->lock);
153 	}
154 }
155 
156 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
157 {
158 	struct net_device *loopback_dev = net->loopback_dev;
159 	int cpu;
160 
161 	if (dev == loopback_dev)
162 		return;
163 
164 	for_each_possible_cpu(cpu) {
165 		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
166 		struct rt6_info *rt;
167 
168 		spin_lock_bh(&ul->lock);
169 		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
170 			struct inet6_dev *rt_idev = rt->rt6i_idev;
171 			struct net_device *rt_dev = rt->dst.dev;
172 
173 			if (rt_idev->dev == dev) {
174 				rt->rt6i_idev = in6_dev_get(loopback_dev);
175 				in6_dev_put(rt_idev);
176 			}
177 
178 			if (rt_dev == dev) {
179 				rt->dst.dev = loopback_dev;
180 				dev_hold(rt->dst.dev);
181 				dev_put(rt_dev);
182 			}
183 		}
184 		spin_unlock_bh(&ul->lock);
185 	}
186 }
187 
188 static inline const void *choose_neigh_daddr(const struct in6_addr *p,
189 					     struct sk_buff *skb,
190 					     const void *daddr)
191 {
192 	if (!ipv6_addr_any(p))
193 		return (const void *) p;
194 	else if (skb)
195 		return &ipv6_hdr(skb)->daddr;
196 	return daddr;
197 }
198 
199 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
200 				   struct net_device *dev,
201 				   struct sk_buff *skb,
202 				   const void *daddr)
203 {
204 	struct neighbour *n;
205 
206 	daddr = choose_neigh_daddr(gw, skb, daddr);
207 	n = __ipv6_neigh_lookup(dev, daddr);
208 	if (n)
209 		return n;
210 
211 	n = neigh_create(&nd_tbl, daddr, dev);
212 	return IS_ERR(n) ? NULL : n;
213 }
214 
215 static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
216 					      struct sk_buff *skb,
217 					      const void *daddr)
218 {
219 	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);
220 
221 	return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
222 }
223 
224 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
225 {
226 	struct net_device *dev = dst->dev;
227 	struct rt6_info *rt = (struct rt6_info *)dst;
228 
229 	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
230 	if (!daddr)
231 		return;
232 	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
233 		return;
234 	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
235 		return;
236 	__ipv6_confirm_neigh(dev, daddr);
237 }
238 
239 static struct dst_ops ip6_dst_ops_template = {
240 	.family			=	AF_INET6,
241 	.gc			=	ip6_dst_gc,
242 	.gc_thresh		=	1024,
243 	.check			=	ip6_dst_check,
244 	.default_advmss		=	ip6_default_advmss,
245 	.mtu			=	ip6_mtu,
246 	.cow_metrics		=	dst_cow_metrics_generic,
247 	.destroy		=	ip6_dst_destroy,
248 	.ifdown			=	ip6_dst_ifdown,
249 	.negative_advice	=	ip6_negative_advice,
250 	.link_failure		=	ip6_link_failure,
251 	.update_pmtu		=	ip6_rt_update_pmtu,
252 	.redirect		=	rt6_do_redirect,
253 	.local_out		=	__ip6_local_out,
254 	.neigh_lookup		=	ip6_dst_neigh_lookup,
255 	.confirm_neigh		=	ip6_confirm_neigh,
256 };
257 
258 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
259 {
260 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
261 
262 	return mtu ? : dst->dev->mtu;
263 }
264 
265 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
266 					 struct sk_buff *skb, u32 mtu)
267 {
268 }
269 
270 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
271 				      struct sk_buff *skb)
272 {
273 }
274 
275 static struct dst_ops ip6_dst_blackhole_ops = {
276 	.family			=	AF_INET6,
277 	.destroy		=	ip6_dst_destroy,
278 	.check			=	ip6_dst_check,
279 	.mtu			=	ip6_blackhole_mtu,
280 	.default_advmss		=	ip6_default_advmss,
281 	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
282 	.redirect		=	ip6_rt_blackhole_redirect,
283 	.cow_metrics		=	dst_cow_metrics_generic,
284 	.neigh_lookup		=	ip6_dst_neigh_lookup,
285 };
286 
287 static const u32 ip6_template_metrics[RTAX_MAX] = {
288 	[RTAX_HOPLIMIT - 1] = 0,
289 };
290 
291 static const struct fib6_info fib6_null_entry_template = {
292 	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
293 	.fib6_protocol  = RTPROT_KERNEL,
294 	.fib6_metric	= ~(u32)0,
295 	.fib6_ref	= REFCOUNT_INIT(1),
296 	.fib6_type	= RTN_UNREACHABLE,
297 	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
298 };
299 
300 static const struct rt6_info ip6_null_entry_template = {
301 	.dst = {
302 		.__refcnt	= ATOMIC_INIT(1),
303 		.__use		= 1,
304 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
305 		.error		= -ENETUNREACH,
306 		.input		= ip6_pkt_discard,
307 		.output		= ip6_pkt_discard_out,
308 	},
309 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
310 };
311 
312 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
313 
314 static const struct rt6_info ip6_prohibit_entry_template = {
315 	.dst = {
316 		.__refcnt	= ATOMIC_INIT(1),
317 		.__use		= 1,
318 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
319 		.error		= -EACCES,
320 		.input		= ip6_pkt_prohibit,
321 		.output		= ip6_pkt_prohibit_out,
322 	},
323 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
324 };
325 
326 static const struct rt6_info ip6_blk_hole_entry_template = {
327 	.dst = {
328 		.__refcnt	= ATOMIC_INIT(1),
329 		.__use		= 1,
330 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
331 		.error		= -EINVAL,
332 		.input		= dst_discard,
333 		.output		= dst_discard_out,
334 	},
335 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
336 };
337 
338 #endif
339 
340 static void rt6_info_init(struct rt6_info *rt)
341 {
342 	struct dst_entry *dst = &rt->dst;
343 
344 	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
345 	INIT_LIST_HEAD(&rt->rt6i_uncached);
346 }
347 
348 /* allocate dst with ip6_dst_ops */
349 struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
350 			       int flags)
351 {
352 	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
353 					1, DST_OBSOLETE_FORCE_CHK, flags);
354 
355 	if (rt) {
356 		rt6_info_init(rt);
357 		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
358 	}
359 
360 	return rt;
361 }
362 EXPORT_SYMBOL(ip6_dst_alloc);
363 
364 static void ip6_dst_destroy(struct dst_entry *dst)
365 {
366 	struct rt6_info *rt = (struct rt6_info *)dst;
367 	struct fib6_info *from;
368 	struct inet6_dev *idev;
369 
370 	ip_dst_metrics_put(dst);
371 	rt6_uncached_list_del(rt);
372 
373 	idev = rt->rt6i_idev;
374 	if (idev) {
375 		rt->rt6i_idev = NULL;
376 		in6_dev_put(idev);
377 	}
378 
379 	from = xchg((__force struct fib6_info **)&rt->from, NULL);
380 	fib6_info_release(from);
381 }
382 
383 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
384 			   int how)
385 {
386 	struct rt6_info *rt = (struct rt6_info *)dst;
387 	struct inet6_dev *idev = rt->rt6i_idev;
388 	struct net_device *loopback_dev =
389 		dev_net(dev)->loopback_dev;
390 
391 	if (idev && idev->dev != loopback_dev) {
392 		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
393 		if (loopback_idev) {
394 			rt->rt6i_idev = loopback_idev;
395 			in6_dev_put(idev);
396 		}
397 	}
398 }
399 
400 static bool __rt6_check_expired(const struct rt6_info *rt)
401 {
402 	if (rt->rt6i_flags & RTF_EXPIRES)
403 		return time_after(jiffies, rt->dst.expires);
404 	else
405 		return false;
406 }
407 
408 static bool rt6_check_expired(const struct rt6_info *rt)
409 {
410 	struct fib6_info *from;
411 
412 	from = rcu_dereference(rt->from);
413 
414 	if (rt->rt6i_flags & RTF_EXPIRES) {
415 		if (time_after(jiffies, rt->dst.expires))
416 			return true;
417 	} else if (from) {
418 		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
419 			fib6_check_expired(from);
420 	}
421 	return false;
422 }
423 
424 void fib6_select_path(const struct net *net, struct fib6_result *res,
425 		      struct flowi6 *fl6, int oif, bool have_oif_match,
426 		      const struct sk_buff *skb, int strict)
427 {
428 	struct fib6_info *sibling, *next_sibling;
429 	struct fib6_info *match = res->f6i;
430 
431 	if (!match->fib6_nsiblings || have_oif_match)
432 		goto out;
433 
434 	/* We might have already computed the hash for ICMPv6 errors. In such
435 	 * case it will always be non-zero. Otherwise now is the time to do it.
436 	 */
437 	if (!fl6->mp_hash)
438 		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
439 
440 	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.fib_nh_upper_bound))
441 		goto out;
442 
443 	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
444 				 fib6_siblings) {
445 		const struct fib6_nh *nh = &sibling->fib6_nh;
446 		int nh_upper_bound;
447 
448 		nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound);
449 		if (fl6->mp_hash > nh_upper_bound)
450 			continue;
451 		if (rt6_score_route(nh, sibling->fib6_flags, oif, strict) < 0)
452 			break;
453 		match = sibling;
454 		break;
455 	}
456 
457 out:
458 	res->f6i = match;
459 	res->nh = &match->fib6_nh;
460 }
461 
462 /*
463  *	Route lookup. rcu_read_lock() should be held.
464  */
465 
466 static bool __rt6_device_match(struct net *net, const struct fib6_nh *nh,
467 			       const struct in6_addr *saddr, int oif, int flags)
468 {
469 	const struct net_device *dev;
470 
471 	if (nh->fib_nh_flags & RTNH_F_DEAD)
472 		return false;
473 
474 	dev = nh->fib_nh_dev;
475 	if (oif) {
476 		if (dev->ifindex == oif)
477 			return true;
478 	} else {
479 		if (ipv6_chk_addr(net, saddr, dev,
480 				  flags & RT6_LOOKUP_F_IFACE))
481 			return true;
482 	}
483 
484 	return false;
485 }
486 
487 static void rt6_device_match(struct net *net, struct fib6_result *res,
488 			     const struct in6_addr *saddr, int oif, int flags)
489 {
490 	struct fib6_info *f6i = res->f6i;
491 	struct fib6_info *spf6i;
492 	struct fib6_nh *nh;
493 
494 	if (!oif && ipv6_addr_any(saddr)) {
495 		nh = &f6i->fib6_nh;
496 		if (!(nh->fib_nh_flags & RTNH_F_DEAD))
497 			goto out;
498 	}
499 
500 	for (spf6i = f6i; spf6i; spf6i = rcu_dereference(spf6i->fib6_next)) {
501 		nh = &spf6i->fib6_nh;
502 		if (__rt6_device_match(net, nh, saddr, oif, flags)) {
503 			res->f6i = spf6i;
504 			goto out;
505 		}
506 	}
507 
508 	if (oif && flags & RT6_LOOKUP_F_IFACE) {
509 		res->f6i = net->ipv6.fib6_null_entry;
510 		nh = &res->f6i->fib6_nh;
511 		goto out;
512 	}
513 
514 	nh = &f6i->fib6_nh;
515 	if (nh->fib_nh_flags & RTNH_F_DEAD) {
516 		res->f6i = net->ipv6.fib6_null_entry;
517 		nh = &res->f6i->fib6_nh;
518 	}
519 out:
520 	res->nh = nh;
521 	res->fib6_type = res->f6i->fib6_type;
522 	res->fib6_flags = res->f6i->fib6_flags;
523 }
524 
525 #ifdef CONFIG_IPV6_ROUTER_PREF
526 struct __rt6_probe_work {
527 	struct work_struct work;
528 	struct in6_addr target;
529 	struct net_device *dev;
530 };
531 
532 static void rt6_probe_deferred(struct work_struct *w)
533 {
534 	struct in6_addr mcaddr;
535 	struct __rt6_probe_work *work =
536 		container_of(w, struct __rt6_probe_work, work);
537 
538 	addrconf_addr_solict_mult(&work->target, &mcaddr);
539 	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
540 	dev_put(work->dev);
541 	kfree(work);
542 }
543 
544 static void rt6_probe(struct fib6_nh *fib6_nh)
545 {
546 	struct __rt6_probe_work *work = NULL;
547 	const struct in6_addr *nh_gw;
548 	struct neighbour *neigh;
549 	struct net_device *dev;
550 	struct inet6_dev *idev;
551 
552 	/*
553 	 * Okay, this does not seem to be appropriate
554 	 * for now, however, we need to check if it
555 	 * is really so; aka Router Reachability Probing.
556 	 *
557 	 * Router Reachability Probe MUST be rate-limited
558 	 * to no more than one per minute.
559 	 */
560 	if (fib6_nh->fib_nh_gw_family)
561 		return;
562 
563 	nh_gw = &fib6_nh->fib_nh_gw6;
564 	dev = fib6_nh->fib_nh_dev;
565 	rcu_read_lock_bh();
566 	idev = __in6_dev_get(dev);
567 	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
568 	if (neigh) {
569 		if (neigh->nud_state & NUD_VALID)
570 			goto out;
571 
572 		write_lock(&neigh->lock);
573 		if (!(neigh->nud_state & NUD_VALID) &&
574 		    time_after(jiffies,
575 			       neigh->updated + idev->cnf.rtr_probe_interval)) {
576 			work = kmalloc(sizeof(*work), GFP_ATOMIC);
577 			if (work)
578 				__neigh_set_probe_once(neigh);
579 		}
580 		write_unlock(&neigh->lock);
581 	} else if (time_after(jiffies, fib6_nh->last_probe +
582 				       idev->cnf.rtr_probe_interval)) {
583 		work = kmalloc(sizeof(*work), GFP_ATOMIC);
584 	}
585 
586 	if (work) {
587 		fib6_nh->last_probe = jiffies;
588 		INIT_WORK(&work->work, rt6_probe_deferred);
589 		work->target = *nh_gw;
590 		dev_hold(dev);
591 		work->dev = dev;
592 		schedule_work(&work->work);
593 	}
594 
595 out:
596 	rcu_read_unlock_bh();
597 }
598 #else
599 static inline void rt6_probe(struct fib6_nh *fib6_nh)
600 {
601 }
602 #endif
603 
604 /*
605  * Default Router Selection (RFC 2461 6.3.6)
606  */
607 static enum rt6_nud_state rt6_check_neigh(const struct fib6_nh *fib6_nh)
608 {
609 	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
610 	struct neighbour *neigh;
611 
612 	rcu_read_lock_bh();
613 	neigh = __ipv6_neigh_lookup_noref(fib6_nh->fib_nh_dev,
614 					  &fib6_nh->fib_nh_gw6);
615 	if (neigh) {
616 		read_lock(&neigh->lock);
617 		if (neigh->nud_state & NUD_VALID)
618 			ret = RT6_NUD_SUCCEED;
619 #ifdef CONFIG_IPV6_ROUTER_PREF
620 		else if (!(neigh->nud_state & NUD_FAILED))
621 			ret = RT6_NUD_SUCCEED;
622 		else
623 			ret = RT6_NUD_FAIL_PROBE;
624 #endif
625 		read_unlock(&neigh->lock);
626 	} else {
627 		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
628 		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
629 	}
630 	rcu_read_unlock_bh();
631 
632 	return ret;
633 }
634 
635 static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
636 			   int strict)
637 {
638 	int m = 0;
639 
640 	if (!oif || nh->fib_nh_dev->ifindex == oif)
641 		m = 2;
642 
643 	if (!m && (strict & RT6_LOOKUP_F_IFACE))
644 		return RT6_NUD_FAIL_HARD;
645 #ifdef CONFIG_IPV6_ROUTER_PREF
646 	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(fib6_flags)) << 2;
647 #endif
648 	if ((strict & RT6_LOOKUP_F_REACHABLE) &&
649 	    !(fib6_flags & RTF_NONEXTHOP) && nh->fib_nh_gw_family) {
650 		int n = rt6_check_neigh(nh);
651 		if (n < 0)
652 			return n;
653 	}
654 	return m;
655 }
656 
657 static bool find_match(struct fib6_nh *nh, u32 fib6_flags,
658 		       int oif, int strict, int *mpri, bool *do_rr)
659 {
660 	bool match_do_rr = false;
661 	bool rc = false;
662 	int m;
663 
664 	if (nh->fib_nh_flags & RTNH_F_DEAD)
665 		goto out;
666 
667 	if (ip6_ignore_linkdown(nh->fib_nh_dev) &&
668 	    nh->fib_nh_flags & RTNH_F_LINKDOWN &&
669 	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
670 		goto out;
671 
672 	m = rt6_score_route(nh, fib6_flags, oif, strict);
673 	if (m == RT6_NUD_FAIL_DO_RR) {
674 		match_do_rr = true;
675 		m = 0; /* lowest valid score */
676 	} else if (m == RT6_NUD_FAIL_HARD) {
677 		goto out;
678 	}
679 
680 	if (strict & RT6_LOOKUP_F_REACHABLE)
681 		rt6_probe(nh);
682 
683 	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
684 	if (m > *mpri) {
685 		*do_rr = match_do_rr;
686 		*mpri = m;
687 		rc = true;
688 	}
689 out:
690 	return rc;
691 }
692 
693 static void __find_rr_leaf(struct fib6_info *f6i_start,
694 			   struct fib6_info *nomatch, u32 metric,
695 			   struct fib6_result *res, struct fib6_info **cont,
696 			   int oif, int strict, bool *do_rr, int *mpri)
697 {
698 	struct fib6_info *f6i;
699 
700 	for (f6i = f6i_start;
701 	     f6i && f6i != nomatch;
702 	     f6i = rcu_dereference(f6i->fib6_next)) {
703 		struct fib6_nh *nh;
704 
705 		if (cont && f6i->fib6_metric != metric) {
706 			*cont = f6i;
707 			return;
708 		}
709 
710 		if (fib6_check_expired(f6i))
711 			continue;
712 
713 		nh = &f6i->fib6_nh;
714 		if (find_match(nh, f6i->fib6_flags, oif, strict, mpri, do_rr)) {
715 			res->f6i = f6i;
716 			res->nh = nh;
717 			res->fib6_flags = f6i->fib6_flags;
718 			res->fib6_type = f6i->fib6_type;
719 		}
720 	}
721 }
722 
723 static void find_rr_leaf(struct fib6_node *fn, struct fib6_info *leaf,
724 			 struct fib6_info *rr_head, int oif, int strict,
725 			 bool *do_rr, struct fib6_result *res)
726 {
727 	u32 metric = rr_head->fib6_metric;
728 	struct fib6_info *cont = NULL;
729 	int mpri = -1;
730 
731 	__find_rr_leaf(rr_head, NULL, metric, res, &cont,
732 		       oif, strict, do_rr, &mpri);
733 
734 	__find_rr_leaf(leaf, rr_head, metric, res, &cont,
735 		       oif, strict, do_rr, &mpri);
736 
737 	if (res->f6i || !cont)
738 		return;
739 
740 	__find_rr_leaf(cont, NULL, metric, res, NULL,
741 		       oif, strict, do_rr, &mpri);
742 }
743 
744 static void rt6_select(struct net *net, struct fib6_node *fn, int oif,
745 		       struct fib6_result *res, int strict)
746 {
747 	struct fib6_info *leaf = rcu_dereference(fn->leaf);
748 	struct fib6_info *rt0;
749 	bool do_rr = false;
750 	int key_plen;
751 
752 	/* make sure this function or its helpers sets f6i */
753 	res->f6i = NULL;
754 
755 	if (!leaf || leaf == net->ipv6.fib6_null_entry)
756 		goto out;
757 
758 	rt0 = rcu_dereference(fn->rr_ptr);
759 	if (!rt0)
760 		rt0 = leaf;
761 
762 	/* Double check to make sure fn is not an intermediate node
763 	 * and fn->leaf does not points to its child's leaf
764 	 * (This might happen if all routes under fn are deleted from
765 	 * the tree and fib6_repair_tree() is called on the node.)
766 	 */
767 	key_plen = rt0->fib6_dst.plen;
768 #ifdef CONFIG_IPV6_SUBTREES
769 	if (rt0->fib6_src.plen)
770 		key_plen = rt0->fib6_src.plen;
771 #endif
772 	if (fn->fn_bit != key_plen)
773 		goto out;
774 
775 	find_rr_leaf(fn, leaf, rt0, oif, strict, &do_rr, res);
776 	if (do_rr) {
777 		struct fib6_info *next = rcu_dereference(rt0->fib6_next);
778 
779 		/* no entries matched; do round-robin */
780 		if (!next || next->fib6_metric != rt0->fib6_metric)
781 			next = leaf;
782 
783 		if (next != rt0) {
784 			spin_lock_bh(&leaf->fib6_table->tb6_lock);
785 			/* make sure next is not being deleted from the tree */
786 			if (next->fib6_node)
787 				rcu_assign_pointer(fn->rr_ptr, next);
788 			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
789 		}
790 	}
791 
792 out:
793 	if (!res->f6i) {
794 		res->f6i = net->ipv6.fib6_null_entry;
795 		res->nh = &res->f6i->fib6_nh;
796 		res->fib6_flags = res->f6i->fib6_flags;
797 		res->fib6_type = res->f6i->fib6_type;
798 	}
799 }
800 
801 static bool rt6_is_gw_or_nonexthop(const struct fib6_result *res)
802 {
803 	return (res->f6i->fib6_flags & RTF_NONEXTHOP) ||
804 	       res->nh->fib_nh_gw_family;
805 }
806 
807 #ifdef CONFIG_IPV6_ROUTE_INFO
808 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
809 		  const struct in6_addr *gwaddr)
810 {
811 	struct net *net = dev_net(dev);
812 	struct route_info *rinfo = (struct route_info *) opt;
813 	struct in6_addr prefix_buf, *prefix;
814 	unsigned int pref;
815 	unsigned long lifetime;
816 	struct fib6_info *rt;
817 
818 	if (len < sizeof(struct route_info)) {
819 		return -EINVAL;
820 	}
821 
822 	/* Sanity check for prefix_len and length */
823 	if (rinfo->length > 3) {
824 		return -EINVAL;
825 	} else if (rinfo->prefix_len > 128) {
826 		return -EINVAL;
827 	} else if (rinfo->prefix_len > 64) {
828 		if (rinfo->length < 2) {
829 			return -EINVAL;
830 		}
831 	} else if (rinfo->prefix_len > 0) {
832 		if (rinfo->length < 1) {
833 			return -EINVAL;
834 		}
835 	}
836 
837 	pref = rinfo->route_pref;
838 	if (pref == ICMPV6_ROUTER_PREF_INVALID)
839 		return -EINVAL;
840 
841 	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
842 
843 	if (rinfo->length == 3)
844 		prefix = (struct in6_addr *)rinfo->prefix;
845 	else {
846 		/* this function is safe */
847 		ipv6_addr_prefix(&prefix_buf,
848 				 (struct in6_addr *)rinfo->prefix,
849 				 rinfo->prefix_len);
850 		prefix = &prefix_buf;
851 	}
852 
853 	if (rinfo->prefix_len == 0)
854 		rt = rt6_get_dflt_router(net, gwaddr, dev);
855 	else
856 		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
857 					gwaddr, dev);
858 
859 	if (rt && !lifetime) {
860 		ip6_del_rt(net, rt);
861 		rt = NULL;
862 	}
863 
864 	if (!rt && lifetime)
865 		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
866 					dev, pref);
867 	else if (rt)
868 		rt->fib6_flags = RTF_ROUTEINFO |
869 				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
870 
871 	if (rt) {
872 		if (!addrconf_finite_timeout(lifetime))
873 			fib6_clean_expires(rt);
874 		else
875 			fib6_set_expires(rt, jiffies + HZ * lifetime);
876 
877 		fib6_info_release(rt);
878 	}
879 	return 0;
880 }
881 #endif
882 
883 /*
884  *	Misc support functions
885  */
886 
887 /* called with rcu_lock held */
888 static struct net_device *ip6_rt_get_dev_rcu(const struct fib6_result *res)
889 {
890 	struct net_device *dev = res->nh->fib_nh_dev;
891 
892 	if (res->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
893 		/* for copies of local routes, dst->dev needs to be the
894 		 * device if it is a master device, the master device if
895 		 * device is enslaved, and the loopback as the default
896 		 */
897 		if (netif_is_l3_slave(dev) &&
898 		    !rt6_need_strict(&res->f6i->fib6_dst.addr))
899 			dev = l3mdev_master_dev_rcu(dev);
900 		else if (!netif_is_l3_master(dev))
901 			dev = dev_net(dev)->loopback_dev;
902 		/* last case is netif_is_l3_master(dev) is true in which
903 		 * case we want dev returned to be dev
904 		 */
905 	}
906 
907 	return dev;
908 }
909 
910 static const int fib6_prop[RTN_MAX + 1] = {
911 	[RTN_UNSPEC]	= 0,
912 	[RTN_UNICAST]	= 0,
913 	[RTN_LOCAL]	= 0,
914 	[RTN_BROADCAST]	= 0,
915 	[RTN_ANYCAST]	= 0,
916 	[RTN_MULTICAST]	= 0,
917 	[RTN_BLACKHOLE]	= -EINVAL,
918 	[RTN_UNREACHABLE] = -EHOSTUNREACH,
919 	[RTN_PROHIBIT]	= -EACCES,
920 	[RTN_THROW]	= -EAGAIN,
921 	[RTN_NAT]	= -EINVAL,
922 	[RTN_XRESOLVE]	= -EINVAL,
923 };
924 
925 static int ip6_rt_type_to_error(u8 fib6_type)
926 {
927 	return fib6_prop[fib6_type];
928 }
929 
930 static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
931 {
932 	unsigned short flags = 0;
933 
934 	if (rt->dst_nocount)
935 		flags |= DST_NOCOUNT;
936 	if (rt->dst_nopolicy)
937 		flags |= DST_NOPOLICY;
938 	if (rt->dst_host)
939 		flags |= DST_HOST;
940 
941 	return flags;
942 }
943 
944 static void ip6_rt_init_dst_reject(struct rt6_info *rt, u8 fib6_type)
945 {
946 	rt->dst.error = ip6_rt_type_to_error(fib6_type);
947 
948 	switch (fib6_type) {
949 	case RTN_BLACKHOLE:
950 		rt->dst.output = dst_discard_out;
951 		rt->dst.input = dst_discard;
952 		break;
953 	case RTN_PROHIBIT:
954 		rt->dst.output = ip6_pkt_prohibit_out;
955 		rt->dst.input = ip6_pkt_prohibit;
956 		break;
957 	case RTN_THROW:
958 	case RTN_UNREACHABLE:
959 	default:
960 		rt->dst.output = ip6_pkt_discard_out;
961 		rt->dst.input = ip6_pkt_discard;
962 		break;
963 	}
964 }
965 
966 static void ip6_rt_init_dst(struct rt6_info *rt, const struct fib6_result *res)
967 {
968 	struct fib6_info *f6i = res->f6i;
969 
970 	if (res->fib6_flags & RTF_REJECT) {
971 		ip6_rt_init_dst_reject(rt, res->fib6_type);
972 		return;
973 	}
974 
975 	rt->dst.error = 0;
976 	rt->dst.output = ip6_output;
977 
978 	if (res->fib6_type == RTN_LOCAL || res->fib6_type == RTN_ANYCAST) {
979 		rt->dst.input = ip6_input;
980 	} else if (ipv6_addr_type(&f6i->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
981 		rt->dst.input = ip6_mc_input;
982 	} else {
983 		rt->dst.input = ip6_forward;
984 	}
985 
986 	if (res->nh->fib_nh_lws) {
987 		rt->dst.lwtstate = lwtstate_get(res->nh->fib_nh_lws);
988 		lwtunnel_set_redirect(&rt->dst);
989 	}
990 
991 	rt->dst.lastuse = jiffies;
992 }
993 
994 /* Caller must already hold reference to @from */
995 static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
996 {
997 	rt->rt6i_flags &= ~RTF_EXPIRES;
998 	rcu_assign_pointer(rt->from, from);
999 	ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
1000 }
1001 
1002 /* Caller must already hold reference to f6i in result */
1003 static void ip6_rt_copy_init(struct rt6_info *rt, const struct fib6_result *res)
1004 {
1005 	const struct fib6_nh *nh = res->nh;
1006 	const struct net_device *dev = nh->fib_nh_dev;
1007 	struct fib6_info *f6i = res->f6i;
1008 
1009 	ip6_rt_init_dst(rt, res);
1010 
1011 	rt->rt6i_dst = f6i->fib6_dst;
1012 	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
1013 	rt->rt6i_flags = res->fib6_flags;
1014 	if (nh->fib_nh_gw_family) {
1015 		rt->rt6i_gateway = nh->fib_nh_gw6;
1016 		rt->rt6i_flags |= RTF_GATEWAY;
1017 	}
1018 	rt6_set_from(rt, f6i);
1019 #ifdef CONFIG_IPV6_SUBTREES
1020 	rt->rt6i_src = f6i->fib6_src;
1021 #endif
1022 }
1023 
1024 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
1025 					struct in6_addr *saddr)
1026 {
1027 	struct fib6_node *pn, *sn;
1028 	while (1) {
1029 		if (fn->fn_flags & RTN_TL_ROOT)
1030 			return NULL;
1031 		pn = rcu_dereference(fn->parent);
1032 		sn = FIB6_SUBTREE(pn);
1033 		if (sn && sn != fn)
1034 			fn = fib6_node_lookup(sn, NULL, saddr);
1035 		else
1036 			fn = pn;
1037 		if (fn->fn_flags & RTN_RTINFO)
1038 			return fn;
1039 	}
1040 }
1041 
1042 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt)
1043 {
1044 	struct rt6_info *rt = *prt;
1045 
1046 	if (dst_hold_safe(&rt->dst))
1047 		return true;
1048 	if (net) {
1049 		rt = net->ipv6.ip6_null_entry;
1050 		dst_hold(&rt->dst);
1051 	} else {
1052 		rt = NULL;
1053 	}
1054 	*prt = rt;
1055 	return false;
1056 }
1057 
1058 /* called with rcu_lock held */
1059 static struct rt6_info *ip6_create_rt_rcu(const struct fib6_result *res)
1060 {
1061 	struct net_device *dev = res->nh->fib_nh_dev;
1062 	struct fib6_info *f6i = res->f6i;
1063 	unsigned short flags;
1064 	struct rt6_info *nrt;
1065 
1066 	if (!fib6_info_hold_safe(f6i))
1067 		goto fallback;
1068 
1069 	flags = fib6_info_dst_flags(f6i);
1070 	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
1071 	if (!nrt) {
1072 		fib6_info_release(f6i);
1073 		goto fallback;
1074 	}
1075 
1076 	ip6_rt_copy_init(nrt, res);
1077 	return nrt;
1078 
1079 fallback:
1080 	nrt = dev_net(dev)->ipv6.ip6_null_entry;
1081 	dst_hold(&nrt->dst);
1082 	return nrt;
1083 }
1084 
1085 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
1086 					     struct fib6_table *table,
1087 					     struct flowi6 *fl6,
1088 					     const struct sk_buff *skb,
1089 					     int flags)
1090 {
1091 	struct fib6_result res = {};
1092 	struct fib6_node *fn;
1093 	struct rt6_info *rt;
1094 
1095 	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1096 		flags &= ~RT6_LOOKUP_F_IFACE;
1097 
1098 	rcu_read_lock();
1099 	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1100 restart:
1101 	res.f6i = rcu_dereference(fn->leaf);
1102 	if (!res.f6i)
1103 		res.f6i = net->ipv6.fib6_null_entry;
1104 	else
1105 		rt6_device_match(net, &res, &fl6->saddr, fl6->flowi6_oif,
1106 				 flags);
1107 
1108 	if (res.f6i == net->ipv6.fib6_null_entry) {
1109 		fn = fib6_backtrack(fn, &fl6->saddr);
1110 		if (fn)
1111 			goto restart;
1112 
1113 		rt = net->ipv6.ip6_null_entry;
1114 		dst_hold(&rt->dst);
1115 		goto out;
1116 	}
1117 
1118 	fib6_select_path(net, &res, fl6, fl6->flowi6_oif,
1119 			 fl6->flowi6_oif != 0, skb, flags);
1120 
1121 	/* Search through exception table */
1122 	rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
1123 	if (rt) {
1124 		if (ip6_hold_safe(net, &rt))
1125 			dst_use_noref(&rt->dst, jiffies);
1126 	} else {
1127 		rt = ip6_create_rt_rcu(&res);
1128 	}
1129 
1130 out:
1131 	trace_fib6_table_lookup(net, &res, table, fl6);
1132 
1133 	rcu_read_unlock();
1134 
1135 	return rt;
1136 }
1137 
1138 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
1139 				   const struct sk_buff *skb, int flags)
1140 {
1141 	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
1142 }
1143 EXPORT_SYMBOL_GPL(ip6_route_lookup);
1144 
1145 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
1146 			    const struct in6_addr *saddr, int oif,
1147 			    const struct sk_buff *skb, int strict)
1148 {
1149 	struct flowi6 fl6 = {
1150 		.flowi6_oif = oif,
1151 		.daddr = *daddr,
1152 	};
1153 	struct dst_entry *dst;
1154 	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
1155 
1156 	if (saddr) {
1157 		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
1158 		flags |= RT6_LOOKUP_F_HAS_SADDR;
1159 	}
1160 
1161 	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
1162 	if (dst->error == 0)
1163 		return (struct rt6_info *) dst;
1164 
1165 	dst_release(dst);
1166 
1167 	return NULL;
1168 }
1169 EXPORT_SYMBOL(rt6_lookup);
1170 
1171 /* ip6_ins_rt is called with FREE table->tb6_lock.
1172  * It takes new route entry, the addition fails by any reason the
1173  * route is released.
1174  * Caller must hold dst before calling it.
1175  */
1176 
1177 static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
1178 			struct netlink_ext_ack *extack)
1179 {
1180 	int err;
1181 	struct fib6_table *table;
1182 
1183 	table = rt->fib6_table;
1184 	spin_lock_bh(&table->tb6_lock);
1185 	err = fib6_add(&table->tb6_root, rt, info, extack);
1186 	spin_unlock_bh(&table->tb6_lock);
1187 
1188 	return err;
1189 }
1190 
1191 int ip6_ins_rt(struct net *net, struct fib6_info *rt)
1192 {
1193 	struct nl_info info = {	.nl_net = net, };
1194 
1195 	return __ip6_ins_rt(rt, &info, NULL);
1196 }
1197 
1198 static struct rt6_info *ip6_rt_cache_alloc(const struct fib6_result *res,
1199 					   const struct in6_addr *daddr,
1200 					   const struct in6_addr *saddr)
1201 {
1202 	struct fib6_info *f6i = res->f6i;
1203 	struct net_device *dev;
1204 	struct rt6_info *rt;
1205 
1206 	/*
1207 	 *	Clone the route.
1208 	 */
1209 
1210 	if (!fib6_info_hold_safe(f6i))
1211 		return NULL;
1212 
1213 	dev = ip6_rt_get_dev_rcu(res);
1214 	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
1215 	if (!rt) {
1216 		fib6_info_release(f6i);
1217 		return NULL;
1218 	}
1219 
1220 	ip6_rt_copy_init(rt, res);
1221 	rt->rt6i_flags |= RTF_CACHE;
1222 	rt->dst.flags |= DST_HOST;
1223 	rt->rt6i_dst.addr = *daddr;
1224 	rt->rt6i_dst.plen = 128;
1225 
1226 	if (!rt6_is_gw_or_nonexthop(res)) {
1227 		if (f6i->fib6_dst.plen != 128 &&
1228 		    ipv6_addr_equal(&f6i->fib6_dst.addr, daddr))
1229 			rt->rt6i_flags |= RTF_ANYCAST;
1230 #ifdef CONFIG_IPV6_SUBTREES
1231 		if (rt->rt6i_src.plen && saddr) {
1232 			rt->rt6i_src.addr = *saddr;
1233 			rt->rt6i_src.plen = 128;
1234 		}
1235 #endif
1236 	}
1237 
1238 	return rt;
1239 }
1240 
1241 static struct rt6_info *ip6_rt_pcpu_alloc(const struct fib6_result *res)
1242 {
1243 	struct fib6_info *f6i = res->f6i;
1244 	unsigned short flags = fib6_info_dst_flags(f6i);
1245 	struct net_device *dev;
1246 	struct rt6_info *pcpu_rt;
1247 
1248 	if (!fib6_info_hold_safe(f6i))
1249 		return NULL;
1250 
1251 	rcu_read_lock();
1252 	dev = ip6_rt_get_dev_rcu(res);
1253 	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
1254 	rcu_read_unlock();
1255 	if (!pcpu_rt) {
1256 		fib6_info_release(f6i);
1257 		return NULL;
1258 	}
1259 	ip6_rt_copy_init(pcpu_rt, res);
1260 	pcpu_rt->rt6i_flags |= RTF_PCPU;
1261 	return pcpu_rt;
1262 }
1263 
1264 /* It should be called with rcu_read_lock() acquired */
1265 static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res)
1266 {
1267 	struct rt6_info *pcpu_rt, **p;
1268 
1269 	p = this_cpu_ptr(res->f6i->rt6i_pcpu);
1270 	pcpu_rt = *p;
1271 
1272 	if (pcpu_rt)
1273 		ip6_hold_safe(NULL, &pcpu_rt);
1274 
1275 	return pcpu_rt;
1276 }
1277 
1278 static struct rt6_info *rt6_make_pcpu_route(struct net *net,
1279 					    const struct fib6_result *res)
1280 {
1281 	struct rt6_info *pcpu_rt, *prev, **p;
1282 
1283 	pcpu_rt = ip6_rt_pcpu_alloc(res);
1284 	if (!pcpu_rt) {
1285 		dst_hold(&net->ipv6.ip6_null_entry->dst);
1286 		return net->ipv6.ip6_null_entry;
1287 	}
1288 
1289 	dst_hold(&pcpu_rt->dst);
1290 	p = this_cpu_ptr(res->f6i->rt6i_pcpu);
1291 	prev = cmpxchg(p, NULL, pcpu_rt);
1292 	BUG_ON(prev);
1293 
1294 	if (res->f6i->fib6_destroying) {
1295 		struct fib6_info *from;
1296 
1297 		from = xchg((__force struct fib6_info **)&pcpu_rt->from, NULL);
1298 		fib6_info_release(from);
1299 	}
1300 
1301 	return pcpu_rt;
1302 }
1303 
1304 /* exception hash table implementation
1305  */
1306 static DEFINE_SPINLOCK(rt6_exception_lock);
1307 
1308 /* Remove rt6_ex from hash table and free the memory
1309  * Caller must hold rt6_exception_lock
1310  */
1311 static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1312 				 struct rt6_exception *rt6_ex)
1313 {
1314 	struct fib6_info *from;
1315 	struct net *net;
1316 
1317 	if (!bucket || !rt6_ex)
1318 		return;
1319 
1320 	net = dev_net(rt6_ex->rt6i->dst.dev);
1321 	net->ipv6.rt6_stats->fib_rt_cache--;
1322 
1323 	/* purge completely the exception to allow releasing the held resources:
1324 	 * some [sk] cache may keep the dst around for unlimited time
1325 	 */
1326 	from = xchg((__force struct fib6_info **)&rt6_ex->rt6i->from, NULL);
1327 	fib6_info_release(from);
1328 	dst_dev_put(&rt6_ex->rt6i->dst);
1329 
1330 	hlist_del_rcu(&rt6_ex->hlist);
1331 	dst_release(&rt6_ex->rt6i->dst);
1332 	kfree_rcu(rt6_ex, rcu);
1333 	WARN_ON_ONCE(!bucket->depth);
1334 	bucket->depth--;
1335 }
1336 
1337 /* Remove oldest rt6_ex in bucket and free the memory
1338  * Caller must hold rt6_exception_lock
1339  */
1340 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1341 {
1342 	struct rt6_exception *rt6_ex, *oldest = NULL;
1343 
1344 	if (!bucket)
1345 		return;
1346 
1347 	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1348 		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1349 			oldest = rt6_ex;
1350 	}
1351 	rt6_remove_exception(bucket, oldest);
1352 }
1353 
1354 static u32 rt6_exception_hash(const struct in6_addr *dst,
1355 			      const struct in6_addr *src)
1356 {
1357 	static u32 seed __read_mostly;
1358 	u32 val;
1359 
1360 	net_get_random_once(&seed, sizeof(seed));
1361 	val = jhash(dst, sizeof(*dst), seed);
1362 
1363 #ifdef CONFIG_IPV6_SUBTREES
1364 	if (src)
1365 		val = jhash(src, sizeof(*src), val);
1366 #endif
1367 	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1368 }
1369 
1370 /* Helper function to find the cached rt in the hash table
1371  * and update bucket pointer to point to the bucket for this
1372  * (daddr, saddr) pair
1373  * Caller must hold rt6_exception_lock
1374  */
1375 static struct rt6_exception *
1376 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1377 			      const struct in6_addr *daddr,
1378 			      const struct in6_addr *saddr)
1379 {
1380 	struct rt6_exception *rt6_ex;
1381 	u32 hval;
1382 
1383 	if (!(*bucket) || !daddr)
1384 		return NULL;
1385 
1386 	hval = rt6_exception_hash(daddr, saddr);
1387 	*bucket += hval;
1388 
1389 	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1390 		struct rt6_info *rt6 = rt6_ex->rt6i;
1391 		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1392 
1393 #ifdef CONFIG_IPV6_SUBTREES
1394 		if (matched && saddr)
1395 			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1396 #endif
1397 		if (matched)
1398 			return rt6_ex;
1399 	}
1400 	return NULL;
1401 }
1402 
1403 /* Helper function to find the cached rt in the hash table
1404  * and update bucket pointer to point to the bucket for this
1405  * (daddr, saddr) pair
1406  * Caller must hold rcu_read_lock()
1407  */
1408 static struct rt6_exception *
1409 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1410 			 const struct in6_addr *daddr,
1411 			 const struct in6_addr *saddr)
1412 {
1413 	struct rt6_exception *rt6_ex;
1414 	u32 hval;
1415 
1416 	WARN_ON_ONCE(!rcu_read_lock_held());
1417 
1418 	if (!(*bucket) || !daddr)
1419 		return NULL;
1420 
1421 	hval = rt6_exception_hash(daddr, saddr);
1422 	*bucket += hval;
1423 
1424 	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1425 		struct rt6_info *rt6 = rt6_ex->rt6i;
1426 		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1427 
1428 #ifdef CONFIG_IPV6_SUBTREES
1429 		if (matched && saddr)
1430 			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1431 #endif
1432 		if (matched)
1433 			return rt6_ex;
1434 	}
1435 	return NULL;
1436 }
1437 
1438 static unsigned int fib6_mtu(const struct fib6_result *res)
1439 {
1440 	const struct fib6_nh *nh = res->nh;
1441 	unsigned int mtu;
1442 
1443 	if (res->f6i->fib6_pmtu) {
1444 		mtu = res->f6i->fib6_pmtu;
1445 	} else {
1446 		struct net_device *dev = nh->fib_nh_dev;
1447 		struct inet6_dev *idev;
1448 
1449 		rcu_read_lock();
1450 		idev = __in6_dev_get(dev);
1451 		mtu = idev->cnf.mtu6;
1452 		rcu_read_unlock();
1453 	}
1454 
1455 	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1456 
1457 	return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
1458 }
1459 
1460 static int rt6_insert_exception(struct rt6_info *nrt,
1461 				const struct fib6_result *res)
1462 {
1463 	struct net *net = dev_net(nrt->dst.dev);
1464 	struct rt6_exception_bucket *bucket;
1465 	struct in6_addr *src_key = NULL;
1466 	struct rt6_exception *rt6_ex;
1467 	struct fib6_info *f6i = res->f6i;
1468 	int err = 0;
1469 
1470 	spin_lock_bh(&rt6_exception_lock);
1471 
1472 	if (f6i->exception_bucket_flushed) {
1473 		err = -EINVAL;
1474 		goto out;
1475 	}
1476 
1477 	bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket,
1478 					lockdep_is_held(&rt6_exception_lock));
1479 	if (!bucket) {
1480 		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
1481 				 GFP_ATOMIC);
1482 		if (!bucket) {
1483 			err = -ENOMEM;
1484 			goto out;
1485 		}
1486 		rcu_assign_pointer(f6i->rt6i_exception_bucket, bucket);
1487 	}
1488 
1489 #ifdef CONFIG_IPV6_SUBTREES
1490 	/* fib6_src.plen != 0 indicates f6i is in subtree
1491 	 * and exception table is indexed by a hash of
1492 	 * both fib6_dst and fib6_src.
1493 	 * Otherwise, the exception table is indexed by
1494 	 * a hash of only fib6_dst.
1495 	 */
1496 	if (f6i->fib6_src.plen)
1497 		src_key = &nrt->rt6i_src.addr;
1498 #endif
1499 	/* rt6_mtu_change() might lower mtu on f6i.
1500 	 * Only insert this exception route if its mtu
1501 	 * is less than f6i's mtu value.
1502 	 */
1503 	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(res)) {
1504 		err = -EINVAL;
1505 		goto out;
1506 	}
1507 
1508 	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
1509 					       src_key);
1510 	if (rt6_ex)
1511 		rt6_remove_exception(bucket, rt6_ex);
1512 
1513 	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
1514 	if (!rt6_ex) {
1515 		err = -ENOMEM;
1516 		goto out;
1517 	}
1518 	rt6_ex->rt6i = nrt;
1519 	rt6_ex->stamp = jiffies;
1520 	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
1521 	bucket->depth++;
1522 	net->ipv6.rt6_stats->fib_rt_cache++;
1523 
1524 	if (bucket->depth > FIB6_MAX_DEPTH)
1525 		rt6_exception_remove_oldest(bucket);
1526 
1527 out:
1528 	spin_unlock_bh(&rt6_exception_lock);
1529 
1530 	/* Update fn->fn_sernum to invalidate all cached dst */
1531 	if (!err) {
1532 		spin_lock_bh(&f6i->fib6_table->tb6_lock);
1533 		fib6_update_sernum(net, f6i);
1534 		spin_unlock_bh(&f6i->fib6_table->tb6_lock);
1535 		fib6_force_start_gc(net);
1536 	}
1537 
1538 	return err;
1539 }
1540 
1541 void rt6_flush_exceptions(struct fib6_info *rt)
1542 {
1543 	struct rt6_exception_bucket *bucket;
1544 	struct rt6_exception *rt6_ex;
1545 	struct hlist_node *tmp;
1546 	int i;
1547 
1548 	spin_lock_bh(&rt6_exception_lock);
1549 	/* Prevent rt6_insert_exception() to recreate the bucket list */
1550 	rt->exception_bucket_flushed = 1;
1551 
1552 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1553 				    lockdep_is_held(&rt6_exception_lock));
1554 	if (!bucket)
1555 		goto out;
1556 
1557 	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1558 		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1559 			rt6_remove_exception(bucket, rt6_ex);
1560 		WARN_ON_ONCE(bucket->depth);
1561 		bucket++;
1562 	}
1563 
1564 out:
1565 	spin_unlock_bh(&rt6_exception_lock);
1566 }
1567 
1568 /* Find cached rt in the hash table inside passed in rt
1569  * Caller has to hold rcu_read_lock()
1570  */
1571 static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
1572 					   const struct in6_addr *daddr,
1573 					   const struct in6_addr *saddr)
1574 {
1575 	const struct in6_addr *src_key = NULL;
1576 	struct rt6_exception_bucket *bucket;
1577 	struct rt6_exception *rt6_ex;
1578 	struct rt6_info *ret = NULL;
1579 
1580 #ifdef CONFIG_IPV6_SUBTREES
1581 	/* fib6i_src.plen != 0 indicates f6i is in subtree
1582 	 * and exception table is indexed by a hash of
1583 	 * both fib6_dst and fib6_src.
1584 	 * However, the src addr used to create the hash
1585 	 * might not be exactly the passed in saddr which
1586 	 * is a /128 addr from the flow.
1587 	 * So we need to use f6i->fib6_src to redo lookup
1588 	 * if the passed in saddr does not find anything.
1589 	 * (See the logic in ip6_rt_cache_alloc() on how
1590 	 * rt->rt6i_src is updated.)
1591 	 */
1592 	if (res->f6i->fib6_src.plen)
1593 		src_key = saddr;
1594 find_ex:
1595 #endif
1596 	bucket = rcu_dereference(res->f6i->rt6i_exception_bucket);
1597 	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
1598 
1599 	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1600 		ret = rt6_ex->rt6i;
1601 
1602 #ifdef CONFIG_IPV6_SUBTREES
1603 	/* Use fib6_src as src_key and redo lookup */
1604 	if (!ret && src_key && src_key != &res->f6i->fib6_src.addr) {
1605 		src_key = &res->f6i->fib6_src.addr;
1606 		goto find_ex;
1607 	}
1608 #endif
1609 
1610 	return ret;
1611 }
1612 
1613 /* Remove the passed in cached rt from the hash table that contains it */
1614 static int rt6_remove_exception_rt(struct rt6_info *rt)
1615 {
1616 	struct rt6_exception_bucket *bucket;
1617 	struct in6_addr *src_key = NULL;
1618 	struct rt6_exception *rt6_ex;
1619 	struct fib6_info *from;
1620 	int err;
1621 
1622 	from = rcu_dereference(rt->from);
1623 	if (!from ||
1624 	    !(rt->rt6i_flags & RTF_CACHE))
1625 		return -EINVAL;
1626 
1627 	if (!rcu_access_pointer(from->rt6i_exception_bucket))
1628 		return -ENOENT;
1629 
1630 	spin_lock_bh(&rt6_exception_lock);
1631 	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1632 				    lockdep_is_held(&rt6_exception_lock));
1633 #ifdef CONFIG_IPV6_SUBTREES
1634 	/* rt6i_src.plen != 0 indicates 'from' is in subtree
1635 	 * and exception table is indexed by a hash of
1636 	 * both rt6i_dst and rt6i_src.
1637 	 * Otherwise, the exception table is indexed by
1638 	 * a hash of only rt6i_dst.
1639 	 */
1640 	if (from->fib6_src.plen)
1641 		src_key = &rt->rt6i_src.addr;
1642 #endif
1643 	rt6_ex = __rt6_find_exception_spinlock(&bucket,
1644 					       &rt->rt6i_dst.addr,
1645 					       src_key);
1646 	if (rt6_ex) {
1647 		rt6_remove_exception(bucket, rt6_ex);
1648 		err = 0;
1649 	} else {
1650 		err = -ENOENT;
1651 	}
1652 
1653 	spin_unlock_bh(&rt6_exception_lock);
1654 	return err;
1655 }
1656 
1657 /* Find rt6_ex which contains the passed in rt cache and
1658  * refresh its stamp
1659  */
1660 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1661 {
1662 	struct rt6_exception_bucket *bucket;
1663 	struct in6_addr *src_key = NULL;
1664 	struct rt6_exception *rt6_ex;
1665 	struct fib6_info *from;
1666 
1667 	rcu_read_lock();
1668 	from = rcu_dereference(rt->from);
1669 	if (!from || !(rt->rt6i_flags & RTF_CACHE))
1670 		goto unlock;
1671 
1672 	bucket = rcu_dereference(from->rt6i_exception_bucket);
1673 
1674 #ifdef CONFIG_IPV6_SUBTREES
1675 	/* rt6i_src.plen != 0 indicates 'from' is in subtree
1676 	 * and exception table is indexed by a hash of
1677 	 * both rt6i_dst and rt6i_src.
1678 	 * Otherwise, the exception table is indexed by
1679 	 * a hash of only rt6i_dst.
1680 	 */
1681 	if (from->fib6_src.plen)
1682 		src_key = &rt->rt6i_src.addr;
1683 #endif
1684 	rt6_ex = __rt6_find_exception_rcu(&bucket,
1685 					  &rt->rt6i_dst.addr,
1686 					  src_key);
1687 	if (rt6_ex)
1688 		rt6_ex->stamp = jiffies;
1689 
1690 unlock:
1691 	rcu_read_unlock();
1692 }
1693 
1694 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1695 					 struct rt6_info *rt, int mtu)
1696 {
1697 	/* If the new MTU is lower than the route PMTU, this new MTU will be the
1698 	 * lowest MTU in the path: always allow updating the route PMTU to
1699 	 * reflect PMTU decreases.
1700 	 *
1701 	 * If the new MTU is higher, and the route PMTU is equal to the local
1702 	 * MTU, this means the old MTU is the lowest in the path, so allow
1703 	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
1704 	 * handle this.
1705 	 */
1706 
1707 	if (dst_mtu(&rt->dst) >= mtu)
1708 		return true;
1709 
1710 	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
1711 		return true;
1712 
1713 	return false;
1714 }
1715 
1716 static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
1717 				       struct fib6_info *rt, int mtu)
1718 {
1719 	struct rt6_exception_bucket *bucket;
1720 	struct rt6_exception *rt6_ex;
1721 	int i;
1722 
1723 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1724 					lockdep_is_held(&rt6_exception_lock));
1725 
1726 	if (!bucket)
1727 		return;
1728 
1729 	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1730 		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1731 			struct rt6_info *entry = rt6_ex->rt6i;
1732 
1733 			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
1734 			 * route), the metrics of its rt->from have already
1735 			 * been updated.
1736 			 */
1737 			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
1738 			    rt6_mtu_change_route_allowed(idev, entry, mtu))
1739 				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
1740 		}
1741 		bucket++;
1742 	}
1743 }
1744 
1745 #define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)
1746 
1747 static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
1748 					struct in6_addr *gateway)
1749 {
1750 	struct rt6_exception_bucket *bucket;
1751 	struct rt6_exception *rt6_ex;
1752 	struct hlist_node *tmp;
1753 	int i;
1754 
1755 	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1756 		return;
1757 
1758 	spin_lock_bh(&rt6_exception_lock);
1759 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1760 				     lockdep_is_held(&rt6_exception_lock));
1761 
1762 	if (bucket) {
1763 		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1764 			hlist_for_each_entry_safe(rt6_ex, tmp,
1765 						  &bucket->chain, hlist) {
1766 				struct rt6_info *entry = rt6_ex->rt6i;
1767 
1768 				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
1769 				    RTF_CACHE_GATEWAY &&
1770 				    ipv6_addr_equal(gateway,
1771 						    &entry->rt6i_gateway)) {
1772 					rt6_remove_exception(bucket, rt6_ex);
1773 				}
1774 			}
1775 			bucket++;
1776 		}
1777 	}
1778 
1779 	spin_unlock_bh(&rt6_exception_lock);
1780 }
1781 
1782 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
1783 				      struct rt6_exception *rt6_ex,
1784 				      struct fib6_gc_args *gc_args,
1785 				      unsigned long now)
1786 {
1787 	struct rt6_info *rt = rt6_ex->rt6i;
1788 
1789 	/* we are pruning and obsoleting aged-out and non gateway exceptions
1790 	 * even if others have still references to them, so that on next
1791 	 * dst_check() such references can be dropped.
1792 	 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
1793 	 * expired, independently from their aging, as per RFC 8201 section 4
1794 	 */
1795 	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
1796 		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
1797 			RT6_TRACE("aging clone %p\n", rt);
1798 			rt6_remove_exception(bucket, rt6_ex);
1799 			return;
1800 		}
1801 	} else if (time_after(jiffies, rt->dst.expires)) {
1802 		RT6_TRACE("purging expired route %p\n", rt);
1803 		rt6_remove_exception(bucket, rt6_ex);
1804 		return;
1805 	}
1806 
1807 	if (rt->rt6i_flags & RTF_GATEWAY) {
1808 		struct neighbour *neigh;
1809 		__u8 neigh_flags = 0;
1810 
1811 		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
1812 		if (neigh)
1813 			neigh_flags = neigh->flags;
1814 
1815 		if (!(neigh_flags & NTF_ROUTER)) {
1816 			RT6_TRACE("purging route %p via non-router but gateway\n",
1817 				  rt);
1818 			rt6_remove_exception(bucket, rt6_ex);
1819 			return;
1820 		}
1821 	}
1822 
1823 	gc_args->more++;
1824 }
1825 
1826 void rt6_age_exceptions(struct fib6_info *rt,
1827 			struct fib6_gc_args *gc_args,
1828 			unsigned long now)
1829 {
1830 	struct rt6_exception_bucket *bucket;
1831 	struct rt6_exception *rt6_ex;
1832 	struct hlist_node *tmp;
1833 	int i;
1834 
1835 	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1836 		return;
1837 
1838 	rcu_read_lock_bh();
1839 	spin_lock(&rt6_exception_lock);
1840 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1841 				    lockdep_is_held(&rt6_exception_lock));
1842 
1843 	if (bucket) {
1844 		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1845 			hlist_for_each_entry_safe(rt6_ex, tmp,
1846 						  &bucket->chain, hlist) {
1847 				rt6_age_examine_exception(bucket, rt6_ex,
1848 							  gc_args, now);
1849 			}
1850 			bucket++;
1851 		}
1852 	}
1853 	spin_unlock(&rt6_exception_lock);
1854 	rcu_read_unlock_bh();
1855 }
1856 
1857 /* must be called with rcu lock held */
1858 int fib6_table_lookup(struct net *net, struct fib6_table *table, int oif,
1859 		      struct flowi6 *fl6, struct fib6_result *res, int strict)
1860 {
1861 	struct fib6_node *fn, *saved_fn;
1862 
1863 	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1864 	saved_fn = fn;
1865 
1866 	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1867 		oif = 0;
1868 
1869 redo_rt6_select:
1870 	rt6_select(net, fn, oif, res, strict);
1871 	if (res->f6i == net->ipv6.fib6_null_entry) {
1872 		fn = fib6_backtrack(fn, &fl6->saddr);
1873 		if (fn)
1874 			goto redo_rt6_select;
1875 		else if (strict & RT6_LOOKUP_F_REACHABLE) {
1876 			/* also consider unreachable route */
1877 			strict &= ~RT6_LOOKUP_F_REACHABLE;
1878 			fn = saved_fn;
1879 			goto redo_rt6_select;
1880 		}
1881 	}
1882 
1883 	trace_fib6_table_lookup(net, res, table, fl6);
1884 
1885 	return 0;
1886 }
1887 
1888 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1889 			       int oif, struct flowi6 *fl6,
1890 			       const struct sk_buff *skb, int flags)
1891 {
1892 	struct fib6_result res = {};
1893 	struct rt6_info *rt;
1894 	int strict = 0;
1895 
1896 	strict |= flags & RT6_LOOKUP_F_IFACE;
1897 	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1898 	if (net->ipv6.devconf_all->forwarding == 0)
1899 		strict |= RT6_LOOKUP_F_REACHABLE;
1900 
1901 	rcu_read_lock();
1902 
1903 	fib6_table_lookup(net, table, oif, fl6, &res, strict);
1904 	if (res.f6i == net->ipv6.fib6_null_entry) {
1905 		rt = net->ipv6.ip6_null_entry;
1906 		rcu_read_unlock();
1907 		dst_hold(&rt->dst);
1908 		return rt;
1909 	}
1910 
1911 	fib6_select_path(net, &res, fl6, oif, false, skb, strict);
1912 
1913 	/*Search through exception table */
1914 	rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
1915 	if (rt) {
1916 		if (ip6_hold_safe(net, &rt))
1917 			dst_use_noref(&rt->dst, jiffies);
1918 
1919 		rcu_read_unlock();
1920 		return rt;
1921 	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1922 			    !res.nh->fib_nh_gw_family)) {
1923 		/* Create a RTF_CACHE clone which will not be
1924 		 * owned by the fib6 tree.  It is for the special case where
1925 		 * the daddr in the skb during the neighbor look-up is different
1926 		 * from the fl6->daddr used to look-up route here.
1927 		 */
1928 		struct rt6_info *uncached_rt;
1929 
1930 		uncached_rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL);
1931 
1932 		rcu_read_unlock();
1933 
1934 		if (uncached_rt) {
1935 			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1936 			 * No need for another dst_hold()
1937 			 */
1938 			rt6_uncached_list_add(uncached_rt);
1939 			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
1940 		} else {
1941 			uncached_rt = net->ipv6.ip6_null_entry;
1942 			dst_hold(&uncached_rt->dst);
1943 		}
1944 
1945 		return uncached_rt;
1946 	} else {
1947 		/* Get a percpu copy */
1948 
1949 		struct rt6_info *pcpu_rt;
1950 
1951 		local_bh_disable();
1952 		pcpu_rt = rt6_get_pcpu_route(&res);
1953 
1954 		if (!pcpu_rt)
1955 			pcpu_rt = rt6_make_pcpu_route(net, &res);
1956 
1957 		local_bh_enable();
1958 		rcu_read_unlock();
1959 
1960 		return pcpu_rt;
1961 	}
1962 }
1963 EXPORT_SYMBOL_GPL(ip6_pol_route);
1964 
1965 static struct rt6_info *ip6_pol_route_input(struct net *net,
1966 					    struct fib6_table *table,
1967 					    struct flowi6 *fl6,
1968 					    const struct sk_buff *skb,
1969 					    int flags)
1970 {
1971 	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
1972 }
1973 
1974 struct dst_entry *ip6_route_input_lookup(struct net *net,
1975 					 struct net_device *dev,
1976 					 struct flowi6 *fl6,
1977 					 const struct sk_buff *skb,
1978 					 int flags)
1979 {
1980 	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1981 		flags |= RT6_LOOKUP_F_IFACE;
1982 
1983 	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
1984 }
1985 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1986 
1987 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1988 				  struct flow_keys *keys,
1989 				  struct flow_keys *flkeys)
1990 {
1991 	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1992 	const struct ipv6hdr *key_iph = outer_iph;
1993 	struct flow_keys *_flkeys = flkeys;
1994 	const struct ipv6hdr *inner_iph;
1995 	const struct icmp6hdr *icmph;
1996 	struct ipv6hdr _inner_iph;
1997 	struct icmp6hdr _icmph;
1998 
1999 	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
2000 		goto out;
2001 
2002 	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
2003 				   sizeof(_icmph), &_icmph);
2004 	if (!icmph)
2005 		goto out;
2006 
2007 	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
2008 	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
2009 	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
2010 	    icmph->icmp6_type != ICMPV6_PARAMPROB)
2011 		goto out;
2012 
2013 	inner_iph = skb_header_pointer(skb,
2014 				       skb_transport_offset(skb) + sizeof(*icmph),
2015 				       sizeof(_inner_iph), &_inner_iph);
2016 	if (!inner_iph)
2017 		goto out;
2018 
2019 	key_iph = inner_iph;
2020 	_flkeys = NULL;
2021 out:
2022 	if (_flkeys) {
2023 		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
2024 		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
2025 		keys->tags.flow_label = _flkeys->tags.flow_label;
2026 		keys->basic.ip_proto = _flkeys->basic.ip_proto;
2027 	} else {
2028 		keys->addrs.v6addrs.src = key_iph->saddr;
2029 		keys->addrs.v6addrs.dst = key_iph->daddr;
2030 		keys->tags.flow_label = ip6_flowlabel(key_iph);
2031 		keys->basic.ip_proto = key_iph->nexthdr;
2032 	}
2033 }
2034 
2035 /* if skb is set it will be used and fl6 can be NULL */
2036 u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
2037 		       const struct sk_buff *skb, struct flow_keys *flkeys)
2038 {
2039 	struct flow_keys hash_keys;
2040 	u32 mhash;
2041 
2042 	switch (ip6_multipath_hash_policy(net)) {
2043 	case 0:
2044 		memset(&hash_keys, 0, sizeof(hash_keys));
2045 		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2046 		if (skb) {
2047 			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
2048 		} else {
2049 			hash_keys.addrs.v6addrs.src = fl6->saddr;
2050 			hash_keys.addrs.v6addrs.dst = fl6->daddr;
2051 			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
2052 			hash_keys.basic.ip_proto = fl6->flowi6_proto;
2053 		}
2054 		break;
2055 	case 1:
2056 		if (skb) {
2057 			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
2058 			struct flow_keys keys;
2059 
2060 			/* short-circuit if we already have L4 hash present */
2061 			if (skb->l4_hash)
2062 				return skb_get_hash_raw(skb) >> 1;
2063 
2064 			memset(&hash_keys, 0, sizeof(hash_keys));
2065 
2066                         if (!flkeys) {
2067 				skb_flow_dissect_flow_keys(skb, &keys, flag);
2068 				flkeys = &keys;
2069 			}
2070 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2071 			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
2072 			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
2073 			hash_keys.ports.src = flkeys->ports.src;
2074 			hash_keys.ports.dst = flkeys->ports.dst;
2075 			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2076 		} else {
2077 			memset(&hash_keys, 0, sizeof(hash_keys));
2078 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2079 			hash_keys.addrs.v6addrs.src = fl6->saddr;
2080 			hash_keys.addrs.v6addrs.dst = fl6->daddr;
2081 			hash_keys.ports.src = fl6->fl6_sport;
2082 			hash_keys.ports.dst = fl6->fl6_dport;
2083 			hash_keys.basic.ip_proto = fl6->flowi6_proto;
2084 		}
2085 		break;
2086 	}
2087 	mhash = flow_hash_from_keys(&hash_keys);
2088 
2089 	return mhash >> 1;
2090 }
2091 
2092 void ip6_route_input(struct sk_buff *skb)
2093 {
2094 	const struct ipv6hdr *iph = ipv6_hdr(skb);
2095 	struct net *net = dev_net(skb->dev);
2096 	int flags = RT6_LOOKUP_F_HAS_SADDR;
2097 	struct ip_tunnel_info *tun_info;
2098 	struct flowi6 fl6 = {
2099 		.flowi6_iif = skb->dev->ifindex,
2100 		.daddr = iph->daddr,
2101 		.saddr = iph->saddr,
2102 		.flowlabel = ip6_flowinfo(iph),
2103 		.flowi6_mark = skb->mark,
2104 		.flowi6_proto = iph->nexthdr,
2105 	};
2106 	struct flow_keys *flkeys = NULL, _flkeys;
2107 
2108 	tun_info = skb_tunnel_info(skb);
2109 	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2110 		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
2111 
2112 	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
2113 		flkeys = &_flkeys;
2114 
2115 	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
2116 		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
2117 	skb_dst_drop(skb);
2118 	skb_dst_set(skb,
2119 		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
2120 }
2121 
2122 static struct rt6_info *ip6_pol_route_output(struct net *net,
2123 					     struct fib6_table *table,
2124 					     struct flowi6 *fl6,
2125 					     const struct sk_buff *skb,
2126 					     int flags)
2127 {
2128 	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
2129 }
2130 
2131 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
2132 					 struct flowi6 *fl6, int flags)
2133 {
2134 	bool any_src;
2135 
2136 	if (ipv6_addr_type(&fl6->daddr) &
2137 	    (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
2138 		struct dst_entry *dst;
2139 
2140 		dst = l3mdev_link_scope_lookup(net, fl6);
2141 		if (dst)
2142 			return dst;
2143 	}
2144 
2145 	fl6->flowi6_iif = LOOPBACK_IFINDEX;
2146 
2147 	any_src = ipv6_addr_any(&fl6->saddr);
2148 	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
2149 	    (fl6->flowi6_oif && any_src))
2150 		flags |= RT6_LOOKUP_F_IFACE;
2151 
2152 	if (!any_src)
2153 		flags |= RT6_LOOKUP_F_HAS_SADDR;
2154 	else if (sk)
2155 		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
2156 
2157 	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
2158 }
2159 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
2160 
2161 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2162 {
2163 	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
2164 	struct net_device *loopback_dev = net->loopback_dev;
2165 	struct dst_entry *new = NULL;
2166 
2167 	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
2168 		       DST_OBSOLETE_DEAD, 0);
2169 	if (rt) {
2170 		rt6_info_init(rt);
2171 		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
2172 
2173 		new = &rt->dst;
2174 		new->__use = 1;
2175 		new->input = dst_discard;
2176 		new->output = dst_discard_out;
2177 
2178 		dst_copy_metrics(new, &ort->dst);
2179 
2180 		rt->rt6i_idev = in6_dev_get(loopback_dev);
2181 		rt->rt6i_gateway = ort->rt6i_gateway;
2182 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
2183 
2184 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
2185 #ifdef CONFIG_IPV6_SUBTREES
2186 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
2187 #endif
2188 	}
2189 
2190 	dst_release(dst_orig);
2191 	return new ? new : ERR_PTR(-ENOMEM);
2192 }
2193 
2194 /*
2195  *	Destination cache support functions
2196  */
2197 
2198 static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2199 {
2200 	u32 rt_cookie = 0;
2201 
2202 	if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2203 		return false;
2204 
2205 	if (fib6_check_expired(f6i))
2206 		return false;
2207 
2208 	return true;
2209 }
2210 
2211 static struct dst_entry *rt6_check(struct rt6_info *rt,
2212 				   struct fib6_info *from,
2213 				   u32 cookie)
2214 {
2215 	u32 rt_cookie = 0;
2216 
2217 	if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
2218 	    rt_cookie != cookie)
2219 		return NULL;
2220 
2221 	if (rt6_check_expired(rt))
2222 		return NULL;
2223 
2224 	return &rt->dst;
2225 }
2226 
2227 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2228 					    struct fib6_info *from,
2229 					    u32 cookie)
2230 {
2231 	if (!__rt6_check_expired(rt) &&
2232 	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2233 	    fib6_check(from, cookie))
2234 		return &rt->dst;
2235 	else
2236 		return NULL;
2237 }
2238 
2239 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
2240 {
2241 	struct dst_entry *dst_ret;
2242 	struct fib6_info *from;
2243 	struct rt6_info *rt;
2244 
2245 	rt = container_of(dst, struct rt6_info, dst);
2246 
2247 	rcu_read_lock();
2248 
2249 	/* All IPV6 dsts are created with ->obsolete set to the value
2250 	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
2251 	 * into this function always.
2252 	 */
2253 
2254 	from = rcu_dereference(rt->from);
2255 
2256 	if (from && (rt->rt6i_flags & RTF_PCPU ||
2257 	    unlikely(!list_empty(&rt->rt6i_uncached))))
2258 		dst_ret = rt6_dst_from_check(rt, from, cookie);
2259 	else
2260 		dst_ret = rt6_check(rt, from, cookie);
2261 
2262 	rcu_read_unlock();
2263 
2264 	return dst_ret;
2265 }
2266 
2267 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
2268 {
2269 	struct rt6_info *rt = (struct rt6_info *) dst;
2270 
2271 	if (rt) {
2272 		if (rt->rt6i_flags & RTF_CACHE) {
2273 			rcu_read_lock();
2274 			if (rt6_check_expired(rt)) {
2275 				rt6_remove_exception_rt(rt);
2276 				dst = NULL;
2277 			}
2278 			rcu_read_unlock();
2279 		} else {
2280 			dst_release(dst);
2281 			dst = NULL;
2282 		}
2283 	}
2284 	return dst;
2285 }
2286 
2287 static void ip6_link_failure(struct sk_buff *skb)
2288 {
2289 	struct rt6_info *rt;
2290 
2291 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2292 
2293 	rt = (struct rt6_info *) skb_dst(skb);
2294 	if (rt) {
2295 		rcu_read_lock();
2296 		if (rt->rt6i_flags & RTF_CACHE) {
2297 			rt6_remove_exception_rt(rt);
2298 		} else {
2299 			struct fib6_info *from;
2300 			struct fib6_node *fn;
2301 
2302 			from = rcu_dereference(rt->from);
2303 			if (from) {
2304 				fn = rcu_dereference(from->fib6_node);
2305 				if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2306 					fn->fn_sernum = -1;
2307 			}
2308 		}
2309 		rcu_read_unlock();
2310 	}
2311 }
2312 
2313 static void rt6_update_expires(struct rt6_info *rt0, int timeout)
2314 {
2315 	if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
2316 		struct fib6_info *from;
2317 
2318 		rcu_read_lock();
2319 		from = rcu_dereference(rt0->from);
2320 		if (from)
2321 			rt0->dst.expires = from->expires;
2322 		rcu_read_unlock();
2323 	}
2324 
2325 	dst_set_expires(&rt0->dst, timeout);
2326 	rt0->rt6i_flags |= RTF_EXPIRES;
2327 }
2328 
2329 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2330 {
2331 	struct net *net = dev_net(rt->dst.dev);
2332 
2333 	dst_metric_set(&rt->dst, RTAX_MTU, mtu);
2334 	rt->rt6i_flags |= RTF_MODIFIED;
2335 	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2336 }
2337 
2338 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2339 {
2340 	return !(rt->rt6i_flags & RTF_CACHE) &&
2341 		(rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from));
2342 }
2343 
2344 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2345 				 const struct ipv6hdr *iph, u32 mtu)
2346 {
2347 	const struct in6_addr *daddr, *saddr;
2348 	struct rt6_info *rt6 = (struct rt6_info *)dst;
2349 
2350 	if (dst_metric_locked(dst, RTAX_MTU))
2351 		return;
2352 
2353 	if (iph) {
2354 		daddr = &iph->daddr;
2355 		saddr = &iph->saddr;
2356 	} else if (sk) {
2357 		daddr = &sk->sk_v6_daddr;
2358 		saddr = &inet6_sk(sk)->saddr;
2359 	} else {
2360 		daddr = NULL;
2361 		saddr = NULL;
2362 	}
2363 	dst_confirm_neigh(dst, daddr);
2364 	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2365 	if (mtu >= dst_mtu(dst))
2366 		return;
2367 
2368 	if (!rt6_cache_allowed_for_pmtu(rt6)) {
2369 		rt6_do_update_pmtu(rt6, mtu);
2370 		/* update rt6_ex->stamp for cache */
2371 		if (rt6->rt6i_flags & RTF_CACHE)
2372 			rt6_update_exception_stamp_rt(rt6);
2373 	} else if (daddr) {
2374 		struct fib6_result res = {};
2375 		struct rt6_info *nrt6;
2376 
2377 		rcu_read_lock();
2378 		res.f6i = rcu_dereference(rt6->from);
2379 		if (!res.f6i) {
2380 			rcu_read_unlock();
2381 			return;
2382 		}
2383 		res.nh = &res.f6i->fib6_nh;
2384 		res.fib6_flags = res.f6i->fib6_flags;
2385 		res.fib6_type = res.f6i->fib6_type;
2386 
2387 		nrt6 = ip6_rt_cache_alloc(&res, daddr, saddr);
2388 		if (nrt6) {
2389 			rt6_do_update_pmtu(nrt6, mtu);
2390 			if (rt6_insert_exception(nrt6, &res))
2391 				dst_release_immediate(&nrt6->dst);
2392 		}
2393 		rcu_read_unlock();
2394 	}
2395 }
2396 
2397 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2398 			       struct sk_buff *skb, u32 mtu)
2399 {
2400 	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2401 }
2402 
2403 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2404 		     int oif, u32 mark, kuid_t uid)
2405 {
2406 	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2407 	struct dst_entry *dst;
2408 	struct flowi6 fl6 = {
2409 		.flowi6_oif = oif,
2410 		.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark),
2411 		.daddr = iph->daddr,
2412 		.saddr = iph->saddr,
2413 		.flowlabel = ip6_flowinfo(iph),
2414 		.flowi6_uid = uid,
2415 	};
2416 
2417 	dst = ip6_route_output(net, NULL, &fl6);
2418 	if (!dst->error)
2419 		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2420 	dst_release(dst);
2421 }
2422 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2423 
2424 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2425 {
2426 	int oif = sk->sk_bound_dev_if;
2427 	struct dst_entry *dst;
2428 
2429 	if (!oif && skb->dev)
2430 		oif = l3mdev_master_ifindex(skb->dev);
2431 
2432 	ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid);
2433 
2434 	dst = __sk_dst_get(sk);
2435 	if (!dst || !dst->obsolete ||
2436 	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2437 		return;
2438 
2439 	bh_lock_sock(sk);
2440 	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2441 		ip6_datagram_dst_update(sk, false);
2442 	bh_unlock_sock(sk);
2443 }
2444 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2445 
2446 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
2447 			   const struct flowi6 *fl6)
2448 {
2449 #ifdef CONFIG_IPV6_SUBTREES
2450 	struct ipv6_pinfo *np = inet6_sk(sk);
2451 #endif
2452 
2453 	ip6_dst_store(sk, dst,
2454 		      ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
2455 		      &sk->sk_v6_daddr : NULL,
2456 #ifdef CONFIG_IPV6_SUBTREES
2457 		      ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
2458 		      &np->saddr :
2459 #endif
2460 		      NULL);
2461 }
2462 
2463 static bool ip6_redirect_nh_match(const struct fib6_result *res,
2464 				  struct flowi6 *fl6,
2465 				  const struct in6_addr *gw,
2466 				  struct rt6_info **ret)
2467 {
2468 	const struct fib6_nh *nh = res->nh;
2469 
2470 	if (nh->fib_nh_flags & RTNH_F_DEAD || !nh->fib_nh_gw_family ||
2471 	    fl6->flowi6_oif != nh->fib_nh_dev->ifindex)
2472 		return false;
2473 
2474 	/* rt_cache's gateway might be different from its 'parent'
2475 	 * in the case of an ip redirect.
2476 	 * So we keep searching in the exception table if the gateway
2477 	 * is different.
2478 	 */
2479 	if (!ipv6_addr_equal(gw, &nh->fib_nh_gw6)) {
2480 		struct rt6_info *rt_cache;
2481 
2482 		rt_cache = rt6_find_cached_rt(res, &fl6->daddr, &fl6->saddr);
2483 		if (rt_cache &&
2484 		    ipv6_addr_equal(gw, &rt_cache->rt6i_gateway)) {
2485 			*ret = rt_cache;
2486 			return true;
2487 		}
2488 		return false;
2489 	}
2490 	return true;
2491 }
2492 
2493 /* Handle redirects */
2494 struct ip6rd_flowi {
2495 	struct flowi6 fl6;
2496 	struct in6_addr gateway;
2497 };
2498 
2499 static struct rt6_info *__ip6_route_redirect(struct net *net,
2500 					     struct fib6_table *table,
2501 					     struct flowi6 *fl6,
2502 					     const struct sk_buff *skb,
2503 					     int flags)
2504 {
2505 	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2506 	struct rt6_info *ret = NULL;
2507 	struct fib6_result res = {};
2508 	struct fib6_info *rt;
2509 	struct fib6_node *fn;
2510 
2511 	/* l3mdev_update_flow overrides oif if the device is enslaved; in
2512 	 * this case we must match on the real ingress device, so reset it
2513 	 */
2514 	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
2515 		fl6->flowi6_oif = skb->dev->ifindex;
2516 
2517 	/* Get the "current" route for this destination and
2518 	 * check if the redirect has come from appropriate router.
2519 	 *
2520 	 * RFC 4861 specifies that redirects should only be
2521 	 * accepted if they come from the nexthop to the target.
2522 	 * Due to the way the routes are chosen, this notion
2523 	 * is a bit fuzzy and one might need to check all possible
2524 	 * routes.
2525 	 */
2526 
2527 	rcu_read_lock();
2528 	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2529 restart:
2530 	for_each_fib6_node_rt_rcu(fn) {
2531 		res.f6i = rt;
2532 		res.nh = &rt->fib6_nh;
2533 
2534 		if (fib6_check_expired(rt))
2535 			continue;
2536 		if (rt->fib6_flags & RTF_REJECT)
2537 			break;
2538 		if (ip6_redirect_nh_match(&res, fl6, &rdfl->gateway, &ret))
2539 			goto out;
2540 	}
2541 
2542 	if (!rt)
2543 		rt = net->ipv6.fib6_null_entry;
2544 	else if (rt->fib6_flags & RTF_REJECT) {
2545 		ret = net->ipv6.ip6_null_entry;
2546 		goto out;
2547 	}
2548 
2549 	if (rt == net->ipv6.fib6_null_entry) {
2550 		fn = fib6_backtrack(fn, &fl6->saddr);
2551 		if (fn)
2552 			goto restart;
2553 	}
2554 
2555 	res.f6i = rt;
2556 	res.nh = &rt->fib6_nh;
2557 out:
2558 	if (ret) {
2559 		ip6_hold_safe(net, &ret);
2560 	} else {
2561 		res.fib6_flags = res.f6i->fib6_flags;
2562 		res.fib6_type = res.f6i->fib6_type;
2563 		ret = ip6_create_rt_rcu(&res);
2564 	}
2565 
2566 	rcu_read_unlock();
2567 
2568 	trace_fib6_table_lookup(net, &res, table, fl6);
2569 	return ret;
2570 };
2571 
2572 static struct dst_entry *ip6_route_redirect(struct net *net,
2573 					    const struct flowi6 *fl6,
2574 					    const struct sk_buff *skb,
2575 					    const struct in6_addr *gateway)
2576 {
2577 	int flags = RT6_LOOKUP_F_HAS_SADDR;
2578 	struct ip6rd_flowi rdfl;
2579 
2580 	rdfl.fl6 = *fl6;
2581 	rdfl.gateway = *gateway;
2582 
2583 	return fib6_rule_lookup(net, &rdfl.fl6, skb,
2584 				flags, __ip6_route_redirect);
2585 }
2586 
2587 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2588 		  kuid_t uid)
2589 {
2590 	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2591 	struct dst_entry *dst;
2592 	struct flowi6 fl6 = {
2593 		.flowi6_iif = LOOPBACK_IFINDEX,
2594 		.flowi6_oif = oif,
2595 		.flowi6_mark = mark,
2596 		.daddr = iph->daddr,
2597 		.saddr = iph->saddr,
2598 		.flowlabel = ip6_flowinfo(iph),
2599 		.flowi6_uid = uid,
2600 	};
2601 
2602 	dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2603 	rt6_do_redirect(dst, NULL, skb);
2604 	dst_release(dst);
2605 }
2606 EXPORT_SYMBOL_GPL(ip6_redirect);
2607 
2608 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif)
2609 {
2610 	const struct ipv6hdr *iph = ipv6_hdr(skb);
2611 	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2612 	struct dst_entry *dst;
2613 	struct flowi6 fl6 = {
2614 		.flowi6_iif = LOOPBACK_IFINDEX,
2615 		.flowi6_oif = oif,
2616 		.daddr = msg->dest,
2617 		.saddr = iph->daddr,
2618 		.flowi6_uid = sock_net_uid(net, NULL),
2619 	};
2620 
2621 	dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2622 	rt6_do_redirect(dst, NULL, skb);
2623 	dst_release(dst);
2624 }
2625 
2626 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2627 {
2628 	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2629 		     sk->sk_uid);
2630 }
2631 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2632 
2633 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2634 {
2635 	struct net_device *dev = dst->dev;
2636 	unsigned int mtu = dst_mtu(dst);
2637 	struct net *net = dev_net(dev);
2638 
2639 	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2640 
2641 	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2642 		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2643 
2644 	/*
2645 	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2646 	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2647 	 * IPV6_MAXPLEN is also valid and means: "any MSS,
2648 	 * rely only on pmtu discovery"
2649 	 */
2650 	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2651 		mtu = IPV6_MAXPLEN;
2652 	return mtu;
2653 }
2654 
2655 static unsigned int ip6_mtu(const struct dst_entry *dst)
2656 {
2657 	struct inet6_dev *idev;
2658 	unsigned int mtu;
2659 
2660 	mtu = dst_metric_raw(dst, RTAX_MTU);
2661 	if (mtu)
2662 		goto out;
2663 
2664 	mtu = IPV6_MIN_MTU;
2665 
2666 	rcu_read_lock();
2667 	idev = __in6_dev_get(dst->dev);
2668 	if (idev)
2669 		mtu = idev->cnf.mtu6;
2670 	rcu_read_unlock();
2671 
2672 out:
2673 	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2674 
2675 	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2676 }
2677 
2678 /* MTU selection:
2679  * 1. mtu on route is locked - use it
2680  * 2. mtu from nexthop exception
2681  * 3. mtu from egress device
2682  *
2683  * based on ip6_dst_mtu_forward and exception logic of
2684  * rt6_find_cached_rt; called with rcu_read_lock
2685  */
2686 u32 ip6_mtu_from_fib6(const struct fib6_result *res,
2687 		      const struct in6_addr *daddr,
2688 		      const struct in6_addr *saddr)
2689 {
2690 	const struct fib6_nh *nh = res->nh;
2691 	struct fib6_info *f6i = res->f6i;
2692 	struct inet6_dev *idev;
2693 	struct rt6_info *rt;
2694 	u32 mtu = 0;
2695 
2696 	if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
2697 		mtu = f6i->fib6_pmtu;
2698 		if (mtu)
2699 			goto out;
2700 	}
2701 
2702 	rt = rt6_find_cached_rt(res, daddr, saddr);
2703 	if (unlikely(rt)) {
2704 		mtu = dst_metric_raw(&rt->dst, RTAX_MTU);
2705 	} else {
2706 		struct net_device *dev = nh->fib_nh_dev;
2707 
2708 		mtu = IPV6_MIN_MTU;
2709 		idev = __in6_dev_get(dev);
2710 		if (idev && idev->cnf.mtu6 > mtu)
2711 			mtu = idev->cnf.mtu6;
2712 	}
2713 
2714 	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2715 out:
2716 	return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
2717 }
2718 
2719 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2720 				  struct flowi6 *fl6)
2721 {
2722 	struct dst_entry *dst;
2723 	struct rt6_info *rt;
2724 	struct inet6_dev *idev = in6_dev_get(dev);
2725 	struct net *net = dev_net(dev);
2726 
2727 	if (unlikely(!idev))
2728 		return ERR_PTR(-ENODEV);
2729 
2730 	rt = ip6_dst_alloc(net, dev, 0);
2731 	if (unlikely(!rt)) {
2732 		in6_dev_put(idev);
2733 		dst = ERR_PTR(-ENOMEM);
2734 		goto out;
2735 	}
2736 
2737 	rt->dst.flags |= DST_HOST;
2738 	rt->dst.input = ip6_input;
2739 	rt->dst.output  = ip6_output;
2740 	rt->rt6i_gateway  = fl6->daddr;
2741 	rt->rt6i_dst.addr = fl6->daddr;
2742 	rt->rt6i_dst.plen = 128;
2743 	rt->rt6i_idev     = idev;
2744 	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2745 
2746 	/* Add this dst into uncached_list so that rt6_disable_ip() can
2747 	 * do proper release of the net_device
2748 	 */
2749 	rt6_uncached_list_add(rt);
2750 	atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2751 
2752 	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2753 
2754 out:
2755 	return dst;
2756 }
2757 
2758 static int ip6_dst_gc(struct dst_ops *ops)
2759 {
2760 	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2761 	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2762 	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2763 	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2764 	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2765 	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2766 	int entries;
2767 
2768 	entries = dst_entries_get_fast(ops);
2769 	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2770 	    entries <= rt_max_size)
2771 		goto out;
2772 
2773 	net->ipv6.ip6_rt_gc_expire++;
2774 	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2775 	entries = dst_entries_get_slow(ops);
2776 	if (entries < ops->gc_thresh)
2777 		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2778 out:
2779 	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2780 	return entries > rt_max_size;
2781 }
2782 
2783 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2784 					    struct fib6_config *cfg,
2785 					    const struct in6_addr *gw_addr,
2786 					    u32 tbid, int flags)
2787 {
2788 	struct flowi6 fl6 = {
2789 		.flowi6_oif = cfg->fc_ifindex,
2790 		.daddr = *gw_addr,
2791 		.saddr = cfg->fc_prefsrc,
2792 	};
2793 	struct fib6_table *table;
2794 	struct rt6_info *rt;
2795 
2796 	table = fib6_get_table(net, tbid);
2797 	if (!table)
2798 		return NULL;
2799 
2800 	if (!ipv6_addr_any(&cfg->fc_prefsrc))
2801 		flags |= RT6_LOOKUP_F_HAS_SADDR;
2802 
2803 	flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2804 	rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2805 
2806 	/* if table lookup failed, fall back to full lookup */
2807 	if (rt == net->ipv6.ip6_null_entry) {
2808 		ip6_rt_put(rt);
2809 		rt = NULL;
2810 	}
2811 
2812 	return rt;
2813 }
2814 
2815 static int ip6_route_check_nh_onlink(struct net *net,
2816 				     struct fib6_config *cfg,
2817 				     const struct net_device *dev,
2818 				     struct netlink_ext_ack *extack)
2819 {
2820 	u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2821 	const struct in6_addr *gw_addr = &cfg->fc_gateway;
2822 	u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2823 	struct fib6_info *from;
2824 	struct rt6_info *grt;
2825 	int err;
2826 
2827 	err = 0;
2828 	grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2829 	if (grt) {
2830 		rcu_read_lock();
2831 		from = rcu_dereference(grt->from);
2832 		if (!grt->dst.error &&
2833 		    /* ignore match if it is the default route */
2834 		    from && !ipv6_addr_any(&from->fib6_dst.addr) &&
2835 		    (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2836 			NL_SET_ERR_MSG(extack,
2837 				       "Nexthop has invalid gateway or device mismatch");
2838 			err = -EINVAL;
2839 		}
2840 		rcu_read_unlock();
2841 
2842 		ip6_rt_put(grt);
2843 	}
2844 
2845 	return err;
2846 }
2847 
2848 static int ip6_route_check_nh(struct net *net,
2849 			      struct fib6_config *cfg,
2850 			      struct net_device **_dev,
2851 			      struct inet6_dev **idev)
2852 {
2853 	const struct in6_addr *gw_addr = &cfg->fc_gateway;
2854 	struct net_device *dev = _dev ? *_dev : NULL;
2855 	struct rt6_info *grt = NULL;
2856 	int err = -EHOSTUNREACH;
2857 
2858 	if (cfg->fc_table) {
2859 		int flags = RT6_LOOKUP_F_IFACE;
2860 
2861 		grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2862 					  cfg->fc_table, flags);
2863 		if (grt) {
2864 			if (grt->rt6i_flags & RTF_GATEWAY ||
2865 			    (dev && dev != grt->dst.dev)) {
2866 				ip6_rt_put(grt);
2867 				grt = NULL;
2868 			}
2869 		}
2870 	}
2871 
2872 	if (!grt)
2873 		grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
2874 
2875 	if (!grt)
2876 		goto out;
2877 
2878 	if (dev) {
2879 		if (dev != grt->dst.dev) {
2880 			ip6_rt_put(grt);
2881 			goto out;
2882 		}
2883 	} else {
2884 		*_dev = dev = grt->dst.dev;
2885 		*idev = grt->rt6i_idev;
2886 		dev_hold(dev);
2887 		in6_dev_hold(grt->rt6i_idev);
2888 	}
2889 
2890 	if (!(grt->rt6i_flags & RTF_GATEWAY))
2891 		err = 0;
2892 
2893 	ip6_rt_put(grt);
2894 
2895 out:
2896 	return err;
2897 }
2898 
2899 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
2900 			   struct net_device **_dev, struct inet6_dev **idev,
2901 			   struct netlink_ext_ack *extack)
2902 {
2903 	const struct in6_addr *gw_addr = &cfg->fc_gateway;
2904 	int gwa_type = ipv6_addr_type(gw_addr);
2905 	bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
2906 	const struct net_device *dev = *_dev;
2907 	bool need_addr_check = !dev;
2908 	int err = -EINVAL;
2909 
2910 	/* if gw_addr is local we will fail to detect this in case
2911 	 * address is still TENTATIVE (DAD in progress). rt6_lookup()
2912 	 * will return already-added prefix route via interface that
2913 	 * prefix route was assigned to, which might be non-loopback.
2914 	 */
2915 	if (dev &&
2916 	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2917 		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2918 		goto out;
2919 	}
2920 
2921 	if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
2922 		/* IPv6 strictly inhibits using not link-local
2923 		 * addresses as nexthop address.
2924 		 * Otherwise, router will not able to send redirects.
2925 		 * It is very good, but in some (rare!) circumstances
2926 		 * (SIT, PtP, NBMA NOARP links) it is handy to allow
2927 		 * some exceptions. --ANK
2928 		 * We allow IPv4-mapped nexthops to support RFC4798-type
2929 		 * addressing
2930 		 */
2931 		if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
2932 			NL_SET_ERR_MSG(extack, "Invalid gateway address");
2933 			goto out;
2934 		}
2935 
2936 		if (cfg->fc_flags & RTNH_F_ONLINK)
2937 			err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
2938 		else
2939 			err = ip6_route_check_nh(net, cfg, _dev, idev);
2940 
2941 		if (err)
2942 			goto out;
2943 	}
2944 
2945 	/* reload in case device was changed */
2946 	dev = *_dev;
2947 
2948 	err = -EINVAL;
2949 	if (!dev) {
2950 		NL_SET_ERR_MSG(extack, "Egress device not specified");
2951 		goto out;
2952 	} else if (dev->flags & IFF_LOOPBACK) {
2953 		NL_SET_ERR_MSG(extack,
2954 			       "Egress device can not be loopback device for this route");
2955 		goto out;
2956 	}
2957 
2958 	/* if we did not check gw_addr above, do so now that the
2959 	 * egress device has been resolved.
2960 	 */
2961 	if (need_addr_check &&
2962 	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2963 		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2964 		goto out;
2965 	}
2966 
2967 	err = 0;
2968 out:
2969 	return err;
2970 }
2971 
2972 static bool fib6_is_reject(u32 flags, struct net_device *dev, int addr_type)
2973 {
2974 	if ((flags & RTF_REJECT) ||
2975 	    (dev && (dev->flags & IFF_LOOPBACK) &&
2976 	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
2977 	     !(flags & RTF_LOCAL)))
2978 		return true;
2979 
2980 	return false;
2981 }
2982 
2983 int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
2984 		 struct fib6_config *cfg, gfp_t gfp_flags,
2985 		 struct netlink_ext_ack *extack)
2986 {
2987 	struct net_device *dev = NULL;
2988 	struct inet6_dev *idev = NULL;
2989 	int addr_type;
2990 	int err;
2991 
2992 	fib6_nh->fib_nh_family = AF_INET6;
2993 
2994 	err = -ENODEV;
2995 	if (cfg->fc_ifindex) {
2996 		dev = dev_get_by_index(net, cfg->fc_ifindex);
2997 		if (!dev)
2998 			goto out;
2999 		idev = in6_dev_get(dev);
3000 		if (!idev)
3001 			goto out;
3002 	}
3003 
3004 	if (cfg->fc_flags & RTNH_F_ONLINK) {
3005 		if (!dev) {
3006 			NL_SET_ERR_MSG(extack,
3007 				       "Nexthop device required for onlink");
3008 			goto out;
3009 		}
3010 
3011 		if (!(dev->flags & IFF_UP)) {
3012 			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3013 			err = -ENETDOWN;
3014 			goto out;
3015 		}
3016 
3017 		fib6_nh->fib_nh_flags |= RTNH_F_ONLINK;
3018 	}
3019 
3020 	fib6_nh->fib_nh_weight = 1;
3021 
3022 	/* We cannot add true routes via loopback here,
3023 	 * they would result in kernel looping; promote them to reject routes
3024 	 */
3025 	addr_type = ipv6_addr_type(&cfg->fc_dst);
3026 	if (fib6_is_reject(cfg->fc_flags, dev, addr_type)) {
3027 		/* hold loopback dev/idev if we haven't done so. */
3028 		if (dev != net->loopback_dev) {
3029 			if (dev) {
3030 				dev_put(dev);
3031 				in6_dev_put(idev);
3032 			}
3033 			dev = net->loopback_dev;
3034 			dev_hold(dev);
3035 			idev = in6_dev_get(dev);
3036 			if (!idev) {
3037 				err = -ENODEV;
3038 				goto out;
3039 			}
3040 		}
3041 		goto set_dev;
3042 	}
3043 
3044 	if (cfg->fc_flags & RTF_GATEWAY) {
3045 		err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
3046 		if (err)
3047 			goto out;
3048 
3049 		fib6_nh->fib_nh_gw6 = cfg->fc_gateway;
3050 		fib6_nh->fib_nh_gw_family = AF_INET6;
3051 	}
3052 
3053 	err = -ENODEV;
3054 	if (!dev)
3055 		goto out;
3056 
3057 	if (idev->cnf.disable_ipv6) {
3058 		NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
3059 		err = -EACCES;
3060 		goto out;
3061 	}
3062 
3063 	if (!(dev->flags & IFF_UP) && !cfg->fc_ignore_dev_down) {
3064 		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3065 		err = -ENETDOWN;
3066 		goto out;
3067 	}
3068 
3069 	if (!(cfg->fc_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
3070 	    !netif_carrier_ok(dev))
3071 		fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;
3072 
3073 	err = fib_nh_common_init(&fib6_nh->nh_common, cfg->fc_encap,
3074 				 cfg->fc_encap_type, cfg, gfp_flags, extack);
3075 	if (err)
3076 		goto out;
3077 set_dev:
3078 	fib6_nh->fib_nh_dev = dev;
3079 	fib6_nh->fib_nh_oif = dev->ifindex;
3080 	err = 0;
3081 out:
3082 	if (idev)
3083 		in6_dev_put(idev);
3084 
3085 	if (err) {
3086 		lwtstate_put(fib6_nh->fib_nh_lws);
3087 		fib6_nh->fib_nh_lws = NULL;
3088 		if (dev)
3089 			dev_put(dev);
3090 	}
3091 
3092 	return err;
3093 }
3094 
3095 void fib6_nh_release(struct fib6_nh *fib6_nh)
3096 {
3097 	fib_nh_common_release(&fib6_nh->nh_common);
3098 }
3099 
3100 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
3101 					      gfp_t gfp_flags,
3102 					      struct netlink_ext_ack *extack)
3103 {
3104 	struct net *net = cfg->fc_nlinfo.nl_net;
3105 	struct fib6_info *rt = NULL;
3106 	struct fib6_table *table;
3107 	int err = -EINVAL;
3108 	int addr_type;
3109 
3110 	/* RTF_PCPU is an internal flag; can not be set by userspace */
3111 	if (cfg->fc_flags & RTF_PCPU) {
3112 		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
3113 		goto out;
3114 	}
3115 
3116 	/* RTF_CACHE is an internal flag; can not be set by userspace */
3117 	if (cfg->fc_flags & RTF_CACHE) {
3118 		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
3119 		goto out;
3120 	}
3121 
3122 	if (cfg->fc_type > RTN_MAX) {
3123 		NL_SET_ERR_MSG(extack, "Invalid route type");
3124 		goto out;
3125 	}
3126 
3127 	if (cfg->fc_dst_len > 128) {
3128 		NL_SET_ERR_MSG(extack, "Invalid prefix length");
3129 		goto out;
3130 	}
3131 	if (cfg->fc_src_len > 128) {
3132 		NL_SET_ERR_MSG(extack, "Invalid source address length");
3133 		goto out;
3134 	}
3135 #ifndef CONFIG_IPV6_SUBTREES
3136 	if (cfg->fc_src_len) {
3137 		NL_SET_ERR_MSG(extack,
3138 			       "Specifying source address requires IPV6_SUBTREES to be enabled");
3139 		goto out;
3140 	}
3141 #endif
3142 
3143 	err = -ENOBUFS;
3144 	if (cfg->fc_nlinfo.nlh &&
3145 	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
3146 		table = fib6_get_table(net, cfg->fc_table);
3147 		if (!table) {
3148 			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
3149 			table = fib6_new_table(net, cfg->fc_table);
3150 		}
3151 	} else {
3152 		table = fib6_new_table(net, cfg->fc_table);
3153 	}
3154 
3155 	if (!table)
3156 		goto out;
3157 
3158 	err = -ENOMEM;
3159 	rt = fib6_info_alloc(gfp_flags);
3160 	if (!rt)
3161 		goto out;
3162 
3163 	rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len,
3164 					       extack);
3165 	if (IS_ERR(rt->fib6_metrics)) {
3166 		err = PTR_ERR(rt->fib6_metrics);
3167 		/* Do not leave garbage there. */
3168 		rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
3169 		goto out;
3170 	}
3171 
3172 	if (cfg->fc_flags & RTF_ADDRCONF)
3173 		rt->dst_nocount = true;
3174 
3175 	if (cfg->fc_flags & RTF_EXPIRES)
3176 		fib6_set_expires(rt, jiffies +
3177 				clock_t_to_jiffies(cfg->fc_expires));
3178 	else
3179 		fib6_clean_expires(rt);
3180 
3181 	if (cfg->fc_protocol == RTPROT_UNSPEC)
3182 		cfg->fc_protocol = RTPROT_BOOT;
3183 	rt->fib6_protocol = cfg->fc_protocol;
3184 
3185 	rt->fib6_table = table;
3186 	rt->fib6_metric = cfg->fc_metric;
3187 	rt->fib6_type = cfg->fc_type;
3188 	rt->fib6_flags = cfg->fc_flags & ~RTF_GATEWAY;
3189 
3190 	ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
3191 	rt->fib6_dst.plen = cfg->fc_dst_len;
3192 	if (rt->fib6_dst.plen == 128)
3193 		rt->dst_host = true;
3194 
3195 #ifdef CONFIG_IPV6_SUBTREES
3196 	ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
3197 	rt->fib6_src.plen = cfg->fc_src_len;
3198 #endif
3199 	err = fib6_nh_init(net, &rt->fib6_nh, cfg, gfp_flags, extack);
3200 	if (err)
3201 		goto out;
3202 
3203 	/* We cannot add true routes via loopback here,
3204 	 * they would result in kernel looping; promote them to reject routes
3205 	 */
3206 	addr_type = ipv6_addr_type(&cfg->fc_dst);
3207 	if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh.fib_nh_dev, addr_type))
3208 		rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP;
3209 
3210 	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
3211 		struct net_device *dev = fib6_info_nh_dev(rt);
3212 
3213 		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
3214 			NL_SET_ERR_MSG(extack, "Invalid source address");
3215 			err = -EINVAL;
3216 			goto out;
3217 		}
3218 		rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
3219 		rt->fib6_prefsrc.plen = 128;
3220 	} else
3221 		rt->fib6_prefsrc.plen = 0;
3222 
3223 	return rt;
3224 out:
3225 	fib6_info_release(rt);
3226 	return ERR_PTR(err);
3227 }
3228 
3229 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3230 		  struct netlink_ext_ack *extack)
3231 {
3232 	struct fib6_info *rt;
3233 	int err;
3234 
3235 	rt = ip6_route_info_create(cfg, gfp_flags, extack);
3236 	if (IS_ERR(rt))
3237 		return PTR_ERR(rt);
3238 
3239 	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3240 	fib6_info_release(rt);
3241 
3242 	return err;
3243 }
3244 
3245 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3246 {
3247 	struct net *net = info->nl_net;
3248 	struct fib6_table *table;
3249 	int err;
3250 
3251 	if (rt == net->ipv6.fib6_null_entry) {
3252 		err = -ENOENT;
3253 		goto out;
3254 	}
3255 
3256 	table = rt->fib6_table;
3257 	spin_lock_bh(&table->tb6_lock);
3258 	err = fib6_del(rt, info);
3259 	spin_unlock_bh(&table->tb6_lock);
3260 
3261 out:
3262 	fib6_info_release(rt);
3263 	return err;
3264 }
3265 
3266 int ip6_del_rt(struct net *net, struct fib6_info *rt)
3267 {
3268 	struct nl_info info = { .nl_net = net };
3269 
3270 	return __ip6_del_rt(rt, &info);
3271 }
3272 
3273 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
3274 {
3275 	struct nl_info *info = &cfg->fc_nlinfo;
3276 	struct net *net = info->nl_net;
3277 	struct sk_buff *skb = NULL;
3278 	struct fib6_table *table;
3279 	int err = -ENOENT;
3280 
3281 	if (rt == net->ipv6.fib6_null_entry)
3282 		goto out_put;
3283 	table = rt->fib6_table;
3284 	spin_lock_bh(&table->tb6_lock);
3285 
3286 	if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
3287 		struct fib6_info *sibling, *next_sibling;
3288 
3289 		/* prefer to send a single notification with all hops */
3290 		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3291 		if (skb) {
3292 			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3293 
3294 			if (rt6_fill_node(net, skb, rt, NULL,
3295 					  NULL, NULL, 0, RTM_DELROUTE,
3296 					  info->portid, seq, 0) < 0) {
3297 				kfree_skb(skb);
3298 				skb = NULL;
3299 			} else
3300 				info->skip_notify = 1;
3301 		}
3302 
3303 		list_for_each_entry_safe(sibling, next_sibling,
3304 					 &rt->fib6_siblings,
3305 					 fib6_siblings) {
3306 			err = fib6_del(sibling, info);
3307 			if (err)
3308 				goto out_unlock;
3309 		}
3310 	}
3311 
3312 	err = fib6_del(rt, info);
3313 out_unlock:
3314 	spin_unlock_bh(&table->tb6_lock);
3315 out_put:
3316 	fib6_info_release(rt);
3317 
3318 	if (skb) {
3319 		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3320 			    info->nlh, gfp_any());
3321 	}
3322 	return err;
3323 }
3324 
3325 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3326 {
3327 	int rc = -ESRCH;
3328 
3329 	if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3330 		goto out;
3331 
3332 	if (cfg->fc_flags & RTF_GATEWAY &&
3333 	    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3334 		goto out;
3335 
3336 	rc = rt6_remove_exception_rt(rt);
3337 out:
3338 	return rc;
3339 }
3340 
3341 static int ip6_route_del(struct fib6_config *cfg,
3342 			 struct netlink_ext_ack *extack)
3343 {
3344 	struct rt6_info *rt_cache;
3345 	struct fib6_table *table;
3346 	struct fib6_info *rt;
3347 	struct fib6_node *fn;
3348 	int err = -ESRCH;
3349 
3350 	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3351 	if (!table) {
3352 		NL_SET_ERR_MSG(extack, "FIB table does not exist");
3353 		return err;
3354 	}
3355 
3356 	rcu_read_lock();
3357 
3358 	fn = fib6_locate(&table->tb6_root,
3359 			 &cfg->fc_dst, cfg->fc_dst_len,
3360 			 &cfg->fc_src, cfg->fc_src_len,
3361 			 !(cfg->fc_flags & RTF_CACHE));
3362 
3363 	if (fn) {
3364 		for_each_fib6_node_rt_rcu(fn) {
3365 			struct fib6_nh *nh;
3366 
3367 			if (cfg->fc_flags & RTF_CACHE) {
3368 				struct fib6_result res = {
3369 					.f6i = rt,
3370 				};
3371 				int rc;
3372 
3373 				rt_cache = rt6_find_cached_rt(&res,
3374 							      &cfg->fc_dst,
3375 							      &cfg->fc_src);
3376 				if (rt_cache) {
3377 					rc = ip6_del_cached_rt(rt_cache, cfg);
3378 					if (rc != -ESRCH) {
3379 						rcu_read_unlock();
3380 						return rc;
3381 					}
3382 				}
3383 				continue;
3384 			}
3385 
3386 			nh = &rt->fib6_nh;
3387 			if (cfg->fc_ifindex &&
3388 			    (!nh->fib_nh_dev ||
3389 			     nh->fib_nh_dev->ifindex != cfg->fc_ifindex))
3390 				continue;
3391 			if (cfg->fc_flags & RTF_GATEWAY &&
3392 			    !ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6))
3393 				continue;
3394 			if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
3395 				continue;
3396 			if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
3397 				continue;
3398 			if (!fib6_info_hold_safe(rt))
3399 				continue;
3400 			rcu_read_unlock();
3401 
3402 			/* if gateway was specified only delete the one hop */
3403 			if (cfg->fc_flags & RTF_GATEWAY)
3404 				return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3405 
3406 			return __ip6_del_rt_siblings(rt, cfg);
3407 		}
3408 	}
3409 	rcu_read_unlock();
3410 
3411 	return err;
3412 }
3413 
3414 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3415 {
3416 	struct netevent_redirect netevent;
3417 	struct rt6_info *rt, *nrt = NULL;
3418 	struct fib6_result res = {};
3419 	struct ndisc_options ndopts;
3420 	struct inet6_dev *in6_dev;
3421 	struct neighbour *neigh;
3422 	struct rd_msg *msg;
3423 	int optlen, on_link;
3424 	u8 *lladdr;
3425 
3426 	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3427 	optlen -= sizeof(*msg);
3428 
3429 	if (optlen < 0) {
3430 		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3431 		return;
3432 	}
3433 
3434 	msg = (struct rd_msg *)icmp6_hdr(skb);
3435 
3436 	if (ipv6_addr_is_multicast(&msg->dest)) {
3437 		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3438 		return;
3439 	}
3440 
3441 	on_link = 0;
3442 	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3443 		on_link = 1;
3444 	} else if (ipv6_addr_type(&msg->target) !=
3445 		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3446 		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3447 		return;
3448 	}
3449 
3450 	in6_dev = __in6_dev_get(skb->dev);
3451 	if (!in6_dev)
3452 		return;
3453 	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3454 		return;
3455 
3456 	/* RFC2461 8.1:
3457 	 *	The IP source address of the Redirect MUST be the same as the current
3458 	 *	first-hop router for the specified ICMP Destination Address.
3459 	 */
3460 
3461 	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3462 		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3463 		return;
3464 	}
3465 
3466 	lladdr = NULL;
3467 	if (ndopts.nd_opts_tgt_lladdr) {
3468 		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3469 					     skb->dev);
3470 		if (!lladdr) {
3471 			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3472 			return;
3473 		}
3474 	}
3475 
3476 	rt = (struct rt6_info *) dst;
3477 	if (rt->rt6i_flags & RTF_REJECT) {
3478 		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3479 		return;
3480 	}
3481 
3482 	/* Redirect received -> path was valid.
3483 	 * Look, redirects are sent only in response to data packets,
3484 	 * so that this nexthop apparently is reachable. --ANK
3485 	 */
3486 	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3487 
3488 	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3489 	if (!neigh)
3490 		return;
3491 
3492 	/*
3493 	 *	We have finally decided to accept it.
3494 	 */
3495 
3496 	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3497 		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
3498 		     NEIGH_UPDATE_F_OVERRIDE|
3499 		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3500 				     NEIGH_UPDATE_F_ISROUTER)),
3501 		     NDISC_REDIRECT, &ndopts);
3502 
3503 	rcu_read_lock();
3504 	res.f6i = rcu_dereference(rt->from);
3505 	if (!res.f6i)
3506 		goto out;
3507 
3508 	res.nh = &res.f6i->fib6_nh;
3509 	res.fib6_flags = res.f6i->fib6_flags;
3510 	res.fib6_type = res.f6i->fib6_type;
3511 	nrt = ip6_rt_cache_alloc(&res, &msg->dest, NULL);
3512 	if (!nrt)
3513 		goto out;
3514 
3515 	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3516 	if (on_link)
3517 		nrt->rt6i_flags &= ~RTF_GATEWAY;
3518 
3519 	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3520 
3521 	/* rt6_insert_exception() will take care of duplicated exceptions */
3522 	if (rt6_insert_exception(nrt, &res)) {
3523 		dst_release_immediate(&nrt->dst);
3524 		goto out;
3525 	}
3526 
3527 	netevent.old = &rt->dst;
3528 	netevent.new = &nrt->dst;
3529 	netevent.daddr = &msg->dest;
3530 	netevent.neigh = neigh;
3531 	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3532 
3533 out:
3534 	rcu_read_unlock();
3535 	neigh_release(neigh);
3536 }
3537 
3538 #ifdef CONFIG_IPV6_ROUTE_INFO
3539 static struct fib6_info *rt6_get_route_info(struct net *net,
3540 					   const struct in6_addr *prefix, int prefixlen,
3541 					   const struct in6_addr *gwaddr,
3542 					   struct net_device *dev)
3543 {
3544 	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3545 	int ifindex = dev->ifindex;
3546 	struct fib6_node *fn;
3547 	struct fib6_info *rt = NULL;
3548 	struct fib6_table *table;
3549 
3550 	table = fib6_get_table(net, tb_id);
3551 	if (!table)
3552 		return NULL;
3553 
3554 	rcu_read_lock();
3555 	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3556 	if (!fn)
3557 		goto out;
3558 
3559 	for_each_fib6_node_rt_rcu(fn) {
3560 		if (rt->fib6_nh.fib_nh_dev->ifindex != ifindex)
3561 			continue;
3562 		if (!(rt->fib6_flags & RTF_ROUTEINFO) ||
3563 		    !rt->fib6_nh.fib_nh_gw_family)
3564 			continue;
3565 		if (!ipv6_addr_equal(&rt->fib6_nh.fib_nh_gw6, gwaddr))
3566 			continue;
3567 		if (!fib6_info_hold_safe(rt))
3568 			continue;
3569 		break;
3570 	}
3571 out:
3572 	rcu_read_unlock();
3573 	return rt;
3574 }
3575 
3576 static struct fib6_info *rt6_add_route_info(struct net *net,
3577 					   const struct in6_addr *prefix, int prefixlen,
3578 					   const struct in6_addr *gwaddr,
3579 					   struct net_device *dev,
3580 					   unsigned int pref)
3581 {
3582 	struct fib6_config cfg = {
3583 		.fc_metric	= IP6_RT_PRIO_USER,
3584 		.fc_ifindex	= dev->ifindex,
3585 		.fc_dst_len	= prefixlen,
3586 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3587 				  RTF_UP | RTF_PREF(pref),
3588 		.fc_protocol = RTPROT_RA,
3589 		.fc_type = RTN_UNICAST,
3590 		.fc_nlinfo.portid = 0,
3591 		.fc_nlinfo.nlh = NULL,
3592 		.fc_nlinfo.nl_net = net,
3593 	};
3594 
3595 	cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3596 	cfg.fc_dst = *prefix;
3597 	cfg.fc_gateway = *gwaddr;
3598 
3599 	/* We should treat it as a default route if prefix length is 0. */
3600 	if (!prefixlen)
3601 		cfg.fc_flags |= RTF_DEFAULT;
3602 
3603 	ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3604 
3605 	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3606 }
3607 #endif
3608 
3609 struct fib6_info *rt6_get_dflt_router(struct net *net,
3610 				     const struct in6_addr *addr,
3611 				     struct net_device *dev)
3612 {
3613 	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3614 	struct fib6_info *rt;
3615 	struct fib6_table *table;
3616 
3617 	table = fib6_get_table(net, tb_id);
3618 	if (!table)
3619 		return NULL;
3620 
3621 	rcu_read_lock();
3622 	for_each_fib6_node_rt_rcu(&table->tb6_root) {
3623 		struct fib6_nh *nh = &rt->fib6_nh;
3624 
3625 		if (dev == nh->fib_nh_dev &&
3626 		    ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3627 		    ipv6_addr_equal(&nh->fib_nh_gw6, addr))
3628 			break;
3629 	}
3630 	if (rt && !fib6_info_hold_safe(rt))
3631 		rt = NULL;
3632 	rcu_read_unlock();
3633 	return rt;
3634 }
3635 
3636 struct fib6_info *rt6_add_dflt_router(struct net *net,
3637 				     const struct in6_addr *gwaddr,
3638 				     struct net_device *dev,
3639 				     unsigned int pref)
3640 {
3641 	struct fib6_config cfg = {
3642 		.fc_table	= l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3643 		.fc_metric	= IP6_RT_PRIO_USER,
3644 		.fc_ifindex	= dev->ifindex,
3645 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3646 				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3647 		.fc_protocol = RTPROT_RA,
3648 		.fc_type = RTN_UNICAST,
3649 		.fc_nlinfo.portid = 0,
3650 		.fc_nlinfo.nlh = NULL,
3651 		.fc_nlinfo.nl_net = net,
3652 	};
3653 
3654 	cfg.fc_gateway = *gwaddr;
3655 
3656 	if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3657 		struct fib6_table *table;
3658 
3659 		table = fib6_get_table(dev_net(dev), cfg.fc_table);
3660 		if (table)
3661 			table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3662 	}
3663 
3664 	return rt6_get_dflt_router(net, gwaddr, dev);
3665 }
3666 
3667 static void __rt6_purge_dflt_routers(struct net *net,
3668 				     struct fib6_table *table)
3669 {
3670 	struct fib6_info *rt;
3671 
3672 restart:
3673 	rcu_read_lock();
3674 	for_each_fib6_node_rt_rcu(&table->tb6_root) {
3675 		struct net_device *dev = fib6_info_nh_dev(rt);
3676 		struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;
3677 
3678 		if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3679 		    (!idev || idev->cnf.accept_ra != 2) &&
3680 		    fib6_info_hold_safe(rt)) {
3681 			rcu_read_unlock();
3682 			ip6_del_rt(net, rt);
3683 			goto restart;
3684 		}
3685 	}
3686 	rcu_read_unlock();
3687 
3688 	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3689 }
3690 
3691 void rt6_purge_dflt_routers(struct net *net)
3692 {
3693 	struct fib6_table *table;
3694 	struct hlist_head *head;
3695 	unsigned int h;
3696 
3697 	rcu_read_lock();
3698 
3699 	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3700 		head = &net->ipv6.fib_table_hash[h];
3701 		hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3702 			if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3703 				__rt6_purge_dflt_routers(net, table);
3704 		}
3705 	}
3706 
3707 	rcu_read_unlock();
3708 }
3709 
3710 static void rtmsg_to_fib6_config(struct net *net,
3711 				 struct in6_rtmsg *rtmsg,
3712 				 struct fib6_config *cfg)
3713 {
3714 	*cfg = (struct fib6_config){
3715 		.fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3716 			 : RT6_TABLE_MAIN,
3717 		.fc_ifindex = rtmsg->rtmsg_ifindex,
3718 		.fc_metric = rtmsg->rtmsg_metric ? : IP6_RT_PRIO_USER,
3719 		.fc_expires = rtmsg->rtmsg_info,
3720 		.fc_dst_len = rtmsg->rtmsg_dst_len,
3721 		.fc_src_len = rtmsg->rtmsg_src_len,
3722 		.fc_flags = rtmsg->rtmsg_flags,
3723 		.fc_type = rtmsg->rtmsg_type,
3724 
3725 		.fc_nlinfo.nl_net = net,
3726 
3727 		.fc_dst = rtmsg->rtmsg_dst,
3728 		.fc_src = rtmsg->rtmsg_src,
3729 		.fc_gateway = rtmsg->rtmsg_gateway,
3730 	};
3731 }
3732 
3733 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3734 {
3735 	struct fib6_config cfg;
3736 	struct in6_rtmsg rtmsg;
3737 	int err;
3738 
3739 	switch (cmd) {
3740 	case SIOCADDRT:		/* Add a route */
3741 	case SIOCDELRT:		/* Delete a route */
3742 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3743 			return -EPERM;
3744 		err = copy_from_user(&rtmsg, arg,
3745 				     sizeof(struct in6_rtmsg));
3746 		if (err)
3747 			return -EFAULT;
3748 
3749 		rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3750 
3751 		rtnl_lock();
3752 		switch (cmd) {
3753 		case SIOCADDRT:
3754 			err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3755 			break;
3756 		case SIOCDELRT:
3757 			err = ip6_route_del(&cfg, NULL);
3758 			break;
3759 		default:
3760 			err = -EINVAL;
3761 		}
3762 		rtnl_unlock();
3763 
3764 		return err;
3765 	}
3766 
3767 	return -EINVAL;
3768 }
3769 
3770 /*
3771  *	Drop the packet on the floor
3772  */
3773 
3774 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3775 {
3776 	struct dst_entry *dst = skb_dst(skb);
3777 	struct net *net = dev_net(dst->dev);
3778 	struct inet6_dev *idev;
3779 	int type;
3780 
3781 	if (netif_is_l3_master(skb->dev) &&
3782 	    dst->dev == net->loopback_dev)
3783 		idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
3784 	else
3785 		idev = ip6_dst_idev(dst);
3786 
3787 	switch (ipstats_mib_noroutes) {
3788 	case IPSTATS_MIB_INNOROUTES:
3789 		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3790 		if (type == IPV6_ADDR_ANY) {
3791 			IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
3792 			break;
3793 		}
3794 		/* FALLTHROUGH */
3795 	case IPSTATS_MIB_OUTNOROUTES:
3796 		IP6_INC_STATS(net, idev, ipstats_mib_noroutes);
3797 		break;
3798 	}
3799 
3800 	/* Start over by dropping the dst for l3mdev case */
3801 	if (netif_is_l3_master(skb->dev))
3802 		skb_dst_drop(skb);
3803 
3804 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3805 	kfree_skb(skb);
3806 	return 0;
3807 }
3808 
3809 static int ip6_pkt_discard(struct sk_buff *skb)
3810 {
3811 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3812 }
3813 
3814 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3815 {
3816 	skb->dev = skb_dst(skb)->dev;
3817 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3818 }
3819 
3820 static int ip6_pkt_prohibit(struct sk_buff *skb)
3821 {
3822 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3823 }
3824 
3825 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3826 {
3827 	skb->dev = skb_dst(skb)->dev;
3828 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3829 }
3830 
3831 /*
3832  *	Allocate a dst for local (unicast / anycast) address.
3833  */
3834 
3835 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3836 				     struct inet6_dev *idev,
3837 				     const struct in6_addr *addr,
3838 				     bool anycast, gfp_t gfp_flags)
3839 {
3840 	struct fib6_config cfg = {
3841 		.fc_table = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL,
3842 		.fc_ifindex = idev->dev->ifindex,
3843 		.fc_flags = RTF_UP | RTF_ADDRCONF | RTF_NONEXTHOP,
3844 		.fc_dst = *addr,
3845 		.fc_dst_len = 128,
3846 		.fc_protocol = RTPROT_KERNEL,
3847 		.fc_nlinfo.nl_net = net,
3848 		.fc_ignore_dev_down = true,
3849 	};
3850 
3851 	if (anycast) {
3852 		cfg.fc_type = RTN_ANYCAST;
3853 		cfg.fc_flags |= RTF_ANYCAST;
3854 	} else {
3855 		cfg.fc_type = RTN_LOCAL;
3856 		cfg.fc_flags |= RTF_LOCAL;
3857 	}
3858 
3859 	return ip6_route_info_create(&cfg, gfp_flags, NULL);
3860 }
3861 
3862 /* remove deleted ip from prefsrc entries */
3863 struct arg_dev_net_ip {
3864 	struct net_device *dev;
3865 	struct net *net;
3866 	struct in6_addr *addr;
3867 };
3868 
3869 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3870 {
3871 	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3872 	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3873 	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3874 
3875 	if (((void *)rt->fib6_nh.fib_nh_dev == dev || !dev) &&
3876 	    rt != net->ipv6.fib6_null_entry &&
3877 	    ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3878 		spin_lock_bh(&rt6_exception_lock);
3879 		/* remove prefsrc entry */
3880 		rt->fib6_prefsrc.plen = 0;
3881 		spin_unlock_bh(&rt6_exception_lock);
3882 	}
3883 	return 0;
3884 }
3885 
3886 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3887 {
3888 	struct net *net = dev_net(ifp->idev->dev);
3889 	struct arg_dev_net_ip adni = {
3890 		.dev = ifp->idev->dev,
3891 		.net = net,
3892 		.addr = &ifp->addr,
3893 	};
3894 	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3895 }
3896 
3897 #define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT)
3898 
3899 /* Remove routers and update dst entries when gateway turn into host. */
3900 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3901 {
3902 	struct in6_addr *gateway = (struct in6_addr *)arg;
3903 
3904 	if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3905 	    rt->fib6_nh.fib_nh_gw_family &&
3906 	    ipv6_addr_equal(gateway, &rt->fib6_nh.fib_nh_gw6)) {
3907 		return -1;
3908 	}
3909 
3910 	/* Further clean up cached routes in exception table.
3911 	 * This is needed because cached route may have a different
3912 	 * gateway than its 'parent' in the case of an ip redirect.
3913 	 */
3914 	rt6_exceptions_clean_tohost(rt, gateway);
3915 
3916 	return 0;
3917 }
3918 
3919 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3920 {
3921 	fib6_clean_all(net, fib6_clean_tohost, gateway);
3922 }
3923 
3924 struct arg_netdev_event {
3925 	const struct net_device *dev;
3926 	union {
3927 		unsigned char nh_flags;
3928 		unsigned long event;
3929 	};
3930 };
3931 
3932 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
3933 {
3934 	struct fib6_info *iter;
3935 	struct fib6_node *fn;
3936 
3937 	fn = rcu_dereference_protected(rt->fib6_node,
3938 			lockdep_is_held(&rt->fib6_table->tb6_lock));
3939 	iter = rcu_dereference_protected(fn->leaf,
3940 			lockdep_is_held(&rt->fib6_table->tb6_lock));
3941 	while (iter) {
3942 		if (iter->fib6_metric == rt->fib6_metric &&
3943 		    rt6_qualify_for_ecmp(iter))
3944 			return iter;
3945 		iter = rcu_dereference_protected(iter->fib6_next,
3946 				lockdep_is_held(&rt->fib6_table->tb6_lock));
3947 	}
3948 
3949 	return NULL;
3950 }
3951 
3952 static bool rt6_is_dead(const struct fib6_info *rt)
3953 {
3954 	if (rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD ||
3955 	    (rt->fib6_nh.fib_nh_flags & RTNH_F_LINKDOWN &&
3956 	     ip6_ignore_linkdown(rt->fib6_nh.fib_nh_dev)))
3957 		return true;
3958 
3959 	return false;
3960 }
3961 
3962 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3963 {
3964 	struct fib6_info *iter;
3965 	int total = 0;
3966 
3967 	if (!rt6_is_dead(rt))
3968 		total += rt->fib6_nh.fib_nh_weight;
3969 
3970 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3971 		if (!rt6_is_dead(iter))
3972 			total += iter->fib6_nh.fib_nh_weight;
3973 	}
3974 
3975 	return total;
3976 }
3977 
3978 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
3979 {
3980 	int upper_bound = -1;
3981 
3982 	if (!rt6_is_dead(rt)) {
3983 		*weight += rt->fib6_nh.fib_nh_weight;
3984 		upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3985 						    total) - 1;
3986 	}
3987 	atomic_set(&rt->fib6_nh.fib_nh_upper_bound, upper_bound);
3988 }
3989 
3990 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3991 {
3992 	struct fib6_info *iter;
3993 	int weight = 0;
3994 
3995 	rt6_upper_bound_set(rt, &weight, total);
3996 
3997 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3998 		rt6_upper_bound_set(iter, &weight, total);
3999 }
4000 
4001 void rt6_multipath_rebalance(struct fib6_info *rt)
4002 {
4003 	struct fib6_info *first;
4004 	int total;
4005 
4006 	/* In case the entire multipath route was marked for flushing,
4007 	 * then there is no need to rebalance upon the removal of every
4008 	 * sibling route.
4009 	 */
4010 	if (!rt->fib6_nsiblings || rt->should_flush)
4011 		return;
4012 
4013 	/* During lookup routes are evaluated in order, so we need to
4014 	 * make sure upper bounds are assigned from the first sibling
4015 	 * onwards.
4016 	 */
4017 	first = rt6_multipath_first_sibling(rt);
4018 	if (WARN_ON_ONCE(!first))
4019 		return;
4020 
4021 	total = rt6_multipath_total_weight(first);
4022 	rt6_multipath_upper_bound_set(first, total);
4023 }
4024 
4025 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
4026 {
4027 	const struct arg_netdev_event *arg = p_arg;
4028 	struct net *net = dev_net(arg->dev);
4029 
4030 	if (rt != net->ipv6.fib6_null_entry &&
4031 	    rt->fib6_nh.fib_nh_dev == arg->dev) {
4032 		rt->fib6_nh.fib_nh_flags &= ~arg->nh_flags;
4033 		fib6_update_sernum_upto_root(net, rt);
4034 		rt6_multipath_rebalance(rt);
4035 	}
4036 
4037 	return 0;
4038 }
4039 
4040 void rt6_sync_up(struct net_device *dev, unsigned char nh_flags)
4041 {
4042 	struct arg_netdev_event arg = {
4043 		.dev = dev,
4044 		{
4045 			.nh_flags = nh_flags,
4046 		},
4047 	};
4048 
4049 	if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
4050 		arg.nh_flags |= RTNH_F_LINKDOWN;
4051 
4052 	fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
4053 }
4054 
4055 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
4056 				   const struct net_device *dev)
4057 {
4058 	struct fib6_info *iter;
4059 
4060 	if (rt->fib6_nh.fib_nh_dev == dev)
4061 		return true;
4062 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4063 		if (iter->fib6_nh.fib_nh_dev == dev)
4064 			return true;
4065 
4066 	return false;
4067 }
4068 
4069 static void rt6_multipath_flush(struct fib6_info *rt)
4070 {
4071 	struct fib6_info *iter;
4072 
4073 	rt->should_flush = 1;
4074 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4075 		iter->should_flush = 1;
4076 }
4077 
4078 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
4079 					     const struct net_device *down_dev)
4080 {
4081 	struct fib6_info *iter;
4082 	unsigned int dead = 0;
4083 
4084 	if (rt->fib6_nh.fib_nh_dev == down_dev ||
4085 	    rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
4086 		dead++;
4087 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4088 		if (iter->fib6_nh.fib_nh_dev == down_dev ||
4089 		    iter->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
4090 			dead++;
4091 
4092 	return dead;
4093 }
4094 
4095 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
4096 				       const struct net_device *dev,
4097 				       unsigned char nh_flags)
4098 {
4099 	struct fib6_info *iter;
4100 
4101 	if (rt->fib6_nh.fib_nh_dev == dev)
4102 		rt->fib6_nh.fib_nh_flags |= nh_flags;
4103 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4104 		if (iter->fib6_nh.fib_nh_dev == dev)
4105 			iter->fib6_nh.fib_nh_flags |= nh_flags;
4106 }
4107 
4108 /* called with write lock held for table with rt */
4109 static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
4110 {
4111 	const struct arg_netdev_event *arg = p_arg;
4112 	const struct net_device *dev = arg->dev;
4113 	struct net *net = dev_net(dev);
4114 
4115 	if (rt == net->ipv6.fib6_null_entry)
4116 		return 0;
4117 
4118 	switch (arg->event) {
4119 	case NETDEV_UNREGISTER:
4120 		return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0;
4121 	case NETDEV_DOWN:
4122 		if (rt->should_flush)
4123 			return -1;
4124 		if (!rt->fib6_nsiblings)
4125 			return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0;
4126 		if (rt6_multipath_uses_dev(rt, dev)) {
4127 			unsigned int count;
4128 
4129 			count = rt6_multipath_dead_count(rt, dev);
4130 			if (rt->fib6_nsiblings + 1 == count) {
4131 				rt6_multipath_flush(rt);
4132 				return -1;
4133 			}
4134 			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
4135 						   RTNH_F_LINKDOWN);
4136 			fib6_update_sernum(net, rt);
4137 			rt6_multipath_rebalance(rt);
4138 		}
4139 		return -2;
4140 	case NETDEV_CHANGE:
4141 		if (rt->fib6_nh.fib_nh_dev != dev ||
4142 		    rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
4143 			break;
4144 		rt->fib6_nh.fib_nh_flags |= RTNH_F_LINKDOWN;
4145 		rt6_multipath_rebalance(rt);
4146 		break;
4147 	}
4148 
4149 	return 0;
4150 }
4151 
4152 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
4153 {
4154 	struct arg_netdev_event arg = {
4155 		.dev = dev,
4156 		{
4157 			.event = event,
4158 		},
4159 	};
4160 	struct net *net = dev_net(dev);
4161 
4162 	if (net->ipv6.sysctl.skip_notify_on_dev_down)
4163 		fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
4164 	else
4165 		fib6_clean_all(net, fib6_ifdown, &arg);
4166 }
4167 
4168 void rt6_disable_ip(struct net_device *dev, unsigned long event)
4169 {
4170 	rt6_sync_down_dev(dev, event);
4171 	rt6_uncached_list_flush_dev(dev_net(dev), dev);
4172 	neigh_ifdown(&nd_tbl, dev);
4173 }
4174 
4175 struct rt6_mtu_change_arg {
4176 	struct net_device *dev;
4177 	unsigned int mtu;
4178 };
4179 
4180 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
4181 {
4182 	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
4183 	struct inet6_dev *idev;
4184 
4185 	/* In IPv6 pmtu discovery is not optional,
4186 	   so that RTAX_MTU lock cannot disable it.
4187 	   We still use this lock to block changes
4188 	   caused by addrconf/ndisc.
4189 	*/
4190 
4191 	idev = __in6_dev_get(arg->dev);
4192 	if (!idev)
4193 		return 0;
4194 
4195 	/* For administrative MTU increase, there is no way to discover
4196 	   IPv6 PMTU increase, so PMTU increase should be updated here.
4197 	   Since RFC 1981 doesn't include administrative MTU increase
4198 	   update PMTU increase is a MUST. (i.e. jumbo frame)
4199 	 */
4200 	if (rt->fib6_nh.fib_nh_dev == arg->dev &&
4201 	    !fib6_metric_locked(rt, RTAX_MTU)) {
4202 		u32 mtu = rt->fib6_pmtu;
4203 
4204 		if (mtu >= arg->mtu ||
4205 		    (mtu < arg->mtu && mtu == idev->cnf.mtu6))
4206 			fib6_metric_set(rt, RTAX_MTU, arg->mtu);
4207 
4208 		spin_lock_bh(&rt6_exception_lock);
4209 		rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
4210 		spin_unlock_bh(&rt6_exception_lock);
4211 	}
4212 	return 0;
4213 }
4214 
4215 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4216 {
4217 	struct rt6_mtu_change_arg arg = {
4218 		.dev = dev,
4219 		.mtu = mtu,
4220 	};
4221 
4222 	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
4223 }
4224 
4225 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
4226 	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
4227 	[RTA_PREFSRC]		= { .len = sizeof(struct in6_addr) },
4228 	[RTA_OIF]               = { .type = NLA_U32 },
4229 	[RTA_IIF]		= { .type = NLA_U32 },
4230 	[RTA_PRIORITY]          = { .type = NLA_U32 },
4231 	[RTA_METRICS]           = { .type = NLA_NESTED },
4232 	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
4233 	[RTA_PREF]              = { .type = NLA_U8 },
4234 	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
4235 	[RTA_ENCAP]		= { .type = NLA_NESTED },
4236 	[RTA_EXPIRES]		= { .type = NLA_U32 },
4237 	[RTA_UID]		= { .type = NLA_U32 },
4238 	[RTA_MARK]		= { .type = NLA_U32 },
4239 	[RTA_TABLE]		= { .type = NLA_U32 },
4240 	[RTA_IP_PROTO]		= { .type = NLA_U8 },
4241 	[RTA_SPORT]		= { .type = NLA_U16 },
4242 	[RTA_DPORT]		= { .type = NLA_U16 },
4243 };
4244 
4245 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
4246 			      struct fib6_config *cfg,
4247 			      struct netlink_ext_ack *extack)
4248 {
4249 	struct rtmsg *rtm;
4250 	struct nlattr *tb[RTA_MAX+1];
4251 	unsigned int pref;
4252 	int err;
4253 
4254 	err = nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
4255 				     rtm_ipv6_policy, extack);
4256 	if (err < 0)
4257 		goto errout;
4258 
4259 	err = -EINVAL;
4260 	rtm = nlmsg_data(nlh);
4261 
4262 	*cfg = (struct fib6_config){
4263 		.fc_table = rtm->rtm_table,
4264 		.fc_dst_len = rtm->rtm_dst_len,
4265 		.fc_src_len = rtm->rtm_src_len,
4266 		.fc_flags = RTF_UP,
4267 		.fc_protocol = rtm->rtm_protocol,
4268 		.fc_type = rtm->rtm_type,
4269 
4270 		.fc_nlinfo.portid = NETLINK_CB(skb).portid,
4271 		.fc_nlinfo.nlh = nlh,
4272 		.fc_nlinfo.nl_net = sock_net(skb->sk),
4273 	};
4274 
4275 	if (rtm->rtm_type == RTN_UNREACHABLE ||
4276 	    rtm->rtm_type == RTN_BLACKHOLE ||
4277 	    rtm->rtm_type == RTN_PROHIBIT ||
4278 	    rtm->rtm_type == RTN_THROW)
4279 		cfg->fc_flags |= RTF_REJECT;
4280 
4281 	if (rtm->rtm_type == RTN_LOCAL)
4282 		cfg->fc_flags |= RTF_LOCAL;
4283 
4284 	if (rtm->rtm_flags & RTM_F_CLONED)
4285 		cfg->fc_flags |= RTF_CACHE;
4286 
4287 	cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4288 
4289 	if (tb[RTA_GATEWAY]) {
4290 		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4291 		cfg->fc_flags |= RTF_GATEWAY;
4292 	}
4293 	if (tb[RTA_VIA]) {
4294 		NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute");
4295 		goto errout;
4296 	}
4297 
4298 	if (tb[RTA_DST]) {
4299 		int plen = (rtm->rtm_dst_len + 7) >> 3;
4300 
4301 		if (nla_len(tb[RTA_DST]) < plen)
4302 			goto errout;
4303 
4304 		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
4305 	}
4306 
4307 	if (tb[RTA_SRC]) {
4308 		int plen = (rtm->rtm_src_len + 7) >> 3;
4309 
4310 		if (nla_len(tb[RTA_SRC]) < plen)
4311 			goto errout;
4312 
4313 		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
4314 	}
4315 
4316 	if (tb[RTA_PREFSRC])
4317 		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4318 
4319 	if (tb[RTA_OIF])
4320 		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4321 
4322 	if (tb[RTA_PRIORITY])
4323 		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4324 
4325 	if (tb[RTA_METRICS]) {
4326 		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4327 		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
4328 	}
4329 
4330 	if (tb[RTA_TABLE])
4331 		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4332 
4333 	if (tb[RTA_MULTIPATH]) {
4334 		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4335 		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4336 
4337 		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4338 						     cfg->fc_mp_len, extack);
4339 		if (err < 0)
4340 			goto errout;
4341 	}
4342 
4343 	if (tb[RTA_PREF]) {
4344 		pref = nla_get_u8(tb[RTA_PREF]);
4345 		if (pref != ICMPV6_ROUTER_PREF_LOW &&
4346 		    pref != ICMPV6_ROUTER_PREF_HIGH)
4347 			pref = ICMPV6_ROUTER_PREF_MEDIUM;
4348 		cfg->fc_flags |= RTF_PREF(pref);
4349 	}
4350 
4351 	if (tb[RTA_ENCAP])
4352 		cfg->fc_encap = tb[RTA_ENCAP];
4353 
4354 	if (tb[RTA_ENCAP_TYPE]) {
4355 		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4356 
4357 		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4358 		if (err < 0)
4359 			goto errout;
4360 	}
4361 
4362 	if (tb[RTA_EXPIRES]) {
4363 		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4364 
4365 		if (addrconf_finite_timeout(timeout)) {
4366 			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4367 			cfg->fc_flags |= RTF_EXPIRES;
4368 		}
4369 	}
4370 
4371 	err = 0;
4372 errout:
4373 	return err;
4374 }
4375 
4376 struct rt6_nh {
4377 	struct fib6_info *fib6_info;
4378 	struct fib6_config r_cfg;
4379 	struct list_head next;
4380 };
4381 
4382 static int ip6_route_info_append(struct net *net,
4383 				 struct list_head *rt6_nh_list,
4384 				 struct fib6_info *rt,
4385 				 struct fib6_config *r_cfg)
4386 {
4387 	struct rt6_nh *nh;
4388 	int err = -EEXIST;
4389 
4390 	list_for_each_entry(nh, rt6_nh_list, next) {
4391 		/* check if fib6_info already exists */
4392 		if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4393 			return err;
4394 	}
4395 
4396 	nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4397 	if (!nh)
4398 		return -ENOMEM;
4399 	nh->fib6_info = rt;
4400 	memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4401 	list_add_tail(&nh->next, rt6_nh_list);
4402 
4403 	return 0;
4404 }
4405 
4406 static void ip6_route_mpath_notify(struct fib6_info *rt,
4407 				   struct fib6_info *rt_last,
4408 				   struct nl_info *info,
4409 				   __u16 nlflags)
4410 {
4411 	/* if this is an APPEND route, then rt points to the first route
4412 	 * inserted and rt_last points to last route inserted. Userspace
4413 	 * wants a consistent dump of the route which starts at the first
4414 	 * nexthop. Since sibling routes are always added at the end of
4415 	 * the list, find the first sibling of the last route appended
4416 	 */
4417 	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4418 		rt = list_first_entry(&rt_last->fib6_siblings,
4419 				      struct fib6_info,
4420 				      fib6_siblings);
4421 	}
4422 
4423 	if (rt)
4424 		inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4425 }
4426 
4427 static int ip6_route_multipath_add(struct fib6_config *cfg,
4428 				   struct netlink_ext_ack *extack)
4429 {
4430 	struct fib6_info *rt_notif = NULL, *rt_last = NULL;
4431 	struct nl_info *info = &cfg->fc_nlinfo;
4432 	struct fib6_config r_cfg;
4433 	struct rtnexthop *rtnh;
4434 	struct fib6_info *rt;
4435 	struct rt6_nh *err_nh;
4436 	struct rt6_nh *nh, *nh_safe;
4437 	__u16 nlflags;
4438 	int remaining;
4439 	int attrlen;
4440 	int err = 1;
4441 	int nhn = 0;
4442 	int replace = (cfg->fc_nlinfo.nlh &&
4443 		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4444 	LIST_HEAD(rt6_nh_list);
4445 
4446 	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4447 	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4448 		nlflags |= NLM_F_APPEND;
4449 
4450 	remaining = cfg->fc_mp_len;
4451 	rtnh = (struct rtnexthop *)cfg->fc_mp;
4452 
4453 	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
4454 	 * fib6_info structs per nexthop
4455 	 */
4456 	while (rtnh_ok(rtnh, remaining)) {
4457 		memcpy(&r_cfg, cfg, sizeof(*cfg));
4458 		if (rtnh->rtnh_ifindex)
4459 			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4460 
4461 		attrlen = rtnh_attrlen(rtnh);
4462 		if (attrlen > 0) {
4463 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4464 
4465 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4466 			if (nla) {
4467 				r_cfg.fc_gateway = nla_get_in6_addr(nla);
4468 				r_cfg.fc_flags |= RTF_GATEWAY;
4469 			}
4470 			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4471 			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4472 			if (nla)
4473 				r_cfg.fc_encap_type = nla_get_u16(nla);
4474 		}
4475 
4476 		r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4477 		rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
4478 		if (IS_ERR(rt)) {
4479 			err = PTR_ERR(rt);
4480 			rt = NULL;
4481 			goto cleanup;
4482 		}
4483 		if (!rt6_qualify_for_ecmp(rt)) {
4484 			err = -EINVAL;
4485 			NL_SET_ERR_MSG(extack,
4486 				       "Device only routes can not be added for IPv6 using the multipath API.");
4487 			fib6_info_release(rt);
4488 			goto cleanup;
4489 		}
4490 
4491 		rt->fib6_nh.fib_nh_weight = rtnh->rtnh_hops + 1;
4492 
4493 		err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
4494 					    rt, &r_cfg);
4495 		if (err) {
4496 			fib6_info_release(rt);
4497 			goto cleanup;
4498 		}
4499 
4500 		rtnh = rtnh_next(rtnh, &remaining);
4501 	}
4502 
4503 	/* for add and replace send one notification with all nexthops.
4504 	 * Skip the notification in fib6_add_rt2node and send one with
4505 	 * the full route when done
4506 	 */
4507 	info->skip_notify = 1;
4508 
4509 	err_nh = NULL;
4510 	list_for_each_entry(nh, &rt6_nh_list, next) {
4511 		err = __ip6_ins_rt(nh->fib6_info, info, extack);
4512 		fib6_info_release(nh->fib6_info);
4513 
4514 		if (!err) {
4515 			/* save reference to last route successfully inserted */
4516 			rt_last = nh->fib6_info;
4517 
4518 			/* save reference to first route for notification */
4519 			if (!rt_notif)
4520 				rt_notif = nh->fib6_info;
4521 		}
4522 
4523 		/* nh->fib6_info is used or freed at this point, reset to NULL*/
4524 		nh->fib6_info = NULL;
4525 		if (err) {
4526 			if (replace && nhn)
4527 				NL_SET_ERR_MSG_MOD(extack,
4528 						   "multipath route replace failed (check consistency of installed routes)");
4529 			err_nh = nh;
4530 			goto add_errout;
4531 		}
4532 
4533 		/* Because each route is added like a single route we remove
4534 		 * these flags after the first nexthop: if there is a collision,
4535 		 * we have already failed to add the first nexthop:
4536 		 * fib6_add_rt2node() has rejected it; when replacing, old
4537 		 * nexthops have been replaced by first new, the rest should
4538 		 * be added to it.
4539 		 */
4540 		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4541 						     NLM_F_REPLACE);
4542 		nhn++;
4543 	}
4544 
4545 	/* success ... tell user about new route */
4546 	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4547 	goto cleanup;
4548 
4549 add_errout:
4550 	/* send notification for routes that were added so that
4551 	 * the delete notifications sent by ip6_route_del are
4552 	 * coherent
4553 	 */
4554 	if (rt_notif)
4555 		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4556 
4557 	/* Delete routes that were already added */
4558 	list_for_each_entry(nh, &rt6_nh_list, next) {
4559 		if (err_nh == nh)
4560 			break;
4561 		ip6_route_del(&nh->r_cfg, extack);
4562 	}
4563 
4564 cleanup:
4565 	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4566 		if (nh->fib6_info)
4567 			fib6_info_release(nh->fib6_info);
4568 		list_del(&nh->next);
4569 		kfree(nh);
4570 	}
4571 
4572 	return err;
4573 }
4574 
4575 static int ip6_route_multipath_del(struct fib6_config *cfg,
4576 				   struct netlink_ext_ack *extack)
4577 {
4578 	struct fib6_config r_cfg;
4579 	struct rtnexthop *rtnh;
4580 	int remaining;
4581 	int attrlen;
4582 	int err = 1, last_err = 0;
4583 
4584 	remaining = cfg->fc_mp_len;
4585 	rtnh = (struct rtnexthop *)cfg->fc_mp;
4586 
4587 	/* Parse a Multipath Entry */
4588 	while (rtnh_ok(rtnh, remaining)) {
4589 		memcpy(&r_cfg, cfg, sizeof(*cfg));
4590 		if (rtnh->rtnh_ifindex)
4591 			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4592 
4593 		attrlen = rtnh_attrlen(rtnh);
4594 		if (attrlen > 0) {
4595 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4596 
4597 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4598 			if (nla) {
4599 				nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4600 				r_cfg.fc_flags |= RTF_GATEWAY;
4601 			}
4602 		}
4603 		err = ip6_route_del(&r_cfg, extack);
4604 		if (err)
4605 			last_err = err;
4606 
4607 		rtnh = rtnh_next(rtnh, &remaining);
4608 	}
4609 
4610 	return last_err;
4611 }
4612 
4613 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4614 			      struct netlink_ext_ack *extack)
4615 {
4616 	struct fib6_config cfg;
4617 	int err;
4618 
4619 	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4620 	if (err < 0)
4621 		return err;
4622 
4623 	if (cfg.fc_mp)
4624 		return ip6_route_multipath_del(&cfg, extack);
4625 	else {
4626 		cfg.fc_delete_all_nh = 1;
4627 		return ip6_route_del(&cfg, extack);
4628 	}
4629 }
4630 
4631 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4632 			      struct netlink_ext_ack *extack)
4633 {
4634 	struct fib6_config cfg;
4635 	int err;
4636 
4637 	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4638 	if (err < 0)
4639 		return err;
4640 
4641 	if (cfg.fc_metric == 0)
4642 		cfg.fc_metric = IP6_RT_PRIO_USER;
4643 
4644 	if (cfg.fc_mp)
4645 		return ip6_route_multipath_add(&cfg, extack);
4646 	else
4647 		return ip6_route_add(&cfg, GFP_KERNEL, extack);
4648 }
4649 
4650 static size_t rt6_nlmsg_size(struct fib6_info *rt)
4651 {
4652 	int nexthop_len = 0;
4653 
4654 	if (rt->fib6_nsiblings) {
4655 		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
4656 			    + NLA_ALIGN(sizeof(struct rtnexthop))
4657 			    + nla_total_size(16) /* RTA_GATEWAY */
4658 			    + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws);
4659 
4660 		nexthop_len *= rt->fib6_nsiblings;
4661 	}
4662 
4663 	return NLMSG_ALIGN(sizeof(struct rtmsg))
4664 	       + nla_total_size(16) /* RTA_SRC */
4665 	       + nla_total_size(16) /* RTA_DST */
4666 	       + nla_total_size(16) /* RTA_GATEWAY */
4667 	       + nla_total_size(16) /* RTA_PREFSRC */
4668 	       + nla_total_size(4) /* RTA_TABLE */
4669 	       + nla_total_size(4) /* RTA_IIF */
4670 	       + nla_total_size(4) /* RTA_OIF */
4671 	       + nla_total_size(4) /* RTA_PRIORITY */
4672 	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4673 	       + nla_total_size(sizeof(struct rta_cacheinfo))
4674 	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4675 	       + nla_total_size(1) /* RTA_PREF */
4676 	       + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws)
4677 	       + nexthop_len;
4678 }
4679 
4680 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
4681 			 struct fib6_info *rt, struct dst_entry *dst,
4682 			 struct in6_addr *dest, struct in6_addr *src,
4683 			 int iif, int type, u32 portid, u32 seq,
4684 			 unsigned int flags)
4685 {
4686 	struct rt6_info *rt6 = (struct rt6_info *)dst;
4687 	struct rt6key *rt6_dst, *rt6_src;
4688 	u32 *pmetrics, table, rt6_flags;
4689 	struct nlmsghdr *nlh;
4690 	struct rtmsg *rtm;
4691 	long expires = 0;
4692 
4693 	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4694 	if (!nlh)
4695 		return -EMSGSIZE;
4696 
4697 	if (rt6) {
4698 		rt6_dst = &rt6->rt6i_dst;
4699 		rt6_src = &rt6->rt6i_src;
4700 		rt6_flags = rt6->rt6i_flags;
4701 	} else {
4702 		rt6_dst = &rt->fib6_dst;
4703 		rt6_src = &rt->fib6_src;
4704 		rt6_flags = rt->fib6_flags;
4705 	}
4706 
4707 	rtm = nlmsg_data(nlh);
4708 	rtm->rtm_family = AF_INET6;
4709 	rtm->rtm_dst_len = rt6_dst->plen;
4710 	rtm->rtm_src_len = rt6_src->plen;
4711 	rtm->rtm_tos = 0;
4712 	if (rt->fib6_table)
4713 		table = rt->fib6_table->tb6_id;
4714 	else
4715 		table = RT6_TABLE_UNSPEC;
4716 	rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT;
4717 	if (nla_put_u32(skb, RTA_TABLE, table))
4718 		goto nla_put_failure;
4719 
4720 	rtm->rtm_type = rt->fib6_type;
4721 	rtm->rtm_flags = 0;
4722 	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4723 	rtm->rtm_protocol = rt->fib6_protocol;
4724 
4725 	if (rt6_flags & RTF_CACHE)
4726 		rtm->rtm_flags |= RTM_F_CLONED;
4727 
4728 	if (dest) {
4729 		if (nla_put_in6_addr(skb, RTA_DST, dest))
4730 			goto nla_put_failure;
4731 		rtm->rtm_dst_len = 128;
4732 	} else if (rtm->rtm_dst_len)
4733 		if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
4734 			goto nla_put_failure;
4735 #ifdef CONFIG_IPV6_SUBTREES
4736 	if (src) {
4737 		if (nla_put_in6_addr(skb, RTA_SRC, src))
4738 			goto nla_put_failure;
4739 		rtm->rtm_src_len = 128;
4740 	} else if (rtm->rtm_src_len &&
4741 		   nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
4742 		goto nla_put_failure;
4743 #endif
4744 	if (iif) {
4745 #ifdef CONFIG_IPV6_MROUTE
4746 		if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
4747 			int err = ip6mr_get_route(net, skb, rtm, portid);
4748 
4749 			if (err == 0)
4750 				return 0;
4751 			if (err < 0)
4752 				goto nla_put_failure;
4753 		} else
4754 #endif
4755 			if (nla_put_u32(skb, RTA_IIF, iif))
4756 				goto nla_put_failure;
4757 	} else if (dest) {
4758 		struct in6_addr saddr_buf;
4759 		if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
4760 		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4761 			goto nla_put_failure;
4762 	}
4763 
4764 	if (rt->fib6_prefsrc.plen) {
4765 		struct in6_addr saddr_buf;
4766 		saddr_buf = rt->fib6_prefsrc.addr;
4767 		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4768 			goto nla_put_failure;
4769 	}
4770 
4771 	pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
4772 	if (rtnetlink_put_metrics(skb, pmetrics) < 0)
4773 		goto nla_put_failure;
4774 
4775 	if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
4776 		goto nla_put_failure;
4777 
4778 	/* For multipath routes, walk the siblings list and add
4779 	 * each as a nexthop within RTA_MULTIPATH.
4780 	 */
4781 	if (rt6) {
4782 		if (rt6_flags & RTF_GATEWAY &&
4783 		    nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
4784 			goto nla_put_failure;
4785 
4786 		if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
4787 			goto nla_put_failure;
4788 	} else if (rt->fib6_nsiblings) {
4789 		struct fib6_info *sibling, *next_sibling;
4790 		struct nlattr *mp;
4791 
4792 		mp = nla_nest_start_noflag(skb, RTA_MULTIPATH);
4793 		if (!mp)
4794 			goto nla_put_failure;
4795 
4796 		if (fib_add_nexthop(skb, &rt->fib6_nh.nh_common,
4797 				    rt->fib6_nh.fib_nh_weight) < 0)
4798 			goto nla_put_failure;
4799 
4800 		list_for_each_entry_safe(sibling, next_sibling,
4801 					 &rt->fib6_siblings, fib6_siblings) {
4802 			if (fib_add_nexthop(skb, &sibling->fib6_nh.nh_common,
4803 					    sibling->fib6_nh.fib_nh_weight) < 0)
4804 				goto nla_put_failure;
4805 		}
4806 
4807 		nla_nest_end(skb, mp);
4808 	} else {
4809 		unsigned char nh_flags = 0;
4810 
4811 		if (fib_nexthop_info(skb, &rt->fib6_nh.nh_common,
4812 				     &nh_flags, false) < 0)
4813 			goto nla_put_failure;
4814 
4815 		rtm->rtm_flags |= nh_flags;
4816 	}
4817 
4818 	if (rt6_flags & RTF_EXPIRES) {
4819 		expires = dst ? dst->expires : rt->expires;
4820 		expires -= jiffies;
4821 	}
4822 
4823 	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
4824 		goto nla_put_failure;
4825 
4826 	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
4827 		goto nla_put_failure;
4828 
4829 
4830 	nlmsg_end(skb, nlh);
4831 	return 0;
4832 
4833 nla_put_failure:
4834 	nlmsg_cancel(skb, nlh);
4835 	return -EMSGSIZE;
4836 }
4837 
4838 static bool fib6_info_uses_dev(const struct fib6_info *f6i,
4839 			       const struct net_device *dev)
4840 {
4841 	if (f6i->fib6_nh.fib_nh_dev == dev)
4842 		return true;
4843 
4844 	if (f6i->fib6_nsiblings) {
4845 		struct fib6_info *sibling, *next_sibling;
4846 
4847 		list_for_each_entry_safe(sibling, next_sibling,
4848 					 &f6i->fib6_siblings, fib6_siblings) {
4849 			if (sibling->fib6_nh.fib_nh_dev == dev)
4850 				return true;
4851 		}
4852 	}
4853 
4854 	return false;
4855 }
4856 
4857 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4858 {
4859 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4860 	struct fib_dump_filter *filter = &arg->filter;
4861 	unsigned int flags = NLM_F_MULTI;
4862 	struct net *net = arg->net;
4863 
4864 	if (rt == net->ipv6.fib6_null_entry)
4865 		return 0;
4866 
4867 	if ((filter->flags & RTM_F_PREFIX) &&
4868 	    !(rt->fib6_flags & RTF_PREFIX_RT)) {
4869 		/* success since this is not a prefix route */
4870 		return 1;
4871 	}
4872 	if (filter->filter_set) {
4873 		if ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
4874 		    (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
4875 		    (filter->protocol && rt->fib6_protocol != filter->protocol)) {
4876 			return 1;
4877 		}
4878 		flags |= NLM_F_DUMP_FILTERED;
4879 	}
4880 
4881 	return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4882 			     RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4883 			     arg->cb->nlh->nlmsg_seq, flags);
4884 }
4885 
4886 static int inet6_rtm_valid_getroute_req(struct sk_buff *skb,
4887 					const struct nlmsghdr *nlh,
4888 					struct nlattr **tb,
4889 					struct netlink_ext_ack *extack)
4890 {
4891 	struct rtmsg *rtm;
4892 	int i, err;
4893 
4894 	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
4895 		NL_SET_ERR_MSG_MOD(extack,
4896 				   "Invalid header for get route request");
4897 		return -EINVAL;
4898 	}
4899 
4900 	if (!netlink_strict_get_check(skb))
4901 		return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
4902 					      rtm_ipv6_policy, extack);
4903 
4904 	rtm = nlmsg_data(nlh);
4905 	if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) ||
4906 	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) ||
4907 	    rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope ||
4908 	    rtm->rtm_type) {
4909 		NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request");
4910 		return -EINVAL;
4911 	}
4912 	if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) {
4913 		NL_SET_ERR_MSG_MOD(extack,
4914 				   "Invalid flags for get route request");
4915 		return -EINVAL;
4916 	}
4917 
4918 	err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
4919 					    rtm_ipv6_policy, extack);
4920 	if (err)
4921 		return err;
4922 
4923 	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
4924 	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
4925 		NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6");
4926 		return -EINVAL;
4927 	}
4928 
4929 	for (i = 0; i <= RTA_MAX; i++) {
4930 		if (!tb[i])
4931 			continue;
4932 
4933 		switch (i) {
4934 		case RTA_SRC:
4935 		case RTA_DST:
4936 		case RTA_IIF:
4937 		case RTA_OIF:
4938 		case RTA_MARK:
4939 		case RTA_UID:
4940 		case RTA_SPORT:
4941 		case RTA_DPORT:
4942 		case RTA_IP_PROTO:
4943 			break;
4944 		default:
4945 			NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request");
4946 			return -EINVAL;
4947 		}
4948 	}
4949 
4950 	return 0;
4951 }
4952 
4953 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4954 			      struct netlink_ext_ack *extack)
4955 {
4956 	struct net *net = sock_net(in_skb->sk);
4957 	struct nlattr *tb[RTA_MAX+1];
4958 	int err, iif = 0, oif = 0;
4959 	struct fib6_info *from;
4960 	struct dst_entry *dst;
4961 	struct rt6_info *rt;
4962 	struct sk_buff *skb;
4963 	struct rtmsg *rtm;
4964 	struct flowi6 fl6 = {};
4965 	bool fibmatch;
4966 
4967 	err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
4968 	if (err < 0)
4969 		goto errout;
4970 
4971 	err = -EINVAL;
4972 	rtm = nlmsg_data(nlh);
4973 	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4974 	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4975 
4976 	if (tb[RTA_SRC]) {
4977 		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4978 			goto errout;
4979 
4980 		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4981 	}
4982 
4983 	if (tb[RTA_DST]) {
4984 		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4985 			goto errout;
4986 
4987 		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4988 	}
4989 
4990 	if (tb[RTA_IIF])
4991 		iif = nla_get_u32(tb[RTA_IIF]);
4992 
4993 	if (tb[RTA_OIF])
4994 		oif = nla_get_u32(tb[RTA_OIF]);
4995 
4996 	if (tb[RTA_MARK])
4997 		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4998 
4999 	if (tb[RTA_UID])
5000 		fl6.flowi6_uid = make_kuid(current_user_ns(),
5001 					   nla_get_u32(tb[RTA_UID]));
5002 	else
5003 		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
5004 
5005 	if (tb[RTA_SPORT])
5006 		fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);
5007 
5008 	if (tb[RTA_DPORT])
5009 		fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);
5010 
5011 	if (tb[RTA_IP_PROTO]) {
5012 		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
5013 						  &fl6.flowi6_proto, AF_INET6,
5014 						  extack);
5015 		if (err)
5016 			goto errout;
5017 	}
5018 
5019 	if (iif) {
5020 		struct net_device *dev;
5021 		int flags = 0;
5022 
5023 		rcu_read_lock();
5024 
5025 		dev = dev_get_by_index_rcu(net, iif);
5026 		if (!dev) {
5027 			rcu_read_unlock();
5028 			err = -ENODEV;
5029 			goto errout;
5030 		}
5031 
5032 		fl6.flowi6_iif = iif;
5033 
5034 		if (!ipv6_addr_any(&fl6.saddr))
5035 			flags |= RT6_LOOKUP_F_HAS_SADDR;
5036 
5037 		dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
5038 
5039 		rcu_read_unlock();
5040 	} else {
5041 		fl6.flowi6_oif = oif;
5042 
5043 		dst = ip6_route_output(net, NULL, &fl6);
5044 	}
5045 
5046 
5047 	rt = container_of(dst, struct rt6_info, dst);
5048 	if (rt->dst.error) {
5049 		err = rt->dst.error;
5050 		ip6_rt_put(rt);
5051 		goto errout;
5052 	}
5053 
5054 	if (rt == net->ipv6.ip6_null_entry) {
5055 		err = rt->dst.error;
5056 		ip6_rt_put(rt);
5057 		goto errout;
5058 	}
5059 
5060 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
5061 	if (!skb) {
5062 		ip6_rt_put(rt);
5063 		err = -ENOBUFS;
5064 		goto errout;
5065 	}
5066 
5067 	skb_dst_set(skb, &rt->dst);
5068 
5069 	rcu_read_lock();
5070 	from = rcu_dereference(rt->from);
5071 	if (from) {
5072 		if (fibmatch)
5073 			err = rt6_fill_node(net, skb, from, NULL, NULL, NULL,
5074 					    iif, RTM_NEWROUTE,
5075 					    NETLINK_CB(in_skb).portid,
5076 					    nlh->nlmsg_seq, 0);
5077 		else
5078 			err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
5079 					    &fl6.saddr, iif, RTM_NEWROUTE,
5080 					    NETLINK_CB(in_skb).portid,
5081 					    nlh->nlmsg_seq, 0);
5082 	} else {
5083 		err = -ENETUNREACH;
5084 	}
5085 	rcu_read_unlock();
5086 
5087 	if (err < 0) {
5088 		kfree_skb(skb);
5089 		goto errout;
5090 	}
5091 
5092 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
5093 errout:
5094 	return err;
5095 }
5096 
5097 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
5098 		     unsigned int nlm_flags)
5099 {
5100 	struct sk_buff *skb;
5101 	struct net *net = info->nl_net;
5102 	u32 seq;
5103 	int err;
5104 
5105 	err = -ENOBUFS;
5106 	seq = info->nlh ? info->nlh->nlmsg_seq : 0;
5107 
5108 	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
5109 	if (!skb)
5110 		goto errout;
5111 
5112 	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
5113 			    event, info->portid, seq, nlm_flags);
5114 	if (err < 0) {
5115 		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
5116 		WARN_ON(err == -EMSGSIZE);
5117 		kfree_skb(skb);
5118 		goto errout;
5119 	}
5120 	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
5121 		    info->nlh, gfp_any());
5122 	return;
5123 errout:
5124 	if (err < 0)
5125 		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
5126 }
5127 
5128 static int ip6_route_dev_notify(struct notifier_block *this,
5129 				unsigned long event, void *ptr)
5130 {
5131 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
5132 	struct net *net = dev_net(dev);
5133 
5134 	if (!(dev->flags & IFF_LOOPBACK))
5135 		return NOTIFY_OK;
5136 
5137 	if (event == NETDEV_REGISTER) {
5138 		net->ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = dev;
5139 		net->ipv6.ip6_null_entry->dst.dev = dev;
5140 		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
5141 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5142 		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
5143 		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
5144 		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
5145 		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
5146 #endif
5147 	 } else if (event == NETDEV_UNREGISTER &&
5148 		    dev->reg_state != NETREG_UNREGISTERED) {
5149 		/* NETDEV_UNREGISTER could be fired for multiple times by
5150 		 * netdev_wait_allrefs(). Make sure we only call this once.
5151 		 */
5152 		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
5153 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5154 		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
5155 		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
5156 #endif
5157 	}
5158 
5159 	return NOTIFY_OK;
5160 }
5161 
5162 /*
5163  *	/proc
5164  */
5165 
5166 #ifdef CONFIG_PROC_FS
5167 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
5168 {
5169 	struct net *net = (struct net *)seq->private;
5170 	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
5171 		   net->ipv6.rt6_stats->fib_nodes,
5172 		   net->ipv6.rt6_stats->fib_route_nodes,
5173 		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
5174 		   net->ipv6.rt6_stats->fib_rt_entries,
5175 		   net->ipv6.rt6_stats->fib_rt_cache,
5176 		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
5177 		   net->ipv6.rt6_stats->fib_discarded_routes);
5178 
5179 	return 0;
5180 }
5181 #endif	/* CONFIG_PROC_FS */
5182 
5183 #ifdef CONFIG_SYSCTL
5184 
5185 static
5186 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
5187 			      void __user *buffer, size_t *lenp, loff_t *ppos)
5188 {
5189 	struct net *net;
5190 	int delay;
5191 	int ret;
5192 	if (!write)
5193 		return -EINVAL;
5194 
5195 	net = (struct net *)ctl->extra1;
5196 	delay = net->ipv6.sysctl.flush_delay;
5197 	ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
5198 	if (ret)
5199 		return ret;
5200 
5201 	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
5202 	return 0;
5203 }
5204 
5205 static int zero;
5206 static int one = 1;
5207 
5208 static struct ctl_table ipv6_route_table_template[] = {
5209 	{
5210 		.procname	=	"flush",
5211 		.data		=	&init_net.ipv6.sysctl.flush_delay,
5212 		.maxlen		=	sizeof(int),
5213 		.mode		=	0200,
5214 		.proc_handler	=	ipv6_sysctl_rtcache_flush
5215 	},
5216 	{
5217 		.procname	=	"gc_thresh",
5218 		.data		=	&ip6_dst_ops_template.gc_thresh,
5219 		.maxlen		=	sizeof(int),
5220 		.mode		=	0644,
5221 		.proc_handler	=	proc_dointvec,
5222 	},
5223 	{
5224 		.procname	=	"max_size",
5225 		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
5226 		.maxlen		=	sizeof(int),
5227 		.mode		=	0644,
5228 		.proc_handler	=	proc_dointvec,
5229 	},
5230 	{
5231 		.procname	=	"gc_min_interval",
5232 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5233 		.maxlen		=	sizeof(int),
5234 		.mode		=	0644,
5235 		.proc_handler	=	proc_dointvec_jiffies,
5236 	},
5237 	{
5238 		.procname	=	"gc_timeout",
5239 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
5240 		.maxlen		=	sizeof(int),
5241 		.mode		=	0644,
5242 		.proc_handler	=	proc_dointvec_jiffies,
5243 	},
5244 	{
5245 		.procname	=	"gc_interval",
5246 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
5247 		.maxlen		=	sizeof(int),
5248 		.mode		=	0644,
5249 		.proc_handler	=	proc_dointvec_jiffies,
5250 	},
5251 	{
5252 		.procname	=	"gc_elasticity",
5253 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
5254 		.maxlen		=	sizeof(int),
5255 		.mode		=	0644,
5256 		.proc_handler	=	proc_dointvec,
5257 	},
5258 	{
5259 		.procname	=	"mtu_expires",
5260 		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
5261 		.maxlen		=	sizeof(int),
5262 		.mode		=	0644,
5263 		.proc_handler	=	proc_dointvec_jiffies,
5264 	},
5265 	{
5266 		.procname	=	"min_adv_mss",
5267 		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
5268 		.maxlen		=	sizeof(int),
5269 		.mode		=	0644,
5270 		.proc_handler	=	proc_dointvec,
5271 	},
5272 	{
5273 		.procname	=	"gc_min_interval_ms",
5274 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
5275 		.maxlen		=	sizeof(int),
5276 		.mode		=	0644,
5277 		.proc_handler	=	proc_dointvec_ms_jiffies,
5278 	},
5279 	{
5280 		.procname	=	"skip_notify_on_dev_down",
5281 		.data		=	&init_net.ipv6.sysctl.skip_notify_on_dev_down,
5282 		.maxlen		=	sizeof(int),
5283 		.mode		=	0644,
5284 		.proc_handler	=	proc_dointvec,
5285 		.extra1		=	&zero,
5286 		.extra2		=	&one,
5287 	},
5288 	{ }
5289 };
5290 
5291 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
5292 {
5293 	struct ctl_table *table;
5294 
5295 	table = kmemdup(ipv6_route_table_template,
5296 			sizeof(ipv6_route_table_template),
5297 			GFP_KERNEL);
5298 
5299 	if (table) {
5300 		table[0].data = &net->ipv6.sysctl.flush_delay;
5301 		table[0].extra1 = net;
5302 		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
5303 		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
5304 		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5305 		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
5306 		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
5307 		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
5308 		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
5309 		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
5310 		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5311 		table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;
5312 
5313 		/* Don't export sysctls to unprivileged users */
5314 		if (net->user_ns != &init_user_ns)
5315 			table[0].procname = NULL;
5316 	}
5317 
5318 	return table;
5319 }
5320 #endif
5321 
5322 static int __net_init ip6_route_net_init(struct net *net)
5323 {
5324 	int ret = -ENOMEM;
5325 
5326 	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
5327 	       sizeof(net->ipv6.ip6_dst_ops));
5328 
5329 	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
5330 		goto out_ip6_dst_ops;
5331 
5332 	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
5333 					    sizeof(*net->ipv6.fib6_null_entry),
5334 					    GFP_KERNEL);
5335 	if (!net->ipv6.fib6_null_entry)
5336 		goto out_ip6_dst_entries;
5337 
5338 	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
5339 					   sizeof(*net->ipv6.ip6_null_entry),
5340 					   GFP_KERNEL);
5341 	if (!net->ipv6.ip6_null_entry)
5342 		goto out_fib6_null_entry;
5343 	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5344 	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
5345 			 ip6_template_metrics, true);
5346 
5347 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5348 	net->ipv6.fib6_has_custom_rules = false;
5349 	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
5350 					       sizeof(*net->ipv6.ip6_prohibit_entry),
5351 					       GFP_KERNEL);
5352 	if (!net->ipv6.ip6_prohibit_entry)
5353 		goto out_ip6_null_entry;
5354 	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5355 	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
5356 			 ip6_template_metrics, true);
5357 
5358 	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
5359 					       sizeof(*net->ipv6.ip6_blk_hole_entry),
5360 					       GFP_KERNEL);
5361 	if (!net->ipv6.ip6_blk_hole_entry)
5362 		goto out_ip6_prohibit_entry;
5363 	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5364 	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
5365 			 ip6_template_metrics, true);
5366 #endif
5367 
5368 	net->ipv6.sysctl.flush_delay = 0;
5369 	net->ipv6.sysctl.ip6_rt_max_size = 4096;
5370 	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
5371 	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
5372 	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
5373 	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
5374 	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
5375 	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
5376 	net->ipv6.sysctl.skip_notify_on_dev_down = 0;
5377 
5378 	net->ipv6.ip6_rt_gc_expire = 30*HZ;
5379 
5380 	ret = 0;
5381 out:
5382 	return ret;
5383 
5384 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5385 out_ip6_prohibit_entry:
5386 	kfree(net->ipv6.ip6_prohibit_entry);
5387 out_ip6_null_entry:
5388 	kfree(net->ipv6.ip6_null_entry);
5389 #endif
5390 out_fib6_null_entry:
5391 	kfree(net->ipv6.fib6_null_entry);
5392 out_ip6_dst_entries:
5393 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5394 out_ip6_dst_ops:
5395 	goto out;
5396 }
5397 
5398 static void __net_exit ip6_route_net_exit(struct net *net)
5399 {
5400 	kfree(net->ipv6.fib6_null_entry);
5401 	kfree(net->ipv6.ip6_null_entry);
5402 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5403 	kfree(net->ipv6.ip6_prohibit_entry);
5404 	kfree(net->ipv6.ip6_blk_hole_entry);
5405 #endif
5406 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5407 }
5408 
5409 static int __net_init ip6_route_net_init_late(struct net *net)
5410 {
5411 #ifdef CONFIG_PROC_FS
5412 	proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
5413 			sizeof(struct ipv6_route_iter));
5414 	proc_create_net_single("rt6_stats", 0444, net->proc_net,
5415 			rt6_stats_seq_show, NULL);
5416 #endif
5417 	return 0;
5418 }
5419 
5420 static void __net_exit ip6_route_net_exit_late(struct net *net)
5421 {
5422 #ifdef CONFIG_PROC_FS
5423 	remove_proc_entry("ipv6_route", net->proc_net);
5424 	remove_proc_entry("rt6_stats", net->proc_net);
5425 #endif
5426 }
5427 
5428 static struct pernet_operations ip6_route_net_ops = {
5429 	.init = ip6_route_net_init,
5430 	.exit = ip6_route_net_exit,
5431 };
5432 
5433 static int __net_init ipv6_inetpeer_init(struct net *net)
5434 {
5435 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5436 
5437 	if (!bp)
5438 		return -ENOMEM;
5439 	inet_peer_base_init(bp);
5440 	net->ipv6.peers = bp;
5441 	return 0;
5442 }
5443 
5444 static void __net_exit ipv6_inetpeer_exit(struct net *net)
5445 {
5446 	struct inet_peer_base *bp = net->ipv6.peers;
5447 
5448 	net->ipv6.peers = NULL;
5449 	inetpeer_invalidate_tree(bp);
5450 	kfree(bp);
5451 }
5452 
5453 static struct pernet_operations ipv6_inetpeer_ops = {
5454 	.init	=	ipv6_inetpeer_init,
5455 	.exit	=	ipv6_inetpeer_exit,
5456 };
5457 
5458 static struct pernet_operations ip6_route_net_late_ops = {
5459 	.init = ip6_route_net_init_late,
5460 	.exit = ip6_route_net_exit_late,
5461 };
5462 
5463 static struct notifier_block ip6_route_dev_notifier = {
5464 	.notifier_call = ip6_route_dev_notify,
5465 	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
5466 };
5467 
5468 void __init ip6_route_init_special_entries(void)
5469 {
5470 	/* Registering of the loopback is done before this portion of code,
5471 	 * the loopback reference in rt6_info will not be taken, do it
5472 	 * manually for init_net */
5473 	init_net.ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = init_net.loopback_dev;
5474 	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
5475 	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5476   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5477 	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
5478 	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5479 	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
5480 	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5481   #endif
5482 }
5483 
5484 int __init ip6_route_init(void)
5485 {
5486 	int ret;
5487 	int cpu;
5488 
5489 	ret = -ENOMEM;
5490 	ip6_dst_ops_template.kmem_cachep =
5491 		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
5492 				  SLAB_HWCACHE_ALIGN, NULL);
5493 	if (!ip6_dst_ops_template.kmem_cachep)
5494 		goto out;
5495 
5496 	ret = dst_entries_init(&ip6_dst_blackhole_ops);
5497 	if (ret)
5498 		goto out_kmem_cache;
5499 
5500 	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
5501 	if (ret)
5502 		goto out_dst_entries;
5503 
5504 	ret = register_pernet_subsys(&ip6_route_net_ops);
5505 	if (ret)
5506 		goto out_register_inetpeer;
5507 
5508 	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
5509 
5510 	ret = fib6_init();
5511 	if (ret)
5512 		goto out_register_subsys;
5513 
5514 	ret = xfrm6_init();
5515 	if (ret)
5516 		goto out_fib6_init;
5517 
5518 	ret = fib6_rules_init();
5519 	if (ret)
5520 		goto xfrm6_init;
5521 
5522 	ret = register_pernet_subsys(&ip6_route_net_late_ops);
5523 	if (ret)
5524 		goto fib6_rules_init;
5525 
5526 	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
5527 				   inet6_rtm_newroute, NULL, 0);
5528 	if (ret < 0)
5529 		goto out_register_late_subsys;
5530 
5531 	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
5532 				   inet6_rtm_delroute, NULL, 0);
5533 	if (ret < 0)
5534 		goto out_register_late_subsys;
5535 
5536 	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
5537 				   inet6_rtm_getroute, NULL,
5538 				   RTNL_FLAG_DOIT_UNLOCKED);
5539 	if (ret < 0)
5540 		goto out_register_late_subsys;
5541 
5542 	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
5543 	if (ret)
5544 		goto out_register_late_subsys;
5545 
5546 	for_each_possible_cpu(cpu) {
5547 		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
5548 
5549 		INIT_LIST_HEAD(&ul->head);
5550 		spin_lock_init(&ul->lock);
5551 	}
5552 
5553 out:
5554 	return ret;
5555 
5556 out_register_late_subsys:
5557 	rtnl_unregister_all(PF_INET6);
5558 	unregister_pernet_subsys(&ip6_route_net_late_ops);
5559 fib6_rules_init:
5560 	fib6_rules_cleanup();
5561 xfrm6_init:
5562 	xfrm6_fini();
5563 out_fib6_init:
5564 	fib6_gc_cleanup();
5565 out_register_subsys:
5566 	unregister_pernet_subsys(&ip6_route_net_ops);
5567 out_register_inetpeer:
5568 	unregister_pernet_subsys(&ipv6_inetpeer_ops);
5569 out_dst_entries:
5570 	dst_entries_destroy(&ip6_dst_blackhole_ops);
5571 out_kmem_cache:
5572 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5573 	goto out;
5574 }
5575 
5576 void ip6_route_cleanup(void)
5577 {
5578 	unregister_netdevice_notifier(&ip6_route_dev_notifier);
5579 	unregister_pernet_subsys(&ip6_route_net_late_ops);
5580 	fib6_rules_cleanup();
5581 	xfrm6_fini();
5582 	fib6_gc_cleanup();
5583 	unregister_pernet_subsys(&ipv6_inetpeer_ops);
5584 	unregister_pernet_subsys(&ip6_route_net_ops);
5585 	dst_entries_destroy(&ip6_dst_blackhole_ops);
5586 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5587 }
5588