xref: /openbmc/linux/net/ipv6/route.c (revision effda4dd97e878ab83336bec7411cc41b5cc6d37)
1 /*
2  *	Linux INET6 implementation
3  *	FIB front-end.
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13 
14 /*	Changes:
15  *
16  *	YOSHIFUJI Hideaki @USAGI
17  *		reworked default router selection.
18  *		- respect outgoing interface
19  *		- select from (probably) reachable routers (i.e.
20  *		routers in REACHABLE, STALE, DELAY or PROBE states).
21  *		- always select the same router if it is (probably)
22  *		reachable.  otherwise, round-robin the list.
23  *	Ville Nuorvala
24  *		Fixed routing subtrees.
25  */
26 
27 #define pr_fmt(fmt) "IPv6: " fmt
28 
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <net/ip.h>
67 #include <linux/uaccess.h>
68 
69 #ifdef CONFIG_SYSCTL
70 #include <linux/sysctl.h>
71 #endif
72 
73 static int ip6_rt_type_to_error(u8 fib6_type);
74 
75 #define CREATE_TRACE_POINTS
76 #include <trace/events/fib6.h>
77 EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
78 #undef CREATE_TRACE_POINTS
79 
/* Result of a nexthop neighbour reachability check (rt6_check_neigh()).
 * Negative values are failure modes; positive means the neighbour is usable.
 */
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,		/* do not use this route */
	RT6_NUD_FAIL_PROBE = -2,	/* neighbour in NUD_FAILED; probe it */
	RT6_NUD_FAIL_DO_RR = -1,	/* no neighbour entry; round-robin */
	RT6_NUD_SUCCEED = 1
};
86 
87 static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
88 static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
89 static unsigned int	 ip6_mtu(const struct dst_entry *dst);
90 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
91 static void		ip6_dst_destroy(struct dst_entry *);
92 static void		ip6_dst_ifdown(struct dst_entry *,
93 				       struct net_device *dev, int how);
94 static int		 ip6_dst_gc(struct dst_ops *ops);
95 
96 static int		ip6_pkt_discard(struct sk_buff *skb);
97 static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
98 static int		ip6_pkt_prohibit(struct sk_buff *skb);
99 static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
100 static void		ip6_link_failure(struct sk_buff *skb);
101 static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
102 					   struct sk_buff *skb, u32 mtu);
103 static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
104 					struct sk_buff *skb);
105 static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
106 			   int strict);
107 static size_t rt6_nlmsg_size(struct fib6_info *rt);
108 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
109 			 struct fib6_info *rt, struct dst_entry *dst,
110 			 struct in6_addr *dest, struct in6_addr *src,
111 			 int iif, int type, u32 portid, u32 seq,
112 			 unsigned int flags);
113 static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
114 					   struct in6_addr *daddr,
115 					   struct in6_addr *saddr);
116 
117 #ifdef CONFIG_IPV6_ROUTE_INFO
118 static struct fib6_info *rt6_add_route_info(struct net *net,
119 					   const struct in6_addr *prefix, int prefixlen,
120 					   const struct in6_addr *gwaddr,
121 					   struct net_device *dev,
122 					   unsigned int pref);
123 static struct fib6_info *rt6_get_route_info(struct net *net,
124 					   const struct in6_addr *prefix, int prefixlen,
125 					   const struct in6_addr *gwaddr,
126 					   struct net_device *dev);
127 #endif
128 
/* Per-CPU list of rt6_info entries that are not attached to the FIB tree
 * ("uncached" routes), so they can be found and fixed up on device removal.
 */
struct uncached_list {
	spinlock_t		lock;	/* protects head */
	struct list_head	head;	/* chain of rt6_info.rt6i_uncached */
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
135 
/* Link @rt onto this CPU's uncached list and remember which list it is on
 * (rt6i_uncached_list) so rt6_uncached_list_del() can find the right lock.
 */
void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}
146 
/* Unlink @rt from the uncached list it was added to, if any, and drop the
 * fib_rt_uncache statistic.  Safe to call on entries never added (the list
 * head is then empty, see rt6_info_init()).
 */
void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}
159 
/* Device @dev is going away: walk every CPU's uncached list and repoint
 * any rt6_info still using @dev (its idev and/or dst.dev) at the netns
 * loopback device, transferring the device references accordingly.
 */
static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	/* Loopback itself is never flushed this way. */
	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				/* swap idev reference over to loopback */
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				/* swap device reference over to loopback */
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}
191 
192 static inline const void *choose_neigh_daddr(const struct in6_addr *p,
193 					     struct sk_buff *skb,
194 					     const void *daddr)
195 {
196 	if (!ipv6_addr_any(p))
197 		return (const void *) p;
198 	else if (skb)
199 		return &ipv6_hdr(skb)->daddr;
200 	return daddr;
201 }
202 
/* Look up (or create) the ND neighbour entry for @gw/@daddr on @dev.
 * Returns NULL on creation failure rather than an ERR_PTR.
 */
struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
				   struct net_device *dev,
				   struct sk_buff *skb,
				   const void *daddr)
{
	struct neighbour *n;

	daddr = choose_neigh_daddr(gw, skb, daddr);
	n = __ipv6_neigh_lookup(dev, daddr);
	if (n)
		return n;

	/* No entry yet: create one in the ND table. */
	n = neigh_create(&nd_tbl, daddr, dev);
	return IS_ERR(n) ? NULL : n;
}
218 
/* dst_ops->neigh_lookup: resolve the neighbour via the route's gateway. */
static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
					      struct sk_buff *skb,
					      const void *daddr)
{
	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);

	return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
}
227 
/* dst_ops->confirm_neigh: mark the nexthop neighbour as confirmed, unless
 * the device does no neighbour resolution or the target is multicast.
 */
static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}
242 
/* dst_ops template copied into each netns (net->ipv6.ip6_dst_ops). */
static struct dst_ops ip6_dst_ops_template = {
	.family			=	AF_INET6,
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
	.default_advmss		=	ip6_default_advmss,
	.mtu			=	ip6_mtu,
	.cow_metrics		=	dst_cow_metrics_generic,
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
	.redirect		=	rt6_do_redirect,
	.local_out		=	__ip6_local_out,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
	.confirm_neigh		=	ip6_confirm_neigh,
};
261 
262 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
263 {
264 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
265 
266 	return mtu ? : dst->dev->mtu;
267 }
268 
/* Blackhole routes ignore PMTU updates: intentional no-op. */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}
273 
/* Blackhole routes ignore redirects: intentional no-op. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}
278 
/* dst_ops for blackholed dsts (e.g. ip6_dst_blackhole copies): no gc,
 * PMTU/redirect events are swallowed by the no-op handlers above.
 */
static struct dst_ops ip6_dst_blackhole_ops = {
	.family			=	AF_INET6,
	.destroy		=	ip6_dst_destroy,
	.check			=	ip6_dst_check,
	.mtu			=	ip6_blackhole_mtu,
	.default_advmss		=	ip6_default_advmss,
	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
	.redirect		=	ip6_rt_blackhole_redirect,
	.cow_metrics		=	dst_cow_metrics_generic,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
};
290 
/* Metrics template for the special route entries below; hoplimit 0 means
 * "use the default".
 */
static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};
294 
/* Template for the per-netns fib6_null_entry: a kernel-owned reject route
 * with the worst possible metric, used when no real route matches.
 */
static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol  = RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= ATOMIC_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};
303 
/* Template for the per-netns ip6_null_entry: drops packets and reports
 * -ENETUNREACH.
 */
static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};
315 
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

/* Policy-routing "prohibit" entry: drops packets with -EACCES. */
static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

/* Policy-routing "blackhole" entry: silently discards packets. */
static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#endif
343 
static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	/* Zero everything after the embedded dst_entry (which dst_alloc
	 * already initialized), then make the uncached list head valid.
	 */
	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}
351 
/* Allocate a new rt6_info (dst refcount 1) using this netns' ip6_dst_ops.
 * Returns NULL on allocation failure.
 */
struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);
367 
/* dst_ops->destroy: release everything a rt6_info holds — metrics,
 * uncached-list membership, idev reference and the fib6_info it was
 * created from.
 */
static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from;
	struct inet6_dev *idev;

	ip_dst_metrics_put(dst);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	/* Detach from the originating fib6_info and drop its reference. */
	rcu_read_lock();
	from = rcu_dereference(rt->from);
	rcu_assign_pointer(rt->from, NULL);
	fib6_info_release(from);
	rcu_read_unlock();
}
389 
/* dst_ops->ifdown: @dev is going down, so migrate this route's idev
 * reference to the netns loopback device.
 */
static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}
406 
407 static bool __rt6_check_expired(const struct rt6_info *rt)
408 {
409 	if (rt->rt6i_flags & RTF_EXPIRES)
410 		return time_after(jiffies, rt->dst.expires);
411 	else
412 		return false;
413 }
414 
/* Like __rt6_check_expired(), but a route without its own expiry is also
 * considered expired when its dst is obsolete or its originating
 * fib6_info has expired.  Caller holds rcu_read_lock.
 */
static bool rt6_check_expired(const struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (from) {
		/* no own timer: inherit validity from the fib entry */
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(from);
	}
	return false;
}
430 
/* Multipath selection: pick the sibling route whose hash bucket contains
 * fl6->mp_hash.  On return res->f6i/res->nh are set to the chosen entry
 * (the original match if there are no siblings or no better sibling).
 */
void fib6_select_path(const struct net *net, struct fib6_result *res,
		      struct flowi6 *fl6, int oif, bool have_oif_match,
		      const struct sk_buff *skb, int strict)
{
	struct fib6_info *sibling, *next_sibling;
	struct fib6_info *match = res->f6i;

	if (!match->fib6_nsiblings || have_oif_match)
		goto out;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	/* Hash falls in the first nexthop's bucket: keep the match. */
	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.fib_nh_upper_bound))
		goto out;

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
		const struct fib6_nh *nh = &sibling->fib6_nh;
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		/* bucket owner found; reject it if unusable */
		if (rt6_score_route(nh, sibling->fib6_flags, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

out:
	res->f6i = match;
	res->nh = &match->fib6_nh;
}
468 
469 /*
470  *	Route lookup. rcu_read_lock() should be held.
471  */
472 
473 static bool __rt6_device_match(struct net *net, const struct fib6_nh *nh,
474 			       const struct in6_addr *saddr, int oif, int flags)
475 {
476 	const struct net_device *dev;
477 
478 	if (nh->fib_nh_flags & RTNH_F_DEAD)
479 		return false;
480 
481 	dev = nh->fib_nh_dev;
482 	if (oif) {
483 		if (dev->ifindex == oif)
484 			return true;
485 	} else {
486 		if (ipv6_chk_addr(net, saddr, dev,
487 				  flags & RT6_LOOKUP_F_IFACE))
488 			return true;
489 	}
490 
491 	return false;
492 }
493 
494 static void rt6_device_match(struct net *net, struct fib6_result *res,
495 			     const struct in6_addr *saddr, int oif, int flags)
496 {
497 	struct fib6_info *f6i = res->f6i;
498 	struct fib6_info *spf6i;
499 	struct fib6_nh *nh;
500 
501 	if (!oif && ipv6_addr_any(saddr)) {
502 		nh = &f6i->fib6_nh;
503 		if (!(nh->fib_nh_flags & RTNH_F_DEAD)) {
504 			res->nh = nh;
505 			return;
506 		}
507 	}
508 
509 	for (spf6i = f6i; spf6i; spf6i = rcu_dereference(spf6i->fib6_next)) {
510 		nh = &spf6i->fib6_nh;
511 		if (__rt6_device_match(net, nh, saddr, oif, flags)) {
512 			res->f6i = spf6i;
513 			res->nh = nh;
514 		}
515 	}
516 
517 	if (oif && flags & RT6_LOOKUP_F_IFACE) {
518 		res->f6i = net->ipv6.fib6_null_entry;
519 		res->nh = &res->f6i->fib6_nh;
520 		return;
521 	}
522 
523 	res->nh = &f6i->fib6_nh;
524 	if (res->nh->fib_nh_flags & RTNH_F_DEAD) {
525 		res->f6i = net->ipv6.fib6_null_entry;
526 		res->nh = &res->f6i->fib6_nh;
527 	}
528 }
529 
530 #ifdef CONFIG_IPV6_ROUTER_PREF
/* Deferred-work context for sending a single neighbour solicitation. */
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;		/* address being probed */
	struct net_device *dev;		/* held; released by the worker */
};
536 
/* Workqueue handler: send the NS for the probe scheduled by rt6_probe(),
 * then drop the device reference and free the work item.
 */
static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}
548 
549 static void rt6_probe(struct fib6_nh *fib6_nh)
550 {
551 	struct __rt6_probe_work *work = NULL;
552 	const struct in6_addr *nh_gw;
553 	struct neighbour *neigh;
554 	struct net_device *dev;
555 	struct inet6_dev *idev;
556 
557 	/*
558 	 * Okay, this does not seem to be appropriate
559 	 * for now, however, we need to check if it
560 	 * is really so; aka Router Reachability Probing.
561 	 *
562 	 * Router Reachability Probe MUST be rate-limited
563 	 * to no more than one per minute.
564 	 */
565 	if (fib6_nh->fib_nh_gw_family)
566 		return;
567 
568 	nh_gw = &fib6_nh->fib_nh_gw6;
569 	dev = fib6_nh->fib_nh_dev;
570 	rcu_read_lock_bh();
571 	idev = __in6_dev_get(dev);
572 	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
573 	if (neigh) {
574 		if (neigh->nud_state & NUD_VALID)
575 			goto out;
576 
577 		write_lock(&neigh->lock);
578 		if (!(neigh->nud_state & NUD_VALID) &&
579 		    time_after(jiffies,
580 			       neigh->updated + idev->cnf.rtr_probe_interval)) {
581 			work = kmalloc(sizeof(*work), GFP_ATOMIC);
582 			if (work)
583 				__neigh_set_probe_once(neigh);
584 		}
585 		write_unlock(&neigh->lock);
586 	} else if (time_after(jiffies, fib6_nh->last_probe +
587 				       idev->cnf.rtr_probe_interval)) {
588 		work = kmalloc(sizeof(*work), GFP_ATOMIC);
589 	}
590 
591 	if (work) {
592 		fib6_nh->last_probe = jiffies;
593 		INIT_WORK(&work->work, rt6_probe_deferred);
594 		work->target = *nh_gw;
595 		dev_hold(dev);
596 		work->dev = dev;
597 		schedule_work(&work->work);
598 	}
599 
600 out:
601 	rcu_read_unlock_bh();
602 }
603 #else
/* Router probing is only built with CONFIG_IPV6_ROUTER_PREF: no-op stub. */
static inline void rt6_probe(struct fib6_nh *fib6_nh)
{
}
607 #endif
608 
609 /*
610  * Default Router Selection (RFC 2461 6.3.6)
611  */
/* Classify the reachability of @fib6_nh's gateway neighbour.  With
 * CONFIG_IPV6_ROUTER_PREF any state short of NUD_FAILED counts as usable;
 * a missing entry triggers probing (SUCCEED) or round-robin fallback.
 */
static enum rt6_nud_state rt6_check_neigh(const struct fib6_nh *fib6_nh)
{
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(fib6_nh->fib_nh_dev,
					  &fib6_nh->fib_nh_gw6);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}
639 
/* Score a nexthop for default-router selection.  Returns a non-negative
 * score (device match worth 2, router preference in higher bits), or a
 * negative rt6_nud_state failure code.
 */
static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
			   int strict)
{
	int m = 0;

	if (!oif || nh->fib_nh_dev->ifindex == oif)
		m = 2;

	/* strict interface lookup with a device mismatch is fatal */
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(fib6_flags)) << 2;
#endif
	if ((strict & RT6_LOOKUP_F_REACHABLE) &&
	    !(fib6_flags & RTF_NONEXTHOP) && nh->fib_nh_gw_family) {
		int n = rt6_check_neigh(nh);
		if (n < 0)
			return n;
	}
	return m;
}
661 
/* Evaluate one nexthop against the best score seen so far (*mpri).
 * Returns true when @nh becomes the new best; *do_rr is set when the
 * score requests round-robin fallback.
 */
static bool find_match(struct fib6_nh *nh, u32 fib6_flags,
		       int oif, int strict, int *mpri, bool *do_rr)
{
	bool match_do_rr = false;
	bool rc = false;
	int m;

	if (nh->fib_nh_flags & RTNH_F_DEAD)
		goto out;

	/* skip link-down nexthops unless explicitly allowed */
	if (ip6_ignore_linkdown(nh->fib_nh_dev) &&
	    nh->fib_nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	m = rt6_score_route(nh, fib6_flags, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(nh);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		rc = true;
	}
out:
	return rc;
}
697 
/* Walk the fib6_next chain from @f6i_start (stopping before @nomatch),
 * scoring each entry's nexthop and recording the best in @res.  When
 * @cont is given, stop at the first entry whose metric differs from
 * @metric and report it through *cont for a later pass.
 */
static void __find_rr_leaf(struct fib6_info *f6i_start,
			   struct fib6_info *nomatch, u32 metric,
			   struct fib6_result *res, struct fib6_info **cont,
			   int oif, int strict, bool *do_rr, int *mpri)
{
	struct fib6_info *f6i;

	for (f6i = f6i_start;
	     f6i && f6i != nomatch;
	     f6i = rcu_dereference(f6i->fib6_next)) {
		struct fib6_nh *nh;

		if (cont && f6i->fib6_metric != metric) {
			*cont = f6i;
			return;
		}

		if (fib6_check_expired(f6i))
			continue;

		nh = &f6i->fib6_nh;
		if (find_match(nh, f6i->fib6_flags, oif, strict, mpri, do_rr)) {
			res->f6i = f6i;
			res->nh = nh;
		}
	}
}
725 
/* Round-robin leaf search: scan from rr_head to the end of the chain,
 * then wrap from the leaf up to rr_head, within the same metric.  Only
 * if nothing matched do we consider entries with a different metric.
 */
static void find_rr_leaf(struct fib6_node *fn, struct fib6_info *leaf,
			 struct fib6_info *rr_head, int oif, int strict,
			 bool *do_rr, struct fib6_result *res)
{
	u32 metric = rr_head->fib6_metric;
	struct fib6_info *cont = NULL;
	int mpri = -1;

	__find_rr_leaf(rr_head, NULL, metric, res, &cont,
		       oif, strict, do_rr, &mpri);

	__find_rr_leaf(leaf, rr_head, metric, res, &cont,
		       oif, strict, do_rr, &mpri);

	if (res->f6i || !cont)
		return;

	__find_rr_leaf(cont, NULL, metric, res, NULL,
		       oif, strict, do_rr, &mpri);
}
746 
/* Default router selection for node @fn: pick the best route via
 * find_rr_leaf() and, when round-robin was requested, advance fn->rr_ptr
 * so the next lookup starts at the following sibling.  res->f6i/res->nh
 * are always set on return (possibly to the null entry).
 */
static void rt6_select(struct net *net, struct fib6_node *fn, int oif,
		       struct fib6_result *res, int strict)
{
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *rt0;
	bool do_rr = false;
	int key_plen;

	/* make sure this function or its helpers sets f6i */
	res->f6i = NULL;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		goto out;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not points to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		goto out;

	find_rr_leaf(fn, leaf, rt0, oif, strict, &do_rr, res);
	if (do_rr) {
		struct fib6_info *next = rcu_dereference(rt0->fib6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->fib6_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->fib6_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
		}
	}

out:
	if (!res->f6i) {
		res->f6i = net->ipv6.fib6_null_entry;
		res->nh = &res->f6i->fib6_nh;
	}
}
801 
802 static bool rt6_is_gw_or_nonexthop(const struct fib6_result *res)
803 {
804 	return (res->f6i->fib6_flags & RTF_NONEXTHOP) ||
805 	       res->nh->fib_nh_gw_family;
806 }
807 
808 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Process a Route Information option (RFC 4191) received in an RA from
 * @gwaddr on @dev: add, refresh or delete the corresponding route
 * depending on the advertised lifetime.  Returns 0 or -EINVAL on a
 * malformed option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		/* prefixes longer than 64 bits need the full 16-byte field */
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	/* prefix_len 0 advertises a default router */
	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	/* zero lifetime withdraws an existing route */
	if (rt && !lifetime) {
		ip6_del_rt(net, rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
		else
			fib6_set_expires(rt, jiffies + HZ * lifetime);

		fib6_info_release(rt);
	}
	return 0;
}
882 #endif
883 
884 /*
885  *	Misc support functions
886  */
887 
/* Resolve the dst device for @res; called with rcu_read_lock held. */
static struct net_device *ip6_rt_get_dev_rcu(const struct fib6_result *res)
{
	struct net_device *dev = res->nh->fib_nh_dev;
	const struct fib6_info *f6i = res->f6i;

	if (f6i->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
		/* for copies of local routes, dst->dev needs to be the
		 * device if it is a master device, the master device if
		 * device is enslaved, and the loopback as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&f6i->fib6_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* last case is netif_is_l3_master(dev) is true in which
		 * case we want dev returned to be dev
		 */
	}

	return dev;
}
911 
/* Map fib6_type to the dst error reported for packets hitting the route;
 * 0 means the route forwards/delivers normally.
 */
static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]	= 0,
	[RTN_UNICAST]	= 0,
	[RTN_LOCAL]	= 0,
	[RTN_BROADCAST]	= 0,
	[RTN_ANYCAST]	= 0,
	[RTN_MULTICAST]	= 0,
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_NAT]	= -EINVAL,
	[RTN_XRESOLVE]	= -EINVAL,
};
926 
/* Translate a fib6_type into the errno stored in dst.error. */
static int ip6_rt_type_to_error(u8 fib6_type)
{
	return fib6_prop[fib6_type];
}
931 
932 static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
933 {
934 	unsigned short flags = 0;
935 
936 	if (rt->dst_nocount)
937 		flags |= DST_NOCOUNT;
938 	if (rt->dst_nopolicy)
939 		flags |= DST_NOPOLICY;
940 	if (rt->dst_host)
941 		flags |= DST_HOST;
942 
943 	return flags;
944 }
945 
/* Wire up input/output handlers and dst.error for a reject-type route
 * according to its fib6_type.
 */
static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
{
	rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);

	switch (ort->fib6_type) {
	case RTN_BLACKHOLE:
		rt->dst.output = dst_discard_out;
		rt->dst.input = dst_discard;
		break;
	case RTN_PROHIBIT:
		rt->dst.output = ip6_pkt_prohibit_out;
		rt->dst.input = ip6_pkt_prohibit;
		break;
	case RTN_THROW:
	case RTN_UNREACHABLE:
	default:
		/* discard and report the error (e.g. -EHOSTUNREACH) */
		rt->dst.output = ip6_pkt_discard_out;
		rt->dst.input = ip6_pkt_discard;
		break;
	}
}
967 
/* Initialize the dst handlers of @rt from the looked-up fib entry:
 * reject routes get error handlers, otherwise input is chosen by the
 * destination type (local/multicast/forward) and lwtunnel state is
 * attached if the nexthop has one.
 */
static void ip6_rt_init_dst(struct rt6_info *rt, const struct fib6_result *res)
{
	struct fib6_info *ort = res->f6i;

	if (ort->fib6_flags & RTF_REJECT) {
		ip6_rt_init_dst_reject(rt, ort);
		return;
	}

	rt->dst.error = 0;
	rt->dst.output = ip6_output;

	if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) {
		rt->dst.input = ip6_input;
	} else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
		rt->dst.input = ip6_mc_input;
	} else {
		rt->dst.input = ip6_forward;
	}

	if (res->nh->fib_nh_lws) {
		rt->dst.lwtstate = lwtstate_get(res->nh->fib_nh_lws);
		lwtunnel_set_redirect(&rt->dst);
	}

	rt->dst.lastuse = jiffies;
}
995 
/* Attach @rt to its originating fib6_info and inherit its metrics.
 * Caller must already hold a reference to @from.
 */
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
	rt->rt6i_flags &= ~RTF_EXPIRES;
	rcu_assign_pointer(rt->from, from);
	ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
}
1003 
/* Populate a freshly allocated rt6_info from a lookup result: dst
 * handlers, destination/source keys, idev, gateway and flags.
 * Caller must already hold a reference to res->f6i.
 */
static void ip6_rt_copy_init(struct rt6_info *rt, const struct fib6_result *res)
{
	const struct fib6_nh *nh = res->nh;
	const struct net_device *dev = nh->fib_nh_dev;
	struct fib6_info *f6i = res->f6i;

	ip6_rt_init_dst(rt, res);

	rt->rt6i_dst = f6i->fib6_dst;
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
	rt->rt6i_flags = f6i->fib6_flags;
	if (nh->fib_nh_gw_family) {
		rt->rt6i_gateway = nh->fib_nh_gw6;
		rt->rt6i_flags |= RTF_GATEWAY;
	}
	rt6_set_from(rt, f6i);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = f6i->fib6_src;
#endif
}
1025 
/* Walk back up the fib trie from @fn, descending into source-address
 * subtrees along the way, until a node carrying route info (RTN_RTINFO)
 * is found.  Returns NULL on reaching the tree root.  Caller holds
 * rcu_read_lock.
 */
static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_node_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}
1043 
1044 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt)
1045 {
1046 	struct rt6_info *rt = *prt;
1047 
1048 	if (dst_hold_safe(&rt->dst))
1049 		return true;
1050 	if (net) {
1051 		rt = net->ipv6.ip6_null_entry;
1052 		dst_hold(&rt->dst);
1053 	} else {
1054 		rt = NULL;
1055 	}
1056 	*prt = rt;
1057 	return false;
1058 }
1059 
/* Create a standalone rt6_info from a lookup result; called with
 * rcu_read_lock held.  Falls back to the netns null entry (with a held
 * reference) if the fib entry is going away or allocation fails.
 */
static struct rt6_info *ip6_create_rt_rcu(const struct fib6_result *res)
{
	struct net_device *dev = res->nh->fib_nh_dev;
	struct fib6_info *f6i = res->f6i;
	unsigned short flags;
	struct rt6_info *nrt;

	if (!fib6_info_hold_safe(f6i))
		goto fallback;

	flags = fib6_info_dst_flags(f6i);
	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
	if (!nrt) {
		fib6_info_release(f6i);
		goto fallback;
	}

	ip6_rt_copy_init(nrt, res);
	return nrt;

fallback:
	nrt = dev_net(dev)->ipv6.ip6_null_entry;
	dst_hold(&nrt->dst);
	return nrt;
}
1086 
/* Core policy-routing lookup in one table: find the matching fib node,
 * select device/path, prefer a cached exception route, otherwise build a
 * fresh rt6_info.  Always returns a dst with a reference held (possibly
 * the null entry).
 */
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct fib6_result res = {};
	struct fib6_node *fn;
	struct rt6_info *rt;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	res.f6i = rcu_dereference(fn->leaf);
	if (!res.f6i)
		res.f6i = net->ipv6.fib6_null_entry;
	else
		rt6_device_match(net, &res, &fl6->saddr, fl6->flowi6_oif,
				 flags);

	if (res.f6i == net->ipv6.fib6_null_entry) {
		/* no match here: climb the trie and retry */
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;

		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
		goto out;
	}

	fib6_select_path(net, &res, fl6, fl6->flowi6_oif,
			 fl6->flowi6_oif != 0, skb, flags);

	/* Search through exception table */
	rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt))
			dst_use_noref(&rt->dst, jiffies);
	} else {
		rt = ip6_create_rt_rcu(&res);
	}

out:
	trace_fib6_table_lookup(net, &res, table, fl6);

	rcu_read_unlock();

	return rt;
}
1139 
/* Policy-rule aware front-end for ip6_pol_route_lookup() */
struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);
1146 
1147 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
1148 			    const struct in6_addr *saddr, int oif,
1149 			    const struct sk_buff *skb, int strict)
1150 {
1151 	struct flowi6 fl6 = {
1152 		.flowi6_oif = oif,
1153 		.daddr = *daddr,
1154 	};
1155 	struct dst_entry *dst;
1156 	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
1157 
1158 	if (saddr) {
1159 		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
1160 		flags |= RT6_LOOKUP_F_HAS_SADDR;
1161 	}
1162 
1163 	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
1164 	if (dst->error == 0)
1165 		return (struct rt6_info *) dst;
1166 
1167 	dst_release(dst);
1168 
1169 	return NULL;
1170 }
1171 EXPORT_SYMBOL(rt6_lookup);
1172 
/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason, the
 * route is released.
 * Caller must hold a dst reference before calling it.
 */
1178 
1179 static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
1180 			struct netlink_ext_ack *extack)
1181 {
1182 	int err;
1183 	struct fib6_table *table;
1184 
1185 	table = rt->fib6_table;
1186 	spin_lock_bh(&table->tb6_lock);
1187 	err = fib6_add(&table->tb6_root, rt, info, extack);
1188 	spin_unlock_bh(&table->tb6_lock);
1189 
1190 	return err;
1191 }
1192 
1193 int ip6_ins_rt(struct net *net, struct fib6_info *rt)
1194 {
1195 	struct nl_info info = {	.nl_net = net, };
1196 
1197 	return __ip6_ins_rt(rt, &info, NULL);
1198 }
1199 
/* Allocate a RTF_CACHE clone of the matched route, keyed by the full
 * (daddr[, saddr]) host addresses.  Returns NULL when res->f6i is going
 * away or allocation fails; on success a reference on res->f6i is held
 * by the clone.
 */
static struct rt6_info *ip6_rt_cache_alloc(const struct fib6_result *res,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct fib6_info *f6i = res->f6i;
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	if (!fib6_info_hold_safe(f6i))
		return NULL;

	dev = ip6_rt_get_dev_rcu(res);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	if (!rt) {
		fib6_info_release(f6i);
		return NULL;
	}

	ip6_rt_copy_init(rt, res);
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	/* cache entries are always full /128 host routes */
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(res)) {
		/* daddr matching a non-host origin prefix exactly is
		 * treated as anycast
		 */
		if (f6i->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&f6i->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}
1242 
/* Allocate a per-cpu (RTF_PCPU) copy of the matched route.  Returns NULL
 * when res->f6i is going away or allocation fails; on success a reference
 * on res->f6i is held by the copy.
 */
static struct rt6_info *ip6_rt_pcpu_alloc(const struct fib6_result *res)
{
	struct fib6_info *f6i = res->f6i;
	unsigned short flags = fib6_info_dst_flags(f6i);
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	if (!fib6_info_hold_safe(f6i))
		return NULL;

	/* rcu lock only needed around the nexthop device lookup */
	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(res);
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
	rcu_read_unlock();
	if (!pcpu_rt) {
		fib6_info_release(f6i);
		return NULL;
	}
	ip6_rt_copy_init(pcpu_rt, res);
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}
1265 
1266 /* It should be called with rcu_read_lock() acquired */
1267 static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res)
1268 {
1269 	struct rt6_info *pcpu_rt, **p;
1270 
1271 	p = this_cpu_ptr(res->f6i->rt6i_pcpu);
1272 	pcpu_rt = *p;
1273 
1274 	if (pcpu_rt)
1275 		ip6_hold_safe(NULL, &pcpu_rt);
1276 
1277 	return pcpu_rt;
1278 }
1279 
/* Allocate a per-cpu copy of the matched route and install it in this
 * CPU's slot.  Falls back to a held null entry on allocation failure.
 */
static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    const struct fib6_result *res)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(res);
	if (!pcpu_rt) {
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	/* take the caller's reference before publishing the route */
	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(res->f6i->rt6i_pcpu);
	/* the slot must be empty here (see rt6_get_pcpu_route());
	 * a non-NULL previous value is a bug
	 */
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	return pcpu_rt;
}
1298 
/* exception (RTF_CACHE) route hash table implementation
 *
 * Global lock serializing writers of every fib6_info's
 * rt6i_exception_bucket; lookups on the read side use RCU.
 */
static DEFINE_SPINLOCK(rt6_exception_lock);
1302 
/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct fib6_info *from;
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	net->ipv6.rt6_stats->fib_rt_cache--;

	/* purge completely the exception to allow releasing the held resources:
	 * some [sk] cache may keep the dst around for unlimited time
	 */
	from = rcu_dereference_protected(rt6_ex->rt6i->from,
					 lockdep_is_held(&rt6_exception_lock));
	rcu_assign_pointer(rt6_ex->rt6i->from, NULL);
	fib6_info_release(from);
	dst_dev_put(&rt6_ex->rt6i->dst);

	/* unlink first, then drop the table's dst reference; the rt6_ex
	 * struct itself is freed after a grace period since RCU readers
	 * may still be walking the chain
	 */
	hlist_del_rcu(&rt6_ex->hlist);
	dst_release(&rt6_ex->rt6i->dst);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
}
1333 
1334 /* Remove oldest rt6_ex in bucket and free the memory
1335  * Caller must hold rt6_exception_lock
1336  */
1337 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1338 {
1339 	struct rt6_exception *rt6_ex, *oldest = NULL;
1340 
1341 	if (!bucket)
1342 		return;
1343 
1344 	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1345 		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1346 			oldest = rt6_ex;
1347 	}
1348 	rt6_remove_exception(bucket, oldest);
1349 }
1350 
1351 static u32 rt6_exception_hash(const struct in6_addr *dst,
1352 			      const struct in6_addr *src)
1353 {
1354 	static u32 seed __read_mostly;
1355 	u32 val;
1356 
1357 	net_get_random_once(&seed, sizeof(seed));
1358 	val = jhash(dst, sizeof(*dst), seed);
1359 
1360 #ifdef CONFIG_IPV6_SUBTREES
1361 	if (src)
1362 		val = jhash(src, sizeof(*src), val);
1363 #endif
1364 	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1365 }
1366 
1367 /* Helper function to find the cached rt in the hash table
1368  * and update bucket pointer to point to the bucket for this
1369  * (daddr, saddr) pair
1370  * Caller must hold rt6_exception_lock
1371  */
1372 static struct rt6_exception *
1373 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1374 			      const struct in6_addr *daddr,
1375 			      const struct in6_addr *saddr)
1376 {
1377 	struct rt6_exception *rt6_ex;
1378 	u32 hval;
1379 
1380 	if (!(*bucket) || !daddr)
1381 		return NULL;
1382 
1383 	hval = rt6_exception_hash(daddr, saddr);
1384 	*bucket += hval;
1385 
1386 	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1387 		struct rt6_info *rt6 = rt6_ex->rt6i;
1388 		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1389 
1390 #ifdef CONFIG_IPV6_SUBTREES
1391 		if (matched && saddr)
1392 			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1393 #endif
1394 		if (matched)
1395 			return rt6_ex;
1396 	}
1397 	return NULL;
1398 }
1399 
1400 /* Helper function to find the cached rt in the hash table
1401  * and update bucket pointer to point to the bucket for this
1402  * (daddr, saddr) pair
1403  * Caller must hold rcu_read_lock()
1404  */
1405 static struct rt6_exception *
1406 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1407 			 const struct in6_addr *daddr,
1408 			 const struct in6_addr *saddr)
1409 {
1410 	struct rt6_exception *rt6_ex;
1411 	u32 hval;
1412 
1413 	WARN_ON_ONCE(!rcu_read_lock_held());
1414 
1415 	if (!(*bucket) || !daddr)
1416 		return NULL;
1417 
1418 	hval = rt6_exception_hash(daddr, saddr);
1419 	*bucket += hval;
1420 
1421 	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1422 		struct rt6_info *rt6 = rt6_ex->rt6i;
1423 		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1424 
1425 #ifdef CONFIG_IPV6_SUBTREES
1426 		if (matched && saddr)
1427 			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1428 #endif
1429 		if (matched)
1430 			return rt6_ex;
1431 	}
1432 	return NULL;
1433 }
1434 
1435 static unsigned int fib6_mtu(const struct fib6_result *res)
1436 {
1437 	const struct fib6_nh *nh = res->nh;
1438 	unsigned int mtu;
1439 
1440 	if (res->f6i->fib6_pmtu) {
1441 		mtu = res->f6i->fib6_pmtu;
1442 	} else {
1443 		struct net_device *dev = nh->fib_nh_dev;
1444 		struct inet6_dev *idev;
1445 
1446 		rcu_read_lock();
1447 		idev = __in6_dev_get(dev);
1448 		mtu = idev->cnf.mtu6;
1449 		rcu_read_unlock();
1450 	}
1451 
1452 	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1453 
1454 	return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
1455 }
1456 
/* Insert the cached route @nrt into the exception table of res->f6i.
 *
 * Returns 0 on success, -EINVAL when the origin route is being deleted
 * (bucket already flushed) or nrt's mtu does not qualify, -ENOMEM on
 * allocation failure.  On success the table serial number is bumped so
 * stale cached dsts get invalidated, and garbage collection is kicked.
 */
static int rt6_insert_exception(struct rt6_info *nrt,
				const struct fib6_result *res)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *f6i = res->f6i;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	if (f6i->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	/* lazily allocate the bucket array on first insertion */
	bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(f6i->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* fib6_src.plen != 0 indicates f6i is in subtree
	 * and exception table is indexed by a hash of
	 * both fib6_dst and fib6_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only fib6_dst.
	 */
	if (f6i->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif
	/* rt6_mtu_change() might lower mtu on f6i.
	 * Only insert this exception route if its mtu
	 * is less than f6i's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(res)) {
		err = -EINVAL;
		goto out;
	}

	/* replace any existing exception for the same (daddr, saddr) */
	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	/* bound the per-bucket chain length by evicting the LRU entry */
	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&f6i->fib6_table->tb6_lock);
		fib6_update_sernum(net, f6i);
		spin_unlock_bh(&f6i->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}
1537 
/* Remove and free every exception route cached on @rt, and mark the
 * table flushed so rt6_insert_exception() cannot re-create it.  Called
 * when @rt is being torn down.
 */
void rt6_flush_exceptions(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() to recreate the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		/* rt6_remove_exception() decrements depth for each entry */
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}
1564 
/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 * Returns NULL when no (unexpired) exception matches; no reference is
 * taken on the returned route.
 */
static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *ret = NULL;

	bucket = rcu_dereference(res->f6i->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* fib6i_src.plen != 0 indicates f6i is in subtree
	 * and exception table is indexed by a hash of
	 * both fib6_dst and fib6_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only fib6_dst.
	 */
	if (res->f6i->fib6_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	/* expired exceptions are skipped, not removed, here */
	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		ret = rt6_ex->rt6i;

	return ret;
}
1596 
/* Remove the passed in cached rt from the hash table that contains it.
 * Returns 0 on success, -EINVAL if @rt is not a cached route or has no
 * origin, -ENOENT if it is not in the table.
 */
static int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;
	int err;

	from = rcu_dereference(rt->from);
	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}
1640 
/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp (used by the LRU eviction in
 * rt6_exception_remove_oldest() and by exception aging).
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	/* silently ignore routes that are not cached exceptions */
	if (!from || !(rt->rt6i_flags & RTF_CACHE))
		goto unlock;

	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

unlock:
	rcu_read_unlock();
}
1677 
1678 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1679 					 struct rt6_info *rt, int mtu)
1680 {
1681 	/* If the new MTU is lower than the route PMTU, this new MTU will be the
1682 	 * lowest MTU in the path: always allow updating the route PMTU to
1683 	 * reflect PMTU decreases.
1684 	 *
1685 	 * If the new MTU is higher, and the route PMTU is equal to the local
1686 	 * MTU, this means the old MTU is the lowest in the path, so allow
1687 	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
1688 	 * handle this.
1689 	 */
1690 
1691 	if (dst_mtu(&rt->dst) >= mtu)
1692 		return true;
1693 
1694 	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
1695 		return true;
1696 
1697 	return false;
1698 }
1699 
/* Propagate the new device mtu to all eligible exception routes cached
 * on @rt (see rt6_mtu_change_route_allowed()).
 * Caller must hold rt6_exception_lock.
 */
static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct fib6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
		}
		bucket++;
	}
}
1728 
/* mask matching cached (RTF_CACHE) routes whose nexthop is a gateway */
#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)
1730 
/* Drop every cached gateway exception on @rt whose nexthop is @gateway,
 * e.g. when that gateway is no longer usable.
 */
static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	/* cheap lockless check before taking the writer lock */
	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				     lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				/* both RTF_GATEWAY and RTF_CACHE must be
				 * set and the gateway must match
				 */
				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}
1765 
/* Examine one exception entry during garbage collection and remove it if
 * it aged out, expired, or points to a gateway no longer flagged as a
 * router; otherwise count it as still-live in gc_args->more.
 * Caller must hold rt6_exception_lock (and rcu_read_lock_bh for the
 * neighbour lookup).
 */
static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non gateway exceptions
	 * even if others have still references to them, so that on next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
			neigh_flags = neigh->flags;

		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}
1809 
/* Garbage-collect the exception table of @rt: walk every bucket and let
 * rt6_age_examine_exception() decide each entry's fate.
 */
void rt6_age_exceptions(struct fib6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	/* cheap lockless check before taking the locks */
	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	/* rcu_read_lock_bh covers the neighbour lookup done while
	 * examining entries; the spinlock serializes removals
	 */
	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}
1840 
/* must be called with rcu lock held
 *
 * Tree lookup with backtracking: fills *res with the best match, which
 * may be the fib6 null entry when nothing matches.  When a reachability-
 * strict lookup finds nothing, it is retried from the original node with
 * the REACHABLE requirement dropped.  Always returns 0.
 */
int fib6_table_lookup(struct net *net, struct fib6_table *table, int oif,
		      struct flowi6 *fl6, struct fib6_result *res, int strict)
{
	struct fib6_node *fn, *saved_fn;

	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	/* caller explicitly asked to ignore the nexthop device */
	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	rt6_select(net, fn, oif, res, strict);
	if (res->f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	trace_fib6_table_lookup(net, res, table, fl6);

	return 0;
}
1871 
/* Main policy routing resolver.  Performs the table lookup and returns a
 * held rt6_info via one of three paths: an existing cached exception, a
 * freshly built uncached RTF_CACHE clone (FLOWI_FLAG_KNOWN_NH without a
 * gateway nexthop), or the per-cpu copy of the matched route.
 */
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_result res = {};
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	/* with forwarding disabled, prefer (probably) reachable routers */
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	fib6_table_lookup(net, table, oif, fl6, &res, strict);
	if (res.f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		rcu_read_unlock();
		dst_hold(&rt->dst);
		return rt;
	}

	fib6_select_path(net, &res, fl6, oif, false, skb, strict);

	/* Search through exception table */
	rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
	if (rt) {
		/* cached clone found: take a ref unless it is going away */
		if (ip6_hold_safe(net, &rt))
			dst_use_noref(&rt->dst, jiffies);

		rcu_read_unlock();
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !res.nh->fib_nh_gw_family)) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */
		struct rt6_info *uncached_rt;

		uncached_rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL);

		rcu_read_unlock();

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		return uncached_rt;
	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		/* disable BH so the per-cpu slot is not touched from
		 * softirq context underneath us
		 */
		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(&res);

		if (!pcpu_rt)
			pcpu_rt = rt6_make_pcpu_route(net, &res);

		local_bh_enable();
		rcu_read_unlock();

		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);
1948 
/* Input-path adapter for ip6_pol_route(): uses the incoming interface */
static struct rt6_info *ip6_pol_route_input(struct net *net,
					    struct fib6_table *table,
					    struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
}
1957 
1958 struct dst_entry *ip6_route_input_lookup(struct net *net,
1959 					 struct net_device *dev,
1960 					 struct flowi6 *fl6,
1961 					 const struct sk_buff *skb,
1962 					 int flags)
1963 {
1964 	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1965 		flags |= RT6_LOOKUP_F_IFACE;
1966 
1967 	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
1968 }
1969 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1970 
/* Fill @keys with the L3 fields used for multipath hashing.  For ICMPv6
 * error messages the embedded (inner) header is hashed instead of the
 * outer one, so errors follow the same path as the flow they refer to;
 * pre-dissected @flkeys are only usable for the non-error case.
 */
static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys,
				  struct flow_keys *flkeys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	struct flow_keys *_flkeys = flkeys;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;
	struct icmp6hdr _icmph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
				   sizeof(_icmph), &_icmph);
	if (!icmph)
		goto out;

	/* only ICMPv6 *error* messages carry an embedded offending packet */
	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
	_flkeys = NULL;
out:
	if (_flkeys) {
		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
		keys->tags.flow_label = _flkeys->tags.flow_label;
		keys->basic.ip_proto = _flkeys->basic.ip_proto;
	} else {
		keys->addrs.v6addrs.src = key_iph->saddr;
		keys->addrs.v6addrs.dst = key_iph->daddr;
		keys->tags.flow_label = ip6_flowlabel(key_iph);
		keys->basic.ip_proto = key_iph->nexthdr;
	}
}
2018 
2019 /* if skb is set it will be used and fl6 can be NULL */
2020 u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
2021 		       const struct sk_buff *skb, struct flow_keys *flkeys)
2022 {
2023 	struct flow_keys hash_keys;
2024 	u32 mhash;
2025 
2026 	switch (ip6_multipath_hash_policy(net)) {
2027 	case 0:
2028 		memset(&hash_keys, 0, sizeof(hash_keys));
2029 		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2030 		if (skb) {
2031 			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
2032 		} else {
2033 			hash_keys.addrs.v6addrs.src = fl6->saddr;
2034 			hash_keys.addrs.v6addrs.dst = fl6->daddr;
2035 			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
2036 			hash_keys.basic.ip_proto = fl6->flowi6_proto;
2037 		}
2038 		break;
2039 	case 1:
2040 		if (skb) {
2041 			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
2042 			struct flow_keys keys;
2043 
2044 			/* short-circuit if we already have L4 hash present */
2045 			if (skb->l4_hash)
2046 				return skb_get_hash_raw(skb) >> 1;
2047 
2048 			memset(&hash_keys, 0, sizeof(hash_keys));
2049 
2050                         if (!flkeys) {
2051 				skb_flow_dissect_flow_keys(skb, &keys, flag);
2052 				flkeys = &keys;
2053 			}
2054 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2055 			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
2056 			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
2057 			hash_keys.ports.src = flkeys->ports.src;
2058 			hash_keys.ports.dst = flkeys->ports.dst;
2059 			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2060 		} else {
2061 			memset(&hash_keys, 0, sizeof(hash_keys));
2062 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2063 			hash_keys.addrs.v6addrs.src = fl6->saddr;
2064 			hash_keys.addrs.v6addrs.dst = fl6->daddr;
2065 			hash_keys.ports.src = fl6->fl6_sport;
2066 			hash_keys.ports.dst = fl6->fl6_dport;
2067 			hash_keys.basic.ip_proto = fl6->flowi6_proto;
2068 		}
2069 		break;
2070 	}
2071 	mhash = flow_hash_from_keys(&hash_keys);
2072 
2073 	return mhash >> 1;
2074 }
2075 
/* Route an incoming packet: build the flow from the IPv6 header, compute
 * a multipath hash for ICMPv6 (so errors follow the flow they refer to)
 * and attach the looked-up dst to the skb.
 */
void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};
	struct flow_keys *flkeys = NULL, _flkeys;

	/* key the lookup on the tunnel id for RX metadata tunnels */
	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;

	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
		flkeys = &_flkeys;

	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
	skb_dst_drop(skb);
	skb_dst_set(skb,
		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
}
2105 
/* Output-path adapter for ip6_pol_route(): uses the outgoing interface */
static struct rt6_info *ip6_pol_route_output(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
}
2114 
/* Output route lookup for locally generated traffic.  Multicast and
 * link-local destinations are first offered to the L3 master device;
 * otherwise the flow goes through the policy rules with interface and
 * source-address flags derived from @sk and @fl6.
 */
struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
					 struct flowi6 *fl6, int flags)
{
	bool any_src;

	if (ipv6_addr_type(&fl6->daddr) &
	    (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
		struct dst_entry *dst;

		dst = l3mdev_link_scope_lookup(net, fl6);
		if (dst)
			return dst;
	}

	fl6->flowi6_iif = LOOPBACK_IFINDEX;

	/* force a device match when the socket is bound to a device, the
	 * destination needs strict routing, or an oif is given without a
	 * source address
	 */
	any_src = ipv6_addr_any(&fl6->saddr);
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
	    (fl6->flowi6_oif && any_src))
		flags |= RT6_LOOKUP_F_IFACE;

	if (!any_src)
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	else if (sk)
		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);

	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);
2144 
/* Build a blackhole copy of @dst_orig: same addressing and metrics, but
 * input/output discard every packet.  Consumes the reference on
 * @dst_orig; returns ERR_PTR(-ENOMEM) on allocation failure.
 */
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct net_device *loopback_dev = net->loopback_dev;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
		       DST_OBSOLETE_DEAD, 0);
	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);

		new = &rt->dst;
		new->__use = 1;
		/* drop everything in both directions */
		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);

		rt->rt6i_idev = in6_dev_get(loopback_dev);
		rt->rt6i_gateway = ort->rt6i_gateway;
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
	}

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}
2177 
2178 /*
2179  *	Destination cache support functions
2180  */
2181 
2182 static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2183 {
2184 	u32 rt_cookie = 0;
2185 
2186 	if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2187 		return false;
2188 
2189 	if (fib6_check_expired(f6i))
2190 		return false;
2191 
2192 	return true;
2193 }
2194 
2195 static struct dst_entry *rt6_check(struct rt6_info *rt,
2196 				   struct fib6_info *from,
2197 				   u32 cookie)
2198 {
2199 	u32 rt_cookie = 0;
2200 
2201 	if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
2202 	    rt_cookie != cookie)
2203 		return NULL;
2204 
2205 	if (rt6_check_expired(rt))
2206 		return NULL;
2207 
2208 	return &rt->dst;
2209 }
2210 
2211 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2212 					    struct fib6_info *from,
2213 					    u32 cookie)
2214 {
2215 	if (!__rt6_check_expired(rt) &&
2216 	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2217 	    fib6_check(from, cookie))
2218 		return &rt->dst;
2219 	else
2220 		return NULL;
2221 }
2222 
/* dst_ops->check: decide whether a cached dst is still valid.
 * Per-cpu copies and uncached dsts are checked against their fib
 * entry; everything else goes through the plain cookie check.
 * Returns @dst if still valid, NULL if the caller must re-lookup.
 */
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct dst_entry *dst_ret;
	struct fib6_info *from;
	struct rt6_info *rt;

	rt = container_of(dst, struct rt6_info, dst);

	rcu_read_lock();

	/* All IPV6 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 */

	from = rcu_dereference(rt->from);

	if (from && (rt->rt6i_flags & RTF_PCPU ||
	    unlikely(!list_empty(&rt->rt6i_uncached))))
		dst_ret = rt6_dst_from_check(rt, from, cookie);
	else
		dst_ret = rt6_check(rt, from, cookie);

	rcu_read_unlock();

	return dst_ret;
}
2250 
/* dst_ops->negative_advice: the caller suspects this route is bad.
 * Expired exception (RTF_CACHE) routes are removed from the exception
 * table; other dsts are simply released.  Returns NULL when the dst
 * was dropped, otherwise the (still valid) dst.
 */
static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *) dst;

	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			rcu_read_lock();
			if (rt6_check_expired(rt)) {
				rt6_remove_exception_rt(rt);
				dst = NULL;
			}
			rcu_read_unlock();
		} else {
			dst_release(dst);
			dst = NULL;
		}
	}
	return dst;
}
2270 
/* dst_ops->link_failure: report an unreachable address to the sender
 * and invalidate the route that failed.  Exception routes are removed
 * outright; for default routes the fib node's serial number is
 * poisoned so the next lookup revalidates.
 */
static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
	if (rt) {
		rcu_read_lock();
		if (rt->rt6i_flags & RTF_CACHE) {
			rt6_remove_exception_rt(rt);
		} else {
			struct fib6_info *from;
			struct fib6_node *fn;

			from = rcu_dereference(rt->from);
			if (from) {
				fn = rcu_dereference(from->fib6_node);
				if (fn && (rt->rt6i_flags & RTF_DEFAULT))
					/* force revalidation of routes
					 * cached from this node
					 */
					fn->fn_sernum = -1;
			}
		}
		rcu_read_unlock();
	}
}
2296 
/* Arm the expiry timer on @rt0.  If the dst was not already expiring,
 * seed dst.expires from its fib entry first, then apply @timeout and
 * mark the route as expiring.
 */
static void rt6_update_expires(struct rt6_info *rt0, int timeout)
{
	if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
		struct fib6_info *from;

		rcu_read_lock();
		from = rcu_dereference(rt0->from);
		if (from)
			rt0->dst.expires = from->expires;
		rcu_read_unlock();
	}

	dst_set_expires(&rt0->dst, timeout);
	rt0->rt6i_flags |= RTF_EXPIRES;
}
2312 
/* Record a learned path MTU on @rt, flag it as modified and schedule
 * expiry after the sysctl-configured pmtu lifetime.
 */
static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
{
	struct net *net = dev_net(rt->dst.dev);

	dst_metric_set(&rt->dst, RTAX_MTU, mtu);
	rt->rt6i_flags |= RTF_MODIFIED;
	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
}
2321 
2322 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2323 {
2324 	return !(rt->rt6i_flags & RTF_CACHE) &&
2325 		(rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from));
2326 }
2327 
/* Core pmtu update.  Addresses are taken from @iph when given, else
 * from @sk, else the update is address-less.  If the route cannot take
 * the new mtu directly, a cached exception route is created instead.
 * Updates smaller than the current dst mtu only (after clamping to
 * IPV6_MIN_MTU); locked mtu metrics are never changed.
 */
static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu)
{
	const struct in6_addr *daddr, *saddr;
	struct rt6_info *rt6 = (struct rt6_info *)dst;

	if (dst_metric_locked(dst, RTAX_MTU))
		return;

	if (iph) {
		daddr = &iph->daddr;
		saddr = &iph->saddr;
	} else if (sk) {
		daddr = &sk->sk_v6_daddr;
		saddr = &inet6_sk(sk)->saddr;
	} else {
		daddr = NULL;
		saddr = NULL;
	}
	dst_confirm_neigh(dst, daddr);
	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
	if (mtu >= dst_mtu(dst))
		return;

	if (!rt6_cache_allowed_for_pmtu(rt6)) {
		rt6_do_update_pmtu(rt6, mtu);
		/* update rt6_ex->stamp for cache */
		if (rt6->rt6i_flags & RTF_CACHE)
			rt6_update_exception_stamp_rt(rt6);
	} else if (daddr) {
		/* clone the route into a per-destination exception that
		 * carries the learned mtu
		 */
		struct fib6_result res = {};
		struct rt6_info *nrt6;

		rcu_read_lock();
		res.f6i = rcu_dereference(rt6->from);
		if (!res.f6i) {
			rcu_read_unlock();
			return;
		}
		res.nh = &res.f6i->fib6_nh;
		nrt6 = ip6_rt_cache_alloc(&res, daddr, saddr);
		if (nrt6) {
			rt6_do_update_pmtu(nrt6, mtu);
			/* insertion failure means the clone is unused */
			if (rt6_insert_exception(nrt6, &res))
				dst_release_immediate(&nrt6->dst);
		}
		rcu_read_unlock();
	}
}
2377 
/* dst_ops->update_pmtu: thin wrapper passing the skb's IPv6 header
 * (when present) down to __ip6_rt_update_pmtu().
 */
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu)
{
	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
}
2383 
/* Apply a pmtu update for the flow described by @skb's IPv6 header.
 * @mtu is in network byte order (converted via ntohl below); a zero
 * @mark falls back to the netns reply-mark policy.
 */
void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
		     int oif, u32 mark, kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark),
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_uid = uid,
	};

	dst = ip6_route_output(net, NULL, &fl6);
	if (!dst->error)
		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2404 
2405 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2406 {
2407 	int oif = sk->sk_bound_dev_if;
2408 	struct dst_entry *dst;
2409 
2410 	if (!oif && skb->dev)
2411 		oif = l3mdev_master_ifindex(skb->dev);
2412 
2413 	ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid);
2414 
2415 	dst = __sk_dst_get(sk);
2416 	if (!dst || !dst->obsolete ||
2417 	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2418 		return;
2419 
2420 	bh_lock_sock(sk);
2421 	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2422 		ip6_datagram_dst_update(sk, false);
2423 	bh_unlock_sock(sk);
2424 }
2425 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2426 
/* Cache @dst on @sk.  The destination (and, with subtrees, the source)
 * address is recorded only when the flow address matches the socket's,
 * so a later lookup can tell whether the cached dst still applies.
 */
void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
			   const struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_SUBTREES
	struct ipv6_pinfo *np = inet6_sk(sk);
#endif

	ip6_dst_store(sk, dst,
		      ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
		      &sk->sk_v6_daddr : NULL,
#ifdef CONFIG_IPV6_SUBTREES
		      ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
		      &np->saddr :
#endif
		      NULL);
}
2443 
/* Check whether the nexthop in @res matches a received redirect:
 * same outgoing interface and gateway @gw.  If the fib gateway
 * differs, an exception-table entry with a matching gateway is also
 * accepted (and returned via @ret).  Returns true on a match.
 */
static bool ip6_redirect_nh_match(const struct fib6_result *res,
				  struct flowi6 *fl6,
				  const struct in6_addr *gw,
				  struct rt6_info **ret)
{
	const struct fib6_nh *nh = res->nh;

	if (nh->fib_nh_flags & RTNH_F_DEAD || !nh->fib_nh_gw_family ||
	    fl6->flowi6_oif != nh->fib_nh_dev->ifindex)
		return false;

	/* rt_cache's gateway might be different from its 'parent'
	 * in the case of an ip redirect.
	 * So we keep searching in the exception table if the gateway
	 * is different.
	 */
	if (!ipv6_addr_equal(gw, &nh->fib_nh_gw6)) {
		struct rt6_info *rt_cache;

		rt_cache = rt6_find_cached_rt(res, &fl6->daddr, &fl6->saddr);
		if (rt_cache &&
		    ipv6_addr_equal(gw, &rt_cache->rt6i_gateway)) {
			*ret = rt_cache;
			return true;
		}
		return false;
	}
	return true;
}
2473 
/* Handle redirects */
struct ip6rd_flowi {
	struct flowi6 fl6;		/* must stay first: lookups cast
					 * struct flowi6 * back to this
					 */
	struct in6_addr gateway;	/* address of the redirecting router */
};
2479 
/* fib6 rule lookup callback used when processing a received redirect:
 * find the route currently used for the destination and verify the
 * redirect came from its nexthop router (see RFC 4861).  Returns a
 * held rt6_info; reject routes map to the null entry.
 */
static struct rt6_info *__ip6_route_redirect(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
	struct rt6_info *ret = NULL;
	struct fib6_result res = {};
	struct fib6_info *rt;
	struct fib6_node *fn;

	/* Get the "current" route for this destination and
	 * check if the redirect has come from appropriate router.
	 *
	 * RFC 4861 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routes.
	 */

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	for_each_fib6_node_rt_rcu(fn) {
		res.f6i = rt;
		res.nh = &rt->fib6_nh;

		if (fib6_check_expired(rt))
			continue;
		if (rt->fib6_flags & RTF_REJECT)
			break;
		if (ip6_redirect_nh_match(&res, fl6, &rdfl->gateway, &ret))
			goto out;
	}

	if (!rt)
		rt = net->ipv6.fib6_null_entry;
	else if (rt->fib6_flags & RTF_REJECT) {
		ret = net->ipv6.ip6_null_entry;
		goto out;
	}

	/* no usable route in this node: walk back up the tree */
	if (rt == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

	res.f6i = rt;
	res.nh = &rt->fib6_nh;
out:
	if (ret)
		ip6_hold_safe(net, &ret);
	else
		ret = ip6_create_rt_rcu(&res);

	rcu_read_unlock();

	trace_fib6_table_lookup(net, &res, table, fl6);
	return ret;
};
2543 
2544 static struct dst_entry *ip6_route_redirect(struct net *net,
2545 					    const struct flowi6 *fl6,
2546 					    const struct sk_buff *skb,
2547 					    const struct in6_addr *gateway)
2548 {
2549 	int flags = RT6_LOOKUP_F_HAS_SADDR;
2550 	struct ip6rd_flowi rdfl;
2551 
2552 	rdfl.fl6 = *fl6;
2553 	rdfl.gateway = *gateway;
2554 
2555 	return fib6_rule_lookup(net, &rdfl.fl6, skb,
2556 				flags, __ip6_route_redirect);
2557 }
2558 
/* Process a redirect for the flow described by @skb's IPv6 header,
 * treating the packet's source as the redirecting router.
 */
void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
		  kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6 = {
		.flowi6_iif = LOOPBACK_IFINDEX,
		.flowi6_oif = oif,
		.flowi6_mark = mark,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_uid = uid,
	};

	dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_redirect);
2579 
/* Process a redirect when the original packet header is unavailable:
 * take the target from the ICMPv6 redirect message itself and the
 * redirected destination from the outer header.
 */
void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
	struct dst_entry *dst;
	struct flowi6 fl6 = {
		.flowi6_iif = LOOPBACK_IFINDEX,
		.flowi6_oif = oif,
		.daddr = msg->dest,
		.saddr = iph->daddr,
		.flowi6_uid = sock_net_uid(net, NULL),
	};

	dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}
2597 
/* Socket convenience wrapper: process a redirect using the socket's
 * netns, bound device, mark and uid.
 */
void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
		     sk->sk_uid);
}
EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2604 
2605 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2606 {
2607 	struct net_device *dev = dst->dev;
2608 	unsigned int mtu = dst_mtu(dst);
2609 	struct net *net = dev_net(dev);
2610 
2611 	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2612 
2613 	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2614 		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2615 
2616 	/*
2617 	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2618 	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2619 	 * IPV6_MAXPLEN is also valid and means: "any MSS,
2620 	 * rely only on pmtu discovery"
2621 	 */
2622 	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2623 		mtu = IPV6_MAXPLEN;
2624 	return mtu;
2625 }
2626 
/* dst_ops->mtu: prefer an explicit RTAX_MTU metric, else the egress
 * device's IPv6 mtu, clamped to IP6_MAX_MTU and reduced by any
 * lwtunnel encapsulation headroom.
 */
static unsigned int ip6_mtu(const struct dst_entry *dst)
{
	struct inet6_dev *idev;
	unsigned int mtu;

	mtu = dst_metric_raw(dst, RTAX_MTU);
	if (mtu)
		goto out;

	/* fallback if the device has no inet6_dev */
	mtu = IPV6_MIN_MTU;

	rcu_read_lock();
	idev = __in6_dev_get(dst->dev);
	if (idev)
		mtu = idev->cnf.mtu6;
	rcu_read_unlock();

out:
	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}
2649 
2650 /* MTU selection:
2651  * 1. mtu on route is locked - use it
2652  * 2. mtu from nexthop exception
2653  * 3. mtu from egress device
2654  *
2655  * based on ip6_dst_mtu_forward and exception logic of
2656  * rt6_find_cached_rt; called with rcu_read_lock
2657  */
u32 ip6_mtu_from_fib6(const struct fib6_result *res,
		      const struct in6_addr *daddr,
		      const struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	const struct fib6_nh *nh = res->nh;
	struct fib6_info *f6i = res->f6i;
	const struct in6_addr *src_key;
	struct rt6_exception *rt6_ex;
	struct inet6_dev *idev;
	u32 mtu = 0;

	/* 1. locked mtu metric on the route wins */
	if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
		mtu = f6i->fib6_pmtu;
		if (mtu)
			goto out;
	}

	/* exceptions are keyed by source only for source-specific routes */
	src_key = NULL;
#ifdef CONFIG_IPV6_SUBTREES
	if (f6i->fib6_src.plen)
		src_key = saddr;
#endif

	/* 2. mtu recorded on an unexpired nexthop exception */
	bucket = rcu_dereference(f6i->rt6i_exception_bucket);
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU);

	/* 3. fall back to the egress device's IPv6 mtu */
	if (likely(!mtu)) {
		struct net_device *dev = nh->fib_nh_dev;

		mtu = IPV6_MIN_MTU;
		idev = __in6_dev_get(dev);
		if (idev && idev->cnf.mtu6 > mtu)
			mtu = idev->cnf.mtu6;
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
out:
	return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
}
2700 
/* Allocate a standalone (uncached) dst for sending an ICMPv6 packet
 * towards fl6->daddr via @dev, then run it through xfrm.  Returns the
 * resulting dst or an ERR_PTR on failure.
 */
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
				  struct flowi6 *fl6)
{
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct inet6_dev *idev = in6_dev_get(dev);
	struct net *net = dev_net(dev);

	if (unlikely(!idev))
		return ERR_PTR(-ENODEV);

	rt = ip6_dst_alloc(net, dev, 0);
	if (unlikely(!rt)) {
		in6_dev_put(idev);
		dst = ERR_PTR(-ENOMEM);
		goto out;
	}

	rt->dst.flags |= DST_HOST;
	rt->dst.input = ip6_input;
	rt->dst.output  = ip6_output;
	rt->rt6i_gateway  = fl6->daddr;
	rt->rt6i_dst.addr = fl6->daddr;
	rt->rt6i_dst.plen = 128;
	rt->rt6i_idev     = idev;
	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);

	/* Add this dst into uncached_list so that rt6_disable_ip() can
	 * do proper release of the net_device
	 */
	rt6_uncached_list_add(rt);
	atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);

	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);

out:
	return dst;
}
2739 
/* dst_ops->gc: garbage-collect IPv6 dst entries.  Skips the run if the
 * minimum interval has not elapsed and we are under the size cap;
 * otherwise runs fib6 gc with an adaptive expire interval.  Returns
 * nonzero when the table is still over rt_max_size.
 */
static int ip6_dst_gc(struct dst_ops *ops)
{
	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
	int entries;

	entries = dst_entries_get_fast(ops);
	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
	    entries <= rt_max_size)
		goto out;

	/* grow the expire interval each time gc runs back-to-back */
	net->ipv6.ip6_rt_gc_expire++;
	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
	entries = dst_entries_get_slow(ops);
	if (entries < ops->gc_thresh)
		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
out:
	/* decay the expire interval according to gc elasticity */
	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
	return entries > rt_max_size;
}
2764 
/* Look up the route to @gw_addr in table @tbid on behalf of nexthop
 * validation.  Returns a held rt6_info on success, NULL when the table
 * does not exist or the lookup only hit the null entry.
 */
static struct rt6_info *ip6_nh_lookup_table(struct net *net,
					    struct fib6_config *cfg,
					    const struct in6_addr *gw_addr,
					    u32 tbid, int flags)
{
	struct flowi6 fl6 = {
		.flowi6_oif = cfg->fc_ifindex,
		.daddr = *gw_addr,
		.saddr = cfg->fc_prefsrc,
	};
	struct fib6_table *table;
	struct rt6_info *rt;

	table = fib6_get_table(net, tbid);
	if (!table)
		return NULL;

	if (!ipv6_addr_any(&cfg->fc_prefsrc))
		flags |= RT6_LOOKUP_F_HAS_SADDR;

	flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
	rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);

	/* if table lookup failed, fall back to full lookup */
	if (rt == net->ipv6.ip6_null_entry) {
		ip6_rt_put(rt);
		rt = NULL;
	}

	return rt;
}
2796 
/* Validate an RTNH_F_ONLINK nexthop: the gateway must not resolve to a
 * local/anycast/reject route or to a different device in the device's
 * fib table (the default route is ignored as a match).  Returns 0 if
 * acceptable, -EINVAL with an extack message otherwise.
 */
static int ip6_route_check_nh_onlink(struct net *net,
				     struct fib6_config *cfg,
				     const struct net_device *dev,
				     struct netlink_ext_ack *extack)
{
	u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
	struct fib6_info *from;
	struct rt6_info *grt;
	int err;

	err = 0;
	grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
	if (grt) {
		rcu_read_lock();
		from = rcu_dereference(grt->from);
		if (!grt->dst.error &&
		    /* ignore match if it is the default route */
		    from && !ipv6_addr_any(&from->fib6_dst.addr) &&
		    (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop has invalid gateway or device mismatch");
			err = -EINVAL;
		}
		rcu_read_unlock();

		ip6_rt_put(grt);
	}

	return err;
}
2829 
/* Validate a (non-onlink) gateway nexthop by looking up the route to
 * it.  If no device was given, resolve and take references on the one
 * found (*_dev/*idev are filled in).  The gateway must be directly
 * reachable, i.e. not behind another gateway.  Returns 0 on success,
 * -EHOSTUNREACH otherwise.
 */
static int ip6_route_check_nh(struct net *net,
			      struct fib6_config *cfg,
			      struct net_device **_dev,
			      struct inet6_dev **idev)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	struct net_device *dev = _dev ? *_dev : NULL;
	struct rt6_info *grt = NULL;
	int err = -EHOSTUNREACH;

	/* prefer a lookup in the table the route is being added to */
	if (cfg->fc_table) {
		int flags = RT6_LOOKUP_F_IFACE;

		grt = ip6_nh_lookup_table(net, cfg, gw_addr,
					  cfg->fc_table, flags);
		if (grt) {
			if (grt->rt6i_flags & RTF_GATEWAY ||
			    (dev && dev != grt->dst.dev)) {
				ip6_rt_put(grt);
				grt = NULL;
			}
		}
	}

	if (!grt)
		grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);

	if (!grt)
		goto out;

	if (dev) {
		if (dev != grt->dst.dev) {
			ip6_rt_put(grt);
			goto out;
		}
	} else {
		/* adopt the device/idev found by the lookup */
		*_dev = dev = grt->dst.dev;
		*idev = grt->rt6i_idev;
		dev_hold(dev);
		in6_dev_hold(grt->rt6i_idev);
	}

	if (!(grt->rt6i_flags & RTF_GATEWAY))
		err = 0;

	ip6_rt_put(grt);

out:
	return err;
}
2880 
/* Validate the gateway of a route being added: it must not be a local
 * address, and non link-local gateways additionally need a reachability
 * check (onlink or via a route lookup, which may resolve *_dev/*idev).
 * Returns 0 on success, a negative errno with extack set otherwise.
 */
static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
			   struct net_device **_dev, struct inet6_dev **idev,
			   struct netlink_ext_ack *extack)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	int gwa_type = ipv6_addr_type(gw_addr);
	bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
	const struct net_device *dev = *_dev;
	bool need_addr_check = !dev;
	int err = -EINVAL;

	/* if gw_addr is local we will fail to detect this in case
	 * address is still TENTATIVE (DAD in progress). rt6_lookup()
	 * will return already-added prefix route via interface that
	 * prefix route was assigned to, which might be non-loopback.
	 */
	if (dev &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
		/* IPv6 strictly inhibits using not link-local
		 * addresses as nexthop address.
		 * Otherwise, router will not able to send redirects.
		 * It is very good, but in some (rare!) circumstances
		 * (SIT, PtP, NBMA NOARP links) it is handy to allow
		 * some exceptions. --ANK
		 * We allow IPv4-mapped nexthops to support RFC4798-type
		 * addressing
		 */
		if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
			NL_SET_ERR_MSG(extack, "Invalid gateway address");
			goto out;
		}

		if (cfg->fc_flags & RTNH_F_ONLINK)
			err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
		else
			err = ip6_route_check_nh(net, cfg, _dev, idev);

		if (err)
			goto out;
	}

	/* reload in case device was changed */
	dev = *_dev;

	err = -EINVAL;
	if (!dev) {
		NL_SET_ERR_MSG(extack, "Egress device not specified");
		goto out;
	} else if (dev->flags & IFF_LOOPBACK) {
		NL_SET_ERR_MSG(extack,
			       "Egress device can not be loopback device for this route");
		goto out;
	}

	/* if we did not check gw_addr above, do so now that the
	 * egress device has been resolved.
	 */
	if (need_addr_check &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	err = 0;
out:
	return err;
}
2953 
2954 static bool fib6_is_reject(u32 flags, struct net_device *dev, int addr_type)
2955 {
2956 	if ((flags & RTF_REJECT) ||
2957 	    (dev && (dev->flags & IFF_LOOPBACK) &&
2958 	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
2959 	     !(flags & RTF_LOCAL)))
2960 		return true;
2961 
2962 	return false;
2963 }
2964 
/* Initialize @fib6_nh from the user-supplied route config: resolve the
 * nexthop device, validate any gateway, set nexthop flags and lwtunnel
 * state.  On failure every reference taken here is dropped and the
 * lwtstate is cleared.  Returns 0 or a negative errno (extack set).
 */
int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
		 struct fib6_config *cfg, gfp_t gfp_flags,
		 struct netlink_ext_ack *extack)
{
	struct net_device *dev = NULL;
	struct inet6_dev *idev = NULL;
	int addr_type;
	int err;

	fib6_nh->fib_nh_family = AF_INET6;

	err = -ENODEV;
	if (cfg->fc_ifindex) {
		dev = dev_get_by_index(net, cfg->fc_ifindex);
		if (!dev)
			goto out;
		idev = in6_dev_get(dev);
		if (!idev)
			goto out;
	}

	if (cfg->fc_flags & RTNH_F_ONLINK) {
		if (!dev) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop device required for onlink");
			goto out;
		}

		if (!(dev->flags & IFF_UP)) {
			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
			err = -ENETDOWN;
			goto out;
		}

		fib6_nh->fib_nh_flags |= RTNH_F_ONLINK;
	}

	fib6_nh->fib_nh_weight = 1;

	/* We cannot add true routes via loopback here,
	 * they would result in kernel looping; promote them to reject routes
	 */
	addr_type = ipv6_addr_type(&cfg->fc_dst);
	if (fib6_is_reject(cfg->fc_flags, dev, addr_type)) {
		/* hold loopback dev/idev if we haven't done so. */
		if (dev != net->loopback_dev) {
			if (dev) {
				dev_put(dev);
				in6_dev_put(idev);
			}
			dev = net->loopback_dev;
			dev_hold(dev);
			idev = in6_dev_get(dev);
			if (!idev) {
				err = -ENODEV;
				goto out;
			}
		}
		/* reject routes skip gateway/device validation */
		goto set_dev;
	}

	if (cfg->fc_flags & RTF_GATEWAY) {
		/* may resolve dev/idev when none was given */
		err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
		if (err)
			goto out;

		fib6_nh->fib_nh_gw6 = cfg->fc_gateway;
		fib6_nh->fib_nh_gw_family = AF_INET6;
	}

	err = -ENODEV;
	if (!dev)
		goto out;

	if (idev->cnf.disable_ipv6) {
		NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
		err = -EACCES;
		goto out;
	}

	if (!(dev->flags & IFF_UP) && !cfg->fc_ignore_dev_down) {
		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
		err = -ENETDOWN;
		goto out;
	}

	if (!(cfg->fc_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
	    !netif_carrier_ok(dev))
		fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;

	err = fib_nh_common_init(&fib6_nh->nh_common, cfg->fc_encap,
				 cfg->fc_encap_type, cfg, gfp_flags, extack);
	if (err)
		goto out;
set_dev:
	fib6_nh->fib_nh_dev = dev;
	fib6_nh->fib_nh_oif = dev->ifindex;
	err = 0;
out:
	if (idev)
		in6_dev_put(idev);

	if (err) {
		lwtstate_put(fib6_nh->fib_nh_lws);
		fib6_nh->fib_nh_lws = NULL;
		if (dev)
			dev_put(dev);
	}

	return err;
}
3076 
/* Release all state held by a fib6_nh (counterpart of fib6_nh_init). */
void fib6_nh_release(struct fib6_nh *fib6_nh)
{
	fib_nh_common_release(&fib6_nh->nh_common);
}
3081 
/* Build a fib6_info from a route config: validate the config, find or
 * create the target table, allocate the entry and initialize metrics,
 * expiry, keys, nexthop and preferred source.  Returns the new entry
 * or an ERR_PTR; the entry is not yet inserted into the table.
 */
static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
					      gfp_t gfp_flags,
					      struct netlink_ext_ack *extack)
{
	struct net *net = cfg->fc_nlinfo.nl_net;
	struct fib6_info *rt = NULL;
	struct fib6_table *table;
	int err = -EINVAL;
	int addr_type;

	/* RTF_PCPU is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_PCPU) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
		goto out;
	}

	/* RTF_CACHE is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_CACHE) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
		goto out;
	}

	if (cfg->fc_type > RTN_MAX) {
		NL_SET_ERR_MSG(extack, "Invalid route type");
		goto out;
	}

	if (cfg->fc_dst_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid prefix length");
		goto out;
	}
	if (cfg->fc_src_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid source address length");
		goto out;
	}
#ifndef CONFIG_IPV6_SUBTREES
	if (cfg->fc_src_len) {
		NL_SET_ERR_MSG(extack,
			       "Specifying source address requires IPV6_SUBTREES to be enabled");
		goto out;
	}
#endif

	err = -ENOBUFS;
	if (cfg->fc_nlinfo.nlh &&
	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
		table = fib6_get_table(net, cfg->fc_table);
		if (!table) {
			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
			table = fib6_new_table(net, cfg->fc_table);
		}
	} else {
		table = fib6_new_table(net, cfg->fc_table);
	}

	if (!table)
		goto out;

	err = -ENOMEM;
	rt = fib6_info_alloc(gfp_flags);
	if (!rt)
		goto out;

	rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len,
					       extack);
	if (IS_ERR(rt->fib6_metrics)) {
		err = PTR_ERR(rt->fib6_metrics);
		/* Do not leave garbage there. */
		rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
		goto out;
	}

	if (cfg->fc_flags & RTF_ADDRCONF)
		rt->dst_nocount = true;

	if (cfg->fc_flags & RTF_EXPIRES)
		fib6_set_expires(rt, jiffies +
				clock_t_to_jiffies(cfg->fc_expires));
	else
		fib6_clean_expires(rt);

	if (cfg->fc_protocol == RTPROT_UNSPEC)
		cfg->fc_protocol = RTPROT_BOOT;
	rt->fib6_protocol = cfg->fc_protocol;

	rt->fib6_table = table;
	rt->fib6_metric = cfg->fc_metric;
	rt->fib6_type = cfg->fc_type;
	/* gateway flag is re-established by fib6_nh_init() below */
	rt->fib6_flags = cfg->fc_flags & ~RTF_GATEWAY;

	ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
	rt->fib6_dst.plen = cfg->fc_dst_len;
	if (rt->fib6_dst.plen == 128)
		rt->dst_host = true;

#ifdef CONFIG_IPV6_SUBTREES
	ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
	rt->fib6_src.plen = cfg->fc_src_len;
#endif
	err = fib6_nh_init(net, &rt->fib6_nh, cfg, gfp_flags, extack);
	if (err)
		goto out;

	/* We cannot add true routes via loopback here,
	 * they would result in kernel looping; promote them to reject routes
	 */
	addr_type = ipv6_addr_type(&cfg->fc_dst);
	if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh.fib_nh_dev, addr_type))
		rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP;

	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
		struct net_device *dev = fib6_info_nh_dev(rt);

		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
			NL_SET_ERR_MSG(extack, "Invalid source address");
			err = -EINVAL;
			goto out;
		}
		rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
		rt->fib6_prefsrc.plen = 128;
	} else
		rt->fib6_prefsrc.plen = 0;

	return rt;
out:
	fib6_info_release(rt);
	return ERR_PTR(err);
}
3210 
/* Create a fib6_info from @cfg and insert it into its table.  The
 * local reference is dropped afterwards; the table holds its own.
 * Returns 0 or a negative errno.
 */
int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
		  struct netlink_ext_ack *extack)
{
	struct fib6_info *rt;
	int err;

	rt = ip6_route_info_create(cfg, gfp_flags, extack);
	if (IS_ERR(rt))
		return PTR_ERR(rt);

	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
	fib6_info_release(rt);

	return err;
}
3226 
/* Delete @rt from its table under the table lock.  The null entry can
 * not be deleted (-ENOENT).  Consumes the caller's reference on @rt.
 */
static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
{
	struct net *net = info->nl_net;
	struct fib6_table *table;
	int err;

	if (rt == net->ipv6.fib6_null_entry) {
		err = -ENOENT;
		goto out;
	}

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_del(rt, info);
	spin_unlock_bh(&table->tb6_lock);

out:
	fib6_info_release(rt);
	return err;
}
3247 
/* Public wrapper around __ip6_del_rt() with a minimal nl_info.
 * Consumes the caller's reference on @rt.
 */
int ip6_del_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = { .nl_net = net };

	return __ip6_del_rt(rt, &info);
}
3254 
/* Delete @rt and, when requested, all of its multipath siblings in one
 * table-locked pass, emitting a single combined netlink notification.
 * Consumes the caller's reference on @rt.
 */
static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
{
	struct nl_info *info = &cfg->fc_nlinfo;
	struct net *net = info->nl_net;
	struct sk_buff *skb = NULL;
	struct fib6_table *table;
	int err = -ENOENT;

	if (rt == net->ipv6.fib6_null_entry)
		goto out_put;
	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);

	if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
		struct fib6_info *sibling, *next_sibling;

		/* prefer to send a single notification with all hops */
		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
		if (skb) {
			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;

			if (rt6_fill_node(net, skb, rt, NULL,
					  NULL, NULL, 0, RTM_DELROUTE,
					  info->portid, seq, 0) < 0) {
				kfree_skb(skb);
				skb = NULL;
			} else
				/* suppress per-route notifications */
				info->skip_notify = 1;
		}

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings,
					 fib6_siblings) {
			err = fib6_del(sibling, info);
			if (err)
				goto out_unlock;
		}
	}

	err = fib6_del(rt, info);
out_unlock:
	spin_unlock_bh(&table->tb6_lock);
out_put:
	fib6_info_release(rt);

	if (skb) {
		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
			    info->nlh, gfp_any());
	}
	return err;
}
3306 
3307 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3308 {
3309 	int rc = -ESRCH;
3310 
3311 	if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3312 		goto out;
3313 
3314 	if (cfg->fc_flags & RTF_GATEWAY &&
3315 	    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3316 		goto out;
3317 
3318 	rc = rt6_remove_exception_rt(rt);
3319 out:
3320 	return rc;
3321 }
3322 
/* Delete the route matching @cfg: destination/source prefix plus the
 * optional ifindex, gateway, metric and protocol filters.  When
 * RTF_CACHE is set, only a matching entry in the exception
 * (cached-route) table is removed.  Returns 0 on success or a negative
 * errno (-ESRCH when nothing matches).
 */
static int ip6_route_del(struct fib6_config *cfg,
			 struct netlink_ext_ack *extack)
{
	struct rt6_info *rt_cache;
	struct fib6_table *table;
	struct fib6_info *rt;
	struct fib6_node *fn;
	int err = -ESRCH;

	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
	if (!table) {
		NL_SET_ERR_MSG(extack, "FIB table does not exist");
		return err;
	}

	rcu_read_lock();

	/* last argument is relaxed for RTF_CACHE lookups — presumably an
	 * exact-match flag; TODO(review) confirm against fib6_locate()
	 */
	fn = fib6_locate(&table->tb6_root,
			 &cfg->fc_dst, cfg->fc_dst_len,
			 &cfg->fc_src, cfg->fc_src_len,
			 !(cfg->fc_flags & RTF_CACHE));

	if (fn) {
		/* the iterator macro sets 'rt' for each route on the node */
		for_each_fib6_node_rt_rcu(fn) {
			struct fib6_nh *nh;

			if (cfg->fc_flags & RTF_CACHE) {
				struct fib6_result res = {
					.f6i = rt,
				};
				int rc;

				rt_cache = rt6_find_cached_rt(&res,
							      &cfg->fc_dst,
							      &cfg->fc_src);
				if (rt_cache) {
					rc = ip6_del_cached_rt(rt_cache, cfg);
					if (rc != -ESRCH) {
						rcu_read_unlock();
						return rc;
					}
				}
				continue;
			}

			nh = &rt->fib6_nh;
			if (cfg->fc_ifindex &&
			    (!nh->fib_nh_dev ||
			     nh->fib_nh_dev->ifindex != cfg->fc_ifindex))
				continue;
			if (cfg->fc_flags & RTF_GATEWAY &&
			    !ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6))
				continue;
			if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
				continue;
			if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
				continue;
			/* take a reference before dropping RCU */
			if (!fib6_info_hold_safe(rt))
				continue;
			rcu_read_unlock();

			/* if gateway was specified only delete the one hop */
			if (cfg->fc_flags & RTF_GATEWAY)
				return __ip6_del_rt(rt, &cfg->fc_nlinfo);

			return __ip6_del_rt_siblings(rt, cfg);
		}
	}
	rcu_read_unlock();

	return err;
}
3395 
/* Process an ICMPv6 Redirect carried in @skb against the dst entry the
 * packet was routed with: validate the message (see the RFC2461 8.1
 * note below), update the neighbour cache for the new first hop, and
 * install a cached (exception) route towards the redirected
 * destination.
 */
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct netevent_redirect netevent;
	struct rt6_info *rt, *nrt = NULL;
	struct fib6_result res = {};
	struct ndisc_options ndopts;
	struct inet6_dev *in6_dev;
	struct neighbour *neigh;
	struct rd_msg *msg;
	int optlen, on_link;
	u8 *lladdr;

	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
	optlen -= sizeof(*msg);

	if (optlen < 0) {
		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
		return;
	}

	msg = (struct rd_msg *)icmp6_hdr(skb);

	if (ipv6_addr_is_multicast(&msg->dest)) {
		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
		return;
	}

	/* target == dest means redirect to an on-link destination */
	on_link = 0;
	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
		on_link = 1;
	} else if (ipv6_addr_type(&msg->target) !=
		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
		return;
	}

	in6_dev = __in6_dev_get(skb->dev);
	if (!in6_dev)
		return;
	/* forwarding nodes and interfaces that refuse redirects ignore it */
	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
		return;

	/* RFC2461 8.1:
	 *	The IP source address of the Redirect MUST be the same as the current
	 *	first-hop router for the specified ICMP Destination Address.
	 */

	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
		return;
	}

	lladdr = NULL;
	if (ndopts.nd_opts_tgt_lladdr) {
		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
					     skb->dev);
		if (!lladdr) {
			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
			return;
		}
	}

	rt = (struct rt6_info *) dst;
	if (rt->rt6i_flags & RTF_REJECT) {
		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
		return;
	}

	/* Redirect received -> path was valid.
	 * Look, redirects are sent only in response to data packets,
	 * so that this nexthop apparently is reachable. --ANK
	 */
	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);

	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
	if (!neigh)
		return;

	/*
	 *	We have finally decided to accept it.
	 */

	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
		     NEIGH_UPDATE_F_OVERRIDE|
		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
				     NEIGH_UPDATE_F_ISROUTER)),
		     NDISC_REDIRECT, &ndopts);

	rcu_read_lock();
	res.f6i = rcu_dereference(rt->from);
	/* This fib6_info_hold() is safe here because we hold reference to rt
	 * and rt already holds reference to fib6_info.
	 */
	fib6_info_hold(res.f6i);
	rcu_read_unlock();

	res.nh = &res.f6i->fib6_nh;
	nrt = ip6_rt_cache_alloc(&res, &msg->dest, NULL);
	if (!nrt)
		goto out;

	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
	if (on_link)
		nrt->rt6i_flags &= ~RTF_GATEWAY;

	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;

	/* No need to remove rt from the exception table if rt is
	 * a cached route because rt6_insert_exception() will
	 * takes care of it
	 */
	if (rt6_insert_exception(nrt, &res)) {
		dst_release_immediate(&nrt->dst);
		goto out;
	}

	netevent.old = &rt->dst;
	netevent.new = &nrt->dst;
	netevent.daddr = &msg->dest;
	netevent.neigh = neigh;
	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);

out:
	fib6_info_release(res.f6i);
	neigh_release(neigh);
}
3523 
3524 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Look up an RTF_ROUTEINFO route for @prefix/@prefixlen learned via
 * gateway @gwaddr on @dev.  Returns the route with a reference held,
 * or NULL when no such route exists.
 */
static struct fib6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev)
{
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
	int ifindex = dev->ifindex;
	struct fib6_node *fn;
	struct fib6_info *rt = NULL;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
	if (!fn)
		goto out;

	/* the iterator macro sets 'rt' for each route on the node */
	for_each_fib6_node_rt_rcu(fn) {
		if (rt->fib6_nh.fib_nh_dev->ifindex != ifindex)
			continue;
		if (!(rt->fib6_flags & RTF_ROUTEINFO) ||
		    !rt->fib6_nh.fib_nh_gw_family)
			continue;
		if (!ipv6_addr_equal(&rt->fib6_nh.fib_nh_gw6, gwaddr))
			continue;
		/* take a reference before leaving the RCU section */
		if (!fib6_info_hold_safe(rt))
			continue;
		break;
	}
out:
	rcu_read_unlock();
	return rt;
}
3561 
3562 static struct fib6_info *rt6_add_route_info(struct net *net,
3563 					   const struct in6_addr *prefix, int prefixlen,
3564 					   const struct in6_addr *gwaddr,
3565 					   struct net_device *dev,
3566 					   unsigned int pref)
3567 {
3568 	struct fib6_config cfg = {
3569 		.fc_metric	= IP6_RT_PRIO_USER,
3570 		.fc_ifindex	= dev->ifindex,
3571 		.fc_dst_len	= prefixlen,
3572 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3573 				  RTF_UP | RTF_PREF(pref),
3574 		.fc_protocol = RTPROT_RA,
3575 		.fc_type = RTN_UNICAST,
3576 		.fc_nlinfo.portid = 0,
3577 		.fc_nlinfo.nlh = NULL,
3578 		.fc_nlinfo.nl_net = net,
3579 	};
3580 
3581 	cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3582 	cfg.fc_dst = *prefix;
3583 	cfg.fc_gateway = *gwaddr;
3584 
3585 	/* We should treat it as a default route if prefix length is 0. */
3586 	if (!prefixlen)
3587 		cfg.fc_flags |= RTF_DEFAULT;
3588 
3589 	ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3590 
3591 	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3592 }
3593 #endif
3594 
/* Find the RA-learned default-router route (RTF_ADDRCONF|RTF_DEFAULT)
 * through gateway @addr on @dev.  Returns it with a reference held, or
 * NULL.
 */
struct fib6_info *rt6_get_dflt_router(struct net *net,
				     const struct in6_addr *addr,
				     struct net_device *dev)
{
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
	struct fib6_info *rt;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	/* the iterator macro sets 'rt'; it stays non-NULL on break */
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		struct fib6_nh *nh = &rt->fib6_nh;

		if (dev == nh->fib_nh_dev &&
		    ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
		    ipv6_addr_equal(&nh->fib_nh_gw6, addr))
			break;
	}
	/* take a reference before leaving the RCU section */
	if (rt && !fib6_info_hold_safe(rt))
		rt = NULL;
	rcu_read_unlock();
	return rt;
}
3621 
/* Install an RA-learned default router route (RTF_ADDRCONF|RTF_DEFAULT,
 * RTPROT_RA) through @gwaddr on @dev and return it with a reference
 * held (NULL if neither the add nor a prior matching route exists).
 */
struct fib6_info *rt6_add_dflt_router(struct net *net,
				     const struct in6_addr *gwaddr,
				     struct net_device *dev,
				     unsigned int pref)
{
	struct fib6_config cfg = {
		.fc_table	= l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
		.fc_metric	= IP6_RT_PRIO_USER,
		.fc_ifindex	= dev->ifindex,
		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
		.fc_protocol = RTPROT_RA,
		.fc_type = RTN_UNICAST,
		.fc_nlinfo.portid = 0,
		.fc_nlinfo.nlh = NULL,
		.fc_nlinfo.nl_net = net,
	};

	cfg.fc_gateway = *gwaddr;

	if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
		struct fib6_table *table;

		/* NOTE(review): uses dev_net(dev) while the rest of this
		 * function uses @net — presumably identical; confirm callers
		 */
		table = fib6_get_table(dev_net(dev), cfg.fc_table);
		if (table)
			table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
	}

	return rt6_get_dflt_router(net, gwaddr, dev);
}
3652 
/* Remove every RA-learned default-router route from @table, except on
 * interfaces with accept_ra == 2.  The walk restarts after each
 * deletion because the RCU read lock must be dropped to call
 * ip6_del_rt().
 */
static void __rt6_purge_dflt_routers(struct net *net,
				     struct fib6_table *table)
{
	struct fib6_info *rt;

restart:
	rcu_read_lock();
	/* the iterator macro sets 'rt' for each route on the root node */
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;

		if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
		    (!idev || idev->cnf.accept_ra != 2) &&
		    fib6_info_hold_safe(rt)) {
			rcu_read_unlock();
			/* ip6_del_rt() consumes the reference taken above */
			ip6_del_rt(net, rt);
			goto restart;
		}
	}
	rcu_read_unlock();

	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
}
3676 
3677 void rt6_purge_dflt_routers(struct net *net)
3678 {
3679 	struct fib6_table *table;
3680 	struct hlist_head *head;
3681 	unsigned int h;
3682 
3683 	rcu_read_lock();
3684 
3685 	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3686 		head = &net->ipv6.fib_table_hash[h];
3687 		hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3688 			if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3689 				__rt6_purge_dflt_routers(net, table);
3690 		}
3691 	}
3692 
3693 	rcu_read_unlock();
3694 }
3695 
3696 static void rtmsg_to_fib6_config(struct net *net,
3697 				 struct in6_rtmsg *rtmsg,
3698 				 struct fib6_config *cfg)
3699 {
3700 	*cfg = (struct fib6_config){
3701 		.fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3702 			 : RT6_TABLE_MAIN,
3703 		.fc_ifindex = rtmsg->rtmsg_ifindex,
3704 		.fc_metric = rtmsg->rtmsg_metric ? : IP6_RT_PRIO_USER,
3705 		.fc_expires = rtmsg->rtmsg_info,
3706 		.fc_dst_len = rtmsg->rtmsg_dst_len,
3707 		.fc_src_len = rtmsg->rtmsg_src_len,
3708 		.fc_flags = rtmsg->rtmsg_flags,
3709 		.fc_type = rtmsg->rtmsg_type,
3710 
3711 		.fc_nlinfo.nl_net = net,
3712 
3713 		.fc_dst = rtmsg->rtmsg_dst,
3714 		.fc_src = rtmsg->rtmsg_src,
3715 		.fc_gateway = rtmsg->rtmsg_gateway,
3716 	};
3717 }
3718 
3719 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3720 {
3721 	struct fib6_config cfg;
3722 	struct in6_rtmsg rtmsg;
3723 	int err;
3724 
3725 	switch (cmd) {
3726 	case SIOCADDRT:		/* Add a route */
3727 	case SIOCDELRT:		/* Delete a route */
3728 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3729 			return -EPERM;
3730 		err = copy_from_user(&rtmsg, arg,
3731 				     sizeof(struct in6_rtmsg));
3732 		if (err)
3733 			return -EFAULT;
3734 
3735 		rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3736 
3737 		rtnl_lock();
3738 		switch (cmd) {
3739 		case SIOCADDRT:
3740 			err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3741 			break;
3742 		case SIOCDELRT:
3743 			err = ip6_route_del(&cfg, NULL);
3744 			break;
3745 		default:
3746 			err = -EINVAL;
3747 		}
3748 		rtnl_unlock();
3749 
3750 		return err;
3751 	}
3752 
3753 	return -EINVAL;
3754 }
3755 
3756 /*
3757  *	Drop the packet on the floor
3758  */
3759 
3760 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3761 {
3762 	int type;
3763 	struct dst_entry *dst = skb_dst(skb);
3764 	switch (ipstats_mib_noroutes) {
3765 	case IPSTATS_MIB_INNOROUTES:
3766 		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3767 		if (type == IPV6_ADDR_ANY) {
3768 			IP6_INC_STATS(dev_net(dst->dev),
3769 				      __in6_dev_get_safely(skb->dev),
3770 				      IPSTATS_MIB_INADDRERRORS);
3771 			break;
3772 		}
3773 		/* FALLTHROUGH */
3774 	case IPSTATS_MIB_OUTNOROUTES:
3775 		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3776 			      ipstats_mib_noroutes);
3777 		break;
3778 	}
3779 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3780 	kfree_skb(skb);
3781 	return 0;
3782 }
3783 
/* Input-path drop: reply ICMPV6_NOROUTE and count an input no-route. */
static int ip6_pkt_discard(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}
3788 
3789 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3790 {
3791 	skb->dev = skb_dst(skb)->dev;
3792 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3793 }
3794 
/* Input-path drop: reply ICMPV6_ADM_PROHIBITED, count an input no-route. */
static int ip6_pkt_prohibit(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}
3799 
3800 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3801 {
3802 	skb->dev = skb_dst(skb)->dev;
3803 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3804 }
3805 
3806 /*
3807  *	Allocate a dst for local (unicast / anycast) address.
3808  */
3809 
3810 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3811 				     struct inet6_dev *idev,
3812 				     const struct in6_addr *addr,
3813 				     bool anycast, gfp_t gfp_flags)
3814 {
3815 	struct fib6_config cfg = {
3816 		.fc_table = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL,
3817 		.fc_ifindex = idev->dev->ifindex,
3818 		.fc_flags = RTF_UP | RTF_ADDRCONF | RTF_NONEXTHOP,
3819 		.fc_dst = *addr,
3820 		.fc_dst_len = 128,
3821 		.fc_protocol = RTPROT_KERNEL,
3822 		.fc_nlinfo.nl_net = net,
3823 		.fc_ignore_dev_down = true,
3824 	};
3825 
3826 	if (anycast) {
3827 		cfg.fc_type = RTN_ANYCAST;
3828 		cfg.fc_flags |= RTF_ANYCAST;
3829 	} else {
3830 		cfg.fc_type = RTN_LOCAL;
3831 		cfg.fc_flags |= RTF_LOCAL;
3832 	}
3833 
3834 	return ip6_route_info_create(&cfg, gfp_flags, NULL);
3835 }
3836 
/* Argument bundle for fib6_remove_prefsrc(): identifies a deleted
 * address so matching prefsrc entries can be cleared.
 */
struct arg_dev_net_ip {
	struct net_device *dev;	/* limit to this device; NULL matches any */
	struct net *net;	/* namespace whose tables are walked */
	struct in6_addr *addr;	/* the address being removed */
};
3843 
3844 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3845 {
3846 	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3847 	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3848 	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3849 
3850 	if (((void *)rt->fib6_nh.fib_nh_dev == dev || !dev) &&
3851 	    rt != net->ipv6.fib6_null_entry &&
3852 	    ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3853 		spin_lock_bh(&rt6_exception_lock);
3854 		/* remove prefsrc entry */
3855 		rt->fib6_prefsrc.plen = 0;
3856 		spin_unlock_bh(&rt6_exception_lock);
3857 	}
3858 	return 0;
3859 }
3860 
3861 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3862 {
3863 	struct net *net = dev_net(ifp->idev->dev);
3864 	struct arg_dev_net_ip adni = {
3865 		.dev = ifp->idev->dev,
3866 		.net = net,
3867 		.addr = &ifp->addr,
3868 	};
3869 	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3870 }
3871 
3872 #define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT)
3873 
3874 /* Remove routers and update dst entries when gateway turn into host. */
3875 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3876 {
3877 	struct in6_addr *gateway = (struct in6_addr *)arg;
3878 
3879 	if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3880 	    rt->fib6_nh.fib_nh_gw_family &&
3881 	    ipv6_addr_equal(gateway, &rt->fib6_nh.fib_nh_gw6)) {
3882 		return -1;
3883 	}
3884 
3885 	/* Further clean up cached routes in exception table.
3886 	 * This is needed because cached route may have a different
3887 	 * gateway than its 'parent' in the case of an ip redirect.
3888 	 */
3889 	rt6_exceptions_clean_tohost(rt, gateway);
3890 
3891 	return 0;
3892 }
3893 
/* Walk all tables removing router routes through @gateway and scrubbing
 * matching cached routes (see fib6_clean_tohost()).
 */
void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
{
	fib6_clean_all(net, fib6_clean_tohost, gateway);
}
3898 
/* Argument for the fib6_ifup()/fib6_ifdown() fib walkers. */
struct arg_netdev_event {
	const struct net_device *dev;
	union {
		unsigned int nh_flags;	/* used by rt6_sync_up() */
		unsigned long event;	/* used by rt6_sync_down_dev() */
	};
};
3906 
/* Return the first route in @rt's fib6_node leaf list that shares its
 * metric and qualifies for ECMP — i.e. the head of the multipath group
 * @rt belongs to.  The table lock must be held (encoded by the
 * lockdep_is_held() annotations below).  Returns NULL if none found.
 */
static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
{
	struct fib6_info *iter;
	struct fib6_node *fn;

	fn = rcu_dereference_protected(rt->fib6_node,
			lockdep_is_held(&rt->fib6_table->tb6_lock));
	iter = rcu_dereference_protected(fn->leaf,
			lockdep_is_held(&rt->fib6_table->tb6_lock));
	while (iter) {
		if (iter->fib6_metric == rt->fib6_metric &&
		    rt6_qualify_for_ecmp(iter))
			return iter;
		iter = rcu_dereference_protected(iter->fib6_next,
				lockdep_is_held(&rt->fib6_table->tb6_lock));
	}

	return NULL;
}
3926 
3927 static bool rt6_is_dead(const struct fib6_info *rt)
3928 {
3929 	if (rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD ||
3930 	    (rt->fib6_nh.fib_nh_flags & RTNH_F_LINKDOWN &&
3931 	     ip6_ignore_linkdown(rt->fib6_nh.fib_nh_dev)))
3932 		return true;
3933 
3934 	return false;
3935 }
3936 
3937 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3938 {
3939 	struct fib6_info *iter;
3940 	int total = 0;
3941 
3942 	if (!rt6_is_dead(rt))
3943 		total += rt->fib6_nh.fib_nh_weight;
3944 
3945 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3946 		if (!rt6_is_dead(iter))
3947 			total += iter->fib6_nh.fib_nh_weight;
3948 	}
3949 
3950 	return total;
3951 }
3952 
3953 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
3954 {
3955 	int upper_bound = -1;
3956 
3957 	if (!rt6_is_dead(rt)) {
3958 		*weight += rt->fib6_nh.fib_nh_weight;
3959 		upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3960 						    total) - 1;
3961 	}
3962 	atomic_set(&rt->fib6_nh.fib_nh_upper_bound, upper_bound);
3963 }
3964 
3965 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3966 {
3967 	struct fib6_info *iter;
3968 	int weight = 0;
3969 
3970 	rt6_upper_bound_set(rt, &weight, total);
3971 
3972 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3973 		rt6_upper_bound_set(iter, &weight, total);
3974 }
3975 
3976 void rt6_multipath_rebalance(struct fib6_info *rt)
3977 {
3978 	struct fib6_info *first;
3979 	int total;
3980 
3981 	/* In case the entire multipath route was marked for flushing,
3982 	 * then there is no need to rebalance upon the removal of every
3983 	 * sibling route.
3984 	 */
3985 	if (!rt->fib6_nsiblings || rt->should_flush)
3986 		return;
3987 
3988 	/* During lookup routes are evaluated in order, so we need to
3989 	 * make sure upper bounds are assigned from the first sibling
3990 	 * onwards.
3991 	 */
3992 	first = rt6_multipath_first_sibling(rt);
3993 	if (WARN_ON_ONCE(!first))
3994 		return;
3995 
3996 	total = rt6_multipath_total_weight(first);
3997 	rt6_multipath_upper_bound_set(first, total);
3998 }
3999 
4000 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
4001 {
4002 	const struct arg_netdev_event *arg = p_arg;
4003 	struct net *net = dev_net(arg->dev);
4004 
4005 	if (rt != net->ipv6.fib6_null_entry &&
4006 	    rt->fib6_nh.fib_nh_dev == arg->dev) {
4007 		rt->fib6_nh.fib_nh_flags &= ~arg->nh_flags;
4008 		fib6_update_sernum_upto_root(net, rt);
4009 		rt6_multipath_rebalance(rt);
4010 	}
4011 
4012 	return 0;
4013 }
4014 
4015 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
4016 {
4017 	struct arg_netdev_event arg = {
4018 		.dev = dev,
4019 		{
4020 			.nh_flags = nh_flags,
4021 		},
4022 	};
4023 
4024 	if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
4025 		arg.nh_flags |= RTNH_F_LINKDOWN;
4026 
4027 	fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
4028 }
4029 
4030 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
4031 				   const struct net_device *dev)
4032 {
4033 	struct fib6_info *iter;
4034 
4035 	if (rt->fib6_nh.fib_nh_dev == dev)
4036 		return true;
4037 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4038 		if (iter->fib6_nh.fib_nh_dev == dev)
4039 			return true;
4040 
4041 	return false;
4042 }
4043 
4044 static void rt6_multipath_flush(struct fib6_info *rt)
4045 {
4046 	struct fib6_info *iter;
4047 
4048 	rt->should_flush = 1;
4049 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4050 		iter->should_flush = 1;
4051 }
4052 
4053 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
4054 					     const struct net_device *down_dev)
4055 {
4056 	struct fib6_info *iter;
4057 	unsigned int dead = 0;
4058 
4059 	if (rt->fib6_nh.fib_nh_dev == down_dev ||
4060 	    rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
4061 		dead++;
4062 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4063 		if (iter->fib6_nh.fib_nh_dev == down_dev ||
4064 		    iter->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
4065 			dead++;
4066 
4067 	return dead;
4068 }
4069 
4070 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
4071 				       const struct net_device *dev,
4072 				       unsigned int nh_flags)
4073 {
4074 	struct fib6_info *iter;
4075 
4076 	if (rt->fib6_nh.fib_nh_dev == dev)
4077 		rt->fib6_nh.fib_nh_flags |= nh_flags;
4078 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4079 		if (iter->fib6_nh.fib_nh_dev == dev)
4080 			iter->fib6_nh.fib_nh_flags |= nh_flags;
4081 }
4082 
/* called with write lock held for table with rt */
/* fib walker callback for rt6_sync_down_dev(): react to a device going
 * down or unregistering.  The return value drives the walker —
 * nonzero asks it to act on the route; -1 looks like a deletion
 * request (TODO(review): confirm the exact -1/-2 semantics against
 * fib6_clean_node()).
 */
static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
{
	const struct arg_netdev_event *arg = p_arg;
	const struct net_device *dev = arg->dev;
	struct net *net = dev_net(dev);

	/* the null entry is never touched */
	if (rt == net->ipv6.fib6_null_entry)
		return 0;

	switch (arg->event) {
	case NETDEV_UNREGISTER:
		return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0;
	case NETDEV_DOWN:
		if (rt->should_flush)
			return -1;
		if (!rt->fib6_nsiblings)
			return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0;
		if (rt6_multipath_uses_dev(rt, dev)) {
			unsigned int count;

			count = rt6_multipath_dead_count(rt, dev);
			/* all hops dead: flush the whole multipath route */
			if (rt->fib6_nsiblings + 1 == count) {
				rt6_multipath_flush(rt);
				return -1;
			}
			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
						   RTNH_F_LINKDOWN);
			fib6_update_sernum(net, rt);
			rt6_multipath_rebalance(rt);
		}
		return -2;
	case NETDEV_CHANGE:
		/* local/anycast routes are not marked link-down */
		if (rt->fib6_nh.fib_nh_dev != dev ||
		    rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
			break;
		rt->fib6_nh.fib_nh_flags |= RTNH_F_LINKDOWN;
		rt6_multipath_rebalance(rt);
		break;
	}

	return 0;
}
4126 
4127 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
4128 {
4129 	struct arg_netdev_event arg = {
4130 		.dev = dev,
4131 		{
4132 			.event = event,
4133 		},
4134 	};
4135 	struct net *net = dev_net(dev);
4136 
4137 	if (net->ipv6.sysctl.skip_notify_on_dev_down)
4138 		fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
4139 	else
4140 		fib6_clean_all(net, fib6_ifdown, &arg);
4141 }
4142 
/* Tear IPv6 state off @dev: propagate @event to routes through the
 * device, flush uncached dst entries referencing it, then drop its
 * neighbour entries.
 */
void rt6_disable_ip(struct net_device *dev, unsigned long event)
{
	rt6_sync_down_dev(dev, event);
	rt6_uncached_list_flush_dev(dev_net(dev), dev);
	neigh_ifdown(&nd_tbl, dev);
}
4149 
/* Argument for rt6_mtu_change_route(), passed via fib6_clean_all(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* the new MTU */
};
4154 
/* fib walker callback for rt6_mtu_change(): propagate a device MTU
 * change to matching routes and their cached exceptions.  Always
 * returns 0 (keep walking).
 */
static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
{
	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
	struct inet6_dev *idev;

	/* In IPv6 pmtu discovery is not optional,
	   so that RTAX_MTU lock cannot disable it.
	   We still use this lock to block changes
	   caused by addrconf/ndisc.
	*/

	idev = __in6_dev_get(arg->dev);
	if (!idev)
		return 0;

	/* For administrative MTU increase, there is no way to discover
	   IPv6 PMTU increase, so PMTU increase should be updated here.
	   Since RFC 1981 doesn't include administrative MTU increase
	   update PMTU increase is a MUST. (i.e. jumbo frame)
	 */
	if (rt->fib6_nh.fib_nh_dev == arg->dev &&
	    !fib6_metric_locked(rt, RTAX_MTU)) {
		u32 mtu = rt->fib6_pmtu;

		if (mtu >= arg->mtu ||
		    (mtu < arg->mtu && mtu == idev->cnf.mtu6))
			fib6_metric_set(rt, RTAX_MTU, arg->mtu);

		/* cached exception routes need their PMTU refreshed too */
		spin_lock_bh(&rt6_exception_lock);
		rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
		spin_unlock_bh(&rt6_exception_lock);
	}
	return 0;
}
4189 
4190 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4191 {
4192 	struct rt6_mtu_change_arg arg = {
4193 		.dev = dev,
4194 		.mtu = mtu,
4195 	};
4196 
4197 	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
4198 }
4199 
/* Netlink attribute validation policy for IPv6 route messages (used by
 * rtm_to_fib6_config() via nlmsg_parse()).
 */
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
	[RTA_PREFSRC]		= { .len = sizeof(struct in6_addr) },
	[RTA_OIF]               = { .type = NLA_U32 },
	[RTA_IIF]		= { .type = NLA_U32 },
	[RTA_PRIORITY]          = { .type = NLA_U32 },
	[RTA_METRICS]           = { .type = NLA_NESTED },
	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
	[RTA_PREF]              = { .type = NLA_U8 },
	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
	[RTA_ENCAP]		= { .type = NLA_NESTED },
	[RTA_EXPIRES]		= { .type = NLA_U32 },
	[RTA_UID]		= { .type = NLA_U32 },
	[RTA_MARK]		= { .type = NLA_U32 },
	[RTA_TABLE]		= { .type = NLA_U32 },
	[RTA_IP_PROTO]		= { .type = NLA_U8 },
	[RTA_SPORT]		= { .type = NLA_U16 },
	[RTA_DPORT]		= { .type = NLA_U16 },
};
4219 
/* Parse a route netlink message into @cfg.  Returns 0 on success or a
 * negative errno; on failure *cfg may be partially filled.
 */
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct fib6_config *cfg,
			      struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	unsigned int pref;
	int err;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  extack);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	rtm = nlmsg_data(nlh);

	/* compound literal zeroes every field not listed */
	*cfg = (struct fib6_config){
		.fc_table = rtm->rtm_table,
		.fc_dst_len = rtm->rtm_dst_len,
		.fc_src_len = rtm->rtm_src_len,
		.fc_flags = RTF_UP,
		.fc_protocol = rtm->rtm_protocol,
		.fc_type = rtm->rtm_type,

		.fc_nlinfo.portid = NETLINK_CB(skb).portid,
		.fc_nlinfo.nlh = nlh,
		.fc_nlinfo.nl_net = sock_net(skb->sk),
	};

	if (rtm->rtm_type == RTN_UNREACHABLE ||
	    rtm->rtm_type == RTN_BLACKHOLE ||
	    rtm->rtm_type == RTN_PROHIBIT ||
	    rtm->rtm_type == RTN_THROW)
		cfg->fc_flags |= RTF_REJECT;

	if (rtm->rtm_type == RTN_LOCAL)
		cfg->fc_flags |= RTF_LOCAL;

	if (rtm->rtm_flags & RTM_F_CLONED)
		cfg->fc_flags |= RTF_CACHE;

	cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);

	if (tb[RTA_GATEWAY]) {
		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
		cfg->fc_flags |= RTF_GATEWAY;
	}
	if (tb[RTA_VIA]) {
		NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute");
		goto errout;
	}

	if (tb[RTA_DST]) {
		/* prefixes may carry fewer than 16 address bytes */
		int plen = (rtm->rtm_dst_len + 7) >> 3;

		if (nla_len(tb[RTA_DST]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
	}

	if (tb[RTA_SRC]) {
		int plen = (rtm->rtm_src_len + 7) >> 3;

		if (nla_len(tb[RTA_SRC]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
	}

	if (tb[RTA_PREFSRC])
		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);

	if (tb[RTA_OIF])
		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_PRIORITY])
		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

	if (tb[RTA_METRICS]) {
		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
	}

	if (tb[RTA_TABLE])
		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

	if (tb[RTA_MULTIPATH]) {
		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);

		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
						     cfg->fc_mp_len, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_PREF]) {
		/* unknown preference values degrade to medium */
		pref = nla_get_u8(tb[RTA_PREF]);
		if (pref != ICMPV6_ROUTER_PREF_LOW &&
		    pref != ICMPV6_ROUTER_PREF_HIGH)
			pref = ICMPV6_ROUTER_PREF_MEDIUM;
		cfg->fc_flags |= RTF_PREF(pref);
	}

	if (tb[RTA_ENCAP])
		cfg->fc_encap = tb[RTA_ENCAP];

	if (tb[RTA_ENCAP_TYPE]) {
		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);

		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_EXPIRES]) {
		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);

		if (addrconf_finite_timeout(timeout)) {
			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
			cfg->fc_flags |= RTF_EXPIRES;
		}
	}

	err = 0;
errout:
	return err;
}
4350 
/* Scratch list node used while building/unwinding a multipath route:
 * one entry per nexthop parsed from RTA_MULTIPATH, carrying the
 * fib6_info to insert and the per-nexthop config needed to delete it
 * again on rollback.
 */
struct rt6_nh {
	struct fib6_info *fib6_info;
	struct fib6_config r_cfg;
	struct list_head next;
};
4356 
4357 static int ip6_route_info_append(struct net *net,
4358 				 struct list_head *rt6_nh_list,
4359 				 struct fib6_info *rt,
4360 				 struct fib6_config *r_cfg)
4361 {
4362 	struct rt6_nh *nh;
4363 	int err = -EEXIST;
4364 
4365 	list_for_each_entry(nh, rt6_nh_list, next) {
4366 		/* check if fib6_info already exists */
4367 		if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4368 			return err;
4369 	}
4370 
4371 	nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4372 	if (!nh)
4373 		return -ENOMEM;
4374 	nh->fib6_info = rt;
4375 	memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4376 	list_add_tail(&nh->next, rt6_nh_list);
4377 
4378 	return 0;
4379 }
4380 
4381 static void ip6_route_mpath_notify(struct fib6_info *rt,
4382 				   struct fib6_info *rt_last,
4383 				   struct nl_info *info,
4384 				   __u16 nlflags)
4385 {
4386 	/* if this is an APPEND route, then rt points to the first route
4387 	 * inserted and rt_last points to last route inserted. Userspace
4388 	 * wants a consistent dump of the route which starts at the first
4389 	 * nexthop. Since sibling routes are always added at the end of
4390 	 * the list, find the first sibling of the last route appended
4391 	 */
4392 	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4393 		rt = list_first_entry(&rt_last->fib6_siblings,
4394 				      struct fib6_info,
4395 				      fib6_siblings);
4396 	}
4397 
4398 	if (rt)
4399 		inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4400 }
4401 
/* Add a multipath route: parse every nexthop in RTA_MULTIPATH into its
 * own fib6_info, insert them one by one with notifications suppressed,
 * then send a single notification covering the full route.  If an
 * insertion fails part-way, the nexthops already inserted are deleted
 * again so the FIB is left consistent.  Returns 0 on success or a
 * negative errno.
 */
static int ip6_route_multipath_add(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct fib6_info *rt_notif = NULL, *rt_last = NULL;
	struct nl_info *info = &cfg->fc_nlinfo;
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	struct fib6_info *rt;
	struct rt6_nh *err_nh;
	struct rt6_nh *nh, *nh_safe;
	__u16 nlflags;
	int remaining;
	int attrlen;
	int err = 1;
	int nhn = 0;
	int replace = (cfg->fc_nlinfo.nlh &&
		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
	LIST_HEAD(rt6_nh_list);

	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
		nlflags |= NLM_F_APPEND;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
	 * fib6_info structs per nexthop
	 */
	while (rtnh_ok(rtnh, remaining)) {
		/* start from the shared config, then apply per-nexthop
		 * overrides (ifindex, gateway, encap)
		 */
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				r_cfg.fc_gateway = nla_get_in6_addr(nla);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
			if (nla)
				r_cfg.fc_encap_type = nla_get_u16(nla);
		}

		r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
		rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
			rt = NULL;
			goto cleanup;
		}
		if (!rt6_qualify_for_ecmp(rt)) {
			err = -EINVAL;
			NL_SET_ERR_MSG(extack,
				       "Device only routes can not be added for IPv6 using the multipath API.");
			fib6_info_release(rt);
			goto cleanup;
		}

		/* rtnh_hops is the weight minus one on the wire */
		rt->fib6_nh.fib_nh_weight = rtnh->rtnh_hops + 1;

		err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
					    rt, &r_cfg);
		if (err) {
			fib6_info_release(rt);
			goto cleanup;
		}

		rtnh = rtnh_next(rtnh, &remaining);
	}

	/* for add and replace send one notification with all nexthops.
	 * Skip the notification in fib6_add_rt2node and send one with
	 * the full route when done
	 */
	info->skip_notify = 1;

	err_nh = NULL;
	list_for_each_entry(nh, &rt6_nh_list, next) {
		err = __ip6_ins_rt(nh->fib6_info, info, extack);
		fib6_info_release(nh->fib6_info);

		if (!err) {
			/* save reference to last route successfully inserted */
			rt_last = nh->fib6_info;

			/* save reference to first route for notification */
			if (!rt_notif)
				rt_notif = nh->fib6_info;
		}

		/* nh->fib6_info is used or freed at this point, reset to NULL*/
		nh->fib6_info = NULL;
		if (err) {
			if (replace && nhn)
				NL_SET_ERR_MSG_MOD(extack,
						   "multipath route replace failed (check consistency of installed routes)");
			err_nh = nh;
			goto add_errout;
		}

		/* Because each route is added like a single route we remove
		 * these flags after the first nexthop: if there is a collision,
		 * we have already failed to add the first nexthop:
		 * fib6_add_rt2node() has rejected it; when replacing, old
		 * nexthops have been replaced by first new, the rest should
		 * be added to it.
		 */
		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
						     NLM_F_REPLACE);
		nhn++;
	}

	/* success ... tell user about new route */
	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
	goto cleanup;

add_errout:
	/* send notification for routes that were added so that
	 * the delete notifications sent by ip6_route_del are
	 * coherent
	 */
	if (rt_notif)
		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);

	/* Delete routes that were already added */
	list_for_each_entry(nh, &rt6_nh_list, next) {
		if (err_nh == nh)
			break;
		ip6_route_del(&nh->r_cfg, extack);
	}

cleanup:
	/* free the scratch list; entries not consumed above still hold
	 * a fib6_info reference that must be dropped here
	 */
	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
		if (nh->fib6_info)
			fib6_info_release(nh->fib6_info);
		list_del(&nh->next);
		kfree(nh);
	}

	return err;
}
4549 
4550 static int ip6_route_multipath_del(struct fib6_config *cfg,
4551 				   struct netlink_ext_ack *extack)
4552 {
4553 	struct fib6_config r_cfg;
4554 	struct rtnexthop *rtnh;
4555 	int remaining;
4556 	int attrlen;
4557 	int err = 1, last_err = 0;
4558 
4559 	remaining = cfg->fc_mp_len;
4560 	rtnh = (struct rtnexthop *)cfg->fc_mp;
4561 
4562 	/* Parse a Multipath Entry */
4563 	while (rtnh_ok(rtnh, remaining)) {
4564 		memcpy(&r_cfg, cfg, sizeof(*cfg));
4565 		if (rtnh->rtnh_ifindex)
4566 			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4567 
4568 		attrlen = rtnh_attrlen(rtnh);
4569 		if (attrlen > 0) {
4570 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4571 
4572 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4573 			if (nla) {
4574 				nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4575 				r_cfg.fc_flags |= RTF_GATEWAY;
4576 			}
4577 		}
4578 		err = ip6_route_del(&r_cfg, extack);
4579 		if (err)
4580 			last_err = err;
4581 
4582 		rtnh = rtnh_next(rtnh, &remaining);
4583 	}
4584 
4585 	return last_err;
4586 }
4587 
4588 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4589 			      struct netlink_ext_ack *extack)
4590 {
4591 	struct fib6_config cfg;
4592 	int err;
4593 
4594 	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4595 	if (err < 0)
4596 		return err;
4597 
4598 	if (cfg.fc_mp)
4599 		return ip6_route_multipath_del(&cfg, extack);
4600 	else {
4601 		cfg.fc_delete_all_nh = 1;
4602 		return ip6_route_del(&cfg, extack);
4603 	}
4604 }
4605 
4606 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4607 			      struct netlink_ext_ack *extack)
4608 {
4609 	struct fib6_config cfg;
4610 	int err;
4611 
4612 	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4613 	if (err < 0)
4614 		return err;
4615 
4616 	if (cfg.fc_metric == 0)
4617 		cfg.fc_metric = IP6_RT_PRIO_USER;
4618 
4619 	if (cfg.fc_mp)
4620 		return ip6_route_multipath_add(&cfg, extack);
4621 	else
4622 		return ip6_route_add(&cfg, GFP_KERNEL, extack);
4623 }
4624 
/* Worst-case netlink message size for a notification about @rt, used
 * to size the skb in inet6_rt_notify().
 *
 * NOTE(review): the per-nexthop term is multiplied by fib6_nsiblings
 * only — presumably the lead nexthop is covered by the flat
 * RTA_GATEWAY/encap terms below; confirm this cannot undersize a
 * multipath message.
 */
static size_t rt6_nlmsg_size(struct fib6_info *rt)
{
	int nexthop_len = 0;

	if (rt->fib6_nsiblings) {
		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
			    + NLA_ALIGN(sizeof(struct rtnexthop))
			    + nla_total_size(16) /* RTA_GATEWAY */
			    + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws);

		nexthop_len *= rt->fib6_nsiblings;
	}

	return NLMSG_ALIGN(sizeof(struct rtmsg))
	       + nla_total_size(16) /* RTA_SRC */
	       + nla_total_size(16) /* RTA_DST */
	       + nla_total_size(16) /* RTA_GATEWAY */
	       + nla_total_size(16) /* RTA_PREFSRC */
	       + nla_total_size(4) /* RTA_TABLE */
	       + nla_total_size(4) /* RTA_IIF */
	       + nla_total_size(4) /* RTA_OIF */
	       + nla_total_size(4) /* RTA_PRIORITY */
	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
	       + nla_total_size(sizeof(struct rta_cacheinfo))
	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
	       + nla_total_size(1) /* RTA_PREF */
	       + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws)
	       + nexthop_len;
}
4654 
/* Fill one route message of @type for @rt into @skb.
 *
 * @dst:  optional cached clone (a rt6_info); when set, the dst/src
 *        keys and flags are taken from the clone instead of the fib
 *        entry.
 * @dest/@src: exact addresses from a getroute request; when non-NULL
 *        they are dumped directly and force /128 prefix lengths.
 * @iif:  input interface index for input-route replies (0 if none).
 *
 * Returns 0 on success or -EMSGSIZE if @skb ran out of space (the
 * partially built message is cancelled).
 */
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags)
{
	struct rt6_info *rt6 = (struct rt6_info *)dst;
	struct rt6key *rt6_dst, *rt6_src;
	u32 *pmetrics, table, rt6_flags;
	struct nlmsghdr *nlh;
	struct rtmsg *rtm;
	long expires = 0;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	/* prefer the cached clone's keys/flags when one was supplied */
	if (rt6) {
		rt6_dst = &rt6->rt6i_dst;
		rt6_src = &rt6->rt6i_src;
		rt6_flags = rt6->rt6i_flags;
	} else {
		rt6_dst = &rt->fib6_dst;
		rt6_src = &rt->fib6_src;
		rt6_flags = rt->fib6_flags;
	}

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt6_dst->plen;
	rtm->rtm_src_len = rt6_src->plen;
	rtm->rtm_tos = 0;
	if (rt->fib6_table)
		table = rt->fib6_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	/* rtm_table holds only 8 bits; the full id goes in RTA_TABLE */
	rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT;
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;

	rtm->rtm_type = rt->fib6_type;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->fib6_protocol;

	if (rt6_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	/* an explicit destination from a getroute request wins */
	if (dest) {
		if (nla_put_in6_addr(skb, RTA_DST, dest))
			goto nla_put_failure;
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
			goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		if (nla_put_in6_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
		goto nla_put_failure;
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		/* multicast destinations are resolved via the mroute code;
		 * ip6mr_get_route() == 0 means it filled the message itself
		 */
		if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
			int err = ip6mr_get_route(net, skb, rtm, portid);

			if (err == 0)
				return 0;
			if (err < 0)
				goto nla_put_failure;
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
	} else if (dest) {
		struct in6_addr saddr_buf;
		if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	if (rt->fib6_prefsrc.plen) {
		struct in6_addr saddr_buf;
		saddr_buf = rt->fib6_prefsrc.addr;
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
	if (rtnetlink_put_metrics(skb, pmetrics) < 0)
		goto nla_put_failure;

	if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
		goto nla_put_failure;

	/* For multipath routes, walk the siblings list and add
	 * each as a nexthop within RTA_MULTIPATH.
	 */
	if (rt6) {
		if (rt6_flags & RTF_GATEWAY &&
		    nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
			goto nla_put_failure;

		if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
			goto nla_put_failure;
	} else if (rt->fib6_nsiblings) {
		struct fib6_info *sibling, *next_sibling;
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		if (fib_add_nexthop(skb, &rt->fib6_nh.nh_common,
				    rt->fib6_nh.fib_nh_weight) < 0)
			goto nla_put_failure;

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings, fib6_siblings) {
			if (fib_add_nexthop(skb, &sibling->fib6_nh.nh_common,
					    sibling->fib6_nh.fib_nh_weight) < 0)
				goto nla_put_failure;
		}

		nla_nest_end(skb, mp);
	} else {
		if (fib_nexthop_info(skb, &rt->fib6_nh.nh_common,
				     &rtm->rtm_flags, false) < 0)
			goto nla_put_failure;
	}

	/* remaining lifetime relative to now, in jiffies */
	if (rt6_flags & RTF_EXPIRES) {
		expires = dst ? dst->expires : rt->expires;
		expires -= jiffies;
	}

	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
		goto nla_put_failure;

	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
		goto nla_put_failure;


	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
4808 
4809 static bool fib6_info_uses_dev(const struct fib6_info *f6i,
4810 			       const struct net_device *dev)
4811 {
4812 	if (f6i->fib6_nh.fib_nh_dev == dev)
4813 		return true;
4814 
4815 	if (f6i->fib6_nsiblings) {
4816 		struct fib6_info *sibling, *next_sibling;
4817 
4818 		list_for_each_entry_safe(sibling, next_sibling,
4819 					 &f6i->fib6_siblings, fib6_siblings) {
4820 			if (sibling->fib6_nh.fib_nh_dev == dev)
4821 				return true;
4822 		}
4823 	}
4824 
4825 	return false;
4826 }
4827 
/* Per-route callback for RTM_GETROUTE dumps.  Emits @rt into the dump
 * skb unless it is the null entry or excluded by the dump filter.
 * Returns 1 when the route is filtered out (skipped without error),
 * otherwise the rt6_fill_node() result (0 or -EMSGSIZE).
 */
int rt6_dump_route(struct fib6_info *rt, void *p_arg)
{
	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
	struct fib_dump_filter *filter = &arg->filter;
	unsigned int flags = NLM_F_MULTI;
	struct net *net = arg->net;

	if (rt == net->ipv6.fib6_null_entry)
		return 0;

	if ((filter->flags & RTM_F_PREFIX) &&
	    !(rt->fib6_flags & RTF_PREFIX_RT)) {
		/* success since this is not a prefix route */
		return 1;
	}
	if (filter->filter_set) {
		/* user supplied type/dev/protocol filters */
		if ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
		    (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
		    (filter->protocol && rt->fib6_protocol != filter->protocol)) {
			return 1;
		}
		flags |= NLM_F_DUMP_FILTERED;
	}

	return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
			     RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
			     arg->cb->nlh->nlmsg_seq, flags);
}
4856 
/* Validate an RTM_GETROUTE request and parse its attributes into @tb.
 * Sockets without strict checking get the legacy permissive parsing;
 * strict sockets must have clean header fields and only attributes
 * that inet6_rtm_getroute() actually implements.  Returns 0 or a
 * negative errno with @extack set.
 */
static int inet6_rtm_valid_getroute_req(struct sk_buff *skb,
					const struct nlmsghdr *nlh,
					struct nlattr **tb,
					struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	int i, err;

	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
		NL_SET_ERR_MSG_MOD(extack,
				   "Invalid header for get route request");
		return -EINVAL;
	}

	/* legacy sockets: accept anything the old parser accepted */
	if (!netlink_strict_get_check(skb))
		return nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX,
				   rtm_ipv6_policy, extack);

	rtm = nlmsg_data(nlh);
	if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) ||
	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) ||
	    rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope ||
	    rtm->rtm_type) {
		NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request");
		return -EINVAL;
	}
	if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) {
		NL_SET_ERR_MSG_MOD(extack,
				   "Invalid flags for get route request");
		return -EINVAL;
	}

	err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
				 rtm_ipv6_policy, extack);
	if (err)
		return err;

	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
		NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6");
		return -EINVAL;
	}

	/* reject any attribute the handler does not implement */
	for (i = 0; i <= RTA_MAX; i++) {
		if (!tb[i])
			continue;

		switch (i) {
		case RTA_SRC:
		case RTA_DST:
		case RTA_IIF:
		case RTA_OIF:
		case RTA_MARK:
		case RTA_UID:
		case RTA_SPORT:
		case RTA_DPORT:
		case RTA_IP_PROTO:
			break;
		default:
			NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request");
			return -EINVAL;
		}
	}

	return 0;
}
4923 
/* RTM_GETROUTE handler: build a flow from the request attributes,
 * resolve it via an input (RTA_IIF set) or output route lookup, and
 * unicast a single RTM_NEWROUTE reply to the requester.  With
 * RTM_F_FIB_MATCH the matching FIB entry is reported instead of the
 * resolved dst.  Returns 0 or a negative errno.
 */
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	int err, iif = 0, oif = 0;
	struct fib6_info *from;
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi6 fl6 = {};
	bool fibmatch;

	err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	rtm = nlmsg_data(nlh);
	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);

	if (tb[RTA_SRC]) {
		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
			goto errout;

		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
	}

	if (tb[RTA_DST]) {
		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
			goto errout;

		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
	}

	if (tb[RTA_IIF])
		iif = nla_get_u32(tb[RTA_IIF]);

	if (tb[RTA_OIF])
		oif = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_MARK])
		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);

	if (tb[RTA_UID])
		fl6.flowi6_uid = make_kuid(current_user_ns(),
					   nla_get_u32(tb[RTA_UID]));
	else
		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();

	if (tb[RTA_SPORT])
		fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);

	if (tb[RTA_DPORT])
		fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);

	if (tb[RTA_IP_PROTO]) {
		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
						  &fl6.flowi6_proto, AF_INET6,
						  extack);
		if (err)
			goto errout;
	}

	if (iif) {
		/* input route: simulate reception on the given device */
		struct net_device *dev;
		int flags = 0;

		rcu_read_lock();

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			rcu_read_unlock();
			err = -ENODEV;
			goto errout;
		}

		fl6.flowi6_iif = iif;

		if (!ipv6_addr_any(&fl6.saddr))
			flags |= RT6_LOOKUP_F_HAS_SADDR;

		dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);

		rcu_read_unlock();
	} else {
		fl6.flowi6_oif = oif;

		dst = ip6_route_output(net, NULL, &fl6);
	}


	rt = container_of(dst, struct rt6_info, dst);
	if (rt->dst.error) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	if (rt == net->ipv6.ip6_null_entry) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb) {
		ip6_rt_put(rt);
		err = -ENOBUFS;
		goto errout;
	}

	/* the skb now owns the dst reference */
	skb_dst_set(skb, &rt->dst);

	rcu_read_lock();
	from = rcu_dereference(rt->from);

	/* RTM_F_FIB_MATCH: report the FIB entry, not the resolved dst */
	if (fibmatch)
		err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, 0);
	else
		err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
				    &fl6.saddr, iif, RTM_NEWROUTE,
				    NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
				    0);
	rcu_read_unlock();

	if (err < 0) {
		kfree_skb(skb);
		goto errout;
	}

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;
}
5063 
/* Broadcast a route @event (RTM_NEWROUTE/RTM_DELROUTE) for @rt to the
 * RTNLGRP_IPV6_ROUTE multicast group.  On allocation or fill failure
 * the error is recorded on the group via rtnl_set_sk_err() instead of
 * being returned.
 */
void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
		     unsigned int nlm_flags)
{
	struct sk_buff *skb;
	struct net *net = info->nl_net;
	u32 seq;
	int err;

	err = -ENOBUFS;
	seq = info->nlh ? info->nlh->nlmsg_seq : 0;

	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
	if (!skb)
		goto errout;

	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
			    event, info->portid, seq, nlm_flags);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
		    info->nlh, gfp_any());
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}
5094 
5095 static int ip6_route_dev_notify(struct notifier_block *this,
5096 				unsigned long event, void *ptr)
5097 {
5098 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
5099 	struct net *net = dev_net(dev);
5100 
5101 	if (!(dev->flags & IFF_LOOPBACK))
5102 		return NOTIFY_OK;
5103 
5104 	if (event == NETDEV_REGISTER) {
5105 		net->ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = dev;
5106 		net->ipv6.ip6_null_entry->dst.dev = dev;
5107 		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
5108 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5109 		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
5110 		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
5111 		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
5112 		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
5113 #endif
5114 	 } else if (event == NETDEV_UNREGISTER &&
5115 		    dev->reg_state != NETREG_UNREGISTERED) {
5116 		/* NETDEV_UNREGISTER could be fired for multiple times by
5117 		 * netdev_wait_allrefs(). Make sure we only call this once.
5118 		 */
5119 		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
5120 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5121 		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
5122 		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
5123 #endif
5124 	}
5125 
5126 	return NOTIFY_OK;
5127 }
5128 
5129 /*
5130  *	/proc
5131  */
5132 
5133 #ifdef CONFIG_PROC_FS
/* /proc/net/rt6_stats: emit seven hex words of FIB/route statistics
 * for this namespace on a single line.
 */
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = (struct net *)seq->private;
	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
		   net->ipv6.rt6_stats->fib_nodes,
		   net->ipv6.rt6_stats->fib_route_nodes,
		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
		   net->ipv6.rt6_stats->fib_rt_entries,
		   net->ipv6.rt6_stats->fib_rt_cache,
		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
		   net->ipv6.rt6_stats->fib_discarded_routes);

	return 0;
}
5148 #endif	/* CONFIG_PROC_FS */
5149 
5150 #ifdef CONFIG_SYSCTL
5151 
/* Handler for the write-only net.ipv6.route.flush sysctl: writing to
 * it triggers a FIB garbage-collection run.
 *
 * NOTE(review): "delay" is sampled before proc_dointvec() stores the
 * newly written value, so fib6_run_gc() below uses the delay from the
 * previous write rather than the one just written — confirm this
 * ordering is intentional.
 */
static
int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
			      void __user *buffer, size_t *lenp, loff_t *ppos)
{
	struct net *net;
	int delay;
	int ret;
	if (!write)
		return -EINVAL;

	net = (struct net *)ctl->extra1;
	delay = net->ipv6.sysctl.flush_delay;
	ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
	if (ret)
		return ret;

	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
	return 0;
}
5171 
/* 0/1 bounds used as extra1/extra2 clamps for the boolean
 * skip_notify_on_dev_down sysctl below.
 */
static int zero;
static int one = 1;
5174 
/* Template for the per-netns net.ipv6.route sysctl table.  The .data
 * pointers reference init_net and are repointed per namespace by
 * ipv6_route_sysctl_init(), which patches entries BY INDEX — keep the
 * order here and the indices there in sync.
 */
static struct ctl_table ipv6_route_table_template[] = {
	{
		.procname	=	"flush",
		.data		=	&init_net.ipv6.sysctl.flush_delay,
		.maxlen		=	sizeof(int),
		.mode		=	0200,
		.proc_handler	=	ipv6_sysctl_rtcache_flush
	},
	{
		.procname	=	"gc_thresh",
		.data		=	&ip6_dst_ops_template.gc_thresh,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"max_size",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_min_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_timeout",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_elasticity",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"mtu_expires",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"min_adv_mss",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_min_interval_ms",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_ms_jiffies,
	},
	{
		.procname	=	"skip_notify_on_dev_down",
		.data		=	&init_net.ipv6.sysctl.skip_notify_on_dev_down,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
		.extra1		=	&zero,
		.extra2		=	&one,
	},
	{ }
};
5257 
/* Duplicate the sysctl template for @net and repoint each entry's
 * .data at the corresponding per-netns field.  Entries are patched BY
 * INDEX, so this must stay in step with ipv6_route_table_template[].
 * Returns the table (caller registers and later frees it) or NULL on
 * allocation failure.
 */
struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);

	if (table) {
		table[0].data = &net->ipv6.sysctl.flush_delay;
		table[0].extra1 = net;
		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			table[0].procname = NULL;
	}

	return table;
}
5287 #endif
5288 
/* Per-netns init for IPv6 routing: clone the dst ops template,
 * allocate the special null (and, with CONFIG_IPV6_MULTIPLE_TABLES,
 * prohibit/blackhole) route entries from their templates, and seed the
 * sysctl defaults.  Unwinds with gotos on allocation failure.
 * Returns 0 on success or -ENOMEM.
 */
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
					    sizeof(*net->ipv6.fib6_null_entry),
					    GFP_KERNEL);
	if (!net->ipv6.fib6_null_entry)
		goto out_ip6_dst_entries;

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_fib6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	/* default tunables, exposed via ipv6_route_table_template */
	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
	net->ipv6.sysctl.skip_notify_on_dev_down = 0;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_fib6_null_entry:
	kfree(net->ipv6.fib6_null_entry);
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}
5364 
/* Per-netns teardown: free the special route entries allocated in
 * ip6_route_net_init() and destroy the dst entry accounting.
 */
static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.fib6_null_entry);
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}
5375 
5376 static int __net_init ip6_route_net_init_late(struct net *net)
5377 {
5378 #ifdef CONFIG_PROC_FS
5379 	proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
5380 			sizeof(struct ipv6_route_iter));
5381 	proc_create_net_single("rt6_stats", 0444, net->proc_net,
5382 			rt6_stats_seq_show, NULL);
5383 #endif
5384 	return 0;
5385 }
5386 
5387 static void __net_exit ip6_route_net_exit_late(struct net *net)
5388 {
5389 #ifdef CONFIG_PROC_FS
5390 	remove_proc_entry("ipv6_route", net->proc_net);
5391 	remove_proc_entry("rt6_stats", net->proc_net);
5392 #endif
5393 }
5394 
/* Per-netns lifetime of the core IPv6 routing state: the template
 * route entries, dst ops and sysctl defaults set up in
 * ip6_route_net_init() and released in ip6_route_net_exit().
 */
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};
5399 
5400 static int __net_init ipv6_inetpeer_init(struct net *net)
5401 {
5402 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5403 
5404 	if (!bp)
5405 		return -ENOMEM;
5406 	inet_peer_base_init(bp);
5407 	net->ipv6.peers = bp;
5408 	return 0;
5409 }
5410 
5411 static void __net_exit ipv6_inetpeer_exit(struct net *net)
5412 {
5413 	struct inet_peer_base *bp = net->ipv6.peers;
5414 
5415 	net->ipv6.peers = NULL;
5416 	inetpeer_invalidate_tree(bp);
5417 	kfree(bp);
5418 }
5419 
/* Per-netns lifetime of the IPv6 inet_peer base (net->ipv6.peers). */
static struct pernet_operations ipv6_inetpeer_ops = {
	.init	=	ipv6_inetpeer_init,
	.exit	=	ipv6_inetpeer_exit,
};
5424 
/* Per-netns lifetime of the /proc/net routing entries; registered
 * after the core route state so the proc files never see a
 * half-initialized namespace.
 */
static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};
5429 
/* Netdevice event callback for the routing code; its priority is
 * expressed relative to addrconf's notifier (ADDRCONF_NOTIFY_PRIORITY)
 * so the two run in a defined order on device events.
 */
static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};
5434 
5435 void __init ip6_route_init_special_entries(void)
5436 {
5437 	/* Registering of the loopback is done before this portion of code,
5438 	 * the loopback reference in rt6_info will not be taken, do it
5439 	 * manually for init_net */
5440 	init_net.ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = init_net.loopback_dev;
5441 	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
5442 	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5443   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5444 	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
5445 	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5446 	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
5447 	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5448   #endif
5449 }
5450 
/* Module init for the IPv6 routing subsystem.  Sets up the dst slab
 * cache, per-netns state, FIB, policy rules, netlink handlers and the
 * per-cpu uncached-route lists.  Registration order matters: each
 * step's error path unwinds everything registered before it via the
 * goto-based cleanup labels at the bottom.
 *
 * Returns 0 on success or a negative errno on failure.
 */
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	/* Slab cache backing all rt6_info dst entries. */
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	/* Blackhole dsts are allocated from the same slab as rt6_info. */
	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	/* Netlink handlers for RTM_NEWROUTE/DELROUTE/GETROUTE; the GET
	 * handler runs without the RTNL lock (RTNL_FLAG_DOIT_UNLOCKED).
	 */
	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
				   inet6_rtm_newroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
				   inet6_rtm_delroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
				   inet6_rtm_getroute, NULL,
				   RTNL_FLAG_DOIT_UNLOCKED);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	/* Initialize the per-cpu rt6_uncached_list heads and locks. */
	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

	/* Error unwind: labels undo the corresponding init steps above. */
out_register_late_subsys:
	rtnl_unregister_all(PF_INET6);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}
5542 
/* Module exit: tear down everything set up by ip6_route_init().  The
 * netdevice notifier and netlink-backed proc state go first so no new
 * events arrive while the FIB and pernet state are dismantled; the dst
 * slab cache is destroyed last, after all users are gone.
 */
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}
5555