xref: /openbmc/linux/net/ipv6/route.c (revision 6724ed7f)
1 /*
2  *	Linux INET6 implementation
3  *	FIB front-end.
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13 
14 /*	Changes:
15  *
16  *	YOSHIFUJI Hideaki @USAGI
17  *		reworked default router selection.
18  *		- respect outgoing interface
19  *		- select from (probably) reachable routers (i.e.
20  *		routers in REACHABLE, STALE, DELAY or PROBE states).
21  *		- always select the same router if it is (probably)
22  *		reachable.  otherwise, round-robin the list.
23  *	Ville Nuorvala
24  *		Fixed routing subtrees.
25  */
26 
27 #define pr_fmt(fmt) "IPv6: " fmt
28 
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <trace/events/fib6.h>
67 
68 #include <linux/uaccess.h>
69 
70 #ifdef CONFIG_SYSCTL
71 #include <linux/sysctl.h>
72 #endif
73 
74 enum rt6_nud_state {
75 	RT6_NUD_FAIL_HARD = -3,
76 	RT6_NUD_FAIL_PROBE = -2,
77 	RT6_NUD_FAIL_DO_RR = -1,
78 	RT6_NUD_SUCCEED = 1
79 };
80 
81 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
82 static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
83 static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
84 static unsigned int	 ip6_mtu(const struct dst_entry *dst);
85 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
86 static void		ip6_dst_destroy(struct dst_entry *);
87 static void		ip6_dst_ifdown(struct dst_entry *,
88 				       struct net_device *dev, int how);
89 static int		 ip6_dst_gc(struct dst_ops *ops);
90 
91 static int		ip6_pkt_discard(struct sk_buff *skb);
92 static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
93 static int		ip6_pkt_prohibit(struct sk_buff *skb);
94 static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
95 static void		ip6_link_failure(struct sk_buff *skb);
96 static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
97 					   struct sk_buff *skb, u32 mtu);
98 static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
99 					struct sk_buff *skb);
100 static void		rt6_dst_from_metrics_check(struct rt6_info *rt);
101 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
102 static size_t rt6_nlmsg_size(struct rt6_info *rt);
103 static int rt6_fill_node(struct net *net,
104 			 struct sk_buff *skb, struct rt6_info *rt,
105 			 struct in6_addr *dst, struct in6_addr *src,
106 			 int iif, int type, u32 portid, u32 seq,
107 			 unsigned int flags);
108 static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
109 					   struct in6_addr *daddr,
110 					   struct in6_addr *saddr);
111 
112 #ifdef CONFIG_IPV6_ROUTE_INFO
113 static struct rt6_info *rt6_add_route_info(struct net *net,
114 					   const struct in6_addr *prefix, int prefixlen,
115 					   const struct in6_addr *gwaddr,
116 					   struct net_device *dev,
117 					   unsigned int pref);
118 static struct rt6_info *rt6_get_route_info(struct net *net,
119 					   const struct in6_addr *prefix, int prefixlen,
120 					   const struct in6_addr *gwaddr,
121 					   struct net_device *dev);
122 #endif
123 
124 struct uncached_list {
125 	spinlock_t		lock;
126 	struct list_head	head;
127 };
128 
129 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
130 
131 static void rt6_uncached_list_add(struct rt6_info *rt)
132 {
133 	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
134 
135 	rt->rt6i_uncached_list = ul;
136 
137 	spin_lock_bh(&ul->lock);
138 	list_add_tail(&rt->rt6i_uncached, &ul->head);
139 	spin_unlock_bh(&ul->lock);
140 }
141 
142 static void rt6_uncached_list_del(struct rt6_info *rt)
143 {
144 	if (!list_empty(&rt->rt6i_uncached)) {
145 		struct uncached_list *ul = rt->rt6i_uncached_list;
146 		struct net *net = dev_net(rt->dst.dev);
147 
148 		spin_lock_bh(&ul->lock);
149 		list_del(&rt->rt6i_uncached);
150 		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
151 		spin_unlock_bh(&ul->lock);
152 	}
153 }
154 
155 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
156 {
157 	struct net_device *loopback_dev = net->loopback_dev;
158 	int cpu;
159 
160 	if (dev == loopback_dev)
161 		return;
162 
163 	for_each_possible_cpu(cpu) {
164 		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
165 		struct rt6_info *rt;
166 
167 		spin_lock_bh(&ul->lock);
168 		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
169 			struct inet6_dev *rt_idev = rt->rt6i_idev;
170 			struct net_device *rt_dev = rt->dst.dev;
171 
172 			if (rt_idev->dev == dev) {
173 				rt->rt6i_idev = in6_dev_get(loopback_dev);
174 				in6_dev_put(rt_idev);
175 			}
176 
177 			if (rt_dev == dev) {
178 				rt->dst.dev = loopback_dev;
179 				dev_hold(rt->dst.dev);
180 				dev_put(rt_dev);
181 			}
182 		}
183 		spin_unlock_bh(&ul->lock);
184 	}
185 }
186 
187 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
188 {
189 	return dst_metrics_write_ptr(rt->dst.from);
190 }
191 
192 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
193 {
194 	struct rt6_info *rt = (struct rt6_info *)dst;
195 
196 	if (rt->rt6i_flags & RTF_PCPU)
197 		return rt6_pcpu_cow_metrics(rt);
198 	else if (rt->rt6i_flags & RTF_CACHE)
199 		return NULL;
200 	else
201 		return dst_cow_metrics_generic(dst, old);
202 }
203 
204 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
205 					     struct sk_buff *skb,
206 					     const void *daddr)
207 {
208 	struct in6_addr *p = &rt->rt6i_gateway;
209 
210 	if (!ipv6_addr_any(p))
211 		return (const void *) p;
212 	else if (skb)
213 		return &ipv6_hdr(skb)->daddr;
214 	return daddr;
215 }
216 
217 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
218 					  struct sk_buff *skb,
219 					  const void *daddr)
220 {
221 	struct rt6_info *rt = (struct rt6_info *) dst;
222 	struct neighbour *n;
223 
224 	daddr = choose_neigh_daddr(rt, skb, daddr);
225 	n = __ipv6_neigh_lookup(dst->dev, daddr);
226 	if (n)
227 		return n;
228 	return neigh_create(&nd_tbl, daddr, dst->dev);
229 }
230 
231 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
232 {
233 	struct net_device *dev = dst->dev;
234 	struct rt6_info *rt = (struct rt6_info *)dst;
235 
236 	daddr = choose_neigh_daddr(rt, NULL, daddr);
237 	if (!daddr)
238 		return;
239 	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
240 		return;
241 	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
242 		return;
243 	__ipv6_confirm_neigh(dev, daddr);
244 }
245 
246 static struct dst_ops ip6_dst_ops_template = {
247 	.family			=	AF_INET6,
248 	.gc			=	ip6_dst_gc,
249 	.gc_thresh		=	1024,
250 	.check			=	ip6_dst_check,
251 	.default_advmss		=	ip6_default_advmss,
252 	.mtu			=	ip6_mtu,
253 	.cow_metrics		=	ipv6_cow_metrics,
254 	.destroy		=	ip6_dst_destroy,
255 	.ifdown			=	ip6_dst_ifdown,
256 	.negative_advice	=	ip6_negative_advice,
257 	.link_failure		=	ip6_link_failure,
258 	.update_pmtu		=	ip6_rt_update_pmtu,
259 	.redirect		=	rt6_do_redirect,
260 	.local_out		=	__ip6_local_out,
261 	.neigh_lookup		=	ip6_neigh_lookup,
262 	.confirm_neigh		=	ip6_confirm_neigh,
263 };
264 
265 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
266 {
267 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
268 
269 	return mtu ? : dst->dev->mtu;
270 }
271 
272 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
273 					 struct sk_buff *skb, u32 mtu)
274 {
275 }
276 
277 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
278 				      struct sk_buff *skb)
279 {
280 }
281 
282 static struct dst_ops ip6_dst_blackhole_ops = {
283 	.family			=	AF_INET6,
284 	.destroy		=	ip6_dst_destroy,
285 	.check			=	ip6_dst_check,
286 	.mtu			=	ip6_blackhole_mtu,
287 	.default_advmss		=	ip6_default_advmss,
288 	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
289 	.redirect		=	ip6_rt_blackhole_redirect,
290 	.cow_metrics		=	dst_cow_metrics_generic,
291 	.neigh_lookup		=	ip6_neigh_lookup,
292 };
293 
294 static const u32 ip6_template_metrics[RTAX_MAX] = {
295 	[RTAX_HOPLIMIT - 1] = 0,
296 };
297 
298 static const struct rt6_info ip6_null_entry_template = {
299 	.dst = {
300 		.__refcnt	= ATOMIC_INIT(1),
301 		.__use		= 1,
302 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
303 		.error		= -ENETUNREACH,
304 		.input		= ip6_pkt_discard,
305 		.output		= ip6_pkt_discard_out,
306 	},
307 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
308 	.rt6i_protocol  = RTPROT_KERNEL,
309 	.rt6i_metric	= ~(u32) 0,
310 	.rt6i_ref	= ATOMIC_INIT(1),
311 };
312 
313 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
314 
315 static const struct rt6_info ip6_prohibit_entry_template = {
316 	.dst = {
317 		.__refcnt	= ATOMIC_INIT(1),
318 		.__use		= 1,
319 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
320 		.error		= -EACCES,
321 		.input		= ip6_pkt_prohibit,
322 		.output		= ip6_pkt_prohibit_out,
323 	},
324 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
325 	.rt6i_protocol  = RTPROT_KERNEL,
326 	.rt6i_metric	= ~(u32) 0,
327 	.rt6i_ref	= ATOMIC_INIT(1),
328 };
329 
330 static const struct rt6_info ip6_blk_hole_entry_template = {
331 	.dst = {
332 		.__refcnt	= ATOMIC_INIT(1),
333 		.__use		= 1,
334 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
335 		.error		= -EINVAL,
336 		.input		= dst_discard,
337 		.output		= dst_discard_out,
338 	},
339 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
340 	.rt6i_protocol  = RTPROT_KERNEL,
341 	.rt6i_metric	= ~(u32) 0,
342 	.rt6i_ref	= ATOMIC_INIT(1),
343 };
344 
345 #endif
346 
347 static void rt6_info_init(struct rt6_info *rt)
348 {
349 	struct dst_entry *dst = &rt->dst;
350 
351 	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
352 	INIT_LIST_HEAD(&rt->rt6i_siblings);
353 	INIT_LIST_HEAD(&rt->rt6i_uncached);
354 }
355 
356 /* allocate dst with ip6_dst_ops */
357 static struct rt6_info *__ip6_dst_alloc(struct net *net,
358 					struct net_device *dev,
359 					int flags)
360 {
361 	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
362 					1, DST_OBSOLETE_FORCE_CHK, flags);
363 
364 	if (rt) {
365 		rt6_info_init(rt);
366 		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
367 	}
368 
369 	return rt;
370 }
371 
372 struct rt6_info *ip6_dst_alloc(struct net *net,
373 			       struct net_device *dev,
374 			       int flags)
375 {
376 	struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
377 
378 	if (rt) {
379 		rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
380 		if (!rt->rt6i_pcpu) {
381 			dst_release_immediate(&rt->dst);
382 			return NULL;
383 		}
384 	}
385 
386 	return rt;
387 }
388 EXPORT_SYMBOL(ip6_dst_alloc);
389 
390 static void ip6_dst_destroy(struct dst_entry *dst)
391 {
392 	struct rt6_info *rt = (struct rt6_info *)dst;
393 	struct rt6_exception_bucket *bucket;
394 	struct dst_entry *from = dst->from;
395 	struct inet6_dev *idev;
396 
397 	dst_destroy_metrics_generic(dst);
398 	free_percpu(rt->rt6i_pcpu);
399 	rt6_uncached_list_del(rt);
400 
401 	idev = rt->rt6i_idev;
402 	if (idev) {
403 		rt->rt6i_idev = NULL;
404 		in6_dev_put(idev);
405 	}
406 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1);
407 	if (bucket) {
408 		rt->rt6i_exception_bucket = NULL;
409 		kfree(bucket);
410 	}
411 
412 	dst->from = NULL;
413 	dst_release(from);
414 }
415 
416 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
417 			   int how)
418 {
419 	struct rt6_info *rt = (struct rt6_info *)dst;
420 	struct inet6_dev *idev = rt->rt6i_idev;
421 	struct net_device *loopback_dev =
422 		dev_net(dev)->loopback_dev;
423 
424 	if (idev && idev->dev != loopback_dev) {
425 		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
426 		if (loopback_idev) {
427 			rt->rt6i_idev = loopback_idev;
428 			in6_dev_put(idev);
429 		}
430 	}
431 }
432 
433 static bool __rt6_check_expired(const struct rt6_info *rt)
434 {
435 	if (rt->rt6i_flags & RTF_EXPIRES)
436 		return time_after(jiffies, rt->dst.expires);
437 	else
438 		return false;
439 }
440 
441 static bool rt6_check_expired(const struct rt6_info *rt)
442 {
443 	if (rt->rt6i_flags & RTF_EXPIRES) {
444 		if (time_after(jiffies, rt->dst.expires))
445 			return true;
446 	} else if (rt->dst.from) {
447 		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
448 		       rt6_check_expired((struct rt6_info *)rt->dst.from);
449 	}
450 	return false;
451 }
452 
453 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
454 					     struct flowi6 *fl6, int oif,
455 					     int strict)
456 {
457 	struct rt6_info *sibling, *next_sibling;
458 	int route_choosen;
459 
460 	/* We might have already computed the hash for ICMPv6 errors. In such
461 	 * case it will always be non-zero. Otherwise now is the time to do it.
462 	 */
463 	if (!fl6->mp_hash)
464 		fl6->mp_hash = rt6_multipath_hash(fl6, NULL);
465 
466 	route_choosen = fl6->mp_hash % (match->rt6i_nsiblings + 1);
467 	/* Don't change the route, if route_choosen == 0
468 	 * (siblings does not include ourself)
469 	 */
470 	if (route_choosen)
471 		list_for_each_entry_safe(sibling, next_sibling,
472 				&match->rt6i_siblings, rt6i_siblings) {
473 			route_choosen--;
474 			if (route_choosen == 0) {
475 				struct inet6_dev *idev = sibling->rt6i_idev;
476 
477 				if (!netif_carrier_ok(sibling->dst.dev) &&
478 				    idev->cnf.ignore_routes_with_linkdown)
479 					break;
480 				if (rt6_score_route(sibling, oif, strict) < 0)
481 					break;
482 				match = sibling;
483 				break;
484 			}
485 		}
486 	return match;
487 }
488 
489 /*
490  *	Route lookup. rcu_read_lock() should be held.
491  */
492 
493 static inline struct rt6_info *rt6_device_match(struct net *net,
494 						    struct rt6_info *rt,
495 						    const struct in6_addr *saddr,
496 						    int oif,
497 						    int flags)
498 {
499 	struct rt6_info *local = NULL;
500 	struct rt6_info *sprt;
501 
502 	if (!oif && ipv6_addr_any(saddr))
503 		goto out;
504 
505 	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->dst.rt6_next)) {
506 		struct net_device *dev = sprt->dst.dev;
507 
508 		if (oif) {
509 			if (dev->ifindex == oif)
510 				return sprt;
511 			if (dev->flags & IFF_LOOPBACK) {
512 				if (!sprt->rt6i_idev ||
513 				    sprt->rt6i_idev->dev->ifindex != oif) {
514 					if (flags & RT6_LOOKUP_F_IFACE)
515 						continue;
516 					if (local &&
517 					    local->rt6i_idev->dev->ifindex == oif)
518 						continue;
519 				}
520 				local = sprt;
521 			}
522 		} else {
523 			if (ipv6_chk_addr(net, saddr, dev,
524 					  flags & RT6_LOOKUP_F_IFACE))
525 				return sprt;
526 		}
527 	}
528 
529 	if (oif) {
530 		if (local)
531 			return local;
532 
533 		if (flags & RT6_LOOKUP_F_IFACE)
534 			return net->ipv6.ip6_null_entry;
535 	}
536 out:
537 	return rt;
538 }
539 
540 #ifdef CONFIG_IPV6_ROUTER_PREF
541 struct __rt6_probe_work {
542 	struct work_struct work;
543 	struct in6_addr target;
544 	struct net_device *dev;
545 };
546 
547 static void rt6_probe_deferred(struct work_struct *w)
548 {
549 	struct in6_addr mcaddr;
550 	struct __rt6_probe_work *work =
551 		container_of(w, struct __rt6_probe_work, work);
552 
553 	addrconf_addr_solict_mult(&work->target, &mcaddr);
554 	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
555 	dev_put(work->dev);
556 	kfree(work);
557 }
558 
559 static void rt6_probe(struct rt6_info *rt)
560 {
561 	struct __rt6_probe_work *work;
562 	struct neighbour *neigh;
563 	/*
564 	 * Okay, this does not seem to be appropriate
565 	 * for now, however, we need to check if it
566 	 * is really so; aka Router Reachability Probing.
567 	 *
568 	 * Router Reachability Probe MUST be rate-limited
569 	 * to no more than one per minute.
570 	 */
571 	if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
572 		return;
573 	rcu_read_lock_bh();
574 	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
575 	if (neigh) {
576 		if (neigh->nud_state & NUD_VALID)
577 			goto out;
578 
579 		work = NULL;
580 		write_lock(&neigh->lock);
581 		if (!(neigh->nud_state & NUD_VALID) &&
582 		    time_after(jiffies,
583 			       neigh->updated +
584 			       rt->rt6i_idev->cnf.rtr_probe_interval)) {
585 			work = kmalloc(sizeof(*work), GFP_ATOMIC);
586 			if (work)
587 				__neigh_set_probe_once(neigh);
588 		}
589 		write_unlock(&neigh->lock);
590 	} else {
591 		work = kmalloc(sizeof(*work), GFP_ATOMIC);
592 	}
593 
594 	if (work) {
595 		INIT_WORK(&work->work, rt6_probe_deferred);
596 		work->target = rt->rt6i_gateway;
597 		dev_hold(rt->dst.dev);
598 		work->dev = rt->dst.dev;
599 		schedule_work(&work->work);
600 	}
601 
602 out:
603 	rcu_read_unlock_bh();
604 }
605 #else
606 static inline void rt6_probe(struct rt6_info *rt)
607 {
608 }
609 #endif
610 
611 /*
612  * Default Router Selection (RFC 2461 6.3.6)
613  */
614 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
615 {
616 	struct net_device *dev = rt->dst.dev;
617 	if (!oif || dev->ifindex == oif)
618 		return 2;
619 	if ((dev->flags & IFF_LOOPBACK) &&
620 	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
621 		return 1;
622 	return 0;
623 }
624 
625 static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
626 {
627 	struct neighbour *neigh;
628 	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
629 
630 	if (rt->rt6i_flags & RTF_NONEXTHOP ||
631 	    !(rt->rt6i_flags & RTF_GATEWAY))
632 		return RT6_NUD_SUCCEED;
633 
634 	rcu_read_lock_bh();
635 	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
636 	if (neigh) {
637 		read_lock(&neigh->lock);
638 		if (neigh->nud_state & NUD_VALID)
639 			ret = RT6_NUD_SUCCEED;
640 #ifdef CONFIG_IPV6_ROUTER_PREF
641 		else if (!(neigh->nud_state & NUD_FAILED))
642 			ret = RT6_NUD_SUCCEED;
643 		else
644 			ret = RT6_NUD_FAIL_PROBE;
645 #endif
646 		read_unlock(&neigh->lock);
647 	} else {
648 		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
649 		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
650 	}
651 	rcu_read_unlock_bh();
652 
653 	return ret;
654 }
655 
656 static int rt6_score_route(struct rt6_info *rt, int oif,
657 			   int strict)
658 {
659 	int m;
660 
661 	m = rt6_check_dev(rt, oif);
662 	if (!m && (strict & RT6_LOOKUP_F_IFACE))
663 		return RT6_NUD_FAIL_HARD;
664 #ifdef CONFIG_IPV6_ROUTER_PREF
665 	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
666 #endif
667 	if (strict & RT6_LOOKUP_F_REACHABLE) {
668 		int n = rt6_check_neigh(rt);
669 		if (n < 0)
670 			return n;
671 	}
672 	return m;
673 }
674 
675 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
676 				   int *mpri, struct rt6_info *match,
677 				   bool *do_rr)
678 {
679 	int m;
680 	bool match_do_rr = false;
681 	struct inet6_dev *idev = rt->rt6i_idev;
682 	struct net_device *dev = rt->dst.dev;
683 
684 	if (dev && !netif_carrier_ok(dev) &&
685 	    idev->cnf.ignore_routes_with_linkdown &&
686 	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
687 		goto out;
688 
689 	if (rt6_check_expired(rt))
690 		goto out;
691 
692 	m = rt6_score_route(rt, oif, strict);
693 	if (m == RT6_NUD_FAIL_DO_RR) {
694 		match_do_rr = true;
695 		m = 0; /* lowest valid score */
696 	} else if (m == RT6_NUD_FAIL_HARD) {
697 		goto out;
698 	}
699 
700 	if (strict & RT6_LOOKUP_F_REACHABLE)
701 		rt6_probe(rt);
702 
703 	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
704 	if (m > *mpri) {
705 		*do_rr = match_do_rr;
706 		*mpri = m;
707 		match = rt;
708 	}
709 out:
710 	return match;
711 }
712 
713 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
714 				     struct rt6_info *leaf,
715 				     struct rt6_info *rr_head,
716 				     u32 metric, int oif, int strict,
717 				     bool *do_rr)
718 {
719 	struct rt6_info *rt, *match, *cont;
720 	int mpri = -1;
721 
722 	match = NULL;
723 	cont = NULL;
724 	for (rt = rr_head; rt; rt = rcu_dereference(rt->dst.rt6_next)) {
725 		if (rt->rt6i_metric != metric) {
726 			cont = rt;
727 			break;
728 		}
729 
730 		match = find_match(rt, oif, strict, &mpri, match, do_rr);
731 	}
732 
733 	for (rt = leaf; rt && rt != rr_head;
734 	     rt = rcu_dereference(rt->dst.rt6_next)) {
735 		if (rt->rt6i_metric != metric) {
736 			cont = rt;
737 			break;
738 		}
739 
740 		match = find_match(rt, oif, strict, &mpri, match, do_rr);
741 	}
742 
743 	if (match || !cont)
744 		return match;
745 
746 	for (rt = cont; rt; rt = rcu_dereference(rt->dst.rt6_next))
747 		match = find_match(rt, oif, strict, &mpri, match, do_rr);
748 
749 	return match;
750 }
751 
752 static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn,
753 				   int oif, int strict)
754 {
755 	struct rt6_info *leaf = rcu_dereference(fn->leaf);
756 	struct rt6_info *match, *rt0;
757 	bool do_rr = false;
758 	int key_plen;
759 
760 	if (!leaf || leaf == net->ipv6.ip6_null_entry)
761 		return net->ipv6.ip6_null_entry;
762 
763 	rt0 = rcu_dereference(fn->rr_ptr);
764 	if (!rt0)
765 		rt0 = leaf;
766 
767 	/* Double check to make sure fn is not an intermediate node
768 	 * and fn->leaf does not points to its child's leaf
769 	 * (This might happen if all routes under fn are deleted from
770 	 * the tree and fib6_repair_tree() is called on the node.)
771 	 */
772 	key_plen = rt0->rt6i_dst.plen;
773 #ifdef CONFIG_IPV6_SUBTREES
774 	if (rt0->rt6i_src.plen)
775 		key_plen = rt0->rt6i_src.plen;
776 #endif
777 	if (fn->fn_bit != key_plen)
778 		return net->ipv6.ip6_null_entry;
779 
780 	match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict,
781 			     &do_rr);
782 
783 	if (do_rr) {
784 		struct rt6_info *next = rcu_dereference(rt0->dst.rt6_next);
785 
786 		/* no entries matched; do round-robin */
787 		if (!next || next->rt6i_metric != rt0->rt6i_metric)
788 			next = leaf;
789 
790 		if (next != rt0) {
791 			spin_lock_bh(&leaf->rt6i_table->tb6_lock);
792 			/* make sure next is not being deleted from the tree */
793 			if (next->rt6i_node)
794 				rcu_assign_pointer(fn->rr_ptr, next);
795 			spin_unlock_bh(&leaf->rt6i_table->tb6_lock);
796 		}
797 	}
798 
799 	return match ? match : net->ipv6.ip6_null_entry;
800 }
801 
802 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
803 {
804 	return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
805 }
806 
807 #ifdef CONFIG_IPV6_ROUTE_INFO
808 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
809 		  const struct in6_addr *gwaddr)
810 {
811 	struct net *net = dev_net(dev);
812 	struct route_info *rinfo = (struct route_info *) opt;
813 	struct in6_addr prefix_buf, *prefix;
814 	unsigned int pref;
815 	unsigned long lifetime;
816 	struct rt6_info *rt;
817 
818 	if (len < sizeof(struct route_info)) {
819 		return -EINVAL;
820 	}
821 
822 	/* Sanity check for prefix_len and length */
823 	if (rinfo->length > 3) {
824 		return -EINVAL;
825 	} else if (rinfo->prefix_len > 128) {
826 		return -EINVAL;
827 	} else if (rinfo->prefix_len > 64) {
828 		if (rinfo->length < 2) {
829 			return -EINVAL;
830 		}
831 	} else if (rinfo->prefix_len > 0) {
832 		if (rinfo->length < 1) {
833 			return -EINVAL;
834 		}
835 	}
836 
837 	pref = rinfo->route_pref;
838 	if (pref == ICMPV6_ROUTER_PREF_INVALID)
839 		return -EINVAL;
840 
841 	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
842 
843 	if (rinfo->length == 3)
844 		prefix = (struct in6_addr *)rinfo->prefix;
845 	else {
846 		/* this function is safe */
847 		ipv6_addr_prefix(&prefix_buf,
848 				 (struct in6_addr *)rinfo->prefix,
849 				 rinfo->prefix_len);
850 		prefix = &prefix_buf;
851 	}
852 
853 	if (rinfo->prefix_len == 0)
854 		rt = rt6_get_dflt_router(gwaddr, dev);
855 	else
856 		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
857 					gwaddr, dev);
858 
859 	if (rt && !lifetime) {
860 		ip6_del_rt(rt);
861 		rt = NULL;
862 	}
863 
864 	if (!rt && lifetime)
865 		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
866 					dev, pref);
867 	else if (rt)
868 		rt->rt6i_flags = RTF_ROUTEINFO |
869 				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
870 
871 	if (rt) {
872 		if (!addrconf_finite_timeout(lifetime))
873 			rt6_clean_expires(rt);
874 		else
875 			rt6_set_expires(rt, jiffies + HZ * lifetime);
876 
877 		ip6_rt_put(rt);
878 	}
879 	return 0;
880 }
881 #endif
882 
883 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
884 					struct in6_addr *saddr)
885 {
886 	struct fib6_node *pn, *sn;
887 	while (1) {
888 		if (fn->fn_flags & RTN_TL_ROOT)
889 			return NULL;
890 		pn = rcu_dereference(fn->parent);
891 		sn = FIB6_SUBTREE(pn);
892 		if (sn && sn != fn)
893 			fn = fib6_lookup(sn, NULL, saddr);
894 		else
895 			fn = pn;
896 		if (fn->fn_flags & RTN_RTINFO)
897 			return fn;
898 	}
899 }
900 
901 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
902 			  bool null_fallback)
903 {
904 	struct rt6_info *rt = *prt;
905 
906 	if (dst_hold_safe(&rt->dst))
907 		return true;
908 	if (null_fallback) {
909 		rt = net->ipv6.ip6_null_entry;
910 		dst_hold(&rt->dst);
911 	} else {
912 		rt = NULL;
913 	}
914 	*prt = rt;
915 	return false;
916 }
917 
918 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
919 					     struct fib6_table *table,
920 					     struct flowi6 *fl6, int flags)
921 {
922 	struct rt6_info *rt, *rt_cache;
923 	struct fib6_node *fn;
924 
925 	rcu_read_lock();
926 	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
927 restart:
928 	rt = rcu_dereference(fn->leaf);
929 	if (!rt) {
930 		rt = net->ipv6.ip6_null_entry;
931 	} else {
932 		rt = rt6_device_match(net, rt, &fl6->saddr,
933 				      fl6->flowi6_oif, flags);
934 		if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
935 			rt = rt6_multipath_select(rt, fl6,
936 						  fl6->flowi6_oif, flags);
937 	}
938 	if (rt == net->ipv6.ip6_null_entry) {
939 		fn = fib6_backtrack(fn, &fl6->saddr);
940 		if (fn)
941 			goto restart;
942 	}
943 	/* Search through exception table */
944 	rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
945 	if (rt_cache)
946 		rt = rt_cache;
947 
948 	if (ip6_hold_safe(net, &rt, true))
949 		dst_use_noref(&rt->dst, jiffies);
950 
951 	rcu_read_unlock();
952 
953 	trace_fib6_table_lookup(net, rt, table, fl6);
954 
955 	return rt;
956 
957 }
958 
959 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
960 				    int flags)
961 {
962 	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
963 }
964 EXPORT_SYMBOL_GPL(ip6_route_lookup);
965 
966 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
967 			    const struct in6_addr *saddr, int oif, int strict)
968 {
969 	struct flowi6 fl6 = {
970 		.flowi6_oif = oif,
971 		.daddr = *daddr,
972 	};
973 	struct dst_entry *dst;
974 	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
975 
976 	if (saddr) {
977 		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
978 		flags |= RT6_LOOKUP_F_HAS_SADDR;
979 	}
980 
981 	dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
982 	if (dst->error == 0)
983 		return (struct rt6_info *) dst;
984 
985 	dst_release(dst);
986 
987 	return NULL;
988 }
989 EXPORT_SYMBOL(rt6_lookup);
990 
991 /* ip6_ins_rt is called with FREE table->tb6_lock.
992  * It takes new route entry, the addition fails by any reason the
993  * route is released.
994  * Caller must hold dst before calling it.
995  */
996 
997 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
998 			struct mx6_config *mxc,
999 			struct netlink_ext_ack *extack)
1000 {
1001 	int err;
1002 	struct fib6_table *table;
1003 
1004 	table = rt->rt6i_table;
1005 	spin_lock_bh(&table->tb6_lock);
1006 	err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
1007 	spin_unlock_bh(&table->tb6_lock);
1008 
1009 	return err;
1010 }
1011 
1012 int ip6_ins_rt(struct rt6_info *rt)
1013 {
1014 	struct nl_info info = {	.nl_net = dev_net(rt->dst.dev), };
1015 	struct mx6_config mxc = { .mx = NULL, };
1016 
1017 	/* Hold dst to account for the reference from the fib6 tree */
1018 	dst_hold(&rt->dst);
1019 	return __ip6_ins_rt(rt, &info, &mxc, NULL);
1020 }
1021 
1022 /* called with rcu_lock held */
1023 static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
1024 {
1025 	struct net_device *dev = rt->dst.dev;
1026 
1027 	if (rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) {
1028 		/* for copies of local routes, dst->dev needs to be the
1029 		 * device if it is a master device, the master device if
1030 		 * device is enslaved, and the loopback as the default
1031 		 */
1032 		if (netif_is_l3_slave(dev) &&
1033 		    !rt6_need_strict(&rt->rt6i_dst.addr))
1034 			dev = l3mdev_master_dev_rcu(dev);
1035 		else if (!netif_is_l3_master(dev))
1036 			dev = dev_net(dev)->loopback_dev;
1037 		/* last case is netif_is_l3_master(dev) is true in which
1038 		 * case we want dev returned to be dev
1039 		 */
1040 	}
1041 
1042 	return dev;
1043 }
1044 
1045 static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
1046 					   const struct in6_addr *daddr,
1047 					   const struct in6_addr *saddr)
1048 {
1049 	struct net_device *dev;
1050 	struct rt6_info *rt;
1051 
1052 	/*
1053 	 *	Clone the route.
1054 	 */
1055 
1056 	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
1057 		ort = (struct rt6_info *)ort->dst.from;
1058 
1059 	rcu_read_lock();
1060 	dev = ip6_rt_get_dev_rcu(ort);
1061 	rt = __ip6_dst_alloc(dev_net(dev), dev, 0);
1062 	rcu_read_unlock();
1063 	if (!rt)
1064 		return NULL;
1065 
1066 	ip6_rt_copy_init(rt, ort);
1067 	rt->rt6i_flags |= RTF_CACHE;
1068 	rt->rt6i_metric = 0;
1069 	rt->dst.flags |= DST_HOST;
1070 	rt->rt6i_dst.addr = *daddr;
1071 	rt->rt6i_dst.plen = 128;
1072 
1073 	if (!rt6_is_gw_or_nonexthop(ort)) {
1074 		if (ort->rt6i_dst.plen != 128 &&
1075 		    ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
1076 			rt->rt6i_flags |= RTF_ANYCAST;
1077 #ifdef CONFIG_IPV6_SUBTREES
1078 		if (rt->rt6i_src.plen && saddr) {
1079 			rt->rt6i_src.addr = *saddr;
1080 			rt->rt6i_src.plen = 128;
1081 		}
1082 #endif
1083 	}
1084 
1085 	return rt;
1086 }
1087 
1088 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
1089 {
1090 	struct net_device *dev;
1091 	struct rt6_info *pcpu_rt;
1092 
1093 	rcu_read_lock();
1094 	dev = ip6_rt_get_dev_rcu(rt);
1095 	pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags);
1096 	rcu_read_unlock();
1097 	if (!pcpu_rt)
1098 		return NULL;
1099 	ip6_rt_copy_init(pcpu_rt, rt);
1100 	pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
1101 	pcpu_rt->rt6i_flags |= RTF_PCPU;
1102 	return pcpu_rt;
1103 }
1104 
1105 /* It should be called with rcu_read_lock() acquired */
1106 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
1107 {
1108 	struct rt6_info *pcpu_rt, **p;
1109 
1110 	p = this_cpu_ptr(rt->rt6i_pcpu);
1111 	pcpu_rt = *p;
1112 
1113 	if (pcpu_rt && ip6_hold_safe(NULL, &pcpu_rt, false))
1114 		rt6_dst_from_metrics_check(pcpu_rt);
1115 
1116 	return pcpu_rt;
1117 }
1118 
1119 static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
1120 {
1121 	struct rt6_info *pcpu_rt, *prev, **p;
1122 
1123 	pcpu_rt = ip6_rt_pcpu_alloc(rt);
1124 	if (!pcpu_rt) {
1125 		struct net *net = dev_net(rt->dst.dev);
1126 
1127 		dst_hold(&net->ipv6.ip6_null_entry->dst);
1128 		return net->ipv6.ip6_null_entry;
1129 	}
1130 
1131 	dst_hold(&pcpu_rt->dst);
1132 	p = this_cpu_ptr(rt->rt6i_pcpu);
1133 	prev = cmpxchg(p, NULL, pcpu_rt);
1134 	BUG_ON(prev);
1135 
1136 	rt6_dst_from_metrics_check(pcpu_rt);
1137 	return pcpu_rt;
1138 }
1139 
1140 /* exception hash table implementation
1141  */
1142 static DEFINE_SPINLOCK(rt6_exception_lock);
1143 
1144 /* Remove rt6_ex from hash table and free the memory
1145  * Caller must hold rt6_exception_lock
1146  */
1147 static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1148 				 struct rt6_exception *rt6_ex)
1149 {
1150 	struct net *net;
1151 
1152 	if (!bucket || !rt6_ex)
1153 		return;
1154 
1155 	net = dev_net(rt6_ex->rt6i->dst.dev);
1156 	rt6_ex->rt6i->rt6i_node = NULL;
1157 	hlist_del_rcu(&rt6_ex->hlist);
1158 	rt6_release(rt6_ex->rt6i);
1159 	kfree_rcu(rt6_ex, rcu);
1160 	WARN_ON_ONCE(!bucket->depth);
1161 	bucket->depth--;
1162 	net->ipv6.rt6_stats->fib_rt_cache--;
1163 }
1164 
1165 /* Remove oldest rt6_ex in bucket and free the memory
1166  * Caller must hold rt6_exception_lock
1167  */
1168 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1169 {
1170 	struct rt6_exception *rt6_ex, *oldest = NULL;
1171 
1172 	if (!bucket)
1173 		return;
1174 
1175 	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1176 		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1177 			oldest = rt6_ex;
1178 	}
1179 	rt6_remove_exception(bucket, oldest);
1180 }
1181 
1182 static u32 rt6_exception_hash(const struct in6_addr *dst,
1183 			      const struct in6_addr *src)
1184 {
1185 	static u32 seed __read_mostly;
1186 	u32 val;
1187 
1188 	net_get_random_once(&seed, sizeof(seed));
1189 	val = jhash(dst, sizeof(*dst), seed);
1190 
1191 #ifdef CONFIG_IPV6_SUBTREES
1192 	if (src)
1193 		val = jhash(src, sizeof(*src), val);
1194 #endif
1195 	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1196 }
1197 
1198 /* Helper function to find the cached rt in the hash table
1199  * and update bucket pointer to point to the bucket for this
1200  * (daddr, saddr) pair
1201  * Caller must hold rt6_exception_lock
1202  */
1203 static struct rt6_exception *
1204 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1205 			      const struct in6_addr *daddr,
1206 			      const struct in6_addr *saddr)
1207 {
1208 	struct rt6_exception *rt6_ex;
1209 	u32 hval;
1210 
1211 	if (!(*bucket) || !daddr)
1212 		return NULL;
1213 
1214 	hval = rt6_exception_hash(daddr, saddr);
1215 	*bucket += hval;
1216 
1217 	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1218 		struct rt6_info *rt6 = rt6_ex->rt6i;
1219 		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1220 
1221 #ifdef CONFIG_IPV6_SUBTREES
1222 		if (matched && saddr)
1223 			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1224 #endif
1225 		if (matched)
1226 			return rt6_ex;
1227 	}
1228 	return NULL;
1229 }
1230 
1231 /* Helper function to find the cached rt in the hash table
1232  * and update bucket pointer to point to the bucket for this
1233  * (daddr, saddr) pair
1234  * Caller must hold rcu_read_lock()
1235  */
1236 static struct rt6_exception *
1237 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1238 			 const struct in6_addr *daddr,
1239 			 const struct in6_addr *saddr)
1240 {
1241 	struct rt6_exception *rt6_ex;
1242 	u32 hval;
1243 
1244 	WARN_ON_ONCE(!rcu_read_lock_held());
1245 
1246 	if (!(*bucket) || !daddr)
1247 		return NULL;
1248 
1249 	hval = rt6_exception_hash(daddr, saddr);
1250 	*bucket += hval;
1251 
1252 	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1253 		struct rt6_info *rt6 = rt6_ex->rt6i;
1254 		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1255 
1256 #ifdef CONFIG_IPV6_SUBTREES
1257 		if (matched && saddr)
1258 			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1259 #endif
1260 		if (matched)
1261 			return rt6_ex;
1262 	}
1263 	return NULL;
1264 }
1265 
1266 static int rt6_insert_exception(struct rt6_info *nrt,
1267 				struct rt6_info *ort)
1268 {
1269 	struct net *net = dev_net(ort->dst.dev);
1270 	struct rt6_exception_bucket *bucket;
1271 	struct in6_addr *src_key = NULL;
1272 	struct rt6_exception *rt6_ex;
1273 	int err = 0;
1274 
1275 	/* ort can't be a cache or pcpu route */
1276 	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
1277 		ort = (struct rt6_info *)ort->dst.from;
1278 	WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU));
1279 
1280 	spin_lock_bh(&rt6_exception_lock);
1281 
1282 	if (ort->exception_bucket_flushed) {
1283 		err = -EINVAL;
1284 		goto out;
1285 	}
1286 
1287 	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
1288 					lockdep_is_held(&rt6_exception_lock));
1289 	if (!bucket) {
1290 		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
1291 				 GFP_ATOMIC);
1292 		if (!bucket) {
1293 			err = -ENOMEM;
1294 			goto out;
1295 		}
1296 		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
1297 	}
1298 
1299 #ifdef CONFIG_IPV6_SUBTREES
1300 	/* rt6i_src.plen != 0 indicates ort is in subtree
1301 	 * and exception table is indexed by a hash of
1302 	 * both rt6i_dst and rt6i_src.
1303 	 * Otherwise, the exception table is indexed by
1304 	 * a hash of only rt6i_dst.
1305 	 */
1306 	if (ort->rt6i_src.plen)
1307 		src_key = &nrt->rt6i_src.addr;
1308 #endif
1309 
1310 	/* Update rt6i_prefsrc as it could be changed
1311 	 * in rt6_remove_prefsrc()
1312 	 */
1313 	nrt->rt6i_prefsrc = ort->rt6i_prefsrc;
1314 	/* rt6_mtu_change() might lower mtu on ort.
1315 	 * Only insert this exception route if its mtu
1316 	 * is less than ort's mtu value.
1317 	 */
1318 	if (nrt->rt6i_pmtu >= dst_mtu(&ort->dst)) {
1319 		err = -EINVAL;
1320 		goto out;
1321 	}
1322 
1323 	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
1324 					       src_key);
1325 	if (rt6_ex)
1326 		rt6_remove_exception(bucket, rt6_ex);
1327 
1328 	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
1329 	if (!rt6_ex) {
1330 		err = -ENOMEM;
1331 		goto out;
1332 	}
1333 	rt6_ex->rt6i = nrt;
1334 	rt6_ex->stamp = jiffies;
1335 	atomic_inc(&nrt->rt6i_ref);
1336 	nrt->rt6i_node = ort->rt6i_node;
1337 	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
1338 	bucket->depth++;
1339 	net->ipv6.rt6_stats->fib_rt_cache++;
1340 
1341 	if (bucket->depth > FIB6_MAX_DEPTH)
1342 		rt6_exception_remove_oldest(bucket);
1343 
1344 out:
1345 	spin_unlock_bh(&rt6_exception_lock);
1346 
1347 	/* Update fn->fn_sernum to invalidate all cached dst */
1348 	if (!err) {
1349 		fib6_update_sernum(ort);
1350 		fib6_force_start_gc(net);
1351 	}
1352 
1353 	return err;
1354 }
1355 
1356 void rt6_flush_exceptions(struct rt6_info *rt)
1357 {
1358 	struct rt6_exception_bucket *bucket;
1359 	struct rt6_exception *rt6_ex;
1360 	struct hlist_node *tmp;
1361 	int i;
1362 
1363 	spin_lock_bh(&rt6_exception_lock);
1364 	/* Prevent rt6_insert_exception() to recreate the bucket list */
1365 	rt->exception_bucket_flushed = 1;
1366 
1367 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1368 				    lockdep_is_held(&rt6_exception_lock));
1369 	if (!bucket)
1370 		goto out;
1371 
1372 	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1373 		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1374 			rt6_remove_exception(bucket, rt6_ex);
1375 		WARN_ON_ONCE(bucket->depth);
1376 		bucket++;
1377 	}
1378 
1379 out:
1380 	spin_unlock_bh(&rt6_exception_lock);
1381 }
1382 
1383 /* Find cached rt in the hash table inside passed in rt
1384  * Caller has to hold rcu_read_lock()
1385  */
1386 static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
1387 					   struct in6_addr *daddr,
1388 					   struct in6_addr *saddr)
1389 {
1390 	struct rt6_exception_bucket *bucket;
1391 	struct in6_addr *src_key = NULL;
1392 	struct rt6_exception *rt6_ex;
1393 	struct rt6_info *res = NULL;
1394 
1395 	bucket = rcu_dereference(rt->rt6i_exception_bucket);
1396 
1397 #ifdef CONFIG_IPV6_SUBTREES
1398 	/* rt6i_src.plen != 0 indicates rt is in subtree
1399 	 * and exception table is indexed by a hash of
1400 	 * both rt6i_dst and rt6i_src.
1401 	 * Otherwise, the exception table is indexed by
1402 	 * a hash of only rt6i_dst.
1403 	 */
1404 	if (rt->rt6i_src.plen)
1405 		src_key = saddr;
1406 #endif
1407 	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
1408 
1409 	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1410 		res = rt6_ex->rt6i;
1411 
1412 	return res;
1413 }
1414 
1415 /* Remove the passed in cached rt from the hash table that contains it */
1416 int rt6_remove_exception_rt(struct rt6_info *rt)
1417 {
1418 	struct rt6_info *from = (struct rt6_info *)rt->dst.from;
1419 	struct rt6_exception_bucket *bucket;
1420 	struct in6_addr *src_key = NULL;
1421 	struct rt6_exception *rt6_ex;
1422 	int err;
1423 
1424 	if (!from ||
1425 	    !(rt->rt6i_flags & RTF_CACHE))
1426 		return -EINVAL;
1427 
1428 	if (!rcu_access_pointer(from->rt6i_exception_bucket))
1429 		return -ENOENT;
1430 
1431 	spin_lock_bh(&rt6_exception_lock);
1432 	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1433 				    lockdep_is_held(&rt6_exception_lock));
1434 #ifdef CONFIG_IPV6_SUBTREES
1435 	/* rt6i_src.plen != 0 indicates 'from' is in subtree
1436 	 * and exception table is indexed by a hash of
1437 	 * both rt6i_dst and rt6i_src.
1438 	 * Otherwise, the exception table is indexed by
1439 	 * a hash of only rt6i_dst.
1440 	 */
1441 	if (from->rt6i_src.plen)
1442 		src_key = &rt->rt6i_src.addr;
1443 #endif
1444 	rt6_ex = __rt6_find_exception_spinlock(&bucket,
1445 					       &rt->rt6i_dst.addr,
1446 					       src_key);
1447 	if (rt6_ex) {
1448 		rt6_remove_exception(bucket, rt6_ex);
1449 		err = 0;
1450 	} else {
1451 		err = -ENOENT;
1452 	}
1453 
1454 	spin_unlock_bh(&rt6_exception_lock);
1455 	return err;
1456 }
1457 
1458 /* Find rt6_ex which contains the passed in rt cache and
1459  * refresh its stamp
1460  */
1461 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1462 {
1463 	struct rt6_info *from = (struct rt6_info *)rt->dst.from;
1464 	struct rt6_exception_bucket *bucket;
1465 	struct in6_addr *src_key = NULL;
1466 	struct rt6_exception *rt6_ex;
1467 
1468 	if (!from ||
1469 	    !(rt->rt6i_flags & RTF_CACHE))
1470 		return;
1471 
1472 	rcu_read_lock();
1473 	bucket = rcu_dereference(from->rt6i_exception_bucket);
1474 
1475 #ifdef CONFIG_IPV6_SUBTREES
1476 	/* rt6i_src.plen != 0 indicates 'from' is in subtree
1477 	 * and exception table is indexed by a hash of
1478 	 * both rt6i_dst and rt6i_src.
1479 	 * Otherwise, the exception table is indexed by
1480 	 * a hash of only rt6i_dst.
1481 	 */
1482 	if (from->rt6i_src.plen)
1483 		src_key = &rt->rt6i_src.addr;
1484 #endif
1485 	rt6_ex = __rt6_find_exception_rcu(&bucket,
1486 					  &rt->rt6i_dst.addr,
1487 					  src_key);
1488 	if (rt6_ex)
1489 		rt6_ex->stamp = jiffies;
1490 
1491 	rcu_read_unlock();
1492 }
1493 
1494 static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt)
1495 {
1496 	struct rt6_exception_bucket *bucket;
1497 	struct rt6_exception *rt6_ex;
1498 	int i;
1499 
1500 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1501 					lockdep_is_held(&rt6_exception_lock));
1502 
1503 	if (bucket) {
1504 		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1505 			hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1506 				rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
1507 			}
1508 			bucket++;
1509 		}
1510 	}
1511 }
1512 
1513 static void rt6_exceptions_update_pmtu(struct rt6_info *rt, int mtu)
1514 {
1515 	struct rt6_exception_bucket *bucket;
1516 	struct rt6_exception *rt6_ex;
1517 	int i;
1518 
1519 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1520 					lockdep_is_held(&rt6_exception_lock));
1521 
1522 	if (bucket) {
1523 		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1524 			hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1525 				struct rt6_info *entry = rt6_ex->rt6i;
1526 				/* For RTF_CACHE with rt6i_pmtu == 0
1527 				 * (i.e. a redirected route),
1528 				 * the metrics of its rt->dst.from has already
1529 				 * been updated.
1530 				 */
1531 				if (entry->rt6i_pmtu && entry->rt6i_pmtu > mtu)
1532 					entry->rt6i_pmtu = mtu;
1533 			}
1534 			bucket++;
1535 		}
1536 	}
1537 }
1538 
1539 #define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)
1540 
1541 static void rt6_exceptions_clean_tohost(struct rt6_info *rt,
1542 					struct in6_addr *gateway)
1543 {
1544 	struct rt6_exception_bucket *bucket;
1545 	struct rt6_exception *rt6_ex;
1546 	struct hlist_node *tmp;
1547 	int i;
1548 
1549 	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1550 		return;
1551 
1552 	spin_lock_bh(&rt6_exception_lock);
1553 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1554 				     lockdep_is_held(&rt6_exception_lock));
1555 
1556 	if (bucket) {
1557 		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1558 			hlist_for_each_entry_safe(rt6_ex, tmp,
1559 						  &bucket->chain, hlist) {
1560 				struct rt6_info *entry = rt6_ex->rt6i;
1561 
1562 				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
1563 				    RTF_CACHE_GATEWAY &&
1564 				    ipv6_addr_equal(gateway,
1565 						    &entry->rt6i_gateway)) {
1566 					rt6_remove_exception(bucket, rt6_ex);
1567 				}
1568 			}
1569 			bucket++;
1570 		}
1571 	}
1572 
1573 	spin_unlock_bh(&rt6_exception_lock);
1574 }
1575 
1576 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
1577 				      struct rt6_exception *rt6_ex,
1578 				      struct fib6_gc_args *gc_args,
1579 				      unsigned long now)
1580 {
1581 	struct rt6_info *rt = rt6_ex->rt6i;
1582 
1583 	/* we are pruning and obsoleting aged-out and non gateway exceptions
1584 	 * even if others have still references to them, so that on next
1585 	 * dst_check() such references can be dropped.
1586 	 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
1587 	 * expired, independently from their aging, as per RFC 8201 section 4
1588 	 */
1589 	if (!(rt->rt6i_flags & RTF_EXPIRES) &&
1590 	    time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
1591 		RT6_TRACE("aging clone %p\n", rt);
1592 		rt6_remove_exception(bucket, rt6_ex);
1593 		return;
1594 	} else if (rt->rt6i_flags & RTF_GATEWAY) {
1595 		struct neighbour *neigh;
1596 		__u8 neigh_flags = 0;
1597 
1598 		neigh = dst_neigh_lookup(&rt->dst, &rt->rt6i_gateway);
1599 		if (neigh) {
1600 			neigh_flags = neigh->flags;
1601 			neigh_release(neigh);
1602 		}
1603 		if (!(neigh_flags & NTF_ROUTER)) {
1604 			RT6_TRACE("purging route %p via non-router but gateway\n",
1605 				  rt);
1606 			rt6_remove_exception(bucket, rt6_ex);
1607 			return;
1608 		}
1609 	} else if (__rt6_check_expired(rt)) {
1610 		RT6_TRACE("purging expired route %p\n", rt);
1611 		rt6_remove_exception(bucket, rt6_ex);
1612 		return;
1613 	}
1614 	gc_args->more++;
1615 }
1616 
1617 void rt6_age_exceptions(struct rt6_info *rt,
1618 			struct fib6_gc_args *gc_args,
1619 			unsigned long now)
1620 {
1621 	struct rt6_exception_bucket *bucket;
1622 	struct rt6_exception *rt6_ex;
1623 	struct hlist_node *tmp;
1624 	int i;
1625 
1626 	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1627 		return;
1628 
1629 	spin_lock_bh(&rt6_exception_lock);
1630 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1631 				    lockdep_is_held(&rt6_exception_lock));
1632 
1633 	if (bucket) {
1634 		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1635 			hlist_for_each_entry_safe(rt6_ex, tmp,
1636 						  &bucket->chain, hlist) {
1637 				rt6_age_examine_exception(bucket, rt6_ex,
1638 							  gc_args, now);
1639 			}
1640 			bucket++;
1641 		}
1642 	}
1643 	spin_unlock_bh(&rt6_exception_lock);
1644 }
1645 
1646 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1647 			       int oif, struct flowi6 *fl6, int flags)
1648 {
1649 	struct fib6_node *fn, *saved_fn;
1650 	struct rt6_info *rt, *rt_cache;
1651 	int strict = 0;
1652 
1653 	strict |= flags & RT6_LOOKUP_F_IFACE;
1654 	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1655 	if (net->ipv6.devconf_all->forwarding == 0)
1656 		strict |= RT6_LOOKUP_F_REACHABLE;
1657 
1658 	rcu_read_lock();
1659 
1660 	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1661 	saved_fn = fn;
1662 
1663 	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1664 		oif = 0;
1665 
1666 redo_rt6_select:
1667 	rt = rt6_select(net, fn, oif, strict);
1668 	if (rt->rt6i_nsiblings)
1669 		rt = rt6_multipath_select(rt, fl6, oif, strict);
1670 	if (rt == net->ipv6.ip6_null_entry) {
1671 		fn = fib6_backtrack(fn, &fl6->saddr);
1672 		if (fn)
1673 			goto redo_rt6_select;
1674 		else if (strict & RT6_LOOKUP_F_REACHABLE) {
1675 			/* also consider unreachable route */
1676 			strict &= ~RT6_LOOKUP_F_REACHABLE;
1677 			fn = saved_fn;
1678 			goto redo_rt6_select;
1679 		}
1680 	}
1681 
1682 	/*Search through exception table */
1683 	rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
1684 	if (rt_cache)
1685 		rt = rt_cache;
1686 
1687 	if (rt == net->ipv6.ip6_null_entry) {
1688 		rcu_read_unlock();
1689 		dst_hold(&rt->dst);
1690 		trace_fib6_table_lookup(net, rt, table, fl6);
1691 		return rt;
1692 	} else if (rt->rt6i_flags & RTF_CACHE) {
1693 		if (ip6_hold_safe(net, &rt, true)) {
1694 			dst_use_noref(&rt->dst, jiffies);
1695 			rt6_dst_from_metrics_check(rt);
1696 		}
1697 		rcu_read_unlock();
1698 		trace_fib6_table_lookup(net, rt, table, fl6);
1699 		return rt;
1700 	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1701 			    !(rt->rt6i_flags & RTF_GATEWAY))) {
1702 		/* Create a RTF_CACHE clone which will not be
1703 		 * owned by the fib6 tree.  It is for the special case where
1704 		 * the daddr in the skb during the neighbor look-up is different
1705 		 * from the fl6->daddr used to look-up route here.
1706 		 */
1707 
1708 		struct rt6_info *uncached_rt;
1709 
1710 		if (ip6_hold_safe(net, &rt, true)) {
1711 			dst_use_noref(&rt->dst, jiffies);
1712 		} else {
1713 			rcu_read_unlock();
1714 			uncached_rt = rt;
1715 			goto uncached_rt_out;
1716 		}
1717 		rcu_read_unlock();
1718 
1719 		uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
1720 		dst_release(&rt->dst);
1721 
1722 		if (uncached_rt) {
1723 			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1724 			 * No need for another dst_hold()
1725 			 */
1726 			rt6_uncached_list_add(uncached_rt);
1727 			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
1728 		} else {
1729 			uncached_rt = net->ipv6.ip6_null_entry;
1730 			dst_hold(&uncached_rt->dst);
1731 		}
1732 
1733 uncached_rt_out:
1734 		trace_fib6_table_lookup(net, uncached_rt, table, fl6);
1735 		return uncached_rt;
1736 
1737 	} else {
1738 		/* Get a percpu copy */
1739 
1740 		struct rt6_info *pcpu_rt;
1741 
1742 		dst_use_noref(&rt->dst, jiffies);
1743 		local_bh_disable();
1744 		pcpu_rt = rt6_get_pcpu_route(rt);
1745 
1746 		if (!pcpu_rt) {
1747 			/* atomic_inc_not_zero() is needed when using rcu */
1748 			if (atomic_inc_not_zero(&rt->rt6i_ref)) {
1749 				/* No dst_hold() on rt is needed because grabbing
1750 				 * rt->rt6i_ref makes sure rt can't be released.
1751 				 */
1752 				pcpu_rt = rt6_make_pcpu_route(rt);
1753 				rt6_release(rt);
1754 			} else {
1755 				/* rt is already removed from tree */
1756 				pcpu_rt = net->ipv6.ip6_null_entry;
1757 				dst_hold(&pcpu_rt->dst);
1758 			}
1759 		}
1760 		local_bh_enable();
1761 		rcu_read_unlock();
1762 		trace_fib6_table_lookup(net, pcpu_rt, table, fl6);
1763 		return pcpu_rt;
1764 	}
1765 }
1766 EXPORT_SYMBOL_GPL(ip6_pol_route);
1767 
1768 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
1769 					    struct flowi6 *fl6, int flags)
1770 {
1771 	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
1772 }
1773 
1774 struct dst_entry *ip6_route_input_lookup(struct net *net,
1775 					 struct net_device *dev,
1776 					 struct flowi6 *fl6, int flags)
1777 {
1778 	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1779 		flags |= RT6_LOOKUP_F_IFACE;
1780 
1781 	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
1782 }
1783 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1784 
1785 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1786 				  struct flow_keys *keys)
1787 {
1788 	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1789 	const struct ipv6hdr *key_iph = outer_iph;
1790 	const struct ipv6hdr *inner_iph;
1791 	const struct icmp6hdr *icmph;
1792 	struct ipv6hdr _inner_iph;
1793 
1794 	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1795 		goto out;
1796 
1797 	icmph = icmp6_hdr(skb);
1798 	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1799 	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1800 	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1801 	    icmph->icmp6_type != ICMPV6_PARAMPROB)
1802 		goto out;
1803 
1804 	inner_iph = skb_header_pointer(skb,
1805 				       skb_transport_offset(skb) + sizeof(*icmph),
1806 				       sizeof(_inner_iph), &_inner_iph);
1807 	if (!inner_iph)
1808 		goto out;
1809 
1810 	key_iph = inner_iph;
1811 out:
1812 	memset(keys, 0, sizeof(*keys));
1813 	keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1814 	keys->addrs.v6addrs.src = key_iph->saddr;
1815 	keys->addrs.v6addrs.dst = key_iph->daddr;
1816 	keys->tags.flow_label = ip6_flowinfo(key_iph);
1817 	keys->basic.ip_proto = key_iph->nexthdr;
1818 }
1819 
1820 /* if skb is set it will be used and fl6 can be NULL */
1821 u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb)
1822 {
1823 	struct flow_keys hash_keys;
1824 
1825 	if (skb) {
1826 		ip6_multipath_l3_keys(skb, &hash_keys);
1827 		return flow_hash_from_keys(&hash_keys);
1828 	}
1829 
1830 	return get_hash_from_flowi6(fl6);
1831 }
1832 
1833 void ip6_route_input(struct sk_buff *skb)
1834 {
1835 	const struct ipv6hdr *iph = ipv6_hdr(skb);
1836 	struct net *net = dev_net(skb->dev);
1837 	int flags = RT6_LOOKUP_F_HAS_SADDR;
1838 	struct ip_tunnel_info *tun_info;
1839 	struct flowi6 fl6 = {
1840 		.flowi6_iif = skb->dev->ifindex,
1841 		.daddr = iph->daddr,
1842 		.saddr = iph->saddr,
1843 		.flowlabel = ip6_flowinfo(iph),
1844 		.flowi6_mark = skb->mark,
1845 		.flowi6_proto = iph->nexthdr,
1846 	};
1847 
1848 	tun_info = skb_tunnel_info(skb);
1849 	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1850 		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
1851 	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
1852 		fl6.mp_hash = rt6_multipath_hash(&fl6, skb);
1853 	skb_dst_drop(skb);
1854 	skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
1855 }
1856 
1857 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
1858 					     struct flowi6 *fl6, int flags)
1859 {
1860 	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
1861 }
1862 
1863 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
1864 					 struct flowi6 *fl6, int flags)
1865 {
1866 	bool any_src;
1867 
1868 	if (rt6_need_strict(&fl6->daddr)) {
1869 		struct dst_entry *dst;
1870 
1871 		dst = l3mdev_link_scope_lookup(net, fl6);
1872 		if (dst)
1873 			return dst;
1874 	}
1875 
1876 	fl6->flowi6_iif = LOOPBACK_IFINDEX;
1877 
1878 	any_src = ipv6_addr_any(&fl6->saddr);
1879 	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
1880 	    (fl6->flowi6_oif && any_src))
1881 		flags |= RT6_LOOKUP_F_IFACE;
1882 
1883 	if (!any_src)
1884 		flags |= RT6_LOOKUP_F_HAS_SADDR;
1885 	else if (sk)
1886 		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1887 
1888 	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
1889 }
1890 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
1891 
1892 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
1893 {
1894 	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1895 	struct net_device *loopback_dev = net->loopback_dev;
1896 	struct dst_entry *new = NULL;
1897 
1898 	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
1899 		       DST_OBSOLETE_DEAD, 0);
1900 	if (rt) {
1901 		rt6_info_init(rt);
1902 		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
1903 
1904 		new = &rt->dst;
1905 		new->__use = 1;
1906 		new->input = dst_discard;
1907 		new->output = dst_discard_out;
1908 
1909 		dst_copy_metrics(new, &ort->dst);
1910 
1911 		rt->rt6i_idev = in6_dev_get(loopback_dev);
1912 		rt->rt6i_gateway = ort->rt6i_gateway;
1913 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
1914 		rt->rt6i_metric = 0;
1915 
1916 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1917 #ifdef CONFIG_IPV6_SUBTREES
1918 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1919 #endif
1920 	}
1921 
1922 	dst_release(dst_orig);
1923 	return new ? new : ERR_PTR(-ENOMEM);
1924 }
1925 
1926 /*
1927  *	Destination cache support functions
1928  */
1929 
1930 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1931 {
1932 	if (rt->dst.from &&
1933 	    dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
1934 		dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
1935 }
1936 
1937 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1938 {
1939 	u32 rt_cookie = 0;
1940 
1941 	if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie)
1942 		return NULL;
1943 
1944 	if (rt6_check_expired(rt))
1945 		return NULL;
1946 
1947 	return &rt->dst;
1948 }
1949 
1950 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1951 {
1952 	if (!__rt6_check_expired(rt) &&
1953 	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1954 	    rt6_check((struct rt6_info *)(rt->dst.from), cookie))
1955 		return &rt->dst;
1956 	else
1957 		return NULL;
1958 }
1959 
1960 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1961 {
1962 	struct rt6_info *rt;
1963 
1964 	rt = (struct rt6_info *) dst;
1965 
1966 	/* All IPV6 dsts are created with ->obsolete set to the value
1967 	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1968 	 * into this function always.
1969 	 */
1970 
1971 	rt6_dst_from_metrics_check(rt);
1972 
1973 	if (rt->rt6i_flags & RTF_PCPU ||
1974 	    (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->dst.from))
1975 		return rt6_dst_from_check(rt, cookie);
1976 	else
1977 		return rt6_check(rt, cookie);
1978 }
1979 
1980 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1981 {
1982 	struct rt6_info *rt = (struct rt6_info *) dst;
1983 
1984 	if (rt) {
1985 		if (rt->rt6i_flags & RTF_CACHE) {
1986 			if (rt6_check_expired(rt)) {
1987 				ip6_del_rt(rt);
1988 				dst = NULL;
1989 			}
1990 		} else {
1991 			dst_release(dst);
1992 			dst = NULL;
1993 		}
1994 	}
1995 	return dst;
1996 }
1997 
1998 static void ip6_link_failure(struct sk_buff *skb)
1999 {
2000 	struct rt6_info *rt;
2001 
2002 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2003 
2004 	rt = (struct rt6_info *) skb_dst(skb);
2005 	if (rt) {
2006 		if (rt->rt6i_flags & RTF_CACHE) {
2007 			if (dst_hold_safe(&rt->dst))
2008 				ip6_del_rt(rt);
2009 		} else {
2010 			struct fib6_node *fn;
2011 
2012 			rcu_read_lock();
2013 			fn = rcu_dereference(rt->rt6i_node);
2014 			if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2015 				fn->fn_sernum = -1;
2016 			rcu_read_unlock();
2017 		}
2018 	}
2019 }
2020 
2021 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2022 {
2023 	struct net *net = dev_net(rt->dst.dev);
2024 
2025 	rt->rt6i_flags |= RTF_MODIFIED;
2026 	rt->rt6i_pmtu = mtu;
2027 	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2028 }
2029 
2030 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2031 {
2032 	return !(rt->rt6i_flags & RTF_CACHE) &&
2033 		(rt->rt6i_flags & RTF_PCPU ||
2034 		 rcu_access_pointer(rt->rt6i_node));
2035 }
2036 
2037 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2038 				 const struct ipv6hdr *iph, u32 mtu)
2039 {
2040 	const struct in6_addr *daddr, *saddr;
2041 	struct rt6_info *rt6 = (struct rt6_info *)dst;
2042 
2043 	if (rt6->rt6i_flags & RTF_LOCAL)
2044 		return;
2045 
2046 	if (dst_metric_locked(dst, RTAX_MTU))
2047 		return;
2048 
2049 	if (iph) {
2050 		daddr = &iph->daddr;
2051 		saddr = &iph->saddr;
2052 	} else if (sk) {
2053 		daddr = &sk->sk_v6_daddr;
2054 		saddr = &inet6_sk(sk)->saddr;
2055 	} else {
2056 		daddr = NULL;
2057 		saddr = NULL;
2058 	}
2059 	dst_confirm_neigh(dst, daddr);
2060 	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2061 	if (mtu >= dst_mtu(dst))
2062 		return;
2063 
2064 	if (!rt6_cache_allowed_for_pmtu(rt6)) {
2065 		rt6_do_update_pmtu(rt6, mtu);
2066 		/* update rt6_ex->stamp for cache */
2067 		if (rt6->rt6i_flags & RTF_CACHE)
2068 			rt6_update_exception_stamp_rt(rt6);
2069 	} else if (daddr) {
2070 		struct rt6_info *nrt6;
2071 
2072 		nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
2073 		if (nrt6) {
2074 			rt6_do_update_pmtu(nrt6, mtu);
2075 			if (rt6_insert_exception(nrt6, rt6))
2076 				dst_release_immediate(&nrt6->dst);
2077 		}
2078 	}
2079 }
2080 
2081 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2082 			       struct sk_buff *skb, u32 mtu)
2083 {
2084 	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2085 }
2086 
2087 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2088 		     int oif, u32 mark, kuid_t uid)
2089 {
2090 	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2091 	struct dst_entry *dst;
2092 	struct flowi6 fl6;
2093 
2094 	memset(&fl6, 0, sizeof(fl6));
2095 	fl6.flowi6_oif = oif;
2096 	fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
2097 	fl6.daddr = iph->daddr;
2098 	fl6.saddr = iph->saddr;
2099 	fl6.flowlabel = ip6_flowinfo(iph);
2100 	fl6.flowi6_uid = uid;
2101 
2102 	dst = ip6_route_output(net, NULL, &fl6);
2103 	if (!dst->error)
2104 		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2105 	dst_release(dst);
2106 }
2107 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2108 
2109 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2110 {
2111 	struct dst_entry *dst;
2112 
2113 	ip6_update_pmtu(skb, sock_net(sk), mtu,
2114 			sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);
2115 
2116 	dst = __sk_dst_get(sk);
2117 	if (!dst || !dst->obsolete ||
2118 	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2119 		return;
2120 
2121 	bh_lock_sock(sk);
2122 	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2123 		ip6_datagram_dst_update(sk, false);
2124 	bh_unlock_sock(sk);
2125 }
2126 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2127 
2128 /* Handle redirects */
2129 struct ip6rd_flowi {
2130 	struct flowi6 fl6;
2131 	struct in6_addr gateway;
2132 };
2133 
2134 static struct rt6_info *__ip6_route_redirect(struct net *net,
2135 					     struct fib6_table *table,
2136 					     struct flowi6 *fl6,
2137 					     int flags)
2138 {
2139 	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2140 	struct rt6_info *rt, *rt_cache;
2141 	struct fib6_node *fn;
2142 
2143 	/* Get the "current" route for this destination and
2144 	 * check if the redirect has come from appropriate router.
2145 	 *
2146 	 * RFC 4861 specifies that redirects should only be
2147 	 * accepted if they come from the nexthop to the target.
2148 	 * Due to the way the routes are chosen, this notion
2149 	 * is a bit fuzzy and one might need to check all possible
2150 	 * routes.
2151 	 */
2152 
2153 	rcu_read_lock();
2154 	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2155 restart:
2156 	for_each_fib6_node_rt_rcu(fn) {
2157 		if (rt6_check_expired(rt))
2158 			continue;
2159 		if (rt->dst.error)
2160 			break;
2161 		if (!(rt->rt6i_flags & RTF_GATEWAY))
2162 			continue;
2163 		if (fl6->flowi6_oif != rt->dst.dev->ifindex)
2164 			continue;
2165 		/* rt_cache's gateway might be different from its 'parent'
2166 		 * in the case of an ip redirect.
2167 		 * So we keep searching in the exception table if the gateway
2168 		 * is different.
2169 		 */
2170 		if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) {
2171 			rt_cache = rt6_find_cached_rt(rt,
2172 						      &fl6->daddr,
2173 						      &fl6->saddr);
2174 			if (rt_cache &&
2175 			    ipv6_addr_equal(&rdfl->gateway,
2176 					    &rt_cache->rt6i_gateway)) {
2177 				rt = rt_cache;
2178 				break;
2179 			}
2180 			continue;
2181 		}
2182 		break;
2183 	}
2184 
2185 	if (!rt)
2186 		rt = net->ipv6.ip6_null_entry;
2187 	else if (rt->dst.error) {
2188 		rt = net->ipv6.ip6_null_entry;
2189 		goto out;
2190 	}
2191 
2192 	if (rt == net->ipv6.ip6_null_entry) {
2193 		fn = fib6_backtrack(fn, &fl6->saddr);
2194 		if (fn)
2195 			goto restart;
2196 	}
2197 
2198 out:
2199 	ip6_hold_safe(net, &rt, true);
2200 
2201 	rcu_read_unlock();
2202 
2203 	trace_fib6_table_lookup(net, rt, table, fl6);
2204 	return rt;
2205 };
2206 
2207 static struct dst_entry *ip6_route_redirect(struct net *net,
2208 					const struct flowi6 *fl6,
2209 					const struct in6_addr *gateway)
2210 {
2211 	int flags = RT6_LOOKUP_F_HAS_SADDR;
2212 	struct ip6rd_flowi rdfl;
2213 
2214 	rdfl.fl6 = *fl6;
2215 	rdfl.gateway = *gateway;
2216 
2217 	return fib6_rule_lookup(net, &rdfl.fl6,
2218 				flags, __ip6_route_redirect);
2219 }
2220 
2221 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2222 		  kuid_t uid)
2223 {
2224 	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2225 	struct dst_entry *dst;
2226 	struct flowi6 fl6;
2227 
2228 	memset(&fl6, 0, sizeof(fl6));
2229 	fl6.flowi6_iif = LOOPBACK_IFINDEX;
2230 	fl6.flowi6_oif = oif;
2231 	fl6.flowi6_mark = mark;
2232 	fl6.daddr = iph->daddr;
2233 	fl6.saddr = iph->saddr;
2234 	fl6.flowlabel = ip6_flowinfo(iph);
2235 	fl6.flowi6_uid = uid;
2236 
2237 	dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
2238 	rt6_do_redirect(dst, NULL, skb);
2239 	dst_release(dst);
2240 }
2241 EXPORT_SYMBOL_GPL(ip6_redirect);
2242 
2243 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2244 			    u32 mark)
2245 {
2246 	const struct ipv6hdr *iph = ipv6_hdr(skb);
2247 	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2248 	struct dst_entry *dst;
2249 	struct flowi6 fl6;
2250 
2251 	memset(&fl6, 0, sizeof(fl6));
2252 	fl6.flowi6_iif = LOOPBACK_IFINDEX;
2253 	fl6.flowi6_oif = oif;
2254 	fl6.flowi6_mark = mark;
2255 	fl6.daddr = msg->dest;
2256 	fl6.saddr = iph->daddr;
2257 	fl6.flowi6_uid = sock_net_uid(net, NULL);
2258 
2259 	dst = ip6_route_redirect(net, &fl6, &iph->saddr);
2260 	rt6_do_redirect(dst, NULL, skb);
2261 	dst_release(dst);
2262 }
2263 
2264 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2265 {
2266 	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2267 		     sk->sk_uid);
2268 }
2269 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2270 
2271 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2272 {
2273 	struct net_device *dev = dst->dev;
2274 	unsigned int mtu = dst_mtu(dst);
2275 	struct net *net = dev_net(dev);
2276 
2277 	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2278 
2279 	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2280 		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2281 
2282 	/*
2283 	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2284 	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2285 	 * IPV6_MAXPLEN is also valid and means: "any MSS,
2286 	 * rely only on pmtu discovery"
2287 	 */
2288 	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2289 		mtu = IPV6_MAXPLEN;
2290 	return mtu;
2291 }
2292 
2293 static unsigned int ip6_mtu(const struct dst_entry *dst)
2294 {
2295 	const struct rt6_info *rt = (const struct rt6_info *)dst;
2296 	unsigned int mtu = rt->rt6i_pmtu;
2297 	struct inet6_dev *idev;
2298 
2299 	if (mtu)
2300 		goto out;
2301 
2302 	mtu = dst_metric_raw(dst, RTAX_MTU);
2303 	if (mtu)
2304 		goto out;
2305 
2306 	mtu = IPV6_MIN_MTU;
2307 
2308 	rcu_read_lock();
2309 	idev = __in6_dev_get(dst->dev);
2310 	if (idev)
2311 		mtu = idev->cnf.mtu6;
2312 	rcu_read_unlock();
2313 
2314 out:
2315 	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2316 
2317 	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2318 }
2319 
2320 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2321 				  struct flowi6 *fl6)
2322 {
2323 	struct dst_entry *dst;
2324 	struct rt6_info *rt;
2325 	struct inet6_dev *idev = in6_dev_get(dev);
2326 	struct net *net = dev_net(dev);
2327 
2328 	if (unlikely(!idev))
2329 		return ERR_PTR(-ENODEV);
2330 
2331 	rt = ip6_dst_alloc(net, dev, 0);
2332 	if (unlikely(!rt)) {
2333 		in6_dev_put(idev);
2334 		dst = ERR_PTR(-ENOMEM);
2335 		goto out;
2336 	}
2337 
2338 	rt->dst.flags |= DST_HOST;
2339 	rt->dst.input = ip6_input;
2340 	rt->dst.output  = ip6_output;
2341 	rt->rt6i_gateway  = fl6->daddr;
2342 	rt->rt6i_dst.addr = fl6->daddr;
2343 	rt->rt6i_dst.plen = 128;
2344 	rt->rt6i_idev     = idev;
2345 	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2346 
2347 	/* Add this dst into uncached_list so that rt6_ifdown() can
2348 	 * do proper release of the net_device
2349 	 */
2350 	rt6_uncached_list_add(rt);
2351 	atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2352 
2353 	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2354 
2355 out:
2356 	return dst;
2357 }
2358 
2359 static int ip6_dst_gc(struct dst_ops *ops)
2360 {
2361 	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2362 	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2363 	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2364 	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2365 	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2366 	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2367 	int entries;
2368 
2369 	entries = dst_entries_get_fast(ops);
2370 	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2371 	    entries <= rt_max_size)
2372 		goto out;
2373 
2374 	net->ipv6.ip6_rt_gc_expire++;
2375 	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2376 	entries = dst_entries_get_slow(ops);
2377 	if (entries < ops->gc_thresh)
2378 		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2379 out:
2380 	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2381 	return entries > rt_max_size;
2382 }
2383 
2384 static int ip6_convert_metrics(struct mx6_config *mxc,
2385 			       const struct fib6_config *cfg)
2386 {
2387 	struct net *net = cfg->fc_nlinfo.nl_net;
2388 	bool ecn_ca = false;
2389 	struct nlattr *nla;
2390 	int remaining;
2391 	u32 *mp;
2392 
2393 	if (!cfg->fc_mx)
2394 		return 0;
2395 
2396 	mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
2397 	if (unlikely(!mp))
2398 		return -ENOMEM;
2399 
2400 	nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
2401 		int type = nla_type(nla);
2402 		u32 val;
2403 
2404 		if (!type)
2405 			continue;
2406 		if (unlikely(type > RTAX_MAX))
2407 			goto err;
2408 
2409 		if (type == RTAX_CC_ALGO) {
2410 			char tmp[TCP_CA_NAME_MAX];
2411 
2412 			nla_strlcpy(tmp, nla, sizeof(tmp));
2413 			val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca);
2414 			if (val == TCP_CA_UNSPEC)
2415 				goto err;
2416 		} else {
2417 			val = nla_get_u32(nla);
2418 		}
2419 		if (type == RTAX_HOPLIMIT && val > 255)
2420 			val = 255;
2421 		if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
2422 			goto err;
2423 
2424 		mp[type - 1] = val;
2425 		__set_bit(type - 1, mxc->mx_valid);
2426 	}
2427 
2428 	if (ecn_ca) {
2429 		__set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
2430 		mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
2431 	}
2432 
2433 	mxc->mx = mp;
2434 	return 0;
2435  err:
2436 	kfree(mp);
2437 	return -EINVAL;
2438 }
2439 
2440 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2441 					    struct fib6_config *cfg,
2442 					    const struct in6_addr *gw_addr)
2443 {
2444 	struct flowi6 fl6 = {
2445 		.flowi6_oif = cfg->fc_ifindex,
2446 		.daddr = *gw_addr,
2447 		.saddr = cfg->fc_prefsrc,
2448 	};
2449 	struct fib6_table *table;
2450 	struct rt6_info *rt;
2451 	int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_IGNORE_LINKSTATE;
2452 
2453 	table = fib6_get_table(net, cfg->fc_table);
2454 	if (!table)
2455 		return NULL;
2456 
2457 	if (!ipv6_addr_any(&cfg->fc_prefsrc))
2458 		flags |= RT6_LOOKUP_F_HAS_SADDR;
2459 
2460 	rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags);
2461 
2462 	/* if table lookup failed, fall back to full lookup */
2463 	if (rt == net->ipv6.ip6_null_entry) {
2464 		ip6_rt_put(rt);
2465 		rt = NULL;
2466 	}
2467 
2468 	return rt;
2469 }
2470 
2471 static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
2472 					      struct netlink_ext_ack *extack)
2473 {
2474 	struct net *net = cfg->fc_nlinfo.nl_net;
2475 	struct rt6_info *rt = NULL;
2476 	struct net_device *dev = NULL;
2477 	struct inet6_dev *idev = NULL;
2478 	struct fib6_table *table;
2479 	int addr_type;
2480 	int err = -EINVAL;
2481 
2482 	/* RTF_PCPU is an internal flag; can not be set by userspace */
2483 	if (cfg->fc_flags & RTF_PCPU) {
2484 		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2485 		goto out;
2486 	}
2487 
2488 	/* RTF_CACHE is an internal flag; can not be set by userspace */
2489 	if (cfg->fc_flags & RTF_CACHE) {
2490 		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
2491 		goto out;
2492 	}
2493 
2494 	if (cfg->fc_dst_len > 128) {
2495 		NL_SET_ERR_MSG(extack, "Invalid prefix length");
2496 		goto out;
2497 	}
2498 	if (cfg->fc_src_len > 128) {
2499 		NL_SET_ERR_MSG(extack, "Invalid source address length");
2500 		goto out;
2501 	}
2502 #ifndef CONFIG_IPV6_SUBTREES
2503 	if (cfg->fc_src_len) {
2504 		NL_SET_ERR_MSG(extack,
2505 			       "Specifying source address requires IPV6_SUBTREES to be enabled");
2506 		goto out;
2507 	}
2508 #endif
2509 	if (cfg->fc_ifindex) {
2510 		err = -ENODEV;
2511 		dev = dev_get_by_index(net, cfg->fc_ifindex);
2512 		if (!dev)
2513 			goto out;
2514 		idev = in6_dev_get(dev);
2515 		if (!idev)
2516 			goto out;
2517 	}
2518 
2519 	if (cfg->fc_metric == 0)
2520 		cfg->fc_metric = IP6_RT_PRIO_USER;
2521 
2522 	err = -ENOBUFS;
2523 	if (cfg->fc_nlinfo.nlh &&
2524 	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
2525 		table = fib6_get_table(net, cfg->fc_table);
2526 		if (!table) {
2527 			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
2528 			table = fib6_new_table(net, cfg->fc_table);
2529 		}
2530 	} else {
2531 		table = fib6_new_table(net, cfg->fc_table);
2532 	}
2533 
2534 	if (!table)
2535 		goto out;
2536 
2537 	rt = ip6_dst_alloc(net, NULL,
2538 			   (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
2539 
2540 	if (!rt) {
2541 		err = -ENOMEM;
2542 		goto out;
2543 	}
2544 
2545 	if (cfg->fc_flags & RTF_EXPIRES)
2546 		rt6_set_expires(rt, jiffies +
2547 				clock_t_to_jiffies(cfg->fc_expires));
2548 	else
2549 		rt6_clean_expires(rt);
2550 
2551 	if (cfg->fc_protocol == RTPROT_UNSPEC)
2552 		cfg->fc_protocol = RTPROT_BOOT;
2553 	rt->rt6i_protocol = cfg->fc_protocol;
2554 
2555 	addr_type = ipv6_addr_type(&cfg->fc_dst);
2556 
2557 	if (addr_type & IPV6_ADDR_MULTICAST)
2558 		rt->dst.input = ip6_mc_input;
2559 	else if (cfg->fc_flags & RTF_LOCAL)
2560 		rt->dst.input = ip6_input;
2561 	else
2562 		rt->dst.input = ip6_forward;
2563 
2564 	rt->dst.output = ip6_output;
2565 
2566 	if (cfg->fc_encap) {
2567 		struct lwtunnel_state *lwtstate;
2568 
2569 		err = lwtunnel_build_state(cfg->fc_encap_type,
2570 					   cfg->fc_encap, AF_INET6, cfg,
2571 					   &lwtstate, extack);
2572 		if (err)
2573 			goto out;
2574 		rt->dst.lwtstate = lwtstate_get(lwtstate);
2575 		if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
2576 			rt->dst.lwtstate->orig_output = rt->dst.output;
2577 			rt->dst.output = lwtunnel_output;
2578 		}
2579 		if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
2580 			rt->dst.lwtstate->orig_input = rt->dst.input;
2581 			rt->dst.input = lwtunnel_input;
2582 		}
2583 	}
2584 
2585 	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
2586 	rt->rt6i_dst.plen = cfg->fc_dst_len;
2587 	if (rt->rt6i_dst.plen == 128)
2588 		rt->dst.flags |= DST_HOST;
2589 
2590 #ifdef CONFIG_IPV6_SUBTREES
2591 	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
2592 	rt->rt6i_src.plen = cfg->fc_src_len;
2593 #endif
2594 
2595 	rt->rt6i_metric = cfg->fc_metric;
2596 
2597 	/* We cannot add true routes via loopback here,
2598 	   they would result in kernel looping; promote them to reject routes
2599 	 */
2600 	if ((cfg->fc_flags & RTF_REJECT) ||
2601 	    (dev && (dev->flags & IFF_LOOPBACK) &&
2602 	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
2603 	     !(cfg->fc_flags & RTF_LOCAL))) {
2604 		/* hold loopback dev/idev if we haven't done so. */
2605 		if (dev != net->loopback_dev) {
2606 			if (dev) {
2607 				dev_put(dev);
2608 				in6_dev_put(idev);
2609 			}
2610 			dev = net->loopback_dev;
2611 			dev_hold(dev);
2612 			idev = in6_dev_get(dev);
2613 			if (!idev) {
2614 				err = -ENODEV;
2615 				goto out;
2616 			}
2617 		}
2618 		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
2619 		switch (cfg->fc_type) {
2620 		case RTN_BLACKHOLE:
2621 			rt->dst.error = -EINVAL;
2622 			rt->dst.output = dst_discard_out;
2623 			rt->dst.input = dst_discard;
2624 			break;
2625 		case RTN_PROHIBIT:
2626 			rt->dst.error = -EACCES;
2627 			rt->dst.output = ip6_pkt_prohibit_out;
2628 			rt->dst.input = ip6_pkt_prohibit;
2629 			break;
2630 		case RTN_THROW:
2631 		case RTN_UNREACHABLE:
2632 		default:
2633 			rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
2634 					: (cfg->fc_type == RTN_UNREACHABLE)
2635 					? -EHOSTUNREACH : -ENETUNREACH;
2636 			rt->dst.output = ip6_pkt_discard_out;
2637 			rt->dst.input = ip6_pkt_discard;
2638 			break;
2639 		}
2640 		goto install_route;
2641 	}
2642 
2643 	if (cfg->fc_flags & RTF_GATEWAY) {
2644 		const struct in6_addr *gw_addr;
2645 		int gwa_type;
2646 
2647 		gw_addr = &cfg->fc_gateway;
2648 		gwa_type = ipv6_addr_type(gw_addr);
2649 
2650 		/* if gw_addr is local we will fail to detect this in case
2651 		 * address is still TENTATIVE (DAD in progress). rt6_lookup()
2652 		 * will return already-added prefix route via interface that
2653 		 * prefix route was assigned to, which might be non-loopback.
2654 		 */
2655 		err = -EINVAL;
2656 		if (ipv6_chk_addr_and_flags(net, gw_addr,
2657 					    gwa_type & IPV6_ADDR_LINKLOCAL ?
2658 					    dev : NULL, 0, 0)) {
2659 			NL_SET_ERR_MSG(extack, "Invalid gateway address");
2660 			goto out;
2661 		}
2662 		rt->rt6i_gateway = *gw_addr;
2663 
2664 		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
2665 			struct rt6_info *grt = NULL;
2666 
2667 			/* IPv6 strictly inhibits using not link-local
2668 			   addresses as nexthop address.
2669 			   Otherwise, router will not able to send redirects.
2670 			   It is very good, but in some (rare!) circumstances
2671 			   (SIT, PtP, NBMA NOARP links) it is handy to allow
2672 			   some exceptions. --ANK
2673 			   We allow IPv4-mapped nexthops to support RFC4798-type
2674 			   addressing
2675 			 */
2676 			if (!(gwa_type & (IPV6_ADDR_UNICAST |
2677 					  IPV6_ADDR_MAPPED))) {
2678 				NL_SET_ERR_MSG(extack,
2679 					       "Invalid gateway address");
2680 				goto out;
2681 			}
2682 
2683 			if (cfg->fc_table) {
2684 				grt = ip6_nh_lookup_table(net, cfg, gw_addr);
2685 
2686 				if (grt) {
2687 					if (grt->rt6i_flags & RTF_GATEWAY ||
2688 					    (dev && dev != grt->dst.dev)) {
2689 						ip6_rt_put(grt);
2690 						grt = NULL;
2691 					}
2692 				}
2693 			}
2694 
2695 			if (!grt)
2696 				grt = rt6_lookup(net, gw_addr, NULL,
2697 						 cfg->fc_ifindex, 1);
2698 
2699 			err = -EHOSTUNREACH;
2700 			if (!grt)
2701 				goto out;
2702 			if (dev) {
2703 				if (dev != grt->dst.dev) {
2704 					ip6_rt_put(grt);
2705 					goto out;
2706 				}
2707 			} else {
2708 				dev = grt->dst.dev;
2709 				idev = grt->rt6i_idev;
2710 				dev_hold(dev);
2711 				in6_dev_hold(grt->rt6i_idev);
2712 			}
2713 			if (!(grt->rt6i_flags & RTF_GATEWAY))
2714 				err = 0;
2715 			ip6_rt_put(grt);
2716 
2717 			if (err)
2718 				goto out;
2719 		}
2720 		err = -EINVAL;
2721 		if (!dev) {
2722 			NL_SET_ERR_MSG(extack, "Egress device not specified");
2723 			goto out;
2724 		} else if (dev->flags & IFF_LOOPBACK) {
2725 			NL_SET_ERR_MSG(extack,
2726 				       "Egress device can not be loopback device for this route");
2727 			goto out;
2728 		}
2729 	}
2730 
2731 	err = -ENODEV;
2732 	if (!dev)
2733 		goto out;
2734 
2735 	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
2736 		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
2737 			NL_SET_ERR_MSG(extack, "Invalid source address");
2738 			err = -EINVAL;
2739 			goto out;
2740 		}
2741 		rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
2742 		rt->rt6i_prefsrc.plen = 128;
2743 	} else
2744 		rt->rt6i_prefsrc.plen = 0;
2745 
2746 	rt->rt6i_flags = cfg->fc_flags;
2747 
2748 install_route:
2749 	rt->dst.dev = dev;
2750 	rt->rt6i_idev = idev;
2751 	rt->rt6i_table = table;
2752 
2753 	cfg->fc_nlinfo.nl_net = dev_net(dev);
2754 
2755 	return rt;
2756 out:
2757 	if (dev)
2758 		dev_put(dev);
2759 	if (idev)
2760 		in6_dev_put(idev);
2761 	if (rt)
2762 		dst_release_immediate(&rt->dst);
2763 
2764 	return ERR_PTR(err);
2765 }
2766 
2767 int ip6_route_add(struct fib6_config *cfg,
2768 		  struct netlink_ext_ack *extack)
2769 {
2770 	struct mx6_config mxc = { .mx = NULL, };
2771 	struct rt6_info *rt;
2772 	int err;
2773 
2774 	rt = ip6_route_info_create(cfg, extack);
2775 	if (IS_ERR(rt)) {
2776 		err = PTR_ERR(rt);
2777 		rt = NULL;
2778 		goto out;
2779 	}
2780 
2781 	err = ip6_convert_metrics(&mxc, cfg);
2782 	if (err)
2783 		goto out;
2784 
2785 	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack);
2786 
2787 	kfree(mxc.mx);
2788 
2789 	return err;
2790 out:
2791 	if (rt)
2792 		dst_release_immediate(&rt->dst);
2793 
2794 	return err;
2795 }
2796 
2797 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
2798 {
2799 	int err;
2800 	struct fib6_table *table;
2801 	struct net *net = dev_net(rt->dst.dev);
2802 
2803 	if (rt == net->ipv6.ip6_null_entry) {
2804 		err = -ENOENT;
2805 		goto out;
2806 	}
2807 
2808 	table = rt->rt6i_table;
2809 	spin_lock_bh(&table->tb6_lock);
2810 	err = fib6_del(rt, info);
2811 	spin_unlock_bh(&table->tb6_lock);
2812 
2813 out:
2814 	ip6_rt_put(rt);
2815 	return err;
2816 }
2817 
2818 int ip6_del_rt(struct rt6_info *rt)
2819 {
2820 	struct nl_info info = {
2821 		.nl_net = dev_net(rt->dst.dev),
2822 	};
2823 	return __ip6_del_rt(rt, &info);
2824 }
2825 
2826 static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
2827 {
2828 	struct nl_info *info = &cfg->fc_nlinfo;
2829 	struct net *net = info->nl_net;
2830 	struct sk_buff *skb = NULL;
2831 	struct fib6_table *table;
2832 	int err = -ENOENT;
2833 
2834 	if (rt == net->ipv6.ip6_null_entry)
2835 		goto out_put;
2836 	table = rt->rt6i_table;
2837 	spin_lock_bh(&table->tb6_lock);
2838 
2839 	if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
2840 		struct rt6_info *sibling, *next_sibling;
2841 
2842 		/* prefer to send a single notification with all hops */
2843 		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
2844 		if (skb) {
2845 			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2846 
2847 			if (rt6_fill_node(net, skb, rt,
2848 					  NULL, NULL, 0, RTM_DELROUTE,
2849 					  info->portid, seq, 0) < 0) {
2850 				kfree_skb(skb);
2851 				skb = NULL;
2852 			} else
2853 				info->skip_notify = 1;
2854 		}
2855 
2856 		list_for_each_entry_safe(sibling, next_sibling,
2857 					 &rt->rt6i_siblings,
2858 					 rt6i_siblings) {
2859 			err = fib6_del(sibling, info);
2860 			if (err)
2861 				goto out_unlock;
2862 		}
2863 	}
2864 
2865 	err = fib6_del(rt, info);
2866 out_unlock:
2867 	spin_unlock_bh(&table->tb6_lock);
2868 out_put:
2869 	ip6_rt_put(rt);
2870 
2871 	if (skb) {
2872 		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
2873 			    info->nlh, gfp_any());
2874 	}
2875 	return err;
2876 }
2877 
2878 static int ip6_route_del(struct fib6_config *cfg,
2879 			 struct netlink_ext_ack *extack)
2880 {
2881 	struct rt6_info *rt, *rt_cache;
2882 	struct fib6_table *table;
2883 	struct fib6_node *fn;
2884 	int err = -ESRCH;
2885 
2886 	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
2887 	if (!table) {
2888 		NL_SET_ERR_MSG(extack, "FIB table does not exist");
2889 		return err;
2890 	}
2891 
2892 	rcu_read_lock();
2893 
2894 	fn = fib6_locate(&table->tb6_root,
2895 			 &cfg->fc_dst, cfg->fc_dst_len,
2896 			 &cfg->fc_src, cfg->fc_src_len,
2897 			 !(cfg->fc_flags & RTF_CACHE));
2898 
2899 	if (fn) {
2900 		for_each_fib6_node_rt_rcu(fn) {
2901 			if (cfg->fc_flags & RTF_CACHE) {
2902 				rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
2903 							      &cfg->fc_src);
2904 				if (!rt_cache)
2905 					continue;
2906 				rt = rt_cache;
2907 			}
2908 			if (cfg->fc_ifindex &&
2909 			    (!rt->dst.dev ||
2910 			     rt->dst.dev->ifindex != cfg->fc_ifindex))
2911 				continue;
2912 			if (cfg->fc_flags & RTF_GATEWAY &&
2913 			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
2914 				continue;
2915 			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
2916 				continue;
2917 			if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
2918 				continue;
2919 			if (!dst_hold_safe(&rt->dst))
2920 				break;
2921 			rcu_read_unlock();
2922 
2923 			/* if gateway was specified only delete the one hop */
2924 			if (cfg->fc_flags & RTF_GATEWAY)
2925 				return __ip6_del_rt(rt, &cfg->fc_nlinfo);
2926 
2927 			return __ip6_del_rt_siblings(rt, cfg);
2928 		}
2929 	}
2930 	rcu_read_unlock();
2931 
2932 	return err;
2933 }
2934 
2935 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
2936 {
2937 	struct netevent_redirect netevent;
2938 	struct rt6_info *rt, *nrt = NULL;
2939 	struct ndisc_options ndopts;
2940 	struct inet6_dev *in6_dev;
2941 	struct neighbour *neigh;
2942 	struct rd_msg *msg;
2943 	int optlen, on_link;
2944 	u8 *lladdr;
2945 
2946 	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
2947 	optlen -= sizeof(*msg);
2948 
2949 	if (optlen < 0) {
2950 		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
2951 		return;
2952 	}
2953 
2954 	msg = (struct rd_msg *)icmp6_hdr(skb);
2955 
2956 	if (ipv6_addr_is_multicast(&msg->dest)) {
2957 		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
2958 		return;
2959 	}
2960 
2961 	on_link = 0;
2962 	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
2963 		on_link = 1;
2964 	} else if (ipv6_addr_type(&msg->target) !=
2965 		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
2966 		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
2967 		return;
2968 	}
2969 
2970 	in6_dev = __in6_dev_get(skb->dev);
2971 	if (!in6_dev)
2972 		return;
2973 	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
2974 		return;
2975 
2976 	/* RFC2461 8.1:
2977 	 *	The IP source address of the Redirect MUST be the same as the current
2978 	 *	first-hop router for the specified ICMP Destination Address.
2979 	 */
2980 
2981 	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
2982 		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
2983 		return;
2984 	}
2985 
2986 	lladdr = NULL;
2987 	if (ndopts.nd_opts_tgt_lladdr) {
2988 		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
2989 					     skb->dev);
2990 		if (!lladdr) {
2991 			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
2992 			return;
2993 		}
2994 	}
2995 
2996 	rt = (struct rt6_info *) dst;
2997 	if (rt->rt6i_flags & RTF_REJECT) {
2998 		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
2999 		return;
3000 	}
3001 
3002 	/* Redirect received -> path was valid.
3003 	 * Look, redirects are sent only in response to data packets,
3004 	 * so that this nexthop apparently is reachable. --ANK
3005 	 */
3006 	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3007 
3008 	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3009 	if (!neigh)
3010 		return;
3011 
3012 	/*
3013 	 *	We have finally decided to accept it.
3014 	 */
3015 
3016 	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3017 		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
3018 		     NEIGH_UPDATE_F_OVERRIDE|
3019 		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3020 				     NEIGH_UPDATE_F_ISROUTER)),
3021 		     NDISC_REDIRECT, &ndopts);
3022 
3023 	nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
3024 	if (!nrt)
3025 		goto out;
3026 
3027 	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3028 	if (on_link)
3029 		nrt->rt6i_flags &= ~RTF_GATEWAY;
3030 
3031 	nrt->rt6i_protocol = RTPROT_REDIRECT;
3032 	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3033 
3034 	/* No need to remove rt from the exception table if rt is
3035 	 * a cached route because rt6_insert_exception() will
3036 	 * takes care of it
3037 	 */
3038 	if (rt6_insert_exception(nrt, rt)) {
3039 		dst_release_immediate(&nrt->dst);
3040 		goto out;
3041 	}
3042 
3043 	netevent.old = &rt->dst;
3044 	netevent.new = &nrt->dst;
3045 	netevent.daddr = &msg->dest;
3046 	netevent.neigh = neigh;
3047 	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3048 
3049 out:
3050 	neigh_release(neigh);
3051 }
3052 
3053 /*
3054  *	Misc support functions
3055  */
3056 
3057 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
3058 {
3059 	BUG_ON(from->dst.from);
3060 
3061 	rt->rt6i_flags &= ~RTF_EXPIRES;
3062 	dst_hold(&from->dst);
3063 	rt->dst.from = &from->dst;
3064 	dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
3065 }
3066 
3067 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
3068 {
3069 	rt->dst.input = ort->dst.input;
3070 	rt->dst.output = ort->dst.output;
3071 	rt->rt6i_dst = ort->rt6i_dst;
3072 	rt->dst.error = ort->dst.error;
3073 	rt->rt6i_idev = ort->rt6i_idev;
3074 	if (rt->rt6i_idev)
3075 		in6_dev_hold(rt->rt6i_idev);
3076 	rt->dst.lastuse = jiffies;
3077 	rt->rt6i_gateway = ort->rt6i_gateway;
3078 	rt->rt6i_flags = ort->rt6i_flags;
3079 	rt6_set_from(rt, ort);
3080 	rt->rt6i_metric = ort->rt6i_metric;
3081 #ifdef CONFIG_IPV6_SUBTREES
3082 	rt->rt6i_src = ort->rt6i_src;
3083 #endif
3084 	rt->rt6i_prefsrc = ort->rt6i_prefsrc;
3085 	rt->rt6i_table = ort->rt6i_table;
3086 	rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
3087 }
3088 
3089 #ifdef CONFIG_IPV6_ROUTE_INFO
3090 static struct rt6_info *rt6_get_route_info(struct net *net,
3091 					   const struct in6_addr *prefix, int prefixlen,
3092 					   const struct in6_addr *gwaddr,
3093 					   struct net_device *dev)
3094 {
3095 	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3096 	int ifindex = dev->ifindex;
3097 	struct fib6_node *fn;
3098 	struct rt6_info *rt = NULL;
3099 	struct fib6_table *table;
3100 
3101 	table = fib6_get_table(net, tb_id);
3102 	if (!table)
3103 		return NULL;
3104 
3105 	rcu_read_lock();
3106 	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3107 	if (!fn)
3108 		goto out;
3109 
3110 	for_each_fib6_node_rt_rcu(fn) {
3111 		if (rt->dst.dev->ifindex != ifindex)
3112 			continue;
3113 		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3114 			continue;
3115 		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
3116 			continue;
3117 		ip6_hold_safe(NULL, &rt, false);
3118 		break;
3119 	}
3120 out:
3121 	rcu_read_unlock();
3122 	return rt;
3123 }
3124 
3125 static struct rt6_info *rt6_add_route_info(struct net *net,
3126 					   const struct in6_addr *prefix, int prefixlen,
3127 					   const struct in6_addr *gwaddr,
3128 					   struct net_device *dev,
3129 					   unsigned int pref)
3130 {
3131 	struct fib6_config cfg = {
3132 		.fc_metric	= IP6_RT_PRIO_USER,
3133 		.fc_ifindex	= dev->ifindex,
3134 		.fc_dst_len	= prefixlen,
3135 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3136 				  RTF_UP | RTF_PREF(pref),
3137 		.fc_protocol = RTPROT_RA,
3138 		.fc_nlinfo.portid = 0,
3139 		.fc_nlinfo.nlh = NULL,
3140 		.fc_nlinfo.nl_net = net,
3141 	};
3142 
3143 	cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3144 	cfg.fc_dst = *prefix;
3145 	cfg.fc_gateway = *gwaddr;
3146 
3147 	/* We should treat it as a default route if prefix length is 0. */
3148 	if (!prefixlen)
3149 		cfg.fc_flags |= RTF_DEFAULT;
3150 
3151 	ip6_route_add(&cfg, NULL);
3152 
3153 	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3154 }
3155 #endif
3156 
3157 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
3158 {
3159 	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3160 	struct rt6_info *rt;
3161 	struct fib6_table *table;
3162 
3163 	table = fib6_get_table(dev_net(dev), tb_id);
3164 	if (!table)
3165 		return NULL;
3166 
3167 	rcu_read_lock();
3168 	for_each_fib6_node_rt_rcu(&table->tb6_root) {
3169 		if (dev == rt->dst.dev &&
3170 		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3171 		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
3172 			break;
3173 	}
3174 	if (rt)
3175 		ip6_hold_safe(NULL, &rt, false);
3176 	rcu_read_unlock();
3177 	return rt;
3178 }
3179 
3180 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
3181 				     struct net_device *dev,
3182 				     unsigned int pref)
3183 {
3184 	struct fib6_config cfg = {
3185 		.fc_table	= l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3186 		.fc_metric	= IP6_RT_PRIO_USER,
3187 		.fc_ifindex	= dev->ifindex,
3188 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3189 				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3190 		.fc_protocol = RTPROT_RA,
3191 		.fc_nlinfo.portid = 0,
3192 		.fc_nlinfo.nlh = NULL,
3193 		.fc_nlinfo.nl_net = dev_net(dev),
3194 	};
3195 
3196 	cfg.fc_gateway = *gwaddr;
3197 
3198 	if (!ip6_route_add(&cfg, NULL)) {
3199 		struct fib6_table *table;
3200 
3201 		table = fib6_get_table(dev_net(dev), cfg.fc_table);
3202 		if (table)
3203 			table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3204 	}
3205 
3206 	return rt6_get_dflt_router(gwaddr, dev);
3207 }
3208 
3209 static void __rt6_purge_dflt_routers(struct fib6_table *table)
3210 {
3211 	struct rt6_info *rt;
3212 
3213 restart:
3214 	rcu_read_lock();
3215 	for_each_fib6_node_rt_rcu(&table->tb6_root) {
3216 		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3217 		    (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
3218 			if (dst_hold_safe(&rt->dst)) {
3219 				rcu_read_unlock();
3220 				ip6_del_rt(rt);
3221 			} else {
3222 				rcu_read_unlock();
3223 			}
3224 			goto restart;
3225 		}
3226 	}
3227 	rcu_read_unlock();
3228 
3229 	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3230 }
3231 
3232 void rt6_purge_dflt_routers(struct net *net)
3233 {
3234 	struct fib6_table *table;
3235 	struct hlist_head *head;
3236 	unsigned int h;
3237 
3238 	rcu_read_lock();
3239 
3240 	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3241 		head = &net->ipv6.fib_table_hash[h];
3242 		hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3243 			if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3244 				__rt6_purge_dflt_routers(table);
3245 		}
3246 	}
3247 
3248 	rcu_read_unlock();
3249 }
3250 
3251 static void rtmsg_to_fib6_config(struct net *net,
3252 				 struct in6_rtmsg *rtmsg,
3253 				 struct fib6_config *cfg)
3254 {
3255 	memset(cfg, 0, sizeof(*cfg));
3256 
3257 	cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3258 			 : RT6_TABLE_MAIN;
3259 	cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3260 	cfg->fc_metric = rtmsg->rtmsg_metric;
3261 	cfg->fc_expires = rtmsg->rtmsg_info;
3262 	cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3263 	cfg->fc_src_len = rtmsg->rtmsg_src_len;
3264 	cfg->fc_flags = rtmsg->rtmsg_flags;
3265 
3266 	cfg->fc_nlinfo.nl_net = net;
3267 
3268 	cfg->fc_dst = rtmsg->rtmsg_dst;
3269 	cfg->fc_src = rtmsg->rtmsg_src;
3270 	cfg->fc_gateway = rtmsg->rtmsg_gateway;
3271 }
3272 
3273 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3274 {
3275 	struct fib6_config cfg;
3276 	struct in6_rtmsg rtmsg;
3277 	int err;
3278 
3279 	switch (cmd) {
3280 	case SIOCADDRT:		/* Add a route */
3281 	case SIOCDELRT:		/* Delete a route */
3282 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3283 			return -EPERM;
3284 		err = copy_from_user(&rtmsg, arg,
3285 				     sizeof(struct in6_rtmsg));
3286 		if (err)
3287 			return -EFAULT;
3288 
3289 		rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3290 
3291 		rtnl_lock();
3292 		switch (cmd) {
3293 		case SIOCADDRT:
3294 			err = ip6_route_add(&cfg, NULL);
3295 			break;
3296 		case SIOCDELRT:
3297 			err = ip6_route_del(&cfg, NULL);
3298 			break;
3299 		default:
3300 			err = -EINVAL;
3301 		}
3302 		rtnl_unlock();
3303 
3304 		return err;
3305 	}
3306 
3307 	return -EINVAL;
3308 }
3309 
3310 /*
3311  *	Drop the packet on the floor
3312  */
3313 
3314 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3315 {
3316 	int type;
3317 	struct dst_entry *dst = skb_dst(skb);
3318 	switch (ipstats_mib_noroutes) {
3319 	case IPSTATS_MIB_INNOROUTES:
3320 		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3321 		if (type == IPV6_ADDR_ANY) {
3322 			IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3323 				      IPSTATS_MIB_INADDRERRORS);
3324 			break;
3325 		}
3326 		/* FALLTHROUGH */
3327 	case IPSTATS_MIB_OUTNOROUTES:
3328 		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3329 			      ipstats_mib_noroutes);
3330 		break;
3331 	}
3332 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3333 	kfree_skb(skb);
3334 	return 0;
3335 }
3336 
3337 static int ip6_pkt_discard(struct sk_buff *skb)
3338 {
3339 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3340 }
3341 
3342 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3343 {
3344 	skb->dev = skb_dst(skb)->dev;
3345 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3346 }
3347 
3348 static int ip6_pkt_prohibit(struct sk_buff *skb)
3349 {
3350 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3351 }
3352 
3353 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3354 {
3355 	skb->dev = skb_dst(skb)->dev;
3356 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3357 }
3358 
3359 /*
3360  *	Allocate a dst for local (unicast / anycast) address.
3361  */
3362 
3363 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
3364 				    const struct in6_addr *addr,
3365 				    bool anycast)
3366 {
3367 	u32 tb_id;
3368 	struct net *net = dev_net(idev->dev);
3369 	struct net_device *dev = idev->dev;
3370 	struct rt6_info *rt;
3371 
3372 	rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
3373 	if (!rt)
3374 		return ERR_PTR(-ENOMEM);
3375 
3376 	in6_dev_hold(idev);
3377 
3378 	rt->dst.flags |= DST_HOST;
3379 	rt->dst.input = ip6_input;
3380 	rt->dst.output = ip6_output;
3381 	rt->rt6i_idev = idev;
3382 
3383 	rt->rt6i_protocol = RTPROT_KERNEL;
3384 	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
3385 	if (anycast)
3386 		rt->rt6i_flags |= RTF_ANYCAST;
3387 	else
3388 		rt->rt6i_flags |= RTF_LOCAL;
3389 
3390 	rt->rt6i_gateway  = *addr;
3391 	rt->rt6i_dst.addr = *addr;
3392 	rt->rt6i_dst.plen = 128;
3393 	tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3394 	rt->rt6i_table = fib6_get_table(net, tb_id);
3395 
3396 	return rt;
3397 }
3398 
3399 /* remove deleted ip from prefsrc entries */
3400 struct arg_dev_net_ip {
3401 	struct net_device *dev;
3402 	struct net *net;
3403 	struct in6_addr *addr;
3404 };
3405 
3406 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
3407 {
3408 	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3409 	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3410 	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3411 
3412 	if (((void *)rt->dst.dev == dev || !dev) &&
3413 	    rt != net->ipv6.ip6_null_entry &&
3414 	    ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
3415 		spin_lock_bh(&rt6_exception_lock);
3416 		/* remove prefsrc entry */
3417 		rt->rt6i_prefsrc.plen = 0;
3418 		/* need to update cache as well */
3419 		rt6_exceptions_remove_prefsrc(rt);
3420 		spin_unlock_bh(&rt6_exception_lock);
3421 	}
3422 	return 0;
3423 }
3424 
3425 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3426 {
3427 	struct net *net = dev_net(ifp->idev->dev);
3428 	struct arg_dev_net_ip adni = {
3429 		.dev = ifp->idev->dev,
3430 		.net = net,
3431 		.addr = &ifp->addr,
3432 	};
3433 	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3434 }
3435 
3436 #define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3437 
3438 /* Remove routers and update dst entries when gateway turn into host. */
3439 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
3440 {
3441 	struct in6_addr *gateway = (struct in6_addr *)arg;
3442 
3443 	if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3444 	    ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
3445 		return -1;
3446 	}
3447 
3448 	/* Further clean up cached routes in exception table.
3449 	 * This is needed because cached route may have a different
3450 	 * gateway than its 'parent' in the case of an ip redirect.
3451 	 */
3452 	rt6_exceptions_clean_tohost(rt, gateway);
3453 
3454 	return 0;
3455 }
3456 
3457 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3458 {
3459 	fib6_clean_all(net, fib6_clean_tohost, gateway);
3460 }
3461 
3462 struct arg_dev_net {
3463 	struct net_device *dev;
3464 	struct net *net;
3465 };
3466 
3467 /* called with write lock held for table with rt */
3468 static int fib6_ifdown(struct rt6_info *rt, void *arg)
3469 {
3470 	const struct arg_dev_net *adn = arg;
3471 	const struct net_device *dev = adn->dev;
3472 
3473 	if ((rt->dst.dev == dev || !dev) &&
3474 	    rt != adn->net->ipv6.ip6_null_entry &&
3475 	    (rt->rt6i_nsiblings == 0 ||
3476 	     (dev && netdev_unregistering(dev)) ||
3477 	     !rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
3478 		return -1;
3479 
3480 	return 0;
3481 }
3482 
3483 void rt6_ifdown(struct net *net, struct net_device *dev)
3484 {
3485 	struct arg_dev_net adn = {
3486 		.dev = dev,
3487 		.net = net,
3488 	};
3489 
3490 	fib6_clean_all(net, fib6_ifdown, &adn);
3491 	if (dev)
3492 		rt6_uncached_list_flush_dev(net, dev);
3493 }
3494 
3495 struct rt6_mtu_change_arg {
3496 	struct net_device *dev;
3497 	unsigned int mtu;
3498 };
3499 
3500 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
3501 {
3502 	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
3503 	struct inet6_dev *idev;
3504 
3505 	/* In IPv6 pmtu discovery is not optional,
3506 	   so that RTAX_MTU lock cannot disable it.
3507 	   We still use this lock to block changes
3508 	   caused by addrconf/ndisc.
3509 	*/
3510 
3511 	idev = __in6_dev_get(arg->dev);
3512 	if (!idev)
3513 		return 0;
3514 
3515 	/* For administrative MTU increase, there is no way to discover
3516 	   IPv6 PMTU increase, so PMTU increase should be updated here.
3517 	   Since RFC 1981 doesn't include administrative MTU increase
3518 	   update PMTU increase is a MUST. (i.e. jumbo frame)
3519 	 */
3520 	/*
3521 	   If new MTU is less than route PMTU, this new MTU will be the
3522 	   lowest MTU in the path, update the route PMTU to reflect PMTU
3523 	   decreases; if new MTU is greater than route PMTU, and the
3524 	   old MTU is the lowest MTU in the path, update the route PMTU
3525 	   to reflect the increase. In this case if the other nodes' MTU
3526 	   also have the lowest MTU, TOO BIG MESSAGE will be lead to
3527 	   PMTU discovery.
3528 	 */
3529 	if (rt->dst.dev == arg->dev &&
3530 	    dst_metric_raw(&rt->dst, RTAX_MTU) &&
3531 	    !dst_metric_locked(&rt->dst, RTAX_MTU)) {
3532 		spin_lock_bh(&rt6_exception_lock);
3533 		if (dst_mtu(&rt->dst) >= arg->mtu ||
3534 		    (dst_mtu(&rt->dst) < arg->mtu &&
3535 		     dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
3536 			dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
3537 		}
3538 		rt6_exceptions_update_pmtu(rt, arg->mtu);
3539 		spin_unlock_bh(&rt6_exception_lock);
3540 	}
3541 	return 0;
3542 }
3543 
3544 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
3545 {
3546 	struct rt6_mtu_change_arg arg = {
3547 		.dev = dev,
3548 		.mtu = mtu,
3549 	};
3550 
3551 	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
3552 }
3553 
3554 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
3555 	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
3556 	[RTA_OIF]               = { .type = NLA_U32 },
3557 	[RTA_IIF]		= { .type = NLA_U32 },
3558 	[RTA_PRIORITY]          = { .type = NLA_U32 },
3559 	[RTA_METRICS]           = { .type = NLA_NESTED },
3560 	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
3561 	[RTA_PREF]              = { .type = NLA_U8 },
3562 	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
3563 	[RTA_ENCAP]		= { .type = NLA_NESTED },
3564 	[RTA_EXPIRES]		= { .type = NLA_U32 },
3565 	[RTA_UID]		= { .type = NLA_U32 },
3566 	[RTA_MARK]		= { .type = NLA_U32 },
3567 };
3568 
3569 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
3570 			      struct fib6_config *cfg,
3571 			      struct netlink_ext_ack *extack)
3572 {
3573 	struct rtmsg *rtm;
3574 	struct nlattr *tb[RTA_MAX+1];
3575 	unsigned int pref;
3576 	int err;
3577 
3578 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
3579 			  NULL);
3580 	if (err < 0)
3581 		goto errout;
3582 
3583 	err = -EINVAL;
3584 	rtm = nlmsg_data(nlh);
3585 	memset(cfg, 0, sizeof(*cfg));
3586 
3587 	cfg->fc_table = rtm->rtm_table;
3588 	cfg->fc_dst_len = rtm->rtm_dst_len;
3589 	cfg->fc_src_len = rtm->rtm_src_len;
3590 	cfg->fc_flags = RTF_UP;
3591 	cfg->fc_protocol = rtm->rtm_protocol;
3592 	cfg->fc_type = rtm->rtm_type;
3593 
3594 	if (rtm->rtm_type == RTN_UNREACHABLE ||
3595 	    rtm->rtm_type == RTN_BLACKHOLE ||
3596 	    rtm->rtm_type == RTN_PROHIBIT ||
3597 	    rtm->rtm_type == RTN_THROW)
3598 		cfg->fc_flags |= RTF_REJECT;
3599 
3600 	if (rtm->rtm_type == RTN_LOCAL)
3601 		cfg->fc_flags |= RTF_LOCAL;
3602 
3603 	if (rtm->rtm_flags & RTM_F_CLONED)
3604 		cfg->fc_flags |= RTF_CACHE;
3605 
3606 	cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
3607 	cfg->fc_nlinfo.nlh = nlh;
3608 	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
3609 
3610 	if (tb[RTA_GATEWAY]) {
3611 		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
3612 		cfg->fc_flags |= RTF_GATEWAY;
3613 	}
3614 
3615 	if (tb[RTA_DST]) {
3616 		int plen = (rtm->rtm_dst_len + 7) >> 3;
3617 
3618 		if (nla_len(tb[RTA_DST]) < plen)
3619 			goto errout;
3620 
3621 		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
3622 	}
3623 
3624 	if (tb[RTA_SRC]) {
3625 		int plen = (rtm->rtm_src_len + 7) >> 3;
3626 
3627 		if (nla_len(tb[RTA_SRC]) < plen)
3628 			goto errout;
3629 
3630 		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
3631 	}
3632 
3633 	if (tb[RTA_PREFSRC])
3634 		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
3635 
3636 	if (tb[RTA_OIF])
3637 		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
3638 
3639 	if (tb[RTA_PRIORITY])
3640 		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
3641 
3642 	if (tb[RTA_METRICS]) {
3643 		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
3644 		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
3645 	}
3646 
3647 	if (tb[RTA_TABLE])
3648 		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
3649 
3650 	if (tb[RTA_MULTIPATH]) {
3651 		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
3652 		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
3653 
3654 		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
3655 						     cfg->fc_mp_len, extack);
3656 		if (err < 0)
3657 			goto errout;
3658 	}
3659 
3660 	if (tb[RTA_PREF]) {
3661 		pref = nla_get_u8(tb[RTA_PREF]);
3662 		if (pref != ICMPV6_ROUTER_PREF_LOW &&
3663 		    pref != ICMPV6_ROUTER_PREF_HIGH)
3664 			pref = ICMPV6_ROUTER_PREF_MEDIUM;
3665 		cfg->fc_flags |= RTF_PREF(pref);
3666 	}
3667 
3668 	if (tb[RTA_ENCAP])
3669 		cfg->fc_encap = tb[RTA_ENCAP];
3670 
3671 	if (tb[RTA_ENCAP_TYPE]) {
3672 		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
3673 
3674 		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
3675 		if (err < 0)
3676 			goto errout;
3677 	}
3678 
3679 	if (tb[RTA_EXPIRES]) {
3680 		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
3681 
3682 		if (addrconf_finite_timeout(timeout)) {
3683 			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
3684 			cfg->fc_flags |= RTF_EXPIRES;
3685 		}
3686 	}
3687 
3688 	err = 0;
3689 errout:
3690 	return err;
3691 }
3692 
3693 struct rt6_nh {
3694 	struct rt6_info *rt6_info;
3695 	struct fib6_config r_cfg;
3696 	struct mx6_config mxc;
3697 	struct list_head next;
3698 };
3699 
3700 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
3701 {
3702 	struct rt6_nh *nh;
3703 
3704 	list_for_each_entry(nh, rt6_nh_list, next) {
3705 		pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
3706 		        &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
3707 		        nh->r_cfg.fc_ifindex);
3708 	}
3709 }
3710 
3711 static int ip6_route_info_append(struct list_head *rt6_nh_list,
3712 				 struct rt6_info *rt, struct fib6_config *r_cfg)
3713 {
3714 	struct rt6_nh *nh;
3715 	int err = -EEXIST;
3716 
3717 	list_for_each_entry(nh, rt6_nh_list, next) {
3718 		/* check if rt6_info already exists */
3719 		if (rt6_duplicate_nexthop(nh->rt6_info, rt))
3720 			return err;
3721 	}
3722 
3723 	nh = kzalloc(sizeof(*nh), GFP_KERNEL);
3724 	if (!nh)
3725 		return -ENOMEM;
3726 	nh->rt6_info = rt;
3727 	err = ip6_convert_metrics(&nh->mxc, r_cfg);
3728 	if (err) {
3729 		kfree(nh);
3730 		return err;
3731 	}
3732 	memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
3733 	list_add_tail(&nh->next, rt6_nh_list);
3734 
3735 	return 0;
3736 }
3737 
3738 static void ip6_route_mpath_notify(struct rt6_info *rt,
3739 				   struct rt6_info *rt_last,
3740 				   struct nl_info *info,
3741 				   __u16 nlflags)
3742 {
3743 	/* if this is an APPEND route, then rt points to the first route
3744 	 * inserted and rt_last points to last route inserted. Userspace
3745 	 * wants a consistent dump of the route which starts at the first
3746 	 * nexthop. Since sibling routes are always added at the end of
3747 	 * the list, find the first sibling of the last route appended
3748 	 */
3749 	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
3750 		rt = list_first_entry(&rt_last->rt6i_siblings,
3751 				      struct rt6_info,
3752 				      rt6i_siblings);
3753 	}
3754 
3755 	if (rt)
3756 		inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
3757 }
3758 
3759 static int ip6_route_multipath_add(struct fib6_config *cfg,
3760 				   struct netlink_ext_ack *extack)
3761 {
3762 	struct rt6_info *rt_notif = NULL, *rt_last = NULL;
3763 	struct nl_info *info = &cfg->fc_nlinfo;
3764 	struct fib6_config r_cfg;
3765 	struct rtnexthop *rtnh;
3766 	struct rt6_info *rt;
3767 	struct rt6_nh *err_nh;
3768 	struct rt6_nh *nh, *nh_safe;
3769 	__u16 nlflags;
3770 	int remaining;
3771 	int attrlen;
3772 	int err = 1;
3773 	int nhn = 0;
3774 	int replace = (cfg->fc_nlinfo.nlh &&
3775 		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
3776 	LIST_HEAD(rt6_nh_list);
3777 
3778 	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
3779 	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
3780 		nlflags |= NLM_F_APPEND;
3781 
3782 	remaining = cfg->fc_mp_len;
3783 	rtnh = (struct rtnexthop *)cfg->fc_mp;
3784 
3785 	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
3786 	 * rt6_info structs per nexthop
3787 	 */
3788 	while (rtnh_ok(rtnh, remaining)) {
3789 		memcpy(&r_cfg, cfg, sizeof(*cfg));
3790 		if (rtnh->rtnh_ifindex)
3791 			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
3792 
3793 		attrlen = rtnh_attrlen(rtnh);
3794 		if (attrlen > 0) {
3795 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
3796 
3797 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
3798 			if (nla) {
3799 				r_cfg.fc_gateway = nla_get_in6_addr(nla);
3800 				r_cfg.fc_flags |= RTF_GATEWAY;
3801 			}
3802 			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
3803 			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
3804 			if (nla)
3805 				r_cfg.fc_encap_type = nla_get_u16(nla);
3806 		}
3807 
3808 		rt = ip6_route_info_create(&r_cfg, extack);
3809 		if (IS_ERR(rt)) {
3810 			err = PTR_ERR(rt);
3811 			rt = NULL;
3812 			goto cleanup;
3813 		}
3814 
3815 		err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
3816 		if (err) {
3817 			dst_release_immediate(&rt->dst);
3818 			goto cleanup;
3819 		}
3820 
3821 		rtnh = rtnh_next(rtnh, &remaining);
3822 	}
3823 
3824 	/* for add and replace send one notification with all nexthops.
3825 	 * Skip the notification in fib6_add_rt2node and send one with
3826 	 * the full route when done
3827 	 */
3828 	info->skip_notify = 1;
3829 
3830 	err_nh = NULL;
3831 	list_for_each_entry(nh, &rt6_nh_list, next) {
3832 		rt_last = nh->rt6_info;
3833 		err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack);
3834 		/* save reference to first route for notification */
3835 		if (!rt_notif && !err)
3836 			rt_notif = nh->rt6_info;
3837 
3838 		/* nh->rt6_info is used or freed at this point, reset to NULL*/
3839 		nh->rt6_info = NULL;
3840 		if (err) {
3841 			if (replace && nhn)
3842 				ip6_print_replace_route_err(&rt6_nh_list);
3843 			err_nh = nh;
3844 			goto add_errout;
3845 		}
3846 
3847 		/* Because each route is added like a single route we remove
3848 		 * these flags after the first nexthop: if there is a collision,
3849 		 * we have already failed to add the first nexthop:
3850 		 * fib6_add_rt2node() has rejected it; when replacing, old
3851 		 * nexthops have been replaced by first new, the rest should
3852 		 * be added to it.
3853 		 */
3854 		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
3855 						     NLM_F_REPLACE);
3856 		nhn++;
3857 	}
3858 
3859 	/* success ... tell user about new route */
3860 	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
3861 	goto cleanup;
3862 
3863 add_errout:
3864 	/* send notification for routes that were added so that
3865 	 * the delete notifications sent by ip6_route_del are
3866 	 * coherent
3867 	 */
3868 	if (rt_notif)
3869 		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
3870 
3871 	/* Delete routes that were already added */
3872 	list_for_each_entry(nh, &rt6_nh_list, next) {
3873 		if (err_nh == nh)
3874 			break;
3875 		ip6_route_del(&nh->r_cfg, extack);
3876 	}
3877 
3878 cleanup:
3879 	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
3880 		if (nh->rt6_info)
3881 			dst_release_immediate(&nh->rt6_info->dst);
3882 		kfree(nh->mxc.mx);
3883 		list_del(&nh->next);
3884 		kfree(nh);
3885 	}
3886 
3887 	return err;
3888 }
3889 
3890 static int ip6_route_multipath_del(struct fib6_config *cfg,
3891 				   struct netlink_ext_ack *extack)
3892 {
3893 	struct fib6_config r_cfg;
3894 	struct rtnexthop *rtnh;
3895 	int remaining;
3896 	int attrlen;
3897 	int err = 1, last_err = 0;
3898 
3899 	remaining = cfg->fc_mp_len;
3900 	rtnh = (struct rtnexthop *)cfg->fc_mp;
3901 
3902 	/* Parse a Multipath Entry */
3903 	while (rtnh_ok(rtnh, remaining)) {
3904 		memcpy(&r_cfg, cfg, sizeof(*cfg));
3905 		if (rtnh->rtnh_ifindex)
3906 			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
3907 
3908 		attrlen = rtnh_attrlen(rtnh);
3909 		if (attrlen > 0) {
3910 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
3911 
3912 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
3913 			if (nla) {
3914 				nla_memcpy(&r_cfg.fc_gateway, nla, 16);
3915 				r_cfg.fc_flags |= RTF_GATEWAY;
3916 			}
3917 		}
3918 		err = ip6_route_del(&r_cfg, extack);
3919 		if (err)
3920 			last_err = err;
3921 
3922 		rtnh = rtnh_next(rtnh, &remaining);
3923 	}
3924 
3925 	return last_err;
3926 }
3927 
3928 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3929 			      struct netlink_ext_ack *extack)
3930 {
3931 	struct fib6_config cfg;
3932 	int err;
3933 
3934 	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
3935 	if (err < 0)
3936 		return err;
3937 
3938 	if (cfg.fc_mp)
3939 		return ip6_route_multipath_del(&cfg, extack);
3940 	else {
3941 		cfg.fc_delete_all_nh = 1;
3942 		return ip6_route_del(&cfg, extack);
3943 	}
3944 }
3945 
3946 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3947 			      struct netlink_ext_ack *extack)
3948 {
3949 	struct fib6_config cfg;
3950 	int err;
3951 
3952 	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
3953 	if (err < 0)
3954 		return err;
3955 
3956 	if (cfg.fc_mp)
3957 		return ip6_route_multipath_add(&cfg, extack);
3958 	else
3959 		return ip6_route_add(&cfg, extack);
3960 }
3961 
3962 static size_t rt6_nlmsg_size(struct rt6_info *rt)
3963 {
3964 	int nexthop_len = 0;
3965 
3966 	if (rt->rt6i_nsiblings) {
3967 		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
3968 			    + NLA_ALIGN(sizeof(struct rtnexthop))
3969 			    + nla_total_size(16) /* RTA_GATEWAY */
3970 			    + lwtunnel_get_encap_size(rt->dst.lwtstate);
3971 
3972 		nexthop_len *= rt->rt6i_nsiblings;
3973 	}
3974 
3975 	return NLMSG_ALIGN(sizeof(struct rtmsg))
3976 	       + nla_total_size(16) /* RTA_SRC */
3977 	       + nla_total_size(16) /* RTA_DST */
3978 	       + nla_total_size(16) /* RTA_GATEWAY */
3979 	       + nla_total_size(16) /* RTA_PREFSRC */
3980 	       + nla_total_size(4) /* RTA_TABLE */
3981 	       + nla_total_size(4) /* RTA_IIF */
3982 	       + nla_total_size(4) /* RTA_OIF */
3983 	       + nla_total_size(4) /* RTA_PRIORITY */
3984 	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
3985 	       + nla_total_size(sizeof(struct rta_cacheinfo))
3986 	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
3987 	       + nla_total_size(1) /* RTA_PREF */
3988 	       + lwtunnel_get_encap_size(rt->dst.lwtstate)
3989 	       + nexthop_len;
3990 }
3991 
3992 static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
3993 			    unsigned int *flags, bool skip_oif)
3994 {
3995 	if (!netif_running(rt->dst.dev) || !netif_carrier_ok(rt->dst.dev)) {
3996 		*flags |= RTNH_F_LINKDOWN;
3997 		if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
3998 			*flags |= RTNH_F_DEAD;
3999 	}
4000 
4001 	if (rt->rt6i_flags & RTF_GATEWAY) {
4002 		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
4003 			goto nla_put_failure;
4004 	}
4005 
4006 	if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD)
4007 		*flags |= RTNH_F_OFFLOAD;
4008 
4009 	/* not needed for multipath encoding b/c it has a rtnexthop struct */
4010 	if (!skip_oif && rt->dst.dev &&
4011 	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
4012 		goto nla_put_failure;
4013 
4014 	if (rt->dst.lwtstate &&
4015 	    lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
4016 		goto nla_put_failure;
4017 
4018 	return 0;
4019 
4020 nla_put_failure:
4021 	return -EMSGSIZE;
4022 }
4023 
4024 /* add multipath next hop */
4025 static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
4026 {
4027 	struct rtnexthop *rtnh;
4028 	unsigned int flags = 0;
4029 
4030 	rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4031 	if (!rtnh)
4032 		goto nla_put_failure;
4033 
4034 	rtnh->rtnh_hops = 0;
4035 	rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;
4036 
4037 	if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4038 		goto nla_put_failure;
4039 
4040 	rtnh->rtnh_flags = flags;
4041 
4042 	/* length of rtnetlink header + attributes */
4043 	rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
4044 
4045 	return 0;
4046 
4047 nla_put_failure:
4048 	return -EMSGSIZE;
4049 }
4050 
4051 static int rt6_fill_node(struct net *net,
4052 			 struct sk_buff *skb, struct rt6_info *rt,
4053 			 struct in6_addr *dst, struct in6_addr *src,
4054 			 int iif, int type, u32 portid, u32 seq,
4055 			 unsigned int flags)
4056 {
4057 	u32 metrics[RTAX_MAX];
4058 	struct rtmsg *rtm;
4059 	struct nlmsghdr *nlh;
4060 	long expires;
4061 	u32 table;
4062 
4063 	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4064 	if (!nlh)
4065 		return -EMSGSIZE;
4066 
4067 	rtm = nlmsg_data(nlh);
4068 	rtm->rtm_family = AF_INET6;
4069 	rtm->rtm_dst_len = rt->rt6i_dst.plen;
4070 	rtm->rtm_src_len = rt->rt6i_src.plen;
4071 	rtm->rtm_tos = 0;
4072 	if (rt->rt6i_table)
4073 		table = rt->rt6i_table->tb6_id;
4074 	else
4075 		table = RT6_TABLE_UNSPEC;
4076 	rtm->rtm_table = table;
4077 	if (nla_put_u32(skb, RTA_TABLE, table))
4078 		goto nla_put_failure;
4079 	if (rt->rt6i_flags & RTF_REJECT) {
4080 		switch (rt->dst.error) {
4081 		case -EINVAL:
4082 			rtm->rtm_type = RTN_BLACKHOLE;
4083 			break;
4084 		case -EACCES:
4085 			rtm->rtm_type = RTN_PROHIBIT;
4086 			break;
4087 		case -EAGAIN:
4088 			rtm->rtm_type = RTN_THROW;
4089 			break;
4090 		default:
4091 			rtm->rtm_type = RTN_UNREACHABLE;
4092 			break;
4093 		}
4094 	}
4095 	else if (rt->rt6i_flags & RTF_LOCAL)
4096 		rtm->rtm_type = RTN_LOCAL;
4097 	else if (rt->rt6i_flags & RTF_ANYCAST)
4098 		rtm->rtm_type = RTN_ANYCAST;
4099 	else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
4100 		rtm->rtm_type = RTN_LOCAL;
4101 	else
4102 		rtm->rtm_type = RTN_UNICAST;
4103 	rtm->rtm_flags = 0;
4104 	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4105 	rtm->rtm_protocol = rt->rt6i_protocol;
4106 
4107 	if (rt->rt6i_flags & RTF_CACHE)
4108 		rtm->rtm_flags |= RTM_F_CLONED;
4109 
4110 	if (dst) {
4111 		if (nla_put_in6_addr(skb, RTA_DST, dst))
4112 			goto nla_put_failure;
4113 		rtm->rtm_dst_len = 128;
4114 	} else if (rtm->rtm_dst_len)
4115 		if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
4116 			goto nla_put_failure;
4117 #ifdef CONFIG_IPV6_SUBTREES
4118 	if (src) {
4119 		if (nla_put_in6_addr(skb, RTA_SRC, src))
4120 			goto nla_put_failure;
4121 		rtm->rtm_src_len = 128;
4122 	} else if (rtm->rtm_src_len &&
4123 		   nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
4124 		goto nla_put_failure;
4125 #endif
4126 	if (iif) {
4127 #ifdef CONFIG_IPV6_MROUTE
4128 		if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
4129 			int err = ip6mr_get_route(net, skb, rtm, portid);
4130 
4131 			if (err == 0)
4132 				return 0;
4133 			if (err < 0)
4134 				goto nla_put_failure;
4135 		} else
4136 #endif
4137 			if (nla_put_u32(skb, RTA_IIF, iif))
4138 				goto nla_put_failure;
4139 	} else if (dst) {
4140 		struct in6_addr saddr_buf;
4141 		if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
4142 		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4143 			goto nla_put_failure;
4144 	}
4145 
4146 	if (rt->rt6i_prefsrc.plen) {
4147 		struct in6_addr saddr_buf;
4148 		saddr_buf = rt->rt6i_prefsrc.addr;
4149 		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4150 			goto nla_put_failure;
4151 	}
4152 
4153 	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
4154 	if (rt->rt6i_pmtu)
4155 		metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
4156 	if (rtnetlink_put_metrics(skb, metrics) < 0)
4157 		goto nla_put_failure;
4158 
4159 	if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
4160 		goto nla_put_failure;
4161 
4162 	/* For multipath routes, walk the siblings list and add
4163 	 * each as a nexthop within RTA_MULTIPATH.
4164 	 */
4165 	if (rt->rt6i_nsiblings) {
4166 		struct rt6_info *sibling, *next_sibling;
4167 		struct nlattr *mp;
4168 
4169 		mp = nla_nest_start(skb, RTA_MULTIPATH);
4170 		if (!mp)
4171 			goto nla_put_failure;
4172 
4173 		if (rt6_add_nexthop(skb, rt) < 0)
4174 			goto nla_put_failure;
4175 
4176 		list_for_each_entry_safe(sibling, next_sibling,
4177 					 &rt->rt6i_siblings, rt6i_siblings) {
4178 			if (rt6_add_nexthop(skb, sibling) < 0)
4179 				goto nla_put_failure;
4180 		}
4181 
4182 		nla_nest_end(skb, mp);
4183 	} else {
4184 		if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
4185 			goto nla_put_failure;
4186 	}
4187 
4188 	expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
4189 
4190 	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
4191 		goto nla_put_failure;
4192 
4193 	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
4194 		goto nla_put_failure;
4195 
4196 
4197 	nlmsg_end(skb, nlh);
4198 	return 0;
4199 
4200 nla_put_failure:
4201 	nlmsg_cancel(skb, nlh);
4202 	return -EMSGSIZE;
4203 }
4204 
4205 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
4206 {
4207 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4208 	struct net *net = arg->net;
4209 
4210 	if (rt == net->ipv6.ip6_null_entry)
4211 		return 0;
4212 
4213 	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4214 		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4215 
4216 		/* user wants prefix routes only */
4217 		if (rtm->rtm_flags & RTM_F_PREFIX &&
4218 		    !(rt->rt6i_flags & RTF_PREFIX_RT)) {
4219 			/* success since this is not a prefix route */
4220 			return 1;
4221 		}
4222 	}
4223 
4224 	return rt6_fill_node(net,
4225 		     arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
4226 		     NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
4227 		     NLM_F_MULTI);
4228 }
4229 
4230 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4231 			      struct netlink_ext_ack *extack)
4232 {
4233 	struct net *net = sock_net(in_skb->sk);
4234 	struct nlattr *tb[RTA_MAX+1];
4235 	int err, iif = 0, oif = 0;
4236 	struct dst_entry *dst;
4237 	struct rt6_info *rt;
4238 	struct sk_buff *skb;
4239 	struct rtmsg *rtm;
4240 	struct flowi6 fl6;
4241 	bool fibmatch;
4242 
4243 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4244 			  extack);
4245 	if (err < 0)
4246 		goto errout;
4247 
4248 	err = -EINVAL;
4249 	memset(&fl6, 0, sizeof(fl6));
4250 	rtm = nlmsg_data(nlh);
4251 	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4252 	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4253 
4254 	if (tb[RTA_SRC]) {
4255 		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4256 			goto errout;
4257 
4258 		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4259 	}
4260 
4261 	if (tb[RTA_DST]) {
4262 		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4263 			goto errout;
4264 
4265 		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4266 	}
4267 
4268 	if (tb[RTA_IIF])
4269 		iif = nla_get_u32(tb[RTA_IIF]);
4270 
4271 	if (tb[RTA_OIF])
4272 		oif = nla_get_u32(tb[RTA_OIF]);
4273 
4274 	if (tb[RTA_MARK])
4275 		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4276 
4277 	if (tb[RTA_UID])
4278 		fl6.flowi6_uid = make_kuid(current_user_ns(),
4279 					   nla_get_u32(tb[RTA_UID]));
4280 	else
4281 		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4282 
4283 	if (iif) {
4284 		struct net_device *dev;
4285 		int flags = 0;
4286 
4287 		rcu_read_lock();
4288 
4289 		dev = dev_get_by_index_rcu(net, iif);
4290 		if (!dev) {
4291 			rcu_read_unlock();
4292 			err = -ENODEV;
4293 			goto errout;
4294 		}
4295 
4296 		fl6.flowi6_iif = iif;
4297 
4298 		if (!ipv6_addr_any(&fl6.saddr))
4299 			flags |= RT6_LOOKUP_F_HAS_SADDR;
4300 
4301 		dst = ip6_route_input_lookup(net, dev, &fl6, flags);
4302 
4303 		rcu_read_unlock();
4304 	} else {
4305 		fl6.flowi6_oif = oif;
4306 
4307 		dst = ip6_route_output(net, NULL, &fl6);
4308 	}
4309 
4310 
4311 	rt = container_of(dst, struct rt6_info, dst);
4312 	if (rt->dst.error) {
4313 		err = rt->dst.error;
4314 		ip6_rt_put(rt);
4315 		goto errout;
4316 	}
4317 
4318 	if (rt == net->ipv6.ip6_null_entry) {
4319 		err = rt->dst.error;
4320 		ip6_rt_put(rt);
4321 		goto errout;
4322 	}
4323 
4324 	if (fibmatch && rt->dst.from) {
4325 		struct rt6_info *ort = container_of(rt->dst.from,
4326 						    struct rt6_info, dst);
4327 
4328 		dst_hold(&ort->dst);
4329 		ip6_rt_put(rt);
4330 		rt = ort;
4331 	}
4332 
4333 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
4334 	if (!skb) {
4335 		ip6_rt_put(rt);
4336 		err = -ENOBUFS;
4337 		goto errout;
4338 	}
4339 
4340 	skb_dst_set(skb, &rt->dst);
4341 	if (fibmatch)
4342 		err = rt6_fill_node(net, skb, rt, NULL, NULL, iif,
4343 				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4344 				    nlh->nlmsg_seq, 0);
4345 	else
4346 		err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
4347 				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4348 				    nlh->nlmsg_seq, 0);
4349 	if (err < 0) {
4350 		kfree_skb(skb);
4351 		goto errout;
4352 	}
4353 
4354 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
4355 errout:
4356 	return err;
4357 }
4358 
4359 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
4360 		     unsigned int nlm_flags)
4361 {
4362 	struct sk_buff *skb;
4363 	struct net *net = info->nl_net;
4364 	u32 seq;
4365 	int err;
4366 
4367 	err = -ENOBUFS;
4368 	seq = info->nlh ? info->nlh->nlmsg_seq : 0;
4369 
4370 	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4371 	if (!skb)
4372 		goto errout;
4373 
4374 	err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
4375 				event, info->portid, seq, nlm_flags);
4376 	if (err < 0) {
4377 		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4378 		WARN_ON(err == -EMSGSIZE);
4379 		kfree_skb(skb);
4380 		goto errout;
4381 	}
4382 	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4383 		    info->nlh, gfp_any());
4384 	return;
4385 errout:
4386 	if (err < 0)
4387 		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
4388 }
4389 
4390 static int ip6_route_dev_notify(struct notifier_block *this,
4391 				unsigned long event, void *ptr)
4392 {
4393 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4394 	struct net *net = dev_net(dev);
4395 
4396 	if (!(dev->flags & IFF_LOOPBACK))
4397 		return NOTIFY_OK;
4398 
4399 	if (event == NETDEV_REGISTER) {
4400 		net->ipv6.ip6_null_entry->dst.dev = dev;
4401 		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
4402 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4403 		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
4404 		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
4405 		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
4406 		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
4407 #endif
4408 	 } else if (event == NETDEV_UNREGISTER &&
4409 		    dev->reg_state != NETREG_UNREGISTERED) {
4410 		/* NETDEV_UNREGISTER could be fired for multiple times by
4411 		 * netdev_wait_allrefs(). Make sure we only call this once.
4412 		 */
4413 		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
4414 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4415 		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
4416 		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
4417 #endif
4418 	}
4419 
4420 	return NOTIFY_OK;
4421 }
4422 
4423 /*
4424  *	/proc
4425  */
4426 
4427 #ifdef CONFIG_PROC_FS
4428 
4429 static const struct file_operations ipv6_route_proc_fops = {
4430 	.owner		= THIS_MODULE,
4431 	.open		= ipv6_route_open,
4432 	.read		= seq_read,
4433 	.llseek		= seq_lseek,
4434 	.release	= seq_release_net,
4435 };
4436 
4437 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
4438 {
4439 	struct net *net = (struct net *)seq->private;
4440 	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
4441 		   net->ipv6.rt6_stats->fib_nodes,
4442 		   net->ipv6.rt6_stats->fib_route_nodes,
4443 		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
4444 		   net->ipv6.rt6_stats->fib_rt_entries,
4445 		   net->ipv6.rt6_stats->fib_rt_cache,
4446 		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
4447 		   net->ipv6.rt6_stats->fib_discarded_routes);
4448 
4449 	return 0;
4450 }
4451 
4452 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
4453 {
4454 	return single_open_net(inode, file, rt6_stats_seq_show);
4455 }
4456 
4457 static const struct file_operations rt6_stats_seq_fops = {
4458 	.owner	 = THIS_MODULE,
4459 	.open	 = rt6_stats_seq_open,
4460 	.read	 = seq_read,
4461 	.llseek	 = seq_lseek,
4462 	.release = single_release_net,
4463 };
4464 #endif	/* CONFIG_PROC_FS */
4465 
4466 #ifdef CONFIG_SYSCTL
4467 
4468 static
4469 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
4470 			      void __user *buffer, size_t *lenp, loff_t *ppos)
4471 {
4472 	struct net *net;
4473 	int delay;
4474 	if (!write)
4475 		return -EINVAL;
4476 
4477 	net = (struct net *)ctl->extra1;
4478 	delay = net->ipv6.sysctl.flush_delay;
4479 	proc_dointvec(ctl, write, buffer, lenp, ppos);
4480 	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
4481 	return 0;
4482 }
4483 
4484 struct ctl_table ipv6_route_table_template[] = {
4485 	{
4486 		.procname	=	"flush",
4487 		.data		=	&init_net.ipv6.sysctl.flush_delay,
4488 		.maxlen		=	sizeof(int),
4489 		.mode		=	0200,
4490 		.proc_handler	=	ipv6_sysctl_rtcache_flush
4491 	},
4492 	{
4493 		.procname	=	"gc_thresh",
4494 		.data		=	&ip6_dst_ops_template.gc_thresh,
4495 		.maxlen		=	sizeof(int),
4496 		.mode		=	0644,
4497 		.proc_handler	=	proc_dointvec,
4498 	},
4499 	{
4500 		.procname	=	"max_size",
4501 		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
4502 		.maxlen		=	sizeof(int),
4503 		.mode		=	0644,
4504 		.proc_handler	=	proc_dointvec,
4505 	},
4506 	{
4507 		.procname	=	"gc_min_interval",
4508 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
4509 		.maxlen		=	sizeof(int),
4510 		.mode		=	0644,
4511 		.proc_handler	=	proc_dointvec_jiffies,
4512 	},
4513 	{
4514 		.procname	=	"gc_timeout",
4515 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
4516 		.maxlen		=	sizeof(int),
4517 		.mode		=	0644,
4518 		.proc_handler	=	proc_dointvec_jiffies,
4519 	},
4520 	{
4521 		.procname	=	"gc_interval",
4522 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
4523 		.maxlen		=	sizeof(int),
4524 		.mode		=	0644,
4525 		.proc_handler	=	proc_dointvec_jiffies,
4526 	},
4527 	{
4528 		.procname	=	"gc_elasticity",
4529 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
4530 		.maxlen		=	sizeof(int),
4531 		.mode		=	0644,
4532 		.proc_handler	=	proc_dointvec,
4533 	},
4534 	{
4535 		.procname	=	"mtu_expires",
4536 		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
4537 		.maxlen		=	sizeof(int),
4538 		.mode		=	0644,
4539 		.proc_handler	=	proc_dointvec_jiffies,
4540 	},
4541 	{
4542 		.procname	=	"min_adv_mss",
4543 		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
4544 		.maxlen		=	sizeof(int),
4545 		.mode		=	0644,
4546 		.proc_handler	=	proc_dointvec,
4547 	},
4548 	{
4549 		.procname	=	"gc_min_interval_ms",
4550 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
4551 		.maxlen		=	sizeof(int),
4552 		.mode		=	0644,
4553 		.proc_handler	=	proc_dointvec_ms_jiffies,
4554 	},
4555 	{ }
4556 };
4557 
4558 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
4559 {
4560 	struct ctl_table *table;
4561 
4562 	table = kmemdup(ipv6_route_table_template,
4563 			sizeof(ipv6_route_table_template),
4564 			GFP_KERNEL);
4565 
4566 	if (table) {
4567 		table[0].data = &net->ipv6.sysctl.flush_delay;
4568 		table[0].extra1 = net;
4569 		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
4570 		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
4571 		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
4572 		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
4573 		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
4574 		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
4575 		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
4576 		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
4577 		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
4578 
4579 		/* Don't export sysctls to unprivileged users */
4580 		if (net->user_ns != &init_user_ns)
4581 			table[0].procname = NULL;
4582 	}
4583 
4584 	return table;
4585 }
4586 #endif
4587 
4588 static int __net_init ip6_route_net_init(struct net *net)
4589 {
4590 	int ret = -ENOMEM;
4591 
4592 	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
4593 	       sizeof(net->ipv6.ip6_dst_ops));
4594 
4595 	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
4596 		goto out_ip6_dst_ops;
4597 
4598 	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
4599 					   sizeof(*net->ipv6.ip6_null_entry),
4600 					   GFP_KERNEL);
4601 	if (!net->ipv6.ip6_null_entry)
4602 		goto out_ip6_dst_entries;
4603 	net->ipv6.ip6_null_entry->dst.path =
4604 		(struct dst_entry *)net->ipv6.ip6_null_entry;
4605 	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4606 	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
4607 			 ip6_template_metrics, true);
4608 
4609 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4610 	net->ipv6.fib6_has_custom_rules = false;
4611 	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
4612 					       sizeof(*net->ipv6.ip6_prohibit_entry),
4613 					       GFP_KERNEL);
4614 	if (!net->ipv6.ip6_prohibit_entry)
4615 		goto out_ip6_null_entry;
4616 	net->ipv6.ip6_prohibit_entry->dst.path =
4617 		(struct dst_entry *)net->ipv6.ip6_prohibit_entry;
4618 	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4619 	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
4620 			 ip6_template_metrics, true);
4621 
4622 	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
4623 					       sizeof(*net->ipv6.ip6_blk_hole_entry),
4624 					       GFP_KERNEL);
4625 	if (!net->ipv6.ip6_blk_hole_entry)
4626 		goto out_ip6_prohibit_entry;
4627 	net->ipv6.ip6_blk_hole_entry->dst.path =
4628 		(struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
4629 	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4630 	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
4631 			 ip6_template_metrics, true);
4632 #endif
4633 
4634 	net->ipv6.sysctl.flush_delay = 0;
4635 	net->ipv6.sysctl.ip6_rt_max_size = 4096;
4636 	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
4637 	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
4638 	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
4639 	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
4640 	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
4641 	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
4642 
4643 	net->ipv6.ip6_rt_gc_expire = 30*HZ;
4644 
4645 	ret = 0;
4646 out:
4647 	return ret;
4648 
4649 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4650 out_ip6_prohibit_entry:
4651 	kfree(net->ipv6.ip6_prohibit_entry);
4652 out_ip6_null_entry:
4653 	kfree(net->ipv6.ip6_null_entry);
4654 #endif
4655 out_ip6_dst_entries:
4656 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
4657 out_ip6_dst_ops:
4658 	goto out;
4659 }
4660 
4661 static void __net_exit ip6_route_net_exit(struct net *net)
4662 {
4663 	kfree(net->ipv6.ip6_null_entry);
4664 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4665 	kfree(net->ipv6.ip6_prohibit_entry);
4666 	kfree(net->ipv6.ip6_blk_hole_entry);
4667 #endif
4668 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
4669 }
4670 
4671 static int __net_init ip6_route_net_init_late(struct net *net)
4672 {
4673 #ifdef CONFIG_PROC_FS
4674 	proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
4675 	proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
4676 #endif
4677 	return 0;
4678 }
4679 
4680 static void __net_exit ip6_route_net_exit_late(struct net *net)
4681 {
4682 #ifdef CONFIG_PROC_FS
4683 	remove_proc_entry("ipv6_route", net->proc_net);
4684 	remove_proc_entry("rt6_stats", net->proc_net);
4685 #endif
4686 }
4687 
4688 static struct pernet_operations ip6_route_net_ops = {
4689 	.init = ip6_route_net_init,
4690 	.exit = ip6_route_net_exit,
4691 };
4692 
4693 static int __net_init ipv6_inetpeer_init(struct net *net)
4694 {
4695 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
4696 
4697 	if (!bp)
4698 		return -ENOMEM;
4699 	inet_peer_base_init(bp);
4700 	net->ipv6.peers = bp;
4701 	return 0;
4702 }
4703 
4704 static void __net_exit ipv6_inetpeer_exit(struct net *net)
4705 {
4706 	struct inet_peer_base *bp = net->ipv6.peers;
4707 
4708 	net->ipv6.peers = NULL;
4709 	inetpeer_invalidate_tree(bp);
4710 	kfree(bp);
4711 }
4712 
4713 static struct pernet_operations ipv6_inetpeer_ops = {
4714 	.init	=	ipv6_inetpeer_init,
4715 	.exit	=	ipv6_inetpeer_exit,
4716 };
4717 
4718 static struct pernet_operations ip6_route_net_late_ops = {
4719 	.init = ip6_route_net_init_late,
4720 	.exit = ip6_route_net_exit_late,
4721 };
4722 
4723 static struct notifier_block ip6_route_dev_notifier = {
4724 	.notifier_call = ip6_route_dev_notify,
4725 	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
4726 };
4727 
4728 void __init ip6_route_init_special_entries(void)
4729 {
4730 	/* Registering of the loopback is done before this portion of code,
4731 	 * the loopback reference in rt6_info will not be taken, do it
4732 	 * manually for init_net */
4733 	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
4734 	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4735   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4736 	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
4737 	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4738 	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
4739 	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4740   #endif
4741 }
4742 
4743 int __init ip6_route_init(void)
4744 {
4745 	int ret;
4746 	int cpu;
4747 
4748 	ret = -ENOMEM;
4749 	ip6_dst_ops_template.kmem_cachep =
4750 		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
4751 				  SLAB_HWCACHE_ALIGN, NULL);
4752 	if (!ip6_dst_ops_template.kmem_cachep)
4753 		goto out;
4754 
4755 	ret = dst_entries_init(&ip6_dst_blackhole_ops);
4756 	if (ret)
4757 		goto out_kmem_cache;
4758 
4759 	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
4760 	if (ret)
4761 		goto out_dst_entries;
4762 
4763 	ret = register_pernet_subsys(&ip6_route_net_ops);
4764 	if (ret)
4765 		goto out_register_inetpeer;
4766 
4767 	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
4768 
4769 	ret = fib6_init();
4770 	if (ret)
4771 		goto out_register_subsys;
4772 
4773 	ret = xfrm6_init();
4774 	if (ret)
4775 		goto out_fib6_init;
4776 
4777 	ret = fib6_rules_init();
4778 	if (ret)
4779 		goto xfrm6_init;
4780 
4781 	ret = register_pernet_subsys(&ip6_route_net_late_ops);
4782 	if (ret)
4783 		goto fib6_rules_init;
4784 
4785 	ret = -ENOBUFS;
4786 	if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, 0) ||
4787 	    __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, 0) ||
4788 	    __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL,
4789 			    RTNL_FLAG_DOIT_UNLOCKED))
4790 		goto out_register_late_subsys;
4791 
4792 	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
4793 	if (ret)
4794 		goto out_register_late_subsys;
4795 
4796 	for_each_possible_cpu(cpu) {
4797 		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
4798 
4799 		INIT_LIST_HEAD(&ul->head);
4800 		spin_lock_init(&ul->lock);
4801 	}
4802 
4803 out:
4804 	return ret;
4805 
4806 out_register_late_subsys:
4807 	unregister_pernet_subsys(&ip6_route_net_late_ops);
4808 fib6_rules_init:
4809 	fib6_rules_cleanup();
4810 xfrm6_init:
4811 	xfrm6_fini();
4812 out_fib6_init:
4813 	fib6_gc_cleanup();
4814 out_register_subsys:
4815 	unregister_pernet_subsys(&ip6_route_net_ops);
4816 out_register_inetpeer:
4817 	unregister_pernet_subsys(&ipv6_inetpeer_ops);
4818 out_dst_entries:
4819 	dst_entries_destroy(&ip6_dst_blackhole_ops);
4820 out_kmem_cache:
4821 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
4822 	goto out;
4823 }
4824 
4825 void ip6_route_cleanup(void)
4826 {
4827 	unregister_netdevice_notifier(&ip6_route_dev_notifier);
4828 	unregister_pernet_subsys(&ip6_route_net_late_ops);
4829 	fib6_rules_cleanup();
4830 	xfrm6_fini();
4831 	fib6_gc_cleanup();
4832 	unregister_pernet_subsys(&ipv6_inetpeer_ops);
4833 	unregister_pernet_subsys(&ip6_route_net_ops);
4834 	dst_entries_destroy(&ip6_dst_blackhole_ops);
4835 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
4836 }
4837