xref: /openbmc/linux/net/ipv6/route.c (revision 9dae47aba0a055f761176d9297371d5bb24289ec)
1 /*
2  *	Linux INET6 implementation
3  *	FIB front-end.
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13 
14 /*	Changes:
15  *
16  *	YOSHIFUJI Hideaki @USAGI
17  *		reworked default router selection.
18  *		- respect outgoing interface
19  *		- select from (probably) reachable routers (i.e.
20  *		routers in REACHABLE, STALE, DELAY or PROBE states).
21  *		- always select the same router if it is (probably)
22  *		reachable.  otherwise, round-robin the list.
23  *	Ville Nuorvala
24  *		Fixed routing subtrees.
25  */
26 
27 #define pr_fmt(fmt) "IPv6: " fmt
28 
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <trace/events/fib6.h>
67 
68 #include <linux/uaccess.h>
69 
70 #ifdef CONFIG_SYSCTL
71 #include <linux/sysctl.h>
72 #endif
73 
/* Result codes produced while scoring a route's next hop.
 * Every failure code is negative so callers can test "< 0".
 */
enum rt6_nud_state {
	/* route must be skipped outright (see find_match()) */
	RT6_NUD_FAIL_HARD	= -3,
	/* neighbour entry is in NUD_FAILED (only with CONFIG_IPV6_ROUTER_PREF) */
	RT6_NUD_FAIL_PROBE	= -2,
	/* no neighbour entry; find_match() turns this into a round-robin hint */
	RT6_NUD_FAIL_DO_RR	= -1,
	/* next hop is considered usable */
	RT6_NUD_SUCCEED		= 1
};
80 
81 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
82 static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
83 static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
84 static unsigned int	 ip6_mtu(const struct dst_entry *dst);
85 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
86 static void		ip6_dst_destroy(struct dst_entry *);
87 static void		ip6_dst_ifdown(struct dst_entry *,
88 				       struct net_device *dev, int how);
89 static int		 ip6_dst_gc(struct dst_ops *ops);
90 
91 static int		ip6_pkt_discard(struct sk_buff *skb);
92 static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
93 static int		ip6_pkt_prohibit(struct sk_buff *skb);
94 static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
95 static void		ip6_link_failure(struct sk_buff *skb);
96 static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
97 					   struct sk_buff *skb, u32 mtu);
98 static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
99 					struct sk_buff *skb);
100 static void		rt6_dst_from_metrics_check(struct rt6_info *rt);
101 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
102 static size_t rt6_nlmsg_size(struct rt6_info *rt);
103 static int rt6_fill_node(struct net *net,
104 			 struct sk_buff *skb, struct rt6_info *rt,
105 			 struct in6_addr *dst, struct in6_addr *src,
106 			 int iif, int type, u32 portid, u32 seq,
107 			 unsigned int flags);
108 static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
109 					   struct in6_addr *daddr,
110 					   struct in6_addr *saddr);
111 
#ifdef CONFIG_IPV6_ROUTE_INFO
/* Helpers used by rt6_route_rcv() to look up / create routes learnt from
 * route information options; defined later in this file.
 */
static struct rt6_info *rt6_add_route_info(struct net *net,
					   const struct in6_addr *prefix,
					   int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev,
					   unsigned int pref);
static struct rt6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix,
					   int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev);
#endif
123 
124 struct uncached_list {
125 	spinlock_t		lock;
126 	struct list_head	head;
127 };
128 
129 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
130 
131 static void rt6_uncached_list_add(struct rt6_info *rt)
132 {
133 	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
134 
135 	rt->rt6i_uncached_list = ul;
136 
137 	spin_lock_bh(&ul->lock);
138 	list_add_tail(&rt->rt6i_uncached, &ul->head);
139 	spin_unlock_bh(&ul->lock);
140 }
141 
142 static void rt6_uncached_list_del(struct rt6_info *rt)
143 {
144 	if (!list_empty(&rt->rt6i_uncached)) {
145 		struct uncached_list *ul = rt->rt6i_uncached_list;
146 		struct net *net = dev_net(rt->dst.dev);
147 
148 		spin_lock_bh(&ul->lock);
149 		list_del(&rt->rt6i_uncached);
150 		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
151 		spin_unlock_bh(&ul->lock);
152 	}
153 }
154 
155 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
156 {
157 	struct net_device *loopback_dev = net->loopback_dev;
158 	int cpu;
159 
160 	if (dev == loopback_dev)
161 		return;
162 
163 	for_each_possible_cpu(cpu) {
164 		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
165 		struct rt6_info *rt;
166 
167 		spin_lock_bh(&ul->lock);
168 		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
169 			struct inet6_dev *rt_idev = rt->rt6i_idev;
170 			struct net_device *rt_dev = rt->dst.dev;
171 
172 			if (rt_idev->dev == dev) {
173 				rt->rt6i_idev = in6_dev_get(loopback_dev);
174 				in6_dev_put(rt_idev);
175 			}
176 
177 			if (rt_dev == dev) {
178 				rt->dst.dev = loopback_dev;
179 				dev_hold(rt->dst.dev);
180 				dev_put(rt_dev);
181 			}
182 		}
183 		spin_unlock_bh(&ul->lock);
184 	}
185 }
186 
187 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
188 {
189 	return dst_metrics_write_ptr(&rt->from->dst);
190 }
191 
192 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
193 {
194 	struct rt6_info *rt = (struct rt6_info *)dst;
195 
196 	if (rt->rt6i_flags & RTF_PCPU)
197 		return rt6_pcpu_cow_metrics(rt);
198 	else if (rt->rt6i_flags & RTF_CACHE)
199 		return NULL;
200 	else
201 		return dst_cow_metrics_generic(dst, old);
202 }
203 
204 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
205 					     struct sk_buff *skb,
206 					     const void *daddr)
207 {
208 	struct in6_addr *p = &rt->rt6i_gateway;
209 
210 	if (!ipv6_addr_any(p))
211 		return (const void *) p;
212 	else if (skb)
213 		return &ipv6_hdr(skb)->daddr;
214 	return daddr;
215 }
216 
217 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
218 					  struct sk_buff *skb,
219 					  const void *daddr)
220 {
221 	struct rt6_info *rt = (struct rt6_info *) dst;
222 	struct neighbour *n;
223 
224 	daddr = choose_neigh_daddr(rt, skb, daddr);
225 	n = __ipv6_neigh_lookup(dst->dev, daddr);
226 	if (n)
227 		return n;
228 	return neigh_create(&nd_tbl, daddr, dst->dev);
229 }
230 
231 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
232 {
233 	struct net_device *dev = dst->dev;
234 	struct rt6_info *rt = (struct rt6_info *)dst;
235 
236 	daddr = choose_neigh_daddr(rt, NULL, daddr);
237 	if (!daddr)
238 		return;
239 	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
240 		return;
241 	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
242 		return;
243 	__ipv6_confirm_neigh(dev, daddr);
244 }
245 
246 static struct dst_ops ip6_dst_ops_template = {
247 	.family			=	AF_INET6,
248 	.gc			=	ip6_dst_gc,
249 	.gc_thresh		=	1024,
250 	.check			=	ip6_dst_check,
251 	.default_advmss		=	ip6_default_advmss,
252 	.mtu			=	ip6_mtu,
253 	.cow_metrics		=	ipv6_cow_metrics,
254 	.destroy		=	ip6_dst_destroy,
255 	.ifdown			=	ip6_dst_ifdown,
256 	.negative_advice	=	ip6_negative_advice,
257 	.link_failure		=	ip6_link_failure,
258 	.update_pmtu		=	ip6_rt_update_pmtu,
259 	.redirect		=	rt6_do_redirect,
260 	.local_out		=	__ip6_local_out,
261 	.neigh_lookup		=	ip6_neigh_lookup,
262 	.confirm_neigh		=	ip6_confirm_neigh,
263 };
264 
265 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
266 {
267 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
268 
269 	return mtu ? : dst->dev->mtu;
270 }
271 
272 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
273 					 struct sk_buff *skb, u32 mtu)
274 {
275 }
276 
/* Blackhole dsts ignore redirects. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst,
				      struct sock *sk,
				      struct sk_buff *skb)
{
	/* intentionally empty */
}
281 
282 static struct dst_ops ip6_dst_blackhole_ops = {
283 	.family			=	AF_INET6,
284 	.destroy		=	ip6_dst_destroy,
285 	.check			=	ip6_dst_check,
286 	.mtu			=	ip6_blackhole_mtu,
287 	.default_advmss		=	ip6_default_advmss,
288 	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
289 	.redirect		=	ip6_rt_blackhole_redirect,
290 	.cow_metrics		=	dst_cow_metrics_generic,
291 	.neigh_lookup		=	ip6_neigh_lookup,
292 };
293 
294 static const u32 ip6_template_metrics[RTAX_MAX] = {
295 	[RTAX_HOPLIMIT - 1] = 0,
296 };
297 
298 static const struct rt6_info ip6_null_entry_template = {
299 	.dst = {
300 		.__refcnt	= ATOMIC_INIT(1),
301 		.__use		= 1,
302 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
303 		.error		= -ENETUNREACH,
304 		.input		= ip6_pkt_discard,
305 		.output		= ip6_pkt_discard_out,
306 	},
307 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
308 	.rt6i_protocol  = RTPROT_KERNEL,
309 	.rt6i_metric	= ~(u32) 0,
310 	.rt6i_ref	= ATOMIC_INIT(1),
311 };
312 
313 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
314 
315 static const struct rt6_info ip6_prohibit_entry_template = {
316 	.dst = {
317 		.__refcnt	= ATOMIC_INIT(1),
318 		.__use		= 1,
319 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
320 		.error		= -EACCES,
321 		.input		= ip6_pkt_prohibit,
322 		.output		= ip6_pkt_prohibit_out,
323 	},
324 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
325 	.rt6i_protocol  = RTPROT_KERNEL,
326 	.rt6i_metric	= ~(u32) 0,
327 	.rt6i_ref	= ATOMIC_INIT(1),
328 };
329 
330 static const struct rt6_info ip6_blk_hole_entry_template = {
331 	.dst = {
332 		.__refcnt	= ATOMIC_INIT(1),
333 		.__use		= 1,
334 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
335 		.error		= -EINVAL,
336 		.input		= dst_discard,
337 		.output		= dst_discard_out,
338 	},
339 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
340 	.rt6i_protocol  = RTPROT_KERNEL,
341 	.rt6i_metric	= ~(u32) 0,
342 	.rt6i_ref	= ATOMIC_INIT(1),
343 };
344 
345 #endif
346 
347 static void rt6_info_init(struct rt6_info *rt)
348 {
349 	struct dst_entry *dst = &rt->dst;
350 
351 	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
352 	INIT_LIST_HEAD(&rt->rt6i_siblings);
353 	INIT_LIST_HEAD(&rt->rt6i_uncached);
354 }
355 
356 /* allocate dst with ip6_dst_ops */
357 static struct rt6_info *__ip6_dst_alloc(struct net *net,
358 					struct net_device *dev,
359 					int flags)
360 {
361 	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
362 					1, DST_OBSOLETE_FORCE_CHK, flags);
363 
364 	if (rt) {
365 		rt6_info_init(rt);
366 		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
367 	}
368 
369 	return rt;
370 }
371 
372 struct rt6_info *ip6_dst_alloc(struct net *net,
373 			       struct net_device *dev,
374 			       int flags)
375 {
376 	struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
377 
378 	if (rt) {
379 		rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
380 		if (!rt->rt6i_pcpu) {
381 			dst_release_immediate(&rt->dst);
382 			return NULL;
383 		}
384 	}
385 
386 	return rt;
387 }
388 EXPORT_SYMBOL(ip6_dst_alloc);
389 
390 static void ip6_dst_destroy(struct dst_entry *dst)
391 {
392 	struct rt6_info *rt = (struct rt6_info *)dst;
393 	struct rt6_exception_bucket *bucket;
394 	struct rt6_info *from = rt->from;
395 	struct inet6_dev *idev;
396 
397 	dst_destroy_metrics_generic(dst);
398 	free_percpu(rt->rt6i_pcpu);
399 	rt6_uncached_list_del(rt);
400 
401 	idev = rt->rt6i_idev;
402 	if (idev) {
403 		rt->rt6i_idev = NULL;
404 		in6_dev_put(idev);
405 	}
406 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1);
407 	if (bucket) {
408 		rt->rt6i_exception_bucket = NULL;
409 		kfree(bucket);
410 	}
411 
412 	rt->from = NULL;
413 	dst_release(&from->dst);
414 }
415 
416 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
417 			   int how)
418 {
419 	struct rt6_info *rt = (struct rt6_info *)dst;
420 	struct inet6_dev *idev = rt->rt6i_idev;
421 	struct net_device *loopback_dev =
422 		dev_net(dev)->loopback_dev;
423 
424 	if (idev && idev->dev != loopback_dev) {
425 		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
426 		if (loopback_idev) {
427 			rt->rt6i_idev = loopback_idev;
428 			in6_dev_put(idev);
429 		}
430 	}
431 }
432 
433 static bool __rt6_check_expired(const struct rt6_info *rt)
434 {
435 	if (rt->rt6i_flags & RTF_EXPIRES)
436 		return time_after(jiffies, rt->dst.expires);
437 	else
438 		return false;
439 }
440 
441 static bool rt6_check_expired(const struct rt6_info *rt)
442 {
443 	if (rt->rt6i_flags & RTF_EXPIRES) {
444 		if (time_after(jiffies, rt->dst.expires))
445 			return true;
446 	} else if (rt->from) {
447 		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
448 			rt6_check_expired(rt->from);
449 	}
450 	return false;
451 }
452 
453 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
454 					     struct flowi6 *fl6, int oif,
455 					     int strict)
456 {
457 	struct rt6_info *sibling, *next_sibling;
458 	int route_choosen;
459 
460 	/* We might have already computed the hash for ICMPv6 errors. In such
461 	 * case it will always be non-zero. Otherwise now is the time to do it.
462 	 */
463 	if (!fl6->mp_hash)
464 		fl6->mp_hash = rt6_multipath_hash(fl6, NULL);
465 
466 	route_choosen = fl6->mp_hash % (match->rt6i_nsiblings + 1);
467 	/* Don't change the route, if route_choosen == 0
468 	 * (siblings does not include ourself)
469 	 */
470 	if (route_choosen)
471 		list_for_each_entry_safe(sibling, next_sibling,
472 				&match->rt6i_siblings, rt6i_siblings) {
473 			route_choosen--;
474 			if (route_choosen == 0) {
475 				struct inet6_dev *idev = sibling->rt6i_idev;
476 
477 				if (sibling->rt6i_nh_flags & RTNH_F_DEAD)
478 					break;
479 				if (sibling->rt6i_nh_flags & RTNH_F_LINKDOWN &&
480 				    idev->cnf.ignore_routes_with_linkdown)
481 					break;
482 				if (rt6_score_route(sibling, oif, strict) < 0)
483 					break;
484 				match = sibling;
485 				break;
486 			}
487 		}
488 	return match;
489 }
490 
491 /*
492  *	Route lookup. rcu_read_lock() should be held.
493  */
494 
495 static inline struct rt6_info *rt6_device_match(struct net *net,
496 						    struct rt6_info *rt,
497 						    const struct in6_addr *saddr,
498 						    int oif,
499 						    int flags)
500 {
501 	struct rt6_info *local = NULL;
502 	struct rt6_info *sprt;
503 
504 	if (!oif && ipv6_addr_any(saddr) && !(rt->rt6i_nh_flags & RTNH_F_DEAD))
505 		return rt;
506 
507 	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->rt6_next)) {
508 		struct net_device *dev = sprt->dst.dev;
509 
510 		if (sprt->rt6i_nh_flags & RTNH_F_DEAD)
511 			continue;
512 
513 		if (oif) {
514 			if (dev->ifindex == oif)
515 				return sprt;
516 			if (dev->flags & IFF_LOOPBACK) {
517 				if (!sprt->rt6i_idev ||
518 				    sprt->rt6i_idev->dev->ifindex != oif) {
519 					if (flags & RT6_LOOKUP_F_IFACE)
520 						continue;
521 					if (local &&
522 					    local->rt6i_idev->dev->ifindex == oif)
523 						continue;
524 				}
525 				local = sprt;
526 			}
527 		} else {
528 			if (ipv6_chk_addr(net, saddr, dev,
529 					  flags & RT6_LOOKUP_F_IFACE))
530 				return sprt;
531 		}
532 	}
533 
534 	if (oif) {
535 		if (local)
536 			return local;
537 
538 		if (flags & RT6_LOOKUP_F_IFACE)
539 			return net->ipv6.ip6_null_entry;
540 	}
541 
542 	return rt->rt6i_nh_flags & RTNH_F_DEAD ? net->ipv6.ip6_null_entry : rt;
543 }
544 
545 #ifdef CONFIG_IPV6_ROUTER_PREF
546 struct __rt6_probe_work {
547 	struct work_struct work;
548 	struct in6_addr target;
549 	struct net_device *dev;
550 };
551 
552 static void rt6_probe_deferred(struct work_struct *w)
553 {
554 	struct in6_addr mcaddr;
555 	struct __rt6_probe_work *work =
556 		container_of(w, struct __rt6_probe_work, work);
557 
558 	addrconf_addr_solict_mult(&work->target, &mcaddr);
559 	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
560 	dev_put(work->dev);
561 	kfree(work);
562 }
563 
564 static void rt6_probe(struct rt6_info *rt)
565 {
566 	struct __rt6_probe_work *work;
567 	struct neighbour *neigh;
568 	/*
569 	 * Okay, this does not seem to be appropriate
570 	 * for now, however, we need to check if it
571 	 * is really so; aka Router Reachability Probing.
572 	 *
573 	 * Router Reachability Probe MUST be rate-limited
574 	 * to no more than one per minute.
575 	 */
576 	if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
577 		return;
578 	rcu_read_lock_bh();
579 	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
580 	if (neigh) {
581 		if (neigh->nud_state & NUD_VALID)
582 			goto out;
583 
584 		work = NULL;
585 		write_lock(&neigh->lock);
586 		if (!(neigh->nud_state & NUD_VALID) &&
587 		    time_after(jiffies,
588 			       neigh->updated +
589 			       rt->rt6i_idev->cnf.rtr_probe_interval)) {
590 			work = kmalloc(sizeof(*work), GFP_ATOMIC);
591 			if (work)
592 				__neigh_set_probe_once(neigh);
593 		}
594 		write_unlock(&neigh->lock);
595 	} else {
596 		work = kmalloc(sizeof(*work), GFP_ATOMIC);
597 	}
598 
599 	if (work) {
600 		INIT_WORK(&work->work, rt6_probe_deferred);
601 		work->target = rt->rt6i_gateway;
602 		dev_hold(rt->dst.dev);
603 		work->dev = rt->dst.dev;
604 		schedule_work(&work->work);
605 	}
606 
607 out:
608 	rcu_read_unlock_bh();
609 }
610 #else
/* Without CONFIG_IPV6_ROUTER_PREF probing is compiled out. */
static inline void rt6_probe(struct rt6_info *rt)
{
	/* nothing to do */
}
614 #endif
615 
616 /*
617  * Default Router Selection (RFC 2461 6.3.6)
618  */
619 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
620 {
621 	struct net_device *dev = rt->dst.dev;
622 	if (!oif || dev->ifindex == oif)
623 		return 2;
624 	if ((dev->flags & IFF_LOOPBACK) &&
625 	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
626 		return 1;
627 	return 0;
628 }
629 
630 static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
631 {
632 	struct neighbour *neigh;
633 	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
634 
635 	if (rt->rt6i_flags & RTF_NONEXTHOP ||
636 	    !(rt->rt6i_flags & RTF_GATEWAY))
637 		return RT6_NUD_SUCCEED;
638 
639 	rcu_read_lock_bh();
640 	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
641 	if (neigh) {
642 		read_lock(&neigh->lock);
643 		if (neigh->nud_state & NUD_VALID)
644 			ret = RT6_NUD_SUCCEED;
645 #ifdef CONFIG_IPV6_ROUTER_PREF
646 		else if (!(neigh->nud_state & NUD_FAILED))
647 			ret = RT6_NUD_SUCCEED;
648 		else
649 			ret = RT6_NUD_FAIL_PROBE;
650 #endif
651 		read_unlock(&neigh->lock);
652 	} else {
653 		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
654 		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
655 	}
656 	rcu_read_unlock_bh();
657 
658 	return ret;
659 }
660 
661 static int rt6_score_route(struct rt6_info *rt, int oif,
662 			   int strict)
663 {
664 	int m;
665 
666 	m = rt6_check_dev(rt, oif);
667 	if (!m && (strict & RT6_LOOKUP_F_IFACE))
668 		return RT6_NUD_FAIL_HARD;
669 #ifdef CONFIG_IPV6_ROUTER_PREF
670 	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
671 #endif
672 	if (strict & RT6_LOOKUP_F_REACHABLE) {
673 		int n = rt6_check_neigh(rt);
674 		if (n < 0)
675 			return n;
676 	}
677 	return m;
678 }
679 
680 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
681 				   int *mpri, struct rt6_info *match,
682 				   bool *do_rr)
683 {
684 	int m;
685 	bool match_do_rr = false;
686 	struct inet6_dev *idev = rt->rt6i_idev;
687 
688 	if (rt->rt6i_nh_flags & RTNH_F_DEAD)
689 		goto out;
690 
691 	if (idev->cnf.ignore_routes_with_linkdown &&
692 	    rt->rt6i_nh_flags & RTNH_F_LINKDOWN &&
693 	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
694 		goto out;
695 
696 	if (rt6_check_expired(rt))
697 		goto out;
698 
699 	m = rt6_score_route(rt, oif, strict);
700 	if (m == RT6_NUD_FAIL_DO_RR) {
701 		match_do_rr = true;
702 		m = 0; /* lowest valid score */
703 	} else if (m == RT6_NUD_FAIL_HARD) {
704 		goto out;
705 	}
706 
707 	if (strict & RT6_LOOKUP_F_REACHABLE)
708 		rt6_probe(rt);
709 
710 	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
711 	if (m > *mpri) {
712 		*do_rr = match_do_rr;
713 		*mpri = m;
714 		match = rt;
715 	}
716 out:
717 	return match;
718 }
719 
720 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
721 				     struct rt6_info *leaf,
722 				     struct rt6_info *rr_head,
723 				     u32 metric, int oif, int strict,
724 				     bool *do_rr)
725 {
726 	struct rt6_info *rt, *match, *cont;
727 	int mpri = -1;
728 
729 	match = NULL;
730 	cont = NULL;
731 	for (rt = rr_head; rt; rt = rcu_dereference(rt->rt6_next)) {
732 		if (rt->rt6i_metric != metric) {
733 			cont = rt;
734 			break;
735 		}
736 
737 		match = find_match(rt, oif, strict, &mpri, match, do_rr);
738 	}
739 
740 	for (rt = leaf; rt && rt != rr_head;
741 	     rt = rcu_dereference(rt->rt6_next)) {
742 		if (rt->rt6i_metric != metric) {
743 			cont = rt;
744 			break;
745 		}
746 
747 		match = find_match(rt, oif, strict, &mpri, match, do_rr);
748 	}
749 
750 	if (match || !cont)
751 		return match;
752 
753 	for (rt = cont; rt; rt = rcu_dereference(rt->rt6_next))
754 		match = find_match(rt, oif, strict, &mpri, match, do_rr);
755 
756 	return match;
757 }
758 
759 static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn,
760 				   int oif, int strict)
761 {
762 	struct rt6_info *leaf = rcu_dereference(fn->leaf);
763 	struct rt6_info *match, *rt0;
764 	bool do_rr = false;
765 	int key_plen;
766 
767 	if (!leaf || leaf == net->ipv6.ip6_null_entry)
768 		return net->ipv6.ip6_null_entry;
769 
770 	rt0 = rcu_dereference(fn->rr_ptr);
771 	if (!rt0)
772 		rt0 = leaf;
773 
774 	/* Double check to make sure fn is not an intermediate node
775 	 * and fn->leaf does not points to its child's leaf
776 	 * (This might happen if all routes under fn are deleted from
777 	 * the tree and fib6_repair_tree() is called on the node.)
778 	 */
779 	key_plen = rt0->rt6i_dst.plen;
780 #ifdef CONFIG_IPV6_SUBTREES
781 	if (rt0->rt6i_src.plen)
782 		key_plen = rt0->rt6i_src.plen;
783 #endif
784 	if (fn->fn_bit != key_plen)
785 		return net->ipv6.ip6_null_entry;
786 
787 	match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict,
788 			     &do_rr);
789 
790 	if (do_rr) {
791 		struct rt6_info *next = rcu_dereference(rt0->rt6_next);
792 
793 		/* no entries matched; do round-robin */
794 		if (!next || next->rt6i_metric != rt0->rt6i_metric)
795 			next = leaf;
796 
797 		if (next != rt0) {
798 			spin_lock_bh(&leaf->rt6i_table->tb6_lock);
799 			/* make sure next is not being deleted from the tree */
800 			if (next->rt6i_node)
801 				rcu_assign_pointer(fn->rr_ptr, next);
802 			spin_unlock_bh(&leaf->rt6i_table->tb6_lock);
803 		}
804 	}
805 
806 	return match ? match : net->ipv6.ip6_null_entry;
807 }
808 
809 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
810 {
811 	return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
812 }
813 
814 #ifdef CONFIG_IPV6_ROUTE_INFO
815 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
816 		  const struct in6_addr *gwaddr)
817 {
818 	struct net *net = dev_net(dev);
819 	struct route_info *rinfo = (struct route_info *) opt;
820 	struct in6_addr prefix_buf, *prefix;
821 	unsigned int pref;
822 	unsigned long lifetime;
823 	struct rt6_info *rt;
824 
825 	if (len < sizeof(struct route_info)) {
826 		return -EINVAL;
827 	}
828 
829 	/* Sanity check for prefix_len and length */
830 	if (rinfo->length > 3) {
831 		return -EINVAL;
832 	} else if (rinfo->prefix_len > 128) {
833 		return -EINVAL;
834 	} else if (rinfo->prefix_len > 64) {
835 		if (rinfo->length < 2) {
836 			return -EINVAL;
837 		}
838 	} else if (rinfo->prefix_len > 0) {
839 		if (rinfo->length < 1) {
840 			return -EINVAL;
841 		}
842 	}
843 
844 	pref = rinfo->route_pref;
845 	if (pref == ICMPV6_ROUTER_PREF_INVALID)
846 		return -EINVAL;
847 
848 	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
849 
850 	if (rinfo->length == 3)
851 		prefix = (struct in6_addr *)rinfo->prefix;
852 	else {
853 		/* this function is safe */
854 		ipv6_addr_prefix(&prefix_buf,
855 				 (struct in6_addr *)rinfo->prefix,
856 				 rinfo->prefix_len);
857 		prefix = &prefix_buf;
858 	}
859 
860 	if (rinfo->prefix_len == 0)
861 		rt = rt6_get_dflt_router(gwaddr, dev);
862 	else
863 		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
864 					gwaddr, dev);
865 
866 	if (rt && !lifetime) {
867 		ip6_del_rt(rt);
868 		rt = NULL;
869 	}
870 
871 	if (!rt && lifetime)
872 		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
873 					dev, pref);
874 	else if (rt)
875 		rt->rt6i_flags = RTF_ROUTEINFO |
876 				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
877 
878 	if (rt) {
879 		if (!addrconf_finite_timeout(lifetime))
880 			rt6_clean_expires(rt);
881 		else
882 			rt6_set_expires(rt, jiffies + HZ * lifetime);
883 
884 		ip6_rt_put(rt);
885 	}
886 	return 0;
887 }
888 #endif
889 
890 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
891 					struct in6_addr *saddr)
892 {
893 	struct fib6_node *pn, *sn;
894 	while (1) {
895 		if (fn->fn_flags & RTN_TL_ROOT)
896 			return NULL;
897 		pn = rcu_dereference(fn->parent);
898 		sn = FIB6_SUBTREE(pn);
899 		if (sn && sn != fn)
900 			fn = fib6_lookup(sn, NULL, saddr);
901 		else
902 			fn = pn;
903 		if (fn->fn_flags & RTN_RTINFO)
904 			return fn;
905 	}
906 }
907 
908 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
909 			  bool null_fallback)
910 {
911 	struct rt6_info *rt = *prt;
912 
913 	if (dst_hold_safe(&rt->dst))
914 		return true;
915 	if (null_fallback) {
916 		rt = net->ipv6.ip6_null_entry;
917 		dst_hold(&rt->dst);
918 	} else {
919 		rt = NULL;
920 	}
921 	*prt = rt;
922 	return false;
923 }
924 
925 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
926 					     struct fib6_table *table,
927 					     struct flowi6 *fl6, int flags)
928 {
929 	struct rt6_info *rt, *rt_cache;
930 	struct fib6_node *fn;
931 
932 	rcu_read_lock();
933 	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
934 restart:
935 	rt = rcu_dereference(fn->leaf);
936 	if (!rt) {
937 		rt = net->ipv6.ip6_null_entry;
938 	} else {
939 		rt = rt6_device_match(net, rt, &fl6->saddr,
940 				      fl6->flowi6_oif, flags);
941 		if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
942 			rt = rt6_multipath_select(rt, fl6,
943 						  fl6->flowi6_oif, flags);
944 	}
945 	if (rt == net->ipv6.ip6_null_entry) {
946 		fn = fib6_backtrack(fn, &fl6->saddr);
947 		if (fn)
948 			goto restart;
949 	}
950 	/* Search through exception table */
951 	rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
952 	if (rt_cache)
953 		rt = rt_cache;
954 
955 	if (ip6_hold_safe(net, &rt, true))
956 		dst_use_noref(&rt->dst, jiffies);
957 
958 	rcu_read_unlock();
959 
960 	trace_fib6_table_lookup(net, rt, table, fl6);
961 
962 	return rt;
963 
964 }
965 
966 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
967 				    int flags)
968 {
969 	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
970 }
971 EXPORT_SYMBOL_GPL(ip6_route_lookup);
972 
973 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
974 			    const struct in6_addr *saddr, int oif, int strict)
975 {
976 	struct flowi6 fl6 = {
977 		.flowi6_oif = oif,
978 		.daddr = *daddr,
979 	};
980 	struct dst_entry *dst;
981 	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
982 
983 	if (saddr) {
984 		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
985 		flags |= RT6_LOOKUP_F_HAS_SADDR;
986 	}
987 
988 	dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
989 	if (dst->error == 0)
990 		return (struct rt6_info *) dst;
991 
992 	dst_release(dst);
993 
994 	return NULL;
995 }
996 EXPORT_SYMBOL(rt6_lookup);
997 
/* __ip6_ins_rt() must be called without table->tb6_lock held; it takes
 * the lock itself.  It inserts a new route entry; if the addition fails
 * for any reason, the route is released.
 * The caller must hold a reference on the dst before calling it.
 */
1003 
1004 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
1005 			struct mx6_config *mxc,
1006 			struct netlink_ext_ack *extack)
1007 {
1008 	int err;
1009 	struct fib6_table *table;
1010 
1011 	table = rt->rt6i_table;
1012 	spin_lock_bh(&table->tb6_lock);
1013 	err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
1014 	spin_unlock_bh(&table->tb6_lock);
1015 
1016 	return err;
1017 }
1018 
1019 int ip6_ins_rt(struct rt6_info *rt)
1020 {
1021 	struct nl_info info = {	.nl_net = dev_net(rt->dst.dev), };
1022 	struct mx6_config mxc = { .mx = NULL, };
1023 
1024 	/* Hold dst to account for the reference from the fib6 tree */
1025 	dst_hold(&rt->dst);
1026 	return __ip6_ins_rt(rt, &info, &mxc, NULL);
1027 }
1028 
1029 /* called with rcu_lock held */
1030 static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
1031 {
1032 	struct net_device *dev = rt->dst.dev;
1033 
1034 	if (rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) {
1035 		/* for copies of local routes, dst->dev needs to be the
1036 		 * device if it is a master device, the master device if
1037 		 * device is enslaved, and the loopback as the default
1038 		 */
1039 		if (netif_is_l3_slave(dev) &&
1040 		    !rt6_need_strict(&rt->rt6i_dst.addr))
1041 			dev = l3mdev_master_dev_rcu(dev);
1042 		else if (!netif_is_l3_master(dev))
1043 			dev = dev_net(dev)->loopback_dev;
1044 		/* last case is netif_is_l3_master(dev) is true in which
1045 		 * case we want dev returned to be dev
1046 		 */
1047 	}
1048 
1049 	return dev;
1050 }
1051 
1052 static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
1053 					   const struct in6_addr *daddr,
1054 					   const struct in6_addr *saddr)
1055 {
1056 	struct net_device *dev;
1057 	struct rt6_info *rt;
1058 
1059 	/*
1060 	 *	Clone the route.
1061 	 */
1062 
1063 	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
1064 		ort = ort->from;
1065 
1066 	rcu_read_lock();
1067 	dev = ip6_rt_get_dev_rcu(ort);
1068 	rt = __ip6_dst_alloc(dev_net(dev), dev, 0);
1069 	rcu_read_unlock();
1070 	if (!rt)
1071 		return NULL;
1072 
1073 	ip6_rt_copy_init(rt, ort);
1074 	rt->rt6i_flags |= RTF_CACHE;
1075 	rt->rt6i_metric = 0;
1076 	rt->dst.flags |= DST_HOST;
1077 	rt->rt6i_dst.addr = *daddr;
1078 	rt->rt6i_dst.plen = 128;
1079 
1080 	if (!rt6_is_gw_or_nonexthop(ort)) {
1081 		if (ort->rt6i_dst.plen != 128 &&
1082 		    ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
1083 			rt->rt6i_flags |= RTF_ANYCAST;
1084 #ifdef CONFIG_IPV6_SUBTREES
1085 		if (rt->rt6i_src.plen && saddr) {
1086 			rt->rt6i_src.addr = *saddr;
1087 			rt->rt6i_src.plen = 128;
1088 		}
1089 #endif
1090 	}
1091 
1092 	return rt;
1093 }
1094 
1095 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
1096 {
1097 	struct net_device *dev;
1098 	struct rt6_info *pcpu_rt;
1099 
1100 	rcu_read_lock();
1101 	dev = ip6_rt_get_dev_rcu(rt);
1102 	pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags);
1103 	rcu_read_unlock();
1104 	if (!pcpu_rt)
1105 		return NULL;
1106 	ip6_rt_copy_init(pcpu_rt, rt);
1107 	pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
1108 	pcpu_rt->rt6i_flags |= RTF_PCPU;
1109 	return pcpu_rt;
1110 }
1111 
1112 /* It should be called with rcu_read_lock() acquired */
1113 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
1114 {
1115 	struct rt6_info *pcpu_rt, **p;
1116 
1117 	p = this_cpu_ptr(rt->rt6i_pcpu);
1118 	pcpu_rt = *p;
1119 
1120 	if (pcpu_rt && ip6_hold_safe(NULL, &pcpu_rt, false))
1121 		rt6_dst_from_metrics_check(pcpu_rt);
1122 
1123 	return pcpu_rt;
1124 }
1125 
1126 static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
1127 {
1128 	struct rt6_info *pcpu_rt, *prev, **p;
1129 
1130 	pcpu_rt = ip6_rt_pcpu_alloc(rt);
1131 	if (!pcpu_rt) {
1132 		struct net *net = dev_net(rt->dst.dev);
1133 
1134 		dst_hold(&net->ipv6.ip6_null_entry->dst);
1135 		return net->ipv6.ip6_null_entry;
1136 	}
1137 
1138 	dst_hold(&pcpu_rt->dst);
1139 	p = this_cpu_ptr(rt->rt6i_pcpu);
1140 	prev = cmpxchg(p, NULL, pcpu_rt);
1141 	BUG_ON(prev);
1142 
1143 	rt6_dst_from_metrics_check(pcpu_rt);
1144 	return pcpu_rt;
1145 }
1146 
/* Exception hash table implementation.
 * rt6_exception_lock is taken by the helpers below before they modify
 * a route's exception table.
 */
static DEFINE_SPINLOCK(rt6_exception_lock);
1150 
1151 /* Remove rt6_ex from hash table and free the memory
1152  * Caller must hold rt6_exception_lock
1153  */
1154 static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1155 				 struct rt6_exception *rt6_ex)
1156 {
1157 	struct net *net;
1158 
1159 	if (!bucket || !rt6_ex)
1160 		return;
1161 
1162 	net = dev_net(rt6_ex->rt6i->dst.dev);
1163 	rt6_ex->rt6i->rt6i_node = NULL;
1164 	hlist_del_rcu(&rt6_ex->hlist);
1165 	rt6_release(rt6_ex->rt6i);
1166 	kfree_rcu(rt6_ex, rcu);
1167 	WARN_ON_ONCE(!bucket->depth);
1168 	bucket->depth--;
1169 	net->ipv6.rt6_stats->fib_rt_cache--;
1170 }
1171 
1172 /* Remove oldest rt6_ex in bucket and free the memory
1173  * Caller must hold rt6_exception_lock
1174  */
1175 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1176 {
1177 	struct rt6_exception *rt6_ex, *oldest = NULL;
1178 
1179 	if (!bucket)
1180 		return;
1181 
1182 	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1183 		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1184 			oldest = rt6_ex;
1185 	}
1186 	rt6_remove_exception(bucket, oldest);
1187 }
1188 
1189 static u32 rt6_exception_hash(const struct in6_addr *dst,
1190 			      const struct in6_addr *src)
1191 {
1192 	static u32 seed __read_mostly;
1193 	u32 val;
1194 
1195 	net_get_random_once(&seed, sizeof(seed));
1196 	val = jhash(dst, sizeof(*dst), seed);
1197 
1198 #ifdef CONFIG_IPV6_SUBTREES
1199 	if (src)
1200 		val = jhash(src, sizeof(*src), val);
1201 #endif
1202 	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1203 }
1204 
1205 /* Helper function to find the cached rt in the hash table
1206  * and update bucket pointer to point to the bucket for this
1207  * (daddr, saddr) pair
1208  * Caller must hold rt6_exception_lock
1209  */
1210 static struct rt6_exception *
1211 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1212 			      const struct in6_addr *daddr,
1213 			      const struct in6_addr *saddr)
1214 {
1215 	struct rt6_exception *rt6_ex;
1216 	u32 hval;
1217 
1218 	if (!(*bucket) || !daddr)
1219 		return NULL;
1220 
1221 	hval = rt6_exception_hash(daddr, saddr);
1222 	*bucket += hval;
1223 
1224 	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1225 		struct rt6_info *rt6 = rt6_ex->rt6i;
1226 		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1227 
1228 #ifdef CONFIG_IPV6_SUBTREES
1229 		if (matched && saddr)
1230 			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1231 #endif
1232 		if (matched)
1233 			return rt6_ex;
1234 	}
1235 	return NULL;
1236 }
1237 
1238 /* Helper function to find the cached rt in the hash table
1239  * and update bucket pointer to point to the bucket for this
1240  * (daddr, saddr) pair
1241  * Caller must hold rcu_read_lock()
1242  */
1243 static struct rt6_exception *
1244 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1245 			 const struct in6_addr *daddr,
1246 			 const struct in6_addr *saddr)
1247 {
1248 	struct rt6_exception *rt6_ex;
1249 	u32 hval;
1250 
1251 	WARN_ON_ONCE(!rcu_read_lock_held());
1252 
1253 	if (!(*bucket) || !daddr)
1254 		return NULL;
1255 
1256 	hval = rt6_exception_hash(daddr, saddr);
1257 	*bucket += hval;
1258 
1259 	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1260 		struct rt6_info *rt6 = rt6_ex->rt6i;
1261 		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1262 
1263 #ifdef CONFIG_IPV6_SUBTREES
1264 		if (matched && saddr)
1265 			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1266 #endif
1267 		if (matched)
1268 			return rt6_ex;
1269 	}
1270 	return NULL;
1271 }
1272 
1273 static int rt6_insert_exception(struct rt6_info *nrt,
1274 				struct rt6_info *ort)
1275 {
1276 	struct net *net = dev_net(ort->dst.dev);
1277 	struct rt6_exception_bucket *bucket;
1278 	struct in6_addr *src_key = NULL;
1279 	struct rt6_exception *rt6_ex;
1280 	int err = 0;
1281 
1282 	/* ort can't be a cache or pcpu route */
1283 	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
1284 		ort = ort->from;
1285 	WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU));
1286 
1287 	spin_lock_bh(&rt6_exception_lock);
1288 
1289 	if (ort->exception_bucket_flushed) {
1290 		err = -EINVAL;
1291 		goto out;
1292 	}
1293 
1294 	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
1295 					lockdep_is_held(&rt6_exception_lock));
1296 	if (!bucket) {
1297 		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
1298 				 GFP_ATOMIC);
1299 		if (!bucket) {
1300 			err = -ENOMEM;
1301 			goto out;
1302 		}
1303 		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
1304 	}
1305 
1306 #ifdef CONFIG_IPV6_SUBTREES
1307 	/* rt6i_src.plen != 0 indicates ort is in subtree
1308 	 * and exception table is indexed by a hash of
1309 	 * both rt6i_dst and rt6i_src.
1310 	 * Otherwise, the exception table is indexed by
1311 	 * a hash of only rt6i_dst.
1312 	 */
1313 	if (ort->rt6i_src.plen)
1314 		src_key = &nrt->rt6i_src.addr;
1315 #endif
1316 
1317 	/* Update rt6i_prefsrc as it could be changed
1318 	 * in rt6_remove_prefsrc()
1319 	 */
1320 	nrt->rt6i_prefsrc = ort->rt6i_prefsrc;
1321 	/* rt6_mtu_change() might lower mtu on ort.
1322 	 * Only insert this exception route if its mtu
1323 	 * is less than ort's mtu value.
1324 	 */
1325 	if (nrt->rt6i_pmtu >= dst_mtu(&ort->dst)) {
1326 		err = -EINVAL;
1327 		goto out;
1328 	}
1329 
1330 	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
1331 					       src_key);
1332 	if (rt6_ex)
1333 		rt6_remove_exception(bucket, rt6_ex);
1334 
1335 	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
1336 	if (!rt6_ex) {
1337 		err = -ENOMEM;
1338 		goto out;
1339 	}
1340 	rt6_ex->rt6i = nrt;
1341 	rt6_ex->stamp = jiffies;
1342 	atomic_inc(&nrt->rt6i_ref);
1343 	nrt->rt6i_node = ort->rt6i_node;
1344 	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
1345 	bucket->depth++;
1346 	net->ipv6.rt6_stats->fib_rt_cache++;
1347 
1348 	if (bucket->depth > FIB6_MAX_DEPTH)
1349 		rt6_exception_remove_oldest(bucket);
1350 
1351 out:
1352 	spin_unlock_bh(&rt6_exception_lock);
1353 
1354 	/* Update fn->fn_sernum to invalidate all cached dst */
1355 	if (!err) {
1356 		spin_lock_bh(&ort->rt6i_table->tb6_lock);
1357 		fib6_update_sernum(ort);
1358 		spin_unlock_bh(&ort->rt6i_table->tb6_lock);
1359 		fib6_force_start_gc(net);
1360 	}
1361 
1362 	return err;
1363 }
1364 
1365 void rt6_flush_exceptions(struct rt6_info *rt)
1366 {
1367 	struct rt6_exception_bucket *bucket;
1368 	struct rt6_exception *rt6_ex;
1369 	struct hlist_node *tmp;
1370 	int i;
1371 
1372 	spin_lock_bh(&rt6_exception_lock);
1373 	/* Prevent rt6_insert_exception() to recreate the bucket list */
1374 	rt->exception_bucket_flushed = 1;
1375 
1376 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1377 				    lockdep_is_held(&rt6_exception_lock));
1378 	if (!bucket)
1379 		goto out;
1380 
1381 	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1382 		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1383 			rt6_remove_exception(bucket, rt6_ex);
1384 		WARN_ON_ONCE(bucket->depth);
1385 		bucket++;
1386 	}
1387 
1388 out:
1389 	spin_unlock_bh(&rt6_exception_lock);
1390 }
1391 
1392 /* Find cached rt in the hash table inside passed in rt
1393  * Caller has to hold rcu_read_lock()
1394  */
1395 static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
1396 					   struct in6_addr *daddr,
1397 					   struct in6_addr *saddr)
1398 {
1399 	struct rt6_exception_bucket *bucket;
1400 	struct in6_addr *src_key = NULL;
1401 	struct rt6_exception *rt6_ex;
1402 	struct rt6_info *res = NULL;
1403 
1404 	bucket = rcu_dereference(rt->rt6i_exception_bucket);
1405 
1406 #ifdef CONFIG_IPV6_SUBTREES
1407 	/* rt6i_src.plen != 0 indicates rt is in subtree
1408 	 * and exception table is indexed by a hash of
1409 	 * both rt6i_dst and rt6i_src.
1410 	 * Otherwise, the exception table is indexed by
1411 	 * a hash of only rt6i_dst.
1412 	 */
1413 	if (rt->rt6i_src.plen)
1414 		src_key = saddr;
1415 #endif
1416 	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
1417 
1418 	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1419 		res = rt6_ex->rt6i;
1420 
1421 	return res;
1422 }
1423 
1424 /* Remove the passed in cached rt from the hash table that contains it */
1425 int rt6_remove_exception_rt(struct rt6_info *rt)
1426 {
1427 	struct rt6_exception_bucket *bucket;
1428 	struct rt6_info *from = rt->from;
1429 	struct in6_addr *src_key = NULL;
1430 	struct rt6_exception *rt6_ex;
1431 	int err;
1432 
1433 	if (!from ||
1434 	    !(rt->rt6i_flags & RTF_CACHE))
1435 		return -EINVAL;
1436 
1437 	if (!rcu_access_pointer(from->rt6i_exception_bucket))
1438 		return -ENOENT;
1439 
1440 	spin_lock_bh(&rt6_exception_lock);
1441 	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1442 				    lockdep_is_held(&rt6_exception_lock));
1443 #ifdef CONFIG_IPV6_SUBTREES
1444 	/* rt6i_src.plen != 0 indicates 'from' is in subtree
1445 	 * and exception table is indexed by a hash of
1446 	 * both rt6i_dst and rt6i_src.
1447 	 * Otherwise, the exception table is indexed by
1448 	 * a hash of only rt6i_dst.
1449 	 */
1450 	if (from->rt6i_src.plen)
1451 		src_key = &rt->rt6i_src.addr;
1452 #endif
1453 	rt6_ex = __rt6_find_exception_spinlock(&bucket,
1454 					       &rt->rt6i_dst.addr,
1455 					       src_key);
1456 	if (rt6_ex) {
1457 		rt6_remove_exception(bucket, rt6_ex);
1458 		err = 0;
1459 	} else {
1460 		err = -ENOENT;
1461 	}
1462 
1463 	spin_unlock_bh(&rt6_exception_lock);
1464 	return err;
1465 }
1466 
1467 /* Find rt6_ex which contains the passed in rt cache and
1468  * refresh its stamp
1469  */
1470 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1471 {
1472 	struct rt6_exception_bucket *bucket;
1473 	struct rt6_info *from = rt->from;
1474 	struct in6_addr *src_key = NULL;
1475 	struct rt6_exception *rt6_ex;
1476 
1477 	if (!from ||
1478 	    !(rt->rt6i_flags & RTF_CACHE))
1479 		return;
1480 
1481 	rcu_read_lock();
1482 	bucket = rcu_dereference(from->rt6i_exception_bucket);
1483 
1484 #ifdef CONFIG_IPV6_SUBTREES
1485 	/* rt6i_src.plen != 0 indicates 'from' is in subtree
1486 	 * and exception table is indexed by a hash of
1487 	 * both rt6i_dst and rt6i_src.
1488 	 * Otherwise, the exception table is indexed by
1489 	 * a hash of only rt6i_dst.
1490 	 */
1491 	if (from->rt6i_src.plen)
1492 		src_key = &rt->rt6i_src.addr;
1493 #endif
1494 	rt6_ex = __rt6_find_exception_rcu(&bucket,
1495 					  &rt->rt6i_dst.addr,
1496 					  src_key);
1497 	if (rt6_ex)
1498 		rt6_ex->stamp = jiffies;
1499 
1500 	rcu_read_unlock();
1501 }
1502 
1503 static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt)
1504 {
1505 	struct rt6_exception_bucket *bucket;
1506 	struct rt6_exception *rt6_ex;
1507 	int i;
1508 
1509 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1510 					lockdep_is_held(&rt6_exception_lock));
1511 
1512 	if (bucket) {
1513 		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1514 			hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1515 				rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
1516 			}
1517 			bucket++;
1518 		}
1519 	}
1520 }
1521 
1522 static void rt6_exceptions_update_pmtu(struct rt6_info *rt, int mtu)
1523 {
1524 	struct rt6_exception_bucket *bucket;
1525 	struct rt6_exception *rt6_ex;
1526 	int i;
1527 
1528 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1529 					lockdep_is_held(&rt6_exception_lock));
1530 
1531 	if (bucket) {
1532 		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1533 			hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1534 				struct rt6_info *entry = rt6_ex->rt6i;
1535 				/* For RTF_CACHE with rt6i_pmtu == 0
1536 				 * (i.e. a redirected route),
1537 				 * the metrics of its rt->dst.from has already
1538 				 * been updated.
1539 				 */
1540 				if (entry->rt6i_pmtu && entry->rt6i_pmtu > mtu)
1541 					entry->rt6i_pmtu = mtu;
1542 			}
1543 			bucket++;
1544 		}
1545 	}
1546 }
1547 
1548 #define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)
1549 
1550 static void rt6_exceptions_clean_tohost(struct rt6_info *rt,
1551 					struct in6_addr *gateway)
1552 {
1553 	struct rt6_exception_bucket *bucket;
1554 	struct rt6_exception *rt6_ex;
1555 	struct hlist_node *tmp;
1556 	int i;
1557 
1558 	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1559 		return;
1560 
1561 	spin_lock_bh(&rt6_exception_lock);
1562 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1563 				     lockdep_is_held(&rt6_exception_lock));
1564 
1565 	if (bucket) {
1566 		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1567 			hlist_for_each_entry_safe(rt6_ex, tmp,
1568 						  &bucket->chain, hlist) {
1569 				struct rt6_info *entry = rt6_ex->rt6i;
1570 
1571 				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
1572 				    RTF_CACHE_GATEWAY &&
1573 				    ipv6_addr_equal(gateway,
1574 						    &entry->rt6i_gateway)) {
1575 					rt6_remove_exception(bucket, rt6_ex);
1576 				}
1577 			}
1578 			bucket++;
1579 		}
1580 	}
1581 
1582 	spin_unlock_bh(&rt6_exception_lock);
1583 }
1584 
1585 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
1586 				      struct rt6_exception *rt6_ex,
1587 				      struct fib6_gc_args *gc_args,
1588 				      unsigned long now)
1589 {
1590 	struct rt6_info *rt = rt6_ex->rt6i;
1591 
1592 	/* we are pruning and obsoleting aged-out and non gateway exceptions
1593 	 * even if others have still references to them, so that on next
1594 	 * dst_check() such references can be dropped.
1595 	 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
1596 	 * expired, independently from their aging, as per RFC 8201 section 4
1597 	 */
1598 	if (!(rt->rt6i_flags & RTF_EXPIRES) &&
1599 	    time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
1600 		RT6_TRACE("aging clone %p\n", rt);
1601 		rt6_remove_exception(bucket, rt6_ex);
1602 		return;
1603 	} else if (rt->rt6i_flags & RTF_GATEWAY) {
1604 		struct neighbour *neigh;
1605 		__u8 neigh_flags = 0;
1606 
1607 		neigh = dst_neigh_lookup(&rt->dst, &rt->rt6i_gateway);
1608 		if (neigh) {
1609 			neigh_flags = neigh->flags;
1610 			neigh_release(neigh);
1611 		}
1612 		if (!(neigh_flags & NTF_ROUTER)) {
1613 			RT6_TRACE("purging route %p via non-router but gateway\n",
1614 				  rt);
1615 			rt6_remove_exception(bucket, rt6_ex);
1616 			return;
1617 		}
1618 	} else if (__rt6_check_expired(rt)) {
1619 		RT6_TRACE("purging expired route %p\n", rt);
1620 		rt6_remove_exception(bucket, rt6_ex);
1621 		return;
1622 	}
1623 	gc_args->more++;
1624 }
1625 
1626 void rt6_age_exceptions(struct rt6_info *rt,
1627 			struct fib6_gc_args *gc_args,
1628 			unsigned long now)
1629 {
1630 	struct rt6_exception_bucket *bucket;
1631 	struct rt6_exception *rt6_ex;
1632 	struct hlist_node *tmp;
1633 	int i;
1634 
1635 	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1636 		return;
1637 
1638 	spin_lock_bh(&rt6_exception_lock);
1639 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1640 				    lockdep_is_held(&rt6_exception_lock));
1641 
1642 	if (bucket) {
1643 		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1644 			hlist_for_each_entry_safe(rt6_ex, tmp,
1645 						  &bucket->chain, hlist) {
1646 				rt6_age_examine_exception(bucket, rt6_ex,
1647 							  gc_args, now);
1648 			}
1649 			bucket++;
1650 		}
1651 	}
1652 	spin_unlock_bh(&rt6_exception_lock);
1653 }
1654 
1655 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1656 			       int oif, struct flowi6 *fl6, int flags)
1657 {
1658 	struct fib6_node *fn, *saved_fn;
1659 	struct rt6_info *rt, *rt_cache;
1660 	int strict = 0;
1661 
1662 	strict |= flags & RT6_LOOKUP_F_IFACE;
1663 	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1664 	if (net->ipv6.devconf_all->forwarding == 0)
1665 		strict |= RT6_LOOKUP_F_REACHABLE;
1666 
1667 	rcu_read_lock();
1668 
1669 	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1670 	saved_fn = fn;
1671 
1672 	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1673 		oif = 0;
1674 
1675 redo_rt6_select:
1676 	rt = rt6_select(net, fn, oif, strict);
1677 	if (rt->rt6i_nsiblings)
1678 		rt = rt6_multipath_select(rt, fl6, oif, strict);
1679 	if (rt == net->ipv6.ip6_null_entry) {
1680 		fn = fib6_backtrack(fn, &fl6->saddr);
1681 		if (fn)
1682 			goto redo_rt6_select;
1683 		else if (strict & RT6_LOOKUP_F_REACHABLE) {
1684 			/* also consider unreachable route */
1685 			strict &= ~RT6_LOOKUP_F_REACHABLE;
1686 			fn = saved_fn;
1687 			goto redo_rt6_select;
1688 		}
1689 	}
1690 
1691 	/*Search through exception table */
1692 	rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
1693 	if (rt_cache)
1694 		rt = rt_cache;
1695 
1696 	if (rt == net->ipv6.ip6_null_entry) {
1697 		rcu_read_unlock();
1698 		dst_hold(&rt->dst);
1699 		trace_fib6_table_lookup(net, rt, table, fl6);
1700 		return rt;
1701 	} else if (rt->rt6i_flags & RTF_CACHE) {
1702 		if (ip6_hold_safe(net, &rt, true)) {
1703 			dst_use_noref(&rt->dst, jiffies);
1704 			rt6_dst_from_metrics_check(rt);
1705 		}
1706 		rcu_read_unlock();
1707 		trace_fib6_table_lookup(net, rt, table, fl6);
1708 		return rt;
1709 	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1710 			    !(rt->rt6i_flags & RTF_GATEWAY))) {
1711 		/* Create a RTF_CACHE clone which will not be
1712 		 * owned by the fib6 tree.  It is for the special case where
1713 		 * the daddr in the skb during the neighbor look-up is different
1714 		 * from the fl6->daddr used to look-up route here.
1715 		 */
1716 
1717 		struct rt6_info *uncached_rt;
1718 
1719 		if (ip6_hold_safe(net, &rt, true)) {
1720 			dst_use_noref(&rt->dst, jiffies);
1721 		} else {
1722 			rcu_read_unlock();
1723 			uncached_rt = rt;
1724 			goto uncached_rt_out;
1725 		}
1726 		rcu_read_unlock();
1727 
1728 		uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
1729 		dst_release(&rt->dst);
1730 
1731 		if (uncached_rt) {
1732 			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1733 			 * No need for another dst_hold()
1734 			 */
1735 			rt6_uncached_list_add(uncached_rt);
1736 			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
1737 		} else {
1738 			uncached_rt = net->ipv6.ip6_null_entry;
1739 			dst_hold(&uncached_rt->dst);
1740 		}
1741 
1742 uncached_rt_out:
1743 		trace_fib6_table_lookup(net, uncached_rt, table, fl6);
1744 		return uncached_rt;
1745 
1746 	} else {
1747 		/* Get a percpu copy */
1748 
1749 		struct rt6_info *pcpu_rt;
1750 
1751 		dst_use_noref(&rt->dst, jiffies);
1752 		local_bh_disable();
1753 		pcpu_rt = rt6_get_pcpu_route(rt);
1754 
1755 		if (!pcpu_rt) {
1756 			/* atomic_inc_not_zero() is needed when using rcu */
1757 			if (atomic_inc_not_zero(&rt->rt6i_ref)) {
1758 				/* No dst_hold() on rt is needed because grabbing
1759 				 * rt->rt6i_ref makes sure rt can't be released.
1760 				 */
1761 				pcpu_rt = rt6_make_pcpu_route(rt);
1762 				rt6_release(rt);
1763 			} else {
1764 				/* rt is already removed from tree */
1765 				pcpu_rt = net->ipv6.ip6_null_entry;
1766 				dst_hold(&pcpu_rt->dst);
1767 			}
1768 		}
1769 		local_bh_enable();
1770 		rcu_read_unlock();
1771 		trace_fib6_table_lookup(net, pcpu_rt, table, fl6);
1772 		return pcpu_rt;
1773 	}
1774 }
1775 EXPORT_SYMBOL_GPL(ip6_pol_route);
1776 
1777 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
1778 					    struct flowi6 *fl6, int flags)
1779 {
1780 	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
1781 }
1782 
1783 struct dst_entry *ip6_route_input_lookup(struct net *net,
1784 					 struct net_device *dev,
1785 					 struct flowi6 *fl6, int flags)
1786 {
1787 	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1788 		flags |= RT6_LOOKUP_F_IFACE;
1789 
1790 	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
1791 }
1792 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1793 
1794 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1795 				  struct flow_keys *keys)
1796 {
1797 	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1798 	const struct ipv6hdr *key_iph = outer_iph;
1799 	const struct ipv6hdr *inner_iph;
1800 	const struct icmp6hdr *icmph;
1801 	struct ipv6hdr _inner_iph;
1802 
1803 	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1804 		goto out;
1805 
1806 	icmph = icmp6_hdr(skb);
1807 	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1808 	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1809 	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1810 	    icmph->icmp6_type != ICMPV6_PARAMPROB)
1811 		goto out;
1812 
1813 	inner_iph = skb_header_pointer(skb,
1814 				       skb_transport_offset(skb) + sizeof(*icmph),
1815 				       sizeof(_inner_iph), &_inner_iph);
1816 	if (!inner_iph)
1817 		goto out;
1818 
1819 	key_iph = inner_iph;
1820 out:
1821 	memset(keys, 0, sizeof(*keys));
1822 	keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1823 	keys->addrs.v6addrs.src = key_iph->saddr;
1824 	keys->addrs.v6addrs.dst = key_iph->daddr;
1825 	keys->tags.flow_label = ip6_flowinfo(key_iph);
1826 	keys->basic.ip_proto = key_iph->nexthdr;
1827 }
1828 
1829 /* if skb is set it will be used and fl6 can be NULL */
1830 u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb)
1831 {
1832 	struct flow_keys hash_keys;
1833 
1834 	if (skb) {
1835 		ip6_multipath_l3_keys(skb, &hash_keys);
1836 		return flow_hash_from_keys(&hash_keys);
1837 	}
1838 
1839 	return get_hash_from_flowi6(fl6);
1840 }
1841 
1842 void ip6_route_input(struct sk_buff *skb)
1843 {
1844 	const struct ipv6hdr *iph = ipv6_hdr(skb);
1845 	struct net *net = dev_net(skb->dev);
1846 	int flags = RT6_LOOKUP_F_HAS_SADDR;
1847 	struct ip_tunnel_info *tun_info;
1848 	struct flowi6 fl6 = {
1849 		.flowi6_iif = skb->dev->ifindex,
1850 		.daddr = iph->daddr,
1851 		.saddr = iph->saddr,
1852 		.flowlabel = ip6_flowinfo(iph),
1853 		.flowi6_mark = skb->mark,
1854 		.flowi6_proto = iph->nexthdr,
1855 	};
1856 
1857 	tun_info = skb_tunnel_info(skb);
1858 	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1859 		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
1860 	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
1861 		fl6.mp_hash = rt6_multipath_hash(&fl6, skb);
1862 	skb_dst_drop(skb);
1863 	skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
1864 }
1865 
1866 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
1867 					     struct flowi6 *fl6, int flags)
1868 {
1869 	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
1870 }
1871 
1872 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
1873 					 struct flowi6 *fl6, int flags)
1874 {
1875 	bool any_src;
1876 
1877 	if (rt6_need_strict(&fl6->daddr)) {
1878 		struct dst_entry *dst;
1879 
1880 		dst = l3mdev_link_scope_lookup(net, fl6);
1881 		if (dst)
1882 			return dst;
1883 	}
1884 
1885 	fl6->flowi6_iif = LOOPBACK_IFINDEX;
1886 
1887 	any_src = ipv6_addr_any(&fl6->saddr);
1888 	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
1889 	    (fl6->flowi6_oif && any_src))
1890 		flags |= RT6_LOOKUP_F_IFACE;
1891 
1892 	if (!any_src)
1893 		flags |= RT6_LOOKUP_F_HAS_SADDR;
1894 	else if (sk)
1895 		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1896 
1897 	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
1898 }
1899 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
1900 
1901 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
1902 {
1903 	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1904 	struct net_device *loopback_dev = net->loopback_dev;
1905 	struct dst_entry *new = NULL;
1906 
1907 	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
1908 		       DST_OBSOLETE_DEAD, 0);
1909 	if (rt) {
1910 		rt6_info_init(rt);
1911 		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
1912 
1913 		new = &rt->dst;
1914 		new->__use = 1;
1915 		new->input = dst_discard;
1916 		new->output = dst_discard_out;
1917 
1918 		dst_copy_metrics(new, &ort->dst);
1919 
1920 		rt->rt6i_idev = in6_dev_get(loopback_dev);
1921 		rt->rt6i_gateway = ort->rt6i_gateway;
1922 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
1923 		rt->rt6i_metric = 0;
1924 
1925 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1926 #ifdef CONFIG_IPV6_SUBTREES
1927 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1928 #endif
1929 	}
1930 
1931 	dst_release(dst_orig);
1932 	return new ? new : ERR_PTR(-ENOMEM);
1933 }
1934 
1935 /*
1936  *	Destination cache support functions
1937  */
1938 
1939 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1940 {
1941 	if (rt->from &&
1942 	    dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(&rt->from->dst))
1943 		dst_init_metrics(&rt->dst, dst_metrics_ptr(&rt->from->dst), true);
1944 }
1945 
1946 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1947 {
1948 	u32 rt_cookie = 0;
1949 
1950 	if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie)
1951 		return NULL;
1952 
1953 	if (rt6_check_expired(rt))
1954 		return NULL;
1955 
1956 	return &rt->dst;
1957 }
1958 
1959 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1960 {
1961 	if (!__rt6_check_expired(rt) &&
1962 	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1963 	    rt6_check(rt->from, cookie))
1964 		return &rt->dst;
1965 	else
1966 		return NULL;
1967 }
1968 
1969 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1970 {
1971 	struct rt6_info *rt;
1972 
1973 	rt = (struct rt6_info *) dst;
1974 
1975 	/* All IPV6 dsts are created with ->obsolete set to the value
1976 	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1977 	 * into this function always.
1978 	 */
1979 
1980 	rt6_dst_from_metrics_check(rt);
1981 
1982 	if (rt->rt6i_flags & RTF_PCPU ||
1983 	    (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from))
1984 		return rt6_dst_from_check(rt, cookie);
1985 	else
1986 		return rt6_check(rt, cookie);
1987 }
1988 
1989 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1990 {
1991 	struct rt6_info *rt = (struct rt6_info *) dst;
1992 
1993 	if (rt) {
1994 		if (rt->rt6i_flags & RTF_CACHE) {
1995 			if (rt6_check_expired(rt)) {
1996 				ip6_del_rt(rt);
1997 				dst = NULL;
1998 			}
1999 		} else {
2000 			dst_release(dst);
2001 			dst = NULL;
2002 		}
2003 	}
2004 	return dst;
2005 }
2006 
2007 static void ip6_link_failure(struct sk_buff *skb)
2008 {
2009 	struct rt6_info *rt;
2010 
2011 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2012 
2013 	rt = (struct rt6_info *) skb_dst(skb);
2014 	if (rt) {
2015 		if (rt->rt6i_flags & RTF_CACHE) {
2016 			if (dst_hold_safe(&rt->dst))
2017 				ip6_del_rt(rt);
2018 		} else {
2019 			struct fib6_node *fn;
2020 
2021 			rcu_read_lock();
2022 			fn = rcu_dereference(rt->rt6i_node);
2023 			if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2024 				fn->fn_sernum = -1;
2025 			rcu_read_unlock();
2026 		}
2027 	}
2028 }
2029 
2030 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2031 {
2032 	struct net *net = dev_net(rt->dst.dev);
2033 
2034 	rt->rt6i_flags |= RTF_MODIFIED;
2035 	rt->rt6i_pmtu = mtu;
2036 	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2037 }
2038 
2039 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2040 {
2041 	return !(rt->rt6i_flags & RTF_CACHE) &&
2042 		(rt->rt6i_flags & RTF_PCPU ||
2043 		 rcu_access_pointer(rt->rt6i_node));
2044 }
2045 
2046 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2047 				 const struct ipv6hdr *iph, u32 mtu)
2048 {
2049 	const struct in6_addr *daddr, *saddr;
2050 	struct rt6_info *rt6 = (struct rt6_info *)dst;
2051 
2052 	if (rt6->rt6i_flags & RTF_LOCAL)
2053 		return;
2054 
2055 	if (dst_metric_locked(dst, RTAX_MTU))
2056 		return;
2057 
2058 	if (iph) {
2059 		daddr = &iph->daddr;
2060 		saddr = &iph->saddr;
2061 	} else if (sk) {
2062 		daddr = &sk->sk_v6_daddr;
2063 		saddr = &inet6_sk(sk)->saddr;
2064 	} else {
2065 		daddr = NULL;
2066 		saddr = NULL;
2067 	}
2068 	dst_confirm_neigh(dst, daddr);
2069 	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2070 	if (mtu >= dst_mtu(dst))
2071 		return;
2072 
2073 	if (!rt6_cache_allowed_for_pmtu(rt6)) {
2074 		rt6_do_update_pmtu(rt6, mtu);
2075 		/* update rt6_ex->stamp for cache */
2076 		if (rt6->rt6i_flags & RTF_CACHE)
2077 			rt6_update_exception_stamp_rt(rt6);
2078 	} else if (daddr) {
2079 		struct rt6_info *nrt6;
2080 
2081 		nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
2082 		if (nrt6) {
2083 			rt6_do_update_pmtu(nrt6, mtu);
2084 			if (rt6_insert_exception(nrt6, rt6))
2085 				dst_release_immediate(&nrt6->dst);
2086 		}
2087 	}
2088 }
2089 
2090 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2091 			       struct sk_buff *skb, u32 mtu)
2092 {
2093 	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2094 }
2095 
2096 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2097 		     int oif, u32 mark, kuid_t uid)
2098 {
2099 	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2100 	struct dst_entry *dst;
2101 	struct flowi6 fl6;
2102 
2103 	memset(&fl6, 0, sizeof(fl6));
2104 	fl6.flowi6_oif = oif;
2105 	fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
2106 	fl6.daddr = iph->daddr;
2107 	fl6.saddr = iph->saddr;
2108 	fl6.flowlabel = ip6_flowinfo(iph);
2109 	fl6.flowi6_uid = uid;
2110 
2111 	dst = ip6_route_output(net, NULL, &fl6);
2112 	if (!dst->error)
2113 		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2114 	dst_release(dst);
2115 }
2116 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2117 
2118 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2119 {
2120 	struct dst_entry *dst;
2121 
2122 	ip6_update_pmtu(skb, sock_net(sk), mtu,
2123 			sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);
2124 
2125 	dst = __sk_dst_get(sk);
2126 	if (!dst || !dst->obsolete ||
2127 	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2128 		return;
2129 
2130 	bh_lock_sock(sk);
2131 	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2132 		ip6_datagram_dst_update(sk, false);
2133 	bh_unlock_sock(sk);
2134 }
2135 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2136 
2137 /* Handle redirects */
2138 struct ip6rd_flowi {
2139 	struct flowi6 fl6;
2140 	struct in6_addr gateway;
2141 };
2142 
2143 static struct rt6_info *__ip6_route_redirect(struct net *net,
2144 					     struct fib6_table *table,
2145 					     struct flowi6 *fl6,
2146 					     int flags)
2147 {
2148 	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2149 	struct rt6_info *rt, *rt_cache;
2150 	struct fib6_node *fn;
2151 
2152 	/* Get the "current" route for this destination and
2153 	 * check if the redirect has come from appropriate router.
2154 	 *
2155 	 * RFC 4861 specifies that redirects should only be
2156 	 * accepted if they come from the nexthop to the target.
2157 	 * Due to the way the routes are chosen, this notion
2158 	 * is a bit fuzzy and one might need to check all possible
2159 	 * routes.
2160 	 */
2161 
2162 	rcu_read_lock();
2163 	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2164 restart:
2165 	for_each_fib6_node_rt_rcu(fn) {
2166 		if (rt->rt6i_nh_flags & RTNH_F_DEAD)
2167 			continue;
2168 		if (rt6_check_expired(rt))
2169 			continue;
2170 		if (rt->dst.error)
2171 			break;
2172 		if (!(rt->rt6i_flags & RTF_GATEWAY))
2173 			continue;
2174 		if (fl6->flowi6_oif != rt->dst.dev->ifindex)
2175 			continue;
2176 		/* rt_cache's gateway might be different from its 'parent'
2177 		 * in the case of an ip redirect.
2178 		 * So we keep searching in the exception table if the gateway
2179 		 * is different.
2180 		 */
2181 		if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) {
2182 			rt_cache = rt6_find_cached_rt(rt,
2183 						      &fl6->daddr,
2184 						      &fl6->saddr);
2185 			if (rt_cache &&
2186 			    ipv6_addr_equal(&rdfl->gateway,
2187 					    &rt_cache->rt6i_gateway)) {
2188 				rt = rt_cache;
2189 				break;
2190 			}
2191 			continue;
2192 		}
2193 		break;
2194 	}
2195 
2196 	if (!rt)
2197 		rt = net->ipv6.ip6_null_entry;
2198 	else if (rt->dst.error) {
2199 		rt = net->ipv6.ip6_null_entry;
2200 		goto out;
2201 	}
2202 
2203 	if (rt == net->ipv6.ip6_null_entry) {
2204 		fn = fib6_backtrack(fn, &fl6->saddr);
2205 		if (fn)
2206 			goto restart;
2207 	}
2208 
2209 out:
2210 	ip6_hold_safe(net, &rt, true);
2211 
2212 	rcu_read_unlock();
2213 
2214 	trace_fib6_table_lookup(net, rt, table, fl6);
2215 	return rt;
2216 };
2217 
2218 static struct dst_entry *ip6_route_redirect(struct net *net,
2219 					const struct flowi6 *fl6,
2220 					const struct in6_addr *gateway)
2221 {
2222 	int flags = RT6_LOOKUP_F_HAS_SADDR;
2223 	struct ip6rd_flowi rdfl;
2224 
2225 	rdfl.fl6 = *fl6;
2226 	rdfl.gateway = *gateway;
2227 
2228 	return fib6_rule_lookup(net, &rdfl.fl6,
2229 				flags, __ip6_route_redirect);
2230 }
2231 
2232 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2233 		  kuid_t uid)
2234 {
2235 	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2236 	struct dst_entry *dst;
2237 	struct flowi6 fl6;
2238 
2239 	memset(&fl6, 0, sizeof(fl6));
2240 	fl6.flowi6_iif = LOOPBACK_IFINDEX;
2241 	fl6.flowi6_oif = oif;
2242 	fl6.flowi6_mark = mark;
2243 	fl6.daddr = iph->daddr;
2244 	fl6.saddr = iph->saddr;
2245 	fl6.flowlabel = ip6_flowinfo(iph);
2246 	fl6.flowi6_uid = uid;
2247 
2248 	dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
2249 	rt6_do_redirect(dst, NULL, skb);
2250 	dst_release(dst);
2251 }
2252 EXPORT_SYMBOL_GPL(ip6_redirect);
2253 
2254 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2255 			    u32 mark)
2256 {
2257 	const struct ipv6hdr *iph = ipv6_hdr(skb);
2258 	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2259 	struct dst_entry *dst;
2260 	struct flowi6 fl6;
2261 
2262 	memset(&fl6, 0, sizeof(fl6));
2263 	fl6.flowi6_iif = LOOPBACK_IFINDEX;
2264 	fl6.flowi6_oif = oif;
2265 	fl6.flowi6_mark = mark;
2266 	fl6.daddr = msg->dest;
2267 	fl6.saddr = iph->daddr;
2268 	fl6.flowi6_uid = sock_net_uid(net, NULL);
2269 
2270 	dst = ip6_route_redirect(net, &fl6, &iph->saddr);
2271 	rt6_do_redirect(dst, NULL, skb);
2272 	dst_release(dst);
2273 }
2274 
2275 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2276 {
2277 	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2278 		     sk->sk_uid);
2279 }
2280 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2281 
2282 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2283 {
2284 	struct net_device *dev = dst->dev;
2285 	unsigned int mtu = dst_mtu(dst);
2286 	struct net *net = dev_net(dev);
2287 
2288 	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2289 
2290 	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2291 		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2292 
2293 	/*
2294 	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2295 	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2296 	 * IPV6_MAXPLEN is also valid and means: "any MSS,
2297 	 * rely only on pmtu discovery"
2298 	 */
2299 	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2300 		mtu = IPV6_MAXPLEN;
2301 	return mtu;
2302 }
2303 
2304 static unsigned int ip6_mtu(const struct dst_entry *dst)
2305 {
2306 	const struct rt6_info *rt = (const struct rt6_info *)dst;
2307 	unsigned int mtu = rt->rt6i_pmtu;
2308 	struct inet6_dev *idev;
2309 
2310 	if (mtu)
2311 		goto out;
2312 
2313 	mtu = dst_metric_raw(dst, RTAX_MTU);
2314 	if (mtu)
2315 		goto out;
2316 
2317 	mtu = IPV6_MIN_MTU;
2318 
2319 	rcu_read_lock();
2320 	idev = __in6_dev_get(dst->dev);
2321 	if (idev)
2322 		mtu = idev->cnf.mtu6;
2323 	rcu_read_unlock();
2324 
2325 out:
2326 	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2327 
2328 	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2329 }
2330 
2331 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2332 				  struct flowi6 *fl6)
2333 {
2334 	struct dst_entry *dst;
2335 	struct rt6_info *rt;
2336 	struct inet6_dev *idev = in6_dev_get(dev);
2337 	struct net *net = dev_net(dev);
2338 
2339 	if (unlikely(!idev))
2340 		return ERR_PTR(-ENODEV);
2341 
2342 	rt = ip6_dst_alloc(net, dev, 0);
2343 	if (unlikely(!rt)) {
2344 		in6_dev_put(idev);
2345 		dst = ERR_PTR(-ENOMEM);
2346 		goto out;
2347 	}
2348 
2349 	rt->dst.flags |= DST_HOST;
2350 	rt->dst.input = ip6_input;
2351 	rt->dst.output  = ip6_output;
2352 	rt->rt6i_gateway  = fl6->daddr;
2353 	rt->rt6i_dst.addr = fl6->daddr;
2354 	rt->rt6i_dst.plen = 128;
2355 	rt->rt6i_idev     = idev;
2356 	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2357 
2358 	/* Add this dst into uncached_list so that rt6_disable_ip() can
2359 	 * do proper release of the net_device
2360 	 */
2361 	rt6_uncached_list_add(rt);
2362 	atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2363 
2364 	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2365 
2366 out:
2367 	return dst;
2368 }
2369 
2370 static int ip6_dst_gc(struct dst_ops *ops)
2371 {
2372 	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2373 	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2374 	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2375 	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2376 	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2377 	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2378 	int entries;
2379 
2380 	entries = dst_entries_get_fast(ops);
2381 	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2382 	    entries <= rt_max_size)
2383 		goto out;
2384 
2385 	net->ipv6.ip6_rt_gc_expire++;
2386 	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2387 	entries = dst_entries_get_slow(ops);
2388 	if (entries < ops->gc_thresh)
2389 		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2390 out:
2391 	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2392 	return entries > rt_max_size;
2393 }
2394 
2395 static int ip6_convert_metrics(struct mx6_config *mxc,
2396 			       const struct fib6_config *cfg)
2397 {
2398 	struct net *net = cfg->fc_nlinfo.nl_net;
2399 	bool ecn_ca = false;
2400 	struct nlattr *nla;
2401 	int remaining;
2402 	u32 *mp;
2403 
2404 	if (!cfg->fc_mx)
2405 		return 0;
2406 
2407 	mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
2408 	if (unlikely(!mp))
2409 		return -ENOMEM;
2410 
2411 	nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
2412 		int type = nla_type(nla);
2413 		u32 val;
2414 
2415 		if (!type)
2416 			continue;
2417 		if (unlikely(type > RTAX_MAX))
2418 			goto err;
2419 
2420 		if (type == RTAX_CC_ALGO) {
2421 			char tmp[TCP_CA_NAME_MAX];
2422 
2423 			nla_strlcpy(tmp, nla, sizeof(tmp));
2424 			val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca);
2425 			if (val == TCP_CA_UNSPEC)
2426 				goto err;
2427 		} else {
2428 			val = nla_get_u32(nla);
2429 		}
2430 		if (type == RTAX_HOPLIMIT && val > 255)
2431 			val = 255;
2432 		if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
2433 			goto err;
2434 
2435 		mp[type - 1] = val;
2436 		__set_bit(type - 1, mxc->mx_valid);
2437 	}
2438 
2439 	if (ecn_ca) {
2440 		__set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
2441 		mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
2442 	}
2443 
2444 	mxc->mx = mp;
2445 	return 0;
2446  err:
2447 	kfree(mp);
2448 	return -EINVAL;
2449 }
2450 
2451 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2452 					    struct fib6_config *cfg,
2453 					    const struct in6_addr *gw_addr)
2454 {
2455 	struct flowi6 fl6 = {
2456 		.flowi6_oif = cfg->fc_ifindex,
2457 		.daddr = *gw_addr,
2458 		.saddr = cfg->fc_prefsrc,
2459 	};
2460 	struct fib6_table *table;
2461 	struct rt6_info *rt;
2462 	int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_IGNORE_LINKSTATE;
2463 
2464 	table = fib6_get_table(net, cfg->fc_table);
2465 	if (!table)
2466 		return NULL;
2467 
2468 	if (!ipv6_addr_any(&cfg->fc_prefsrc))
2469 		flags |= RT6_LOOKUP_F_HAS_SADDR;
2470 
2471 	rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags);
2472 
2473 	/* if table lookup failed, fall back to full lookup */
2474 	if (rt == net->ipv6.ip6_null_entry) {
2475 		ip6_rt_put(rt);
2476 		rt = NULL;
2477 	}
2478 
2479 	return rt;
2480 }
2481 
/*
 * Build (but do not insert) a route from a fib6_config.
 *
 * @cfg:    route description; fc_metric, fc_protocol and
 *          fc_nlinfo.nl_net may be rewritten by this function
 * @extack: netlink extended ack used to report validation errors
 *
 * On success the new rt6_info is returned with its device, inet6_dev and
 * table pointers filled in; the device and inet6_dev references acquired
 * here are stored in the route.  On failure every reference taken so far
 * is dropped, a partially built route is released, and ERR_PTR(err) is
 * returned.
 */
static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
					      struct netlink_ext_ack *extack)
{
	struct net *net = cfg->fc_nlinfo.nl_net;
	struct rt6_info *rt = NULL;
	struct net_device *dev = NULL;
	struct inet6_dev *idev = NULL;
	struct fib6_table *table;
	int addr_type;
	int err = -EINVAL;

	/* RTF_PCPU is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_PCPU) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
		goto out;
	}

	/* RTF_CACHE is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_CACHE) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
		goto out;
	}

	if (cfg->fc_dst_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid prefix length");
		goto out;
	}
	if (cfg->fc_src_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid source address length");
		goto out;
	}
#ifndef CONFIG_IPV6_SUBTREES
	if (cfg->fc_src_len) {
		NL_SET_ERR_MSG(extack,
			       "Specifying source address requires IPV6_SUBTREES to be enabled");
		goto out;
	}
#endif
	/* Resolve an explicitly requested egress device and its inet6_dev. */
	if (cfg->fc_ifindex) {
		err = -ENODEV;
		dev = dev_get_by_index(net, cfg->fc_ifindex);
		if (!dev)
			goto out;
		idev = in6_dev_get(dev);
		if (!idev)
			goto out;
	}

	if (cfg->fc_metric == 0)
		cfg->fc_metric = IP6_RT_PRIO_USER;

	/* Find the target table.  Without NLM_F_CREATE an existing table is
	 * preferred, but a missing one is still created after a warning.
	 */
	err = -ENOBUFS;
	if (cfg->fc_nlinfo.nlh &&
	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
		table = fib6_get_table(net, cfg->fc_table);
		if (!table) {
			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
			table = fib6_new_table(net, cfg->fc_table);
		}
	} else {
		table = fib6_new_table(net, cfg->fc_table);
	}

	if (!table)
		goto out;

	rt = ip6_dst_alloc(net, NULL,
			   (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);

	if (!rt) {
		err = -ENOMEM;
		goto out;
	}

	if (cfg->fc_flags & RTF_EXPIRES)
		rt6_set_expires(rt, jiffies +
				clock_t_to_jiffies(cfg->fc_expires));
	else
		rt6_clean_expires(rt);

	if (cfg->fc_protocol == RTPROT_UNSPEC)
		cfg->fc_protocol = RTPROT_BOOT;
	rt->rt6i_protocol = cfg->fc_protocol;

	addr_type = ipv6_addr_type(&cfg->fc_dst);

	/* Pick the input handler from the destination type and flags. */
	if (addr_type & IPV6_ADDR_MULTICAST)
		rt->dst.input = ip6_mc_input;
	else if (cfg->fc_flags & RTF_LOCAL)
		rt->dst.input = ip6_input;
	else
		rt->dst.input = ip6_forward;

	rt->dst.output = ip6_output;

	/* Optional lightweight-tunnel encapsulation: build its state and,
	 * where the tunnel asks for it, divert the output/input handlers
	 * while remembering the originals in the tunnel state.
	 */
	if (cfg->fc_encap) {
		struct lwtunnel_state *lwtstate;

		err = lwtunnel_build_state(cfg->fc_encap_type,
					   cfg->fc_encap, AF_INET6, cfg,
					   &lwtstate, extack);
		if (err)
			goto out;
		rt->dst.lwtstate = lwtstate_get(lwtstate);
		if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
			rt->dst.lwtstate->orig_output = rt->dst.output;
			rt->dst.output = lwtunnel_output;
		}
		if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
			rt->dst.lwtstate->orig_input = rt->dst.input;
			rt->dst.input = lwtunnel_input;
		}
	}

	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
	rt->rt6i_dst.plen = cfg->fc_dst_len;
	if (rt->rt6i_dst.plen == 128)
		rt->dst.flags |= DST_HOST;

#ifdef CONFIG_IPV6_SUBTREES
	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
	rt->rt6i_src.plen = cfg->fc_src_len;
#endif

	rt->rt6i_metric = cfg->fc_metric;

	/* We cannot add true routes via loopback here; they would result
	 * in kernel looping.  Promote them to reject routes.
	 */
	if ((cfg->fc_flags & RTF_REJECT) ||
	    (dev && (dev->flags & IFF_LOOPBACK) &&
	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
	     !(cfg->fc_flags & RTF_LOCAL))) {
		/* hold loopback dev/idev if we haven't done so. */
		if (dev != net->loopback_dev) {
			if (dev) {
				dev_put(dev);
				in6_dev_put(idev);
			}
			dev = net->loopback_dev;
			dev_hold(dev);
			idev = in6_dev_get(dev);
			if (!idev) {
				err = -ENODEV;
				goto out;
			}
		}
		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
		/* The route type selects the error code and drop handlers. */
		switch (cfg->fc_type) {
		case RTN_BLACKHOLE:
			rt->dst.error = -EINVAL;
			rt->dst.output = dst_discard_out;
			rt->dst.input = dst_discard;
			break;
		case RTN_PROHIBIT:
			rt->dst.error = -EACCES;
			rt->dst.output = ip6_pkt_prohibit_out;
			rt->dst.input = ip6_pkt_prohibit;
			break;
		case RTN_THROW:
		case RTN_UNREACHABLE:
		default:
			rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
					: (cfg->fc_type == RTN_UNREACHABLE)
					? -EHOSTUNREACH : -ENETUNREACH;
			rt->dst.output = ip6_pkt_discard_out;
			rt->dst.input = ip6_pkt_discard;
			break;
		}
		goto install_route;
	}

	if (cfg->fc_flags & RTF_GATEWAY) {
		const struct in6_addr *gw_addr;
		int gwa_type;

		gw_addr = &cfg->fc_gateway;
		gwa_type = ipv6_addr_type(gw_addr);

		/* if gw_addr is local we will fail to detect this in case
		 * address is still TENTATIVE (DAD in progress). rt6_lookup()
		 * will return already-added prefix route via interface that
		 * prefix route was assigned to, which might be non-loopback.
		 */
		err = -EINVAL;
		if (ipv6_chk_addr_and_flags(net, gw_addr,
					    gwa_type & IPV6_ADDR_LINKLOCAL ?
					    dev : NULL, 0, 0)) {
			NL_SET_ERR_MSG(extack, "Invalid gateway address");
			goto out;
		}
		rt->rt6i_gateway = *gw_addr;

		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
			struct rt6_info *grt = NULL;

			/* IPv6 strictly inhibits using non-link-local
			   addresses as nexthop addresses.
			   Otherwise, the router will not be able to send
			   redirects.
			   That is very good, but in some (rare!) circumstances
			   (SIT, PtP, NBMA NOARP links) it is handy to allow
			   some exceptions. --ANK
			   We allow IPv4-mapped nexthops to support RFC4798-type
			   addressing
			 */
			if (!(gwa_type & (IPV6_ADDR_UNICAST |
					  IPV6_ADDR_MAPPED))) {
				NL_SET_ERR_MSG(extack,
					       "Invalid gateway address");
				goto out;
			}

			/* Try the route's own table first; discard a result
			 * that is itself a gateway route or that uses a
			 * device other than the requested one.
			 */
			if (cfg->fc_table) {
				grt = ip6_nh_lookup_table(net, cfg, gw_addr);

				if (grt) {
					if (grt->rt6i_flags & RTF_GATEWAY ||
					    (dev && dev != grt->dst.dev)) {
						ip6_rt_put(grt);
						grt = NULL;
					}
				}
			}

			/* Fall back to a full lookup of the gateway. */
			if (!grt)
				grt = rt6_lookup(net, gw_addr, NULL,
						 cfg->fc_ifindex, 1);

			err = -EHOSTUNREACH;
			if (!grt)
				goto out;
			if (dev) {
				if (dev != grt->dst.dev) {
					ip6_rt_put(grt);
					goto out;
				}
			} else {
				/* No device given: adopt the gateway route's
				 * device and inet6_dev, taking references.
				 */
				dev = grt->dst.dev;
				idev = grt->rt6i_idev;
				dev_hold(dev);
				in6_dev_hold(grt->rt6i_idev);
			}
			/* The gateway must not itself be reached via a gateway. */
			if (!(grt->rt6i_flags & RTF_GATEWAY))
				err = 0;
			ip6_rt_put(grt);

			if (err)
				goto out;
		}
		err = -EINVAL;
		if (!dev) {
			NL_SET_ERR_MSG(extack, "Egress device not specified");
			goto out;
		} else if (dev->flags & IFF_LOOPBACK) {
			NL_SET_ERR_MSG(extack,
				       "Egress device can not be loopback device for this route");
			goto out;
		}
	}

	err = -ENODEV;
	if (!dev)
		goto out;

	/* A preferred source address must pass ipv6_chk_addr() on dev. */
	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
			NL_SET_ERR_MSG(extack, "Invalid source address");
			err = -EINVAL;
			goto out;
		}
		rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
		rt->rt6i_prefsrc.plen = 128;
	} else
		rt->rt6i_prefsrc.plen = 0;

	rt->rt6i_flags = cfg->fc_flags;

install_route:
	if (!(rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
	    !netif_carrier_ok(dev))
		rt->rt6i_nh_flags |= RTNH_F_LINKDOWN;
	/* The dev/idev references held in the locals now live in the route. */
	rt->dst.dev = dev;
	rt->rt6i_idev = idev;
	rt->rt6i_table = table;

	cfg->fc_nlinfo.nl_net = dev_net(dev);

	return rt;
out:
	/* Error path: undo whatever was acquired before the failure. */
	if (dev)
		dev_put(dev);
	if (idev)
		in6_dev_put(idev);
	if (rt)
		dst_release_immediate(&rt->dst);

	return ERR_PTR(err);
}
2780 
2781 int ip6_route_add(struct fib6_config *cfg,
2782 		  struct netlink_ext_ack *extack)
2783 {
2784 	struct mx6_config mxc = { .mx = NULL, };
2785 	struct rt6_info *rt;
2786 	int err;
2787 
2788 	rt = ip6_route_info_create(cfg, extack);
2789 	if (IS_ERR(rt)) {
2790 		err = PTR_ERR(rt);
2791 		rt = NULL;
2792 		goto out;
2793 	}
2794 
2795 	err = ip6_convert_metrics(&mxc, cfg);
2796 	if (err)
2797 		goto out;
2798 
2799 	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack);
2800 
2801 	kfree(mxc.mx);
2802 
2803 	return err;
2804 out:
2805 	if (rt)
2806 		dst_release_immediate(&rt->dst);
2807 
2808 	return err;
2809 }
2810 
2811 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
2812 {
2813 	int err;
2814 	struct fib6_table *table;
2815 	struct net *net = dev_net(rt->dst.dev);
2816 
2817 	if (rt == net->ipv6.ip6_null_entry) {
2818 		err = -ENOENT;
2819 		goto out;
2820 	}
2821 
2822 	table = rt->rt6i_table;
2823 	spin_lock_bh(&table->tb6_lock);
2824 	err = fib6_del(rt, info);
2825 	spin_unlock_bh(&table->tb6_lock);
2826 
2827 out:
2828 	ip6_rt_put(rt);
2829 	return err;
2830 }
2831 
2832 int ip6_del_rt(struct rt6_info *rt)
2833 {
2834 	struct nl_info info = {
2835 		.nl_net = dev_net(rt->dst.dev),
2836 	};
2837 	return __ip6_del_rt(rt, &info);
2838 }
2839 
2840 static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
2841 {
2842 	struct nl_info *info = &cfg->fc_nlinfo;
2843 	struct net *net = info->nl_net;
2844 	struct sk_buff *skb = NULL;
2845 	struct fib6_table *table;
2846 	int err = -ENOENT;
2847 
2848 	if (rt == net->ipv6.ip6_null_entry)
2849 		goto out_put;
2850 	table = rt->rt6i_table;
2851 	spin_lock_bh(&table->tb6_lock);
2852 
2853 	if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
2854 		struct rt6_info *sibling, *next_sibling;
2855 
2856 		/* prefer to send a single notification with all hops */
2857 		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
2858 		if (skb) {
2859 			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2860 
2861 			if (rt6_fill_node(net, skb, rt,
2862 					  NULL, NULL, 0, RTM_DELROUTE,
2863 					  info->portid, seq, 0) < 0) {
2864 				kfree_skb(skb);
2865 				skb = NULL;
2866 			} else
2867 				info->skip_notify = 1;
2868 		}
2869 
2870 		list_for_each_entry_safe(sibling, next_sibling,
2871 					 &rt->rt6i_siblings,
2872 					 rt6i_siblings) {
2873 			err = fib6_del(sibling, info);
2874 			if (err)
2875 				goto out_unlock;
2876 		}
2877 	}
2878 
2879 	err = fib6_del(rt, info);
2880 out_unlock:
2881 	spin_unlock_bh(&table->tb6_lock);
2882 out_put:
2883 	ip6_rt_put(rt);
2884 
2885 	if (skb) {
2886 		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
2887 			    info->nlh, gfp_any());
2888 	}
2889 	return err;
2890 }
2891 
2892 static int ip6_route_del(struct fib6_config *cfg,
2893 			 struct netlink_ext_ack *extack)
2894 {
2895 	struct rt6_info *rt, *rt_cache;
2896 	struct fib6_table *table;
2897 	struct fib6_node *fn;
2898 	int err = -ESRCH;
2899 
2900 	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
2901 	if (!table) {
2902 		NL_SET_ERR_MSG(extack, "FIB table does not exist");
2903 		return err;
2904 	}
2905 
2906 	rcu_read_lock();
2907 
2908 	fn = fib6_locate(&table->tb6_root,
2909 			 &cfg->fc_dst, cfg->fc_dst_len,
2910 			 &cfg->fc_src, cfg->fc_src_len,
2911 			 !(cfg->fc_flags & RTF_CACHE));
2912 
2913 	if (fn) {
2914 		for_each_fib6_node_rt_rcu(fn) {
2915 			if (cfg->fc_flags & RTF_CACHE) {
2916 				rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
2917 							      &cfg->fc_src);
2918 				if (!rt_cache)
2919 					continue;
2920 				rt = rt_cache;
2921 			}
2922 			if (cfg->fc_ifindex &&
2923 			    (!rt->dst.dev ||
2924 			     rt->dst.dev->ifindex != cfg->fc_ifindex))
2925 				continue;
2926 			if (cfg->fc_flags & RTF_GATEWAY &&
2927 			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
2928 				continue;
2929 			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
2930 				continue;
2931 			if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
2932 				continue;
2933 			if (!dst_hold_safe(&rt->dst))
2934 				break;
2935 			rcu_read_unlock();
2936 
2937 			/* if gateway was specified only delete the one hop */
2938 			if (cfg->fc_flags & RTF_GATEWAY)
2939 				return __ip6_del_rt(rt, &cfg->fc_nlinfo);
2940 
2941 			return __ip6_del_rt_siblings(rt, cfg);
2942 		}
2943 	}
2944 	rcu_read_unlock();
2945 
2946 	return err;
2947 }
2948 
/*
 * Validate a received redirect message and, if it is accepted, update
 * the neighbour entry for the redirect target and install a cached
 * exception route for the redirected destination.
 *
 * @dst: route currently used for the destination (cast to rt6_info)
 * @sk:  not referenced in this function
 * @skb: packet carrying the redirect; the message is read at the ICMPv6
 *       header and options are parsed up to the skb tail
 *
 * The message is silently dropped (with a rate-limited debug message in
 * most cases) when any validation step fails.
 */
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct netevent_redirect netevent;
	struct rt6_info *rt, *nrt = NULL;
	struct ndisc_options ndopts;
	struct inet6_dev *in6_dev;
	struct neighbour *neigh;
	struct rd_msg *msg;
	int optlen, on_link;
	u8 *lladdr;

	/* Bytes following the fixed redirect header, i.e. the ND options. */
	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
	optlen -= sizeof(*msg);

	if (optlen < 0) {
		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
		return;
	}

	msg = (struct rd_msg *)icmp6_hdr(skb);

	if (ipv6_addr_is_multicast(&msg->dest)) {
		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
		return;
	}

	/* target == dest means the destination itself is on-link; otherwise
	 * the target must be a link-local unicast address.
	 */
	on_link = 0;
	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
		on_link = 1;
	} else if (ipv6_addr_type(&msg->target) !=
		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
		return;
	}

	in6_dev = __in6_dev_get(skb->dev);
	if (!in6_dev)
		return;
	/* Ignore redirects when forwarding or when they are disabled. */
	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
		return;

	/* RFC2461 8.1:
	 *	The IP source address of the Redirect MUST be the same as the current
	 *	first-hop router for the specified ICMP Destination Address.
	 */

	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
		return;
	}

	/* The target link-layer address option is optional. */
	lladdr = NULL;
	if (ndopts.nd_opts_tgt_lladdr) {
		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
					     skb->dev);
		if (!lladdr) {
			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
			return;
		}
	}

	rt = (struct rt6_info *) dst;
	if (rt->rt6i_flags & RTF_REJECT) {
		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
		return;
	}

	/* Redirect received -> path was valid.
	 * Look, redirects are sent only in response to data packets,
	 * so that this nexthop apparently is reachable. --ANK
	 */
	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);

	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
	if (!neigh)
		return;

	/*
	 *	We have finally decided to accept it.
	 */

	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
		     NEIGH_UPDATE_F_OVERRIDE|
		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
				     NEIGH_UPDATE_F_ISROUTER)),
		     NDISC_REDIRECT, &ndopts);

	nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
	if (!nrt)
		goto out;

	/* An on-link redirect yields a cached route without RTF_GATEWAY. */
	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
	if (on_link)
		nrt->rt6i_flags &= ~RTF_GATEWAY;

	nrt->rt6i_protocol = RTPROT_REDIRECT;
	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;

	/* No need to remove rt from the exception table if rt is
	 * a cached route because rt6_insert_exception() will
	 * take care of it
	 */
	if (rt6_insert_exception(nrt, rt)) {
		dst_release_immediate(&nrt->dst);
		goto out;
	}

	/* Tell netevent listeners about the old and new route. */
	netevent.old = &rt->dst;
	netevent.new = &nrt->dst;
	netevent.daddr = &msg->dest;
	netevent.neigh = neigh;
	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);

out:
	neigh_release(neigh);
}
3066 
3067 /*
3068  *	Misc support functions
3069  */
3070 
3071 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
3072 {
3073 	BUG_ON(from->from);
3074 
3075 	rt->rt6i_flags &= ~RTF_EXPIRES;
3076 	dst_hold(&from->dst);
3077 	rt->from = from;
3078 	dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
3079 }
3080 
3081 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
3082 {
3083 	rt->dst.input = ort->dst.input;
3084 	rt->dst.output = ort->dst.output;
3085 	rt->rt6i_dst = ort->rt6i_dst;
3086 	rt->dst.error = ort->dst.error;
3087 	rt->rt6i_idev = ort->rt6i_idev;
3088 	if (rt->rt6i_idev)
3089 		in6_dev_hold(rt->rt6i_idev);
3090 	rt->dst.lastuse = jiffies;
3091 	rt->rt6i_gateway = ort->rt6i_gateway;
3092 	rt->rt6i_flags = ort->rt6i_flags;
3093 	rt6_set_from(rt, ort);
3094 	rt->rt6i_metric = ort->rt6i_metric;
3095 #ifdef CONFIG_IPV6_SUBTREES
3096 	rt->rt6i_src = ort->rt6i_src;
3097 #endif
3098 	rt->rt6i_prefsrc = ort->rt6i_prefsrc;
3099 	rt->rt6i_table = ort->rt6i_table;
3100 	rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
3101 }
3102 
3103 #ifdef CONFIG_IPV6_ROUTE_INFO
3104 static struct rt6_info *rt6_get_route_info(struct net *net,
3105 					   const struct in6_addr *prefix, int prefixlen,
3106 					   const struct in6_addr *gwaddr,
3107 					   struct net_device *dev)
3108 {
3109 	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3110 	int ifindex = dev->ifindex;
3111 	struct fib6_node *fn;
3112 	struct rt6_info *rt = NULL;
3113 	struct fib6_table *table;
3114 
3115 	table = fib6_get_table(net, tb_id);
3116 	if (!table)
3117 		return NULL;
3118 
3119 	rcu_read_lock();
3120 	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3121 	if (!fn)
3122 		goto out;
3123 
3124 	for_each_fib6_node_rt_rcu(fn) {
3125 		if (rt->dst.dev->ifindex != ifindex)
3126 			continue;
3127 		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3128 			continue;
3129 		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
3130 			continue;
3131 		ip6_hold_safe(NULL, &rt, false);
3132 		break;
3133 	}
3134 out:
3135 	rcu_read_unlock();
3136 	return rt;
3137 }
3138 
3139 static struct rt6_info *rt6_add_route_info(struct net *net,
3140 					   const struct in6_addr *prefix, int prefixlen,
3141 					   const struct in6_addr *gwaddr,
3142 					   struct net_device *dev,
3143 					   unsigned int pref)
3144 {
3145 	struct fib6_config cfg = {
3146 		.fc_metric	= IP6_RT_PRIO_USER,
3147 		.fc_ifindex	= dev->ifindex,
3148 		.fc_dst_len	= prefixlen,
3149 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3150 				  RTF_UP | RTF_PREF(pref),
3151 		.fc_protocol = RTPROT_RA,
3152 		.fc_nlinfo.portid = 0,
3153 		.fc_nlinfo.nlh = NULL,
3154 		.fc_nlinfo.nl_net = net,
3155 	};
3156 
3157 	cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3158 	cfg.fc_dst = *prefix;
3159 	cfg.fc_gateway = *gwaddr;
3160 
3161 	/* We should treat it as a default route if prefix length is 0. */
3162 	if (!prefixlen)
3163 		cfg.fc_flags |= RTF_DEFAULT;
3164 
3165 	ip6_route_add(&cfg, NULL);
3166 
3167 	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3168 }
3169 #endif
3170 
3171 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
3172 {
3173 	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3174 	struct rt6_info *rt;
3175 	struct fib6_table *table;
3176 
3177 	table = fib6_get_table(dev_net(dev), tb_id);
3178 	if (!table)
3179 		return NULL;
3180 
3181 	rcu_read_lock();
3182 	for_each_fib6_node_rt_rcu(&table->tb6_root) {
3183 		if (dev == rt->dst.dev &&
3184 		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3185 		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
3186 			break;
3187 	}
3188 	if (rt)
3189 		ip6_hold_safe(NULL, &rt, false);
3190 	rcu_read_unlock();
3191 	return rt;
3192 }
3193 
3194 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
3195 				     struct net_device *dev,
3196 				     unsigned int pref)
3197 {
3198 	struct fib6_config cfg = {
3199 		.fc_table	= l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3200 		.fc_metric	= IP6_RT_PRIO_USER,
3201 		.fc_ifindex	= dev->ifindex,
3202 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3203 				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3204 		.fc_protocol = RTPROT_RA,
3205 		.fc_nlinfo.portid = 0,
3206 		.fc_nlinfo.nlh = NULL,
3207 		.fc_nlinfo.nl_net = dev_net(dev),
3208 	};
3209 
3210 	cfg.fc_gateway = *gwaddr;
3211 
3212 	if (!ip6_route_add(&cfg, NULL)) {
3213 		struct fib6_table *table;
3214 
3215 		table = fib6_get_table(dev_net(dev), cfg.fc_table);
3216 		if (table)
3217 			table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3218 	}
3219 
3220 	return rt6_get_dflt_router(gwaddr, dev);
3221 }
3222 
3223 static void __rt6_purge_dflt_routers(struct fib6_table *table)
3224 {
3225 	struct rt6_info *rt;
3226 
3227 restart:
3228 	rcu_read_lock();
3229 	for_each_fib6_node_rt_rcu(&table->tb6_root) {
3230 		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3231 		    (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
3232 			if (dst_hold_safe(&rt->dst)) {
3233 				rcu_read_unlock();
3234 				ip6_del_rt(rt);
3235 			} else {
3236 				rcu_read_unlock();
3237 			}
3238 			goto restart;
3239 		}
3240 	}
3241 	rcu_read_unlock();
3242 
3243 	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3244 }
3245 
3246 void rt6_purge_dflt_routers(struct net *net)
3247 {
3248 	struct fib6_table *table;
3249 	struct hlist_head *head;
3250 	unsigned int h;
3251 
3252 	rcu_read_lock();
3253 
3254 	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3255 		head = &net->ipv6.fib_table_hash[h];
3256 		hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3257 			if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3258 				__rt6_purge_dflt_routers(table);
3259 		}
3260 	}
3261 
3262 	rcu_read_unlock();
3263 }
3264 
3265 static void rtmsg_to_fib6_config(struct net *net,
3266 				 struct in6_rtmsg *rtmsg,
3267 				 struct fib6_config *cfg)
3268 {
3269 	memset(cfg, 0, sizeof(*cfg));
3270 
3271 	cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3272 			 : RT6_TABLE_MAIN;
3273 	cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3274 	cfg->fc_metric = rtmsg->rtmsg_metric;
3275 	cfg->fc_expires = rtmsg->rtmsg_info;
3276 	cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3277 	cfg->fc_src_len = rtmsg->rtmsg_src_len;
3278 	cfg->fc_flags = rtmsg->rtmsg_flags;
3279 
3280 	cfg->fc_nlinfo.nl_net = net;
3281 
3282 	cfg->fc_dst = rtmsg->rtmsg_dst;
3283 	cfg->fc_src = rtmsg->rtmsg_src;
3284 	cfg->fc_gateway = rtmsg->rtmsg_gateway;
3285 }
3286 
3287 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3288 {
3289 	struct fib6_config cfg;
3290 	struct in6_rtmsg rtmsg;
3291 	int err;
3292 
3293 	switch (cmd) {
3294 	case SIOCADDRT:		/* Add a route */
3295 	case SIOCDELRT:		/* Delete a route */
3296 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3297 			return -EPERM;
3298 		err = copy_from_user(&rtmsg, arg,
3299 				     sizeof(struct in6_rtmsg));
3300 		if (err)
3301 			return -EFAULT;
3302 
3303 		rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3304 
3305 		rtnl_lock();
3306 		switch (cmd) {
3307 		case SIOCADDRT:
3308 			err = ip6_route_add(&cfg, NULL);
3309 			break;
3310 		case SIOCDELRT:
3311 			err = ip6_route_del(&cfg, NULL);
3312 			break;
3313 		default:
3314 			err = -EINVAL;
3315 		}
3316 		rtnl_unlock();
3317 
3318 		return err;
3319 	}
3320 
3321 	return -EINVAL;
3322 }
3323 
3324 /*
3325  *	Drop the packet on the floor
3326  */
3327 
3328 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3329 {
3330 	int type;
3331 	struct dst_entry *dst = skb_dst(skb);
3332 	switch (ipstats_mib_noroutes) {
3333 	case IPSTATS_MIB_INNOROUTES:
3334 		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3335 		if (type == IPV6_ADDR_ANY) {
3336 			IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3337 				      IPSTATS_MIB_INADDRERRORS);
3338 			break;
3339 		}
3340 		/* FALLTHROUGH */
3341 	case IPSTATS_MIB_OUTNOROUTES:
3342 		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3343 			      ipstats_mib_noroutes);
3344 		break;
3345 	}
3346 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3347 	kfree_skb(skb);
3348 	return 0;
3349 }
3350 
3351 static int ip6_pkt_discard(struct sk_buff *skb)
3352 {
3353 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3354 }
3355 
3356 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3357 {
3358 	skb->dev = skb_dst(skb)->dev;
3359 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3360 }
3361 
3362 static int ip6_pkt_prohibit(struct sk_buff *skb)
3363 {
3364 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3365 }
3366 
3367 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3368 {
3369 	skb->dev = skb_dst(skb)->dev;
3370 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3371 }
3372 
3373 /*
3374  *	Allocate a dst for local (unicast / anycast) address.
3375  */
3376 
3377 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
3378 				    const struct in6_addr *addr,
3379 				    bool anycast)
3380 {
3381 	u32 tb_id;
3382 	struct net *net = dev_net(idev->dev);
3383 	struct net_device *dev = idev->dev;
3384 	struct rt6_info *rt;
3385 
3386 	rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
3387 	if (!rt)
3388 		return ERR_PTR(-ENOMEM);
3389 
3390 	in6_dev_hold(idev);
3391 
3392 	rt->dst.flags |= DST_HOST;
3393 	rt->dst.input = ip6_input;
3394 	rt->dst.output = ip6_output;
3395 	rt->rt6i_idev = idev;
3396 
3397 	rt->rt6i_protocol = RTPROT_KERNEL;
3398 	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
3399 	if (anycast)
3400 		rt->rt6i_flags |= RTF_ANYCAST;
3401 	else
3402 		rt->rt6i_flags |= RTF_LOCAL;
3403 
3404 	rt->rt6i_gateway  = *addr;
3405 	rt->rt6i_dst.addr = *addr;
3406 	rt->rt6i_dst.plen = 128;
3407 	tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3408 	rt->rt6i_table = fib6_get_table(net, tb_id);
3409 
3410 	return rt;
3411 }
3412 
/* remove deleted ip from prefsrc entries */
/* Walker argument handed to fib6_remove_prefsrc() through fib6_clean_all(). */
struct arg_dev_net_ip {
	struct net_device *dev;	/* only routes on this device; NULL matches any */
	struct net *net;	/* namespace whose ip6_null_entry is skipped */
	struct in6_addr *addr;	/* address compared against rt6i_prefsrc.addr */
};
3419 
3420 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
3421 {
3422 	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3423 	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3424 	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3425 
3426 	if (((void *)rt->dst.dev == dev || !dev) &&
3427 	    rt != net->ipv6.ip6_null_entry &&
3428 	    ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
3429 		spin_lock_bh(&rt6_exception_lock);
3430 		/* remove prefsrc entry */
3431 		rt->rt6i_prefsrc.plen = 0;
3432 		/* need to update cache as well */
3433 		rt6_exceptions_remove_prefsrc(rt);
3434 		spin_unlock_bh(&rt6_exception_lock);
3435 	}
3436 	return 0;
3437 }
3438 
3439 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3440 {
3441 	struct net *net = dev_net(ifp->idev->dev);
3442 	struct arg_dev_net_ip adni = {
3443 		.dev = ifp->idev->dev,
3444 		.net = net,
3445 		.addr = &ifp->addr,
3446 	};
3447 	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3448 }
3449 
3450 #define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3451 
3452 /* Remove routers and update dst entries when gateway turn into host. */
3453 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
3454 {
3455 	struct in6_addr *gateway = (struct in6_addr *)arg;
3456 
3457 	if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3458 	    ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
3459 		return -1;
3460 	}
3461 
3462 	/* Further clean up cached routes in exception table.
3463 	 * This is needed because cached route may have a different
3464 	 * gateway than its 'parent' in the case of an ip redirect.
3465 	 */
3466 	rt6_exceptions_clean_tohost(rt, gateway);
3467 
3468 	return 0;
3469 }
3470 
3471 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3472 {
3473 	fib6_clean_all(net, fib6_clean_tohost, gateway);
3474 }
3475 
/*
 * Walker argument shared by fib6_ifup() and fib6_ifdown().  The two
 * callbacks each use a different member of the union.
 */
struct arg_netdev_event {
	const struct net_device *dev;	/* device the event refers to */
	union {
		unsigned int nh_flags;	/* fib6_ifup(): nexthop flags to clear */
		unsigned long event;	/* fib6_ifdown(): NETDEV_* event code */
	};
};
3483 
3484 static int fib6_ifup(struct rt6_info *rt, void *p_arg)
3485 {
3486 	const struct arg_netdev_event *arg = p_arg;
3487 	const struct net *net = dev_net(arg->dev);
3488 
3489 	if (rt != net->ipv6.ip6_null_entry && rt->dst.dev == arg->dev) {
3490 		rt->rt6i_nh_flags &= ~arg->nh_flags;
3491 		fib6_update_sernum_upto_root(dev_net(rt->dst.dev), rt);
3492 	}
3493 
3494 	return 0;
3495 }
3496 
3497 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
3498 {
3499 	struct arg_netdev_event arg = {
3500 		.dev = dev,
3501 		.nh_flags = nh_flags,
3502 	};
3503 
3504 	if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
3505 		arg.nh_flags |= RTNH_F_LINKDOWN;
3506 
3507 	fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
3508 }
3509 
3510 static bool rt6_multipath_uses_dev(const struct rt6_info *rt,
3511 				   const struct net_device *dev)
3512 {
3513 	struct rt6_info *iter;
3514 
3515 	if (rt->dst.dev == dev)
3516 		return true;
3517 	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3518 		if (iter->dst.dev == dev)
3519 			return true;
3520 
3521 	return false;
3522 }
3523 
3524 static void rt6_multipath_flush(struct rt6_info *rt)
3525 {
3526 	struct rt6_info *iter;
3527 
3528 	rt->should_flush = 1;
3529 	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3530 		iter->should_flush = 1;
3531 }
3532 
3533 static unsigned int rt6_multipath_dead_count(const struct rt6_info *rt,
3534 					     const struct net_device *down_dev)
3535 {
3536 	struct rt6_info *iter;
3537 	unsigned int dead = 0;
3538 
3539 	if (rt->dst.dev == down_dev || rt->rt6i_nh_flags & RTNH_F_DEAD)
3540 		dead++;
3541 	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3542 		if (iter->dst.dev == down_dev ||
3543 		    iter->rt6i_nh_flags & RTNH_F_DEAD)
3544 			dead++;
3545 
3546 	return dead;
3547 }
3548 
3549 static void rt6_multipath_nh_flags_set(struct rt6_info *rt,
3550 				       const struct net_device *dev,
3551 				       unsigned int nh_flags)
3552 {
3553 	struct rt6_info *iter;
3554 
3555 	if (rt->dst.dev == dev)
3556 		rt->rt6i_nh_flags |= nh_flags;
3557 	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3558 		if (iter->dst.dev == dev)
3559 			iter->rt6i_nh_flags |= nh_flags;
3560 }
3561 
3562 /* called with write lock held for table with rt */
3563 static int fib6_ifdown(struct rt6_info *rt, void *p_arg)
3564 {
3565 	const struct arg_netdev_event *arg = p_arg;
3566 	const struct net_device *dev = arg->dev;
3567 	const struct net *net = dev_net(dev);
3568 
3569 	if (rt == net->ipv6.ip6_null_entry)
3570 		return 0;
3571 
3572 	switch (arg->event) {
3573 	case NETDEV_UNREGISTER:
3574 		return rt->dst.dev == dev ? -1 : 0;
3575 	case NETDEV_DOWN:
3576 		if (rt->should_flush)
3577 			return -1;
3578 		if (!rt->rt6i_nsiblings)
3579 			return rt->dst.dev == dev ? -1 : 0;
3580 		if (rt6_multipath_uses_dev(rt, dev)) {
3581 			unsigned int count;
3582 
3583 			count = rt6_multipath_dead_count(rt, dev);
3584 			if (rt->rt6i_nsiblings + 1 == count) {
3585 				rt6_multipath_flush(rt);
3586 				return -1;
3587 			}
3588 			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
3589 						   RTNH_F_LINKDOWN);
3590 			fib6_update_sernum(rt);
3591 		}
3592 		return -2;
3593 	case NETDEV_CHANGE:
3594 		if (rt->dst.dev != dev ||
3595 		    rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST))
3596 			break;
3597 		rt->rt6i_nh_flags |= RTNH_F_LINKDOWN;
3598 		break;
3599 	}
3600 
3601 	return 0;
3602 }
3603 
3604 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
3605 {
3606 	struct arg_netdev_event arg = {
3607 		.dev = dev,
3608 		.event = event,
3609 	};
3610 
3611 	fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
3612 }
3613 
3614 void rt6_disable_ip(struct net_device *dev, unsigned long event)
3615 {
3616 	rt6_sync_down_dev(dev, event);
3617 	rt6_uncached_list_flush_dev(dev_net(dev), dev);
3618 	neigh_ifdown(&nd_tbl, dev);
3619 }
3620 
/* Walker argument handed to rt6_mtu_change_route() by rt6_mtu_change(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* the new MTU */
};
3625 
3626 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
3627 {
3628 	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
3629 	struct inet6_dev *idev;
3630 
3631 	/* In IPv6 pmtu discovery is not optional,
3632 	   so that RTAX_MTU lock cannot disable it.
3633 	   We still use this lock to block changes
3634 	   caused by addrconf/ndisc.
3635 	*/
3636 
3637 	idev = __in6_dev_get(arg->dev);
3638 	if (!idev)
3639 		return 0;
3640 
3641 	/* For administrative MTU increase, there is no way to discover
3642 	   IPv6 PMTU increase, so PMTU increase should be updated here.
3643 	   Since RFC 1981 doesn't include administrative MTU increase
3644 	   update PMTU increase is a MUST. (i.e. jumbo frame)
3645 	 */
3646 	/*
3647 	   If new MTU is less than route PMTU, this new MTU will be the
3648 	   lowest MTU in the path, update the route PMTU to reflect PMTU
3649 	   decreases; if new MTU is greater than route PMTU, and the
3650 	   old MTU is the lowest MTU in the path, update the route PMTU
3651 	   to reflect the increase. In this case if the other nodes' MTU
3652 	   also have the lowest MTU, TOO BIG MESSAGE will be lead to
3653 	   PMTU discovery.
3654 	 */
3655 	if (rt->dst.dev == arg->dev &&
3656 	    dst_metric_raw(&rt->dst, RTAX_MTU) &&
3657 	    !dst_metric_locked(&rt->dst, RTAX_MTU)) {
3658 		spin_lock_bh(&rt6_exception_lock);
3659 		if (dst_mtu(&rt->dst) >= arg->mtu ||
3660 		    (dst_mtu(&rt->dst) < arg->mtu &&
3661 		     dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
3662 			dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
3663 		}
3664 		rt6_exceptions_update_pmtu(rt, arg->mtu);
3665 		spin_unlock_bh(&rt6_exception_lock);
3666 	}
3667 	return 0;
3668 }
3669 
3670 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
3671 {
3672 	struct rt6_mtu_change_arg arg = {
3673 		.dev = dev,
3674 		.mtu = mtu,
3675 	};
3676 
3677 	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
3678 }
3679 
3680 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
3681 	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
3682 	[RTA_OIF]               = { .type = NLA_U32 },
3683 	[RTA_IIF]		= { .type = NLA_U32 },
3684 	[RTA_PRIORITY]          = { .type = NLA_U32 },
3685 	[RTA_METRICS]           = { .type = NLA_NESTED },
3686 	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
3687 	[RTA_PREF]              = { .type = NLA_U8 },
3688 	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
3689 	[RTA_ENCAP]		= { .type = NLA_NESTED },
3690 	[RTA_EXPIRES]		= { .type = NLA_U32 },
3691 	[RTA_UID]		= { .type = NLA_U32 },
3692 	[RTA_MARK]		= { .type = NLA_U32 },
3693 };
3694 
3695 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
3696 			      struct fib6_config *cfg,
3697 			      struct netlink_ext_ack *extack)
3698 {
3699 	struct rtmsg *rtm;
3700 	struct nlattr *tb[RTA_MAX+1];
3701 	unsigned int pref;
3702 	int err;
3703 
3704 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
3705 			  NULL);
3706 	if (err < 0)
3707 		goto errout;
3708 
3709 	err = -EINVAL;
3710 	rtm = nlmsg_data(nlh);
3711 	memset(cfg, 0, sizeof(*cfg));
3712 
3713 	cfg->fc_table = rtm->rtm_table;
3714 	cfg->fc_dst_len = rtm->rtm_dst_len;
3715 	cfg->fc_src_len = rtm->rtm_src_len;
3716 	cfg->fc_flags = RTF_UP;
3717 	cfg->fc_protocol = rtm->rtm_protocol;
3718 	cfg->fc_type = rtm->rtm_type;
3719 
3720 	if (rtm->rtm_type == RTN_UNREACHABLE ||
3721 	    rtm->rtm_type == RTN_BLACKHOLE ||
3722 	    rtm->rtm_type == RTN_PROHIBIT ||
3723 	    rtm->rtm_type == RTN_THROW)
3724 		cfg->fc_flags |= RTF_REJECT;
3725 
3726 	if (rtm->rtm_type == RTN_LOCAL)
3727 		cfg->fc_flags |= RTF_LOCAL;
3728 
3729 	if (rtm->rtm_flags & RTM_F_CLONED)
3730 		cfg->fc_flags |= RTF_CACHE;
3731 
3732 	cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
3733 	cfg->fc_nlinfo.nlh = nlh;
3734 	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
3735 
3736 	if (tb[RTA_GATEWAY]) {
3737 		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
3738 		cfg->fc_flags |= RTF_GATEWAY;
3739 	}
3740 
3741 	if (tb[RTA_DST]) {
3742 		int plen = (rtm->rtm_dst_len + 7) >> 3;
3743 
3744 		if (nla_len(tb[RTA_DST]) < plen)
3745 			goto errout;
3746 
3747 		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
3748 	}
3749 
3750 	if (tb[RTA_SRC]) {
3751 		int plen = (rtm->rtm_src_len + 7) >> 3;
3752 
3753 		if (nla_len(tb[RTA_SRC]) < plen)
3754 			goto errout;
3755 
3756 		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
3757 	}
3758 
3759 	if (tb[RTA_PREFSRC])
3760 		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
3761 
3762 	if (tb[RTA_OIF])
3763 		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
3764 
3765 	if (tb[RTA_PRIORITY])
3766 		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
3767 
3768 	if (tb[RTA_METRICS]) {
3769 		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
3770 		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
3771 	}
3772 
3773 	if (tb[RTA_TABLE])
3774 		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
3775 
3776 	if (tb[RTA_MULTIPATH]) {
3777 		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
3778 		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
3779 
3780 		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
3781 						     cfg->fc_mp_len, extack);
3782 		if (err < 0)
3783 			goto errout;
3784 	}
3785 
3786 	if (tb[RTA_PREF]) {
3787 		pref = nla_get_u8(tb[RTA_PREF]);
3788 		if (pref != ICMPV6_ROUTER_PREF_LOW &&
3789 		    pref != ICMPV6_ROUTER_PREF_HIGH)
3790 			pref = ICMPV6_ROUTER_PREF_MEDIUM;
3791 		cfg->fc_flags |= RTF_PREF(pref);
3792 	}
3793 
3794 	if (tb[RTA_ENCAP])
3795 		cfg->fc_encap = tb[RTA_ENCAP];
3796 
3797 	if (tb[RTA_ENCAP_TYPE]) {
3798 		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
3799 
3800 		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
3801 		if (err < 0)
3802 			goto errout;
3803 	}
3804 
3805 	if (tb[RTA_EXPIRES]) {
3806 		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
3807 
3808 		if (addrconf_finite_timeout(timeout)) {
3809 			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
3810 			cfg->fc_flags |= RTF_EXPIRES;
3811 		}
3812 	}
3813 
3814 	err = 0;
3815 errout:
3816 	return err;
3817 }
3818 
/*
 * One nexthop of a multipath route being built by
 * ip6_route_multipath_add(); linked into its local rt6_nh_list.
 */
struct rt6_nh {
	struct rt6_info *rt6_info;	/* route for this nexthop; reset to NULL once inserted */
	struct fib6_config r_cfg;	/* per-nexthop copy of the config, reused for rollback */
	struct mx6_config mxc;		/* metrics converted from r_cfg */
	struct list_head next;		/* rt6_nh_list linkage */
};
3825 
3826 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
3827 {
3828 	struct rt6_nh *nh;
3829 
3830 	list_for_each_entry(nh, rt6_nh_list, next) {
3831 		pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
3832 		        &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
3833 		        nh->r_cfg.fc_ifindex);
3834 	}
3835 }
3836 
3837 static int ip6_route_info_append(struct list_head *rt6_nh_list,
3838 				 struct rt6_info *rt, struct fib6_config *r_cfg)
3839 {
3840 	struct rt6_nh *nh;
3841 	int err = -EEXIST;
3842 
3843 	list_for_each_entry(nh, rt6_nh_list, next) {
3844 		/* check if rt6_info already exists */
3845 		if (rt6_duplicate_nexthop(nh->rt6_info, rt))
3846 			return err;
3847 	}
3848 
3849 	nh = kzalloc(sizeof(*nh), GFP_KERNEL);
3850 	if (!nh)
3851 		return -ENOMEM;
3852 	nh->rt6_info = rt;
3853 	err = ip6_convert_metrics(&nh->mxc, r_cfg);
3854 	if (err) {
3855 		kfree(nh);
3856 		return err;
3857 	}
3858 	memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
3859 	list_add_tail(&nh->next, rt6_nh_list);
3860 
3861 	return 0;
3862 }
3863 
3864 static void ip6_route_mpath_notify(struct rt6_info *rt,
3865 				   struct rt6_info *rt_last,
3866 				   struct nl_info *info,
3867 				   __u16 nlflags)
3868 {
3869 	/* if this is an APPEND route, then rt points to the first route
3870 	 * inserted and rt_last points to last route inserted. Userspace
3871 	 * wants a consistent dump of the route which starts at the first
3872 	 * nexthop. Since sibling routes are always added at the end of
3873 	 * the list, find the first sibling of the last route appended
3874 	 */
3875 	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
3876 		rt = list_first_entry(&rt_last->rt6i_siblings,
3877 				      struct rt6_info,
3878 				      rt6i_siblings);
3879 	}
3880 
3881 	if (rt)
3882 		inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
3883 }
3884 
3885 static int ip6_route_multipath_add(struct fib6_config *cfg,
3886 				   struct netlink_ext_ack *extack)
3887 {
3888 	struct rt6_info *rt_notif = NULL, *rt_last = NULL;
3889 	struct nl_info *info = &cfg->fc_nlinfo;
3890 	struct fib6_config r_cfg;
3891 	struct rtnexthop *rtnh;
3892 	struct rt6_info *rt;
3893 	struct rt6_nh *err_nh;
3894 	struct rt6_nh *nh, *nh_safe;
3895 	__u16 nlflags;
3896 	int remaining;
3897 	int attrlen;
3898 	int err = 1;
3899 	int nhn = 0;
3900 	int replace = (cfg->fc_nlinfo.nlh &&
3901 		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
3902 	LIST_HEAD(rt6_nh_list);
3903 
3904 	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
3905 	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
3906 		nlflags |= NLM_F_APPEND;
3907 
3908 	remaining = cfg->fc_mp_len;
3909 	rtnh = (struct rtnexthop *)cfg->fc_mp;
3910 
3911 	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
3912 	 * rt6_info structs per nexthop
3913 	 */
3914 	while (rtnh_ok(rtnh, remaining)) {
3915 		memcpy(&r_cfg, cfg, sizeof(*cfg));
3916 		if (rtnh->rtnh_ifindex)
3917 			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
3918 
3919 		attrlen = rtnh_attrlen(rtnh);
3920 		if (attrlen > 0) {
3921 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
3922 
3923 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
3924 			if (nla) {
3925 				r_cfg.fc_gateway = nla_get_in6_addr(nla);
3926 				r_cfg.fc_flags |= RTF_GATEWAY;
3927 			}
3928 			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
3929 			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
3930 			if (nla)
3931 				r_cfg.fc_encap_type = nla_get_u16(nla);
3932 		}
3933 
3934 		rt = ip6_route_info_create(&r_cfg, extack);
3935 		if (IS_ERR(rt)) {
3936 			err = PTR_ERR(rt);
3937 			rt = NULL;
3938 			goto cleanup;
3939 		}
3940 
3941 		err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
3942 		if (err) {
3943 			dst_release_immediate(&rt->dst);
3944 			goto cleanup;
3945 		}
3946 
3947 		rtnh = rtnh_next(rtnh, &remaining);
3948 	}
3949 
3950 	/* for add and replace send one notification with all nexthops.
3951 	 * Skip the notification in fib6_add_rt2node and send one with
3952 	 * the full route when done
3953 	 */
3954 	info->skip_notify = 1;
3955 
3956 	err_nh = NULL;
3957 	list_for_each_entry(nh, &rt6_nh_list, next) {
3958 		rt_last = nh->rt6_info;
3959 		err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack);
3960 		/* save reference to first route for notification */
3961 		if (!rt_notif && !err)
3962 			rt_notif = nh->rt6_info;
3963 
3964 		/* nh->rt6_info is used or freed at this point, reset to NULL*/
3965 		nh->rt6_info = NULL;
3966 		if (err) {
3967 			if (replace && nhn)
3968 				ip6_print_replace_route_err(&rt6_nh_list);
3969 			err_nh = nh;
3970 			goto add_errout;
3971 		}
3972 
3973 		/* Because each route is added like a single route we remove
3974 		 * these flags after the first nexthop: if there is a collision,
3975 		 * we have already failed to add the first nexthop:
3976 		 * fib6_add_rt2node() has rejected it; when replacing, old
3977 		 * nexthops have been replaced by first new, the rest should
3978 		 * be added to it.
3979 		 */
3980 		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
3981 						     NLM_F_REPLACE);
3982 		nhn++;
3983 	}
3984 
3985 	/* success ... tell user about new route */
3986 	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
3987 	goto cleanup;
3988 
3989 add_errout:
3990 	/* send notification for routes that were added so that
3991 	 * the delete notifications sent by ip6_route_del are
3992 	 * coherent
3993 	 */
3994 	if (rt_notif)
3995 		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
3996 
3997 	/* Delete routes that were already added */
3998 	list_for_each_entry(nh, &rt6_nh_list, next) {
3999 		if (err_nh == nh)
4000 			break;
4001 		ip6_route_del(&nh->r_cfg, extack);
4002 	}
4003 
4004 cleanup:
4005 	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4006 		if (nh->rt6_info)
4007 			dst_release_immediate(&nh->rt6_info->dst);
4008 		kfree(nh->mxc.mx);
4009 		list_del(&nh->next);
4010 		kfree(nh);
4011 	}
4012 
4013 	return err;
4014 }
4015 
4016 static int ip6_route_multipath_del(struct fib6_config *cfg,
4017 				   struct netlink_ext_ack *extack)
4018 {
4019 	struct fib6_config r_cfg;
4020 	struct rtnexthop *rtnh;
4021 	int remaining;
4022 	int attrlen;
4023 	int err = 1, last_err = 0;
4024 
4025 	remaining = cfg->fc_mp_len;
4026 	rtnh = (struct rtnexthop *)cfg->fc_mp;
4027 
4028 	/* Parse a Multipath Entry */
4029 	while (rtnh_ok(rtnh, remaining)) {
4030 		memcpy(&r_cfg, cfg, sizeof(*cfg));
4031 		if (rtnh->rtnh_ifindex)
4032 			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4033 
4034 		attrlen = rtnh_attrlen(rtnh);
4035 		if (attrlen > 0) {
4036 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4037 
4038 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4039 			if (nla) {
4040 				nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4041 				r_cfg.fc_flags |= RTF_GATEWAY;
4042 			}
4043 		}
4044 		err = ip6_route_del(&r_cfg, extack);
4045 		if (err)
4046 			last_err = err;
4047 
4048 		rtnh = rtnh_next(rtnh, &remaining);
4049 	}
4050 
4051 	return last_err;
4052 }
4053 
4054 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4055 			      struct netlink_ext_ack *extack)
4056 {
4057 	struct fib6_config cfg;
4058 	int err;
4059 
4060 	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4061 	if (err < 0)
4062 		return err;
4063 
4064 	if (cfg.fc_mp)
4065 		return ip6_route_multipath_del(&cfg, extack);
4066 	else {
4067 		cfg.fc_delete_all_nh = 1;
4068 		return ip6_route_del(&cfg, extack);
4069 	}
4070 }
4071 
4072 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4073 			      struct netlink_ext_ack *extack)
4074 {
4075 	struct fib6_config cfg;
4076 	int err;
4077 
4078 	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4079 	if (err < 0)
4080 		return err;
4081 
4082 	if (cfg.fc_mp)
4083 		return ip6_route_multipath_add(&cfg, extack);
4084 	else
4085 		return ip6_route_add(&cfg, extack);
4086 }
4087 
4088 static size_t rt6_nlmsg_size(struct rt6_info *rt)
4089 {
4090 	int nexthop_len = 0;
4091 
4092 	if (rt->rt6i_nsiblings) {
4093 		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
4094 			    + NLA_ALIGN(sizeof(struct rtnexthop))
4095 			    + nla_total_size(16) /* RTA_GATEWAY */
4096 			    + lwtunnel_get_encap_size(rt->dst.lwtstate);
4097 
4098 		nexthop_len *= rt->rt6i_nsiblings;
4099 	}
4100 
4101 	return NLMSG_ALIGN(sizeof(struct rtmsg))
4102 	       + nla_total_size(16) /* RTA_SRC */
4103 	       + nla_total_size(16) /* RTA_DST */
4104 	       + nla_total_size(16) /* RTA_GATEWAY */
4105 	       + nla_total_size(16) /* RTA_PREFSRC */
4106 	       + nla_total_size(4) /* RTA_TABLE */
4107 	       + nla_total_size(4) /* RTA_IIF */
4108 	       + nla_total_size(4) /* RTA_OIF */
4109 	       + nla_total_size(4) /* RTA_PRIORITY */
4110 	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4111 	       + nla_total_size(sizeof(struct rta_cacheinfo))
4112 	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4113 	       + nla_total_size(1) /* RTA_PREF */
4114 	       + lwtunnel_get_encap_size(rt->dst.lwtstate)
4115 	       + nexthop_len;
4116 }
4117 
4118 static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
4119 			    unsigned int *flags, bool skip_oif)
4120 {
4121 	if (rt->rt6i_nh_flags & RTNH_F_DEAD)
4122 		*flags |= RTNH_F_DEAD;
4123 
4124 	if (rt->rt6i_nh_flags & RTNH_F_LINKDOWN) {
4125 		*flags |= RTNH_F_LINKDOWN;
4126 		if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
4127 			*flags |= RTNH_F_DEAD;
4128 	}
4129 
4130 	if (rt->rt6i_flags & RTF_GATEWAY) {
4131 		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
4132 			goto nla_put_failure;
4133 	}
4134 
4135 	if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD)
4136 		*flags |= RTNH_F_OFFLOAD;
4137 
4138 	/* not needed for multipath encoding b/c it has a rtnexthop struct */
4139 	if (!skip_oif && rt->dst.dev &&
4140 	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
4141 		goto nla_put_failure;
4142 
4143 	if (rt->dst.lwtstate &&
4144 	    lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
4145 		goto nla_put_failure;
4146 
4147 	return 0;
4148 
4149 nla_put_failure:
4150 	return -EMSGSIZE;
4151 }
4152 
4153 /* add multipath next hop */
4154 static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
4155 {
4156 	struct rtnexthop *rtnh;
4157 	unsigned int flags = 0;
4158 
4159 	rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4160 	if (!rtnh)
4161 		goto nla_put_failure;
4162 
4163 	rtnh->rtnh_hops = 0;
4164 	rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;
4165 
4166 	if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4167 		goto nla_put_failure;
4168 
4169 	rtnh->rtnh_flags = flags;
4170 
4171 	/* length of rtnetlink header + attributes */
4172 	rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
4173 
4174 	return 0;
4175 
4176 nla_put_failure:
4177 	return -EMSGSIZE;
4178 }
4179 
/*
 * Serialize @rt into @skb as one rtnetlink route message of type @type.
 *
 * @dst / @src: when non-NULL they are emitted as RTA_DST / RTA_SRC with a
 *		prefix length of 128 instead of the route's own prefixes
 *		(RTA_SRC only with CONFIG_IPV6_SUBTREES).
 * @iif:	when non-zero it is emitted as RTA_IIF, or, for a multicast
 *		destination with CONFIG_IPV6_MROUTE, handed to
 *		ip6mr_get_route().
 * @portid, @seq, @flags: passed through to nlmsg_put().
 *
 * Returns 0 on success.  Returns -EMSGSIZE when the header or any
 * attribute does not fit; in the attribute case the partially built
 * message is cancelled first.
 */
static int rt6_fill_node(struct net *net,
			 struct sk_buff *skb, struct rt6_info *rt,
			 struct in6_addr *dst, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags)
{
	u32 metrics[RTAX_MAX];
	struct rtmsg *rtm;
	struct nlmsghdr *nlh;
	long expires;
	u32 table;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	/* fixed rtmsg header */
	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt->rt6i_dst.plen;
	rtm->rtm_src_len = rt->rt6i_src.plen;
	rtm->rtm_tos = 0;
	if (rt->rt6i_table)
		table = rt->rt6i_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	/* the table id goes both into the header and into RTA_TABLE */
	rtm->rtm_table = table;
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;
	/* derive the route type: reject routes by their dst error code,
	 * everything else by route flags / loopback device
	 */
	if (rt->rt6i_flags & RTF_REJECT) {
		switch (rt->dst.error) {
		case -EINVAL:
			rtm->rtm_type = RTN_BLACKHOLE;
			break;
		case -EACCES:
			rtm->rtm_type = RTN_PROHIBIT;
			break;
		case -EAGAIN:
			rtm->rtm_type = RTN_THROW;
			break;
		default:
			rtm->rtm_type = RTN_UNREACHABLE;
			break;
		}
	}
	else if (rt->rt6i_flags & RTF_LOCAL)
		rtm->rtm_type = RTN_LOCAL;
	else if (rt->rt6i_flags & RTF_ANYCAST)
		rtm->rtm_type = RTN_ANYCAST;
	else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
		rtm->rtm_type = RTN_LOCAL;
	else
		rtm->rtm_type = RTN_UNICAST;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->rt6i_protocol;

	if (rt->rt6i_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	/* an explicit @dst overrides the route prefix and is reported /128 */
	if (dst) {
		if (nla_put_in6_addr(skb, RTA_DST, dst))
			goto nla_put_failure;
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
			goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
	/* same scheme for the source prefix */
	if (src) {
		if (nla_put_in6_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
		goto nla_put_failure;
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
			int err = ip6mr_get_route(net, skb, rtm, portid);

			/* 0 returns success right away, without nlmsg_end();
			 * a positive result continues with the attributes
			 * below
			 */
			if (err == 0)
				return 0;
			if (err < 0)
				goto nla_put_failure;
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
	} else if (dst) {
		/* no input interface: report the source address
		 * ip6_route_get_saddr() picks for @dst, if it finds one
		 */
		struct in6_addr saddr_buf;
		if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	/* the route's own preferred source, if configured */
	if (rt->rt6i_prefsrc.plen) {
		struct in6_addr saddr_buf;
		saddr_buf = rt->rt6i_prefsrc.addr;
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	/* work on a copy of the metrics so rt6i_pmtu, when set, can replace
	 * the MTU slot without touching the dst
	 */
	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
	if (rt->rt6i_pmtu)
		metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
	if (rtnetlink_put_metrics(skb, metrics) < 0)
		goto nla_put_failure;

	if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
		goto nla_put_failure;

	/* For multipath routes, walk the siblings list and add
	 * each as a nexthop within RTA_MULTIPATH.
	 */
	if (rt->rt6i_nsiblings) {
		struct rt6_info *sibling, *next_sibling;
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		/* the route itself is the first nexthop */
		if (rt6_add_nexthop(skb, rt) < 0)
			goto nla_put_failure;

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->rt6i_siblings, rt6i_siblings) {
			if (rt6_add_nexthop(skb, sibling) < 0)
				goto nla_put_failure;
		}

		nla_nest_end(skb, mp);
	} else {
		/* single path: nexthop attributes go at the top level and
		 * the nexthop flags into rtm_flags
		 */
		if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
			goto nla_put_failure;
	}

	/* remaining lifetime in jiffies, 0 for routes that do not expire */
	expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;

	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
		goto nla_put_failure;

	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
		goto nla_put_failure;


	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	/* trim everything added since nlmsg_put() */
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
4333 
4334 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
4335 {
4336 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4337 	struct net *net = arg->net;
4338 
4339 	if (rt == net->ipv6.ip6_null_entry)
4340 		return 0;
4341 
4342 	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4343 		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4344 
4345 		/* user wants prefix routes only */
4346 		if (rtm->rtm_flags & RTM_F_PREFIX &&
4347 		    !(rt->rt6i_flags & RTF_PREFIX_RT)) {
4348 			/* success since this is not a prefix route */
4349 			return 1;
4350 		}
4351 	}
4352 
4353 	return rt6_fill_node(net,
4354 		     arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
4355 		     NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
4356 		     NLM_F_MULTI);
4357 }
4358 
4359 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4360 			      struct netlink_ext_ack *extack)
4361 {
4362 	struct net *net = sock_net(in_skb->sk);
4363 	struct nlattr *tb[RTA_MAX+1];
4364 	int err, iif = 0, oif = 0;
4365 	struct dst_entry *dst;
4366 	struct rt6_info *rt;
4367 	struct sk_buff *skb;
4368 	struct rtmsg *rtm;
4369 	struct flowi6 fl6;
4370 	bool fibmatch;
4371 
4372 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4373 			  extack);
4374 	if (err < 0)
4375 		goto errout;
4376 
4377 	err = -EINVAL;
4378 	memset(&fl6, 0, sizeof(fl6));
4379 	rtm = nlmsg_data(nlh);
4380 	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4381 	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4382 
4383 	if (tb[RTA_SRC]) {
4384 		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4385 			goto errout;
4386 
4387 		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4388 	}
4389 
4390 	if (tb[RTA_DST]) {
4391 		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4392 			goto errout;
4393 
4394 		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4395 	}
4396 
4397 	if (tb[RTA_IIF])
4398 		iif = nla_get_u32(tb[RTA_IIF]);
4399 
4400 	if (tb[RTA_OIF])
4401 		oif = nla_get_u32(tb[RTA_OIF]);
4402 
4403 	if (tb[RTA_MARK])
4404 		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4405 
4406 	if (tb[RTA_UID])
4407 		fl6.flowi6_uid = make_kuid(current_user_ns(),
4408 					   nla_get_u32(tb[RTA_UID]));
4409 	else
4410 		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4411 
4412 	if (iif) {
4413 		struct net_device *dev;
4414 		int flags = 0;
4415 
4416 		rcu_read_lock();
4417 
4418 		dev = dev_get_by_index_rcu(net, iif);
4419 		if (!dev) {
4420 			rcu_read_unlock();
4421 			err = -ENODEV;
4422 			goto errout;
4423 		}
4424 
4425 		fl6.flowi6_iif = iif;
4426 
4427 		if (!ipv6_addr_any(&fl6.saddr))
4428 			flags |= RT6_LOOKUP_F_HAS_SADDR;
4429 
4430 		dst = ip6_route_input_lookup(net, dev, &fl6, flags);
4431 
4432 		rcu_read_unlock();
4433 	} else {
4434 		fl6.flowi6_oif = oif;
4435 
4436 		dst = ip6_route_output(net, NULL, &fl6);
4437 	}
4438 
4439 
4440 	rt = container_of(dst, struct rt6_info, dst);
4441 	if (rt->dst.error) {
4442 		err = rt->dst.error;
4443 		ip6_rt_put(rt);
4444 		goto errout;
4445 	}
4446 
4447 	if (rt == net->ipv6.ip6_null_entry) {
4448 		err = rt->dst.error;
4449 		ip6_rt_put(rt);
4450 		goto errout;
4451 	}
4452 
4453 	if (fibmatch && rt->from) {
4454 		struct rt6_info *ort = rt->from;
4455 
4456 		dst_hold(&ort->dst);
4457 		ip6_rt_put(rt);
4458 		rt = ort;
4459 	}
4460 
4461 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
4462 	if (!skb) {
4463 		ip6_rt_put(rt);
4464 		err = -ENOBUFS;
4465 		goto errout;
4466 	}
4467 
4468 	skb_dst_set(skb, &rt->dst);
4469 	if (fibmatch)
4470 		err = rt6_fill_node(net, skb, rt, NULL, NULL, iif,
4471 				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4472 				    nlh->nlmsg_seq, 0);
4473 	else
4474 		err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
4475 				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4476 				    nlh->nlmsg_seq, 0);
4477 	if (err < 0) {
4478 		kfree_skb(skb);
4479 		goto errout;
4480 	}
4481 
4482 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
4483 errout:
4484 	return err;
4485 }
4486 
4487 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
4488 		     unsigned int nlm_flags)
4489 {
4490 	struct sk_buff *skb;
4491 	struct net *net = info->nl_net;
4492 	u32 seq;
4493 	int err;
4494 
4495 	err = -ENOBUFS;
4496 	seq = info->nlh ? info->nlh->nlmsg_seq : 0;
4497 
4498 	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4499 	if (!skb)
4500 		goto errout;
4501 
4502 	err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
4503 				event, info->portid, seq, nlm_flags);
4504 	if (err < 0) {
4505 		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4506 		WARN_ON(err == -EMSGSIZE);
4507 		kfree_skb(skb);
4508 		goto errout;
4509 	}
4510 	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4511 		    info->nlh, gfp_any());
4512 	return;
4513 errout:
4514 	if (err < 0)
4515 		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
4516 }
4517 
4518 static int ip6_route_dev_notify(struct notifier_block *this,
4519 				unsigned long event, void *ptr)
4520 {
4521 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4522 	struct net *net = dev_net(dev);
4523 
4524 	if (!(dev->flags & IFF_LOOPBACK))
4525 		return NOTIFY_OK;
4526 
4527 	if (event == NETDEV_REGISTER) {
4528 		net->ipv6.ip6_null_entry->dst.dev = dev;
4529 		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
4530 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4531 		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
4532 		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
4533 		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
4534 		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
4535 #endif
4536 	 } else if (event == NETDEV_UNREGISTER &&
4537 		    dev->reg_state != NETREG_UNREGISTERED) {
4538 		/* NETDEV_UNREGISTER could be fired for multiple times by
4539 		 * netdev_wait_allrefs(). Make sure we only call this once.
4540 		 */
4541 		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
4542 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4543 		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
4544 		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
4545 #endif
4546 	}
4547 
4548 	return NOTIFY_OK;
4549 }
4550 
4551 /*
4552  *	/proc
4553  */
4554 
4555 #ifdef CONFIG_PROC_FS
4556 
4557 static const struct file_operations ipv6_route_proc_fops = {
4558 	.owner		= THIS_MODULE,
4559 	.open		= ipv6_route_open,
4560 	.read		= seq_read,
4561 	.llseek		= seq_lseek,
4562 	.release	= seq_release_net,
4563 };
4564 
4565 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
4566 {
4567 	struct net *net = (struct net *)seq->private;
4568 	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
4569 		   net->ipv6.rt6_stats->fib_nodes,
4570 		   net->ipv6.rt6_stats->fib_route_nodes,
4571 		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
4572 		   net->ipv6.rt6_stats->fib_rt_entries,
4573 		   net->ipv6.rt6_stats->fib_rt_cache,
4574 		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
4575 		   net->ipv6.rt6_stats->fib_discarded_routes);
4576 
4577 	return 0;
4578 }
4579 
4580 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
4581 {
4582 	return single_open_net(inode, file, rt6_stats_seq_show);
4583 }
4584 
4585 static const struct file_operations rt6_stats_seq_fops = {
4586 	.owner	 = THIS_MODULE,
4587 	.open	 = rt6_stats_seq_open,
4588 	.read	 = seq_read,
4589 	.llseek	 = seq_lseek,
4590 	.release = single_release_net,
4591 };
4592 #endif	/* CONFIG_PROC_FS */
4593 
4594 #ifdef CONFIG_SYSCTL
4595 
4596 static
4597 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
4598 			      void __user *buffer, size_t *lenp, loff_t *ppos)
4599 {
4600 	struct net *net;
4601 	int delay;
4602 	if (!write)
4603 		return -EINVAL;
4604 
4605 	net = (struct net *)ctl->extra1;
4606 	delay = net->ipv6.sysctl.flush_delay;
4607 	proc_dointvec(ctl, write, buffer, lenp, ppos);
4608 	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
4609 	return 0;
4610 }
4611 
4612 struct ctl_table ipv6_route_table_template[] = {
4613 	{
4614 		.procname	=	"flush",
4615 		.data		=	&init_net.ipv6.sysctl.flush_delay,
4616 		.maxlen		=	sizeof(int),
4617 		.mode		=	0200,
4618 		.proc_handler	=	ipv6_sysctl_rtcache_flush
4619 	},
4620 	{
4621 		.procname	=	"gc_thresh",
4622 		.data		=	&ip6_dst_ops_template.gc_thresh,
4623 		.maxlen		=	sizeof(int),
4624 		.mode		=	0644,
4625 		.proc_handler	=	proc_dointvec,
4626 	},
4627 	{
4628 		.procname	=	"max_size",
4629 		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
4630 		.maxlen		=	sizeof(int),
4631 		.mode		=	0644,
4632 		.proc_handler	=	proc_dointvec,
4633 	},
4634 	{
4635 		.procname	=	"gc_min_interval",
4636 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
4637 		.maxlen		=	sizeof(int),
4638 		.mode		=	0644,
4639 		.proc_handler	=	proc_dointvec_jiffies,
4640 	},
4641 	{
4642 		.procname	=	"gc_timeout",
4643 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
4644 		.maxlen		=	sizeof(int),
4645 		.mode		=	0644,
4646 		.proc_handler	=	proc_dointvec_jiffies,
4647 	},
4648 	{
4649 		.procname	=	"gc_interval",
4650 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
4651 		.maxlen		=	sizeof(int),
4652 		.mode		=	0644,
4653 		.proc_handler	=	proc_dointvec_jiffies,
4654 	},
4655 	{
4656 		.procname	=	"gc_elasticity",
4657 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
4658 		.maxlen		=	sizeof(int),
4659 		.mode		=	0644,
4660 		.proc_handler	=	proc_dointvec,
4661 	},
4662 	{
4663 		.procname	=	"mtu_expires",
4664 		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
4665 		.maxlen		=	sizeof(int),
4666 		.mode		=	0644,
4667 		.proc_handler	=	proc_dointvec_jiffies,
4668 	},
4669 	{
4670 		.procname	=	"min_adv_mss",
4671 		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
4672 		.maxlen		=	sizeof(int),
4673 		.mode		=	0644,
4674 		.proc_handler	=	proc_dointvec,
4675 	},
4676 	{
4677 		.procname	=	"gc_min_interval_ms",
4678 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
4679 		.maxlen		=	sizeof(int),
4680 		.mode		=	0644,
4681 		.proc_handler	=	proc_dointvec_ms_jiffies,
4682 	},
4683 	{ }
4684 };
4685 
4686 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
4687 {
4688 	struct ctl_table *table;
4689 
4690 	table = kmemdup(ipv6_route_table_template,
4691 			sizeof(ipv6_route_table_template),
4692 			GFP_KERNEL);
4693 
4694 	if (table) {
4695 		table[0].data = &net->ipv6.sysctl.flush_delay;
4696 		table[0].extra1 = net;
4697 		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
4698 		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
4699 		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
4700 		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
4701 		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
4702 		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
4703 		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
4704 		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
4705 		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
4706 
4707 		/* Don't export sysctls to unprivileged users */
4708 		if (net->user_ns != &init_user_ns)
4709 			table[0].procname = NULL;
4710 	}
4711 
4712 	return table;
4713 }
4714 #endif
4715 
4716 static int __net_init ip6_route_net_init(struct net *net)
4717 {
4718 	int ret = -ENOMEM;
4719 
4720 	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
4721 	       sizeof(net->ipv6.ip6_dst_ops));
4722 
4723 	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
4724 		goto out_ip6_dst_ops;
4725 
4726 	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
4727 					   sizeof(*net->ipv6.ip6_null_entry),
4728 					   GFP_KERNEL);
4729 	if (!net->ipv6.ip6_null_entry)
4730 		goto out_ip6_dst_entries;
4731 	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4732 	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
4733 			 ip6_template_metrics, true);
4734 
4735 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4736 	net->ipv6.fib6_has_custom_rules = false;
4737 	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
4738 					       sizeof(*net->ipv6.ip6_prohibit_entry),
4739 					       GFP_KERNEL);
4740 	if (!net->ipv6.ip6_prohibit_entry)
4741 		goto out_ip6_null_entry;
4742 	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4743 	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
4744 			 ip6_template_metrics, true);
4745 
4746 	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
4747 					       sizeof(*net->ipv6.ip6_blk_hole_entry),
4748 					       GFP_KERNEL);
4749 	if (!net->ipv6.ip6_blk_hole_entry)
4750 		goto out_ip6_prohibit_entry;
4751 	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4752 	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
4753 			 ip6_template_metrics, true);
4754 #endif
4755 
4756 	net->ipv6.sysctl.flush_delay = 0;
4757 	net->ipv6.sysctl.ip6_rt_max_size = 4096;
4758 	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
4759 	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
4760 	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
4761 	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
4762 	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
4763 	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
4764 
4765 	net->ipv6.ip6_rt_gc_expire = 30*HZ;
4766 
4767 	ret = 0;
4768 out:
4769 	return ret;
4770 
4771 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4772 out_ip6_prohibit_entry:
4773 	kfree(net->ipv6.ip6_prohibit_entry);
4774 out_ip6_null_entry:
4775 	kfree(net->ipv6.ip6_null_entry);
4776 #endif
4777 out_ip6_dst_entries:
4778 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
4779 out_ip6_dst_ops:
4780 	goto out;
4781 }
4782 
4783 static void __net_exit ip6_route_net_exit(struct net *net)
4784 {
4785 	kfree(net->ipv6.ip6_null_entry);
4786 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4787 	kfree(net->ipv6.ip6_prohibit_entry);
4788 	kfree(net->ipv6.ip6_blk_hole_entry);
4789 #endif
4790 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
4791 }
4792 
4793 static int __net_init ip6_route_net_init_late(struct net *net)
4794 {
4795 #ifdef CONFIG_PROC_FS
4796 	proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
4797 	proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
4798 #endif
4799 	return 0;
4800 }
4801 
4802 static void __net_exit ip6_route_net_exit_late(struct net *net)
4803 {
4804 #ifdef CONFIG_PROC_FS
4805 	remove_proc_entry("ipv6_route", net->proc_net);
4806 	remove_proc_entry("rt6_stats", net->proc_net);
4807 #endif
4808 }
4809 
4810 static struct pernet_operations ip6_route_net_ops = {
4811 	.init = ip6_route_net_init,
4812 	.exit = ip6_route_net_exit,
4813 };
4814 
4815 static int __net_init ipv6_inetpeer_init(struct net *net)
4816 {
4817 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
4818 
4819 	if (!bp)
4820 		return -ENOMEM;
4821 	inet_peer_base_init(bp);
4822 	net->ipv6.peers = bp;
4823 	return 0;
4824 }
4825 
4826 static void __net_exit ipv6_inetpeer_exit(struct net *net)
4827 {
4828 	struct inet_peer_base *bp = net->ipv6.peers;
4829 
4830 	net->ipv6.peers = NULL;
4831 	inetpeer_invalidate_tree(bp);
4832 	kfree(bp);
4833 }
4834 
4835 static struct pernet_operations ipv6_inetpeer_ops = {
4836 	.init	=	ipv6_inetpeer_init,
4837 	.exit	=	ipv6_inetpeer_exit,
4838 };
4839 
4840 static struct pernet_operations ip6_route_net_late_ops = {
4841 	.init = ip6_route_net_init_late,
4842 	.exit = ip6_route_net_exit_late,
4843 };
4844 
4845 static struct notifier_block ip6_route_dev_notifier = {
4846 	.notifier_call = ip6_route_dev_notify,
4847 	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
4848 };
4849 
4850 void __init ip6_route_init_special_entries(void)
4851 {
4852 	/* Registering of the loopback is done before this portion of code,
4853 	 * the loopback reference in rt6_info will not be taken, do it
4854 	 * manually for init_net */
4855 	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
4856 	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4857   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4858 	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
4859 	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4860 	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
4861 	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4862   #endif
4863 }
4864 
4865 int __init ip6_route_init(void)
4866 {
4867 	int ret;
4868 	int cpu;
4869 
4870 	ret = -ENOMEM;
4871 	ip6_dst_ops_template.kmem_cachep =
4872 		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
4873 				  SLAB_HWCACHE_ALIGN, NULL);
4874 	if (!ip6_dst_ops_template.kmem_cachep)
4875 		goto out;
4876 
4877 	ret = dst_entries_init(&ip6_dst_blackhole_ops);
4878 	if (ret)
4879 		goto out_kmem_cache;
4880 
4881 	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
4882 	if (ret)
4883 		goto out_dst_entries;
4884 
4885 	ret = register_pernet_subsys(&ip6_route_net_ops);
4886 	if (ret)
4887 		goto out_register_inetpeer;
4888 
4889 	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
4890 
4891 	ret = fib6_init();
4892 	if (ret)
4893 		goto out_register_subsys;
4894 
4895 	ret = xfrm6_init();
4896 	if (ret)
4897 		goto out_fib6_init;
4898 
4899 	ret = fib6_rules_init();
4900 	if (ret)
4901 		goto xfrm6_init;
4902 
4903 	ret = register_pernet_subsys(&ip6_route_net_late_ops);
4904 	if (ret)
4905 		goto fib6_rules_init;
4906 
4907 	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
4908 				   inet6_rtm_newroute, NULL, 0);
4909 	if (ret < 0)
4910 		goto out_register_late_subsys;
4911 
4912 	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
4913 				   inet6_rtm_delroute, NULL, 0);
4914 	if (ret < 0)
4915 		goto out_register_late_subsys;
4916 
4917 	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
4918 				   inet6_rtm_getroute, NULL,
4919 				   RTNL_FLAG_DOIT_UNLOCKED);
4920 	if (ret < 0)
4921 		goto out_register_late_subsys;
4922 
4923 	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
4924 	if (ret)
4925 		goto out_register_late_subsys;
4926 
4927 	for_each_possible_cpu(cpu) {
4928 		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
4929 
4930 		INIT_LIST_HEAD(&ul->head);
4931 		spin_lock_init(&ul->lock);
4932 	}
4933 
4934 out:
4935 	return ret;
4936 
4937 out_register_late_subsys:
4938 	rtnl_unregister_all(PF_INET6);
4939 	unregister_pernet_subsys(&ip6_route_net_late_ops);
4940 fib6_rules_init:
4941 	fib6_rules_cleanup();
4942 xfrm6_init:
4943 	xfrm6_fini();
4944 out_fib6_init:
4945 	fib6_gc_cleanup();
4946 out_register_subsys:
4947 	unregister_pernet_subsys(&ip6_route_net_ops);
4948 out_register_inetpeer:
4949 	unregister_pernet_subsys(&ipv6_inetpeer_ops);
4950 out_dst_entries:
4951 	dst_entries_destroy(&ip6_dst_blackhole_ops);
4952 out_kmem_cache:
4953 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
4954 	goto out;
4955 }
4956 
4957 void ip6_route_cleanup(void)
4958 {
4959 	unregister_netdevice_notifier(&ip6_route_dev_notifier);
4960 	unregister_pernet_subsys(&ip6_route_net_late_ops);
4961 	fib6_rules_cleanup();
4962 	xfrm6_fini();
4963 	fib6_gc_cleanup();
4964 	unregister_pernet_subsys(&ipv6_inetpeer_ops);
4965 	unregister_pernet_subsys(&ip6_route_net_ops);
4966 	dst_entries_destroy(&ip6_dst_blackhole_ops);
4967 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
4968 }
4969