xref: /openbmc/linux/net/ipv6/route.c (revision 93707cbabcc8baf2b2b5f4a99c1f08ee83eb7abd)
1 /*
2  *	Linux INET6 implementation
3  *	FIB front-end.
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13 
14 /*	Changes:
15  *
16  *	YOSHIFUJI Hideaki @USAGI
17  *		reworked default router selection.
18  *		- respect outgoing interface
19  *		- select from (probably) reachable routers (i.e.
20  *		routers in REACHABLE, STALE, DELAY or PROBE states).
21  *		- always select the same router if it is (probably)
22  *		reachable.  otherwise, round-robin the list.
23  *	Ville Nuorvala
24  *		Fixed routing subtrees.
25  */
26 
27 #define pr_fmt(fmt) "IPv6: " fmt
28 
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <trace/events/fib6.h>
67 
68 #include <linux/uaccess.h>
69 
70 #ifdef CONFIG_SYSCTL
71 #include <linux/sysctl.h>
72 #endif
73 
74 enum rt6_nud_state {
75 	RT6_NUD_FAIL_HARD = -3,
76 	RT6_NUD_FAIL_PROBE = -2,
77 	RT6_NUD_FAIL_DO_RR = -1,
78 	RT6_NUD_SUCCEED = 1
79 };
80 
81 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
82 static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
83 static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
84 static unsigned int	 ip6_mtu(const struct dst_entry *dst);
85 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
86 static void		ip6_dst_destroy(struct dst_entry *);
87 static void		ip6_dst_ifdown(struct dst_entry *,
88 				       struct net_device *dev, int how);
89 static int		 ip6_dst_gc(struct dst_ops *ops);
90 
91 static int		ip6_pkt_discard(struct sk_buff *skb);
92 static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
93 static int		ip6_pkt_prohibit(struct sk_buff *skb);
94 static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
95 static void		ip6_link_failure(struct sk_buff *skb);
96 static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
97 					   struct sk_buff *skb, u32 mtu);
98 static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
99 					struct sk_buff *skb);
100 static void		rt6_dst_from_metrics_check(struct rt6_info *rt);
101 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
102 static size_t rt6_nlmsg_size(struct rt6_info *rt);
103 static int rt6_fill_node(struct net *net,
104 			 struct sk_buff *skb, struct rt6_info *rt,
105 			 struct in6_addr *dst, struct in6_addr *src,
106 			 int iif, int type, u32 portid, u32 seq,
107 			 unsigned int flags);
108 static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
109 					   struct in6_addr *daddr,
110 					   struct in6_addr *saddr);
111 
112 #ifdef CONFIG_IPV6_ROUTE_INFO
113 static struct rt6_info *rt6_add_route_info(struct net *net,
114 					   const struct in6_addr *prefix, int prefixlen,
115 					   const struct in6_addr *gwaddr,
116 					   struct net_device *dev,
117 					   unsigned int pref);
118 static struct rt6_info *rt6_get_route_info(struct net *net,
119 					   const struct in6_addr *prefix, int prefixlen,
120 					   const struct in6_addr *gwaddr,
121 					   struct net_device *dev);
122 #endif
123 
124 struct uncached_list {
125 	spinlock_t		lock;
126 	struct list_head	head;
127 };
128 
129 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
130 
131 static void rt6_uncached_list_add(struct rt6_info *rt)
132 {
133 	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
134 
135 	rt->rt6i_uncached_list = ul;
136 
137 	spin_lock_bh(&ul->lock);
138 	list_add_tail(&rt->rt6i_uncached, &ul->head);
139 	spin_unlock_bh(&ul->lock);
140 }
141 
142 static void rt6_uncached_list_del(struct rt6_info *rt)
143 {
144 	if (!list_empty(&rt->rt6i_uncached)) {
145 		struct uncached_list *ul = rt->rt6i_uncached_list;
146 		struct net *net = dev_net(rt->dst.dev);
147 
148 		spin_lock_bh(&ul->lock);
149 		list_del(&rt->rt6i_uncached);
150 		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
151 		spin_unlock_bh(&ul->lock);
152 	}
153 }
154 
155 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
156 {
157 	struct net_device *loopback_dev = net->loopback_dev;
158 	int cpu;
159 
160 	if (dev == loopback_dev)
161 		return;
162 
163 	for_each_possible_cpu(cpu) {
164 		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
165 		struct rt6_info *rt;
166 
167 		spin_lock_bh(&ul->lock);
168 		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
169 			struct inet6_dev *rt_idev = rt->rt6i_idev;
170 			struct net_device *rt_dev = rt->dst.dev;
171 
172 			if (rt_idev->dev == dev) {
173 				rt->rt6i_idev = in6_dev_get(loopback_dev);
174 				in6_dev_put(rt_idev);
175 			}
176 
177 			if (rt_dev == dev) {
178 				rt->dst.dev = loopback_dev;
179 				dev_hold(rt->dst.dev);
180 				dev_put(rt_dev);
181 			}
182 		}
183 		spin_unlock_bh(&ul->lock);
184 	}
185 }
186 
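/* The three helpers above implement a common per-cpu list pattern: each
 * CPU owns a BH-safe spinlock and list head, additions go to the local
 * CPU's list, and a flush walks every possible CPU. A minimal stand-alone
 * sketch of the same pattern (names here are hypothetical, not part of
 * this file):
 *
 *	struct pcpu_list {
 *		spinlock_t		lock;
 *		struct list_head	head;
 *	};
 *	static DEFINE_PER_CPU_ALIGNED(struct pcpu_list, my_lists);
 *
 *	static void my_list_add(struct list_head *node)
 *	{
 *		struct pcpu_list *l = raw_cpu_ptr(&my_lists);
 *
 *		spin_lock_bh(&l->lock);
 *		list_add_tail(node, &l->head);
 *		spin_unlock_bh(&l->lock);
 *	}
 */
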
187 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
188 {
189 	return dst_metrics_write_ptr(&rt->from->dst);
190 }
191 
192 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
193 {
194 	struct rt6_info *rt = (struct rt6_info *)dst;
195 
196 	if (rt->rt6i_flags & RTF_PCPU)
197 		return rt6_pcpu_cow_metrics(rt);
198 	else if (rt->rt6i_flags & RTF_CACHE)
199 		return NULL;
200 	else
201 		return dst_cow_metrics_generic(dst, old);
202 }
203 
204 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
205 					     struct sk_buff *skb,
206 					     const void *daddr)
207 {
208 	struct in6_addr *p = &rt->rt6i_gateway;
209 
210 	if (!ipv6_addr_any(p))
211 		return (const void *) p;
212 	else if (skb)
213 		return &ipv6_hdr(skb)->daddr;
214 	return daddr;
215 }
216 
217 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
218 					  struct sk_buff *skb,
219 					  const void *daddr)
220 {
221 	struct rt6_info *rt = (struct rt6_info *) dst;
222 	struct neighbour *n;
223 
224 	daddr = choose_neigh_daddr(rt, skb, daddr);
225 	n = __ipv6_neigh_lookup(dst->dev, daddr);
226 	if (n)
227 		return n;
228 	return neigh_create(&nd_tbl, daddr, dst->dev);
229 }
230 
231 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
232 {
233 	struct net_device *dev = dst->dev;
234 	struct rt6_info *rt = (struct rt6_info *)dst;
235 
236 	daddr = choose_neigh_daddr(rt, NULL, daddr);
237 	if (!daddr)
238 		return;
239 	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
240 		return;
241 	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
242 		return;
243 	__ipv6_confirm_neigh(dev, daddr);
244 }
245 
246 static struct dst_ops ip6_dst_ops_template = {
247 	.family			=	AF_INET6,
248 	.gc			=	ip6_dst_gc,
249 	.gc_thresh		=	1024,
250 	.check			=	ip6_dst_check,
251 	.default_advmss		=	ip6_default_advmss,
252 	.mtu			=	ip6_mtu,
253 	.cow_metrics		=	ipv6_cow_metrics,
254 	.destroy		=	ip6_dst_destroy,
255 	.ifdown			=	ip6_dst_ifdown,
256 	.negative_advice	=	ip6_negative_advice,
257 	.link_failure		=	ip6_link_failure,
258 	.update_pmtu		=	ip6_rt_update_pmtu,
259 	.redirect		=	rt6_do_redirect,
260 	.local_out		=	__ip6_local_out,
261 	.neigh_lookup		=	ip6_neigh_lookup,
262 	.confirm_neigh		=	ip6_confirm_neigh,
263 };
264 
265 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
266 {
267 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
268 
269 	return mtu ? : dst->dev->mtu;
270 }
271 
272 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
273 					 struct sk_buff *skb, u32 mtu)
274 {
275 }
276 
277 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
278 				      struct sk_buff *skb)
279 {
280 }
281 
282 static struct dst_ops ip6_dst_blackhole_ops = {
283 	.family			=	AF_INET6,
284 	.destroy		=	ip6_dst_destroy,
285 	.check			=	ip6_dst_check,
286 	.mtu			=	ip6_blackhole_mtu,
287 	.default_advmss		=	ip6_default_advmss,
288 	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
289 	.redirect		=	ip6_rt_blackhole_redirect,
290 	.cow_metrics		=	dst_cow_metrics_generic,
291 	.neigh_lookup		=	ip6_neigh_lookup,
292 };
293 
294 static const u32 ip6_template_metrics[RTAX_MAX] = {
295 	[RTAX_HOPLIMIT - 1] = 0,
296 };
297 
298 static const struct rt6_info ip6_null_entry_template = {
299 	.dst = {
300 		.__refcnt	= ATOMIC_INIT(1),
301 		.__use		= 1,
302 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
303 		.error		= -ENETUNREACH,
304 		.input		= ip6_pkt_discard,
305 		.output		= ip6_pkt_discard_out,
306 	},
307 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
308 	.rt6i_protocol  = RTPROT_KERNEL,
309 	.rt6i_metric	= ~(u32) 0,
310 	.rt6i_ref	= ATOMIC_INIT(1),
311 };
312 
313 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
314 
315 static const struct rt6_info ip6_prohibit_entry_template = {
316 	.dst = {
317 		.__refcnt	= ATOMIC_INIT(1),
318 		.__use		= 1,
319 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
320 		.error		= -EACCES,
321 		.input		= ip6_pkt_prohibit,
322 		.output		= ip6_pkt_prohibit_out,
323 	},
324 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
325 	.rt6i_protocol  = RTPROT_KERNEL,
326 	.rt6i_metric	= ~(u32) 0,
327 	.rt6i_ref	= ATOMIC_INIT(1),
328 };
329 
330 static const struct rt6_info ip6_blk_hole_entry_template = {
331 	.dst = {
332 		.__refcnt	= ATOMIC_INIT(1),
333 		.__use		= 1,
334 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
335 		.error		= -EINVAL,
336 		.input		= dst_discard,
337 		.output		= dst_discard_out,
338 	},
339 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
340 	.rt6i_protocol  = RTPROT_KERNEL,
341 	.rt6i_metric	= ~(u32) 0,
342 	.rt6i_ref	= ATOMIC_INIT(1),
343 };
344 
345 #endif
346 
347 static void rt6_info_init(struct rt6_info *rt)
348 {
349 	struct dst_entry *dst = &rt->dst;
350 
351 	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
352 	INIT_LIST_HEAD(&rt->rt6i_siblings);
353 	INIT_LIST_HEAD(&rt->rt6i_uncached);
354 }
355 
356 /* allocate dst with ip6_dst_ops */
357 static struct rt6_info *__ip6_dst_alloc(struct net *net,
358 					struct net_device *dev,
359 					int flags)
360 {
361 	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
362 					1, DST_OBSOLETE_FORCE_CHK, flags);
363 
364 	if (rt) {
365 		rt6_info_init(rt);
366 		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
367 	}
368 
369 	return rt;
370 }
371 
372 struct rt6_info *ip6_dst_alloc(struct net *net,
373 			       struct net_device *dev,
374 			       int flags)
375 {
376 	struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
377 
378 	if (rt) {
379 		rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
380 		if (!rt->rt6i_pcpu) {
381 			dst_release_immediate(&rt->dst);
382 			return NULL;
383 		}
384 	}
385 
386 	return rt;
387 }
388 EXPORT_SYMBOL(ip6_dst_alloc);
389 
390 static void ip6_dst_destroy(struct dst_entry *dst)
391 {
392 	struct rt6_info *rt = (struct rt6_info *)dst;
393 	struct rt6_exception_bucket *bucket;
394 	struct rt6_info *from = rt->from;
395 	struct inet6_dev *idev;
396 
397 	dst_destroy_metrics_generic(dst);
398 	free_percpu(rt->rt6i_pcpu);
399 	rt6_uncached_list_del(rt);
400 
401 	idev = rt->rt6i_idev;
402 	if (idev) {
403 		rt->rt6i_idev = NULL;
404 		in6_dev_put(idev);
405 	}
406 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1);
407 	if (bucket) {
408 		rt->rt6i_exception_bucket = NULL;
409 		kfree(bucket);
410 	}
411 
412 	rt->from = NULL;
413 	dst_release(&from->dst);
414 }
415 
416 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
417 			   int how)
418 {
419 	struct rt6_info *rt = (struct rt6_info *)dst;
420 	struct inet6_dev *idev = rt->rt6i_idev;
421 	struct net_device *loopback_dev =
422 		dev_net(dev)->loopback_dev;
423 
424 	if (idev && idev->dev != loopback_dev) {
425 		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
426 		if (loopback_idev) {
427 			rt->rt6i_idev = loopback_idev;
428 			in6_dev_put(idev);
429 		}
430 	}
431 }
432 
433 static bool __rt6_check_expired(const struct rt6_info *rt)
434 {
435 	if (rt->rt6i_flags & RTF_EXPIRES)
436 		return time_after(jiffies, rt->dst.expires);
437 	else
438 		return false;
439 }
440 
441 static bool rt6_check_expired(const struct rt6_info *rt)
442 {
443 	if (rt->rt6i_flags & RTF_EXPIRES) {
444 		if (time_after(jiffies, rt->dst.expires))
445 			return true;
446 	} else if (rt->from) {
447 		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
448 			rt6_check_expired(rt->from);
449 	}
450 	return false;
451 }
452 
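/* Both expiry helpers above use the kernel's wrap-safe jiffies comparison
 * macros rather than a plain "<". Minimal illustrative sketch:
 *
 *	unsigned long expires = jiffies + 30 * HZ;	// 30 second lifetime
 *
 *	if (time_after(jiffies, expires))
 *		;	// expired; correct even across a jiffies wrap
 */
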
453 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
454 					     struct flowi6 *fl6, int oif,
455 					     int strict)
456 {
457 	struct rt6_info *sibling, *next_sibling;
458 
459 	/* We might have already computed the hash for ICMPv6 errors. In such
460 	 * a case it is always non-zero; otherwise, now is the time to do it.
461 	 */
462 	if (!fl6->mp_hash)
463 		fl6->mp_hash = rt6_multipath_hash(fl6, NULL);
464 
465 	if (fl6->mp_hash <= atomic_read(&match->rt6i_nh_upper_bound))
466 		return match;
467 
468 	list_for_each_entry_safe(sibling, next_sibling, &match->rt6i_siblings,
469 				 rt6i_siblings) {
470 		if (fl6->mp_hash > atomic_read(&sibling->rt6i_nh_upper_bound))
471 			continue;
472 		if (rt6_score_route(sibling, oif, strict) < 0)
473 			break;
474 		match = sibling;
475 		break;
476 	}
477 
478 	return match;
479 }
480 
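/* rt6_multipath_select() above is a hash-threshold nexthop choice: every
 * sibling route carries an upper bound in hash space (rt6i_nh_upper_bound)
 * and the first nexthop whose bound covers the flow hash is used.
 * Illustrative stand-alone sketch with a hypothetical array of bounds:
 *
 *	static int select_nexthop(const u32 *upper_bound, int n, u32 hash)
 *	{
 *		int i;
 *
 *		for (i = 0; i < n; i++)
 *			if (hash <= upper_bound[i])
 *				return i;
 *		return 0;	// fall back to the first nexthop
 *	}
 */
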
481 /*
482  *	Route lookup. rcu_read_lock() should be held.
483  */
484 
485 static inline struct rt6_info *rt6_device_match(struct net *net,
486 						    struct rt6_info *rt,
487 						    const struct in6_addr *saddr,
488 						    int oif,
489 						    int flags)
490 {
491 	struct rt6_info *local = NULL;
492 	struct rt6_info *sprt;
493 
494 	if (!oif && ipv6_addr_any(saddr) && !(rt->rt6i_nh_flags & RTNH_F_DEAD))
495 		return rt;
496 
497 	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->rt6_next)) {
498 		struct net_device *dev = sprt->dst.dev;
499 
500 		if (sprt->rt6i_nh_flags & RTNH_F_DEAD)
501 			continue;
502 
503 		if (oif) {
504 			if (dev->ifindex == oif)
505 				return sprt;
506 			if (dev->flags & IFF_LOOPBACK) {
507 				if (!sprt->rt6i_idev ||
508 				    sprt->rt6i_idev->dev->ifindex != oif) {
509 					if (flags & RT6_LOOKUP_F_IFACE)
510 						continue;
511 					if (local &&
512 					    local->rt6i_idev->dev->ifindex == oif)
513 						continue;
514 				}
515 				local = sprt;
516 			}
517 		} else {
518 			if (ipv6_chk_addr(net, saddr, dev,
519 					  flags & RT6_LOOKUP_F_IFACE))
520 				return sprt;
521 		}
522 	}
523 
524 	if (oif) {
525 		if (local)
526 			return local;
527 
528 		if (flags & RT6_LOOKUP_F_IFACE)
529 			return net->ipv6.ip6_null_entry;
530 	}
531 
532 	return rt->rt6i_nh_flags & RTNH_F_DEAD ? net->ipv6.ip6_null_entry : rt;
533 }
534 
535 #ifdef CONFIG_IPV6_ROUTER_PREF
536 struct __rt6_probe_work {
537 	struct work_struct work;
538 	struct in6_addr target;
539 	struct net_device *dev;
540 };
541 
542 static void rt6_probe_deferred(struct work_struct *w)
543 {
544 	struct in6_addr mcaddr;
545 	struct __rt6_probe_work *work =
546 		container_of(w, struct __rt6_probe_work, work);
547 
548 	addrconf_addr_solict_mult(&work->target, &mcaddr);
549 	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
550 	dev_put(work->dev);
551 	kfree(work);
552 }
553 
554 static void rt6_probe(struct rt6_info *rt)
555 {
556 	struct __rt6_probe_work *work;
557 	struct neighbour *neigh;
558 	/*
559 	 * Okay, this does not seem to be appropriate
560 	 * Okay, this does not seem appropriate for now;
561 	 * however, we need to check whether it really is.
562 	 * This is Router Reachability Probing.
563 	 * Router Reachability Probe MUST be rate-limited
564 	 * to no more than one per minute.
565 	 */
566 	if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
567 		return;
568 	rcu_read_lock_bh();
569 	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
570 	if (neigh) {
571 		if (neigh->nud_state & NUD_VALID)
572 			goto out;
573 
574 		work = NULL;
575 		write_lock(&neigh->lock);
576 		if (!(neigh->nud_state & NUD_VALID) &&
577 		    time_after(jiffies,
578 			       neigh->updated +
579 			       rt->rt6i_idev->cnf.rtr_probe_interval)) {
580 			work = kmalloc(sizeof(*work), GFP_ATOMIC);
581 			if (work)
582 				__neigh_set_probe_once(neigh);
583 		}
584 		write_unlock(&neigh->lock);
585 	} else {
586 		work = kmalloc(sizeof(*work), GFP_ATOMIC);
587 	}
588 
589 	if (work) {
590 		INIT_WORK(&work->work, rt6_probe_deferred);
591 		work->target = rt->rt6i_gateway;
592 		dev_hold(rt->dst.dev);
593 		work->dev = rt->dst.dev;
594 		schedule_work(&work->work);
595 	}
596 
597 out:
598 	rcu_read_unlock_bh();
599 }
600 #else
601 static inline void rt6_probe(struct rt6_info *rt)
602 {
603 }
604 #endif
605 
606 /*
607  * Default Router Selection (RFC 2461 6.3.6)
608  */
609 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
610 {
611 	struct net_device *dev = rt->dst.dev;
612 	if (!oif || dev->ifindex == oif)
613 		return 2;
614 	if ((dev->flags & IFF_LOOPBACK) &&
615 	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
616 		return 1;
617 	return 0;
618 }
619 
620 static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
621 {
622 	struct neighbour *neigh;
623 	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
624 
625 	if (rt->rt6i_flags & RTF_NONEXTHOP ||
626 	    !(rt->rt6i_flags & RTF_GATEWAY))
627 		return RT6_NUD_SUCCEED;
628 
629 	rcu_read_lock_bh();
630 	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
631 	if (neigh) {
632 		read_lock(&neigh->lock);
633 		if (neigh->nud_state & NUD_VALID)
634 			ret = RT6_NUD_SUCCEED;
635 #ifdef CONFIG_IPV6_ROUTER_PREF
636 		else if (!(neigh->nud_state & NUD_FAILED))
637 			ret = RT6_NUD_SUCCEED;
638 		else
639 			ret = RT6_NUD_FAIL_PROBE;
640 #endif
641 		read_unlock(&neigh->lock);
642 	} else {
643 		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
644 		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
645 	}
646 	rcu_read_unlock_bh();
647 
648 	return ret;
649 }
650 
651 static int rt6_score_route(struct rt6_info *rt, int oif,
652 			   int strict)
653 {
654 	int m;
655 
656 	m = rt6_check_dev(rt, oif);
657 	if (!m && (strict & RT6_LOOKUP_F_IFACE))
658 		return RT6_NUD_FAIL_HARD;
659 #ifdef CONFIG_IPV6_ROUTER_PREF
660 	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
661 #endif
662 	if (strict & RT6_LOOKUP_F_REACHABLE) {
663 		int n = rt6_check_neigh(rt);
664 		if (n < 0)
665 			return n;
666 	}
667 	return m;
668 }
669 
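/* Worked example of rt6_score_route(): an exact oif match contributes 2
 * (a loopback-only match contributes 1), and with CONFIG_IPV6_ROUTER_PREF
 * the decoded two-bit router preference p is folded in as m = 2 | (p << 2),
 * so preference always outranks the device score. When
 * RT6_LOOKUP_F_REACHABLE is set, a negative rt6_nud_state returned by
 * rt6_check_neigh() vetoes the route regardless of m.
 */
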
670 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
671 				   int *mpri, struct rt6_info *match,
672 				   bool *do_rr)
673 {
674 	int m;
675 	bool match_do_rr = false;
676 	struct inet6_dev *idev = rt->rt6i_idev;
677 
678 	if (rt->rt6i_nh_flags & RTNH_F_DEAD)
679 		goto out;
680 
681 	if (idev->cnf.ignore_routes_with_linkdown &&
682 	    rt->rt6i_nh_flags & RTNH_F_LINKDOWN &&
683 	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
684 		goto out;
685 
686 	if (rt6_check_expired(rt))
687 		goto out;
688 
689 	m = rt6_score_route(rt, oif, strict);
690 	if (m == RT6_NUD_FAIL_DO_RR) {
691 		match_do_rr = true;
692 		m = 0; /* lowest valid score */
693 	} else if (m == RT6_NUD_FAIL_HARD) {
694 		goto out;
695 	}
696 
697 	if (strict & RT6_LOOKUP_F_REACHABLE)
698 		rt6_probe(rt);
699 
700 	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
701 	if (m > *mpri) {
702 		*do_rr = match_do_rr;
703 		*mpri = m;
704 		match = rt;
705 	}
706 out:
707 	return match;
708 }
709 
710 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
711 				     struct rt6_info *leaf,
712 				     struct rt6_info *rr_head,
713 				     u32 metric, int oif, int strict,
714 				     bool *do_rr)
715 {
716 	struct rt6_info *rt, *match, *cont;
717 	int mpri = -1;
718 
719 	match = NULL;
720 	cont = NULL;
721 	for (rt = rr_head; rt; rt = rcu_dereference(rt->rt6_next)) {
722 		if (rt->rt6i_metric != metric) {
723 			cont = rt;
724 			break;
725 		}
726 
727 		match = find_match(rt, oif, strict, &mpri, match, do_rr);
728 	}
729 
730 	for (rt = leaf; rt && rt != rr_head;
731 	     rt = rcu_dereference(rt->rt6_next)) {
732 		if (rt->rt6i_metric != metric) {
733 			cont = rt;
734 			break;
735 		}
736 
737 		match = find_match(rt, oif, strict, &mpri, match, do_rr);
738 	}
739 
740 	if (match || !cont)
741 		return match;
742 
743 	for (rt = cont; rt; rt = rcu_dereference(rt->rt6_next))
744 		match = find_match(rt, oif, strict, &mpri, match, do_rr);
745 
746 	return match;
747 }
748 
749 static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn,
750 				   int oif, int strict)
751 {
752 	struct rt6_info *leaf = rcu_dereference(fn->leaf);
753 	struct rt6_info *match, *rt0;
754 	bool do_rr = false;
755 	int key_plen;
756 
757 	if (!leaf || leaf == net->ipv6.ip6_null_entry)
758 		return net->ipv6.ip6_null_entry;
759 
760 	rt0 = rcu_dereference(fn->rr_ptr);
761 	if (!rt0)
762 		rt0 = leaf;
763 
764 	/* Double check to make sure fn is not an intermediate node
765 	 * and fn->leaf does not point to its child's leaf
766 	 * (This might happen if all routes under fn are deleted from
767 	 * the tree and fib6_repair_tree() is called on the node.)
768 	 */
769 	key_plen = rt0->rt6i_dst.plen;
770 #ifdef CONFIG_IPV6_SUBTREES
771 	if (rt0->rt6i_src.plen)
772 		key_plen = rt0->rt6i_src.plen;
773 #endif
774 	if (fn->fn_bit != key_plen)
775 		return net->ipv6.ip6_null_entry;
776 
777 	match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict,
778 			     &do_rr);
779 
780 	if (do_rr) {
781 		struct rt6_info *next = rcu_dereference(rt0->rt6_next);
782 
783 		/* no entries matched; do round-robin */
784 		if (!next || next->rt6i_metric != rt0->rt6i_metric)
785 			next = leaf;
786 
787 		if (next != rt0) {
788 			spin_lock_bh(&leaf->rt6i_table->tb6_lock);
789 			/* make sure next is not being deleted from the tree */
790 			if (next->rt6i_node)
791 				rcu_assign_pointer(fn->rr_ptr, next);
792 			spin_unlock_bh(&leaf->rt6i_table->tb6_lock);
793 		}
794 	}
795 
796 	return match ? match : net->ipv6.ip6_null_entry;
797 }
798 
799 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
800 {
801 	return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
802 }
803 
804 #ifdef CONFIG_IPV6_ROUTE_INFO
805 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
806 		  const struct in6_addr *gwaddr)
807 {
808 	struct net *net = dev_net(dev);
809 	struct route_info *rinfo = (struct route_info *) opt;
810 	struct in6_addr prefix_buf, *prefix;
811 	unsigned int pref;
812 	unsigned long lifetime;
813 	struct rt6_info *rt;
814 
815 	if (len < sizeof(struct route_info)) {
816 		return -EINVAL;
817 	}
818 
819 	/* Sanity check for prefix_len and length */
820 	if (rinfo->length > 3) {
821 		return -EINVAL;
822 	} else if (rinfo->prefix_len > 128) {
823 		return -EINVAL;
824 	} else if (rinfo->prefix_len > 64) {
825 		if (rinfo->length < 2) {
826 			return -EINVAL;
827 		}
828 	} else if (rinfo->prefix_len > 0) {
829 		if (rinfo->length < 1) {
830 			return -EINVAL;
831 		}
832 	}
833 
834 	pref = rinfo->route_pref;
835 	if (pref == ICMPV6_ROUTER_PREF_INVALID)
836 		return -EINVAL;
837 
838 	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
839 
840 	if (rinfo->length == 3)
841 		prefix = (struct in6_addr *)rinfo->prefix;
842 	else {
843 		/* ipv6_addr_prefix() copies only prefix_len bits and is safe here */
844 		ipv6_addr_prefix(&prefix_buf,
845 				 (struct in6_addr *)rinfo->prefix,
846 				 rinfo->prefix_len);
847 		prefix = &prefix_buf;
848 	}
849 
850 	if (rinfo->prefix_len == 0)
851 		rt = rt6_get_dflt_router(gwaddr, dev);
852 	else
853 		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
854 					gwaddr, dev);
855 
856 	if (rt && !lifetime) {
857 		ip6_del_rt(rt);
858 		rt = NULL;
859 	}
860 
861 	if (!rt && lifetime)
862 		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
863 					dev, pref);
864 	else if (rt)
865 		rt->rt6i_flags = RTF_ROUTEINFO |
866 				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
867 
868 	if (rt) {
869 		if (!addrconf_finite_timeout(lifetime))
870 			rt6_clean_expires(rt);
871 		else
872 			rt6_set_expires(rt, jiffies + HZ * lifetime);
873 
874 		ip6_rt_put(rt);
875 	}
876 	return 0;
877 }
878 #endif
879 
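/* The sanity checks in rt6_route_rcv() follow the RFC 4191 Route
 * Information Option layout: the Length field is in units of 8 octets, so
 * after the 8-byte fixed header a Length of 1 carries no prefix octets,
 * 2 carries 8 (enough for up to a /64), and 3 carries the full 16.
 * Illustrative arithmetic for the octets a given prefix_len needs:
 *
 *	int prefix_octets = (rinfo->prefix_len + 7) / 8;	// ceil(bits / 8)
 *	int avail_octets  = (rinfo->length - 1) * 8;
 */
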
880 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
881 					struct in6_addr *saddr)
882 {
883 	struct fib6_node *pn, *sn;
884 	while (1) {
885 		if (fn->fn_flags & RTN_TL_ROOT)
886 			return NULL;
887 		pn = rcu_dereference(fn->parent);
888 		sn = FIB6_SUBTREE(pn);
889 		if (sn && sn != fn)
890 			fn = fib6_lookup(sn, NULL, saddr);
891 		else
892 			fn = pn;
893 		if (fn->fn_flags & RTN_RTINFO)
894 			return fn;
895 	}
896 }
897 
898 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
899 			  bool null_fallback)
900 {
901 	struct rt6_info *rt = *prt;
902 
903 	if (dst_hold_safe(&rt->dst))
904 		return true;
905 	if (null_fallback) {
906 		rt = net->ipv6.ip6_null_entry;
907 		dst_hold(&rt->dst);
908 	} else {
909 		rt = NULL;
910 	}
911 	*prt = rt;
912 	return false;
913 }
914 
915 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
916 					     struct fib6_table *table,
917 					     struct flowi6 *fl6, int flags)
918 {
919 	struct rt6_info *rt, *rt_cache;
920 	struct fib6_node *fn;
921 
922 	rcu_read_lock();
923 	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
924 restart:
925 	rt = rcu_dereference(fn->leaf);
926 	if (!rt) {
927 		rt = net->ipv6.ip6_null_entry;
928 	} else {
929 		rt = rt6_device_match(net, rt, &fl6->saddr,
930 				      fl6->flowi6_oif, flags);
931 		if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
932 			rt = rt6_multipath_select(rt, fl6,
933 						  fl6->flowi6_oif, flags);
934 	}
935 	if (rt == net->ipv6.ip6_null_entry) {
936 		fn = fib6_backtrack(fn, &fl6->saddr);
937 		if (fn)
938 			goto restart;
939 	}
940 	/* Search through exception table */
941 	rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
942 	if (rt_cache)
943 		rt = rt_cache;
944 
945 	if (ip6_hold_safe(net, &rt, true))
946 		dst_use_noref(&rt->dst, jiffies);
947 
948 	rcu_read_unlock();
949 
950 	trace_fib6_table_lookup(net, rt, table, fl6);
951 
952 	return rt;
953 
954 }
955 
956 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
957 				    int flags)
958 {
959 	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
960 }
961 EXPORT_SYMBOL_GPL(ip6_route_lookup);
962 
963 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
964 			    const struct in6_addr *saddr, int oif, int strict)
965 {
966 	struct flowi6 fl6 = {
967 		.flowi6_oif = oif,
968 		.daddr = *daddr,
969 	};
970 	struct dst_entry *dst;
971 	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
972 
973 	if (saddr) {
974 		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
975 		flags |= RT6_LOOKUP_F_HAS_SADDR;
976 	}
977 
978 	dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
979 	if (dst->error == 0)
980 		return (struct rt6_info *) dst;
981 
982 	dst_release(dst);
983 
984 	return NULL;
985 }
986 EXPORT_SYMBOL(rt6_lookup);
987 
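/* Typical rt6_lookup() usage (illustrative sketch; daddr is hypothetical
 * and error handling is elided). The lookup returns a referenced entry,
 * so the caller must drop it with ip6_rt_put():
 *
 *	struct rt6_info *rt;
 *
 *	rt = rt6_lookup(net, &daddr, NULL, 0, 0);
 *	if (rt) {
 *		// use rt->dst.dev, rt->rt6i_gateway, ...
 *		ip6_rt_put(rt);
 *	}
 */
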
988 /* ip6_ins_rt is called with table->tb6_lock free (not held).
989  * It takes a new route entry; if the addition fails for any reason,
990  * the route is released.
991  * Caller must hold dst before calling it.
992  */
993 
994 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
995 			struct mx6_config *mxc,
996 			struct netlink_ext_ack *extack)
997 {
998 	int err;
999 	struct fib6_table *table;
1000 
1001 	table = rt->rt6i_table;
1002 	spin_lock_bh(&table->tb6_lock);
1003 	err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
1004 	spin_unlock_bh(&table->tb6_lock);
1005 
1006 	return err;
1007 }
1008 
1009 int ip6_ins_rt(struct rt6_info *rt)
1010 {
1011 	struct nl_info info = {	.nl_net = dev_net(rt->dst.dev), };
1012 	struct mx6_config mxc = { .mx = NULL, };
1013 
1014 	/* Hold dst to account for the reference from the fib6 tree */
1015 	dst_hold(&rt->dst);
1016 	return __ip6_ins_rt(rt, &info, &mxc, NULL);
1017 }
1018 
1019 /* Called with rcu_read_lock() held */
1020 static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
1021 {
1022 	struct net_device *dev = rt->dst.dev;
1023 
1024 	if (rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) {
1025 		/* For copies of local routes, dst->dev needs to be the
1026 		 * device itself if it is a master device, the master device
1027 		 * if the device is enslaved, and the loopback device by default.
1028 		 */
1029 		if (netif_is_l3_slave(dev) &&
1030 		    !rt6_need_strict(&rt->rt6i_dst.addr))
1031 			dev = l3mdev_master_dev_rcu(dev);
1032 		else if (!netif_is_l3_master(dev))
1033 			dev = dev_net(dev)->loopback_dev;
1034 		/* The last case is netif_is_l3_master(dev) being true, in
1035 		 * which case we want to return dev itself.
1036 		 */
1037 	}
1038 
1039 	return dev;
1040 }
1041 
1042 static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
1043 					   const struct in6_addr *daddr,
1044 					   const struct in6_addr *saddr)
1045 {
1046 	struct net_device *dev;
1047 	struct rt6_info *rt;
1048 
1049 	/*
1050 	 *	Clone the route.
1051 	 */
1052 
1053 	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
1054 		ort = ort->from;
1055 
1056 	rcu_read_lock();
1057 	dev = ip6_rt_get_dev_rcu(ort);
1058 	rt = __ip6_dst_alloc(dev_net(dev), dev, 0);
1059 	rcu_read_unlock();
1060 	if (!rt)
1061 		return NULL;
1062 
1063 	ip6_rt_copy_init(rt, ort);
1064 	rt->rt6i_flags |= RTF_CACHE;
1065 	rt->rt6i_metric = 0;
1066 	rt->dst.flags |= DST_HOST;
1067 	rt->rt6i_dst.addr = *daddr;
1068 	rt->rt6i_dst.plen = 128;
1069 
1070 	if (!rt6_is_gw_or_nonexthop(ort)) {
1071 		if (ort->rt6i_dst.plen != 128 &&
1072 		    ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
1073 			rt->rt6i_flags |= RTF_ANYCAST;
1074 #ifdef CONFIG_IPV6_SUBTREES
1075 		if (rt->rt6i_src.plen && saddr) {
1076 			rt->rt6i_src.addr = *saddr;
1077 			rt->rt6i_src.plen = 128;
1078 		}
1079 #endif
1080 	}
1081 
1082 	return rt;
1083 }
1084 
1085 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
1086 {
1087 	struct net_device *dev;
1088 	struct rt6_info *pcpu_rt;
1089 
1090 	rcu_read_lock();
1091 	dev = ip6_rt_get_dev_rcu(rt);
1092 	pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags);
1093 	rcu_read_unlock();
1094 	if (!pcpu_rt)
1095 		return NULL;
1096 	ip6_rt_copy_init(pcpu_rt, rt);
1097 	pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
1098 	pcpu_rt->rt6i_flags |= RTF_PCPU;
1099 	return pcpu_rt;
1100 }
1101 
1102 /* It should be called with rcu_read_lock() acquired */
1103 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
1104 {
1105 	struct rt6_info *pcpu_rt, **p;
1106 
1107 	p = this_cpu_ptr(rt->rt6i_pcpu);
1108 	pcpu_rt = *p;
1109 
1110 	if (pcpu_rt && ip6_hold_safe(NULL, &pcpu_rt, false))
1111 		rt6_dst_from_metrics_check(pcpu_rt);
1112 
1113 	return pcpu_rt;
1114 }
1115 
1116 static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
1117 {
1118 	struct rt6_info *pcpu_rt, *prev, **p;
1119 
1120 	pcpu_rt = ip6_rt_pcpu_alloc(rt);
1121 	if (!pcpu_rt) {
1122 		struct net *net = dev_net(rt->dst.dev);
1123 
1124 		dst_hold(&net->ipv6.ip6_null_entry->dst);
1125 		return net->ipv6.ip6_null_entry;
1126 	}
1127 
1128 	dst_hold(&pcpu_rt->dst);
1129 	p = this_cpu_ptr(rt->rt6i_pcpu);
1130 	prev = cmpxchg(p, NULL, pcpu_rt);
1131 	BUG_ON(prev);
1132 
1133 	rt6_dst_from_metrics_check(pcpu_rt);
1134 	return pcpu_rt;
1135 }
1136 
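/* rt6_get_pcpu_route()/rt6_make_pcpu_route() above form a lock-free
 * per-cpu cache: check the slot, allocate on a miss, then publish with
 * cmpxchg(). Sketch of the pattern (the caller runs with BHs disabled, so
 * nothing else on this CPU can fill the slot in between, hence the
 * BUG_ON(prev)):
 *
 *	p = this_cpu_ptr(rt->rt6i_pcpu);
 *	pcpu_rt = *p;
 *	if (!pcpu_rt) {
 *		pcpu_rt = ip6_rt_pcpu_alloc(rt);
 *		prev = cmpxchg(p, NULL, pcpu_rt);	// publish if still empty
 *	}
 */
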
1137 /* exception hash table implementation
1138  */
1139 static DEFINE_SPINLOCK(rt6_exception_lock);
1140 
1141 /* Remove rt6_ex from hash table and free the memory
1142  * Caller must hold rt6_exception_lock
1143  */
1144 static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1145 				 struct rt6_exception *rt6_ex)
1146 {
1147 	struct net *net;
1148 
1149 	if (!bucket || !rt6_ex)
1150 		return;
1151 
1152 	net = dev_net(rt6_ex->rt6i->dst.dev);
1153 	rt6_ex->rt6i->rt6i_node = NULL;
1154 	hlist_del_rcu(&rt6_ex->hlist);
1155 	rt6_release(rt6_ex->rt6i);
1156 	kfree_rcu(rt6_ex, rcu);
1157 	WARN_ON_ONCE(!bucket->depth);
1158 	bucket->depth--;
1159 	net->ipv6.rt6_stats->fib_rt_cache--;
1160 }
1161 
1162 /* Remove oldest rt6_ex in bucket and free the memory
1163  * Caller must hold rt6_exception_lock
1164  */
1165 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1166 {
1167 	struct rt6_exception *rt6_ex, *oldest = NULL;
1168 
1169 	if (!bucket)
1170 		return;
1171 
1172 	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1173 		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1174 			oldest = rt6_ex;
1175 	}
1176 	rt6_remove_exception(bucket, oldest);
1177 }
1178 
1179 static u32 rt6_exception_hash(const struct in6_addr *dst,
1180 			      const struct in6_addr *src)
1181 {
1182 	static u32 seed __read_mostly;
1183 	u32 val;
1184 
1185 	net_get_random_once(&seed, sizeof(seed));
1186 	val = jhash(dst, sizeof(*dst), seed);
1187 
1188 #ifdef CONFIG_IPV6_SUBTREES
1189 	if (src)
1190 		val = jhash(src, sizeof(*src), val);
1191 #endif
1192 	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1193 }
1194 
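/* The exception table key computed by rt6_exception_hash() is a jhash
 * over the destination (and, for subtree routes, the source) folded down
 * to FIB6_EXCEPTION_BUCKET_SIZE_SHIFT bits. Equivalent sketch:
 *
 *	u32 key = jhash(dst, sizeof(*dst), seed);
 *	key = jhash(src, sizeof(*src), key);	// subtree routes only
 *	idx = hash_32(key, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
 */
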
1195 /* Helper function to find the cached rt in the hash table
1196  * and update bucket pointer to point to the bucket for this
1197  * (daddr, saddr) pair
1198  * Caller must hold rt6_exception_lock
1199  */
1200 static struct rt6_exception *
1201 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1202 			      const struct in6_addr *daddr,
1203 			      const struct in6_addr *saddr)
1204 {
1205 	struct rt6_exception *rt6_ex;
1206 	u32 hval;
1207 
1208 	if (!(*bucket) || !daddr)
1209 		return NULL;
1210 
1211 	hval = rt6_exception_hash(daddr, saddr);
1212 	*bucket += hval;
1213 
1214 	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1215 		struct rt6_info *rt6 = rt6_ex->rt6i;
1216 		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1217 
1218 #ifdef CONFIG_IPV6_SUBTREES
1219 		if (matched && saddr)
1220 			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1221 #endif
1222 		if (matched)
1223 			return rt6_ex;
1224 	}
1225 	return NULL;
1226 }
1227 
1228 /* Helper function to find the cached rt in the hash table
1229  * and update bucket pointer to point to the bucket for this
1230  * (daddr, saddr) pair
1231  * Caller must hold rcu_read_lock()
1232  */
1233 static struct rt6_exception *
1234 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1235 			 const struct in6_addr *daddr,
1236 			 const struct in6_addr *saddr)
1237 {
1238 	struct rt6_exception *rt6_ex;
1239 	u32 hval;
1240 
1241 	WARN_ON_ONCE(!rcu_read_lock_held());
1242 
1243 	if (!(*bucket) || !daddr)
1244 		return NULL;
1245 
1246 	hval = rt6_exception_hash(daddr, saddr);
1247 	*bucket += hval;
1248 
1249 	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1250 		struct rt6_info *rt6 = rt6_ex->rt6i;
1251 		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1252 
1253 #ifdef CONFIG_IPV6_SUBTREES
1254 		if (matched && saddr)
1255 			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1256 #endif
1257 		if (matched)
1258 			return rt6_ex;
1259 	}
1260 	return NULL;
1261 }
1262 
1263 static int rt6_insert_exception(struct rt6_info *nrt,
1264 				struct rt6_info *ort)
1265 {
1266 	struct net *net = dev_net(ort->dst.dev);
1267 	struct rt6_exception_bucket *bucket;
1268 	struct in6_addr *src_key = NULL;
1269 	struct rt6_exception *rt6_ex;
1270 	int err = 0;
1271 
1272 	/* ort can't be a cache or pcpu route */
1273 	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
1274 		ort = ort->from;
1275 	WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU));
1276 
1277 	spin_lock_bh(&rt6_exception_lock);
1278 
1279 	if (ort->exception_bucket_flushed) {
1280 		err = -EINVAL;
1281 		goto out;
1282 	}
1283 
1284 	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
1285 					lockdep_is_held(&rt6_exception_lock));
1286 	if (!bucket) {
1287 		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
1288 				 GFP_ATOMIC);
1289 		if (!bucket) {
1290 			err = -ENOMEM;
1291 			goto out;
1292 		}
1293 		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
1294 	}
1295 
1296 #ifdef CONFIG_IPV6_SUBTREES
1297 	/* rt6i_src.plen != 0 indicates ort is in subtree
1298 	 * and exception table is indexed by a hash of
1299 	 * both rt6i_dst and rt6i_src.
1300 	 * Otherwise, the exception table is indexed by
1301 	 * a hash of only rt6i_dst.
1302 	 */
1303 	if (ort->rt6i_src.plen)
1304 		src_key = &nrt->rt6i_src.addr;
1305 #endif
1306 
1307 	/* Update rt6i_prefsrc as it could be changed
1308 	 * in rt6_remove_prefsrc()
1309 	 */
1310 	nrt->rt6i_prefsrc = ort->rt6i_prefsrc;
1311 	/* rt6_mtu_change() might lower mtu on ort.
1312 	 * Only insert this exception route if its mtu
1313 	 * is less than ort's mtu value.
1314 	 */
1315 	if (nrt->rt6i_pmtu >= dst_mtu(&ort->dst)) {
1316 		err = -EINVAL;
1317 		goto out;
1318 	}
1319 
1320 	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
1321 					       src_key);
1322 	if (rt6_ex)
1323 		rt6_remove_exception(bucket, rt6_ex);
1324 
1325 	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
1326 	if (!rt6_ex) {
1327 		err = -ENOMEM;
1328 		goto out;
1329 	}
1330 	rt6_ex->rt6i = nrt;
1331 	rt6_ex->stamp = jiffies;
1332 	atomic_inc(&nrt->rt6i_ref);
1333 	nrt->rt6i_node = ort->rt6i_node;
1334 	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
1335 	bucket->depth++;
1336 	net->ipv6.rt6_stats->fib_rt_cache++;
1337 
1338 	if (bucket->depth > FIB6_MAX_DEPTH)
1339 		rt6_exception_remove_oldest(bucket);
1340 
1341 out:
1342 	spin_unlock_bh(&rt6_exception_lock);
1343 
1344 	/* Update fn->fn_sernum to invalidate all cached dst */
1345 	if (!err) {
1346 		spin_lock_bh(&ort->rt6i_table->tb6_lock);
1347 		fib6_update_sernum(ort);
1348 		spin_unlock_bh(&ort->rt6i_table->tb6_lock);
1349 		fib6_force_start_gc(net);
1350 	}
1351 
1352 	return err;
1353 }
1354 
1355 void rt6_flush_exceptions(struct rt6_info *rt)
1356 {
1357 	struct rt6_exception_bucket *bucket;
1358 	struct rt6_exception *rt6_ex;
1359 	struct hlist_node *tmp;
1360 	int i;
1361 
1362 	spin_lock_bh(&rt6_exception_lock);
1363 	/* Prevent rt6_insert_exception() from recreating the bucket list */
1364 	rt->exception_bucket_flushed = 1;
1365 
1366 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1367 				    lockdep_is_held(&rt6_exception_lock));
1368 	if (!bucket)
1369 		goto out;
1370 
1371 	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1372 		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1373 			rt6_remove_exception(bucket, rt6_ex);
1374 		WARN_ON_ONCE(bucket->depth);
1375 		bucket++;
1376 	}
1377 
1378 out:
1379 	spin_unlock_bh(&rt6_exception_lock);
1380 }
1381 
1382 /* Find cached rt in the hash table inside passed in rt
1383  * Caller has to hold rcu_read_lock()
1384  */
1385 static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
1386 					   struct in6_addr *daddr,
1387 					   struct in6_addr *saddr)
1388 {
1389 	struct rt6_exception_bucket *bucket;
1390 	struct in6_addr *src_key = NULL;
1391 	struct rt6_exception *rt6_ex;
1392 	struct rt6_info *res = NULL;
1393 
1394 	bucket = rcu_dereference(rt->rt6i_exception_bucket);
1395 
1396 #ifdef CONFIG_IPV6_SUBTREES
1397 	/* rt6i_src.plen != 0 indicates rt is in subtree
1398 	 * and exception table is indexed by a hash of
1399 	 * both rt6i_dst and rt6i_src.
1400 	 * Otherwise, the exception table is indexed by
1401 	 * a hash of only rt6i_dst.
1402 	 */
1403 	if (rt->rt6i_src.plen)
1404 		src_key = saddr;
1405 #endif
1406 	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
1407 
1408 	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1409 		res = rt6_ex->rt6i;
1410 
1411 	return res;
1412 }
1413 
1414 /* Remove the passed in cached rt from the hash table that contains it */
1415 int rt6_remove_exception_rt(struct rt6_info *rt)
1416 {
1417 	struct rt6_exception_bucket *bucket;
1418 	struct rt6_info *from = rt->from;
1419 	struct in6_addr *src_key = NULL;
1420 	struct rt6_exception *rt6_ex;
1421 	int err;
1422 
1423 	if (!from ||
1424 	    !(rt->rt6i_flags & RTF_CACHE))
1425 		return -EINVAL;
1426 
1427 	if (!rcu_access_pointer(from->rt6i_exception_bucket))
1428 		return -ENOENT;
1429 
1430 	spin_lock_bh(&rt6_exception_lock);
1431 	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1432 				    lockdep_is_held(&rt6_exception_lock));
1433 #ifdef CONFIG_IPV6_SUBTREES
1434 	/* rt6i_src.plen != 0 indicates 'from' is in subtree
1435 	 * and exception table is indexed by a hash of
1436 	 * both rt6i_dst and rt6i_src.
1437 	 * Otherwise, the exception table is indexed by
1438 	 * a hash of only rt6i_dst.
1439 	 */
1440 	if (from->rt6i_src.plen)
1441 		src_key = &rt->rt6i_src.addr;
1442 #endif
1443 	rt6_ex = __rt6_find_exception_spinlock(&bucket,
1444 					       &rt->rt6i_dst.addr,
1445 					       src_key);
1446 	if (rt6_ex) {
1447 		rt6_remove_exception(bucket, rt6_ex);
1448 		err = 0;
1449 	} else {
1450 		err = -ENOENT;
1451 	}
1452 
1453 	spin_unlock_bh(&rt6_exception_lock);
1454 	return err;
1455 }
1456 
1457 /* Find rt6_ex which contains the passed in rt cache and
1458  * refresh its stamp
1459  */
1460 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1461 {
1462 	struct rt6_exception_bucket *bucket;
1463 	struct rt6_info *from = rt->from;
1464 	struct in6_addr *src_key = NULL;
1465 	struct rt6_exception *rt6_ex;
1466 
1467 	if (!from ||
1468 	    !(rt->rt6i_flags & RTF_CACHE))
1469 		return;
1470 
1471 	rcu_read_lock();
1472 	bucket = rcu_dereference(from->rt6i_exception_bucket);
1473 
1474 #ifdef CONFIG_IPV6_SUBTREES
1475 	/* rt6i_src.plen != 0 indicates 'from' is in subtree
1476 	 * and exception table is indexed by a hash of
1477 	 * both rt6i_dst and rt6i_src.
1478 	 * Otherwise, the exception table is indexed by
1479 	 * a hash of only rt6i_dst.
1480 	 */
1481 	if (from->rt6i_src.plen)
1482 		src_key = &rt->rt6i_src.addr;
1483 #endif
1484 	rt6_ex = __rt6_find_exception_rcu(&bucket,
1485 					  &rt->rt6i_dst.addr,
1486 					  src_key);
1487 	if (rt6_ex)
1488 		rt6_ex->stamp = jiffies;
1489 
1490 	rcu_read_unlock();
1491 }
1492 
1493 static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt)
1494 {
1495 	struct rt6_exception_bucket *bucket;
1496 	struct rt6_exception *rt6_ex;
1497 	int i;
1498 
1499 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1500 					lockdep_is_held(&rt6_exception_lock));
1501 
1502 	if (bucket) {
1503 		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1504 			hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1505 				rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
1506 			}
1507 			bucket++;
1508 		}
1509 	}
1510 }
1511 
1512 static void rt6_exceptions_update_pmtu(struct rt6_info *rt, int mtu)
1513 {
1514 	struct rt6_exception_bucket *bucket;
1515 	struct rt6_exception *rt6_ex;
1516 	int i;
1517 
1518 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1519 					lockdep_is_held(&rt6_exception_lock));
1520 
1521 	if (bucket) {
1522 		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1523 			hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1524 				struct rt6_info *entry = rt6_ex->rt6i;
1525 				/* For RTF_CACHE with rt6i_pmtu == 0
1526 				 * (i.e. a redirected route),
1527 				 * the metrics of its rt->dst.from have already
1528 				 * been updated.
1529 				 */
1530 				if (entry->rt6i_pmtu && entry->rt6i_pmtu > mtu)
1531 					entry->rt6i_pmtu = mtu;
1532 			}
1533 			bucket++;
1534 		}
1535 	}
1536 }
1537 
1538 #define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)
1539 
1540 static void rt6_exceptions_clean_tohost(struct rt6_info *rt,
1541 					struct in6_addr *gateway)
1542 {
1543 	struct rt6_exception_bucket *bucket;
1544 	struct rt6_exception *rt6_ex;
1545 	struct hlist_node *tmp;
1546 	int i;
1547 
1548 	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1549 		return;
1550 
1551 	spin_lock_bh(&rt6_exception_lock);
1552 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1553 				     lockdep_is_held(&rt6_exception_lock));
1554 
1555 	if (bucket) {
1556 		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1557 			hlist_for_each_entry_safe(rt6_ex, tmp,
1558 						  &bucket->chain, hlist) {
1559 				struct rt6_info *entry = rt6_ex->rt6i;
1560 
1561 				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
1562 				    RTF_CACHE_GATEWAY &&
1563 				    ipv6_addr_equal(gateway,
1564 						    &entry->rt6i_gateway)) {
1565 					rt6_remove_exception(bucket, rt6_ex);
1566 				}
1567 			}
1568 			bucket++;
1569 		}
1570 	}
1571 
1572 	spin_unlock_bh(&rt6_exception_lock);
1573 }
1574 
1575 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
1576 				      struct rt6_exception *rt6_ex,
1577 				      struct fib6_gc_args *gc_args,
1578 				      unsigned long now)
1579 {
1580 	struct rt6_info *rt = rt6_ex->rt6i;
1581 
1582 	/* We are pruning and obsoleting aged-out and non-gateway exceptions
1583 	 * even if others still hold references to them, so that on the next
1584 	 * dst_check() such references can be dropped.
1585 	 * EXPIRES exceptions (e.g. PMTU-generated ones) are pruned when
1586 	 * expired, independently of their age, as per RFC 8201 section 4.
1587 	 */
1588 	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
1589 		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
1590 			RT6_TRACE("aging clone %p\n", rt);
1591 			rt6_remove_exception(bucket, rt6_ex);
1592 			return;
1593 		}
1594 	} else if (time_after(jiffies, rt->dst.expires)) {
1595 		RT6_TRACE("purging expired route %p\n", rt);
1596 		rt6_remove_exception(bucket, rt6_ex);
1597 		return;
1598 	}
1599 
1600 	if (rt->rt6i_flags & RTF_GATEWAY) {
1601 		struct neighbour *neigh;
1602 		__u8 neigh_flags = 0;
1603 
1604 		neigh = dst_neigh_lookup(&rt->dst, &rt->rt6i_gateway);
1605 		if (neigh) {
1606 			neigh_flags = neigh->flags;
1607 			neigh_release(neigh);
1608 		}
1609 		if (!(neigh_flags & NTF_ROUTER)) {
1610 			RT6_TRACE("purging route %p via non-router but gateway\n",
1611 				  rt);
1612 			rt6_remove_exception(bucket, rt6_ex);
1613 			return;
1614 		}
1615 	}
1616 
1617 	gc_args->more++;
1618 }
1619 
1620 void rt6_age_exceptions(struct rt6_info *rt,
1621 			struct fib6_gc_args *gc_args,
1622 			unsigned long now)
1623 {
1624 	struct rt6_exception_bucket *bucket;
1625 	struct rt6_exception *rt6_ex;
1626 	struct hlist_node *tmp;
1627 	int i;
1628 
1629 	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1630 		return;
1631 
1632 	spin_lock_bh(&rt6_exception_lock);
1633 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1634 				    lockdep_is_held(&rt6_exception_lock));
1635 
1636 	if (bucket) {
1637 		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1638 			hlist_for_each_entry_safe(rt6_ex, tmp,
1639 						  &bucket->chain, hlist) {
1640 				rt6_age_examine_exception(bucket, rt6_ex,
1641 							  gc_args, now);
1642 			}
1643 			bucket++;
1644 		}
1645 	}
1646 	spin_unlock_bh(&rt6_exception_lock);
1647 }
1648 
1649 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1650 			       int oif, struct flowi6 *fl6, int flags)
1651 {
1652 	struct fib6_node *fn, *saved_fn;
1653 	struct rt6_info *rt, *rt_cache;
1654 	int strict = 0;
1655 
1656 	strict |= flags & RT6_LOOKUP_F_IFACE;
1657 	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1658 	if (net->ipv6.devconf_all->forwarding == 0)
1659 		strict |= RT6_LOOKUP_F_REACHABLE;
1660 
1661 	rcu_read_lock();
1662 
1663 	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1664 	saved_fn = fn;
1665 
1666 	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1667 		oif = 0;
1668 
1669 redo_rt6_select:
1670 	rt = rt6_select(net, fn, oif, strict);
1671 	if (rt->rt6i_nsiblings)
1672 		rt = rt6_multipath_select(rt, fl6, oif, strict);
1673 	if (rt == net->ipv6.ip6_null_entry) {
1674 		fn = fib6_backtrack(fn, &fl6->saddr);
1675 		if (fn)
1676 			goto redo_rt6_select;
1677 		else if (strict & RT6_LOOKUP_F_REACHABLE) {
1678 			/* also consider unreachable route */
1679 			strict &= ~RT6_LOOKUP_F_REACHABLE;
1680 			fn = saved_fn;
1681 			goto redo_rt6_select;
1682 		}
1683 	}
1684 
1685 	/* Search through exception table */
1686 	rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
1687 	if (rt_cache)
1688 		rt = rt_cache;
1689 
1690 	if (rt == net->ipv6.ip6_null_entry) {
1691 		rcu_read_unlock();
1692 		dst_hold(&rt->dst);
1693 		trace_fib6_table_lookup(net, rt, table, fl6);
1694 		return rt;
1695 	} else if (rt->rt6i_flags & RTF_CACHE) {
1696 		if (ip6_hold_safe(net, &rt, true)) {
1697 			dst_use_noref(&rt->dst, jiffies);
1698 			rt6_dst_from_metrics_check(rt);
1699 		}
1700 		rcu_read_unlock();
1701 		trace_fib6_table_lookup(net, rt, table, fl6);
1702 		return rt;
1703 	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1704 			    !(rt->rt6i_flags & RTF_GATEWAY))) {
1705 		/* Create an RTF_CACHE clone which will not be
1706 		 * owned by the fib6 tree.  It is for the special case where
1707 		 * the daddr in the skb during the neighbor look-up is different
1708 		 * from the fl6->daddr used to look up the route here.
1709 		 */
1710 
1711 		struct rt6_info *uncached_rt;
1712 
1713 		if (ip6_hold_safe(net, &rt, true)) {
1714 			dst_use_noref(&rt->dst, jiffies);
1715 		} else {
1716 			rcu_read_unlock();
1717 			uncached_rt = rt;
1718 			goto uncached_rt_out;
1719 		}
1720 		rcu_read_unlock();
1721 
1722 		uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
1723 		dst_release(&rt->dst);
1724 
1725 		if (uncached_rt) {
1726 			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1727 			 * No need for another dst_hold()
1728 			 */
1729 			rt6_uncached_list_add(uncached_rt);
1730 			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
1731 		} else {
1732 			uncached_rt = net->ipv6.ip6_null_entry;
1733 			dst_hold(&uncached_rt->dst);
1734 		}
1735 
1736 uncached_rt_out:
1737 		trace_fib6_table_lookup(net, uncached_rt, table, fl6);
1738 		return uncached_rt;
1739 
1740 	} else {
1741 		/* Get a percpu copy */
1742 
1743 		struct rt6_info *pcpu_rt;
1744 
1745 		dst_use_noref(&rt->dst, jiffies);
1746 		local_bh_disable();
1747 		pcpu_rt = rt6_get_pcpu_route(rt);
1748 
1749 		if (!pcpu_rt) {
1750 			/* atomic_inc_not_zero() is needed when using rcu */
1751 			if (atomic_inc_not_zero(&rt->rt6i_ref)) {
1752 				/* No dst_hold() on rt is needed because grabbing
1753 				 * rt->rt6i_ref makes sure rt can't be released.
1754 				 */
1755 				pcpu_rt = rt6_make_pcpu_route(rt);
1756 				rt6_release(rt);
1757 			} else {
1758 				/* rt is already removed from tree */
1759 				pcpu_rt = net->ipv6.ip6_null_entry;
1760 				dst_hold(&pcpu_rt->dst);
1761 			}
1762 		}
1763 		local_bh_enable();
1764 		rcu_read_unlock();
1765 		trace_fib6_table_lookup(net, pcpu_rt, table, fl6);
1766 		return pcpu_rt;
1767 	}
1768 }
1769 EXPORT_SYMBOL_GPL(ip6_pol_route);
1770 
1771 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
1772 					    struct flowi6 *fl6, int flags)
1773 {
1774 	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
1775 }
1776 
1777 struct dst_entry *ip6_route_input_lookup(struct net *net,
1778 					 struct net_device *dev,
1779 					 struct flowi6 *fl6, int flags)
1780 {
1781 	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1782 		flags |= RT6_LOOKUP_F_IFACE;
1783 
1784 	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
1785 }
1786 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1787 
1788 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1789 				  struct flow_keys *keys)
1790 {
1791 	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1792 	const struct ipv6hdr *key_iph = outer_iph;
1793 	const struct ipv6hdr *inner_iph;
1794 	const struct icmp6hdr *icmph;
1795 	struct ipv6hdr _inner_iph;
1796 
1797 	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1798 		goto out;
1799 
1800 	icmph = icmp6_hdr(skb);
1801 	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1802 	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1803 	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1804 	    icmph->icmp6_type != ICMPV6_PARAMPROB)
1805 		goto out;
1806 
1807 	inner_iph = skb_header_pointer(skb,
1808 				       skb_transport_offset(skb) + sizeof(*icmph),
1809 				       sizeof(_inner_iph), &_inner_iph);
1810 	if (!inner_iph)
1811 		goto out;
1812 
1813 	key_iph = inner_iph;
1814 out:
1815 	memset(keys, 0, sizeof(*keys));
1816 	keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1817 	keys->addrs.v6addrs.src = key_iph->saddr;
1818 	keys->addrs.v6addrs.dst = key_iph->daddr;
1819 	keys->tags.flow_label = ip6_flowinfo(key_iph);
1820 	keys->basic.ip_proto = key_iph->nexthdr;
1821 }
1822 
1823 /* If skb is set it will be used; fl6 can then be NULL */
1824 u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb)
1825 {
1826 	struct flow_keys hash_keys;
1827 
1828 	if (skb) {
1829 		ip6_multipath_l3_keys(skb, &hash_keys);
1830 		return flow_hash_from_keys(&hash_keys) >> 1;
1831 	}
1832 
1833 	return get_hash_from_flowi6(fl6) >> 1;
1834 }
1835 
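/* For ICMPv6 errors, ip6_multipath_l3_keys() hashes the addresses of the
 * embedded (offending) packet rather than those of the error itself, so
 * the error takes the same multipath route as the flow that triggered it.
 * The final ">> 1" keeps the hash within 31 bits for the signed
 * comparison against rt6i_nh_upper_bound in rt6_multipath_select().
 */
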
1836 void ip6_route_input(struct sk_buff *skb)
1837 {
1838 	const struct ipv6hdr *iph = ipv6_hdr(skb);
1839 	struct net *net = dev_net(skb->dev);
1840 	int flags = RT6_LOOKUP_F_HAS_SADDR;
1841 	struct ip_tunnel_info *tun_info;
1842 	struct flowi6 fl6 = {
1843 		.flowi6_iif = skb->dev->ifindex,
1844 		.daddr = iph->daddr,
1845 		.saddr = iph->saddr,
1846 		.flowlabel = ip6_flowinfo(iph),
1847 		.flowi6_mark = skb->mark,
1848 		.flowi6_proto = iph->nexthdr,
1849 	};
1850 
1851 	tun_info = skb_tunnel_info(skb);
1852 	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1853 		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
1854 	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
1855 		fl6.mp_hash = rt6_multipath_hash(&fl6, skb);
1856 	skb_dst_drop(skb);
1857 	skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
1858 }
1859 
1860 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
1861 					     struct flowi6 *fl6, int flags)
1862 {
1863 	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
1864 }
1865 
1866 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
1867 					 struct flowi6 *fl6, int flags)
1868 {
1869 	bool any_src;
1870 
1871 	if (rt6_need_strict(&fl6->daddr)) {
1872 		struct dst_entry *dst;
1873 
1874 		dst = l3mdev_link_scope_lookup(net, fl6);
1875 		if (dst)
1876 			return dst;
1877 	}
1878 
1879 	fl6->flowi6_iif = LOOPBACK_IFINDEX;
1880 
1881 	any_src = ipv6_addr_any(&fl6->saddr);
1882 	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
1883 	    (fl6->flowi6_oif && any_src))
1884 		flags |= RT6_LOOKUP_F_IFACE;
1885 
1886 	if (!any_src)
1887 		flags |= RT6_LOOKUP_F_HAS_SADDR;
1888 	else if (sk)
1889 		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1890 
1891 	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
1892 }
1893 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
1894 
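/* Typical output-path usage via the ip6_route_output() wrapper from
 * include/net/ip6_route.h (illustrative sketch; oif/daddr/saddr are
 * hypothetical and error handling is elided):
 *
 *	struct flowi6 fl6 = {
 *		.flowi6_oif = oif,
 *		.daddr = daddr,
 *		.saddr = saddr,
 *	};
 *	struct dst_entry *dst;
 *
 *	dst = ip6_route_output(net, sk, &fl6);
 *	if (dst->error) {
 *		dst_release(dst);
 *		return;	// no route to destination
 *	}
 */
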
1895 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
1896 {
1897 	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1898 	struct net_device *loopback_dev = net->loopback_dev;
1899 	struct dst_entry *new = NULL;
1900 
1901 	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
1902 		       DST_OBSOLETE_DEAD, 0);
1903 	if (rt) {
1904 		rt6_info_init(rt);
1905 		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
1906 
1907 		new = &rt->dst;
1908 		new->__use = 1;
1909 		new->input = dst_discard;
1910 		new->output = dst_discard_out;
1911 
1912 		dst_copy_metrics(new, &ort->dst);
1913 
1914 		rt->rt6i_idev = in6_dev_get(loopback_dev);
1915 		rt->rt6i_gateway = ort->rt6i_gateway;
1916 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
1917 		rt->rt6i_metric = 0;
1918 
1919 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1920 #ifdef CONFIG_IPV6_SUBTREES
1921 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1922 #endif
1923 	}
1924 
1925 	dst_release(dst_orig);
1926 	return new ? new : ERR_PTR(-ENOMEM);
1927 }
1928 
1929 /*
1930  *	Destination cache support functions
1931  */
1932 
1933 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1934 {
1935 	if (rt->from &&
1936 	    dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(&rt->from->dst))
1937 		dst_init_metrics(&rt->dst, dst_metrics_ptr(&rt->from->dst), true);
1938 }
1939 
1940 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1941 {
1942 	u32 rt_cookie = 0;
1943 
1944 	if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie)
1945 		return NULL;
1946 
1947 	if (rt6_check_expired(rt))
1948 		return NULL;
1949 
1950 	return &rt->dst;
1951 }
1952 
1953 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1954 {
1955 	if (!__rt6_check_expired(rt) &&
1956 	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1957 	    rt6_check(rt->from, cookie))
1958 		return &rt->dst;
1959 	else
1960 		return NULL;
1961 }
1962 
1963 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1964 {
1965 	struct rt6_info *rt;
1966 
1967 	rt = (struct rt6_info *) dst;
1968 
1969 	/* All IPV6 dsts are created with ->obsolete set to the value
1970 	 * DST_OBSOLETE_FORCE_CHK, which forces validation calls down
1971 	 * into this function on every use.
1972 	 */
1973 
1974 	rt6_dst_from_metrics_check(rt);
1975 
1976 	if (rt->rt6i_flags & RTF_PCPU ||
1977 	    (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from))
1978 		return rt6_dst_from_check(rt, cookie);
1979 	else
1980 		return rt6_check(rt, cookie);
1981 }
1982 
1983 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1984 {
1985 	struct rt6_info *rt = (struct rt6_info *) dst;
1986 
1987 	if (rt) {
1988 		if (rt->rt6i_flags & RTF_CACHE) {
1989 			if (rt6_check_expired(rt)) {
1990 				ip6_del_rt(rt);
1991 				dst = NULL;
1992 			}
1993 		} else {
1994 			dst_release(dst);
1995 			dst = NULL;
1996 		}
1997 	}
1998 	return dst;
1999 }
2000 
2001 static void ip6_link_failure(struct sk_buff *skb)
2002 {
2003 	struct rt6_info *rt;
2004 
2005 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2006 
2007 	rt = (struct rt6_info *) skb_dst(skb);
2008 	if (rt) {
2009 		if (rt->rt6i_flags & RTF_CACHE) {
2010 			if (dst_hold_safe(&rt->dst))
2011 				ip6_del_rt(rt);
2012 		} else {
2013 			struct fib6_node *fn;
2014 
2015 			rcu_read_lock();
2016 			fn = rcu_dereference(rt->rt6i_node);
2017 			if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2018 				fn->fn_sernum = -1;
2019 			rcu_read_unlock();
2020 		}
2021 	}
2022 }
2023 
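/* Record a freshly learned path MTU on a route entry.  The entry is
 * marked RTF_MODIFIED and given a bounded lifetime; ip6_rt_mtu_expires
 * corresponds to the net.ipv6.route.mtu_expires sysctl (10 minutes by
 * default), after which the path MTU is probed again.
 */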
2024 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2025 {
2026 	struct net *net = dev_net(rt->dst.dev);
2027 
2028 	rt->rt6i_flags |= RTF_MODIFIED;
2029 	rt->rt6i_pmtu = mtu;
2030 	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2031 }
2032 
2033 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2034 {
2035 	return !(rt->rt6i_flags & RTF_CACHE) &&
2036 		(rt->rt6i_flags & RTF_PCPU ||
2037 		 rcu_access_pointer(rt->rt6i_node));
2038 }
2039 
2040 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2041 				 const struct ipv6hdr *iph, u32 mtu)
2042 {
2043 	const struct in6_addr *daddr, *saddr;
2044 	struct rt6_info *rt6 = (struct rt6_info *)dst;
2045 
2046 	if (rt6->rt6i_flags & RTF_LOCAL)
2047 		return;
2048 
2049 	if (dst_metric_locked(dst, RTAX_MTU))
2050 		return;
2051 
2052 	if (iph) {
2053 		daddr = &iph->daddr;
2054 		saddr = &iph->saddr;
2055 	} else if (sk) {
2056 		daddr = &sk->sk_v6_daddr;
2057 		saddr = &inet6_sk(sk)->saddr;
2058 	} else {
2059 		daddr = NULL;
2060 		saddr = NULL;
2061 	}
2062 	dst_confirm_neigh(dst, daddr);
2063 	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2064 	if (mtu >= dst_mtu(dst))
2065 		return;
2066 
2067 	if (!rt6_cache_allowed_for_pmtu(rt6)) {
2068 		rt6_do_update_pmtu(rt6, mtu);
2069 		/* update rt6_ex->stamp for cache */
2070 		if (rt6->rt6i_flags & RTF_CACHE)
2071 			rt6_update_exception_stamp_rt(rt6);
2072 	} else if (daddr) {
2073 		struct rt6_info *nrt6;
2074 
2075 		nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
2076 		if (nrt6) {
2077 			rt6_do_update_pmtu(nrt6, mtu);
2078 			if (rt6_insert_exception(nrt6, rt6))
2079 				dst_release_immediate(&nrt6->dst);
2080 		}
2081 	}
2082 }
2083 
2084 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2085 			       struct sk_buff *skb, u32 mtu)
2086 {
2087 	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2088 }
2089 
2090 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2091 		     int oif, u32 mark, kuid_t uid)
2092 {
2093 	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2094 	struct dst_entry *dst;
2095 	struct flowi6 fl6;
2096 
2097 	memset(&fl6, 0, sizeof(fl6));
2098 	fl6.flowi6_oif = oif;
2099 	fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
2100 	fl6.daddr = iph->daddr;
2101 	fl6.saddr = iph->saddr;
2102 	fl6.flowlabel = ip6_flowinfo(iph);
2103 	fl6.flowi6_uid = uid;
2104 
2105 	dst = ip6_route_output(net, NULL, &fl6);
2106 	if (!dst->error)
2107 		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2108 	dst_release(dst);
2109 }
2110 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2111 
2112 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2113 {
2114 	struct dst_entry *dst;
2115 
2116 	ip6_update_pmtu(skb, sock_net(sk), mtu,
2117 			sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);
2118 
2119 	dst = __sk_dst_get(sk);
2120 	if (!dst || !dst->obsolete ||
2121 	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2122 		return;
2123 
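	/* The socket's cached dst just failed ->check() above, so refresh
	 * it for connected datagram sockets; v4-mapped destinations are
	 * handled by the IPv4 PMTU path instead.
	 */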
2124 	bh_lock_sock(sk);
2125 	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2126 		ip6_datagram_dst_update(sk, false);
2127 	bh_unlock_sock(sk);
2128 }
2129 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2130 
2131 /* Handle redirects */
2132 struct ip6rd_flowi {
2133 	struct flowi6 fl6;
2134 	struct in6_addr gateway;
2135 };
2136 
2137 static struct rt6_info *__ip6_route_redirect(struct net *net,
2138 					     struct fib6_table *table,
2139 					     struct flowi6 *fl6,
2140 					     int flags)
2141 {
2142 	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2143 	struct rt6_info *rt, *rt_cache;
2144 	struct fib6_node *fn;
2145 
2146 	/* Get the "current" route for this destination and
2147 	 * check if the redirect has come from the appropriate router.
2148 	 *
2149 	 * RFC 4861 specifies that redirects should only be
2150 	 * accepted if they come from the nexthop to the target.
2151 	 * Due to the way the routes are chosen, this notion
2152 	 * is a bit fuzzy and one might need to check all possible
2153 	 * routes.
2154 	 */
2155 
2156 	rcu_read_lock();
2157 	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2158 restart:
2159 	for_each_fib6_node_rt_rcu(fn) {
2160 		if (rt->rt6i_nh_flags & RTNH_F_DEAD)
2161 			continue;
2162 		if (rt6_check_expired(rt))
2163 			continue;
2164 		if (rt->dst.error)
2165 			break;
2166 		if (!(rt->rt6i_flags & RTF_GATEWAY))
2167 			continue;
2168 		if (fl6->flowi6_oif != rt->dst.dev->ifindex)
2169 			continue;
2170 		/* rt_cache's gateway might be different from its 'parent'
2171 		 * in the case of an ip redirect.
2172 		 * So we keep searching in the exception table if the gateway
2173 		 * is different.
2174 		 */
2175 		if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) {
2176 			rt_cache = rt6_find_cached_rt(rt,
2177 						      &fl6->daddr,
2178 						      &fl6->saddr);
2179 			if (rt_cache &&
2180 			    ipv6_addr_equal(&rdfl->gateway,
2181 					    &rt_cache->rt6i_gateway)) {
2182 				rt = rt_cache;
2183 				break;
2184 			}
2185 			continue;
2186 		}
2187 		break;
2188 	}
2189 
2190 	if (!rt)
2191 		rt = net->ipv6.ip6_null_entry;
2192 	else if (rt->dst.error) {
2193 		rt = net->ipv6.ip6_null_entry;
2194 		goto out;
2195 	}
2196 
2197 	if (rt == net->ipv6.ip6_null_entry) {
2198 		fn = fib6_backtrack(fn, &fl6->saddr);
2199 		if (fn)
2200 			goto restart;
2201 	}
2202 
2203 out:
2204 	ip6_hold_safe(net, &rt, true);
2205 
2206 	rcu_read_unlock();
2207 
2208 	trace_fib6_table_lookup(net, rt, table, fl6);
2209 	return rt;
2210 }
2211 
2212 static struct dst_entry *ip6_route_redirect(struct net *net,
2213 					const struct flowi6 *fl6,
2214 					const struct in6_addr *gateway)
2215 {
2216 	int flags = RT6_LOOKUP_F_HAS_SADDR;
2217 	struct ip6rd_flowi rdfl;
2218 
2219 	rdfl.fl6 = *fl6;
2220 	rdfl.gateway = *gateway;
2221 
2222 	return fib6_rule_lookup(net, &rdfl.fl6,
2223 				flags, __ip6_route_redirect);
2224 }
2225 
2226 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2227 		  kuid_t uid)
2228 {
2229 	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2230 	struct dst_entry *dst;
2231 	struct flowi6 fl6;
2232 
2233 	memset(&fl6, 0, sizeof(fl6));
2234 	fl6.flowi6_iif = LOOPBACK_IFINDEX;
2235 	fl6.flowi6_oif = oif;
2236 	fl6.flowi6_mark = mark;
2237 	fl6.daddr = iph->daddr;
2238 	fl6.saddr = iph->saddr;
2239 	fl6.flowlabel = ip6_flowinfo(iph);
2240 	fl6.flowi6_uid = uid;
2241 
2242 	dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
2243 	rt6_do_redirect(dst, NULL, skb);
2244 	dst_release(dst);
2245 }
2246 EXPORT_SYMBOL_GPL(ip6_redirect);
2247 
2248 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2249 			    u32 mark)
2250 {
2251 	const struct ipv6hdr *iph = ipv6_hdr(skb);
2252 	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2253 	struct dst_entry *dst;
2254 	struct flowi6 fl6;
2255 
2256 	memset(&fl6, 0, sizeof(fl6));
2257 	fl6.flowi6_iif = LOOPBACK_IFINDEX;
2258 	fl6.flowi6_oif = oif;
2259 	fl6.flowi6_mark = mark;
2260 	fl6.daddr = msg->dest;
2261 	fl6.saddr = iph->daddr;
2262 	fl6.flowi6_uid = sock_net_uid(net, NULL);
2263 
2264 	dst = ip6_route_redirect(net, &fl6, &iph->saddr);
2265 	rt6_do_redirect(dst, NULL, skb);
2266 	dst_release(dst);
2267 }
2268 
2269 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2270 {
2271 	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2272 		     sk->sk_uid);
2273 }
2274 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2275 
2276 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2277 {
2278 	struct net_device *dev = dst->dev;
2279 	unsigned int mtu = dst_mtu(dst);
2280 	struct net *net = dev_net(dev);
2281 
2282 	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
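	/* e.g. a 1500-byte link MTU leaves 1500 - 40 - 20 = 1440 bytes of
	 * advertised MSS before the clamps below are applied.
	 */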
2283 
2284 	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2285 		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2286 
2287 	/*
2288 	 * The maximal non-jumbo IPv6 payload is IPV6_MAXPLEN, and the
2289 	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2290 	 * IPV6_MAXPLEN is also valid and means: "any MSS,
2291 	 * rely only on pmtu discovery"
2292 	 */
2293 	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2294 		mtu = IPV6_MAXPLEN;
2295 	return mtu;
2296 }
2297 
2298 static unsigned int ip6_mtu(const struct dst_entry *dst)
2299 {
2300 	const struct rt6_info *rt = (const struct rt6_info *)dst;
2301 	unsigned int mtu = rt->rt6i_pmtu;
2302 	struct inet6_dev *idev;
2303 
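	/* Precedence: a per-route PMTU learned via an ICMPv6 "packet too
	 * big", then an explicit RTAX_MTU metric, then the device MTU;
	 * the result is capped at IP6_MAX_MTU and reduced by any lwtunnel
	 * encapsulation headroom.
	 */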
2304 	if (mtu)
2305 		goto out;
2306 
2307 	mtu = dst_metric_raw(dst, RTAX_MTU);
2308 	if (mtu)
2309 		goto out;
2310 
2311 	mtu = IPV6_MIN_MTU;
2312 
2313 	rcu_read_lock();
2314 	idev = __in6_dev_get(dst->dev);
2315 	if (idev)
2316 		mtu = idev->cnf.mtu6;
2317 	rcu_read_unlock();
2318 
2319 out:
2320 	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2321 
2322 	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2323 }
2324 
2325 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2326 				  struct flowi6 *fl6)
2327 {
2328 	struct dst_entry *dst;
2329 	struct rt6_info *rt;
2330 	struct inet6_dev *idev = in6_dev_get(dev);
2331 	struct net *net = dev_net(dev);
2332 
2333 	if (unlikely(!idev))
2334 		return ERR_PTR(-ENODEV);
2335 
2336 	rt = ip6_dst_alloc(net, dev, 0);
2337 	if (unlikely(!rt)) {
2338 		in6_dev_put(idev);
2339 		dst = ERR_PTR(-ENOMEM);
2340 		goto out;
2341 	}
2342 
2343 	rt->dst.flags |= DST_HOST;
2344 	rt->dst.input = ip6_input;
2345 	rt->dst.output  = ip6_output;
2346 	rt->rt6i_gateway  = fl6->daddr;
2347 	rt->rt6i_dst.addr = fl6->daddr;
2348 	rt->rt6i_dst.plen = 128;
2349 	rt->rt6i_idev     = idev;
2350 	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2351 
2352 	/* Add this dst into uncached_list so that rt6_disable_ip() can
2353 	 * properly release the net_device
2354 	 */
2355 	rt6_uncached_list_add(rt);
2356 	atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2357 
2358 	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2359 
2360 out:
2361 	return dst;
2362 }
2363 
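/* Garbage-collect cached routes under allocation pressure (this is the
 * dst_ops->gc hook, invoked from dst_alloc()).  ip6_rt_gc_expire is an
 * adaptive idle threshold: the decay at "out:" shrinks it while the
 * table stays full, expiring entries more aggressively, and it is
 * reset to half of ip6_rt_gc_timeout once GC gets below gc_thresh.
 */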
2364 static int ip6_dst_gc(struct dst_ops *ops)
2365 {
2366 	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2367 	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2368 	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2369 	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2370 	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2371 	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2372 	int entries;
2373 
2374 	entries = dst_entries_get_fast(ops);
2375 	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2376 	    entries <= rt_max_size)
2377 		goto out;
2378 
2379 	net->ipv6.ip6_rt_gc_expire++;
2380 	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2381 	entries = dst_entries_get_slow(ops);
2382 	if (entries < ops->gc_thresh)
2383 		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2384 out:
2385 	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2386 	return entries > rt_max_size;
2387 }
2388 
2389 static int ip6_convert_metrics(struct mx6_config *mxc,
2390 			       const struct fib6_config *cfg)
2391 {
2392 	struct net *net = cfg->fc_nlinfo.nl_net;
2393 	bool ecn_ca = false;
2394 	struct nlattr *nla;
2395 	int remaining;
2396 	u32 *mp;
2397 
2398 	if (!cfg->fc_mx)
2399 		return 0;
2400 
2401 	mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
2402 	if (unlikely(!mp))
2403 		return -ENOMEM;
2404 
2405 	nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
2406 		int type = nla_type(nla);
2407 		u32 val;
2408 
2409 		if (!type)
2410 			continue;
2411 		if (unlikely(type > RTAX_MAX))
2412 			goto err;
2413 
2414 		if (type == RTAX_CC_ALGO) {
2415 			char tmp[TCP_CA_NAME_MAX];
2416 
2417 			nla_strlcpy(tmp, nla, sizeof(tmp));
2418 			val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca);
2419 			if (val == TCP_CA_UNSPEC)
2420 				goto err;
2421 		} else {
2422 			val = nla_get_u32(nla);
2423 		}
2424 		if (type == RTAX_HOPLIMIT && val > 255)
2425 			val = 255;
2426 		if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
2427 			goto err;
2428 
2429 		mp[type - 1] = val;
2430 		__set_bit(type - 1, mxc->mx_valid);
2431 	}
2432 
2433 	if (ecn_ca) {
2434 		__set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
2435 		mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
2436 	}
2437 
2438 	mxc->mx = mp;
2439 	return 0;
2440  err:
2441 	kfree(mp);
2442 	return -EINVAL;
2443 }
2444 
2445 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2446 					    struct fib6_config *cfg,
2447 					    const struct in6_addr *gw_addr,
2448 					    u32 tbid, int flags)
2449 {
2450 	struct flowi6 fl6 = {
2451 		.flowi6_oif = cfg->fc_ifindex,
2452 		.daddr = *gw_addr,
2453 		.saddr = cfg->fc_prefsrc,
2454 	};
2455 	struct fib6_table *table;
2456 	struct rt6_info *rt;
2457 
2458 	table = fib6_get_table(net, tbid);
2459 	if (!table)
2460 		return NULL;
2461 
2462 	if (!ipv6_addr_any(&cfg->fc_prefsrc))
2463 		flags |= RT6_LOOKUP_F_HAS_SADDR;
2464 
2465 	flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2466 	rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags);
2467 
2468 	/* if table lookup failed, fall back to full lookup */
2469 	if (rt == net->ipv6.ip6_null_entry) {
2470 		ip6_rt_put(rt);
2471 		rt = NULL;
2472 	}
2473 
2474 	return rt;
2475 }
2476 
2477 static int ip6_route_check_nh_onlink(struct net *net,
2478 				     struct fib6_config *cfg,
2479 				     struct net_device *dev,
2480 				     struct netlink_ext_ack *extack)
2481 {
2482 	u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2483 	const struct in6_addr *gw_addr = &cfg->fc_gateway;
2484 	u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2485 	struct rt6_info *grt;
2486 	int err;
2487 
2488 	err = 0;
2489 	grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2490 	if (grt) {
2491 		if (!grt->dst.error &&
2492 		    (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2493 			NL_SET_ERR_MSG(extack,
2494 				       "Nexthop has invalid gateway or device mismatch");
2495 			err = -EINVAL;
2496 		}
2497 
2498 		ip6_rt_put(grt);
2499 	}
2500 
2501 	return err;
2502 }
2503 
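/* Resolve (or verify) the egress device for a gateway route: look the
 * gateway up, first in the table the route is being added to and then
 * globally, and borrow dev/idev from the answer.  The lookup result
 * must not point at yet another gateway, and must match any device the
 * caller already specified.
 */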
2504 static int ip6_route_check_nh(struct net *net,
2505 			      struct fib6_config *cfg,
2506 			      struct net_device **_dev,
2507 			      struct inet6_dev **idev)
2508 {
2509 	const struct in6_addr *gw_addr = &cfg->fc_gateway;
2510 	struct net_device *dev = _dev ? *_dev : NULL;
2511 	struct rt6_info *grt = NULL;
2512 	int err = -EHOSTUNREACH;
2513 
2514 	if (cfg->fc_table) {
2515 		int flags = RT6_LOOKUP_F_IFACE;
2516 
2517 		grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2518 					  cfg->fc_table, flags);
2519 		if (grt) {
2520 			if (grt->rt6i_flags & RTF_GATEWAY ||
2521 			    (dev && dev != grt->dst.dev)) {
2522 				ip6_rt_put(grt);
2523 				grt = NULL;
2524 			}
2525 		}
2526 	}
2527 
2528 	if (!grt)
2529 		grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
2530 
2531 	if (!grt)
2532 		goto out;
2533 
2534 	if (dev) {
2535 		if (dev != grt->dst.dev) {
2536 			ip6_rt_put(grt);
2537 			goto out;
2538 		}
2539 	} else {
2540 		*_dev = dev = grt->dst.dev;
2541 		*idev = grt->rt6i_idev;
2542 		dev_hold(dev);
2543 		in6_dev_hold(grt->rt6i_idev);
2544 	}
2545 
2546 	if (!(grt->rt6i_flags & RTF_GATEWAY))
2547 		err = 0;
2548 
2549 	ip6_rt_put(grt);
2550 
2551 out:
2552 	return err;
2553 }
2554 
2555 static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
2556 					      struct netlink_ext_ack *extack)
2557 {
2558 	struct net *net = cfg->fc_nlinfo.nl_net;
2559 	struct rt6_info *rt = NULL;
2560 	struct net_device *dev = NULL;
2561 	struct inet6_dev *idev = NULL;
2562 	struct fib6_table *table;
2563 	int addr_type;
2564 	int err = -EINVAL;
2565 
2566 	/* RTF_PCPU is an internal flag; it cannot be set by userspace */
2567 	if (cfg->fc_flags & RTF_PCPU) {
2568 		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2569 		goto out;
2570 	}
2571 
2572 	/* RTF_CACHE is an internal flag; it cannot be set by userspace */
2573 	if (cfg->fc_flags & RTF_CACHE) {
2574 		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
2575 		goto out;
2576 	}
2577 
2578 	if (cfg->fc_dst_len > 128) {
2579 		NL_SET_ERR_MSG(extack, "Invalid prefix length");
2580 		goto out;
2581 	}
2582 	if (cfg->fc_src_len > 128) {
2583 		NL_SET_ERR_MSG(extack, "Invalid source address length");
2584 		goto out;
2585 	}
2586 #ifndef CONFIG_IPV6_SUBTREES
2587 	if (cfg->fc_src_len) {
2588 		NL_SET_ERR_MSG(extack,
2589 			       "Specifying source address requires IPV6_SUBTREES to be enabled");
2590 		goto out;
2591 	}
2592 #endif
2593 	if (cfg->fc_ifindex) {
2594 		err = -ENODEV;
2595 		dev = dev_get_by_index(net, cfg->fc_ifindex);
2596 		if (!dev)
2597 			goto out;
2598 		idev = in6_dev_get(dev);
2599 		if (!idev)
2600 			goto out;
2601 	}
2602 
2603 	if (cfg->fc_metric == 0)
2604 		cfg->fc_metric = IP6_RT_PRIO_USER;
2605 
2606 	if (cfg->fc_flags & RTNH_F_ONLINK) {
2607 		if (!dev) {
2608 			NL_SET_ERR_MSG(extack,
2609 				       "Nexthop device required for onlink");
2610 			err = -ENODEV;
2611 			goto out;
2612 		}
2613 
2614 		if (!(dev->flags & IFF_UP)) {
2615 			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
2616 			err = -ENETDOWN;
2617 			goto out;
2618 		}
2619 	}
2620 
2621 	err = -ENOBUFS;
2622 	if (cfg->fc_nlinfo.nlh &&
2623 	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
2624 		table = fib6_get_table(net, cfg->fc_table);
2625 		if (!table) {
2626 			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
2627 			table = fib6_new_table(net, cfg->fc_table);
2628 		}
2629 	} else {
2630 		table = fib6_new_table(net, cfg->fc_table);
2631 	}
2632 
2633 	if (!table)
2634 		goto out;
2635 
2636 	rt = ip6_dst_alloc(net, NULL,
2637 			   (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
2638 
2639 	if (!rt) {
2640 		err = -ENOMEM;
2641 		goto out;
2642 	}
2643 
2644 	if (cfg->fc_flags & RTF_EXPIRES)
2645 		rt6_set_expires(rt, jiffies +
2646 				clock_t_to_jiffies(cfg->fc_expires));
2647 	else
2648 		rt6_clean_expires(rt);
2649 
2650 	if (cfg->fc_protocol == RTPROT_UNSPEC)
2651 		cfg->fc_protocol = RTPROT_BOOT;
2652 	rt->rt6i_protocol = cfg->fc_protocol;
2653 
2654 	addr_type = ipv6_addr_type(&cfg->fc_dst);
2655 
2656 	if (addr_type & IPV6_ADDR_MULTICAST)
2657 		rt->dst.input = ip6_mc_input;
2658 	else if (cfg->fc_flags & RTF_LOCAL)
2659 		rt->dst.input = ip6_input;
2660 	else
2661 		rt->dst.input = ip6_forward;
2662 
2663 	rt->dst.output = ip6_output;
2664 
2665 	if (cfg->fc_encap) {
2666 		struct lwtunnel_state *lwtstate;
2667 
2668 		err = lwtunnel_build_state(cfg->fc_encap_type,
2669 					   cfg->fc_encap, AF_INET6, cfg,
2670 					   &lwtstate, extack);
2671 		if (err)
2672 			goto out;
2673 		rt->dst.lwtstate = lwtstate_get(lwtstate);
2674 		lwtunnel_set_redirect(&rt->dst);
2675 	}
2676 
2677 	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
2678 	rt->rt6i_dst.plen = cfg->fc_dst_len;
2679 	if (rt->rt6i_dst.plen == 128)
2680 		rt->dst.flags |= DST_HOST;
2681 
2682 #ifdef CONFIG_IPV6_SUBTREES
2683 	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
2684 	rt->rt6i_src.plen = cfg->fc_src_len;
2685 #endif
2686 
2687 	rt->rt6i_metric = cfg->fc_metric;
2688 	rt->rt6i_nh_weight = 1;
2689 
2690 	/* We cannot add true routes via loopback here;
2691 	   they would result in kernel looping, so promote them to reject routes
2692 	 */
2693 	if ((cfg->fc_flags & RTF_REJECT) ||
2694 	    (dev && (dev->flags & IFF_LOOPBACK) &&
2695 	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
2696 	     !(cfg->fc_flags & RTF_LOCAL))) {
2697 		/* hold loopback dev/idev if we haven't done so. */
2698 		if (dev != net->loopback_dev) {
2699 			if (dev) {
2700 				dev_put(dev);
2701 				in6_dev_put(idev);
2702 			}
2703 			dev = net->loopback_dev;
2704 			dev_hold(dev);
2705 			idev = in6_dev_get(dev);
2706 			if (!idev) {
2707 				err = -ENODEV;
2708 				goto out;
2709 			}
2710 		}
2711 		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
2712 		switch (cfg->fc_type) {
2713 		case RTN_BLACKHOLE:
2714 			rt->dst.error = -EINVAL;
2715 			rt->dst.output = dst_discard_out;
2716 			rt->dst.input = dst_discard;
2717 			break;
2718 		case RTN_PROHIBIT:
2719 			rt->dst.error = -EACCES;
2720 			rt->dst.output = ip6_pkt_prohibit_out;
2721 			rt->dst.input = ip6_pkt_prohibit;
2722 			break;
2723 		case RTN_THROW:
2724 		case RTN_UNREACHABLE:
2725 		default:
2726 			rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
2727 					: (cfg->fc_type == RTN_UNREACHABLE)
2728 					? -EHOSTUNREACH : -ENETUNREACH;
2729 			rt->dst.output = ip6_pkt_discard_out;
2730 			rt->dst.input = ip6_pkt_discard;
2731 			break;
2732 		}
2733 		goto install_route;
2734 	}
2735 
2736 	if (cfg->fc_flags & RTF_GATEWAY) {
2737 		const struct in6_addr *gw_addr;
2738 		int gwa_type;
2739 
2740 		gw_addr = &cfg->fc_gateway;
2741 		gwa_type = ipv6_addr_type(gw_addr);
2742 
2743 		/* If gw_addr is local we will fail to detect this while the
2744 		 * address is still TENTATIVE (DAD in progress): rt6_lookup()
2745 		 * will return the already-added prefix route via the interface
2746 		 * that the prefix route was assigned to, which might not be loopback.
2747 		 */
2748 		err = -EINVAL;
2749 		if (ipv6_chk_addr_and_flags(net, gw_addr,
2750 					    gwa_type & IPV6_ADDR_LINKLOCAL ?
2751 					    dev : NULL, 0, 0)) {
2752 			NL_SET_ERR_MSG(extack, "Invalid gateway address");
2753 			goto out;
2754 		}
2755 		rt->rt6i_gateway = *gw_addr;
2756 
2757 		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
2758 			/* IPv6 strictly forbids using non-link-local
2759 			   addresses as nexthop addresses; otherwise the
2760 			   router would not be able to send redirects.
2761 			   That is very good, but in some (rare!) circumstances
2762 			   (SIT, PtP, NBMA NOARP links) it is handy to allow
2763 			   some exceptions. --ANK
2764 			   We allow IPv4-mapped nexthops to support RFC 4798-style
2765 			   addressing
2766 			 */
2767 			if (!(gwa_type & (IPV6_ADDR_UNICAST |
2768 					  IPV6_ADDR_MAPPED))) {
2769 				NL_SET_ERR_MSG(extack,
2770 					       "Invalid gateway address");
2771 				goto out;
2772 			}
2773 
2774 			if (cfg->fc_flags & RTNH_F_ONLINK) {
2775 				err = ip6_route_check_nh_onlink(net, cfg, dev,
2776 								extack);
2777 			} else {
2778 				err = ip6_route_check_nh(net, cfg, &dev, &idev);
2779 			}
2780 			if (err)
2781 				goto out;
2782 		}
2783 		err = -EINVAL;
2784 		if (!dev) {
2785 			NL_SET_ERR_MSG(extack, "Egress device not specified");
2786 			goto out;
2787 		} else if (dev->flags & IFF_LOOPBACK) {
2788 			NL_SET_ERR_MSG(extack,
2789 				       "Egress device can not be loopback device for this route");
2790 			goto out;
2791 		}
2792 	}
2793 
2794 	err = -ENODEV;
2795 	if (!dev)
2796 		goto out;
2797 
2798 	if (!(dev->flags & IFF_UP)) {
2799 		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
2800 		err = -ENETDOWN;
2801 		goto out;
2802 	}
2803 
2804 	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
2805 		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
2806 			NL_SET_ERR_MSG(extack, "Invalid source address");
2807 			err = -EINVAL;
2808 			goto out;
2809 		}
2810 		rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
2811 		rt->rt6i_prefsrc.plen = 128;
2812 	} else
2813 		rt->rt6i_prefsrc.plen = 0;
2814 
2815 	rt->rt6i_flags = cfg->fc_flags;
2816 
2817 install_route:
2818 	if (!(rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
2819 	    !netif_carrier_ok(dev))
2820 		rt->rt6i_nh_flags |= RTNH_F_LINKDOWN;
2821 	rt->rt6i_nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
2822 	rt->dst.dev = dev;
2823 	rt->rt6i_idev = idev;
2824 	rt->rt6i_table = table;
2825 
2826 	cfg->fc_nlinfo.nl_net = dev_net(dev);
2827 
2828 	return rt;
2829 out:
2830 	if (dev)
2831 		dev_put(dev);
2832 	if (idev)
2833 		in6_dev_put(idev);
2834 	if (rt)
2835 		dst_release_immediate(&rt->dst);
2836 
2837 	return ERR_PTR(err);
2838 }
2839 
2840 int ip6_route_add(struct fib6_config *cfg,
2841 		  struct netlink_ext_ack *extack)
2842 {
2843 	struct mx6_config mxc = { .mx = NULL, };
2844 	struct rt6_info *rt;
2845 	int err;
2846 
2847 	rt = ip6_route_info_create(cfg, extack);
2848 	if (IS_ERR(rt)) {
2849 		err = PTR_ERR(rt);
2850 		rt = NULL;
2851 		goto out;
2852 	}
2853 
2854 	err = ip6_convert_metrics(&mxc, cfg);
2855 	if (err)
2856 		goto out;
2857 
2858 	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack);
2859 
2860 	kfree(mxc.mx);
2861 
2862 	return err;
2863 out:
2864 	if (rt)
2865 		dst_release_immediate(&rt->dst);
2866 
2867 	return err;
2868 }
2869 
2870 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
2871 {
2872 	int err;
2873 	struct fib6_table *table;
2874 	struct net *net = dev_net(rt->dst.dev);
2875 
2876 	if (rt == net->ipv6.ip6_null_entry) {
2877 		err = -ENOENT;
2878 		goto out;
2879 	}
2880 
2881 	table = rt->rt6i_table;
2882 	spin_lock_bh(&table->tb6_lock);
2883 	err = fib6_del(rt, info);
2884 	spin_unlock_bh(&table->tb6_lock);
2885 
2886 out:
2887 	ip6_rt_put(rt);
2888 	return err;
2889 }
2890 
2891 int ip6_del_rt(struct rt6_info *rt)
2892 {
2893 	struct nl_info info = {
2894 		.nl_net = dev_net(rt->dst.dev),
2895 	};
2896 	return __ip6_del_rt(rt, &info);
2897 }
2898 
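/* Delete a multipath route together with all of its sibling nexthops.
 * A single RTM_DELROUTE notification carrying every hop is preferred
 * over per-hop messages, so the netlink skb is built before the hops
 * are actually removed.
 */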
2899 static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
2900 {
2901 	struct nl_info *info = &cfg->fc_nlinfo;
2902 	struct net *net = info->nl_net;
2903 	struct sk_buff *skb = NULL;
2904 	struct fib6_table *table;
2905 	int err = -ENOENT;
2906 
2907 	if (rt == net->ipv6.ip6_null_entry)
2908 		goto out_put;
2909 	table = rt->rt6i_table;
2910 	spin_lock_bh(&table->tb6_lock);
2911 
2912 	if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
2913 		struct rt6_info *sibling, *next_sibling;
2914 
2915 		/* prefer to send a single notification with all hops */
2916 		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
2917 		if (skb) {
2918 			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2919 
2920 			if (rt6_fill_node(net, skb, rt,
2921 					  NULL, NULL, 0, RTM_DELROUTE,
2922 					  info->portid, seq, 0) < 0) {
2923 				kfree_skb(skb);
2924 				skb = NULL;
2925 			} else
2926 				info->skip_notify = 1;
2927 		}
2928 
2929 		list_for_each_entry_safe(sibling, next_sibling,
2930 					 &rt->rt6i_siblings,
2931 					 rt6i_siblings) {
2932 			err = fib6_del(sibling, info);
2933 			if (err)
2934 				goto out_unlock;
2935 		}
2936 	}
2937 
2938 	err = fib6_del(rt, info);
2939 out_unlock:
2940 	spin_unlock_bh(&table->tb6_lock);
2941 out_put:
2942 	ip6_rt_put(rt);
2943 
2944 	if (skb) {
2945 		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
2946 			    info->nlh, gfp_any());
2947 	}
2948 	return err;
2949 }
2950 
2951 static int ip6_route_del(struct fib6_config *cfg,
2952 			 struct netlink_ext_ack *extack)
2953 {
2954 	struct rt6_info *rt, *rt_cache;
2955 	struct fib6_table *table;
2956 	struct fib6_node *fn;
2957 	int err = -ESRCH;
2958 
2959 	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
2960 	if (!table) {
2961 		NL_SET_ERR_MSG(extack, "FIB table does not exist");
2962 		return err;
2963 	}
2964 
2965 	rcu_read_lock();
2966 
2967 	fn = fib6_locate(&table->tb6_root,
2968 			 &cfg->fc_dst, cfg->fc_dst_len,
2969 			 &cfg->fc_src, cfg->fc_src_len,
2970 			 !(cfg->fc_flags & RTF_CACHE));
2971 
2972 	if (fn) {
2973 		for_each_fib6_node_rt_rcu(fn) {
2974 			if (cfg->fc_flags & RTF_CACHE) {
2975 				rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
2976 							      &cfg->fc_src);
2977 				if (!rt_cache)
2978 					continue;
2979 				rt = rt_cache;
2980 			}
2981 			if (cfg->fc_ifindex &&
2982 			    (!rt->dst.dev ||
2983 			     rt->dst.dev->ifindex != cfg->fc_ifindex))
2984 				continue;
2985 			if (cfg->fc_flags & RTF_GATEWAY &&
2986 			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
2987 				continue;
2988 			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
2989 				continue;
2990 			if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
2991 				continue;
2992 			if (!dst_hold_safe(&rt->dst))
2993 				break;
2994 			rcu_read_unlock();
2995 
2996 			/* if a gateway was specified, only delete the one hop */
2997 			if (cfg->fc_flags & RTF_GATEWAY)
2998 				return __ip6_del_rt(rt, &cfg->fc_nlinfo);
2999 
3000 			return __ip6_del_rt_siblings(rt, cfg);
3001 		}
3002 	}
3003 	rcu_read_unlock();
3004 
3005 	return err;
3006 }
3007 
3008 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3009 {
3010 	struct netevent_redirect netevent;
3011 	struct rt6_info *rt, *nrt = NULL;
3012 	struct ndisc_options ndopts;
3013 	struct inet6_dev *in6_dev;
3014 	struct neighbour *neigh;
3015 	struct rd_msg *msg;
3016 	int optlen, on_link;
3017 	u8 *lladdr;
3018 
3019 	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3020 	optlen -= sizeof(*msg);
3021 
3022 	if (optlen < 0) {
3023 		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3024 		return;
3025 	}
3026 
3027 	msg = (struct rd_msg *)icmp6_hdr(skb);
3028 
3029 	if (ipv6_addr_is_multicast(&msg->dest)) {
3030 		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3031 		return;
3032 	}
3033 
3034 	on_link = 0;
3035 	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3036 		on_link = 1;
3037 	} else if (ipv6_addr_type(&msg->target) !=
3038 		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3039 		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3040 		return;
3041 	}
3042 
3043 	in6_dev = __in6_dev_get(skb->dev);
3044 	if (!in6_dev)
3045 		return;
3046 	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3047 		return;
3048 
3049 	/* RFC2461 8.1:
3050 	 *	The IP source address of the Redirect MUST be the same as the current
3051 	 *	first-hop router for the specified ICMP Destination Address.
3052 	 */
3053 
3054 	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3055 		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3056 		return;
3057 	}
3058 
3059 	lladdr = NULL;
3060 	if (ndopts.nd_opts_tgt_lladdr) {
3061 		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3062 					     skb->dev);
3063 		if (!lladdr) {
3064 			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3065 			return;
3066 		}
3067 	}
3068 
3069 	rt = (struct rt6_info *) dst;
3070 	if (rt->rt6i_flags & RTF_REJECT) {
3071 		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3072 		return;
3073 	}
3074 
3075 	/* Redirect received -> path was valid.
3076 	 * Look, redirects are sent only in response to data packets,
3077 	 * so this nexthop is apparently reachable. --ANK
3078 	 */
3079 	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3080 
3081 	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3082 	if (!neigh)
3083 		return;
3084 
3085 	/*
3086 	 *	We have finally decided to accept it.
3087 	 */
3088 
3089 	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3090 		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
3091 		     NEIGH_UPDATE_F_OVERRIDE|
3092 		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3093 				     NEIGH_UPDATE_F_ISROUTER)),
3094 		     NDISC_REDIRECT, &ndopts);
3095 
3096 	nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
3097 	if (!nrt)
3098 		goto out;
3099 
3100 	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3101 	if (on_link)
3102 		nrt->rt6i_flags &= ~RTF_GATEWAY;
3103 
3104 	nrt->rt6i_protocol = RTPROT_REDIRECT;
3105 	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3106 
3107 	/* No need to remove rt from the exception table if rt is
3108 	 * a cached route because rt6_insert_exception()
3109 	 * takes care of it
3110 	 */
3111 	if (rt6_insert_exception(nrt, rt)) {
3112 		dst_release_immediate(&nrt->dst);
3113 		goto out;
3114 	}
3115 
3116 	netevent.old = &rt->dst;
3117 	netevent.new = &nrt->dst;
3118 	netevent.daddr = &msg->dest;
3119 	netevent.neigh = neigh;
3120 	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3121 
3122 out:
3123 	neigh_release(neigh);
3124 }
3125 
3126 /*
3127  *	Misc support functions
3128  */
3129 
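/* Tie a cached clone to the route it was derived from.  The clone
 * shares its parent's metrics read-only; if the parent's metrics array
 * is later replaced, rt6_dst_from_metrics_check() re-points the clone
 * at the new one.
 */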
3130 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
3131 {
3132 	BUG_ON(from->from);
3133 
3134 	rt->rt6i_flags &= ~RTF_EXPIRES;
3135 	dst_hold(&from->dst);
3136 	rt->from = from;
3137 	dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
3138 }
3139 
3140 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
3141 {
3142 	rt->dst.input = ort->dst.input;
3143 	rt->dst.output = ort->dst.output;
3144 	rt->rt6i_dst = ort->rt6i_dst;
3145 	rt->dst.error = ort->dst.error;
3146 	rt->rt6i_idev = ort->rt6i_idev;
3147 	if (rt->rt6i_idev)
3148 		in6_dev_hold(rt->rt6i_idev);
3149 	rt->dst.lastuse = jiffies;
3150 	rt->rt6i_gateway = ort->rt6i_gateway;
3151 	rt->rt6i_flags = ort->rt6i_flags;
3152 	rt6_set_from(rt, ort);
3153 	rt->rt6i_metric = ort->rt6i_metric;
3154 #ifdef CONFIG_IPV6_SUBTREES
3155 	rt->rt6i_src = ort->rt6i_src;
3156 #endif
3157 	rt->rt6i_prefsrc = ort->rt6i_prefsrc;
3158 	rt->rt6i_table = ort->rt6i_table;
3159 	rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
3160 }
3161 
3162 #ifdef CONFIG_IPV6_ROUTE_INFO
3163 static struct rt6_info *rt6_get_route_info(struct net *net,
3164 					   const struct in6_addr *prefix, int prefixlen,
3165 					   const struct in6_addr *gwaddr,
3166 					   struct net_device *dev)
3167 {
3168 	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3169 	int ifindex = dev->ifindex;
3170 	struct fib6_node *fn;
3171 	struct rt6_info *rt = NULL;
3172 	struct fib6_table *table;
3173 
3174 	table = fib6_get_table(net, tb_id);
3175 	if (!table)
3176 		return NULL;
3177 
3178 	rcu_read_lock();
3179 	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3180 	if (!fn)
3181 		goto out;
3182 
3183 	for_each_fib6_node_rt_rcu(fn) {
3184 		if (rt->dst.dev->ifindex != ifindex)
3185 			continue;
3186 		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3187 			continue;
3188 		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
3189 			continue;
3190 		ip6_hold_safe(NULL, &rt, false);
3191 		break;
3192 	}
3193 out:
3194 	rcu_read_unlock();
3195 	return rt;
3196 }
3197 
3198 static struct rt6_info *rt6_add_route_info(struct net *net,
3199 					   const struct in6_addr *prefix, int prefixlen,
3200 					   const struct in6_addr *gwaddr,
3201 					   struct net_device *dev,
3202 					   unsigned int pref)
3203 {
3204 	struct fib6_config cfg = {
3205 		.fc_metric	= IP6_RT_PRIO_USER,
3206 		.fc_ifindex	= dev->ifindex,
3207 		.fc_dst_len	= prefixlen,
3208 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3209 				  RTF_UP | RTF_PREF(pref),
3210 		.fc_protocol = RTPROT_RA,
3211 		.fc_nlinfo.portid = 0,
3212 		.fc_nlinfo.nlh = NULL,
3213 		.fc_nlinfo.nl_net = net,
3214 	};
3215 
3216 	cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3217 	cfg.fc_dst = *prefix;
3218 	cfg.fc_gateway = *gwaddr;
3219 
3220 	/* We should treat it as a default route if prefix length is 0. */
3221 	if (!prefixlen)
3222 		cfg.fc_flags |= RTF_DEFAULT;
3223 
3224 	ip6_route_add(&cfg, NULL);
3225 
3226 	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3227 }
3228 #endif
3229 
3230 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
3231 {
3232 	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3233 	struct rt6_info *rt;
3234 	struct fib6_table *table;
3235 
3236 	table = fib6_get_table(dev_net(dev), tb_id);
3237 	if (!table)
3238 		return NULL;
3239 
3240 	rcu_read_lock();
3241 	for_each_fib6_node_rt_rcu(&table->tb6_root) {
3242 		if (dev == rt->dst.dev &&
3243 		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3244 		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
3245 			break;
3246 	}
3247 	if (rt)
3248 		ip6_hold_safe(NULL, &rt, false);
3249 	rcu_read_unlock();
3250 	return rt;
3251 }
3252 
3253 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
3254 				     struct net_device *dev,
3255 				     unsigned int pref)
3256 {
3257 	struct fib6_config cfg = {
3258 		.fc_table	= l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3259 		.fc_metric	= IP6_RT_PRIO_USER,
3260 		.fc_ifindex	= dev->ifindex,
3261 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3262 				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3263 		.fc_protocol = RTPROT_RA,
3264 		.fc_nlinfo.portid = 0,
3265 		.fc_nlinfo.nlh = NULL,
3266 		.fc_nlinfo.nl_net = dev_net(dev),
3267 	};
3268 
3269 	cfg.fc_gateway = *gwaddr;
3270 
3271 	if (!ip6_route_add(&cfg, NULL)) {
3272 		struct fib6_table *table;
3273 
3274 		table = fib6_get_table(dev_net(dev), cfg.fc_table);
3275 		if (table)
3276 			table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3277 	}
3278 
3279 	return rt6_get_dflt_router(gwaddr, dev);
3280 }
3281 
3282 static void __rt6_purge_dflt_routers(struct fib6_table *table)
3283 {
3284 	struct rt6_info *rt;
3285 
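	/* Each deletion invalidates the RCU walk, so the lock is dropped
	 * around ip6_del_rt() and the scan restarts from the table root.
	 */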
3286 restart:
3287 	rcu_read_lock();
3288 	for_each_fib6_node_rt_rcu(&table->tb6_root) {
3289 		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3290 		    (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
3291 			if (dst_hold_safe(&rt->dst)) {
3292 				rcu_read_unlock();
3293 				ip6_del_rt(rt);
3294 			} else {
3295 				rcu_read_unlock();
3296 			}
3297 			goto restart;
3298 		}
3299 	}
3300 	rcu_read_unlock();
3301 
3302 	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3303 }
3304 
3305 void rt6_purge_dflt_routers(struct net *net)
3306 {
3307 	struct fib6_table *table;
3308 	struct hlist_head *head;
3309 	unsigned int h;
3310 
3311 	rcu_read_lock();
3312 
3313 	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3314 		head = &net->ipv6.fib_table_hash[h];
3315 		hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3316 			if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3317 				__rt6_purge_dflt_routers(table);
3318 		}
3319 	}
3320 
3321 	rcu_read_unlock();
3322 }
3323 
3324 static void rtmsg_to_fib6_config(struct net *net,
3325 				 struct in6_rtmsg *rtmsg,
3326 				 struct fib6_config *cfg)
3327 {
3328 	memset(cfg, 0, sizeof(*cfg));
3329 
3330 	cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3331 			 : RT6_TABLE_MAIN;
3332 	cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3333 	cfg->fc_metric = rtmsg->rtmsg_metric;
3334 	cfg->fc_expires = rtmsg->rtmsg_info;
3335 	cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3336 	cfg->fc_src_len = rtmsg->rtmsg_src_len;
3337 	cfg->fc_flags = rtmsg->rtmsg_flags;
3338 
3339 	cfg->fc_nlinfo.nl_net = net;
3340 
3341 	cfg->fc_dst = rtmsg->rtmsg_dst;
3342 	cfg->fc_src = rtmsg->rtmsg_src;
3343 	cfg->fc_gateway = rtmsg->rtmsg_gateway;
3344 }
3345 
3346 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3347 {
3348 	struct fib6_config cfg;
3349 	struct in6_rtmsg rtmsg;
3350 	int err;
3351 
3352 	switch (cmd) {
3353 	case SIOCADDRT:		/* Add a route */
3354 	case SIOCDELRT:		/* Delete a route */
3355 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3356 			return -EPERM;
3357 		err = copy_from_user(&rtmsg, arg,
3358 				     sizeof(struct in6_rtmsg));
3359 		if (err)
3360 			return -EFAULT;
3361 
3362 		rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3363 
3364 		rtnl_lock();
3365 		switch (cmd) {
3366 		case SIOCADDRT:
3367 			err = ip6_route_add(&cfg, NULL);
3368 			break;
3369 		case SIOCDELRT:
3370 			err = ip6_route_del(&cfg, NULL);
3371 			break;
3372 		default:
3373 			err = -EINVAL;
3374 		}
3375 		rtnl_unlock();
3376 
3377 		return err;
3378 	}
3379 
3380 	return -EINVAL;
3381 }
3382 
3383 /*
3384  *	Drop the packet on the floor
3385  */
3386 
3387 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3388 {
3389 	int type;
3390 	struct dst_entry *dst = skb_dst(skb);
3391 	switch (ipstats_mib_noroutes) {
3392 	case IPSTATS_MIB_INNOROUTES:
3393 		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3394 		if (type == IPV6_ADDR_ANY) {
3395 			IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3396 				      IPSTATS_MIB_INADDRERRORS);
3397 			break;
3398 		}
3399 		/* FALLTHROUGH */
3400 	case IPSTATS_MIB_OUTNOROUTES:
3401 		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3402 			      ipstats_mib_noroutes);
3403 		break;
3404 	}
3405 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3406 	kfree_skb(skb);
3407 	return 0;
3408 }
3409 
3410 static int ip6_pkt_discard(struct sk_buff *skb)
3411 {
3412 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3413 }
3414 
3415 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3416 {
3417 	skb->dev = skb_dst(skb)->dev;
3418 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3419 }
3420 
3421 static int ip6_pkt_prohibit(struct sk_buff *skb)
3422 {
3423 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3424 }
3425 
3426 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3427 {
3428 	skb->dev = skb_dst(skb)->dev;
3429 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3430 }
3431 
3432 /*
3433  *	Allocate a dst for local (unicast / anycast) address.
3434  */
3435 
3436 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
3437 				    const struct in6_addr *addr,
3438 				    bool anycast)
3439 {
3440 	u32 tb_id;
3441 	struct net *net = dev_net(idev->dev);
3442 	struct net_device *dev = idev->dev;
3443 	struct rt6_info *rt;
3444 
3445 	rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
3446 	if (!rt)
3447 		return ERR_PTR(-ENOMEM);
3448 
3449 	in6_dev_hold(idev);
3450 
3451 	rt->dst.flags |= DST_HOST;
3452 	rt->dst.input = ip6_input;
3453 	rt->dst.output = ip6_output;
3454 	rt->rt6i_idev = idev;
3455 
3456 	rt->rt6i_protocol = RTPROT_KERNEL;
3457 	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
3458 	if (anycast)
3459 		rt->rt6i_flags |= RTF_ANYCAST;
3460 	else
3461 		rt->rt6i_flags |= RTF_LOCAL;
3462 
3463 	rt->rt6i_gateway  = *addr;
3464 	rt->rt6i_dst.addr = *addr;
3465 	rt->rt6i_dst.plen = 128;
3466 	tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3467 	rt->rt6i_table = fib6_get_table(net, tb_id);
3468 
3469 	return rt;
3470 }
3471 
3472 /* remove deleted IP from prefsrc entries */
3473 struct arg_dev_net_ip {
3474 	struct net_device *dev;
3475 	struct net *net;
3476 	struct in6_addr *addr;
3477 };
3478 
3479 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
3480 {
3481 	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3482 	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3483 	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3484 
3485 	if (((void *)rt->dst.dev == dev || !dev) &&
3486 	    rt != net->ipv6.ip6_null_entry &&
3487 	    ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
3488 		spin_lock_bh(&rt6_exception_lock);
3489 		/* remove prefsrc entry */
3490 		rt->rt6i_prefsrc.plen = 0;
3491 		/* need to update cache as well */
3492 		rt6_exceptions_remove_prefsrc(rt);
3493 		spin_unlock_bh(&rt6_exception_lock);
3494 	}
3495 	return 0;
3496 }
3497 
3498 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3499 {
3500 	struct net *net = dev_net(ifp->idev->dev);
3501 	struct arg_dev_net_ip adni = {
3502 		.dev = ifp->idev->dev,
3503 		.net = net,
3504 		.addr = &ifp->addr,
3505 	};
3506 	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3507 }
3508 
3509 #define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3510 
3511 /* Remove routers and update dst entries when a gateway turns into a host. */
3512 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
3513 {
3514 	struct in6_addr *gateway = (struct in6_addr *)arg;
3515 
3516 	if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3517 	    ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
3518 		return -1;
3519 	}
3520 
3521 	/* Further clean up cached routes in exception table.
3522 	 * This is needed because a cached route may have a different
3523 	 * gateway than its 'parent' in the case of an ip redirect.
3524 	 */
3525 	rt6_exceptions_clean_tohost(rt, gateway);
3526 
3527 	return 0;
3528 }
3529 
3530 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3531 {
3532 	fib6_clean_all(net, fib6_clean_tohost, gateway);
3533 }
3534 
3535 struct arg_netdev_event {
3536 	const struct net_device *dev;
3537 	union {
3538 		unsigned int nh_flags;
3539 		unsigned long event;
3540 	};
3541 };
3542 
3543 static struct rt6_info *rt6_multipath_first_sibling(const struct rt6_info *rt)
3544 {
3545 	struct rt6_info *iter;
3546 	struct fib6_node *fn;
3547 
3548 	fn = rcu_dereference_protected(rt->rt6i_node,
3549 			lockdep_is_held(&rt->rt6i_table->tb6_lock));
3550 	iter = rcu_dereference_protected(fn->leaf,
3551 			lockdep_is_held(&rt->rt6i_table->tb6_lock));
3552 	while (iter) {
3553 		if (iter->rt6i_metric == rt->rt6i_metric &&
3554 		    rt6_qualify_for_ecmp(iter))
3555 			return iter;
3556 		iter = rcu_dereference_protected(iter->rt6_next,
3557 				lockdep_is_held(&rt->rt6i_table->tb6_lock));
3558 	}
3559 
3560 	return NULL;
3561 }
3562 
3563 static bool rt6_is_dead(const struct rt6_info *rt)
3564 {
3565 	if (rt->rt6i_nh_flags & RTNH_F_DEAD ||
3566 	    (rt->rt6i_nh_flags & RTNH_F_LINKDOWN &&
3567 	     rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
3568 		return true;
3569 
3570 	return false;
3571 }
3572 
3573 static int rt6_multipath_total_weight(const struct rt6_info *rt)
3574 {
3575 	struct rt6_info *iter;
3576 	int total = 0;
3577 
3578 	if (!rt6_is_dead(rt))
3579 		total += rt->rt6i_nh_weight;
3580 
3581 	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) {
3582 		if (!rt6_is_dead(iter))
3583 			total += iter->rt6i_nh_weight;
3584 	}
3585 
3586 	return total;
3587 }
3588 
3589 static void rt6_upper_bound_set(struct rt6_info *rt, int *weight, int total)
3590 {
3591 	int upper_bound = -1;
3592 
3593 	if (!rt6_is_dead(rt)) {
3594 		*weight += rt->rt6i_nh_weight;
3595 		upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3596 						    total) - 1;
3597 	}
3598 	atomic_set(&rt->rt6i_nh_upper_bound, upper_bound);
3599 }
3600 
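/* Hash-threshold nexthop selection (in the spirit of RFC 2992): each
 * live sibling is assigned the upper bound of its slice of the 31-bit
 * hash space.  Example: weights 1 and 2 give total = 3, so the bounds
 * become round(1/3 * 2^31) - 1 and round(3/3 * 2^31) - 1 = 0x7fffffff,
 * and a flow hash selects the two nexthops in a 1:2 ratio.
 */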
3601 static void rt6_multipath_upper_bound_set(struct rt6_info *rt, int total)
3602 {
3603 	struct rt6_info *iter;
3604 	int weight = 0;
3605 
3606 	rt6_upper_bound_set(rt, &weight, total);
3607 
3608 	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3609 		rt6_upper_bound_set(iter, &weight, total);
3610 }
3611 
3612 void rt6_multipath_rebalance(struct rt6_info *rt)
3613 {
3614 	struct rt6_info *first;
3615 	int total;
3616 
3617 	/* If the entire multipath route was marked for flushing,
3618 	 * there is no need to rebalance upon the removal of every
3619 	 * sibling route.
3620 	 */
3621 	if (!rt->rt6i_nsiblings || rt->should_flush)
3622 		return;
3623 
3624 	/* During lookup routes are evaluated in order, so we need to
3625 	 * make sure upper bounds are assigned from the first sibling
3626 	 * onwards.
3627 	 */
3628 	first = rt6_multipath_first_sibling(rt);
3629 	if (WARN_ON_ONCE(!first))
3630 		return;
3631 
3632 	total = rt6_multipath_total_weight(first);
3633 	rt6_multipath_upper_bound_set(first, total);
3634 }
3635 
3636 static int fib6_ifup(struct rt6_info *rt, void *p_arg)
3637 {
3638 	const struct arg_netdev_event *arg = p_arg;
3639 	const struct net *net = dev_net(arg->dev);
3640 
3641 	if (rt != net->ipv6.ip6_null_entry && rt->dst.dev == arg->dev) {
3642 		rt->rt6i_nh_flags &= ~arg->nh_flags;
3643 		fib6_update_sernum_upto_root(dev_net(rt->dst.dev), rt);
3644 		rt6_multipath_rebalance(rt);
3645 	}
3646 
3647 	return 0;
3648 }
3649 
3650 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
3651 {
3652 	struct arg_netdev_event arg = {
3653 		.dev = dev,
3654 		{
3655 			.nh_flags = nh_flags,
3656 		},
3657 	};
3658 
3659 	if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
3660 		arg.nh_flags |= RTNH_F_LINKDOWN;
3661 
3662 	fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
3663 }
3664 
3665 static bool rt6_multipath_uses_dev(const struct rt6_info *rt,
3666 				   const struct net_device *dev)
3667 {
3668 	struct rt6_info *iter;
3669 
3670 	if (rt->dst.dev == dev)
3671 		return true;
3672 	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3673 		if (iter->dst.dev == dev)
3674 			return true;
3675 
3676 	return false;
3677 }
3678 
3679 static void rt6_multipath_flush(struct rt6_info *rt)
3680 {
3681 	struct rt6_info *iter;
3682 
3683 	rt->should_flush = 1;
3684 	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3685 		iter->should_flush = 1;
3686 }
3687 
3688 static unsigned int rt6_multipath_dead_count(const struct rt6_info *rt,
3689 					     const struct net_device *down_dev)
3690 {
3691 	struct rt6_info *iter;
3692 	unsigned int dead = 0;
3693 
3694 	if (rt->dst.dev == down_dev || rt->rt6i_nh_flags & RTNH_F_DEAD)
3695 		dead++;
3696 	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3697 		if (iter->dst.dev == down_dev ||
3698 		    iter->rt6i_nh_flags & RTNH_F_DEAD)
3699 			dead++;
3700 
3701 	return dead;
3702 }
3703 
3704 static void rt6_multipath_nh_flags_set(struct rt6_info *rt,
3705 				       const struct net_device *dev,
3706 				       unsigned int nh_flags)
3707 {
3708 	struct rt6_info *iter;
3709 
3710 	if (rt->dst.dev == dev)
3711 		rt->rt6i_nh_flags |= nh_flags;
3712 	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3713 		if (iter->dst.dev == dev)
3714 			iter->rt6i_nh_flags |= nh_flags;
3715 }
3716 
3717 /* called with write lock held for table with rt; returning -1 deletes rt, -2 means the whole multipath route was handled and its siblings can be skipped */
3718 static int fib6_ifdown(struct rt6_info *rt, void *p_arg)
3719 {
3720 	const struct arg_netdev_event *arg = p_arg;
3721 	const struct net_device *dev = arg->dev;
3722 	const struct net *net = dev_net(dev);
3723 
3724 	if (rt == net->ipv6.ip6_null_entry)
3725 		return 0;
3726 
3727 	switch (arg->event) {
3728 	case NETDEV_UNREGISTER:
3729 		return rt->dst.dev == dev ? -1 : 0;
3730 	case NETDEV_DOWN:
3731 		if (rt->should_flush)
3732 			return -1;
3733 		if (!rt->rt6i_nsiblings)
3734 			return rt->dst.dev == dev ? -1 : 0;
3735 		if (rt6_multipath_uses_dev(rt, dev)) {
3736 			unsigned int count;
3737 
3738 			count = rt6_multipath_dead_count(rt, dev);
3739 			if (rt->rt6i_nsiblings + 1 == count) {
3740 				rt6_multipath_flush(rt);
3741 				return -1;
3742 			}
3743 			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
3744 						   RTNH_F_LINKDOWN);
3745 			fib6_update_sernum(rt);
3746 			rt6_multipath_rebalance(rt);
3747 		}
3748 		return -2;
3749 	case NETDEV_CHANGE:
3750 		if (rt->dst.dev != dev ||
3751 		    rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST))
3752 			break;
3753 		rt->rt6i_nh_flags |= RTNH_F_LINKDOWN;
3754 		rt6_multipath_rebalance(rt);
3755 		break;
3756 	}
3757 
3758 	return 0;
3759 }
3760 
3761 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
3762 {
3763 	struct arg_netdev_event arg = {
3764 		.dev = dev,
3765 		{
3766 			.event = event,
3767 		},
3768 	};
3769 
3770 	fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
3771 }
3772 
3773 void rt6_disable_ip(struct net_device *dev, unsigned long event)
3774 {
3775 	rt6_sync_down_dev(dev, event);
3776 	rt6_uncached_list_flush_dev(dev_net(dev), dev);
3777 	neigh_ifdown(&nd_tbl, dev);
3778 }
3779 
3780 struct rt6_mtu_change_arg {
3781 	struct net_device *dev;
3782 	unsigned int mtu;
3783 };
3784 
3785 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
3786 {
3787 	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
3788 	struct inet6_dev *idev;
3789 
3790 	/* In IPv6 pmtu discovery is not optional,
3791 	   so the RTAX_MTU lock cannot disable it.
3792 	   We still use this lock to block changes
3793 	   caused by addrconf/ndisc.
3794 	*/
3795 
3796 	idev = __in6_dev_get(arg->dev);
3797 	if (!idev)
3798 		return 0;
3799 
3800 	/* For an administrative MTU increase there is no way to discover
3801 	   an IPv6 PMTU increase, so the PMTU has to be updated here.
3802 	   RFC 1981 doesn't cover administrative MTU increases (e.g.
3803 	   enabling jumbo frames), so updating the PMTU on increase
3804 	   is a MUST.
3805 	 */
3806 	/*
3807 	   If the new MTU is less than the route PMTU, the new MTU will be
3808 	   the lowest MTU in the path; update the route PMTU to reflect the
3809 	   decrease.  If the new MTU is greater than the route PMTU and the
3810 	   old MTU was the lowest MTU in the path, update the route PMTU to
3811 	   reflect the increase; if another node still has the lowest MTU,
3812 	   its Packet Too Big messages will drive PMTU discovery again.
3813 	 */
3814 	if (rt->dst.dev == arg->dev &&
3815 	    dst_metric_raw(&rt->dst, RTAX_MTU) &&
3816 	    !dst_metric_locked(&rt->dst, RTAX_MTU)) {
3817 		spin_lock_bh(&rt6_exception_lock);
3818 		if (dst_mtu(&rt->dst) >= arg->mtu ||
3819 		    (dst_mtu(&rt->dst) < arg->mtu &&
3820 		     dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
3821 			dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
3822 		}
3823 		rt6_exceptions_update_pmtu(rt, arg->mtu);
3824 		spin_unlock_bh(&rt6_exception_lock);
3825 	}
3826 	return 0;
3827 }
3828 
3829 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
3830 {
3831 	struct rt6_mtu_change_arg arg = {
3832 		.dev = dev,
3833 		.mtu = mtu,
3834 	};
3835 
3836 	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
3837 }
3838 
3839 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
3840 	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
3841 	[RTA_OIF]               = { .type = NLA_U32 },
3842 	[RTA_IIF]		= { .type = NLA_U32 },
3843 	[RTA_PRIORITY]          = { .type = NLA_U32 },
3844 	[RTA_METRICS]           = { .type = NLA_NESTED },
3845 	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
3846 	[RTA_PREF]              = { .type = NLA_U8 },
3847 	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
3848 	[RTA_ENCAP]		= { .type = NLA_NESTED },
3849 	[RTA_EXPIRES]		= { .type = NLA_U32 },
3850 	[RTA_UID]		= { .type = NLA_U32 },
3851 	[RTA_MARK]		= { .type = NLA_U32 },
3852 };
3853 
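/* Translate an RTM_NEWROUTE/RTM_DELROUTE message into a fib6_config.
 * For example, "ip -6 route add 2001:db8::/64 via fe80::1 dev eth0"
 * arrives as an rtmsg with rtm_dst_len = 64 plus RTA_DST, RTA_GATEWAY
 * and RTA_OIF attributes.
 */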
3854 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
3855 			      struct fib6_config *cfg,
3856 			      struct netlink_ext_ack *extack)
3857 {
3858 	struct rtmsg *rtm;
3859 	struct nlattr *tb[RTA_MAX+1];
3860 	unsigned int pref;
3861 	int err;
3862 
3863 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
3864 			  NULL);
3865 	if (err < 0)
3866 		goto errout;
3867 
3868 	err = -EINVAL;
3869 	rtm = nlmsg_data(nlh);
3870 	memset(cfg, 0, sizeof(*cfg));
3871 
3872 	cfg->fc_table = rtm->rtm_table;
3873 	cfg->fc_dst_len = rtm->rtm_dst_len;
3874 	cfg->fc_src_len = rtm->rtm_src_len;
3875 	cfg->fc_flags = RTF_UP;
3876 	cfg->fc_protocol = rtm->rtm_protocol;
3877 	cfg->fc_type = rtm->rtm_type;
3878 
3879 	if (rtm->rtm_type == RTN_UNREACHABLE ||
3880 	    rtm->rtm_type == RTN_BLACKHOLE ||
3881 	    rtm->rtm_type == RTN_PROHIBIT ||
3882 	    rtm->rtm_type == RTN_THROW)
3883 		cfg->fc_flags |= RTF_REJECT;
3884 
3885 	if (rtm->rtm_type == RTN_LOCAL)
3886 		cfg->fc_flags |= RTF_LOCAL;
3887 
3888 	if (rtm->rtm_flags & RTM_F_CLONED)
3889 		cfg->fc_flags |= RTF_CACHE;
3890 
3891 	cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
3892 
3893 	cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
3894 	cfg->fc_nlinfo.nlh = nlh;
3895 	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
3896 
3897 	if (tb[RTA_GATEWAY]) {
3898 		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
3899 		cfg->fc_flags |= RTF_GATEWAY;
3900 	}
3901 
3902 	if (tb[RTA_DST]) {
3903 		int plen = (rtm->rtm_dst_len + 7) >> 3;
3904 
3905 		if (nla_len(tb[RTA_DST]) < plen)
3906 			goto errout;
3907 
3908 		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
3909 	}
3910 
3911 	if (tb[RTA_SRC]) {
3912 		int plen = (rtm->rtm_src_len + 7) >> 3;
3913 
3914 		if (nla_len(tb[RTA_SRC]) < plen)
3915 			goto errout;
3916 
3917 		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
3918 	}
3919 
3920 	if (tb[RTA_PREFSRC])
3921 		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
3922 
3923 	if (tb[RTA_OIF])
3924 		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
3925 
3926 	if (tb[RTA_PRIORITY])
3927 		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
3928 
3929 	if (tb[RTA_METRICS]) {
3930 		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
3931 		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
3932 	}
3933 
3934 	if (tb[RTA_TABLE])
3935 		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
3936 
3937 	if (tb[RTA_MULTIPATH]) {
3938 		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
3939 		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
3940 
3941 		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
3942 						     cfg->fc_mp_len, extack);
3943 		if (err < 0)
3944 			goto errout;
3945 	}
3946 
3947 	if (tb[RTA_PREF]) {
3948 		pref = nla_get_u8(tb[RTA_PREF]);
3949 		if (pref != ICMPV6_ROUTER_PREF_LOW &&
3950 		    pref != ICMPV6_ROUTER_PREF_HIGH)
3951 			pref = ICMPV6_ROUTER_PREF_MEDIUM;
3952 		cfg->fc_flags |= RTF_PREF(pref);
3953 	}
3954 
3955 	if (tb[RTA_ENCAP])
3956 		cfg->fc_encap = tb[RTA_ENCAP];
3957 
3958 	if (tb[RTA_ENCAP_TYPE]) {
3959 		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
3960 
3961 		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
3962 		if (err < 0)
3963 			goto errout;
3964 	}
3965 
3966 	if (tb[RTA_EXPIRES]) {
3967 		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
3968 
3969 		if (addrconf_finite_timeout(timeout)) {
3970 			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
3971 			cfg->fc_flags |= RTF_EXPIRES;
3972 		}
3973 	}
3974 
3975 	err = 0;
3976 errout:
3977 	return err;
3978 }
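/* Editor's sketch (hypothetical, for illustration only): roughly the
 * fib6_config that the parser above would produce for
 *   ip -6 route add 2001:db8::/64 via fe80::1 dev eth0 metric 1024
 * Field values are inferred from the code above, not taken verbatim
 * from the source.
 */
#if 0	/* example only, not built */
static void example_parsed_cfg(struct fib6_config *cfg, int eth0_ifindex)
{
	memset(cfg, 0, sizeof(*cfg));
	cfg->fc_table = RT_TABLE_MAIN;		/* rtm_table / RTA_TABLE */
	cfg->fc_dst_len = 64;			/* rtm_dst_len */
	cfg->fc_flags = RTF_UP | RTF_GATEWAY;	/* RTA_GATEWAY present */
	cfg->fc_protocol = RTPROT_BOOT;		/* rtm_protocol set by ip(8) */
	cfg->fc_ifindex = eth0_ifindex;		/* RTA_OIF */
	cfg->fc_metric = 1024;			/* RTA_PRIORITY */
	/* fc_dst (2001:db8::) and fc_gateway (fe80::1) come from
	 * nla_memcpy()/nla_get_in6_addr() on RTA_DST/RTA_GATEWAY. */
}
#endif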
3979 
3980 struct rt6_nh {
3981 	struct rt6_info *rt6_info;
3982 	struct fib6_config r_cfg;
3983 	struct mx6_config mxc;
3984 	struct list_head next;
3985 };
3986 
3987 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
3988 {
3989 	struct rt6_nh *nh;
3990 
3991 	list_for_each_entry(nh, rt6_nh_list, next) {
3992 		pr_warn("multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
3993 			&nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
3994 			nh->r_cfg.fc_ifindex);
3995 	}
3996 }
3997 
3998 static int ip6_route_info_append(struct list_head *rt6_nh_list,
3999 				 struct rt6_info *rt, struct fib6_config *r_cfg)
4000 {
4001 	struct rt6_nh *nh;
4002 	int err = -EEXIST;
4003 
4004 	list_for_each_entry(nh, rt6_nh_list, next) {
4005 		/* check if rt6_info already exists */
4006 		if (rt6_duplicate_nexthop(nh->rt6_info, rt))
4007 			return err;
4008 	}
4009 
4010 	nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4011 	if (!nh)
4012 		return -ENOMEM;
4013 	nh->rt6_info = rt;
4014 	err = ip6_convert_metrics(&nh->mxc, r_cfg);
4015 	if (err) {
4016 		kfree(nh);
4017 		return err;
4018 	}
4019 	memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4020 	list_add_tail(&nh->next, rt6_nh_list);
4021 
4022 	return 0;
4023 }
4024 
4025 static void ip6_route_mpath_notify(struct rt6_info *rt,
4026 				   struct rt6_info *rt_last,
4027 				   struct nl_info *info,
4028 				   __u16 nlflags)
4029 {
4030 	/* If this is an APPEND route, rt points to the first route
4031 	 * inserted and rt_last to the last route inserted. Userspace
4032 	 * wants a consistent dump of the route which starts at the first
4033 	 * nexthop. Since sibling routes are always added at the end of
4034 	 * the list, find the first sibling of the last route appended.
4035 	 */
4036 	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
4037 		rt = list_first_entry(&rt_last->rt6i_siblings,
4038 				      struct rt6_info,
4039 				      rt6i_siblings);
4040 	}
4041 
4042 	if (rt)
4043 		inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4044 }
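/* Editor's example (hypothetical commands): after
 *   ip -6 route add    2001:db8::/64 via fe80::1 dev eth0
 *   ip -6 route append 2001:db8::/64 via fe80::2 dev eth0
 * the append inserts a sibling at the tail of rt6i_siblings, so the
 * notification is rooted at the first sibling and userspace sees the
 * whole multipath route in one RTM_NEWROUTE message.
 */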
4045 
4046 static int ip6_route_multipath_add(struct fib6_config *cfg,
4047 				   struct netlink_ext_ack *extack)
4048 {
4049 	struct rt6_info *rt_notif = NULL, *rt_last = NULL;
4050 	struct nl_info *info = &cfg->fc_nlinfo;
4051 	struct fib6_config r_cfg;
4052 	struct rtnexthop *rtnh;
4053 	struct rt6_info *rt;
4054 	struct rt6_nh *err_nh;
4055 	struct rt6_nh *nh, *nh_safe;
4056 	__u16 nlflags;
4057 	int remaining;
4058 	int attrlen;
4059 	int err = 1;
4060 	int nhn = 0;
4061 	int replace = (cfg->fc_nlinfo.nlh &&
4062 		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4063 	LIST_HEAD(rt6_nh_list);
4064 
4065 	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4066 	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4067 		nlflags |= NLM_F_APPEND;
4068 
4069 	remaining = cfg->fc_mp_len;
4070 	rtnh = (struct rtnexthop *)cfg->fc_mp;
4071 
4072 	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
4073 	 * rt6_info structs per nexthop
4074 	 */
4075 	while (rtnh_ok(rtnh, remaining)) {
4076 		memcpy(&r_cfg, cfg, sizeof(*cfg));
4077 		if (rtnh->rtnh_ifindex)
4078 			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4079 
4080 		attrlen = rtnh_attrlen(rtnh);
4081 		if (attrlen > 0) {
4082 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4083 
4084 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4085 			if (nla) {
4086 				r_cfg.fc_gateway = nla_get_in6_addr(nla);
4087 				r_cfg.fc_flags |= RTF_GATEWAY;
4088 			}
4089 			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4090 			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4091 			if (nla)
4092 				r_cfg.fc_encap_type = nla_get_u16(nla);
4093 		}
4094 
4095 		rt = ip6_route_info_create(&r_cfg, extack);
4096 		if (IS_ERR(rt)) {
4097 			err = PTR_ERR(rt);
4098 			rt = NULL;
4099 			goto cleanup;
4100 		}
4101 
4102 		rt->rt6i_nh_weight = rtnh->rtnh_hops + 1;
4103 
4104 		err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
4105 		if (err) {
4106 			dst_release_immediate(&rt->dst);
4107 			goto cleanup;
4108 		}
4109 
4110 		rtnh = rtnh_next(rtnh, &remaining);
4111 	}
4112 
4113 	/* For add and replace, send one notification with all nexthops:
4114 	 * skip the per-route notification in fib6_add_rt2node() and send
4115 	 * a single one with the full route when done.
4116 	 */
4117 	info->skip_notify = 1;
4118 
4119 	err_nh = NULL;
4120 	list_for_each_entry(nh, &rt6_nh_list, next) {
4121 		rt_last = nh->rt6_info;
4122 		err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack);
4123 		/* save reference to first route for notification */
4124 		if (!rt_notif && !err)
4125 			rt_notif = nh->rt6_info;
4126 
4127 		/* nh->rt6_info is used or freed at this point, reset to NULL */
4128 		nh->rt6_info = NULL;
4129 		if (err) {
4130 			if (replace && nhn)
4131 				ip6_print_replace_route_err(&rt6_nh_list);
4132 			err_nh = nh;
4133 			goto add_errout;
4134 		}
4135 
4136 		/* Because each route is added as if it were a single route,
4137 		 * drop these flags after the first nexthop: on a collision
4138 		 * the first nexthop has already failed to insert, since
4139 		 * fib6_add_rt2node() rejected it; on a replace, the old
4140 		 * nexthops have been replaced by the first new one, and the
4141 		 * remaining nexthops should be appended to it.
4142 		 */
4143 		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4144 						     NLM_F_REPLACE);
4145 		nhn++;
4146 	}
4147 
4148 	/* success ... tell user about new route */
4149 	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4150 	goto cleanup;
4151 
4152 add_errout:
4153 	/* send notification for routes that were added so that
4154 	 * the delete notifications sent by ip6_route_del are
4155 	 * coherent
4156 	 */
4157 	if (rt_notif)
4158 		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4159 
4160 	/* Delete routes that were already added */
4161 	list_for_each_entry(nh, &rt6_nh_list, next) {
4162 		if (err_nh == nh)
4163 			break;
4164 		ip6_route_del(&nh->r_cfg, extack);
4165 	}
4166 
4167 cleanup:
4168 	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4169 		if (nh->rt6_info)
4170 			dst_release_immediate(&nh->rt6_info->dst);
4171 		kfree(nh->mxc.mx);
4172 		list_del(&nh->next);
4173 		kfree(nh);
4174 	}
4175 
4176 	return err;
4177 }
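/* Editor's sketch of the RTA_MULTIPATH payload walked above (a
 * hypothetical two-nexthop request; layout per struct rtnexthop):
 *
 *   struct rtnexthop { rtnh_len, rtnh_flags, rtnh_hops, rtnh_ifindex }
 *     RTA_GATEWAY (struct in6_addr, 16 bytes)     <- nexthop 1
 *   struct rtnexthop { ... }
 *     RTA_GATEWAY (struct in6_addr, 16 bytes)     <- nexthop 2
 *
 * e.g.  ip -6 route add 2001:db8::/64 \
 *           nexthop via fe80::1 dev eth0 weight 1 \
 *           nexthop via fe80::2 dev eth1 weight 2
 * rtnh_hops carries weight - 1 on the wire, hence the "+ 1" when
 * rt6i_nh_weight is set above.
 */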
4178 
4179 static int ip6_route_multipath_del(struct fib6_config *cfg,
4180 				   struct netlink_ext_ack *extack)
4181 {
4182 	struct fib6_config r_cfg;
4183 	struct rtnexthop *rtnh;
4184 	int remaining;
4185 	int attrlen;
4186 	int err = 1, last_err = 0;
4187 
4188 	remaining = cfg->fc_mp_len;
4189 	rtnh = (struct rtnexthop *)cfg->fc_mp;
4190 
4191 	/* Parse a Multipath Entry */
4192 	while (rtnh_ok(rtnh, remaining)) {
4193 		memcpy(&r_cfg, cfg, sizeof(*cfg));
4194 		if (rtnh->rtnh_ifindex)
4195 			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4196 
4197 		attrlen = rtnh_attrlen(rtnh);
4198 		if (attrlen > 0) {
4199 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4200 
4201 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4202 			if (nla) {
4203 				nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4204 				r_cfg.fc_flags |= RTF_GATEWAY;
4205 			}
4206 		}
4207 		err = ip6_route_del(&r_cfg, extack);
4208 		if (err)
4209 			last_err = err;
4210 
4211 		rtnh = rtnh_next(rtnh, &remaining);
4212 	}
4213 
4214 	return last_err;
4215 }
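/* Editor's note: each nexthop is deleted as an individual route; the
 * loop continues past failures and returns the last nonzero error, so
 * a partial delete is possible (hypothetical example: of two nexthops,
 * one already removed elsewhere -> the other is still deleted here and
 * the lookup error for the missing one is returned).
 */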
4216 
4217 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4218 			      struct netlink_ext_ack *extack)
4219 {
4220 	struct fib6_config cfg;
4221 	int err;
4222 
4223 	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4224 	if (err < 0)
4225 		return err;
4226 
4227 	if (cfg.fc_mp)
4228 		return ip6_route_multipath_del(&cfg, extack);
4229 	else {
4230 		cfg.fc_delete_all_nh = 1;
4231 		return ip6_route_del(&cfg, extack);
4232 	}
4233 }
4234 
4235 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4236 			      struct netlink_ext_ack *extack)
4237 {
4238 	struct fib6_config cfg;
4239 	int err;
4240 
4241 	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4242 	if (err < 0)
4243 		return err;
4244 
4245 	if (cfg.fc_mp)
4246 		return ip6_route_multipath_add(&cfg, extack);
4247 	else
4248 		return ip6_route_add(&cfg, extack);
4249 }
4250 
4251 static size_t rt6_nlmsg_size(struct rt6_info *rt)
4252 {
4253 	int nexthop_len = 0;
4254 
4255 	if (rt->rt6i_nsiblings) {
4256 		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
4257 			    + NLA_ALIGN(sizeof(struct rtnexthop))
4258 			    + nla_total_size(16) /* RTA_GATEWAY */
4259 			    + lwtunnel_get_encap_size(rt->dst.lwtstate);
4260 
4261 		nexthop_len *= rt->rt6i_nsiblings;
4262 	}
4263 
4264 	return NLMSG_ALIGN(sizeof(struct rtmsg))
4265 	       + nla_total_size(16) /* RTA_SRC */
4266 	       + nla_total_size(16) /* RTA_DST */
4267 	       + nla_total_size(16) /* RTA_GATEWAY */
4268 	       + nla_total_size(16) /* RTA_PREFSRC */
4269 	       + nla_total_size(4) /* RTA_TABLE */
4270 	       + nla_total_size(4) /* RTA_IIF */
4271 	       + nla_total_size(4) /* RTA_OIF */
4272 	       + nla_total_size(4) /* RTA_PRIORITY */
4273 	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4274 	       + nla_total_size(sizeof(struct rta_cacheinfo))
4275 	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4276 	       + nla_total_size(1) /* RTA_PREF */
4277 	       + lwtunnel_get_encap_size(rt->dst.lwtstate)
4278 	       + nexthop_len;
4279 }
4280 
4281 static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
4282 			    unsigned int *flags, bool skip_oif)
4283 {
4284 	if (rt->rt6i_nh_flags & RTNH_F_DEAD)
4285 		*flags |= RTNH_F_DEAD;
4286 
4287 	if (rt->rt6i_nh_flags & RTNH_F_LINKDOWN) {
4288 		*flags |= RTNH_F_LINKDOWN;
4289 		if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
4290 			*flags |= RTNH_F_DEAD;
4291 	}
4292 
4293 	if (rt->rt6i_flags & RTF_GATEWAY) {
4294 		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
4295 			goto nla_put_failure;
4296 	}
4297 
4298 	*flags |= (rt->rt6i_nh_flags & RTNH_F_ONLINK);
4299 	if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD)
4300 		*flags |= RTNH_F_OFFLOAD;
4301 
4302 	/* not needed for multipath encoding because it has a rtnexthop struct */
4303 	if (!skip_oif && rt->dst.dev &&
4304 	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
4305 		goto nla_put_failure;
4306 
4307 	if (rt->dst.lwtstate &&
4308 	    lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
4309 		goto nla_put_failure;
4310 
4311 	return 0;
4312 
4313 nla_put_failure:
4314 	return -EMSGSIZE;
4315 }
4316 
4317 /* add multipath next hop */
4318 static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
4319 {
4320 	struct rtnexthop *rtnh;
4321 	unsigned int flags = 0;
4322 
4323 	rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4324 	if (!rtnh)
4325 		goto nla_put_failure;
4326 
4327 	rtnh->rtnh_hops = rt->rt6i_nh_weight - 1;
4328 	rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;
4329 
4330 	if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4331 		goto nla_put_failure;
4332 
4333 	rtnh->rtnh_flags = flags;
4334 
4335 	/* length of rtnetlink header + attributes */
4336 	rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
4337 
4338 	return 0;
4339 
4340 nla_put_failure:
4341 	return -EMSGSIZE;
4342 }
4343 
4344 static int rt6_fill_node(struct net *net,
4345 			 struct sk_buff *skb, struct rt6_info *rt,
4346 			 struct in6_addr *dst, struct in6_addr *src,
4347 			 int iif, int type, u32 portid, u32 seq,
4348 			 unsigned int flags)
4349 {
4350 	u32 metrics[RTAX_MAX];
4351 	struct rtmsg *rtm;
4352 	struct nlmsghdr *nlh;
4353 	long expires;
4354 	u32 table;
4355 
4356 	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4357 	if (!nlh)
4358 		return -EMSGSIZE;
4359 
4360 	rtm = nlmsg_data(nlh);
4361 	rtm->rtm_family = AF_INET6;
4362 	rtm->rtm_dst_len = rt->rt6i_dst.plen;
4363 	rtm->rtm_src_len = rt->rt6i_src.plen;
4364 	rtm->rtm_tos = 0;
4365 	if (rt->rt6i_table)
4366 		table = rt->rt6i_table->tb6_id;
4367 	else
4368 		table = RT6_TABLE_UNSPEC;
4369 	rtm->rtm_table = table;
4370 	if (nla_put_u32(skb, RTA_TABLE, table))
4371 		goto nla_put_failure;
4372 	if (rt->rt6i_flags & RTF_REJECT) {
4373 		switch (rt->dst.error) {
4374 		case -EINVAL:
4375 			rtm->rtm_type = RTN_BLACKHOLE;
4376 			break;
4377 		case -EACCES:
4378 			rtm->rtm_type = RTN_PROHIBIT;
4379 			break;
4380 		case -EAGAIN:
4381 			rtm->rtm_type = RTN_THROW;
4382 			break;
4383 		default:
4384 			rtm->rtm_type = RTN_UNREACHABLE;
4385 			break;
4386 		}
4387 	}
4388 	else if (rt->rt6i_flags & RTF_LOCAL)
4389 		rtm->rtm_type = RTN_LOCAL;
4390 	else if (rt->rt6i_flags & RTF_ANYCAST)
4391 		rtm->rtm_type = RTN_ANYCAST;
4392 	else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
4393 		rtm->rtm_type = RTN_LOCAL;
4394 	else
4395 		rtm->rtm_type = RTN_UNICAST;
4396 	rtm->rtm_flags = 0;
4397 	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4398 	rtm->rtm_protocol = rt->rt6i_protocol;
4399 
4400 	if (rt->rt6i_flags & RTF_CACHE)
4401 		rtm->rtm_flags |= RTM_F_CLONED;
4402 
4403 	if (dst) {
4404 		if (nla_put_in6_addr(skb, RTA_DST, dst))
4405 			goto nla_put_failure;
4406 		rtm->rtm_dst_len = 128;
4407 	} else if (rtm->rtm_dst_len)
4408 		if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
4409 			goto nla_put_failure;
4410 #ifdef CONFIG_IPV6_SUBTREES
4411 	if (src) {
4412 		if (nla_put_in6_addr(skb, RTA_SRC, src))
4413 			goto nla_put_failure;
4414 		rtm->rtm_src_len = 128;
4415 	} else if (rtm->rtm_src_len &&
4416 		   nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
4417 		goto nla_put_failure;
4418 #endif
4419 	if (iif) {
4420 #ifdef CONFIG_IPV6_MROUTE
4421 		if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
4422 			int err = ip6mr_get_route(net, skb, rtm, portid);
4423 
4424 			if (err == 0)
4425 				return 0;
4426 			if (err < 0)
4427 				goto nla_put_failure;
4428 		} else
4429 #endif
4430 			if (nla_put_u32(skb, RTA_IIF, iif))
4431 				goto nla_put_failure;
4432 	} else if (dst) {
4433 		struct in6_addr saddr_buf;
4434 		if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
4435 		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4436 			goto nla_put_failure;
4437 	}
4438 
4439 	if (rt->rt6i_prefsrc.plen) {
4440 		struct in6_addr saddr_buf;
4441 		saddr_buf = rt->rt6i_prefsrc.addr;
4442 		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4443 			goto nla_put_failure;
4444 	}
4445 
4446 	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
4447 	if (rt->rt6i_pmtu)
4448 		metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
4449 	if (rtnetlink_put_metrics(skb, metrics) < 0)
4450 		goto nla_put_failure;
4451 
4452 	if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
4453 		goto nla_put_failure;
4454 
4455 	/* For multipath routes, walk the siblings list and add
4456 	 * each as a nexthop within RTA_MULTIPATH.
4457 	 */
4458 	if (rt->rt6i_nsiblings) {
4459 		struct rt6_info *sibling, *next_sibling;
4460 		struct nlattr *mp;
4461 
4462 		mp = nla_nest_start(skb, RTA_MULTIPATH);
4463 		if (!mp)
4464 			goto nla_put_failure;
4465 
4466 		if (rt6_add_nexthop(skb, rt) < 0)
4467 			goto nla_put_failure;
4468 
4469 		list_for_each_entry_safe(sibling, next_sibling,
4470 					 &rt->rt6i_siblings, rt6i_siblings) {
4471 			if (rt6_add_nexthop(skb, sibling) < 0)
4472 				goto nla_put_failure;
4473 		}
4474 
4475 		nla_nest_end(skb, mp);
4476 	} else {
4477 		if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
4478 			goto nla_put_failure;
4479 	}
4480 
4481 	expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
4482 
4483 	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
4484 		goto nla_put_failure;
4485 
4486 	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
4487 		goto nla_put_failure;
4488 
4489 
4490 	nlmsg_end(skb, nlh);
4491 	return 0;
4492 
4493 nla_put_failure:
4494 	nlmsg_cancel(skb, nlh);
4495 	return -EMSGSIZE;
4496 }
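/* Editor's rendering note (hedged): the attributes emitted above are
 * what iproute2 decodes; for a hypothetical route the mapping is
 * roughly:
 *   RTA_DST + rtm_dst_len -> "2001:db8::/64"
 *   RTA_GATEWAY           -> "via fe80::1"
 *   RTA_OIF               -> "dev eth0"
 *   RTA_PRIORITY          -> "metric 1024"
 *   RTA_CACHEINFO         -> "expires Nsec" when RTF_EXPIRES is set
 *   RTA_MULTIPATH         -> one "nexthop ..." line per rtnexthop
 */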
4497 
4498 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
4499 {
4500 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4501 	struct net *net = arg->net;
4502 
4503 	if (rt == net->ipv6.ip6_null_entry)
4504 		return 0;
4505 
4506 	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4507 		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4508 
4509 		/* user wants prefix routes only */
4510 		if (rtm->rtm_flags & RTM_F_PREFIX &&
4511 		    !(rt->rt6i_flags & RTF_PREFIX_RT)) {
4512 			/* success since this is not a prefix route */
4513 			return 1;
4514 		}
4515 	}
4516 
4517 	return rt6_fill_node(net,
4518 		     arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
4519 		     NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
4520 		     NLM_F_MULTI);
4521 }
4522 
4523 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4524 			      struct netlink_ext_ack *extack)
4525 {
4526 	struct net *net = sock_net(in_skb->sk);
4527 	struct nlattr *tb[RTA_MAX+1];
4528 	int err, iif = 0, oif = 0;
4529 	struct dst_entry *dst;
4530 	struct rt6_info *rt;
4531 	struct sk_buff *skb;
4532 	struct rtmsg *rtm;
4533 	struct flowi6 fl6;
4534 	bool fibmatch;
4535 
4536 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4537 			  extack);
4538 	if (err < 0)
4539 		goto errout;
4540 
4541 	err = -EINVAL;
4542 	memset(&fl6, 0, sizeof(fl6));
4543 	rtm = nlmsg_data(nlh);
4544 	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4545 	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4546 
4547 	if (tb[RTA_SRC]) {
4548 		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4549 			goto errout;
4550 
4551 		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4552 	}
4553 
4554 	if (tb[RTA_DST]) {
4555 		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4556 			goto errout;
4557 
4558 		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4559 	}
4560 
4561 	if (tb[RTA_IIF])
4562 		iif = nla_get_u32(tb[RTA_IIF]);
4563 
4564 	if (tb[RTA_OIF])
4565 		oif = nla_get_u32(tb[RTA_OIF]);
4566 
4567 	if (tb[RTA_MARK])
4568 		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4569 
4570 	if (tb[RTA_UID])
4571 		fl6.flowi6_uid = make_kuid(current_user_ns(),
4572 					   nla_get_u32(tb[RTA_UID]));
4573 	else
4574 		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4575 
4576 	if (iif) {
4577 		struct net_device *dev;
4578 		int flags = 0;
4579 
4580 		rcu_read_lock();
4581 
4582 		dev = dev_get_by_index_rcu(net, iif);
4583 		if (!dev) {
4584 			rcu_read_unlock();
4585 			err = -ENODEV;
4586 			goto errout;
4587 		}
4588 
4589 		fl6.flowi6_iif = iif;
4590 
4591 		if (!ipv6_addr_any(&fl6.saddr))
4592 			flags |= RT6_LOOKUP_F_HAS_SADDR;
4593 
4594 		dst = ip6_route_input_lookup(net, dev, &fl6, flags);
4595 
4596 		rcu_read_unlock();
4597 	} else {
4598 		fl6.flowi6_oif = oif;
4599 
4600 		dst = ip6_route_output(net, NULL, &fl6);
4601 	}
4602 
4603 
4604 	rt = container_of(dst, struct rt6_info, dst);
4605 	if (rt->dst.error) {
4606 		err = rt->dst.error;
4607 		ip6_rt_put(rt);
4608 		goto errout;
4609 	}
4610 
4611 	if (rt == net->ipv6.ip6_null_entry) {
4612 		err = rt->dst.error;
4613 		ip6_rt_put(rt);
4614 		goto errout;
4615 	}
4616 
4617 	if (fibmatch && rt->from) {
4618 		struct rt6_info *ort = rt->from;
4619 
4620 		dst_hold(&ort->dst);
4621 		ip6_rt_put(rt);
4622 		rt = ort;
4623 	}
4624 
4625 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
4626 	if (!skb) {
4627 		ip6_rt_put(rt);
4628 		err = -ENOBUFS;
4629 		goto errout;
4630 	}
4631 
4632 	skb_dst_set(skb, &rt->dst);
4633 	if (fibmatch)
4634 		err = rt6_fill_node(net, skb, rt, NULL, NULL, iif,
4635 				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4636 				    nlh->nlmsg_seq, 0);
4637 	else
4638 		err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
4639 				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4640 				    nlh->nlmsg_seq, 0);
4641 	if (err < 0) {
4642 		kfree_skb(skb);
4643 		goto errout;
4644 	}
4645 
4646 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
4647 errout:
4648 	return err;
4649 }
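/* Editor's usage example (hypothetical address):
 *   ip -6 route get 2001:db8::1
 * performs the dst lookup above and reports the resulting entry,
 * which may be a cached clone;
 *   ip -6 route get fibmatch 2001:db8::1
 * sets RTM_F_FIB_MATCH, so rt->from - the FIB entry the clone was
 * derived from - is reported instead.
 */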
4650 
4651 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
4652 		     unsigned int nlm_flags)
4653 {
4654 	struct sk_buff *skb;
4655 	struct net *net = info->nl_net;
4656 	u32 seq;
4657 	int err;
4658 
4659 	err = -ENOBUFS;
4660 	seq = info->nlh ? info->nlh->nlmsg_seq : 0;
4661 
4662 	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4663 	if (!skb)
4664 		goto errout;
4665 
4666 	err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
4667 				event, info->portid, seq, nlm_flags);
4668 	if (err < 0) {
4669 		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4670 		WARN_ON(err == -EMSGSIZE);
4671 		kfree_skb(skb);
4672 		goto errout;
4673 	}
4674 	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4675 		    info->nlh, gfp_any());
4676 	return;
4677 errout:
4678 	if (err < 0)
4679 		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
4680 }
4681 
4682 static int ip6_route_dev_notify(struct notifier_block *this,
4683 				unsigned long event, void *ptr)
4684 {
4685 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4686 	struct net *net = dev_net(dev);
4687 
4688 	if (!(dev->flags & IFF_LOOPBACK))
4689 		return NOTIFY_OK;
4690 
4691 	if (event == NETDEV_REGISTER) {
4692 		net->ipv6.ip6_null_entry->dst.dev = dev;
4693 		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
4694 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4695 		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
4696 		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
4697 		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
4698 		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
4699 #endif
4700 	} else if (event == NETDEV_UNREGISTER &&
4701 		    dev->reg_state != NETREG_UNREGISTERED) {
4702 		/* NETDEV_UNREGISTER can be fired multiple times by
4703 		 * netdev_wait_allrefs(). Make sure we only do this once.
4704 		 */
4705 		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
4706 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4707 		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
4708 		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
4709 #endif
4710 	}
4711 
4712 	return NOTIFY_OK;
4713 }
4714 
4715 /*
4716  *	/proc
4717  */
4718 
4719 #ifdef CONFIG_PROC_FS
4720 
4721 static const struct file_operations ipv6_route_proc_fops = {
4722 	.open		= ipv6_route_open,
4723 	.read		= seq_read,
4724 	.llseek		= seq_lseek,
4725 	.release	= seq_release_net,
4726 };
4727 
4728 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
4729 {
4730 	struct net *net = (struct net *)seq->private;
4731 	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
4732 		   net->ipv6.rt6_stats->fib_nodes,
4733 		   net->ipv6.rt6_stats->fib_route_nodes,
4734 		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
4735 		   net->ipv6.rt6_stats->fib_rt_entries,
4736 		   net->ipv6.rt6_stats->fib_rt_cache,
4737 		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
4738 		   net->ipv6.rt6_stats->fib_discarded_routes);
4739 
4740 	return 0;
4741 }
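/* Editor's note: /proc/net/rt6_stats prints the seven counters above
 * as hex fields in this order: fib_nodes, fib_route_nodes,
 * fib_rt_alloc, fib_rt_entries, fib_rt_cache, dst entries (slow
 * count), fib_discarded_routes. A hypothetical line:
 *   0034 012c 0000 00f1 0000 0003 0000
 */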
4742 
4743 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
4744 {
4745 	return single_open_net(inode, file, rt6_stats_seq_show);
4746 }
4747 
4748 static const struct file_operations rt6_stats_seq_fops = {
4749 	.open	 = rt6_stats_seq_open,
4750 	.read	 = seq_read,
4751 	.llseek	 = seq_lseek,
4752 	.release = single_release_net,
4753 };
4754 #endif	/* CONFIG_PROC_FS */
4755 
4756 #ifdef CONFIG_SYSCTL
4757 
4758 static
4759 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
4760 			      void __user *buffer, size_t *lenp, loff_t *ppos)
4761 {
4762 	struct net *net;
4763 	int delay;
4764 	if (!write)
4765 		return -EINVAL;
4766 
4767 	net = (struct net *)ctl->extra1;
4768 	delay = net->ipv6.sysctl.flush_delay;
4769 	proc_dointvec(ctl, write, buffer, lenp, ppos);
4770 	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
4771 	return 0;
4772 }
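/* Editor's usage note (hedged): writing to the flush sysctl, e.g.
 *   echo 1 > /proc/sys/net/ipv6/route/flush
 * runs fib6_run_gc(). Note that "delay" is read before proc_dointvec()
 * stores the new value, so the gc uses the flush_delay that was
 * current before this write; the written value only takes effect for
 * the next flush.
 */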
4773 
4774 struct ctl_table ipv6_route_table_template[] = {
4775 	{
4776 		.procname	=	"flush",
4777 		.data		=	&init_net.ipv6.sysctl.flush_delay,
4778 		.maxlen		=	sizeof(int),
4779 		.mode		=	0200,
4780 		.proc_handler	=	ipv6_sysctl_rtcache_flush
4781 	},
4782 	{
4783 		.procname	=	"gc_thresh",
4784 		.data		=	&ip6_dst_ops_template.gc_thresh,
4785 		.maxlen		=	sizeof(int),
4786 		.mode		=	0644,
4787 		.proc_handler	=	proc_dointvec,
4788 	},
4789 	{
4790 		.procname	=	"max_size",
4791 		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
4792 		.maxlen		=	sizeof(int),
4793 		.mode		=	0644,
4794 		.proc_handler	=	proc_dointvec,
4795 	},
4796 	{
4797 		.procname	=	"gc_min_interval",
4798 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
4799 		.maxlen		=	sizeof(int),
4800 		.mode		=	0644,
4801 		.proc_handler	=	proc_dointvec_jiffies,
4802 	},
4803 	{
4804 		.procname	=	"gc_timeout",
4805 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
4806 		.maxlen		=	sizeof(int),
4807 		.mode		=	0644,
4808 		.proc_handler	=	proc_dointvec_jiffies,
4809 	},
4810 	{
4811 		.procname	=	"gc_interval",
4812 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
4813 		.maxlen		=	sizeof(int),
4814 		.mode		=	0644,
4815 		.proc_handler	=	proc_dointvec_jiffies,
4816 	},
4817 	{
4818 		.procname	=	"gc_elasticity",
4819 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
4820 		.maxlen		=	sizeof(int),
4821 		.mode		=	0644,
4822 		.proc_handler	=	proc_dointvec,
4823 	},
4824 	{
4825 		.procname	=	"mtu_expires",
4826 		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
4827 		.maxlen		=	sizeof(int),
4828 		.mode		=	0644,
4829 		.proc_handler	=	proc_dointvec_jiffies,
4830 	},
4831 	{
4832 		.procname	=	"min_adv_mss",
4833 		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
4834 		.maxlen		=	sizeof(int),
4835 		.mode		=	0644,
4836 		.proc_handler	=	proc_dointvec,
4837 	},
4838 	{
4839 		.procname	=	"gc_min_interval_ms",
4840 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
4841 		.maxlen		=	sizeof(int),
4842 		.mode		=	0644,
4843 		.proc_handler	=	proc_dointvec_ms_jiffies,
4844 	},
4845 	{ }
4846 };
4847 
4848 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
4849 {
4850 	struct ctl_table *table;
4851 
4852 	table = kmemdup(ipv6_route_table_template,
4853 			sizeof(ipv6_route_table_template),
4854 			GFP_KERNEL);
4855 
4856 	if (table) {
4857 		table[0].data = &net->ipv6.sysctl.flush_delay;
4858 		table[0].extra1 = net;
4859 		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
4860 		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
4861 		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
4862 		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
4863 		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
4864 		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
4865 		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
4866 		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
4867 		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
4868 
4869 		/* Don't export sysctls to unprivileged users */
4870 		if (net->user_ns != &init_user_ns)
4871 			table[0].procname = NULL;
4872 	}
4873 
4874 	return table;
4875 }
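/* Editor's note: the table[N].data fixups above depend on the entry
 * order of ipv6_route_table_template; reordering or inserting
 * template entries requires updating these indices to match.
 */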
4876 #endif
4877 
4878 static int __net_init ip6_route_net_init(struct net *net)
4879 {
4880 	int ret = -ENOMEM;
4881 
4882 	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
4883 	       sizeof(net->ipv6.ip6_dst_ops));
4884 
4885 	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
4886 		goto out_ip6_dst_ops;
4887 
4888 	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
4889 					   sizeof(*net->ipv6.ip6_null_entry),
4890 					   GFP_KERNEL);
4891 	if (!net->ipv6.ip6_null_entry)
4892 		goto out_ip6_dst_entries;
4893 	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4894 	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
4895 			 ip6_template_metrics, true);
4896 
4897 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4898 	net->ipv6.fib6_has_custom_rules = false;
4899 	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
4900 					       sizeof(*net->ipv6.ip6_prohibit_entry),
4901 					       GFP_KERNEL);
4902 	if (!net->ipv6.ip6_prohibit_entry)
4903 		goto out_ip6_null_entry;
4904 	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4905 	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
4906 			 ip6_template_metrics, true);
4907 
4908 	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
4909 					       sizeof(*net->ipv6.ip6_blk_hole_entry),
4910 					       GFP_KERNEL);
4911 	if (!net->ipv6.ip6_blk_hole_entry)
4912 		goto out_ip6_prohibit_entry;
4913 	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4914 	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
4915 			 ip6_template_metrics, true);
4916 #endif
4917 
4918 	net->ipv6.sysctl.flush_delay = 0;
4919 	net->ipv6.sysctl.ip6_rt_max_size = 4096;
4920 	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
4921 	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
4922 	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
4923 	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
4924 	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
4925 	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
4926 
4927 	net->ipv6.ip6_rt_gc_expire = 30*HZ;
4928 
4929 	ret = 0;
4930 out:
4931 	return ret;
4932 
4933 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4934 out_ip6_prohibit_entry:
4935 	kfree(net->ipv6.ip6_prohibit_entry);
4936 out_ip6_null_entry:
4937 	kfree(net->ipv6.ip6_null_entry);
4938 #endif
4939 out_ip6_dst_entries:
4940 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
4941 out_ip6_dst_ops:
4942 	goto out;
4943 }
4944 
4945 static void __net_exit ip6_route_net_exit(struct net *net)
4946 {
4947 	kfree(net->ipv6.ip6_null_entry);
4948 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4949 	kfree(net->ipv6.ip6_prohibit_entry);
4950 	kfree(net->ipv6.ip6_blk_hole_entry);
4951 #endif
4952 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
4953 }
4954 
4955 static int __net_init ip6_route_net_init_late(struct net *net)
4956 {
4957 #ifdef CONFIG_PROC_FS
4958 	proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
4959 	proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
4960 #endif
4961 	return 0;
4962 }
4963 
4964 static void __net_exit ip6_route_net_exit_late(struct net *net)
4965 {
4966 #ifdef CONFIG_PROC_FS
4967 	remove_proc_entry("ipv6_route", net->proc_net);
4968 	remove_proc_entry("rt6_stats", net->proc_net);
4969 #endif
4970 }
4971 
4972 static struct pernet_operations ip6_route_net_ops = {
4973 	.init = ip6_route_net_init,
4974 	.exit = ip6_route_net_exit,
4975 };
4976 
4977 static int __net_init ipv6_inetpeer_init(struct net *net)
4978 {
4979 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
4980 
4981 	if (!bp)
4982 		return -ENOMEM;
4983 	inet_peer_base_init(bp);
4984 	net->ipv6.peers = bp;
4985 	return 0;
4986 }
4987 
4988 static void __net_exit ipv6_inetpeer_exit(struct net *net)
4989 {
4990 	struct inet_peer_base *bp = net->ipv6.peers;
4991 
4992 	net->ipv6.peers = NULL;
4993 	inetpeer_invalidate_tree(bp);
4994 	kfree(bp);
4995 }
4996 
4997 static struct pernet_operations ipv6_inetpeer_ops = {
4998 	.init	=	ipv6_inetpeer_init,
4999 	.exit	=	ipv6_inetpeer_exit,
5000 };
5001 
5002 static struct pernet_operations ip6_route_net_late_ops = {
5003 	.init = ip6_route_net_init_late,
5004 	.exit = ip6_route_net_exit_late,
5005 };
5006 
5007 static struct notifier_block ip6_route_dev_notifier = {
5008 	.notifier_call = ip6_route_dev_notify,
5009 	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
5010 };
5011 
5012 void __init ip6_route_init_special_entries(void)
5013 {
5014 	/* The loopback device is registered before this code runs, so the
5015 	 * loopback references in the rt6_info templates are not taken
5016 	 * there; take them manually for init_net. */
5017 	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
5018 	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5019 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5020 	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
5021 	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5022 	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
5023 	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5024 #endif
5025 }
5026 
5027 int __init ip6_route_init(void)
5028 {
5029 	int ret;
5030 	int cpu;
5031 
5032 	ret = -ENOMEM;
5033 	ip6_dst_ops_template.kmem_cachep =
5034 		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
5035 				  SLAB_HWCACHE_ALIGN, NULL);
5036 	if (!ip6_dst_ops_template.kmem_cachep)
5037 		goto out;
5038 
5039 	ret = dst_entries_init(&ip6_dst_blackhole_ops);
5040 	if (ret)
5041 		goto out_kmem_cache;
5042 
5043 	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
5044 	if (ret)
5045 		goto out_dst_entries;
5046 
5047 	ret = register_pernet_subsys(&ip6_route_net_ops);
5048 	if (ret)
5049 		goto out_register_inetpeer;
5050 
5051 	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
5052 
5053 	ret = fib6_init();
5054 	if (ret)
5055 		goto out_register_subsys;
5056 
5057 	ret = xfrm6_init();
5058 	if (ret)
5059 		goto out_fib6_init;
5060 
5061 	ret = fib6_rules_init();
5062 	if (ret)
5063 		goto xfrm6_init;
5064 
5065 	ret = register_pernet_subsys(&ip6_route_net_late_ops);
5066 	if (ret)
5067 		goto fib6_rules_init;
5068 
5069 	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
5070 				   inet6_rtm_newroute, NULL, 0);
5071 	if (ret < 0)
5072 		goto out_register_late_subsys;
5073 
5074 	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
5075 				   inet6_rtm_delroute, NULL, 0);
5076 	if (ret < 0)
5077 		goto out_register_late_subsys;
5078 
5079 	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
5080 				   inet6_rtm_getroute, NULL,
5081 				   RTNL_FLAG_DOIT_UNLOCKED);
5082 	if (ret < 0)
5083 		goto out_register_late_subsys;
5084 
5085 	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
5086 	if (ret)
5087 		goto out_register_late_subsys;
5088 
5089 	for_each_possible_cpu(cpu) {
5090 		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
5091 
5092 		INIT_LIST_HEAD(&ul->head);
5093 		spin_lock_init(&ul->lock);
5094 	}
5095 
5096 out:
5097 	return ret;
5098 
5099 out_register_late_subsys:
5100 	rtnl_unregister_all(PF_INET6);
5101 	unregister_pernet_subsys(&ip6_route_net_late_ops);
5102 fib6_rules_init:
5103 	fib6_rules_cleanup();
5104 xfrm6_init:
5105 	xfrm6_fini();
5106 out_fib6_init:
5107 	fib6_gc_cleanup();
5108 out_register_subsys:
5109 	unregister_pernet_subsys(&ip6_route_net_ops);
5110 out_register_inetpeer:
5111 	unregister_pernet_subsys(&ipv6_inetpeer_ops);
5112 out_dst_entries:
5113 	dst_entries_destroy(&ip6_dst_blackhole_ops);
5114 out_kmem_cache:
5115 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5116 	goto out;
5117 }
5118 
5119 void ip6_route_cleanup(void)
5120 {
5121 	unregister_netdevice_notifier(&ip6_route_dev_notifier);
5122 	unregister_pernet_subsys(&ip6_route_net_late_ops);
5123 	fib6_rules_cleanup();
5124 	xfrm6_fini();
5125 	fib6_gc_cleanup();
5126 	unregister_pernet_subsys(&ipv6_inetpeer_ops);
5127 	unregister_pernet_subsys(&ip6_route_net_ops);
5128 	dst_entries_destroy(&ip6_dst_blackhole_ops);
5129 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5130 }
5131