xref: /openbmc/linux/net/ipv6/route.c (revision 31afeb425f7fad8bcf9561aeb0b8405479f97a98)
1 /*
2  *	Linux INET6 implementation
3  *	FIB front-end.
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13 
14 /*	Changes:
15  *
16  *	YOSHIFUJI Hideaki @USAGI
17  *		reworked default router selection.
18  *		- respect outgoing interface
19  *		- select from (probably) reachable routers (i.e.
20  *		routers in REACHABLE, STALE, DELAY or PROBE states).
21  *		- always select the same router if it is (probably)
22  *		reachable.  otherwise, round-robin the list.
23  *	Ville Nuorvala
24  *		Fixed routing subtrees.
25  */
26 
27 #define pr_fmt(fmt) "IPv6: " fmt
28 
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <trace/events/fib6.h>
67 
68 #include <linux/uaccess.h>
69 
70 #ifdef CONFIG_SYSCTL
71 #include <linux/sysctl.h>
72 #endif
73 
/*
 * Result codes produced when a route's next hop is scored.
 * Every failure code is negative, so callers in this file test "< 0"
 * to detect any kind of failure.
 */
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,		/* find_match() skips the route */
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,	/* find_match() scores it 0 and requests round-robin */
	RT6_NUD_SUCCEED = 1
};
80 
/*
 * Forward declarations of file-local functions that are referenced
 * before their definitions (several are wired into the dst_ops tables
 * and route templates further down).
 */
static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
static unsigned int	 ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void		ip6_dst_destroy(struct dst_entry *);
static void		ip6_dst_ifdown(struct dst_entry *,
				       struct net_device *dev, int how);
static int		 ip6_dst_gc(struct dst_ops *ops);

static int		ip6_pkt_discard(struct sk_buff *skb);
static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int		ip6_pkt_prohibit(struct sk_buff *skb);
static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void		ip6_link_failure(struct sk_buff *skb);
static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static void		rt6_dst_from_metrics_check(struct rt6_info *rt);
static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
static size_t rt6_nlmsg_size(struct rt6_info *rt);
static int rt6_fill_node(struct net *net,
			 struct sk_buff *skb, struct rt6_info *rt,
			 struct in6_addr *dst, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags);
static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr);

/* Route Information option helpers; only built with CONFIG_IPV6_ROUTE_INFO. */
#ifdef CONFIG_IPV6_ROUTE_INFO
static struct rt6_info *rt6_add_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev,
					   unsigned int pref);
static struct rt6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev);
#endif
123 
/* A list of rt6_info entries linked through their rt6i_uncached member. */
struct uncached_list {
	spinlock_t		lock;	/* taken with spin_lock_bh() around list updates and walks */
	struct list_head	head;
};

/* One uncached list per CPU; each entry records the list it was added to. */
static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
130 
131 static void rt6_uncached_list_add(struct rt6_info *rt)
132 {
133 	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
134 
135 	rt->rt6i_uncached_list = ul;
136 
137 	spin_lock_bh(&ul->lock);
138 	list_add_tail(&rt->rt6i_uncached, &ul->head);
139 	spin_unlock_bh(&ul->lock);
140 }
141 
142 static void rt6_uncached_list_del(struct rt6_info *rt)
143 {
144 	if (!list_empty(&rt->rt6i_uncached)) {
145 		struct uncached_list *ul = rt->rt6i_uncached_list;
146 		struct net *net = dev_net(rt->dst.dev);
147 
148 		spin_lock_bh(&ul->lock);
149 		list_del(&rt->rt6i_uncached);
150 		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
151 		spin_unlock_bh(&ul->lock);
152 	}
153 }
154 
155 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
156 {
157 	struct net_device *loopback_dev = net->loopback_dev;
158 	int cpu;
159 
160 	if (dev == loopback_dev)
161 		return;
162 
163 	for_each_possible_cpu(cpu) {
164 		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
165 		struct rt6_info *rt;
166 
167 		spin_lock_bh(&ul->lock);
168 		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
169 			struct inet6_dev *rt_idev = rt->rt6i_idev;
170 			struct net_device *rt_dev = rt->dst.dev;
171 
172 			if (rt_idev->dev == dev) {
173 				rt->rt6i_idev = in6_dev_get(loopback_dev);
174 				in6_dev_put(rt_idev);
175 			}
176 
177 			if (rt_dev == dev) {
178 				rt->dst.dev = loopback_dev;
179 				dev_hold(rt->dst.dev);
180 				dev_put(rt_dev);
181 			}
182 		}
183 		spin_unlock_bh(&ul->lock);
184 	}
185 }
186 
187 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
188 {
189 	return dst_metrics_write_ptr(&rt->from->dst);
190 }
191 
192 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
193 {
194 	struct rt6_info *rt = (struct rt6_info *)dst;
195 
196 	if (rt->rt6i_flags & RTF_PCPU)
197 		return rt6_pcpu_cow_metrics(rt);
198 	else if (rt->rt6i_flags & RTF_CACHE)
199 		return NULL;
200 	else
201 		return dst_cow_metrics_generic(dst, old);
202 }
203 
204 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
205 					     struct sk_buff *skb,
206 					     const void *daddr)
207 {
208 	struct in6_addr *p = &rt->rt6i_gateway;
209 
210 	if (!ipv6_addr_any(p))
211 		return (const void *) p;
212 	else if (skb)
213 		return &ipv6_hdr(skb)->daddr;
214 	return daddr;
215 }
216 
217 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
218 					  struct sk_buff *skb,
219 					  const void *daddr)
220 {
221 	struct rt6_info *rt = (struct rt6_info *) dst;
222 	struct neighbour *n;
223 
224 	daddr = choose_neigh_daddr(rt, skb, daddr);
225 	n = __ipv6_neigh_lookup(dst->dev, daddr);
226 	if (n)
227 		return n;
228 	return neigh_create(&nd_tbl, daddr, dst->dev);
229 }
230 
231 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
232 {
233 	struct net_device *dev = dst->dev;
234 	struct rt6_info *rt = (struct rt6_info *)dst;
235 
236 	daddr = choose_neigh_daddr(rt, NULL, daddr);
237 	if (!daddr)
238 		return;
239 	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
240 		return;
241 	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
242 		return;
243 	__ipv6_confirm_neigh(dev, daddr);
244 }
245 
/*
 * dst_ops template for regular IPv6 routes; the hooks point at the
 * handlers declared/defined in this file.
 * NOTE(review): presumably copied into each namespace's
 * net->ipv6.ip6_dst_ops during init - confirm against the init code.
 */
static struct dst_ops ip6_dst_ops_template = {
	.family			=	AF_INET6,
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
	.default_advmss		=	ip6_default_advmss,
	.mtu			=	ip6_mtu,
	.cow_metrics		=	ipv6_cow_metrics,
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
	.redirect		=	rt6_do_redirect,
	.local_out		=	__ip6_local_out,
	.neigh_lookup		=	ip6_neigh_lookup,
	.confirm_neigh		=	ip6_confirm_neigh,
};
264 
265 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
266 {
267 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
268 
269 	return mtu ? : dst->dev->mtu;
270 }
271 
/* dst_ops->update_pmtu hook for blackhole routes: intentionally a no-op. */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}
276 
/* dst_ops->redirect hook for blackhole routes: intentionally a no-op. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}
281 
/*
 * dst_ops for blackhole dst entries: PMTU updates and redirects are
 * ignored (no-op hooks above) and metrics use the generic COW helper.
 */
static struct dst_ops ip6_dst_blackhole_ops = {
	.family			=	AF_INET6,
	.destroy		=	ip6_dst_destroy,
	.check			=	ip6_dst_check,
	.mtu			=	ip6_blackhole_mtu,
	.default_advmss		=	ip6_default_advmss,
	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
	.redirect		=	ip6_rt_blackhole_redirect,
	.cow_metrics		=	dst_cow_metrics_generic,
	.neigh_lookup		=	ip6_neigh_lookup,
};
293 
/* Read-only metrics array with every entry zero (hop limit spelled out). */
static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};
297 
/*
 * Template for the "null" route: a reject route with no next hop whose
 * dst error is -ENETUNREACH and whose input/output handlers are
 * ip6_pkt_discard()/ip6_pkt_discard_out().
 * NOTE(review): presumably duplicated into net->ipv6.ip6_null_entry per
 * namespace - confirm against the init code.
 */
static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol  = RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,	/* largest possible metric value */
	.rt6i_ref	= ATOMIC_INIT(1),
};
312 
313 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
314 
/*
 * Template for the "prohibit" route (CONFIG_IPV6_MULTIPLE_TABLES only):
 * a reject route whose dst error is -EACCES and whose input/output
 * handlers are ip6_pkt_prohibit()/ip6_pkt_prohibit_out().
 */
static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol  = RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,	/* largest possible metric value */
	.rt6i_ref	= ATOMIC_INIT(1),
};
329 
/*
 * Template for the "blackhole" route (CONFIG_IPV6_MULTIPLE_TABLES only):
 * a reject route whose dst error is -EINVAL and whose input/output
 * handlers are the generic dst_discard()/dst_discard_out().
 */
static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol  = RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,	/* largest possible metric value */
	.rt6i_ref	= ATOMIC_INIT(1),
};
344 
345 #endif
346 
347 static void rt6_info_init(struct rt6_info *rt)
348 {
349 	struct dst_entry *dst = &rt->dst;
350 
351 	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
352 	INIT_LIST_HEAD(&rt->rt6i_siblings);
353 	INIT_LIST_HEAD(&rt->rt6i_uncached);
354 }
355 
356 /* allocate dst with ip6_dst_ops */
357 static struct rt6_info *__ip6_dst_alloc(struct net *net,
358 					struct net_device *dev,
359 					int flags)
360 {
361 	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
362 					1, DST_OBSOLETE_FORCE_CHK, flags);
363 
364 	if (rt) {
365 		rt6_info_init(rt);
366 		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
367 	}
368 
369 	return rt;
370 }
371 
372 struct rt6_info *ip6_dst_alloc(struct net *net,
373 			       struct net_device *dev,
374 			       int flags)
375 {
376 	struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
377 
378 	if (rt) {
379 		rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
380 		if (!rt->rt6i_pcpu) {
381 			dst_release_immediate(&rt->dst);
382 			return NULL;
383 		}
384 	}
385 
386 	return rt;
387 }
388 EXPORT_SYMBOL(ip6_dst_alloc);
389 
390 static void ip6_dst_destroy(struct dst_entry *dst)
391 {
392 	struct rt6_info *rt = (struct rt6_info *)dst;
393 	struct rt6_exception_bucket *bucket;
394 	struct rt6_info *from = rt->from;
395 	struct inet6_dev *idev;
396 
397 	dst_destroy_metrics_generic(dst);
398 	free_percpu(rt->rt6i_pcpu);
399 	rt6_uncached_list_del(rt);
400 
401 	idev = rt->rt6i_idev;
402 	if (idev) {
403 		rt->rt6i_idev = NULL;
404 		in6_dev_put(idev);
405 	}
406 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1);
407 	if (bucket) {
408 		rt->rt6i_exception_bucket = NULL;
409 		kfree(bucket);
410 	}
411 
412 	rt->from = NULL;
413 	dst_release(&from->dst);
414 }
415 
416 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
417 			   int how)
418 {
419 	struct rt6_info *rt = (struct rt6_info *)dst;
420 	struct inet6_dev *idev = rt->rt6i_idev;
421 	struct net_device *loopback_dev =
422 		dev_net(dev)->loopback_dev;
423 
424 	if (idev && idev->dev != loopback_dev) {
425 		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
426 		if (loopback_idev) {
427 			rt->rt6i_idev = loopback_idev;
428 			in6_dev_put(idev);
429 		}
430 	}
431 }
432 
433 static bool __rt6_check_expired(const struct rt6_info *rt)
434 {
435 	if (rt->rt6i_flags & RTF_EXPIRES)
436 		return time_after(jiffies, rt->dst.expires);
437 	else
438 		return false;
439 }
440 
441 static bool rt6_check_expired(const struct rt6_info *rt)
442 {
443 	if (rt->rt6i_flags & RTF_EXPIRES) {
444 		if (time_after(jiffies, rt->dst.expires))
445 			return true;
446 	} else if (rt->from) {
447 		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
448 			rt6_check_expired(rt->from);
449 	}
450 	return false;
451 }
452 
453 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
454 					     struct flowi6 *fl6, int oif,
455 					     int strict)
456 {
457 	struct rt6_info *sibling, *next_sibling;
458 
459 	/* We might have already computed the hash for ICMPv6 errors. In such
460 	 * case it will always be non-zero. Otherwise now is the time to do it.
461 	 */
462 	if (!fl6->mp_hash)
463 		fl6->mp_hash = rt6_multipath_hash(fl6, NULL);
464 
465 	if (fl6->mp_hash <= atomic_read(&match->rt6i_nh_upper_bound))
466 		return match;
467 
468 	list_for_each_entry_safe(sibling, next_sibling, &match->rt6i_siblings,
469 				 rt6i_siblings) {
470 		if (fl6->mp_hash > atomic_read(&sibling->rt6i_nh_upper_bound))
471 			continue;
472 		if (rt6_score_route(sibling, oif, strict) < 0)
473 			break;
474 		match = sibling;
475 		break;
476 	}
477 
478 	return match;
479 }
480 
481 /*
482  *	Route lookup. rcu_read_lock() should be held.
483  */
484 
485 static inline struct rt6_info *rt6_device_match(struct net *net,
486 						    struct rt6_info *rt,
487 						    const struct in6_addr *saddr,
488 						    int oif,
489 						    int flags)
490 {
491 	struct rt6_info *local = NULL;
492 	struct rt6_info *sprt;
493 
494 	if (!oif && ipv6_addr_any(saddr) && !(rt->rt6i_nh_flags & RTNH_F_DEAD))
495 		return rt;
496 
497 	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->rt6_next)) {
498 		struct net_device *dev = sprt->dst.dev;
499 
500 		if (sprt->rt6i_nh_flags & RTNH_F_DEAD)
501 			continue;
502 
503 		if (oif) {
504 			if (dev->ifindex == oif)
505 				return sprt;
506 			if (dev->flags & IFF_LOOPBACK) {
507 				if (!sprt->rt6i_idev ||
508 				    sprt->rt6i_idev->dev->ifindex != oif) {
509 					if (flags & RT6_LOOKUP_F_IFACE)
510 						continue;
511 					if (local &&
512 					    local->rt6i_idev->dev->ifindex == oif)
513 						continue;
514 				}
515 				local = sprt;
516 			}
517 		} else {
518 			if (ipv6_chk_addr(net, saddr, dev,
519 					  flags & RT6_LOOKUP_F_IFACE))
520 				return sprt;
521 		}
522 	}
523 
524 	if (oif) {
525 		if (local)
526 			return local;
527 
528 		if (flags & RT6_LOOKUP_F_IFACE)
529 			return net->ipv6.ip6_null_entry;
530 	}
531 
532 	return rt->rt6i_nh_flags & RTNH_F_DEAD ? net->ipv6.ip6_null_entry : rt;
533 }
534 
535 #ifdef CONFIG_IPV6_ROUTER_PREF
/* Deferred router probe request, queued by rt6_probe() and consumed
 * (and freed) by rt6_probe_deferred().
 */
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;		/* gateway address to solicit */
	struct net_device *dev;		/* held with dev_hold() until the work runs */
};
541 
542 static void rt6_probe_deferred(struct work_struct *w)
543 {
544 	struct in6_addr mcaddr;
545 	struct __rt6_probe_work *work =
546 		container_of(w, struct __rt6_probe_work, work);
547 
548 	addrconf_addr_solict_mult(&work->target, &mcaddr);
549 	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
550 	dev_put(work->dev);
551 	kfree(work);
552 }
553 
554 static void rt6_probe(struct rt6_info *rt)
555 {
556 	struct __rt6_probe_work *work;
557 	struct neighbour *neigh;
558 	/*
559 	 * Okay, this does not seem to be appropriate
560 	 * for now, however, we need to check if it
561 	 * is really so; aka Router Reachability Probing.
562 	 *
563 	 * Router Reachability Probe MUST be rate-limited
564 	 * to no more than one per minute.
565 	 */
566 	if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
567 		return;
568 	rcu_read_lock_bh();
569 	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
570 	if (neigh) {
571 		if (neigh->nud_state & NUD_VALID)
572 			goto out;
573 
574 		work = NULL;
575 		write_lock(&neigh->lock);
576 		if (!(neigh->nud_state & NUD_VALID) &&
577 		    time_after(jiffies,
578 			       neigh->updated +
579 			       rt->rt6i_idev->cnf.rtr_probe_interval)) {
580 			work = kmalloc(sizeof(*work), GFP_ATOMIC);
581 			if (work)
582 				__neigh_set_probe_once(neigh);
583 		}
584 		write_unlock(&neigh->lock);
585 	} else {
586 		work = kmalloc(sizeof(*work), GFP_ATOMIC);
587 	}
588 
589 	if (work) {
590 		INIT_WORK(&work->work, rt6_probe_deferred);
591 		work->target = rt->rt6i_gateway;
592 		dev_hold(rt->dst.dev);
593 		work->dev = rt->dst.dev;
594 		schedule_work(&work->work);
595 	}
596 
597 out:
598 	rcu_read_unlock_bh();
599 }
600 #else
/* Without CONFIG_IPV6_ROUTER_PREF router probing is compiled out. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
604 #endif
605 
606 /*
607  * Default Router Selection (RFC 2461 6.3.6)
608  */
609 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
610 {
611 	struct net_device *dev = rt->dst.dev;
612 	if (!oif || dev->ifindex == oif)
613 		return 2;
614 	if ((dev->flags & IFF_LOOPBACK) &&
615 	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
616 		return 1;
617 	return 0;
618 }
619 
620 static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
621 {
622 	struct neighbour *neigh;
623 	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
624 
625 	if (rt->rt6i_flags & RTF_NONEXTHOP ||
626 	    !(rt->rt6i_flags & RTF_GATEWAY))
627 		return RT6_NUD_SUCCEED;
628 
629 	rcu_read_lock_bh();
630 	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
631 	if (neigh) {
632 		read_lock(&neigh->lock);
633 		if (neigh->nud_state & NUD_VALID)
634 			ret = RT6_NUD_SUCCEED;
635 #ifdef CONFIG_IPV6_ROUTER_PREF
636 		else if (!(neigh->nud_state & NUD_FAILED))
637 			ret = RT6_NUD_SUCCEED;
638 		else
639 			ret = RT6_NUD_FAIL_PROBE;
640 #endif
641 		read_unlock(&neigh->lock);
642 	} else {
643 		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
644 		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
645 	}
646 	rcu_read_unlock_bh();
647 
648 	return ret;
649 }
650 
651 static int rt6_score_route(struct rt6_info *rt, int oif,
652 			   int strict)
653 {
654 	int m;
655 
656 	m = rt6_check_dev(rt, oif);
657 	if (!m && (strict & RT6_LOOKUP_F_IFACE))
658 		return RT6_NUD_FAIL_HARD;
659 #ifdef CONFIG_IPV6_ROUTER_PREF
660 	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
661 #endif
662 	if (strict & RT6_LOOKUP_F_REACHABLE) {
663 		int n = rt6_check_neigh(rt);
664 		if (n < 0)
665 			return n;
666 	}
667 	return m;
668 }
669 
670 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
671 				   int *mpri, struct rt6_info *match,
672 				   bool *do_rr)
673 {
674 	int m;
675 	bool match_do_rr = false;
676 	struct inet6_dev *idev = rt->rt6i_idev;
677 
678 	if (rt->rt6i_nh_flags & RTNH_F_DEAD)
679 		goto out;
680 
681 	if (idev->cnf.ignore_routes_with_linkdown &&
682 	    rt->rt6i_nh_flags & RTNH_F_LINKDOWN &&
683 	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
684 		goto out;
685 
686 	if (rt6_check_expired(rt))
687 		goto out;
688 
689 	m = rt6_score_route(rt, oif, strict);
690 	if (m == RT6_NUD_FAIL_DO_RR) {
691 		match_do_rr = true;
692 		m = 0; /* lowest valid score */
693 	} else if (m == RT6_NUD_FAIL_HARD) {
694 		goto out;
695 	}
696 
697 	if (strict & RT6_LOOKUP_F_REACHABLE)
698 		rt6_probe(rt);
699 
700 	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
701 	if (m > *mpri) {
702 		*do_rr = match_do_rr;
703 		*mpri = m;
704 		match = rt;
705 	}
706 out:
707 	return match;
708 }
709 
710 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
711 				     struct rt6_info *leaf,
712 				     struct rt6_info *rr_head,
713 				     u32 metric, int oif, int strict,
714 				     bool *do_rr)
715 {
716 	struct rt6_info *rt, *match, *cont;
717 	int mpri = -1;
718 
719 	match = NULL;
720 	cont = NULL;
721 	for (rt = rr_head; rt; rt = rcu_dereference(rt->rt6_next)) {
722 		if (rt->rt6i_metric != metric) {
723 			cont = rt;
724 			break;
725 		}
726 
727 		match = find_match(rt, oif, strict, &mpri, match, do_rr);
728 	}
729 
730 	for (rt = leaf; rt && rt != rr_head;
731 	     rt = rcu_dereference(rt->rt6_next)) {
732 		if (rt->rt6i_metric != metric) {
733 			cont = rt;
734 			break;
735 		}
736 
737 		match = find_match(rt, oif, strict, &mpri, match, do_rr);
738 	}
739 
740 	if (match || !cont)
741 		return match;
742 
743 	for (rt = cont; rt; rt = rcu_dereference(rt->rt6_next))
744 		match = find_match(rt, oif, strict, &mpri, match, do_rr);
745 
746 	return match;
747 }
748 
749 static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn,
750 				   int oif, int strict)
751 {
752 	struct rt6_info *leaf = rcu_dereference(fn->leaf);
753 	struct rt6_info *match, *rt0;
754 	bool do_rr = false;
755 	int key_plen;
756 
757 	if (!leaf || leaf == net->ipv6.ip6_null_entry)
758 		return net->ipv6.ip6_null_entry;
759 
760 	rt0 = rcu_dereference(fn->rr_ptr);
761 	if (!rt0)
762 		rt0 = leaf;
763 
764 	/* Double check to make sure fn is not an intermediate node
765 	 * and fn->leaf does not points to its child's leaf
766 	 * (This might happen if all routes under fn are deleted from
767 	 * the tree and fib6_repair_tree() is called on the node.)
768 	 */
769 	key_plen = rt0->rt6i_dst.plen;
770 #ifdef CONFIG_IPV6_SUBTREES
771 	if (rt0->rt6i_src.plen)
772 		key_plen = rt0->rt6i_src.plen;
773 #endif
774 	if (fn->fn_bit != key_plen)
775 		return net->ipv6.ip6_null_entry;
776 
777 	match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict,
778 			     &do_rr);
779 
780 	if (do_rr) {
781 		struct rt6_info *next = rcu_dereference(rt0->rt6_next);
782 
783 		/* no entries matched; do round-robin */
784 		if (!next || next->rt6i_metric != rt0->rt6i_metric)
785 			next = leaf;
786 
787 		if (next != rt0) {
788 			spin_lock_bh(&leaf->rt6i_table->tb6_lock);
789 			/* make sure next is not being deleted from the tree */
790 			if (next->rt6i_node)
791 				rcu_assign_pointer(fn->rr_ptr, next);
792 			spin_unlock_bh(&leaf->rt6i_table->tb6_lock);
793 		}
794 	}
795 
796 	return match ? match : net->ipv6.ip6_null_entry;
797 }
798 
799 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
800 {
801 	return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
802 }
803 
804 #ifdef CONFIG_IPV6_ROUTE_INFO
805 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
806 		  const struct in6_addr *gwaddr)
807 {
808 	struct net *net = dev_net(dev);
809 	struct route_info *rinfo = (struct route_info *) opt;
810 	struct in6_addr prefix_buf, *prefix;
811 	unsigned int pref;
812 	unsigned long lifetime;
813 	struct rt6_info *rt;
814 
815 	if (len < sizeof(struct route_info)) {
816 		return -EINVAL;
817 	}
818 
819 	/* Sanity check for prefix_len and length */
820 	if (rinfo->length > 3) {
821 		return -EINVAL;
822 	} else if (rinfo->prefix_len > 128) {
823 		return -EINVAL;
824 	} else if (rinfo->prefix_len > 64) {
825 		if (rinfo->length < 2) {
826 			return -EINVAL;
827 		}
828 	} else if (rinfo->prefix_len > 0) {
829 		if (rinfo->length < 1) {
830 			return -EINVAL;
831 		}
832 	}
833 
834 	pref = rinfo->route_pref;
835 	if (pref == ICMPV6_ROUTER_PREF_INVALID)
836 		return -EINVAL;
837 
838 	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
839 
840 	if (rinfo->length == 3)
841 		prefix = (struct in6_addr *)rinfo->prefix;
842 	else {
843 		/* this function is safe */
844 		ipv6_addr_prefix(&prefix_buf,
845 				 (struct in6_addr *)rinfo->prefix,
846 				 rinfo->prefix_len);
847 		prefix = &prefix_buf;
848 	}
849 
850 	if (rinfo->prefix_len == 0)
851 		rt = rt6_get_dflt_router(gwaddr, dev);
852 	else
853 		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
854 					gwaddr, dev);
855 
856 	if (rt && !lifetime) {
857 		ip6_del_rt(rt);
858 		rt = NULL;
859 	}
860 
861 	if (!rt && lifetime)
862 		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
863 					dev, pref);
864 	else if (rt)
865 		rt->rt6i_flags = RTF_ROUTEINFO |
866 				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
867 
868 	if (rt) {
869 		if (!addrconf_finite_timeout(lifetime))
870 			rt6_clean_expires(rt);
871 		else
872 			rt6_set_expires(rt, jiffies + HZ * lifetime);
873 
874 		ip6_rt_put(rt);
875 	}
876 	return 0;
877 }
878 #endif
879 
880 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
881 					struct in6_addr *saddr)
882 {
883 	struct fib6_node *pn, *sn;
884 	while (1) {
885 		if (fn->fn_flags & RTN_TL_ROOT)
886 			return NULL;
887 		pn = rcu_dereference(fn->parent);
888 		sn = FIB6_SUBTREE(pn);
889 		if (sn && sn != fn)
890 			fn = fib6_lookup(sn, NULL, saddr);
891 		else
892 			fn = pn;
893 		if (fn->fn_flags & RTN_RTINFO)
894 			return fn;
895 	}
896 }
897 
898 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
899 			  bool null_fallback)
900 {
901 	struct rt6_info *rt = *prt;
902 
903 	if (dst_hold_safe(&rt->dst))
904 		return true;
905 	if (null_fallback) {
906 		rt = net->ipv6.ip6_null_entry;
907 		dst_hold(&rt->dst);
908 	} else {
909 		rt = NULL;
910 	}
911 	*prt = rt;
912 	return false;
913 }
914 
915 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
916 					     struct fib6_table *table,
917 					     struct flowi6 *fl6, int flags)
918 {
919 	struct rt6_info *rt, *rt_cache;
920 	struct fib6_node *fn;
921 
922 	rcu_read_lock();
923 	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
924 restart:
925 	rt = rcu_dereference(fn->leaf);
926 	if (!rt) {
927 		rt = net->ipv6.ip6_null_entry;
928 	} else {
929 		rt = rt6_device_match(net, rt, &fl6->saddr,
930 				      fl6->flowi6_oif, flags);
931 		if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
932 			rt = rt6_multipath_select(rt, fl6,
933 						  fl6->flowi6_oif, flags);
934 	}
935 	if (rt == net->ipv6.ip6_null_entry) {
936 		fn = fib6_backtrack(fn, &fl6->saddr);
937 		if (fn)
938 			goto restart;
939 	}
940 	/* Search through exception table */
941 	rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
942 	if (rt_cache)
943 		rt = rt_cache;
944 
945 	if (ip6_hold_safe(net, &rt, true))
946 		dst_use_noref(&rt->dst, jiffies);
947 
948 	rcu_read_unlock();
949 
950 	trace_fib6_table_lookup(net, rt, table, fl6);
951 
952 	return rt;
953 
954 }
955 
956 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
957 				    int flags)
958 {
959 	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
960 }
961 EXPORT_SYMBOL_GPL(ip6_route_lookup);
962 
963 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
964 			    const struct in6_addr *saddr, int oif, int strict)
965 {
966 	struct flowi6 fl6 = {
967 		.flowi6_oif = oif,
968 		.daddr = *daddr,
969 	};
970 	struct dst_entry *dst;
971 	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
972 
973 	if (saddr) {
974 		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
975 		flags |= RT6_LOOKUP_F_HAS_SADDR;
976 	}
977 
978 	dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
979 	if (dst->error == 0)
980 		return (struct rt6_info *) dst;
981 
982 	dst_release(dst);
983 
984 	return NULL;
985 }
986 EXPORT_SYMBOL(rt6_lookup);
987 
988 /* ip6_ins_rt is called with FREE table->tb6_lock.
989  * It takes new route entry, the addition fails by any reason the
990  * route is released.
991  * Caller must hold dst before calling it.
992  */
993 
994 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
995 			struct mx6_config *mxc,
996 			struct netlink_ext_ack *extack)
997 {
998 	int err;
999 	struct fib6_table *table;
1000 
1001 	table = rt->rt6i_table;
1002 	spin_lock_bh(&table->tb6_lock);
1003 	err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
1004 	spin_unlock_bh(&table->tb6_lock);
1005 
1006 	return err;
1007 }
1008 
1009 int ip6_ins_rt(struct rt6_info *rt)
1010 {
1011 	struct nl_info info = {	.nl_net = dev_net(rt->dst.dev), };
1012 	struct mx6_config mxc = { .mx = NULL, };
1013 
1014 	/* Hold dst to account for the reference from the fib6 tree */
1015 	dst_hold(&rt->dst);
1016 	return __ip6_ins_rt(rt, &info, &mxc, NULL);
1017 }
1018 
1019 /* called with rcu_lock held */
1020 static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
1021 {
1022 	struct net_device *dev = rt->dst.dev;
1023 
1024 	if (rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) {
1025 		/* for copies of local routes, dst->dev needs to be the
1026 		 * device if it is a master device, the master device if
1027 		 * device is enslaved, and the loopback as the default
1028 		 */
1029 		if (netif_is_l3_slave(dev) &&
1030 		    !rt6_need_strict(&rt->rt6i_dst.addr))
1031 			dev = l3mdev_master_dev_rcu(dev);
1032 		else if (!netif_is_l3_master(dev))
1033 			dev = dev_net(dev)->loopback_dev;
1034 		/* last case is netif_is_l3_master(dev) is true in which
1035 		 * case we want dev returned to be dev
1036 		 */
1037 	}
1038 
1039 	return dev;
1040 }
1041 
1042 static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
1043 					   const struct in6_addr *daddr,
1044 					   const struct in6_addr *saddr)
1045 {
1046 	struct net_device *dev;
1047 	struct rt6_info *rt;
1048 
1049 	/*
1050 	 *	Clone the route.
1051 	 */
1052 
1053 	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
1054 		ort = ort->from;
1055 
1056 	rcu_read_lock();
1057 	dev = ip6_rt_get_dev_rcu(ort);
1058 	rt = __ip6_dst_alloc(dev_net(dev), dev, 0);
1059 	rcu_read_unlock();
1060 	if (!rt)
1061 		return NULL;
1062 
1063 	ip6_rt_copy_init(rt, ort);
1064 	rt->rt6i_flags |= RTF_CACHE;
1065 	rt->rt6i_metric = 0;
1066 	rt->dst.flags |= DST_HOST;
1067 	rt->rt6i_dst.addr = *daddr;
1068 	rt->rt6i_dst.plen = 128;
1069 
1070 	if (!rt6_is_gw_or_nonexthop(ort)) {
1071 		if (ort->rt6i_dst.plen != 128 &&
1072 		    ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
1073 			rt->rt6i_flags |= RTF_ANYCAST;
1074 #ifdef CONFIG_IPV6_SUBTREES
1075 		if (rt->rt6i_src.plen && saddr) {
1076 			rt->rt6i_src.addr = *saddr;
1077 			rt->rt6i_src.plen = 128;
1078 		}
1079 #endif
1080 	}
1081 
1082 	return rt;
1083 }
1084 
1085 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
1086 {
1087 	struct net_device *dev;
1088 	struct rt6_info *pcpu_rt;
1089 
1090 	rcu_read_lock();
1091 	dev = ip6_rt_get_dev_rcu(rt);
1092 	pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags);
1093 	rcu_read_unlock();
1094 	if (!pcpu_rt)
1095 		return NULL;
1096 	ip6_rt_copy_init(pcpu_rt, rt);
1097 	pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
1098 	pcpu_rt->rt6i_flags |= RTF_PCPU;
1099 	return pcpu_rt;
1100 }
1101 
1102 /* It should be called with rcu_read_lock() acquired */
1103 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
1104 {
1105 	struct rt6_info *pcpu_rt, **p;
1106 
1107 	p = this_cpu_ptr(rt->rt6i_pcpu);
1108 	pcpu_rt = *p;
1109 
1110 	if (pcpu_rt && ip6_hold_safe(NULL, &pcpu_rt, false))
1111 		rt6_dst_from_metrics_check(pcpu_rt);
1112 
1113 	return pcpu_rt;
1114 }
1115 
1116 static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
1117 {
1118 	struct rt6_info *pcpu_rt, *prev, **p;
1119 
1120 	pcpu_rt = ip6_rt_pcpu_alloc(rt);
1121 	if (!pcpu_rt) {
1122 		struct net *net = dev_net(rt->dst.dev);
1123 
1124 		dst_hold(&net->ipv6.ip6_null_entry->dst);
1125 		return net->ipv6.ip6_null_entry;
1126 	}
1127 
1128 	dst_hold(&pcpu_rt->dst);
1129 	p = this_cpu_ptr(rt->rt6i_pcpu);
1130 	prev = cmpxchg(p, NULL, pcpu_rt);
1131 	BUG_ON(prev);
1132 
1133 	rt6_dst_from_metrics_check(pcpu_rt);
1134 	return pcpu_rt;
1135 }
1136 
/* Exception hash table implementation.
 *
 * rt6_exception_lock is the single spinlock taken by every writer of the
 * per-route exception buckets in this file.
 */
static DEFINE_SPINLOCK(rt6_exception_lock);
1140 
1141 /* Remove rt6_ex from hash table and free the memory
1142  * Caller must hold rt6_exception_lock
1143  */
1144 static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1145 				 struct rt6_exception *rt6_ex)
1146 {
1147 	struct net *net;
1148 
1149 	if (!bucket || !rt6_ex)
1150 		return;
1151 
1152 	net = dev_net(rt6_ex->rt6i->dst.dev);
1153 	rt6_ex->rt6i->rt6i_node = NULL;
1154 	hlist_del_rcu(&rt6_ex->hlist);
1155 	rt6_release(rt6_ex->rt6i);
1156 	kfree_rcu(rt6_ex, rcu);
1157 	WARN_ON_ONCE(!bucket->depth);
1158 	bucket->depth--;
1159 	net->ipv6.rt6_stats->fib_rt_cache--;
1160 }
1161 
1162 /* Remove oldest rt6_ex in bucket and free the memory
1163  * Caller must hold rt6_exception_lock
1164  */
1165 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1166 {
1167 	struct rt6_exception *rt6_ex, *oldest = NULL;
1168 
1169 	if (!bucket)
1170 		return;
1171 
1172 	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1173 		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1174 			oldest = rt6_ex;
1175 	}
1176 	rt6_remove_exception(bucket, oldest);
1177 }
1178 
1179 static u32 rt6_exception_hash(const struct in6_addr *dst,
1180 			      const struct in6_addr *src)
1181 {
1182 	static u32 seed __read_mostly;
1183 	u32 val;
1184 
1185 	net_get_random_once(&seed, sizeof(seed));
1186 	val = jhash(dst, sizeof(*dst), seed);
1187 
1188 #ifdef CONFIG_IPV6_SUBTREES
1189 	if (src)
1190 		val = jhash(src, sizeof(*src), val);
1191 #endif
1192 	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1193 }
1194 
1195 /* Helper function to find the cached rt in the hash table
1196  * and update bucket pointer to point to the bucket for this
1197  * (daddr, saddr) pair
1198  * Caller must hold rt6_exception_lock
1199  */
1200 static struct rt6_exception *
1201 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1202 			      const struct in6_addr *daddr,
1203 			      const struct in6_addr *saddr)
1204 {
1205 	struct rt6_exception *rt6_ex;
1206 	u32 hval;
1207 
1208 	if (!(*bucket) || !daddr)
1209 		return NULL;
1210 
1211 	hval = rt6_exception_hash(daddr, saddr);
1212 	*bucket += hval;
1213 
1214 	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1215 		struct rt6_info *rt6 = rt6_ex->rt6i;
1216 		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1217 
1218 #ifdef CONFIG_IPV6_SUBTREES
1219 		if (matched && saddr)
1220 			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1221 #endif
1222 		if (matched)
1223 			return rt6_ex;
1224 	}
1225 	return NULL;
1226 }
1227 
1228 /* Helper function to find the cached rt in the hash table
1229  * and update bucket pointer to point to the bucket for this
1230  * (daddr, saddr) pair
1231  * Caller must hold rcu_read_lock()
1232  */
1233 static struct rt6_exception *
1234 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1235 			 const struct in6_addr *daddr,
1236 			 const struct in6_addr *saddr)
1237 {
1238 	struct rt6_exception *rt6_ex;
1239 	u32 hval;
1240 
1241 	WARN_ON_ONCE(!rcu_read_lock_held());
1242 
1243 	if (!(*bucket) || !daddr)
1244 		return NULL;
1245 
1246 	hval = rt6_exception_hash(daddr, saddr);
1247 	*bucket += hval;
1248 
1249 	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1250 		struct rt6_info *rt6 = rt6_ex->rt6i;
1251 		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1252 
1253 #ifdef CONFIG_IPV6_SUBTREES
1254 		if (matched && saddr)
1255 			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1256 #endif
1257 		if (matched)
1258 			return rt6_ex;
1259 	}
1260 	return NULL;
1261 }
1262 
1263 static int rt6_insert_exception(struct rt6_info *nrt,
1264 				struct rt6_info *ort)
1265 {
1266 	struct net *net = dev_net(ort->dst.dev);
1267 	struct rt6_exception_bucket *bucket;
1268 	struct in6_addr *src_key = NULL;
1269 	struct rt6_exception *rt6_ex;
1270 	int err = 0;
1271 
1272 	/* ort can't be a cache or pcpu route */
1273 	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
1274 		ort = ort->from;
1275 	WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU));
1276 
1277 	spin_lock_bh(&rt6_exception_lock);
1278 
1279 	if (ort->exception_bucket_flushed) {
1280 		err = -EINVAL;
1281 		goto out;
1282 	}
1283 
1284 	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
1285 					lockdep_is_held(&rt6_exception_lock));
1286 	if (!bucket) {
1287 		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
1288 				 GFP_ATOMIC);
1289 		if (!bucket) {
1290 			err = -ENOMEM;
1291 			goto out;
1292 		}
1293 		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
1294 	}
1295 
1296 #ifdef CONFIG_IPV6_SUBTREES
1297 	/* rt6i_src.plen != 0 indicates ort is in subtree
1298 	 * and exception table is indexed by a hash of
1299 	 * both rt6i_dst and rt6i_src.
1300 	 * Otherwise, the exception table is indexed by
1301 	 * a hash of only rt6i_dst.
1302 	 */
1303 	if (ort->rt6i_src.plen)
1304 		src_key = &nrt->rt6i_src.addr;
1305 #endif
1306 
1307 	/* Update rt6i_prefsrc as it could be changed
1308 	 * in rt6_remove_prefsrc()
1309 	 */
1310 	nrt->rt6i_prefsrc = ort->rt6i_prefsrc;
1311 	/* rt6_mtu_change() might lower mtu on ort.
1312 	 * Only insert this exception route if its mtu
1313 	 * is less than ort's mtu value.
1314 	 */
1315 	if (nrt->rt6i_pmtu >= dst_mtu(&ort->dst)) {
1316 		err = -EINVAL;
1317 		goto out;
1318 	}
1319 
1320 	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
1321 					       src_key);
1322 	if (rt6_ex)
1323 		rt6_remove_exception(bucket, rt6_ex);
1324 
1325 	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
1326 	if (!rt6_ex) {
1327 		err = -ENOMEM;
1328 		goto out;
1329 	}
1330 	rt6_ex->rt6i = nrt;
1331 	rt6_ex->stamp = jiffies;
1332 	atomic_inc(&nrt->rt6i_ref);
1333 	nrt->rt6i_node = ort->rt6i_node;
1334 	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
1335 	bucket->depth++;
1336 	net->ipv6.rt6_stats->fib_rt_cache++;
1337 
1338 	if (bucket->depth > FIB6_MAX_DEPTH)
1339 		rt6_exception_remove_oldest(bucket);
1340 
1341 out:
1342 	spin_unlock_bh(&rt6_exception_lock);
1343 
1344 	/* Update fn->fn_sernum to invalidate all cached dst */
1345 	if (!err) {
1346 		spin_lock_bh(&ort->rt6i_table->tb6_lock);
1347 		fib6_update_sernum(ort);
1348 		spin_unlock_bh(&ort->rt6i_table->tb6_lock);
1349 		fib6_force_start_gc(net);
1350 	}
1351 
1352 	return err;
1353 }
1354 
1355 void rt6_flush_exceptions(struct rt6_info *rt)
1356 {
1357 	struct rt6_exception_bucket *bucket;
1358 	struct rt6_exception *rt6_ex;
1359 	struct hlist_node *tmp;
1360 	int i;
1361 
1362 	spin_lock_bh(&rt6_exception_lock);
1363 	/* Prevent rt6_insert_exception() to recreate the bucket list */
1364 	rt->exception_bucket_flushed = 1;
1365 
1366 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1367 				    lockdep_is_held(&rt6_exception_lock));
1368 	if (!bucket)
1369 		goto out;
1370 
1371 	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1372 		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1373 			rt6_remove_exception(bucket, rt6_ex);
1374 		WARN_ON_ONCE(bucket->depth);
1375 		bucket++;
1376 	}
1377 
1378 out:
1379 	spin_unlock_bh(&rt6_exception_lock);
1380 }
1381 
1382 /* Find cached rt in the hash table inside passed in rt
1383  * Caller has to hold rcu_read_lock()
1384  */
1385 static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
1386 					   struct in6_addr *daddr,
1387 					   struct in6_addr *saddr)
1388 {
1389 	struct rt6_exception_bucket *bucket;
1390 	struct in6_addr *src_key = NULL;
1391 	struct rt6_exception *rt6_ex;
1392 	struct rt6_info *res = NULL;
1393 
1394 	bucket = rcu_dereference(rt->rt6i_exception_bucket);
1395 
1396 #ifdef CONFIG_IPV6_SUBTREES
1397 	/* rt6i_src.plen != 0 indicates rt is in subtree
1398 	 * and exception table is indexed by a hash of
1399 	 * both rt6i_dst and rt6i_src.
1400 	 * Otherwise, the exception table is indexed by
1401 	 * a hash of only rt6i_dst.
1402 	 */
1403 	if (rt->rt6i_src.plen)
1404 		src_key = saddr;
1405 #endif
1406 	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
1407 
1408 	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1409 		res = rt6_ex->rt6i;
1410 
1411 	return res;
1412 }
1413 
1414 /* Remove the passed in cached rt from the hash table that contains it */
1415 int rt6_remove_exception_rt(struct rt6_info *rt)
1416 {
1417 	struct rt6_exception_bucket *bucket;
1418 	struct rt6_info *from = rt->from;
1419 	struct in6_addr *src_key = NULL;
1420 	struct rt6_exception *rt6_ex;
1421 	int err;
1422 
1423 	if (!from ||
1424 	    !(rt->rt6i_flags & RTF_CACHE))
1425 		return -EINVAL;
1426 
1427 	if (!rcu_access_pointer(from->rt6i_exception_bucket))
1428 		return -ENOENT;
1429 
1430 	spin_lock_bh(&rt6_exception_lock);
1431 	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1432 				    lockdep_is_held(&rt6_exception_lock));
1433 #ifdef CONFIG_IPV6_SUBTREES
1434 	/* rt6i_src.plen != 0 indicates 'from' is in subtree
1435 	 * and exception table is indexed by a hash of
1436 	 * both rt6i_dst and rt6i_src.
1437 	 * Otherwise, the exception table is indexed by
1438 	 * a hash of only rt6i_dst.
1439 	 */
1440 	if (from->rt6i_src.plen)
1441 		src_key = &rt->rt6i_src.addr;
1442 #endif
1443 	rt6_ex = __rt6_find_exception_spinlock(&bucket,
1444 					       &rt->rt6i_dst.addr,
1445 					       src_key);
1446 	if (rt6_ex) {
1447 		rt6_remove_exception(bucket, rt6_ex);
1448 		err = 0;
1449 	} else {
1450 		err = -ENOENT;
1451 	}
1452 
1453 	spin_unlock_bh(&rt6_exception_lock);
1454 	return err;
1455 }
1456 
1457 /* Find rt6_ex which contains the passed in rt cache and
1458  * refresh its stamp
1459  */
1460 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1461 {
1462 	struct rt6_exception_bucket *bucket;
1463 	struct rt6_info *from = rt->from;
1464 	struct in6_addr *src_key = NULL;
1465 	struct rt6_exception *rt6_ex;
1466 
1467 	if (!from ||
1468 	    !(rt->rt6i_flags & RTF_CACHE))
1469 		return;
1470 
1471 	rcu_read_lock();
1472 	bucket = rcu_dereference(from->rt6i_exception_bucket);
1473 
1474 #ifdef CONFIG_IPV6_SUBTREES
1475 	/* rt6i_src.plen != 0 indicates 'from' is in subtree
1476 	 * and exception table is indexed by a hash of
1477 	 * both rt6i_dst and rt6i_src.
1478 	 * Otherwise, the exception table is indexed by
1479 	 * a hash of only rt6i_dst.
1480 	 */
1481 	if (from->rt6i_src.plen)
1482 		src_key = &rt->rt6i_src.addr;
1483 #endif
1484 	rt6_ex = __rt6_find_exception_rcu(&bucket,
1485 					  &rt->rt6i_dst.addr,
1486 					  src_key);
1487 	if (rt6_ex)
1488 		rt6_ex->stamp = jiffies;
1489 
1490 	rcu_read_unlock();
1491 }
1492 
1493 static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt)
1494 {
1495 	struct rt6_exception_bucket *bucket;
1496 	struct rt6_exception *rt6_ex;
1497 	int i;
1498 
1499 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1500 					lockdep_is_held(&rt6_exception_lock));
1501 
1502 	if (bucket) {
1503 		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1504 			hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1505 				rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
1506 			}
1507 			bucket++;
1508 		}
1509 	}
1510 }
1511 
1512 static void rt6_exceptions_update_pmtu(struct rt6_info *rt, int mtu)
1513 {
1514 	struct rt6_exception_bucket *bucket;
1515 	struct rt6_exception *rt6_ex;
1516 	int i;
1517 
1518 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1519 					lockdep_is_held(&rt6_exception_lock));
1520 
1521 	if (bucket) {
1522 		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1523 			hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1524 				struct rt6_info *entry = rt6_ex->rt6i;
1525 				/* For RTF_CACHE with rt6i_pmtu == 0
1526 				 * (i.e. a redirected route),
1527 				 * the metrics of its rt->dst.from has already
1528 				 * been updated.
1529 				 */
1530 				if (entry->rt6i_pmtu && entry->rt6i_pmtu > mtu)
1531 					entry->rt6i_pmtu = mtu;
1532 			}
1533 			bucket++;
1534 		}
1535 	}
1536 }
1537 
1538 #define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)
1539 
1540 static void rt6_exceptions_clean_tohost(struct rt6_info *rt,
1541 					struct in6_addr *gateway)
1542 {
1543 	struct rt6_exception_bucket *bucket;
1544 	struct rt6_exception *rt6_ex;
1545 	struct hlist_node *tmp;
1546 	int i;
1547 
1548 	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1549 		return;
1550 
1551 	spin_lock_bh(&rt6_exception_lock);
1552 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1553 				     lockdep_is_held(&rt6_exception_lock));
1554 
1555 	if (bucket) {
1556 		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1557 			hlist_for_each_entry_safe(rt6_ex, tmp,
1558 						  &bucket->chain, hlist) {
1559 				struct rt6_info *entry = rt6_ex->rt6i;
1560 
1561 				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
1562 				    RTF_CACHE_GATEWAY &&
1563 				    ipv6_addr_equal(gateway,
1564 						    &entry->rt6i_gateway)) {
1565 					rt6_remove_exception(bucket, rt6_ex);
1566 				}
1567 			}
1568 			bucket++;
1569 		}
1570 	}
1571 
1572 	spin_unlock_bh(&rt6_exception_lock);
1573 }
1574 
1575 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
1576 				      struct rt6_exception *rt6_ex,
1577 				      struct fib6_gc_args *gc_args,
1578 				      unsigned long now)
1579 {
1580 	struct rt6_info *rt = rt6_ex->rt6i;
1581 
1582 	/* we are pruning and obsoleting aged-out and non gateway exceptions
1583 	 * even if others have still references to them, so that on next
1584 	 * dst_check() such references can be dropped.
1585 	 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
1586 	 * expired, independently from their aging, as per RFC 8201 section 4
1587 	 */
1588 	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
1589 		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
1590 			RT6_TRACE("aging clone %p\n", rt);
1591 			rt6_remove_exception(bucket, rt6_ex);
1592 			return;
1593 		}
1594 	} else if (time_after(jiffies, rt->dst.expires)) {
1595 		RT6_TRACE("purging expired route %p\n", rt);
1596 		rt6_remove_exception(bucket, rt6_ex);
1597 		return;
1598 	}
1599 
1600 	if (rt->rt6i_flags & RTF_GATEWAY) {
1601 		struct neighbour *neigh;
1602 		__u8 neigh_flags = 0;
1603 
1604 		neigh = dst_neigh_lookup(&rt->dst, &rt->rt6i_gateway);
1605 		if (neigh) {
1606 			neigh_flags = neigh->flags;
1607 			neigh_release(neigh);
1608 		}
1609 		if (!(neigh_flags & NTF_ROUTER)) {
1610 			RT6_TRACE("purging route %p via non-router but gateway\n",
1611 				  rt);
1612 			rt6_remove_exception(bucket, rt6_ex);
1613 			return;
1614 		}
1615 	}
1616 
1617 	gc_args->more++;
1618 }
1619 
1620 void rt6_age_exceptions(struct rt6_info *rt,
1621 			struct fib6_gc_args *gc_args,
1622 			unsigned long now)
1623 {
1624 	struct rt6_exception_bucket *bucket;
1625 	struct rt6_exception *rt6_ex;
1626 	struct hlist_node *tmp;
1627 	int i;
1628 
1629 	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1630 		return;
1631 
1632 	spin_lock_bh(&rt6_exception_lock);
1633 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1634 				    lockdep_is_held(&rt6_exception_lock));
1635 
1636 	if (bucket) {
1637 		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1638 			hlist_for_each_entry_safe(rt6_ex, tmp,
1639 						  &bucket->chain, hlist) {
1640 				rt6_age_examine_exception(bucket, rt6_ex,
1641 							  gc_args, now);
1642 			}
1643 			bucket++;
1644 		}
1645 	}
1646 	spin_unlock_bh(&rt6_exception_lock);
1647 }
1648 
1649 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1650 			       int oif, struct flowi6 *fl6, int flags)
1651 {
1652 	struct fib6_node *fn, *saved_fn;
1653 	struct rt6_info *rt, *rt_cache;
1654 	int strict = 0;
1655 
1656 	strict |= flags & RT6_LOOKUP_F_IFACE;
1657 	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1658 	if (net->ipv6.devconf_all->forwarding == 0)
1659 		strict |= RT6_LOOKUP_F_REACHABLE;
1660 
1661 	rcu_read_lock();
1662 
1663 	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1664 	saved_fn = fn;
1665 
1666 	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1667 		oif = 0;
1668 
1669 redo_rt6_select:
1670 	rt = rt6_select(net, fn, oif, strict);
1671 	if (rt->rt6i_nsiblings)
1672 		rt = rt6_multipath_select(rt, fl6, oif, strict);
1673 	if (rt == net->ipv6.ip6_null_entry) {
1674 		fn = fib6_backtrack(fn, &fl6->saddr);
1675 		if (fn)
1676 			goto redo_rt6_select;
1677 		else if (strict & RT6_LOOKUP_F_REACHABLE) {
1678 			/* also consider unreachable route */
1679 			strict &= ~RT6_LOOKUP_F_REACHABLE;
1680 			fn = saved_fn;
1681 			goto redo_rt6_select;
1682 		}
1683 	}
1684 
1685 	/*Search through exception table */
1686 	rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
1687 	if (rt_cache)
1688 		rt = rt_cache;
1689 
1690 	if (rt == net->ipv6.ip6_null_entry) {
1691 		rcu_read_unlock();
1692 		dst_hold(&rt->dst);
1693 		trace_fib6_table_lookup(net, rt, table, fl6);
1694 		return rt;
1695 	} else if (rt->rt6i_flags & RTF_CACHE) {
1696 		if (ip6_hold_safe(net, &rt, true)) {
1697 			dst_use_noref(&rt->dst, jiffies);
1698 			rt6_dst_from_metrics_check(rt);
1699 		}
1700 		rcu_read_unlock();
1701 		trace_fib6_table_lookup(net, rt, table, fl6);
1702 		return rt;
1703 	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1704 			    !(rt->rt6i_flags & RTF_GATEWAY))) {
1705 		/* Create a RTF_CACHE clone which will not be
1706 		 * owned by the fib6 tree.  It is for the special case where
1707 		 * the daddr in the skb during the neighbor look-up is different
1708 		 * from the fl6->daddr used to look-up route here.
1709 		 */
1710 
1711 		struct rt6_info *uncached_rt;
1712 
1713 		if (ip6_hold_safe(net, &rt, true)) {
1714 			dst_use_noref(&rt->dst, jiffies);
1715 		} else {
1716 			rcu_read_unlock();
1717 			uncached_rt = rt;
1718 			goto uncached_rt_out;
1719 		}
1720 		rcu_read_unlock();
1721 
1722 		uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
1723 		dst_release(&rt->dst);
1724 
1725 		if (uncached_rt) {
1726 			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1727 			 * No need for another dst_hold()
1728 			 */
1729 			rt6_uncached_list_add(uncached_rt);
1730 			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
1731 		} else {
1732 			uncached_rt = net->ipv6.ip6_null_entry;
1733 			dst_hold(&uncached_rt->dst);
1734 		}
1735 
1736 uncached_rt_out:
1737 		trace_fib6_table_lookup(net, uncached_rt, table, fl6);
1738 		return uncached_rt;
1739 
1740 	} else {
1741 		/* Get a percpu copy */
1742 
1743 		struct rt6_info *pcpu_rt;
1744 
1745 		dst_use_noref(&rt->dst, jiffies);
1746 		local_bh_disable();
1747 		pcpu_rt = rt6_get_pcpu_route(rt);
1748 
1749 		if (!pcpu_rt) {
1750 			/* atomic_inc_not_zero() is needed when using rcu */
1751 			if (atomic_inc_not_zero(&rt->rt6i_ref)) {
1752 				/* No dst_hold() on rt is needed because grabbing
1753 				 * rt->rt6i_ref makes sure rt can't be released.
1754 				 */
1755 				pcpu_rt = rt6_make_pcpu_route(rt);
1756 				rt6_release(rt);
1757 			} else {
1758 				/* rt is already removed from tree */
1759 				pcpu_rt = net->ipv6.ip6_null_entry;
1760 				dst_hold(&pcpu_rt->dst);
1761 			}
1762 		}
1763 		local_bh_enable();
1764 		rcu_read_unlock();
1765 		trace_fib6_table_lookup(net, pcpu_rt, table, fl6);
1766 		return pcpu_rt;
1767 	}
1768 }
1769 EXPORT_SYMBOL_GPL(ip6_pol_route);
1770 
1771 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
1772 					    struct flowi6 *fl6, int flags)
1773 {
1774 	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
1775 }
1776 
1777 struct dst_entry *ip6_route_input_lookup(struct net *net,
1778 					 struct net_device *dev,
1779 					 struct flowi6 *fl6, int flags)
1780 {
1781 	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1782 		flags |= RT6_LOOKUP_F_IFACE;
1783 
1784 	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
1785 }
1786 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1787 
1788 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1789 				  struct flow_keys *keys)
1790 {
1791 	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1792 	const struct ipv6hdr *key_iph = outer_iph;
1793 	const struct ipv6hdr *inner_iph;
1794 	const struct icmp6hdr *icmph;
1795 	struct ipv6hdr _inner_iph;
1796 
1797 	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1798 		goto out;
1799 
1800 	icmph = icmp6_hdr(skb);
1801 	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1802 	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1803 	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1804 	    icmph->icmp6_type != ICMPV6_PARAMPROB)
1805 		goto out;
1806 
1807 	inner_iph = skb_header_pointer(skb,
1808 				       skb_transport_offset(skb) + sizeof(*icmph),
1809 				       sizeof(_inner_iph), &_inner_iph);
1810 	if (!inner_iph)
1811 		goto out;
1812 
1813 	key_iph = inner_iph;
1814 out:
1815 	memset(keys, 0, sizeof(*keys));
1816 	keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1817 	keys->addrs.v6addrs.src = key_iph->saddr;
1818 	keys->addrs.v6addrs.dst = key_iph->daddr;
1819 	keys->tags.flow_label = ip6_flowinfo(key_iph);
1820 	keys->basic.ip_proto = key_iph->nexthdr;
1821 }
1822 
1823 /* if skb is set it will be used and fl6 can be NULL */
1824 u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb)
1825 {
1826 	struct flow_keys hash_keys;
1827 
1828 	if (skb) {
1829 		ip6_multipath_l3_keys(skb, &hash_keys);
1830 		return flow_hash_from_keys(&hash_keys) >> 1;
1831 	}
1832 
1833 	return get_hash_from_flowi6(fl6) >> 1;
1834 }
1835 
1836 void ip6_route_input(struct sk_buff *skb)
1837 {
1838 	const struct ipv6hdr *iph = ipv6_hdr(skb);
1839 	struct net *net = dev_net(skb->dev);
1840 	int flags = RT6_LOOKUP_F_HAS_SADDR;
1841 	struct ip_tunnel_info *tun_info;
1842 	struct flowi6 fl6 = {
1843 		.flowi6_iif = skb->dev->ifindex,
1844 		.daddr = iph->daddr,
1845 		.saddr = iph->saddr,
1846 		.flowlabel = ip6_flowinfo(iph),
1847 		.flowi6_mark = skb->mark,
1848 		.flowi6_proto = iph->nexthdr,
1849 	};
1850 
1851 	tun_info = skb_tunnel_info(skb);
1852 	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1853 		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
1854 	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
1855 		fl6.mp_hash = rt6_multipath_hash(&fl6, skb);
1856 	skb_dst_drop(skb);
1857 	skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
1858 }
1859 
1860 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
1861 					     struct flowi6 *fl6, int flags)
1862 {
1863 	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
1864 }
1865 
1866 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
1867 					 struct flowi6 *fl6, int flags)
1868 {
1869 	bool any_src;
1870 
1871 	if (rt6_need_strict(&fl6->daddr)) {
1872 		struct dst_entry *dst;
1873 
1874 		dst = l3mdev_link_scope_lookup(net, fl6);
1875 		if (dst)
1876 			return dst;
1877 	}
1878 
1879 	fl6->flowi6_iif = LOOPBACK_IFINDEX;
1880 
1881 	any_src = ipv6_addr_any(&fl6->saddr);
1882 	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
1883 	    (fl6->flowi6_oif && any_src))
1884 		flags |= RT6_LOOKUP_F_IFACE;
1885 
1886 	if (!any_src)
1887 		flags |= RT6_LOOKUP_F_HAS_SADDR;
1888 	else if (sk)
1889 		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1890 
1891 	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
1892 }
1893 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
1894 
1895 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
1896 {
1897 	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1898 	struct net_device *loopback_dev = net->loopback_dev;
1899 	struct dst_entry *new = NULL;
1900 
1901 	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
1902 		       DST_OBSOLETE_DEAD, 0);
1903 	if (rt) {
1904 		rt6_info_init(rt);
1905 		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
1906 
1907 		new = &rt->dst;
1908 		new->__use = 1;
1909 		new->input = dst_discard;
1910 		new->output = dst_discard_out;
1911 
1912 		dst_copy_metrics(new, &ort->dst);
1913 
1914 		rt->rt6i_idev = in6_dev_get(loopback_dev);
1915 		rt->rt6i_gateway = ort->rt6i_gateway;
1916 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
1917 		rt->rt6i_metric = 0;
1918 
1919 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1920 #ifdef CONFIG_IPV6_SUBTREES
1921 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1922 #endif
1923 	}
1924 
1925 	dst_release(dst_orig);
1926 	return new ? new : ERR_PTR(-ENOMEM);
1927 }
1928 
1929 /*
1930  *	Destination cache support functions
1931  */
1932 
1933 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1934 {
1935 	if (rt->from &&
1936 	    dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(&rt->from->dst))
1937 		dst_init_metrics(&rt->dst, dst_metrics_ptr(&rt->from->dst), true);
1938 }
1939 
1940 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1941 {
1942 	u32 rt_cookie = 0;
1943 
1944 	if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie)
1945 		return NULL;
1946 
1947 	if (rt6_check_expired(rt))
1948 		return NULL;
1949 
1950 	return &rt->dst;
1951 }
1952 
1953 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1954 {
1955 	if (!__rt6_check_expired(rt) &&
1956 	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1957 	    rt6_check(rt->from, cookie))
1958 		return &rt->dst;
1959 	else
1960 		return NULL;
1961 }
1962 
1963 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1964 {
1965 	struct rt6_info *rt;
1966 
1967 	rt = (struct rt6_info *) dst;
1968 
1969 	/* All IPV6 dsts are created with ->obsolete set to the value
1970 	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1971 	 * into this function always.
1972 	 */
1973 
1974 	rt6_dst_from_metrics_check(rt);
1975 
1976 	if (rt->rt6i_flags & RTF_PCPU ||
1977 	    (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from))
1978 		return rt6_dst_from_check(rt, cookie);
1979 	else
1980 		return rt6_check(rt, cookie);
1981 }
1982 
1983 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1984 {
1985 	struct rt6_info *rt = (struct rt6_info *) dst;
1986 
1987 	if (rt) {
1988 		if (rt->rt6i_flags & RTF_CACHE) {
1989 			if (rt6_check_expired(rt)) {
1990 				ip6_del_rt(rt);
1991 				dst = NULL;
1992 			}
1993 		} else {
1994 			dst_release(dst);
1995 			dst = NULL;
1996 		}
1997 	}
1998 	return dst;
1999 }
2000 
2001 static void ip6_link_failure(struct sk_buff *skb)
2002 {
2003 	struct rt6_info *rt;
2004 
2005 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2006 
2007 	rt = (struct rt6_info *) skb_dst(skb);
2008 	if (rt) {
2009 		if (rt->rt6i_flags & RTF_CACHE) {
2010 			if (dst_hold_safe(&rt->dst))
2011 				ip6_del_rt(rt);
2012 		} else {
2013 			struct fib6_node *fn;
2014 
2015 			rcu_read_lock();
2016 			fn = rcu_dereference(rt->rt6i_node);
2017 			if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2018 				fn->fn_sernum = -1;
2019 			rcu_read_unlock();
2020 		}
2021 	}
2022 }
2023 
2024 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2025 {
2026 	struct net *net = dev_net(rt->dst.dev);
2027 
2028 	rt->rt6i_flags |= RTF_MODIFIED;
2029 	rt->rt6i_pmtu = mtu;
2030 	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2031 }
2032 
2033 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2034 {
2035 	return !(rt->rt6i_flags & RTF_CACHE) &&
2036 		(rt->rt6i_flags & RTF_PCPU ||
2037 		 rcu_access_pointer(rt->rt6i_node));
2038 }
2039 
2040 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2041 				 const struct ipv6hdr *iph, u32 mtu)
2042 {
2043 	const struct in6_addr *daddr, *saddr;
2044 	struct rt6_info *rt6 = (struct rt6_info *)dst;
2045 
2046 	if (rt6->rt6i_flags & RTF_LOCAL)
2047 		return;
2048 
2049 	if (dst_metric_locked(dst, RTAX_MTU))
2050 		return;
2051 
2052 	if (iph) {
2053 		daddr = &iph->daddr;
2054 		saddr = &iph->saddr;
2055 	} else if (sk) {
2056 		daddr = &sk->sk_v6_daddr;
2057 		saddr = &inet6_sk(sk)->saddr;
2058 	} else {
2059 		daddr = NULL;
2060 		saddr = NULL;
2061 	}
2062 	dst_confirm_neigh(dst, daddr);
2063 	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2064 	if (mtu >= dst_mtu(dst))
2065 		return;
2066 
2067 	if (!rt6_cache_allowed_for_pmtu(rt6)) {
2068 		rt6_do_update_pmtu(rt6, mtu);
2069 		/* update rt6_ex->stamp for cache */
2070 		if (rt6->rt6i_flags & RTF_CACHE)
2071 			rt6_update_exception_stamp_rt(rt6);
2072 	} else if (daddr) {
2073 		struct rt6_info *nrt6;
2074 
2075 		nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
2076 		if (nrt6) {
2077 			rt6_do_update_pmtu(nrt6, mtu);
2078 			if (rt6_insert_exception(nrt6, rt6))
2079 				dst_release_immediate(&nrt6->dst);
2080 		}
2081 	}
2082 }
2083 
2084 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2085 			       struct sk_buff *skb, u32 mtu)
2086 {
2087 	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2088 }
2089 
2090 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2091 		     int oif, u32 mark, kuid_t uid)
2092 {
2093 	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2094 	struct dst_entry *dst;
2095 	struct flowi6 fl6;
2096 
2097 	memset(&fl6, 0, sizeof(fl6));
2098 	fl6.flowi6_oif = oif;
2099 	fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
2100 	fl6.daddr = iph->daddr;
2101 	fl6.saddr = iph->saddr;
2102 	fl6.flowlabel = ip6_flowinfo(iph);
2103 	fl6.flowi6_uid = uid;
2104 
2105 	dst = ip6_route_output(net, NULL, &fl6);
2106 	if (!dst->error)
2107 		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2108 	dst_release(dst);
2109 }
2110 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2111 
2112 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2113 {
2114 	struct dst_entry *dst;
2115 
2116 	ip6_update_pmtu(skb, sock_net(sk), mtu,
2117 			sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);
2118 
2119 	dst = __sk_dst_get(sk);
2120 	if (!dst || !dst->obsolete ||
2121 	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2122 		return;
2123 
2124 	bh_lock_sock(sk);
2125 	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2126 		ip6_datagram_dst_update(sk, false);
2127 	bh_unlock_sock(sk);
2128 }
2129 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2130 
2131 /* Handle redirects */
2132 struct ip6rd_flowi {
2133 	struct flowi6 fl6;
2134 	struct in6_addr gateway;
2135 };
2136 
2137 static struct rt6_info *__ip6_route_redirect(struct net *net,
2138 					     struct fib6_table *table,
2139 					     struct flowi6 *fl6,
2140 					     int flags)
2141 {
2142 	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2143 	struct rt6_info *rt, *rt_cache;
2144 	struct fib6_node *fn;
2145 
2146 	/* Get the "current" route for this destination and
2147 	 * check if the redirect has come from appropriate router.
2148 	 *
2149 	 * RFC 4861 specifies that redirects should only be
2150 	 * accepted if they come from the nexthop to the target.
2151 	 * Due to the way the routes are chosen, this notion
2152 	 * is a bit fuzzy and one might need to check all possible
2153 	 * routes.
2154 	 */
2155 
2156 	rcu_read_lock();
2157 	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2158 restart:
2159 	for_each_fib6_node_rt_rcu(fn) {
2160 		if (rt->rt6i_nh_flags & RTNH_F_DEAD)
2161 			continue;
2162 		if (rt6_check_expired(rt))
2163 			continue;
2164 		if (rt->dst.error)
2165 			break;
2166 		if (!(rt->rt6i_flags & RTF_GATEWAY))
2167 			continue;
2168 		if (fl6->flowi6_oif != rt->dst.dev->ifindex)
2169 			continue;
2170 		/* rt_cache's gateway might be different from its 'parent'
2171 		 * in the case of an ip redirect.
2172 		 * So we keep searching in the exception table if the gateway
2173 		 * is different.
2174 		 */
2175 		if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) {
2176 			rt_cache = rt6_find_cached_rt(rt,
2177 						      &fl6->daddr,
2178 						      &fl6->saddr);
2179 			if (rt_cache &&
2180 			    ipv6_addr_equal(&rdfl->gateway,
2181 					    &rt_cache->rt6i_gateway)) {
2182 				rt = rt_cache;
2183 				break;
2184 			}
2185 			continue;
2186 		}
2187 		break;
2188 	}
2189 
2190 	if (!rt)
2191 		rt = net->ipv6.ip6_null_entry;
2192 	else if (rt->dst.error) {
2193 		rt = net->ipv6.ip6_null_entry;
2194 		goto out;
2195 	}
2196 
2197 	if (rt == net->ipv6.ip6_null_entry) {
2198 		fn = fib6_backtrack(fn, &fl6->saddr);
2199 		if (fn)
2200 			goto restart;
2201 	}
2202 
2203 out:
2204 	ip6_hold_safe(net, &rt, true);
2205 
2206 	rcu_read_unlock();
2207 
2208 	trace_fib6_table_lookup(net, rt, table, fl6);
2209 	return rt;
2210 };
2211 
2212 static struct dst_entry *ip6_route_redirect(struct net *net,
2213 					const struct flowi6 *fl6,
2214 					const struct in6_addr *gateway)
2215 {
2216 	int flags = RT6_LOOKUP_F_HAS_SADDR;
2217 	struct ip6rd_flowi rdfl;
2218 
2219 	rdfl.fl6 = *fl6;
2220 	rdfl.gateway = *gateway;
2221 
2222 	return fib6_rule_lookup(net, &rdfl.fl6,
2223 				flags, __ip6_route_redirect);
2224 }
2225 
2226 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2227 		  kuid_t uid)
2228 {
2229 	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2230 	struct dst_entry *dst;
2231 	struct flowi6 fl6;
2232 
2233 	memset(&fl6, 0, sizeof(fl6));
2234 	fl6.flowi6_iif = LOOPBACK_IFINDEX;
2235 	fl6.flowi6_oif = oif;
2236 	fl6.flowi6_mark = mark;
2237 	fl6.daddr = iph->daddr;
2238 	fl6.saddr = iph->saddr;
2239 	fl6.flowlabel = ip6_flowinfo(iph);
2240 	fl6.flowi6_uid = uid;
2241 
2242 	dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
2243 	rt6_do_redirect(dst, NULL, skb);
2244 	dst_release(dst);
2245 }
2246 EXPORT_SYMBOL_GPL(ip6_redirect);
2247 
2248 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2249 			    u32 mark)
2250 {
2251 	const struct ipv6hdr *iph = ipv6_hdr(skb);
2252 	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2253 	struct dst_entry *dst;
2254 	struct flowi6 fl6;
2255 
2256 	memset(&fl6, 0, sizeof(fl6));
2257 	fl6.flowi6_iif = LOOPBACK_IFINDEX;
2258 	fl6.flowi6_oif = oif;
2259 	fl6.flowi6_mark = mark;
2260 	fl6.daddr = msg->dest;
2261 	fl6.saddr = iph->daddr;
2262 	fl6.flowi6_uid = sock_net_uid(net, NULL);
2263 
2264 	dst = ip6_route_redirect(net, &fl6, &iph->saddr);
2265 	rt6_do_redirect(dst, NULL, skb);
2266 	dst_release(dst);
2267 }
2268 
2269 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2270 {
2271 	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2272 		     sk->sk_uid);
2273 }
2274 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2275 
2276 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2277 {
2278 	struct net_device *dev = dst->dev;
2279 	unsigned int mtu = dst_mtu(dst);
2280 	struct net *net = dev_net(dev);
2281 
2282 	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2283 
2284 	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2285 		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2286 
2287 	/*
2288 	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2289 	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2290 	 * IPV6_MAXPLEN is also valid and means: "any MSS,
2291 	 * rely only on pmtu discovery"
2292 	 */
2293 	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2294 		mtu = IPV6_MAXPLEN;
2295 	return mtu;
2296 }
2297 
2298 static unsigned int ip6_mtu(const struct dst_entry *dst)
2299 {
2300 	const struct rt6_info *rt = (const struct rt6_info *)dst;
2301 	unsigned int mtu = rt->rt6i_pmtu;
2302 	struct inet6_dev *idev;
2303 
2304 	if (mtu)
2305 		goto out;
2306 
2307 	mtu = dst_metric_raw(dst, RTAX_MTU);
2308 	if (mtu)
2309 		goto out;
2310 
2311 	mtu = IPV6_MIN_MTU;
2312 
2313 	rcu_read_lock();
2314 	idev = __in6_dev_get(dst->dev);
2315 	if (idev)
2316 		mtu = idev->cnf.mtu6;
2317 	rcu_read_unlock();
2318 
2319 out:
2320 	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2321 
2322 	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2323 }
2324 
2325 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2326 				  struct flowi6 *fl6)
2327 {
2328 	struct dst_entry *dst;
2329 	struct rt6_info *rt;
2330 	struct inet6_dev *idev = in6_dev_get(dev);
2331 	struct net *net = dev_net(dev);
2332 
2333 	if (unlikely(!idev))
2334 		return ERR_PTR(-ENODEV);
2335 
2336 	rt = ip6_dst_alloc(net, dev, 0);
2337 	if (unlikely(!rt)) {
2338 		in6_dev_put(idev);
2339 		dst = ERR_PTR(-ENOMEM);
2340 		goto out;
2341 	}
2342 
2343 	rt->dst.flags |= DST_HOST;
2344 	rt->dst.input = ip6_input;
2345 	rt->dst.output  = ip6_output;
2346 	rt->rt6i_gateway  = fl6->daddr;
2347 	rt->rt6i_dst.addr = fl6->daddr;
2348 	rt->rt6i_dst.plen = 128;
2349 	rt->rt6i_idev     = idev;
2350 	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2351 
2352 	/* Add this dst into uncached_list so that rt6_disable_ip() can
2353 	 * do proper release of the net_device
2354 	 */
2355 	rt6_uncached_list_add(rt);
2356 	atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2357 
2358 	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2359 
2360 out:
2361 	return dst;
2362 }
2363 
2364 static int ip6_dst_gc(struct dst_ops *ops)
2365 {
2366 	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2367 	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2368 	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2369 	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2370 	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2371 	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2372 	int entries;
2373 
2374 	entries = dst_entries_get_fast(ops);
2375 	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2376 	    entries <= rt_max_size)
2377 		goto out;
2378 
2379 	net->ipv6.ip6_rt_gc_expire++;
2380 	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2381 	entries = dst_entries_get_slow(ops);
2382 	if (entries < ops->gc_thresh)
2383 		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2384 out:
2385 	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2386 	return entries > rt_max_size;
2387 }
2388 
2389 static int ip6_convert_metrics(struct mx6_config *mxc,
2390 			       const struct fib6_config *cfg)
2391 {
2392 	struct net *net = cfg->fc_nlinfo.nl_net;
2393 	bool ecn_ca = false;
2394 	struct nlattr *nla;
2395 	int remaining;
2396 	u32 *mp;
2397 
2398 	if (!cfg->fc_mx)
2399 		return 0;
2400 
2401 	mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
2402 	if (unlikely(!mp))
2403 		return -ENOMEM;
2404 
2405 	nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
2406 		int type = nla_type(nla);
2407 		u32 val;
2408 
2409 		if (!type)
2410 			continue;
2411 		if (unlikely(type > RTAX_MAX))
2412 			goto err;
2413 
2414 		if (type == RTAX_CC_ALGO) {
2415 			char tmp[TCP_CA_NAME_MAX];
2416 
2417 			nla_strlcpy(tmp, nla, sizeof(tmp));
2418 			val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca);
2419 			if (val == TCP_CA_UNSPEC)
2420 				goto err;
2421 		} else {
2422 			val = nla_get_u32(nla);
2423 		}
2424 		if (type == RTAX_HOPLIMIT && val > 255)
2425 			val = 255;
2426 		if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
2427 			goto err;
2428 
2429 		mp[type - 1] = val;
2430 		__set_bit(type - 1, mxc->mx_valid);
2431 	}
2432 
2433 	if (ecn_ca) {
2434 		__set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
2435 		mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
2436 	}
2437 
2438 	mxc->mx = mp;
2439 	return 0;
2440  err:
2441 	kfree(mp);
2442 	return -EINVAL;
2443 }
2444 
2445 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2446 					    struct fib6_config *cfg,
2447 					    const struct in6_addr *gw_addr,
2448 					    u32 tbid, int flags)
2449 {
2450 	struct flowi6 fl6 = {
2451 		.flowi6_oif = cfg->fc_ifindex,
2452 		.daddr = *gw_addr,
2453 		.saddr = cfg->fc_prefsrc,
2454 	};
2455 	struct fib6_table *table;
2456 	struct rt6_info *rt;
2457 
2458 	table = fib6_get_table(net, tbid);
2459 	if (!table)
2460 		return NULL;
2461 
2462 	if (!ipv6_addr_any(&cfg->fc_prefsrc))
2463 		flags |= RT6_LOOKUP_F_HAS_SADDR;
2464 
2465 	flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2466 	rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags);
2467 
2468 	/* if table lookup failed, fall back to full lookup */
2469 	if (rt == net->ipv6.ip6_null_entry) {
2470 		ip6_rt_put(rt);
2471 		rt = NULL;
2472 	}
2473 
2474 	return rt;
2475 }
2476 
2477 static int ip6_route_check_nh_onlink(struct net *net,
2478 				     struct fib6_config *cfg,
2479 				     struct net_device *dev,
2480 				     struct netlink_ext_ack *extack)
2481 {
2482 	u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_LOCAL;
2483 	const struct in6_addr *gw_addr = &cfg->fc_gateway;
2484 	u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2485 	struct rt6_info *grt;
2486 	int err;
2487 
2488 	err = 0;
2489 	grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2490 	if (grt) {
2491 		if (grt->rt6i_flags & flags || dev != grt->dst.dev) {
2492 			NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway");
2493 			err = -EINVAL;
2494 		}
2495 
2496 		ip6_rt_put(grt);
2497 	}
2498 
2499 	return err;
2500 }
2501 
2502 static int ip6_route_check_nh(struct net *net,
2503 			      struct fib6_config *cfg,
2504 			      struct net_device **_dev,
2505 			      struct inet6_dev **idev)
2506 {
2507 	const struct in6_addr *gw_addr = &cfg->fc_gateway;
2508 	struct net_device *dev = _dev ? *_dev : NULL;
2509 	struct rt6_info *grt = NULL;
2510 	int err = -EHOSTUNREACH;
2511 
2512 	if (cfg->fc_table) {
2513 		int flags = RT6_LOOKUP_F_IFACE;
2514 
2515 		grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2516 					  cfg->fc_table, flags);
2517 		if (grt) {
2518 			if (grt->rt6i_flags & RTF_GATEWAY ||
2519 			    (dev && dev != grt->dst.dev)) {
2520 				ip6_rt_put(grt);
2521 				grt = NULL;
2522 			}
2523 		}
2524 	}
2525 
2526 	if (!grt)
2527 		grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
2528 
2529 	if (!grt)
2530 		goto out;
2531 
2532 	if (dev) {
2533 		if (dev != grt->dst.dev) {
2534 			ip6_rt_put(grt);
2535 			goto out;
2536 		}
2537 	} else {
2538 		*_dev = dev = grt->dst.dev;
2539 		*idev = grt->rt6i_idev;
2540 		dev_hold(dev);
2541 		in6_dev_hold(grt->rt6i_idev);
2542 	}
2543 
2544 	if (!(grt->rt6i_flags & RTF_GATEWAY))
2545 		err = 0;
2546 
2547 	ip6_rt_put(grt);
2548 
2549 out:
2550 	return err;
2551 }
2552 
/*
 * Build a new route entry from @cfg without inserting it into the FIB.
 *
 * The request is validated, the egress device (if any) and the FIB table are
 * resolved, an rt6_info is allocated and then filled in from @cfg.  Routes
 * flagged RTF_REJECT - and non-local routes via a loopback device - are
 * turned into reject routes bound to the namespace's loopback device.
 *
 * Side effects on @cfg: fc_metric is defaulted to IP6_RT_PRIO_USER when 0,
 * fc_protocol is defaulted to RTPROT_BOOT when RTPROT_UNSPEC, and
 * fc_nlinfo.nl_net is set to the namespace of the final device.
 *
 * On success the device and inet6_dev references taken here are stored in
 * the returned route.  On failure they are dropped, the partially built
 * route is released and ERR_PTR(-errno) is returned; @extack carries a
 * message for most validation failures.
 */
static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
					      struct netlink_ext_ack *extack)
{
	struct net *net = cfg->fc_nlinfo.nl_net;
	struct rt6_info *rt = NULL;
	struct net_device *dev = NULL;
	struct inet6_dev *idev = NULL;
	struct fib6_table *table;
	int addr_type;
	int err = -EINVAL;

	/* RTF_PCPU is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_PCPU) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
		goto out;
	}

	/* RTF_CACHE is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_CACHE) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
		goto out;
	}

	if (cfg->fc_dst_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid prefix length");
		goto out;
	}
	if (cfg->fc_src_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid source address length");
		goto out;
	}
#ifndef CONFIG_IPV6_SUBTREES
	if (cfg->fc_src_len) {
		NL_SET_ERR_MSG(extack,
			       "Specifying source address requires IPV6_SUBTREES to be enabled");
		goto out;
	}
#endif
	/* Take references on the requested device and its inet6_dev; both
	 * are dropped at 'out' on any later failure.
	 */
	if (cfg->fc_ifindex) {
		err = -ENODEV;
		dev = dev_get_by_index(net, cfg->fc_ifindex);
		if (!dev)
			goto out;
		idev = in6_dev_get(dev);
		if (!idev)
			goto out;
	}

	if (cfg->fc_metric == 0)
		cfg->fc_metric = IP6_RT_PRIO_USER;

	/* An onlink nexthop needs an explicit device that is up */
	if (cfg->fc_flags & RTNH_F_ONLINK) {
		if (!dev) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop device required for onlink");
			err = -ENODEV;
			goto out;
		}

		if (!(dev->flags & IFF_UP)) {
			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
			err = -ENETDOWN;
			goto out;
		}
	}

	/* Resolve the table.  A netlink request without NLM_F_CREATE only
	 * gets a warning; the table is still created if it is missing.
	 */
	err = -ENOBUFS;
	if (cfg->fc_nlinfo.nlh &&
	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
		table = fib6_get_table(net, cfg->fc_table);
		if (!table) {
			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
			table = fib6_new_table(net, cfg->fc_table);
		}
	} else {
		table = fib6_new_table(net, cfg->fc_table);
	}

	if (!table)
		goto out;

	/* Only RTF_ADDRCONF routes are allocated without DST_NOCOUNT */
	rt = ip6_dst_alloc(net, NULL,
			   (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);

	if (!rt) {
		err = -ENOMEM;
		goto out;
	}

	if (cfg->fc_flags & RTF_EXPIRES)
		rt6_set_expires(rt, jiffies +
				clock_t_to_jiffies(cfg->fc_expires));
	else
		rt6_clean_expires(rt);

	if (cfg->fc_protocol == RTPROT_UNSPEC)
		cfg->fc_protocol = RTPROT_BOOT;
	rt->rt6i_protocol = cfg->fc_protocol;

	addr_type = ipv6_addr_type(&cfg->fc_dst);

	/* Input handler depends on the kind of destination */
	if (addr_type & IPV6_ADDR_MULTICAST)
		rt->dst.input = ip6_mc_input;
	else if (cfg->fc_flags & RTF_LOCAL)
		rt->dst.input = ip6_input;
	else
		rt->dst.input = ip6_forward;

	rt->dst.output = ip6_output;

	/* Lightweight tunnel encap: build its state and, where the tunnel
	 * asks for it, interpose its input/output handlers while remembering
	 * the original ones.
	 */
	if (cfg->fc_encap) {
		struct lwtunnel_state *lwtstate;

		err = lwtunnel_build_state(cfg->fc_encap_type,
					   cfg->fc_encap, AF_INET6, cfg,
					   &lwtstate, extack);
		if (err)
			goto out;
		rt->dst.lwtstate = lwtstate_get(lwtstate);
		if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
			rt->dst.lwtstate->orig_output = rt->dst.output;
			rt->dst.output = lwtunnel_output;
		}
		if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
			rt->dst.lwtstate->orig_input = rt->dst.input;
			rt->dst.input = lwtunnel_input;
		}
	}

	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
	rt->rt6i_dst.plen = cfg->fc_dst_len;
	if (rt->rt6i_dst.plen == 128)
		rt->dst.flags |= DST_HOST;

#ifdef CONFIG_IPV6_SUBTREES
	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
	rt->rt6i_src.plen = cfg->fc_src_len;
#endif

	rt->rt6i_metric = cfg->fc_metric;
	rt->rt6i_nh_weight = 1;

	/* We cannot add true routes via loopback here,
	   they would result in kernel looping; promote them to reject routes
	 */
	if ((cfg->fc_flags & RTF_REJECT) ||
	    (dev && (dev->flags & IFF_LOOPBACK) &&
	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
	     !(cfg->fc_flags & RTF_LOCAL))) {
		/* hold loopback dev/idev if we haven't done so. */
		if (dev != net->loopback_dev) {
			if (dev) {
				dev_put(dev);
				in6_dev_put(idev);
			}
			dev = net->loopback_dev;
			dev_hold(dev);
			idev = in6_dev_get(dev);
			if (!idev) {
				err = -ENODEV;
				goto out;
			}
		}
		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
		/* The route type selects the error code and the handlers */
		switch (cfg->fc_type) {
		case RTN_BLACKHOLE:
			rt->dst.error = -EINVAL;
			rt->dst.output = dst_discard_out;
			rt->dst.input = dst_discard;
			break;
		case RTN_PROHIBIT:
			rt->dst.error = -EACCES;
			rt->dst.output = ip6_pkt_prohibit_out;
			rt->dst.input = ip6_pkt_prohibit;
			break;
		case RTN_THROW:
		case RTN_UNREACHABLE:
		default:
			rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
					: (cfg->fc_type == RTN_UNREACHABLE)
					? -EHOSTUNREACH : -ENETUNREACH;
			rt->dst.output = ip6_pkt_discard_out;
			rt->dst.input = ip6_pkt_discard;
			break;
		}
		/* reject routes skip gateway, device-up and prefsrc checks */
		goto install_route;
	}

	if (cfg->fc_flags & RTF_GATEWAY) {
		const struct in6_addr *gw_addr;
		int gwa_type;

		gw_addr = &cfg->fc_gateway;
		gwa_type = ipv6_addr_type(gw_addr);

		/* if gw_addr is local we will fail to detect this in case
		 * address is still TENTATIVE (DAD in progress). rt6_lookup()
		 * will return already-added prefix route via interface that
		 * prefix route was assigned to, which might be non-loopback.
		 */
		err = -EINVAL;
		if (ipv6_chk_addr_and_flags(net, gw_addr,
					    gwa_type & IPV6_ADDR_LINKLOCAL ?
					    dev : NULL, 0, 0)) {
			NL_SET_ERR_MSG(extack, "Invalid gateway address");
			goto out;
		}
		rt->rt6i_gateway = *gw_addr;

		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
			/* IPv6 strictly inhibits using non-link-local
			   addresses as nexthop address.
			   Otherwise, the router will not be able to send
			   redirects.
			   It is very good, but in some (rare!) circumstances
			   (SIT, PtP, NBMA NOARP links) it is handy to allow
			   some exceptions. --ANK
			   We allow IPv4-mapped nexthops to support RFC4798-type
			   addressing
			 */
			if (!(gwa_type & (IPV6_ADDR_UNICAST |
					  IPV6_ADDR_MAPPED))) {
				NL_SET_ERR_MSG(extack,
					       "Invalid gateway address");
				goto out;
			}

			/* ip6_route_check_nh() may fill in dev/idev (with
			 * references held) when no device was specified.
			 */
			if (cfg->fc_flags & RTNH_F_ONLINK) {
				err = ip6_route_check_nh_onlink(net, cfg, dev,
								extack);
			} else {
				err = ip6_route_check_nh(net, cfg, &dev, &idev);
			}
			if (err)
				goto out;
		}
		err = -EINVAL;
		if (!dev) {
			NL_SET_ERR_MSG(extack, "Egress device not specified");
			goto out;
		} else if (dev->flags & IFF_LOOPBACK) {
			NL_SET_ERR_MSG(extack,
				       "Egress device can not be loopback device for this route");
			goto out;
		}
	}

	err = -ENODEV;
	if (!dev)
		goto out;

	if (!(dev->flags & IFF_UP)) {
		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
		err = -ENETDOWN;
		goto out;
	}

	/* A preferred source must pass ipv6_chk_addr() for this device */
	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
			NL_SET_ERR_MSG(extack, "Invalid source address");
			err = -EINVAL;
			goto out;
		}
		rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
		rt->rt6i_prefsrc.plen = 128;
	} else
		rt->rt6i_prefsrc.plen = 0;

	rt->rt6i_flags = cfg->fc_flags;

install_route:
	/* Non-local, non-anycast routes start out LINKDOWN when the device
	 * has no carrier.
	 */
	if (!(rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
	    !netif_carrier_ok(dev))
		rt->rt6i_nh_flags |= RTNH_F_LINKDOWN;
	rt->rt6i_nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
	/* the dev/idev references held above now belong to the route */
	rt->dst.dev = dev;
	rt->rt6i_idev = idev;
	rt->rt6i_table = table;

	cfg->fc_nlinfo.nl_net = dev_net(dev);

	return rt;
out:
	if (dev)
		dev_put(dev);
	if (idev)
		in6_dev_put(idev);
	if (rt)
		dst_release_immediate(&rt->dst);

	return ERR_PTR(err);
}
2844 
2845 int ip6_route_add(struct fib6_config *cfg,
2846 		  struct netlink_ext_ack *extack)
2847 {
2848 	struct mx6_config mxc = { .mx = NULL, };
2849 	struct rt6_info *rt;
2850 	int err;
2851 
2852 	rt = ip6_route_info_create(cfg, extack);
2853 	if (IS_ERR(rt)) {
2854 		err = PTR_ERR(rt);
2855 		rt = NULL;
2856 		goto out;
2857 	}
2858 
2859 	err = ip6_convert_metrics(&mxc, cfg);
2860 	if (err)
2861 		goto out;
2862 
2863 	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack);
2864 
2865 	kfree(mxc.mx);
2866 
2867 	return err;
2868 out:
2869 	if (rt)
2870 		dst_release_immediate(&rt->dst);
2871 
2872 	return err;
2873 }
2874 
2875 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
2876 {
2877 	int err;
2878 	struct fib6_table *table;
2879 	struct net *net = dev_net(rt->dst.dev);
2880 
2881 	if (rt == net->ipv6.ip6_null_entry) {
2882 		err = -ENOENT;
2883 		goto out;
2884 	}
2885 
2886 	table = rt->rt6i_table;
2887 	spin_lock_bh(&table->tb6_lock);
2888 	err = fib6_del(rt, info);
2889 	spin_unlock_bh(&table->tb6_lock);
2890 
2891 out:
2892 	ip6_rt_put(rt);
2893 	return err;
2894 }
2895 
2896 int ip6_del_rt(struct rt6_info *rt)
2897 {
2898 	struct nl_info info = {
2899 		.nl_net = dev_net(rt->dst.dev),
2900 	};
2901 	return __ip6_del_rt(rt, &info);
2902 }
2903 
/*
 * Delete @rt and, when it is part of a multipath route and
 * cfg->fc_delete_all_nh is set, all of its siblings as well.
 *
 * For the multipath case a single RTM_DELROUTE message covering all hops is
 * prepared up front; if that succeeds, per-route notifications are
 * suppressed via info->skip_notify and the combined message is sent after
 * the table lock has been released.
 *
 * Drops the caller's reference on @rt.  Returns -ENOENT for the null entry,
 * otherwise the result of the first failing fib6_del() or of the final
 * fib6_del() on @rt.
 */
static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
{
	struct nl_info *info = &cfg->fc_nlinfo;
	struct net *net = info->nl_net;
	struct sk_buff *skb = NULL;
	struct fib6_table *table;
	int err = -ENOENT;

	/* table is not set on this path; out_put does not use it */
	if (rt == net->ipv6.ip6_null_entry)
		goto out_put;
	table = rt->rt6i_table;
	spin_lock_bh(&table->tb6_lock);

	if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
		struct rt6_info *sibling, *next_sibling;

		/* prefer to send a single notification with all hops */
		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
		if (skb) {
			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;

			/* if the message cannot be filled, fall back to the
			 * per-route notifications (skip_notify stays unset)
			 */
			if (rt6_fill_node(net, skb, rt,
					  NULL, NULL, 0, RTM_DELROUTE,
					  info->portid, seq, 0) < 0) {
				kfree_skb(skb);
				skb = NULL;
			} else
				info->skip_notify = 1;
		}

		/* _safe iteration: fib6_del() is called on the entry being
		 * visited
		 */
		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->rt6i_siblings,
					 rt6i_siblings) {
			err = fib6_del(sibling, info);
			if (err)
				goto out_unlock;
		}
	}

	err = fib6_del(rt, info);
out_unlock:
	spin_unlock_bh(&table->tb6_lock);
out_put:
	ip6_rt_put(rt);

	/* NOTE(review): the combined notification is sent even when one of
	 * the fib6_del() calls above failed - confirm this is intended.
	 */
	if (skb) {
		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
			    info->nlh, gfp_any());
	}
	return err;
}
2955 
2956 static int ip6_route_del(struct fib6_config *cfg,
2957 			 struct netlink_ext_ack *extack)
2958 {
2959 	struct rt6_info *rt, *rt_cache;
2960 	struct fib6_table *table;
2961 	struct fib6_node *fn;
2962 	int err = -ESRCH;
2963 
2964 	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
2965 	if (!table) {
2966 		NL_SET_ERR_MSG(extack, "FIB table does not exist");
2967 		return err;
2968 	}
2969 
2970 	rcu_read_lock();
2971 
2972 	fn = fib6_locate(&table->tb6_root,
2973 			 &cfg->fc_dst, cfg->fc_dst_len,
2974 			 &cfg->fc_src, cfg->fc_src_len,
2975 			 !(cfg->fc_flags & RTF_CACHE));
2976 
2977 	if (fn) {
2978 		for_each_fib6_node_rt_rcu(fn) {
2979 			if (cfg->fc_flags & RTF_CACHE) {
2980 				rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
2981 							      &cfg->fc_src);
2982 				if (!rt_cache)
2983 					continue;
2984 				rt = rt_cache;
2985 			}
2986 			if (cfg->fc_ifindex &&
2987 			    (!rt->dst.dev ||
2988 			     rt->dst.dev->ifindex != cfg->fc_ifindex))
2989 				continue;
2990 			if (cfg->fc_flags & RTF_GATEWAY &&
2991 			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
2992 				continue;
2993 			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
2994 				continue;
2995 			if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
2996 				continue;
2997 			if (!dst_hold_safe(&rt->dst))
2998 				break;
2999 			rcu_read_unlock();
3000 
3001 			/* if gateway was specified only delete the one hop */
3002 			if (cfg->fc_flags & RTF_GATEWAY)
3003 				return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3004 
3005 			return __ip6_del_rt_siblings(rt, cfg);
3006 		}
3007 	}
3008 	rcu_read_unlock();
3009 
3010 	return err;
3011 }
3012 
/*
 * Validate an ICMPv6 Redirect carried in @skb and, if it is acceptable,
 * install a cached exception route towards the redirect target.
 *
 * @dst is the route the redirect is checked against (as produced by the
 * redirect lookup); @sk is not used by this function.
 *
 * The message is dropped silently (apart from rate-limited debug output)
 * when it is too short, its destination is multicast, its target is neither
 * the destination itself nor a link-local unicast address, the receiving
 * interface is forwarding or does not accept redirects, the ND options are
 * malformed, or @dst is a reject route.
 *
 * On acceptance the neighbour entry for the target is updated, a cache clone
 * of the route is created with the target as gateway (or without RTF_GATEWAY
 * for an on-link redirect), inserted as an exception, and
 * NETEVENT_REDIRECT is raised.
 */
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct netevent_redirect netevent;
	struct rt6_info *rt, *nrt = NULL;
	struct ndisc_options ndopts;
	struct inet6_dev *in6_dev;
	struct neighbour *neigh;
	struct rd_msg *msg;
	int optlen, on_link;
	u8 *lladdr;

	/* bytes following the fixed redirect header, i.e. the ND options */
	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
	optlen -= sizeof(*msg);

	if (optlen < 0) {
		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
		return;
	}

	msg = (struct rd_msg *)icmp6_hdr(skb);

	if (ipv6_addr_is_multicast(&msg->dest)) {
		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
		return;
	}

	/* target == destination means the destination is on-link; otherwise
	 * the target has to be a link-local unicast address
	 */
	on_link = 0;
	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
		on_link = 1;
	} else if (ipv6_addr_type(&msg->target) !=
		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
		return;
	}

	in6_dev = __in6_dev_get(skb->dev);
	if (!in6_dev)
		return;
	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
		return;

	/* RFC2461 8.1:
	 *	The IP source address of the Redirect MUST be the same as the current
	 *	first-hop router for the specified ICMP Destination Address.
	 */

	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
		return;
	}

	/* the target link-layer address option is optional */
	lladdr = NULL;
	if (ndopts.nd_opts_tgt_lladdr) {
		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
					     skb->dev);
		if (!lladdr) {
			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
			return;
		}
	}

	rt = (struct rt6_info *) dst;
	if (rt->rt6i_flags & RTF_REJECT) {
		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
		return;
	}

	/* Redirect received -> path was valid.
	 * Look, redirects are sent only in response to data packets,
	 * so that this nexthop apparently is reachable. --ANK
	 */
	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);

	/* the reference obtained here is released at 'out' */
	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
	if (!neigh)
		return;

	/*
	 *	We have finally decided to accept it.
	 */

	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
		     NEIGH_UPDATE_F_OVERRIDE|
		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
				     NEIGH_UPDATE_F_ISROUTER)),
		     NDISC_REDIRECT, &ndopts);

	nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
	if (!nrt)
		goto out;

	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
	if (on_link)
		nrt->rt6i_flags &= ~RTF_GATEWAY;

	nrt->rt6i_protocol = RTPROT_REDIRECT;
	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;

	/* No need to remove rt from the exception table if rt is
	 * a cached route because rt6_insert_exception() will
	 * take care of it
	 */
	if (rt6_insert_exception(nrt, rt)) {
		dst_release_immediate(&nrt->dst);
		goto out;
	}

	netevent.old = &rt->dst;
	netevent.new = &nrt->dst;
	netevent.daddr = &msg->dest;
	netevent.neigh = neigh;
	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);

out:
	neigh_release(neigh);
}
3130 
3131 /*
3132  *	Misc support functions
3133  */
3134 
3135 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
3136 {
3137 	BUG_ON(from->from);
3138 
3139 	rt->rt6i_flags &= ~RTF_EXPIRES;
3140 	dst_hold(&from->dst);
3141 	rt->from = from;
3142 	dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
3143 }
3144 
3145 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
3146 {
3147 	rt->dst.input = ort->dst.input;
3148 	rt->dst.output = ort->dst.output;
3149 	rt->rt6i_dst = ort->rt6i_dst;
3150 	rt->dst.error = ort->dst.error;
3151 	rt->rt6i_idev = ort->rt6i_idev;
3152 	if (rt->rt6i_idev)
3153 		in6_dev_hold(rt->rt6i_idev);
3154 	rt->dst.lastuse = jiffies;
3155 	rt->rt6i_gateway = ort->rt6i_gateway;
3156 	rt->rt6i_flags = ort->rt6i_flags;
3157 	rt6_set_from(rt, ort);
3158 	rt->rt6i_metric = ort->rt6i_metric;
3159 #ifdef CONFIG_IPV6_SUBTREES
3160 	rt->rt6i_src = ort->rt6i_src;
3161 #endif
3162 	rt->rt6i_prefsrc = ort->rt6i_prefsrc;
3163 	rt->rt6i_table = ort->rt6i_table;
3164 	rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
3165 }
3166 
3167 #ifdef CONFIG_IPV6_ROUTE_INFO
3168 static struct rt6_info *rt6_get_route_info(struct net *net,
3169 					   const struct in6_addr *prefix, int prefixlen,
3170 					   const struct in6_addr *gwaddr,
3171 					   struct net_device *dev)
3172 {
3173 	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3174 	int ifindex = dev->ifindex;
3175 	struct fib6_node *fn;
3176 	struct rt6_info *rt = NULL;
3177 	struct fib6_table *table;
3178 
3179 	table = fib6_get_table(net, tb_id);
3180 	if (!table)
3181 		return NULL;
3182 
3183 	rcu_read_lock();
3184 	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3185 	if (!fn)
3186 		goto out;
3187 
3188 	for_each_fib6_node_rt_rcu(fn) {
3189 		if (rt->dst.dev->ifindex != ifindex)
3190 			continue;
3191 		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3192 			continue;
3193 		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
3194 			continue;
3195 		ip6_hold_safe(NULL, &rt, false);
3196 		break;
3197 	}
3198 out:
3199 	rcu_read_unlock();
3200 	return rt;
3201 }
3202 
3203 static struct rt6_info *rt6_add_route_info(struct net *net,
3204 					   const struct in6_addr *prefix, int prefixlen,
3205 					   const struct in6_addr *gwaddr,
3206 					   struct net_device *dev,
3207 					   unsigned int pref)
3208 {
3209 	struct fib6_config cfg = {
3210 		.fc_metric	= IP6_RT_PRIO_USER,
3211 		.fc_ifindex	= dev->ifindex,
3212 		.fc_dst_len	= prefixlen,
3213 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3214 				  RTF_UP | RTF_PREF(pref),
3215 		.fc_protocol = RTPROT_RA,
3216 		.fc_nlinfo.portid = 0,
3217 		.fc_nlinfo.nlh = NULL,
3218 		.fc_nlinfo.nl_net = net,
3219 	};
3220 
3221 	cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3222 	cfg.fc_dst = *prefix;
3223 	cfg.fc_gateway = *gwaddr;
3224 
3225 	/* We should treat it as a default route if prefix length is 0. */
3226 	if (!prefixlen)
3227 		cfg.fc_flags |= RTF_DEFAULT;
3228 
3229 	ip6_route_add(&cfg, NULL);
3230 
3231 	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3232 }
3233 #endif
3234 
3235 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
3236 {
3237 	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3238 	struct rt6_info *rt;
3239 	struct fib6_table *table;
3240 
3241 	table = fib6_get_table(dev_net(dev), tb_id);
3242 	if (!table)
3243 		return NULL;
3244 
3245 	rcu_read_lock();
3246 	for_each_fib6_node_rt_rcu(&table->tb6_root) {
3247 		if (dev == rt->dst.dev &&
3248 		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3249 		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
3250 			break;
3251 	}
3252 	if (rt)
3253 		ip6_hold_safe(NULL, &rt, false);
3254 	rcu_read_unlock();
3255 	return rt;
3256 }
3257 
3258 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
3259 				     struct net_device *dev,
3260 				     unsigned int pref)
3261 {
3262 	struct fib6_config cfg = {
3263 		.fc_table	= l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3264 		.fc_metric	= IP6_RT_PRIO_USER,
3265 		.fc_ifindex	= dev->ifindex,
3266 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3267 				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3268 		.fc_protocol = RTPROT_RA,
3269 		.fc_nlinfo.portid = 0,
3270 		.fc_nlinfo.nlh = NULL,
3271 		.fc_nlinfo.nl_net = dev_net(dev),
3272 	};
3273 
3274 	cfg.fc_gateway = *gwaddr;
3275 
3276 	if (!ip6_route_add(&cfg, NULL)) {
3277 		struct fib6_table *table;
3278 
3279 		table = fib6_get_table(dev_net(dev), cfg.fc_table);
3280 		if (table)
3281 			table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3282 	}
3283 
3284 	return rt6_get_dflt_router(gwaddr, dev);
3285 }
3286 
3287 static void __rt6_purge_dflt_routers(struct fib6_table *table)
3288 {
3289 	struct rt6_info *rt;
3290 
3291 restart:
3292 	rcu_read_lock();
3293 	for_each_fib6_node_rt_rcu(&table->tb6_root) {
3294 		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3295 		    (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
3296 			if (dst_hold_safe(&rt->dst)) {
3297 				rcu_read_unlock();
3298 				ip6_del_rt(rt);
3299 			} else {
3300 				rcu_read_unlock();
3301 			}
3302 			goto restart;
3303 		}
3304 	}
3305 	rcu_read_unlock();
3306 
3307 	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3308 }
3309 
3310 void rt6_purge_dflt_routers(struct net *net)
3311 {
3312 	struct fib6_table *table;
3313 	struct hlist_head *head;
3314 	unsigned int h;
3315 
3316 	rcu_read_lock();
3317 
3318 	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3319 		head = &net->ipv6.fib_table_hash[h];
3320 		hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3321 			if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3322 				__rt6_purge_dflt_routers(table);
3323 		}
3324 	}
3325 
3326 	rcu_read_unlock();
3327 }
3328 
3329 static void rtmsg_to_fib6_config(struct net *net,
3330 				 struct in6_rtmsg *rtmsg,
3331 				 struct fib6_config *cfg)
3332 {
3333 	memset(cfg, 0, sizeof(*cfg));
3334 
3335 	cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3336 			 : RT6_TABLE_MAIN;
3337 	cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3338 	cfg->fc_metric = rtmsg->rtmsg_metric;
3339 	cfg->fc_expires = rtmsg->rtmsg_info;
3340 	cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3341 	cfg->fc_src_len = rtmsg->rtmsg_src_len;
3342 	cfg->fc_flags = rtmsg->rtmsg_flags;
3343 
3344 	cfg->fc_nlinfo.nl_net = net;
3345 
3346 	cfg->fc_dst = rtmsg->rtmsg_dst;
3347 	cfg->fc_src = rtmsg->rtmsg_src;
3348 	cfg->fc_gateway = rtmsg->rtmsg_gateway;
3349 }
3350 
3351 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3352 {
3353 	struct fib6_config cfg;
3354 	struct in6_rtmsg rtmsg;
3355 	int err;
3356 
3357 	switch (cmd) {
3358 	case SIOCADDRT:		/* Add a route */
3359 	case SIOCDELRT:		/* Delete a route */
3360 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3361 			return -EPERM;
3362 		err = copy_from_user(&rtmsg, arg,
3363 				     sizeof(struct in6_rtmsg));
3364 		if (err)
3365 			return -EFAULT;
3366 
3367 		rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3368 
3369 		rtnl_lock();
3370 		switch (cmd) {
3371 		case SIOCADDRT:
3372 			err = ip6_route_add(&cfg, NULL);
3373 			break;
3374 		case SIOCDELRT:
3375 			err = ip6_route_del(&cfg, NULL);
3376 			break;
3377 		default:
3378 			err = -EINVAL;
3379 		}
3380 		rtnl_unlock();
3381 
3382 		return err;
3383 	}
3384 
3385 	return -EINVAL;
3386 }
3387 
3388 /*
3389  *	Drop the packet on the floor
3390  */
3391 
3392 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3393 {
3394 	int type;
3395 	struct dst_entry *dst = skb_dst(skb);
3396 	switch (ipstats_mib_noroutes) {
3397 	case IPSTATS_MIB_INNOROUTES:
3398 		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3399 		if (type == IPV6_ADDR_ANY) {
3400 			IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3401 				      IPSTATS_MIB_INADDRERRORS);
3402 			break;
3403 		}
3404 		/* FALLTHROUGH */
3405 	case IPSTATS_MIB_OUTNOROUTES:
3406 		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3407 			      ipstats_mib_noroutes);
3408 		break;
3409 	}
3410 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3411 	kfree_skb(skb);
3412 	return 0;
3413 }
3414 
3415 static int ip6_pkt_discard(struct sk_buff *skb)
3416 {
3417 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3418 }
3419 
3420 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3421 {
3422 	skb->dev = skb_dst(skb)->dev;
3423 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3424 }
3425 
3426 static int ip6_pkt_prohibit(struct sk_buff *skb)
3427 {
3428 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3429 }
3430 
3431 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3432 {
3433 	skb->dev = skb_dst(skb)->dev;
3434 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3435 }
3436 
3437 /*
3438  *	Allocate a dst for local (unicast / anycast) address.
3439  */
3440 
3441 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
3442 				    const struct in6_addr *addr,
3443 				    bool anycast)
3444 {
3445 	u32 tb_id;
3446 	struct net *net = dev_net(idev->dev);
3447 	struct net_device *dev = idev->dev;
3448 	struct rt6_info *rt;
3449 
3450 	rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
3451 	if (!rt)
3452 		return ERR_PTR(-ENOMEM);
3453 
3454 	in6_dev_hold(idev);
3455 
3456 	rt->dst.flags |= DST_HOST;
3457 	rt->dst.input = ip6_input;
3458 	rt->dst.output = ip6_output;
3459 	rt->rt6i_idev = idev;
3460 
3461 	rt->rt6i_protocol = RTPROT_KERNEL;
3462 	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
3463 	if (anycast)
3464 		rt->rt6i_flags |= RTF_ANYCAST;
3465 	else
3466 		rt->rt6i_flags |= RTF_LOCAL;
3467 
3468 	rt->rt6i_gateway  = *addr;
3469 	rt->rt6i_dst.addr = *addr;
3470 	rt->rt6i_dst.plen = 128;
3471 	tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3472 	rt->rt6i_table = fib6_get_table(net, tb_id);
3473 
3474 	return rt;
3475 }
3476 
/* remove deleted ip from prefsrc entries */
/*
 * Argument bundle handed to fib6_remove_prefsrc() through the opaque
 * callback pointer of fib6_clean_all().
 */
struct arg_dev_net_ip {
	struct net_device *dev;	/* restrict to routes on this device; NULL matches any */
	struct net *net;	/* namespace, used to recognise its ip6_null_entry */
	struct in6_addr *addr;	/* the address being removed */
};
3483 
3484 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
3485 {
3486 	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3487 	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3488 	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3489 
3490 	if (((void *)rt->dst.dev == dev || !dev) &&
3491 	    rt != net->ipv6.ip6_null_entry &&
3492 	    ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
3493 		spin_lock_bh(&rt6_exception_lock);
3494 		/* remove prefsrc entry */
3495 		rt->rt6i_prefsrc.plen = 0;
3496 		/* need to update cache as well */
3497 		rt6_exceptions_remove_prefsrc(rt);
3498 		spin_unlock_bh(&rt6_exception_lock);
3499 	}
3500 	return 0;
3501 }
3502 
3503 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3504 {
3505 	struct net *net = dev_net(ifp->idev->dev);
3506 	struct arg_dev_net_ip adni = {
3507 		.dev = ifp->idev->dev,
3508 		.net = net,
3509 		.addr = &ifp->addr,
3510 	};
3511 	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3512 }
3513 
3514 #define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3515 
3516 /* Remove routers and update dst entries when gateway turn into host. */
3517 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
3518 {
3519 	struct in6_addr *gateway = (struct in6_addr *)arg;
3520 
3521 	if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3522 	    ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
3523 		return -1;
3524 	}
3525 
3526 	/* Further clean up cached routes in exception table.
3527 	 * This is needed because cached route may have a different
3528 	 * gateway than its 'parent' in the case of an ip redirect.
3529 	 */
3530 	rt6_exceptions_clean_tohost(rt, gateway);
3531 
3532 	return 0;
3533 }
3534 
3535 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3536 {
3537 	fib6_clean_all(net, fib6_clean_tohost, gateway);
3538 }
3539 
/*
 * Callback argument for the device up/down walkers.  Only one member of
 * the anonymous union is meaningful per walk: fib6_ifup() reads nh_flags,
 * fib6_ifdown() reads event.
 */
struct arg_netdev_event {
	const struct net_device *dev;	/* device the event refers to */
	union {
		unsigned int nh_flags;	/* nexthop flags to clear (fib6_ifup) */
		unsigned long event;	/* NETDEV_* event (fib6_ifdown) */
	};
};
3547 
3548 static struct rt6_info *rt6_multipath_first_sibling(const struct rt6_info *rt)
3549 {
3550 	struct rt6_info *iter;
3551 	struct fib6_node *fn;
3552 
3553 	fn = rcu_dereference_protected(rt->rt6i_node,
3554 			lockdep_is_held(&rt->rt6i_table->tb6_lock));
3555 	iter = rcu_dereference_protected(fn->leaf,
3556 			lockdep_is_held(&rt->rt6i_table->tb6_lock));
3557 	while (iter) {
3558 		if (iter->rt6i_metric == rt->rt6i_metric &&
3559 		    rt6_qualify_for_ecmp(iter))
3560 			return iter;
3561 		iter = rcu_dereference_protected(iter->rt6_next,
3562 				lockdep_is_held(&rt->rt6i_table->tb6_lock));
3563 	}
3564 
3565 	return NULL;
3566 }
3567 
3568 static bool rt6_is_dead(const struct rt6_info *rt)
3569 {
3570 	if (rt->rt6i_nh_flags & RTNH_F_DEAD ||
3571 	    (rt->rt6i_nh_flags & RTNH_F_LINKDOWN &&
3572 	     rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
3573 		return true;
3574 
3575 	return false;
3576 }
3577 
3578 static int rt6_multipath_total_weight(const struct rt6_info *rt)
3579 {
3580 	struct rt6_info *iter;
3581 	int total = 0;
3582 
3583 	if (!rt6_is_dead(rt))
3584 		total += rt->rt6i_nh_weight;
3585 
3586 	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) {
3587 		if (!rt6_is_dead(iter))
3588 			total += iter->rt6i_nh_weight;
3589 	}
3590 
3591 	return total;
3592 }
3593 
3594 static void rt6_upper_bound_set(struct rt6_info *rt, int *weight, int total)
3595 {
3596 	int upper_bound = -1;
3597 
3598 	if (!rt6_is_dead(rt)) {
3599 		*weight += rt->rt6i_nh_weight;
3600 		upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3601 						    total) - 1;
3602 	}
3603 	atomic_set(&rt->rt6i_nh_upper_bound, upper_bound);
3604 }
3605 
3606 static void rt6_multipath_upper_bound_set(struct rt6_info *rt, int total)
3607 {
3608 	struct rt6_info *iter;
3609 	int weight = 0;
3610 
3611 	rt6_upper_bound_set(rt, &weight, total);
3612 
3613 	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3614 		rt6_upper_bound_set(iter, &weight, total);
3615 }
3616 
3617 void rt6_multipath_rebalance(struct rt6_info *rt)
3618 {
3619 	struct rt6_info *first;
3620 	int total;
3621 
3622 	/* In case the entire multipath route was marked for flushing,
3623 	 * then there is no need to rebalance upon the removal of every
3624 	 * sibling route.
3625 	 */
3626 	if (!rt->rt6i_nsiblings || rt->should_flush)
3627 		return;
3628 
3629 	/* During lookup routes are evaluated in order, so we need to
3630 	 * make sure upper bounds are assigned from the first sibling
3631 	 * onwards.
3632 	 */
3633 	first = rt6_multipath_first_sibling(rt);
3634 	if (WARN_ON_ONCE(!first))
3635 		return;
3636 
3637 	total = rt6_multipath_total_weight(first);
3638 	rt6_multipath_upper_bound_set(first, total);
3639 }
3640 
3641 static int fib6_ifup(struct rt6_info *rt, void *p_arg)
3642 {
3643 	const struct arg_netdev_event *arg = p_arg;
3644 	const struct net *net = dev_net(arg->dev);
3645 
3646 	if (rt != net->ipv6.ip6_null_entry && rt->dst.dev == arg->dev) {
3647 		rt->rt6i_nh_flags &= ~arg->nh_flags;
3648 		fib6_update_sernum_upto_root(dev_net(rt->dst.dev), rt);
3649 		rt6_multipath_rebalance(rt);
3650 	}
3651 
3652 	return 0;
3653 }
3654 
3655 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
3656 {
3657 	struct arg_netdev_event arg = {
3658 		.dev = dev,
3659 		{
3660 			.nh_flags = nh_flags,
3661 		},
3662 	};
3663 
3664 	if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
3665 		arg.nh_flags |= RTNH_F_LINKDOWN;
3666 
3667 	fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
3668 }
3669 
3670 static bool rt6_multipath_uses_dev(const struct rt6_info *rt,
3671 				   const struct net_device *dev)
3672 {
3673 	struct rt6_info *iter;
3674 
3675 	if (rt->dst.dev == dev)
3676 		return true;
3677 	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3678 		if (iter->dst.dev == dev)
3679 			return true;
3680 
3681 	return false;
3682 }
3683 
3684 static void rt6_multipath_flush(struct rt6_info *rt)
3685 {
3686 	struct rt6_info *iter;
3687 
3688 	rt->should_flush = 1;
3689 	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3690 		iter->should_flush = 1;
3691 }
3692 
3693 static unsigned int rt6_multipath_dead_count(const struct rt6_info *rt,
3694 					     const struct net_device *down_dev)
3695 {
3696 	struct rt6_info *iter;
3697 	unsigned int dead = 0;
3698 
3699 	if (rt->dst.dev == down_dev || rt->rt6i_nh_flags & RTNH_F_DEAD)
3700 		dead++;
3701 	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3702 		if (iter->dst.dev == down_dev ||
3703 		    iter->rt6i_nh_flags & RTNH_F_DEAD)
3704 			dead++;
3705 
3706 	return dead;
3707 }
3708 
3709 static void rt6_multipath_nh_flags_set(struct rt6_info *rt,
3710 				       const struct net_device *dev,
3711 				       unsigned int nh_flags)
3712 {
3713 	struct rt6_info *iter;
3714 
3715 	if (rt->dst.dev == dev)
3716 		rt->rt6i_nh_flags |= nh_flags;
3717 	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3718 		if (iter->dst.dev == dev)
3719 			iter->rt6i_nh_flags |= nh_flags;
3720 }
3721 
3722 /* called with write lock held for table with rt */
3723 static int fib6_ifdown(struct rt6_info *rt, void *p_arg)
3724 {
3725 	const struct arg_netdev_event *arg = p_arg;
3726 	const struct net_device *dev = arg->dev;
3727 	const struct net *net = dev_net(dev);
3728 
3729 	if (rt == net->ipv6.ip6_null_entry)
3730 		return 0;
3731 
3732 	switch (arg->event) {
3733 	case NETDEV_UNREGISTER:
3734 		return rt->dst.dev == dev ? -1 : 0;
3735 	case NETDEV_DOWN:
3736 		if (rt->should_flush)
3737 			return -1;
3738 		if (!rt->rt6i_nsiblings)
3739 			return rt->dst.dev == dev ? -1 : 0;
3740 		if (rt6_multipath_uses_dev(rt, dev)) {
3741 			unsigned int count;
3742 
3743 			count = rt6_multipath_dead_count(rt, dev);
3744 			if (rt->rt6i_nsiblings + 1 == count) {
3745 				rt6_multipath_flush(rt);
3746 				return -1;
3747 			}
3748 			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
3749 						   RTNH_F_LINKDOWN);
3750 			fib6_update_sernum(rt);
3751 			rt6_multipath_rebalance(rt);
3752 		}
3753 		return -2;
3754 	case NETDEV_CHANGE:
3755 		if (rt->dst.dev != dev ||
3756 		    rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST))
3757 			break;
3758 		rt->rt6i_nh_flags |= RTNH_F_LINKDOWN;
3759 		rt6_multipath_rebalance(rt);
3760 		break;
3761 	}
3762 
3763 	return 0;
3764 }
3765 
3766 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
3767 {
3768 	struct arg_netdev_event arg = {
3769 		.dev = dev,
3770 		{
3771 			.event = event,
3772 		},
3773 	};
3774 
3775 	fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
3776 }
3777 
3778 void rt6_disable_ip(struct net_device *dev, unsigned long event)
3779 {
3780 	rt6_sync_down_dev(dev, event);
3781 	rt6_uncached_list_flush_dev(dev_net(dev), dev);
3782 	neigh_ifdown(&nd_tbl, dev);
3783 }
3784 
/* Callback argument for rt6_mtu_change_route(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* the new MTU */
};
3789 
3790 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
3791 {
3792 	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
3793 	struct inet6_dev *idev;
3794 
3795 	/* In IPv6 pmtu discovery is not optional,
3796 	   so that RTAX_MTU lock cannot disable it.
3797 	   We still use this lock to block changes
3798 	   caused by addrconf/ndisc.
3799 	*/
3800 
3801 	idev = __in6_dev_get(arg->dev);
3802 	if (!idev)
3803 		return 0;
3804 
3805 	/* For administrative MTU increase, there is no way to discover
3806 	   IPv6 PMTU increase, so PMTU increase should be updated here.
3807 	   Since RFC 1981 doesn't include administrative MTU increase
3808 	   update PMTU increase is a MUST. (i.e. jumbo frame)
3809 	 */
3810 	/*
3811 	   If new MTU is less than route PMTU, this new MTU will be the
3812 	   lowest MTU in the path, update the route PMTU to reflect PMTU
3813 	   decreases; if new MTU is greater than route PMTU, and the
3814 	   old MTU is the lowest MTU in the path, update the route PMTU
3815 	   to reflect the increase. In this case if the other nodes' MTU
3816 	   also have the lowest MTU, TOO BIG MESSAGE will be lead to
3817 	   PMTU discovery.
3818 	 */
3819 	if (rt->dst.dev == arg->dev &&
3820 	    dst_metric_raw(&rt->dst, RTAX_MTU) &&
3821 	    !dst_metric_locked(&rt->dst, RTAX_MTU)) {
3822 		spin_lock_bh(&rt6_exception_lock);
3823 		if (dst_mtu(&rt->dst) >= arg->mtu ||
3824 		    (dst_mtu(&rt->dst) < arg->mtu &&
3825 		     dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
3826 			dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
3827 		}
3828 		rt6_exceptions_update_pmtu(rt, arg->mtu);
3829 		spin_unlock_bh(&rt6_exception_lock);
3830 	}
3831 	return 0;
3832 }
3833 
3834 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
3835 {
3836 	struct rt6_mtu_change_arg arg = {
3837 		.dev = dev,
3838 		.mtu = mtu,
3839 	};
3840 
3841 	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
3842 }
3843 
3844 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
3845 	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
3846 	[RTA_OIF]               = { .type = NLA_U32 },
3847 	[RTA_IIF]		= { .type = NLA_U32 },
3848 	[RTA_PRIORITY]          = { .type = NLA_U32 },
3849 	[RTA_METRICS]           = { .type = NLA_NESTED },
3850 	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
3851 	[RTA_PREF]              = { .type = NLA_U8 },
3852 	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
3853 	[RTA_ENCAP]		= { .type = NLA_NESTED },
3854 	[RTA_EXPIRES]		= { .type = NLA_U32 },
3855 	[RTA_UID]		= { .type = NLA_U32 },
3856 	[RTA_MARK]		= { .type = NLA_U32 },
3857 };
3858 
3859 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
3860 			      struct fib6_config *cfg,
3861 			      struct netlink_ext_ack *extack)
3862 {
3863 	struct rtmsg *rtm;
3864 	struct nlattr *tb[RTA_MAX+1];
3865 	unsigned int pref;
3866 	int err;
3867 
3868 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
3869 			  NULL);
3870 	if (err < 0)
3871 		goto errout;
3872 
3873 	err = -EINVAL;
3874 	rtm = nlmsg_data(nlh);
3875 	memset(cfg, 0, sizeof(*cfg));
3876 
3877 	cfg->fc_table = rtm->rtm_table;
3878 	cfg->fc_dst_len = rtm->rtm_dst_len;
3879 	cfg->fc_src_len = rtm->rtm_src_len;
3880 	cfg->fc_flags = RTF_UP;
3881 	cfg->fc_protocol = rtm->rtm_protocol;
3882 	cfg->fc_type = rtm->rtm_type;
3883 
3884 	if (rtm->rtm_type == RTN_UNREACHABLE ||
3885 	    rtm->rtm_type == RTN_BLACKHOLE ||
3886 	    rtm->rtm_type == RTN_PROHIBIT ||
3887 	    rtm->rtm_type == RTN_THROW)
3888 		cfg->fc_flags |= RTF_REJECT;
3889 
3890 	if (rtm->rtm_type == RTN_LOCAL)
3891 		cfg->fc_flags |= RTF_LOCAL;
3892 
3893 	if (rtm->rtm_flags & RTM_F_CLONED)
3894 		cfg->fc_flags |= RTF_CACHE;
3895 
3896 	cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
3897 
3898 	cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
3899 	cfg->fc_nlinfo.nlh = nlh;
3900 	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
3901 
3902 	if (tb[RTA_GATEWAY]) {
3903 		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
3904 		cfg->fc_flags |= RTF_GATEWAY;
3905 	}
3906 
3907 	if (tb[RTA_DST]) {
3908 		int plen = (rtm->rtm_dst_len + 7) >> 3;
3909 
3910 		if (nla_len(tb[RTA_DST]) < plen)
3911 			goto errout;
3912 
3913 		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
3914 	}
3915 
3916 	if (tb[RTA_SRC]) {
3917 		int plen = (rtm->rtm_src_len + 7) >> 3;
3918 
3919 		if (nla_len(tb[RTA_SRC]) < plen)
3920 			goto errout;
3921 
3922 		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
3923 	}
3924 
3925 	if (tb[RTA_PREFSRC])
3926 		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
3927 
3928 	if (tb[RTA_OIF])
3929 		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
3930 
3931 	if (tb[RTA_PRIORITY])
3932 		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
3933 
3934 	if (tb[RTA_METRICS]) {
3935 		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
3936 		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
3937 	}
3938 
3939 	if (tb[RTA_TABLE])
3940 		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
3941 
3942 	if (tb[RTA_MULTIPATH]) {
3943 		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
3944 		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
3945 
3946 		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
3947 						     cfg->fc_mp_len, extack);
3948 		if (err < 0)
3949 			goto errout;
3950 	}
3951 
3952 	if (tb[RTA_PREF]) {
3953 		pref = nla_get_u8(tb[RTA_PREF]);
3954 		if (pref != ICMPV6_ROUTER_PREF_LOW &&
3955 		    pref != ICMPV6_ROUTER_PREF_HIGH)
3956 			pref = ICMPV6_ROUTER_PREF_MEDIUM;
3957 		cfg->fc_flags |= RTF_PREF(pref);
3958 	}
3959 
3960 	if (tb[RTA_ENCAP])
3961 		cfg->fc_encap = tb[RTA_ENCAP];
3962 
3963 	if (tb[RTA_ENCAP_TYPE]) {
3964 		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
3965 
3966 		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
3967 		if (err < 0)
3968 			goto errout;
3969 	}
3970 
3971 	if (tb[RTA_EXPIRES]) {
3972 		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
3973 
3974 		if (addrconf_finite_timeout(timeout)) {
3975 			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
3976 			cfg->fc_flags |= RTF_EXPIRES;
3977 		}
3978 	}
3979 
3980 	err = 0;
3981 errout:
3982 	return err;
3983 }
3984 
/*
 * One pending nexthop of a multipath add request; entries are chained on
 * the local list built by ip6_route_multipath_add().
 */
struct rt6_nh {
	struct rt6_info *rt6_info;	/* route built for this nexthop; NULLed once inserted */
	struct fib6_config r_cfg;	/* per-nexthop copy of the request config */
	struct mx6_config mxc;		/* metrics converted from r_cfg */
	struct list_head next;		/* list linkage */
};
3991 
3992 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
3993 {
3994 	struct rt6_nh *nh;
3995 
3996 	list_for_each_entry(nh, rt6_nh_list, next) {
3997 		pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
3998 		        &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
3999 		        nh->r_cfg.fc_ifindex);
4000 	}
4001 }
4002 
4003 static int ip6_route_info_append(struct list_head *rt6_nh_list,
4004 				 struct rt6_info *rt, struct fib6_config *r_cfg)
4005 {
4006 	struct rt6_nh *nh;
4007 	int err = -EEXIST;
4008 
4009 	list_for_each_entry(nh, rt6_nh_list, next) {
4010 		/* check if rt6_info already exists */
4011 		if (rt6_duplicate_nexthop(nh->rt6_info, rt))
4012 			return err;
4013 	}
4014 
4015 	nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4016 	if (!nh)
4017 		return -ENOMEM;
4018 	nh->rt6_info = rt;
4019 	err = ip6_convert_metrics(&nh->mxc, r_cfg);
4020 	if (err) {
4021 		kfree(nh);
4022 		return err;
4023 	}
4024 	memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4025 	list_add_tail(&nh->next, rt6_nh_list);
4026 
4027 	return 0;
4028 }
4029 
4030 static void ip6_route_mpath_notify(struct rt6_info *rt,
4031 				   struct rt6_info *rt_last,
4032 				   struct nl_info *info,
4033 				   __u16 nlflags)
4034 {
4035 	/* if this is an APPEND route, then rt points to the first route
4036 	 * inserted and rt_last points to last route inserted. Userspace
4037 	 * wants a consistent dump of the route which starts at the first
4038 	 * nexthop. Since sibling routes are always added at the end of
4039 	 * the list, find the first sibling of the last route appended
4040 	 */
4041 	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
4042 		rt = list_first_entry(&rt_last->rt6i_siblings,
4043 				      struct rt6_info,
4044 				      rt6i_siblings);
4045 	}
4046 
4047 	if (rt)
4048 		inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4049 }
4050 
4051 static int ip6_route_multipath_add(struct fib6_config *cfg,
4052 				   struct netlink_ext_ack *extack)
4053 {
4054 	struct rt6_info *rt_notif = NULL, *rt_last = NULL;
4055 	struct nl_info *info = &cfg->fc_nlinfo;
4056 	struct fib6_config r_cfg;
4057 	struct rtnexthop *rtnh;
4058 	struct rt6_info *rt;
4059 	struct rt6_nh *err_nh;
4060 	struct rt6_nh *nh, *nh_safe;
4061 	__u16 nlflags;
4062 	int remaining;
4063 	int attrlen;
4064 	int err = 1;
4065 	int nhn = 0;
4066 	int replace = (cfg->fc_nlinfo.nlh &&
4067 		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4068 	LIST_HEAD(rt6_nh_list);
4069 
4070 	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4071 	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4072 		nlflags |= NLM_F_APPEND;
4073 
4074 	remaining = cfg->fc_mp_len;
4075 	rtnh = (struct rtnexthop *)cfg->fc_mp;
4076 
4077 	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
4078 	 * rt6_info structs per nexthop
4079 	 */
4080 	while (rtnh_ok(rtnh, remaining)) {
4081 		memcpy(&r_cfg, cfg, sizeof(*cfg));
4082 		if (rtnh->rtnh_ifindex)
4083 			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4084 
4085 		attrlen = rtnh_attrlen(rtnh);
4086 		if (attrlen > 0) {
4087 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4088 
4089 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4090 			if (nla) {
4091 				r_cfg.fc_gateway = nla_get_in6_addr(nla);
4092 				r_cfg.fc_flags |= RTF_GATEWAY;
4093 			}
4094 			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4095 			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4096 			if (nla)
4097 				r_cfg.fc_encap_type = nla_get_u16(nla);
4098 		}
4099 
4100 		rt = ip6_route_info_create(&r_cfg, extack);
4101 		if (IS_ERR(rt)) {
4102 			err = PTR_ERR(rt);
4103 			rt = NULL;
4104 			goto cleanup;
4105 		}
4106 
4107 		rt->rt6i_nh_weight = rtnh->rtnh_hops + 1;
4108 
4109 		err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
4110 		if (err) {
4111 			dst_release_immediate(&rt->dst);
4112 			goto cleanup;
4113 		}
4114 
4115 		rtnh = rtnh_next(rtnh, &remaining);
4116 	}
4117 
4118 	/* for add and replace send one notification with all nexthops.
4119 	 * Skip the notification in fib6_add_rt2node and send one with
4120 	 * the full route when done
4121 	 */
4122 	info->skip_notify = 1;
4123 
4124 	err_nh = NULL;
4125 	list_for_each_entry(nh, &rt6_nh_list, next) {
4126 		rt_last = nh->rt6_info;
4127 		err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack);
4128 		/* save reference to first route for notification */
4129 		if (!rt_notif && !err)
4130 			rt_notif = nh->rt6_info;
4131 
4132 		/* nh->rt6_info is used or freed at this point, reset to NULL*/
4133 		nh->rt6_info = NULL;
4134 		if (err) {
4135 			if (replace && nhn)
4136 				ip6_print_replace_route_err(&rt6_nh_list);
4137 			err_nh = nh;
4138 			goto add_errout;
4139 		}
4140 
4141 		/* Because each route is added like a single route we remove
4142 		 * these flags after the first nexthop: if there is a collision,
4143 		 * we have already failed to add the first nexthop:
4144 		 * fib6_add_rt2node() has rejected it; when replacing, old
4145 		 * nexthops have been replaced by first new, the rest should
4146 		 * be added to it.
4147 		 */
4148 		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4149 						     NLM_F_REPLACE);
4150 		nhn++;
4151 	}
4152 
4153 	/* success ... tell user about new route */
4154 	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4155 	goto cleanup;
4156 
4157 add_errout:
4158 	/* send notification for routes that were added so that
4159 	 * the delete notifications sent by ip6_route_del are
4160 	 * coherent
4161 	 */
4162 	if (rt_notif)
4163 		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4164 
4165 	/* Delete routes that were already added */
4166 	list_for_each_entry(nh, &rt6_nh_list, next) {
4167 		if (err_nh == nh)
4168 			break;
4169 		ip6_route_del(&nh->r_cfg, extack);
4170 	}
4171 
4172 cleanup:
4173 	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4174 		if (nh->rt6_info)
4175 			dst_release_immediate(&nh->rt6_info->dst);
4176 		kfree(nh->mxc.mx);
4177 		list_del(&nh->next);
4178 		kfree(nh);
4179 	}
4180 
4181 	return err;
4182 }
4183 
4184 static int ip6_route_multipath_del(struct fib6_config *cfg,
4185 				   struct netlink_ext_ack *extack)
4186 {
4187 	struct fib6_config r_cfg;
4188 	struct rtnexthop *rtnh;
4189 	int remaining;
4190 	int attrlen;
4191 	int err = 1, last_err = 0;
4192 
4193 	remaining = cfg->fc_mp_len;
4194 	rtnh = (struct rtnexthop *)cfg->fc_mp;
4195 
4196 	/* Parse a Multipath Entry */
4197 	while (rtnh_ok(rtnh, remaining)) {
4198 		memcpy(&r_cfg, cfg, sizeof(*cfg));
4199 		if (rtnh->rtnh_ifindex)
4200 			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4201 
4202 		attrlen = rtnh_attrlen(rtnh);
4203 		if (attrlen > 0) {
4204 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4205 
4206 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4207 			if (nla) {
4208 				nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4209 				r_cfg.fc_flags |= RTF_GATEWAY;
4210 			}
4211 		}
4212 		err = ip6_route_del(&r_cfg, extack);
4213 		if (err)
4214 			last_err = err;
4215 
4216 		rtnh = rtnh_next(rtnh, &remaining);
4217 	}
4218 
4219 	return last_err;
4220 }
4221 
4222 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4223 			      struct netlink_ext_ack *extack)
4224 {
4225 	struct fib6_config cfg;
4226 	int err;
4227 
4228 	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4229 	if (err < 0)
4230 		return err;
4231 
4232 	if (cfg.fc_mp)
4233 		return ip6_route_multipath_del(&cfg, extack);
4234 	else {
4235 		cfg.fc_delete_all_nh = 1;
4236 		return ip6_route_del(&cfg, extack);
4237 	}
4238 }
4239 
4240 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4241 			      struct netlink_ext_ack *extack)
4242 {
4243 	struct fib6_config cfg;
4244 	int err;
4245 
4246 	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4247 	if (err < 0)
4248 		return err;
4249 
4250 	if (cfg.fc_mp)
4251 		return ip6_route_multipath_add(&cfg, extack);
4252 	else
4253 		return ip6_route_add(&cfg, extack);
4254 }
4255 
4256 static size_t rt6_nlmsg_size(struct rt6_info *rt)
4257 {
4258 	int nexthop_len = 0;
4259 
4260 	if (rt->rt6i_nsiblings) {
4261 		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
4262 			    + NLA_ALIGN(sizeof(struct rtnexthop))
4263 			    + nla_total_size(16) /* RTA_GATEWAY */
4264 			    + lwtunnel_get_encap_size(rt->dst.lwtstate);
4265 
4266 		nexthop_len *= rt->rt6i_nsiblings;
4267 	}
4268 
4269 	return NLMSG_ALIGN(sizeof(struct rtmsg))
4270 	       + nla_total_size(16) /* RTA_SRC */
4271 	       + nla_total_size(16) /* RTA_DST */
4272 	       + nla_total_size(16) /* RTA_GATEWAY */
4273 	       + nla_total_size(16) /* RTA_PREFSRC */
4274 	       + nla_total_size(4) /* RTA_TABLE */
4275 	       + nla_total_size(4) /* RTA_IIF */
4276 	       + nla_total_size(4) /* RTA_OIF */
4277 	       + nla_total_size(4) /* RTA_PRIORITY */
4278 	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4279 	       + nla_total_size(sizeof(struct rta_cacheinfo))
4280 	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4281 	       + nla_total_size(1) /* RTA_PREF */
4282 	       + lwtunnel_get_encap_size(rt->dst.lwtstate)
4283 	       + nexthop_len;
4284 }
4285 
4286 static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
4287 			    unsigned int *flags, bool skip_oif)
4288 {
4289 	if (rt->rt6i_nh_flags & RTNH_F_DEAD)
4290 		*flags |= RTNH_F_DEAD;
4291 
4292 	if (rt->rt6i_nh_flags & RTNH_F_LINKDOWN) {
4293 		*flags |= RTNH_F_LINKDOWN;
4294 		if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
4295 			*flags |= RTNH_F_DEAD;
4296 	}
4297 
4298 	if (rt->rt6i_flags & RTF_GATEWAY) {
4299 		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
4300 			goto nla_put_failure;
4301 	}
4302 
4303 	*flags |= (rt->rt6i_nh_flags & RTNH_F_ONLINK);
4304 	if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD)
4305 		*flags |= RTNH_F_OFFLOAD;
4306 
4307 	/* not needed for multipath encoding b/c it has a rtnexthop struct */
4308 	if (!skip_oif && rt->dst.dev &&
4309 	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
4310 		goto nla_put_failure;
4311 
4312 	if (rt->dst.lwtstate &&
4313 	    lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
4314 		goto nla_put_failure;
4315 
4316 	return 0;
4317 
4318 nla_put_failure:
4319 	return -EMSGSIZE;
4320 }
4321 
4322 /* add multipath next hop */
4323 static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
4324 {
4325 	struct rtnexthop *rtnh;
4326 	unsigned int flags = 0;
4327 
4328 	rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4329 	if (!rtnh)
4330 		goto nla_put_failure;
4331 
4332 	rtnh->rtnh_hops = rt->rt6i_nh_weight - 1;
4333 	rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;
4334 
4335 	if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4336 		goto nla_put_failure;
4337 
4338 	rtnh->rtnh_flags = flags;
4339 
4340 	/* length of rtnetlink header + attributes */
4341 	rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
4342 
4343 	return 0;
4344 
4345 nla_put_failure:
4346 	return -EMSGSIZE;
4347 }
4348 
4349 static int rt6_fill_node(struct net *net,
4350 			 struct sk_buff *skb, struct rt6_info *rt,
4351 			 struct in6_addr *dst, struct in6_addr *src,
4352 			 int iif, int type, u32 portid, u32 seq,
4353 			 unsigned int flags)
4354 {
4355 	u32 metrics[RTAX_MAX];
4356 	struct rtmsg *rtm;
4357 	struct nlmsghdr *nlh;
4358 	long expires;
4359 	u32 table;
4360 
4361 	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4362 	if (!nlh)
4363 		return -EMSGSIZE;
4364 
4365 	rtm = nlmsg_data(nlh);
4366 	rtm->rtm_family = AF_INET6;
4367 	rtm->rtm_dst_len = rt->rt6i_dst.plen;
4368 	rtm->rtm_src_len = rt->rt6i_src.plen;
4369 	rtm->rtm_tos = 0;
4370 	if (rt->rt6i_table)
4371 		table = rt->rt6i_table->tb6_id;
4372 	else
4373 		table = RT6_TABLE_UNSPEC;
4374 	rtm->rtm_table = table;
4375 	if (nla_put_u32(skb, RTA_TABLE, table))
4376 		goto nla_put_failure;
4377 	if (rt->rt6i_flags & RTF_REJECT) {
4378 		switch (rt->dst.error) {
4379 		case -EINVAL:
4380 			rtm->rtm_type = RTN_BLACKHOLE;
4381 			break;
4382 		case -EACCES:
4383 			rtm->rtm_type = RTN_PROHIBIT;
4384 			break;
4385 		case -EAGAIN:
4386 			rtm->rtm_type = RTN_THROW;
4387 			break;
4388 		default:
4389 			rtm->rtm_type = RTN_UNREACHABLE;
4390 			break;
4391 		}
4392 	}
4393 	else if (rt->rt6i_flags & RTF_LOCAL)
4394 		rtm->rtm_type = RTN_LOCAL;
4395 	else if (rt->rt6i_flags & RTF_ANYCAST)
4396 		rtm->rtm_type = RTN_ANYCAST;
4397 	else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
4398 		rtm->rtm_type = RTN_LOCAL;
4399 	else
4400 		rtm->rtm_type = RTN_UNICAST;
4401 	rtm->rtm_flags = 0;
4402 	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4403 	rtm->rtm_protocol = rt->rt6i_protocol;
4404 
4405 	if (rt->rt6i_flags & RTF_CACHE)
4406 		rtm->rtm_flags |= RTM_F_CLONED;
4407 
4408 	if (dst) {
4409 		if (nla_put_in6_addr(skb, RTA_DST, dst))
4410 			goto nla_put_failure;
4411 		rtm->rtm_dst_len = 128;
4412 	} else if (rtm->rtm_dst_len)
4413 		if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
4414 			goto nla_put_failure;
4415 #ifdef CONFIG_IPV6_SUBTREES
4416 	if (src) {
4417 		if (nla_put_in6_addr(skb, RTA_SRC, src))
4418 			goto nla_put_failure;
4419 		rtm->rtm_src_len = 128;
4420 	} else if (rtm->rtm_src_len &&
4421 		   nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
4422 		goto nla_put_failure;
4423 #endif
4424 	if (iif) {
4425 #ifdef CONFIG_IPV6_MROUTE
4426 		if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
4427 			int err = ip6mr_get_route(net, skb, rtm, portid);
4428 
4429 			if (err == 0)
4430 				return 0;
4431 			if (err < 0)
4432 				goto nla_put_failure;
4433 		} else
4434 #endif
4435 			if (nla_put_u32(skb, RTA_IIF, iif))
4436 				goto nla_put_failure;
4437 	} else if (dst) {
4438 		struct in6_addr saddr_buf;
4439 		if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
4440 		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4441 			goto nla_put_failure;
4442 	}
4443 
4444 	if (rt->rt6i_prefsrc.plen) {
4445 		struct in6_addr saddr_buf;
4446 		saddr_buf = rt->rt6i_prefsrc.addr;
4447 		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4448 			goto nla_put_failure;
4449 	}
4450 
4451 	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
4452 	if (rt->rt6i_pmtu)
4453 		metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
4454 	if (rtnetlink_put_metrics(skb, metrics) < 0)
4455 		goto nla_put_failure;
4456 
4457 	if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
4458 		goto nla_put_failure;
4459 
4460 	/* For multipath routes, walk the siblings list and add
4461 	 * each as a nexthop within RTA_MULTIPATH.
4462 	 */
4463 	if (rt->rt6i_nsiblings) {
4464 		struct rt6_info *sibling, *next_sibling;
4465 		struct nlattr *mp;
4466 
4467 		mp = nla_nest_start(skb, RTA_MULTIPATH);
4468 		if (!mp)
4469 			goto nla_put_failure;
4470 
4471 		if (rt6_add_nexthop(skb, rt) < 0)
4472 			goto nla_put_failure;
4473 
4474 		list_for_each_entry_safe(sibling, next_sibling,
4475 					 &rt->rt6i_siblings, rt6i_siblings) {
4476 			if (rt6_add_nexthop(skb, sibling) < 0)
4477 				goto nla_put_failure;
4478 		}
4479 
4480 		nla_nest_end(skb, mp);
4481 	} else {
4482 		if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
4483 			goto nla_put_failure;
4484 	}
4485 
4486 	expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
4487 
4488 	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
4489 		goto nla_put_failure;
4490 
4491 	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
4492 		goto nla_put_failure;
4493 
4494 
4495 	nlmsg_end(skb, nlh);
4496 	return 0;
4497 
4498 nla_put_failure:
4499 	nlmsg_cancel(skb, nlh);
4500 	return -EMSGSIZE;
4501 }
4502 
4503 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
4504 {
4505 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4506 	struct net *net = arg->net;
4507 
4508 	if (rt == net->ipv6.ip6_null_entry)
4509 		return 0;
4510 
4511 	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4512 		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4513 
4514 		/* user wants prefix routes only */
4515 		if (rtm->rtm_flags & RTM_F_PREFIX &&
4516 		    !(rt->rt6i_flags & RTF_PREFIX_RT)) {
4517 			/* success since this is not a prefix route */
4518 			return 1;
4519 		}
4520 	}
4521 
4522 	return rt6_fill_node(net,
4523 		     arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
4524 		     NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
4525 		     NLM_F_MULTI);
4526 }
4527 
4528 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4529 			      struct netlink_ext_ack *extack)
4530 {
4531 	struct net *net = sock_net(in_skb->sk);
4532 	struct nlattr *tb[RTA_MAX+1];
4533 	int err, iif = 0, oif = 0;
4534 	struct dst_entry *dst;
4535 	struct rt6_info *rt;
4536 	struct sk_buff *skb;
4537 	struct rtmsg *rtm;
4538 	struct flowi6 fl6;
4539 	bool fibmatch;
4540 
4541 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4542 			  extack);
4543 	if (err < 0)
4544 		goto errout;
4545 
4546 	err = -EINVAL;
4547 	memset(&fl6, 0, sizeof(fl6));
4548 	rtm = nlmsg_data(nlh);
4549 	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4550 	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4551 
4552 	if (tb[RTA_SRC]) {
4553 		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4554 			goto errout;
4555 
4556 		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4557 	}
4558 
4559 	if (tb[RTA_DST]) {
4560 		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4561 			goto errout;
4562 
4563 		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4564 	}
4565 
4566 	if (tb[RTA_IIF])
4567 		iif = nla_get_u32(tb[RTA_IIF]);
4568 
4569 	if (tb[RTA_OIF])
4570 		oif = nla_get_u32(tb[RTA_OIF]);
4571 
4572 	if (tb[RTA_MARK])
4573 		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4574 
4575 	if (tb[RTA_UID])
4576 		fl6.flowi6_uid = make_kuid(current_user_ns(),
4577 					   nla_get_u32(tb[RTA_UID]));
4578 	else
4579 		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4580 
4581 	if (iif) {
4582 		struct net_device *dev;
4583 		int flags = 0;
4584 
4585 		rcu_read_lock();
4586 
4587 		dev = dev_get_by_index_rcu(net, iif);
4588 		if (!dev) {
4589 			rcu_read_unlock();
4590 			err = -ENODEV;
4591 			goto errout;
4592 		}
4593 
4594 		fl6.flowi6_iif = iif;
4595 
4596 		if (!ipv6_addr_any(&fl6.saddr))
4597 			flags |= RT6_LOOKUP_F_HAS_SADDR;
4598 
4599 		dst = ip6_route_input_lookup(net, dev, &fl6, flags);
4600 
4601 		rcu_read_unlock();
4602 	} else {
4603 		fl6.flowi6_oif = oif;
4604 
4605 		dst = ip6_route_output(net, NULL, &fl6);
4606 	}
4607 
4608 
4609 	rt = container_of(dst, struct rt6_info, dst);
4610 	if (rt->dst.error) {
4611 		err = rt->dst.error;
4612 		ip6_rt_put(rt);
4613 		goto errout;
4614 	}
4615 
4616 	if (rt == net->ipv6.ip6_null_entry) {
4617 		err = rt->dst.error;
4618 		ip6_rt_put(rt);
4619 		goto errout;
4620 	}
4621 
4622 	if (fibmatch && rt->from) {
4623 		struct rt6_info *ort = rt->from;
4624 
4625 		dst_hold(&ort->dst);
4626 		ip6_rt_put(rt);
4627 		rt = ort;
4628 	}
4629 
4630 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
4631 	if (!skb) {
4632 		ip6_rt_put(rt);
4633 		err = -ENOBUFS;
4634 		goto errout;
4635 	}
4636 
4637 	skb_dst_set(skb, &rt->dst);
4638 	if (fibmatch)
4639 		err = rt6_fill_node(net, skb, rt, NULL, NULL, iif,
4640 				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4641 				    nlh->nlmsg_seq, 0);
4642 	else
4643 		err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
4644 				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4645 				    nlh->nlmsg_seq, 0);
4646 	if (err < 0) {
4647 		kfree_skb(skb);
4648 		goto errout;
4649 	}
4650 
4651 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
4652 errout:
4653 	return err;
4654 }
4655 
4656 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
4657 		     unsigned int nlm_flags)
4658 {
4659 	struct sk_buff *skb;
4660 	struct net *net = info->nl_net;
4661 	u32 seq;
4662 	int err;
4663 
4664 	err = -ENOBUFS;
4665 	seq = info->nlh ? info->nlh->nlmsg_seq : 0;
4666 
4667 	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4668 	if (!skb)
4669 		goto errout;
4670 
4671 	err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
4672 				event, info->portid, seq, nlm_flags);
4673 	if (err < 0) {
4674 		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4675 		WARN_ON(err == -EMSGSIZE);
4676 		kfree_skb(skb);
4677 		goto errout;
4678 	}
4679 	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4680 		    info->nlh, gfp_any());
4681 	return;
4682 errout:
4683 	if (err < 0)
4684 		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
4685 }
4686 
4687 static int ip6_route_dev_notify(struct notifier_block *this,
4688 				unsigned long event, void *ptr)
4689 {
4690 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4691 	struct net *net = dev_net(dev);
4692 
4693 	if (!(dev->flags & IFF_LOOPBACK))
4694 		return NOTIFY_OK;
4695 
4696 	if (event == NETDEV_REGISTER) {
4697 		net->ipv6.ip6_null_entry->dst.dev = dev;
4698 		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
4699 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4700 		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
4701 		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
4702 		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
4703 		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
4704 #endif
4705 	 } else if (event == NETDEV_UNREGISTER &&
4706 		    dev->reg_state != NETREG_UNREGISTERED) {
4707 		/* NETDEV_UNREGISTER could be fired for multiple times by
4708 		 * netdev_wait_allrefs(). Make sure we only call this once.
4709 		 */
4710 		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
4711 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4712 		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
4713 		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
4714 #endif
4715 	}
4716 
4717 	return NOTIFY_OK;
4718 }
4719 
4720 /*
4721  *	/proc
4722  */
4723 
4724 #ifdef CONFIG_PROC_FS
4725 
4726 static const struct file_operations ipv6_route_proc_fops = {
4727 	.open		= ipv6_route_open,
4728 	.read		= seq_read,
4729 	.llseek		= seq_lseek,
4730 	.release	= seq_release_net,
4731 };
4732 
4733 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
4734 {
4735 	struct net *net = (struct net *)seq->private;
4736 	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
4737 		   net->ipv6.rt6_stats->fib_nodes,
4738 		   net->ipv6.rt6_stats->fib_route_nodes,
4739 		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
4740 		   net->ipv6.rt6_stats->fib_rt_entries,
4741 		   net->ipv6.rt6_stats->fib_rt_cache,
4742 		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
4743 		   net->ipv6.rt6_stats->fib_discarded_routes);
4744 
4745 	return 0;
4746 }
4747 
4748 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
4749 {
4750 	return single_open_net(inode, file, rt6_stats_seq_show);
4751 }
4752 
4753 static const struct file_operations rt6_stats_seq_fops = {
4754 	.open	 = rt6_stats_seq_open,
4755 	.read	 = seq_read,
4756 	.llseek	 = seq_lseek,
4757 	.release = single_release_net,
4758 };
4759 #endif	/* CONFIG_PROC_FS */
4760 
4761 #ifdef CONFIG_SYSCTL
4762 
4763 static
4764 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
4765 			      void __user *buffer, size_t *lenp, loff_t *ppos)
4766 {
4767 	struct net *net;
4768 	int delay;
4769 	if (!write)
4770 		return -EINVAL;
4771 
4772 	net = (struct net *)ctl->extra1;
4773 	delay = net->ipv6.sysctl.flush_delay;
4774 	proc_dointvec(ctl, write, buffer, lenp, ppos);
4775 	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
4776 	return 0;
4777 }
4778 
4779 struct ctl_table ipv6_route_table_template[] = {
4780 	{
4781 		.procname	=	"flush",
4782 		.data		=	&init_net.ipv6.sysctl.flush_delay,
4783 		.maxlen		=	sizeof(int),
4784 		.mode		=	0200,
4785 		.proc_handler	=	ipv6_sysctl_rtcache_flush
4786 	},
4787 	{
4788 		.procname	=	"gc_thresh",
4789 		.data		=	&ip6_dst_ops_template.gc_thresh,
4790 		.maxlen		=	sizeof(int),
4791 		.mode		=	0644,
4792 		.proc_handler	=	proc_dointvec,
4793 	},
4794 	{
4795 		.procname	=	"max_size",
4796 		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
4797 		.maxlen		=	sizeof(int),
4798 		.mode		=	0644,
4799 		.proc_handler	=	proc_dointvec,
4800 	},
4801 	{
4802 		.procname	=	"gc_min_interval",
4803 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
4804 		.maxlen		=	sizeof(int),
4805 		.mode		=	0644,
4806 		.proc_handler	=	proc_dointvec_jiffies,
4807 	},
4808 	{
4809 		.procname	=	"gc_timeout",
4810 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
4811 		.maxlen		=	sizeof(int),
4812 		.mode		=	0644,
4813 		.proc_handler	=	proc_dointvec_jiffies,
4814 	},
4815 	{
4816 		.procname	=	"gc_interval",
4817 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
4818 		.maxlen		=	sizeof(int),
4819 		.mode		=	0644,
4820 		.proc_handler	=	proc_dointvec_jiffies,
4821 	},
4822 	{
4823 		.procname	=	"gc_elasticity",
4824 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
4825 		.maxlen		=	sizeof(int),
4826 		.mode		=	0644,
4827 		.proc_handler	=	proc_dointvec,
4828 	},
4829 	{
4830 		.procname	=	"mtu_expires",
4831 		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
4832 		.maxlen		=	sizeof(int),
4833 		.mode		=	0644,
4834 		.proc_handler	=	proc_dointvec_jiffies,
4835 	},
4836 	{
4837 		.procname	=	"min_adv_mss",
4838 		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
4839 		.maxlen		=	sizeof(int),
4840 		.mode		=	0644,
4841 		.proc_handler	=	proc_dointvec,
4842 	},
4843 	{
4844 		.procname	=	"gc_min_interval_ms",
4845 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
4846 		.maxlen		=	sizeof(int),
4847 		.mode		=	0644,
4848 		.proc_handler	=	proc_dointvec_ms_jiffies,
4849 	},
4850 	{ }
4851 };
4852 
4853 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
4854 {
4855 	struct ctl_table *table;
4856 
4857 	table = kmemdup(ipv6_route_table_template,
4858 			sizeof(ipv6_route_table_template),
4859 			GFP_KERNEL);
4860 
4861 	if (table) {
4862 		table[0].data = &net->ipv6.sysctl.flush_delay;
4863 		table[0].extra1 = net;
4864 		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
4865 		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
4866 		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
4867 		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
4868 		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
4869 		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
4870 		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
4871 		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
4872 		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
4873 
4874 		/* Don't export sysctls to unprivileged users */
4875 		if (net->user_ns != &init_user_ns)
4876 			table[0].procname = NULL;
4877 	}
4878 
4879 	return table;
4880 }
4881 #endif
4882 
4883 static int __net_init ip6_route_net_init(struct net *net)
4884 {
4885 	int ret = -ENOMEM;
4886 
4887 	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
4888 	       sizeof(net->ipv6.ip6_dst_ops));
4889 
4890 	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
4891 		goto out_ip6_dst_ops;
4892 
4893 	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
4894 					   sizeof(*net->ipv6.ip6_null_entry),
4895 					   GFP_KERNEL);
4896 	if (!net->ipv6.ip6_null_entry)
4897 		goto out_ip6_dst_entries;
4898 	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4899 	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
4900 			 ip6_template_metrics, true);
4901 
4902 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4903 	net->ipv6.fib6_has_custom_rules = false;
4904 	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
4905 					       sizeof(*net->ipv6.ip6_prohibit_entry),
4906 					       GFP_KERNEL);
4907 	if (!net->ipv6.ip6_prohibit_entry)
4908 		goto out_ip6_null_entry;
4909 	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4910 	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
4911 			 ip6_template_metrics, true);
4912 
4913 	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
4914 					       sizeof(*net->ipv6.ip6_blk_hole_entry),
4915 					       GFP_KERNEL);
4916 	if (!net->ipv6.ip6_blk_hole_entry)
4917 		goto out_ip6_prohibit_entry;
4918 	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4919 	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
4920 			 ip6_template_metrics, true);
4921 #endif
4922 
4923 	net->ipv6.sysctl.flush_delay = 0;
4924 	net->ipv6.sysctl.ip6_rt_max_size = 4096;
4925 	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
4926 	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
4927 	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
4928 	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
4929 	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
4930 	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
4931 
4932 	net->ipv6.ip6_rt_gc_expire = 30*HZ;
4933 
4934 	ret = 0;
4935 out:
4936 	return ret;
4937 
4938 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4939 out_ip6_prohibit_entry:
4940 	kfree(net->ipv6.ip6_prohibit_entry);
4941 out_ip6_null_entry:
4942 	kfree(net->ipv6.ip6_null_entry);
4943 #endif
4944 out_ip6_dst_entries:
4945 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
4946 out_ip6_dst_ops:
4947 	goto out;
4948 }
4949 
4950 static void __net_exit ip6_route_net_exit(struct net *net)
4951 {
4952 	kfree(net->ipv6.ip6_null_entry);
4953 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4954 	kfree(net->ipv6.ip6_prohibit_entry);
4955 	kfree(net->ipv6.ip6_blk_hole_entry);
4956 #endif
4957 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
4958 }
4959 
4960 static int __net_init ip6_route_net_init_late(struct net *net)
4961 {
4962 #ifdef CONFIG_PROC_FS
4963 	proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
4964 	proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
4965 #endif
4966 	return 0;
4967 }
4968 
4969 static void __net_exit ip6_route_net_exit_late(struct net *net)
4970 {
4971 #ifdef CONFIG_PROC_FS
4972 	remove_proc_entry("ipv6_route", net->proc_net);
4973 	remove_proc_entry("rt6_stats", net->proc_net);
4974 #endif
4975 }
4976 
4977 static struct pernet_operations ip6_route_net_ops = {
4978 	.init = ip6_route_net_init,
4979 	.exit = ip6_route_net_exit,
4980 };
4981 
4982 static int __net_init ipv6_inetpeer_init(struct net *net)
4983 {
4984 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
4985 
4986 	if (!bp)
4987 		return -ENOMEM;
4988 	inet_peer_base_init(bp);
4989 	net->ipv6.peers = bp;
4990 	return 0;
4991 }
4992 
4993 static void __net_exit ipv6_inetpeer_exit(struct net *net)
4994 {
4995 	struct inet_peer_base *bp = net->ipv6.peers;
4996 
4997 	net->ipv6.peers = NULL;
4998 	inetpeer_invalidate_tree(bp);
4999 	kfree(bp);
5000 }
5001 
5002 static struct pernet_operations ipv6_inetpeer_ops = {
5003 	.init	=	ipv6_inetpeer_init,
5004 	.exit	=	ipv6_inetpeer_exit,
5005 };
5006 
5007 static struct pernet_operations ip6_route_net_late_ops = {
5008 	.init = ip6_route_net_init_late,
5009 	.exit = ip6_route_net_exit_late,
5010 };
5011 
5012 static struct notifier_block ip6_route_dev_notifier = {
5013 	.notifier_call = ip6_route_dev_notify,
5014 	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
5015 };
5016 
5017 void __init ip6_route_init_special_entries(void)
5018 {
5019 	/* Registering of the loopback is done before this portion of code,
5020 	 * the loopback reference in rt6_info will not be taken, do it
5021 	 * manually for init_net */
5022 	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
5023 	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5024   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5025 	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
5026 	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5027 	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
5028 	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5029   #endif
5030 }
5031 
5032 int __init ip6_route_init(void)
5033 {
5034 	int ret;
5035 	int cpu;
5036 
5037 	ret = -ENOMEM;
5038 	ip6_dst_ops_template.kmem_cachep =
5039 		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
5040 				  SLAB_HWCACHE_ALIGN, NULL);
5041 	if (!ip6_dst_ops_template.kmem_cachep)
5042 		goto out;
5043 
5044 	ret = dst_entries_init(&ip6_dst_blackhole_ops);
5045 	if (ret)
5046 		goto out_kmem_cache;
5047 
5048 	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
5049 	if (ret)
5050 		goto out_dst_entries;
5051 
5052 	ret = register_pernet_subsys(&ip6_route_net_ops);
5053 	if (ret)
5054 		goto out_register_inetpeer;
5055 
5056 	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
5057 
5058 	ret = fib6_init();
5059 	if (ret)
5060 		goto out_register_subsys;
5061 
5062 	ret = xfrm6_init();
5063 	if (ret)
5064 		goto out_fib6_init;
5065 
5066 	ret = fib6_rules_init();
5067 	if (ret)
5068 		goto xfrm6_init;
5069 
5070 	ret = register_pernet_subsys(&ip6_route_net_late_ops);
5071 	if (ret)
5072 		goto fib6_rules_init;
5073 
5074 	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
5075 				   inet6_rtm_newroute, NULL, 0);
5076 	if (ret < 0)
5077 		goto out_register_late_subsys;
5078 
5079 	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
5080 				   inet6_rtm_delroute, NULL, 0);
5081 	if (ret < 0)
5082 		goto out_register_late_subsys;
5083 
5084 	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
5085 				   inet6_rtm_getroute, NULL,
5086 				   RTNL_FLAG_DOIT_UNLOCKED);
5087 	if (ret < 0)
5088 		goto out_register_late_subsys;
5089 
5090 	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
5091 	if (ret)
5092 		goto out_register_late_subsys;
5093 
5094 	for_each_possible_cpu(cpu) {
5095 		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
5096 
5097 		INIT_LIST_HEAD(&ul->head);
5098 		spin_lock_init(&ul->lock);
5099 	}
5100 
5101 out:
5102 	return ret;
5103 
5104 out_register_late_subsys:
5105 	rtnl_unregister_all(PF_INET6);
5106 	unregister_pernet_subsys(&ip6_route_net_late_ops);
5107 fib6_rules_init:
5108 	fib6_rules_cleanup();
5109 xfrm6_init:
5110 	xfrm6_fini();
5111 out_fib6_init:
5112 	fib6_gc_cleanup();
5113 out_register_subsys:
5114 	unregister_pernet_subsys(&ip6_route_net_ops);
5115 out_register_inetpeer:
5116 	unregister_pernet_subsys(&ipv6_inetpeer_ops);
5117 out_dst_entries:
5118 	dst_entries_destroy(&ip6_dst_blackhole_ops);
5119 out_kmem_cache:
5120 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5121 	goto out;
5122 }
5123 
5124 void ip6_route_cleanup(void)
5125 {
5126 	unregister_netdevice_notifier(&ip6_route_dev_notifier);
5127 	unregister_pernet_subsys(&ip6_route_net_late_ops);
5128 	fib6_rules_cleanup();
5129 	xfrm6_fini();
5130 	fib6_gc_cleanup();
5131 	unregister_pernet_subsys(&ipv6_inetpeer_ops);
5132 	unregister_pernet_subsys(&ip6_route_net_ops);
5133 	dst_entries_destroy(&ip6_dst_blackhole_ops);
5134 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5135 }
5136