xref: /openbmc/linux/net/ipv6/route.c (revision 2d972b6a)
1 /*
2  *	Linux INET6 implementation
3  *	FIB front-end.
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13 
14 /*	Changes:
15  *
16  *	YOSHIFUJI Hideaki @USAGI
17  *		reworked default router selection.
18  *		- respect outgoing interface
19  *		- select from (probably) reachable routers (i.e.
20  *		routers in REACHABLE, STALE, DELAY or PROBE states).
21  *		- always select the same router if it is (probably)
22  *		reachable; otherwise, round-robin the list.
23  *	Ville Nuorvala
24  *		Fixed routing subtrees.
25  */
26 
27 #define pr_fmt(fmt) "IPv6: " fmt
28 
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <net/net_namespace.h>
49 #include <net/snmp.h>
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #include <net/ndisc.h>
54 #include <net/addrconf.h>
55 #include <net/tcp.h>
56 #include <linux/rtnetlink.h>
57 #include <net/dst.h>
58 #include <net/dst_metadata.h>
59 #include <net/xfrm.h>
60 #include <net/netevent.h>
61 #include <net/netlink.h>
62 #include <net/nexthop.h>
63 #include <net/lwtunnel.h>
64 #include <net/ip_tunnels.h>
65 #include <net/l3mdev.h>
66 #include <trace/events/fib6.h>
67 
68 #include <linux/uaccess.h>
69 
70 #ifdef CONFIG_SYSCTL
71 #include <linux/sysctl.h>
72 #endif
73 
74 enum rt6_nud_state {
75 	RT6_NUD_FAIL_HARD = -3,
76 	RT6_NUD_FAIL_PROBE = -2,
77 	RT6_NUD_FAIL_DO_RR = -1,
78 	RT6_NUD_SUCCEED = 1
79 };
80 
81 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
82 static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
83 static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
84 static unsigned int	 ip6_mtu(const struct dst_entry *dst);
85 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
86 static void		ip6_dst_destroy(struct dst_entry *);
87 static void		ip6_dst_ifdown(struct dst_entry *,
88 				       struct net_device *dev, int how);
89 static int		 ip6_dst_gc(struct dst_ops *ops);
90 
91 static int		ip6_pkt_discard(struct sk_buff *skb);
92 static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
93 static int		ip6_pkt_prohibit(struct sk_buff *skb);
94 static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
95 static void		ip6_link_failure(struct sk_buff *skb);
96 static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
97 					   struct sk_buff *skb, u32 mtu);
98 static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
99 					struct sk_buff *skb);
100 static void		rt6_dst_from_metrics_check(struct rt6_info *rt);
101 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
102 static size_t rt6_nlmsg_size(struct rt6_info *rt);
103 static int rt6_fill_node(struct net *net,
104 			 struct sk_buff *skb, struct rt6_info *rt,
105 			 struct in6_addr *dst, struct in6_addr *src,
106 			 int iif, int type, u32 portid, u32 seq,
107 			 unsigned int flags);
108 static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
109 					   struct in6_addr *daddr,
110 					   struct in6_addr *saddr);
111 
112 #ifdef CONFIG_IPV6_ROUTE_INFO
113 static struct rt6_info *rt6_add_route_info(struct net *net,
114 					   const struct in6_addr *prefix, int prefixlen,
115 					   const struct in6_addr *gwaddr,
116 					   struct net_device *dev,
117 					   unsigned int pref);
118 static struct rt6_info *rt6_get_route_info(struct net *net,
119 					   const struct in6_addr *prefix, int prefixlen,
120 					   const struct in6_addr *gwaddr,
121 					   struct net_device *dev);
122 #endif
123 
124 struct uncached_list {
125 	spinlock_t		lock;
126 	struct list_head	head;
127 };
128 
129 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
130 
131 void rt6_uncached_list_add(struct rt6_info *rt)
132 {
133 	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
134 
135 	rt->rt6i_uncached_list = ul;
136 
137 	spin_lock_bh(&ul->lock);
138 	list_add_tail(&rt->rt6i_uncached, &ul->head);
139 	spin_unlock_bh(&ul->lock);
140 }
141 
142 void rt6_uncached_list_del(struct rt6_info *rt)
143 {
144 	if (!list_empty(&rt->rt6i_uncached)) {
145 		struct uncached_list *ul = rt->rt6i_uncached_list;
146 		struct net *net = dev_net(rt->dst.dev);
147 
148 		spin_lock_bh(&ul->lock);
149 		list_del(&rt->rt6i_uncached);
150 		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
151 		spin_unlock_bh(&ul->lock);
152 	}
153 }
154 
155 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
156 {
157 	struct net_device *loopback_dev = net->loopback_dev;
158 	int cpu;
159 
160 	if (dev == loopback_dev)
161 		return;
162 
163 	for_each_possible_cpu(cpu) {
164 		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
165 		struct rt6_info *rt;
166 
167 		spin_lock_bh(&ul->lock);
168 		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
169 			struct inet6_dev *rt_idev = rt->rt6i_idev;
170 			struct net_device *rt_dev = rt->dst.dev;
171 
172 			if (rt_idev->dev == dev) {
173 				rt->rt6i_idev = in6_dev_get(loopback_dev);
174 				in6_dev_put(rt_idev);
175 			}
176 
177 			if (rt_dev == dev) {
178 				rt->dst.dev = loopback_dev;
179 				dev_hold(rt->dst.dev);
180 				dev_put(rt_dev);
181 			}
182 		}
183 		spin_unlock_bh(&ul->lock);
184 	}
185 }
186 
187 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
188 {
189 	return dst_metrics_write_ptr(&rt->from->dst);
190 }
191 
192 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
193 {
194 	struct rt6_info *rt = (struct rt6_info *)dst;
195 
196 	if (rt->rt6i_flags & RTF_PCPU)
197 		return rt6_pcpu_cow_metrics(rt);
198 	else if (rt->rt6i_flags & RTF_CACHE)
199 		return NULL;
200 	else
201 		return dst_cow_metrics_generic(dst, old);
202 }
203 
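/* Choose the address used as the neighbour cache key for this route:
 * the route's gateway when one is set, otherwise the destination from
 * the skb's IPv6 header when available, else the caller-supplied daddr.
 */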
204 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
205 					     struct sk_buff *skb,
206 					     const void *daddr)
207 {
208 	struct in6_addr *p = &rt->rt6i_gateway;
209 
210 	if (!ipv6_addr_any(p))
211 		return (const void *) p;
212 	else if (skb)
213 		return &ipv6_hdr(skb)->daddr;
214 	return daddr;
215 }
216 
217 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
218 					  struct sk_buff *skb,
219 					  const void *daddr)
220 {
221 	struct rt6_info *rt = (struct rt6_info *) dst;
222 	struct neighbour *n;
223 
224 	daddr = choose_neigh_daddr(rt, skb, daddr);
225 	n = __ipv6_neigh_lookup(dst->dev, daddr);
226 	if (n)
227 		return n;
228 	return neigh_create(&nd_tbl, daddr, dst->dev);
229 }
230 
231 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
232 {
233 	struct net_device *dev = dst->dev;
234 	struct rt6_info *rt = (struct rt6_info *)dst;
235 
236 	daddr = choose_neigh_daddr(rt, NULL, daddr);
237 	if (!daddr)
238 		return;
239 	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
240 		return;
241 	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
242 		return;
243 	__ipv6_confirm_neigh(dev, daddr);
244 }
245 
246 static struct dst_ops ip6_dst_ops_template = {
247 	.family			=	AF_INET6,
248 	.gc			=	ip6_dst_gc,
249 	.gc_thresh		=	1024,
250 	.check			=	ip6_dst_check,
251 	.default_advmss		=	ip6_default_advmss,
252 	.mtu			=	ip6_mtu,
253 	.cow_metrics		=	ipv6_cow_metrics,
254 	.destroy		=	ip6_dst_destroy,
255 	.ifdown			=	ip6_dst_ifdown,
256 	.negative_advice	=	ip6_negative_advice,
257 	.link_failure		=	ip6_link_failure,
258 	.update_pmtu		=	ip6_rt_update_pmtu,
259 	.redirect		=	rt6_do_redirect,
260 	.local_out		=	__ip6_local_out,
261 	.neigh_lookup		=	ip6_neigh_lookup,
262 	.confirm_neigh		=	ip6_confirm_neigh,
263 };
264 
265 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
266 {
267 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
268 
269 	return mtu ? : dst->dev->mtu;
270 }
271 
272 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
273 					 struct sk_buff *skb, u32 mtu)
274 {
275 }
276 
277 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
278 				      struct sk_buff *skb)
279 {
280 }
281 
282 static struct dst_ops ip6_dst_blackhole_ops = {
283 	.family			=	AF_INET6,
284 	.destroy		=	ip6_dst_destroy,
285 	.check			=	ip6_dst_check,
286 	.mtu			=	ip6_blackhole_mtu,
287 	.default_advmss		=	ip6_default_advmss,
288 	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
289 	.redirect		=	ip6_rt_blackhole_redirect,
290 	.cow_metrics		=	dst_cow_metrics_generic,
291 	.neigh_lookup		=	ip6_neigh_lookup,
292 };
293 
294 static const u32 ip6_template_metrics[RTAX_MAX] = {
295 	[RTAX_HOPLIMIT - 1] = 0,
296 };
297 
298 static const struct rt6_info ip6_null_entry_template = {
299 	.dst = {
300 		.__refcnt	= ATOMIC_INIT(1),
301 		.__use		= 1,
302 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
303 		.error		= -ENETUNREACH,
304 		.input		= ip6_pkt_discard,
305 		.output		= ip6_pkt_discard_out,
306 	},
307 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
308 	.rt6i_protocol  = RTPROT_KERNEL,
309 	.rt6i_metric	= ~(u32) 0,
310 	.rt6i_ref	= ATOMIC_INIT(1),
311 };
312 
313 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
314 
315 static const struct rt6_info ip6_prohibit_entry_template = {
316 	.dst = {
317 		.__refcnt	= ATOMIC_INIT(1),
318 		.__use		= 1,
319 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
320 		.error		= -EACCES,
321 		.input		= ip6_pkt_prohibit,
322 		.output		= ip6_pkt_prohibit_out,
323 	},
324 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
325 	.rt6i_protocol  = RTPROT_KERNEL,
326 	.rt6i_metric	= ~(u32) 0,
327 	.rt6i_ref	= ATOMIC_INIT(1),
328 };
329 
330 static const struct rt6_info ip6_blk_hole_entry_template = {
331 	.dst = {
332 		.__refcnt	= ATOMIC_INIT(1),
333 		.__use		= 1,
334 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
335 		.error		= -EINVAL,
336 		.input		= dst_discard,
337 		.output		= dst_discard_out,
338 	},
339 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
340 	.rt6i_protocol  = RTPROT_KERNEL,
341 	.rt6i_metric	= ~(u32) 0,
342 	.rt6i_ref	= ATOMIC_INIT(1),
343 };
344 
345 #endif
346 
347 static void rt6_info_init(struct rt6_info *rt)
348 {
349 	struct dst_entry *dst = &rt->dst;
350 
351 	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
352 	INIT_LIST_HEAD(&rt->rt6i_siblings);
353 	INIT_LIST_HEAD(&rt->rt6i_uncached);
354 }
355 
356 /* allocate dst with ip6_dst_ops */
357 static struct rt6_info *__ip6_dst_alloc(struct net *net,
358 					struct net_device *dev,
359 					int flags)
360 {
361 	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
362 					1, DST_OBSOLETE_FORCE_CHK, flags);
363 
364 	if (rt) {
365 		rt6_info_init(rt);
366 		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
367 	}
368 
369 	return rt;
370 }
371 
372 struct rt6_info *ip6_dst_alloc(struct net *net,
373 			       struct net_device *dev,
374 			       int flags)
375 {
376 	struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
377 
378 	if (rt) {
379 		rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
380 		if (!rt->rt6i_pcpu) {
381 			dst_release_immediate(&rt->dst);
382 			return NULL;
383 		}
384 	}
385 
386 	return rt;
387 }
388 EXPORT_SYMBOL(ip6_dst_alloc);
389 
390 static void ip6_dst_destroy(struct dst_entry *dst)
391 {
392 	struct rt6_info *rt = (struct rt6_info *)dst;
393 	struct rt6_exception_bucket *bucket;
394 	struct rt6_info *from = rt->from;
395 	struct inet6_dev *idev;
396 
397 	dst_destroy_metrics_generic(dst);
398 	free_percpu(rt->rt6i_pcpu);
399 	rt6_uncached_list_del(rt);
400 
401 	idev = rt->rt6i_idev;
402 	if (idev) {
403 		rt->rt6i_idev = NULL;
404 		in6_dev_put(idev);
405 	}
406 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1);
407 	if (bucket) {
408 		rt->rt6i_exception_bucket = NULL;
409 		kfree(bucket);
410 	}
411 
412 	rt->from = NULL;
413 	dst_release(&from->dst);
414 }
415 
416 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
417 			   int how)
418 {
419 	struct rt6_info *rt = (struct rt6_info *)dst;
420 	struct inet6_dev *idev = rt->rt6i_idev;
421 	struct net_device *loopback_dev =
422 		dev_net(dev)->loopback_dev;
423 
424 	if (idev && idev->dev != loopback_dev) {
425 		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
426 		if (loopback_idev) {
427 			rt->rt6i_idev = loopback_idev;
428 			in6_dev_put(idev);
429 		}
430 	}
431 }
432 
433 static bool __rt6_check_expired(const struct rt6_info *rt)
434 {
435 	if (rt->rt6i_flags & RTF_EXPIRES)
436 		return time_after(jiffies, rt->dst.expires);
437 	else
438 		return false;
439 }
440 
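/* Like __rt6_check_expired(), but a route that does not itself carry
 * RTF_EXPIRES is also considered expired when its dst has been
 * obsoleted or when the route it was cloned from ('from') has expired.
 */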
441 static bool rt6_check_expired(const struct rt6_info *rt)
442 {
443 	if (rt->rt6i_flags & RTF_EXPIRES) {
444 		if (time_after(jiffies, rt->dst.expires))
445 			return true;
446 	} else if (rt->from) {
447 		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
448 			rt6_check_expired(rt->from);
449 	}
450 	return false;
451 }
452 
453 static struct rt6_info *rt6_multipath_select(const struct net *net,
454 					     struct rt6_info *match,
455 					     struct flowi6 *fl6, int oif,
456 					     const struct sk_buff *skb,
457 					     int strict)
458 {
459 	struct rt6_info *sibling, *next_sibling;
460 
461 	/* We might have already computed the hash for ICMPv6 errors. In that
462 	 * case it will always be non-zero; otherwise, now is the time to do it.
463 	 */
464 	if (!fl6->mp_hash)
465 		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
466 
467 	if (fl6->mp_hash <= atomic_read(&match->rt6i_nh_upper_bound))
468 		return match;
469 
470 	list_for_each_entry_safe(sibling, next_sibling, &match->rt6i_siblings,
471 				 rt6i_siblings) {
472 		if (fl6->mp_hash > atomic_read(&sibling->rt6i_nh_upper_bound))
473 			continue;
474 		if (rt6_score_route(sibling, oif, strict) < 0)
475 			break;
476 		match = sibling;
477 		break;
478 	}
479 
480 	return match;
481 }
482 
483 /*
484  *	Route lookup. rcu_read_lock() should be held.
485  */
486 
487 static inline struct rt6_info *rt6_device_match(struct net *net,
488 						    struct rt6_info *rt,
489 						    const struct in6_addr *saddr,
490 						    int oif,
491 						    int flags)
492 {
493 	struct rt6_info *local = NULL;
494 	struct rt6_info *sprt;
495 
496 	if (!oif && ipv6_addr_any(saddr) && !(rt->rt6i_nh_flags & RTNH_F_DEAD))
497 		return rt;
498 
499 	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->rt6_next)) {
500 		struct net_device *dev = sprt->dst.dev;
501 
502 		if (sprt->rt6i_nh_flags & RTNH_F_DEAD)
503 			continue;
504 
505 		if (oif) {
506 			if (dev->ifindex == oif)
507 				return sprt;
508 			if (dev->flags & IFF_LOOPBACK) {
509 				if (!sprt->rt6i_idev ||
510 				    sprt->rt6i_idev->dev->ifindex != oif) {
511 					if (flags & RT6_LOOKUP_F_IFACE)
512 						continue;
513 					if (local &&
514 					    local->rt6i_idev->dev->ifindex == oif)
515 						continue;
516 				}
517 				local = sprt;
518 			}
519 		} else {
520 			if (ipv6_chk_addr(net, saddr, dev,
521 					  flags & RT6_LOOKUP_F_IFACE))
522 				return sprt;
523 		}
524 	}
525 
526 	if (oif) {
527 		if (local)
528 			return local;
529 
530 		if (flags & RT6_LOOKUP_F_IFACE)
531 			return net->ipv6.ip6_null_entry;
532 	}
533 
534 	return rt->rt6i_nh_flags & RTNH_F_DEAD ? net->ipv6.ip6_null_entry : rt;
535 }
536 
537 #ifdef CONFIG_IPV6_ROUTER_PREF
538 struct __rt6_probe_work {
539 	struct work_struct work;
540 	struct in6_addr target;
541 	struct net_device *dev;
542 };
543 
544 static void rt6_probe_deferred(struct work_struct *w)
545 {
546 	struct in6_addr mcaddr;
547 	struct __rt6_probe_work *work =
548 		container_of(w, struct __rt6_probe_work, work);
549 
550 	addrconf_addr_solict_mult(&work->target, &mcaddr);
551 	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
552 	dev_put(work->dev);
553 	kfree(work);
554 }
555 
556 static void rt6_probe(struct rt6_info *rt)
557 {
558 	struct __rt6_probe_work *work;
559 	struct neighbour *neigh;
560 	/*
561 	 * Okay, this does not seem to be appropriate
562 	 * for now; however, we need to check whether it
563 	 * really is so - aka Router Reachability Probing.
564 	 *
565 	 * Router Reachability Probe MUST be rate-limited
566 	 * to no more than one per minute.
567 	 */
568 	if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
569 		return;
570 	rcu_read_lock_bh();
571 	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
572 	if (neigh) {
573 		if (neigh->nud_state & NUD_VALID)
574 			goto out;
575 
576 		work = NULL;
577 		write_lock(&neigh->lock);
578 		if (!(neigh->nud_state & NUD_VALID) &&
579 		    time_after(jiffies,
580 			       neigh->updated +
581 			       rt->rt6i_idev->cnf.rtr_probe_interval)) {
582 			work = kmalloc(sizeof(*work), GFP_ATOMIC);
583 			if (work)
584 				__neigh_set_probe_once(neigh);
585 		}
586 		write_unlock(&neigh->lock);
587 	} else {
588 		work = kmalloc(sizeof(*work), GFP_ATOMIC);
589 	}
590 
591 	if (work) {
592 		INIT_WORK(&work->work, rt6_probe_deferred);
593 		work->target = rt->rt6i_gateway;
594 		dev_hold(rt->dst.dev);
595 		work->dev = rt->dst.dev;
596 		schedule_work(&work->work);
597 	}
598 
599 out:
600 	rcu_read_unlock_bh();
601 }
602 #else
603 static inline void rt6_probe(struct rt6_info *rt)
604 {
605 }
606 #endif
607 
608 /*
609  * Default Router Selection (RFC 2461 6.3.6)
610  */
611 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
612 {
613 	struct net_device *dev = rt->dst.dev;
614 	if (!oif || dev->ifindex == oif)
615 		return 2;
616 	if ((dev->flags & IFF_LOOPBACK) &&
617 	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
618 		return 1;
619 	return 0;
620 }
621 
622 static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
623 {
624 	struct neighbour *neigh;
625 	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
626 
627 	if (rt->rt6i_flags & RTF_NONEXTHOP ||
628 	    !(rt->rt6i_flags & RTF_GATEWAY))
629 		return RT6_NUD_SUCCEED;
630 
631 	rcu_read_lock_bh();
632 	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
633 	if (neigh) {
634 		read_lock(&neigh->lock);
635 		if (neigh->nud_state & NUD_VALID)
636 			ret = RT6_NUD_SUCCEED;
637 #ifdef CONFIG_IPV6_ROUTER_PREF
638 		else if (!(neigh->nud_state & NUD_FAILED))
639 			ret = RT6_NUD_SUCCEED;
640 		else
641 			ret = RT6_NUD_FAIL_PROBE;
642 #endif
643 		read_unlock(&neigh->lock);
644 	} else {
645 		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
646 		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
647 	}
648 	rcu_read_unlock_bh();
649 
650 	return ret;
651 }
652 
653 static int rt6_score_route(struct rt6_info *rt, int oif,
654 			   int strict)
655 {
656 	int m;
657 
658 	m = rt6_check_dev(rt, oif);
659 	if (!m && (strict & RT6_LOOKUP_F_IFACE))
660 		return RT6_NUD_FAIL_HARD;
661 #ifdef CONFIG_IPV6_ROUTER_PREF
662 	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
663 #endif
664 	if (strict & RT6_LOOKUP_F_REACHABLE) {
665 		int n = rt6_check_neigh(rt);
666 		if (n < 0)
667 			return n;
668 	}
669 	return m;
670 }
671 
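/* Score rt with rt6_score_route() and make it the new match when it
 * beats the best score seen so far (*mpri). RT6_NUD_FAIL_DO_RR counts
 * as the lowest valid score; if such a route becomes the match, *do_rr
 * tells the caller to advance the round-robin pointer.
 */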
672 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
673 				   int *mpri, struct rt6_info *match,
674 				   bool *do_rr)
675 {
676 	int m;
677 	bool match_do_rr = false;
678 	struct inet6_dev *idev = rt->rt6i_idev;
679 
680 	if (rt->rt6i_nh_flags & RTNH_F_DEAD)
681 		goto out;
682 
683 	if (idev->cnf.ignore_routes_with_linkdown &&
684 	    rt->rt6i_nh_flags & RTNH_F_LINKDOWN &&
685 	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
686 		goto out;
687 
688 	if (rt6_check_expired(rt))
689 		goto out;
690 
691 	m = rt6_score_route(rt, oif, strict);
692 	if (m == RT6_NUD_FAIL_DO_RR) {
693 		match_do_rr = true;
694 		m = 0; /* lowest valid score */
695 	} else if (m == RT6_NUD_FAIL_HARD) {
696 		goto out;
697 	}
698 
699 	if (strict & RT6_LOOKUP_F_REACHABLE)
700 		rt6_probe(rt);
701 
702 	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
703 	if (m > *mpri) {
704 		*do_rr = match_do_rr;
705 		*mpri = m;
706 		match = rt;
707 	}
708 out:
709 	return match;
710 }
711 
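/* Return the best-scoring route among those with the given metric,
 * scanning from rr_head and wrapping around through the leaf list.
 * Routes with a different metric ('cont') are only scanned when
 * nothing in the primary metric group matched.
 */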
712 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
713 				     struct rt6_info *leaf,
714 				     struct rt6_info *rr_head,
715 				     u32 metric, int oif, int strict,
716 				     bool *do_rr)
717 {
718 	struct rt6_info *rt, *match, *cont;
719 	int mpri = -1;
720 
721 	match = NULL;
722 	cont = NULL;
723 	for (rt = rr_head; rt; rt = rcu_dereference(rt->rt6_next)) {
724 		if (rt->rt6i_metric != metric) {
725 			cont = rt;
726 			break;
727 		}
728 
729 		match = find_match(rt, oif, strict, &mpri, match, do_rr);
730 	}
731 
732 	for (rt = leaf; rt && rt != rr_head;
733 	     rt = rcu_dereference(rt->rt6_next)) {
734 		if (rt->rt6i_metric != metric) {
735 			cont = rt;
736 			break;
737 		}
738 
739 		match = find_match(rt, oif, strict, &mpri, match, do_rr);
740 	}
741 
742 	if (match || !cont)
743 		return match;
744 
745 	for (rt = cont; rt; rt = rcu_dereference(rt->rt6_next))
746 		match = find_match(rt, oif, strict, &mpri, match, do_rr);
747 
748 	return match;
749 }
750 
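/* Select the best route from fn's leaf list, starting the scan at the
 * round-robin pointer (fn->rr_ptr) when one is set, and advance that
 * pointer when find_rr_leaf() asks for it. Returns ip6_null_entry if
 * no usable route is found.
 */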
751 static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn,
752 				   int oif, int strict)
753 {
754 	struct rt6_info *leaf = rcu_dereference(fn->leaf);
755 	struct rt6_info *match, *rt0;
756 	bool do_rr = false;
757 	int key_plen;
758 
759 	if (!leaf || leaf == net->ipv6.ip6_null_entry)
760 		return net->ipv6.ip6_null_entry;
761 
762 	rt0 = rcu_dereference(fn->rr_ptr);
763 	if (!rt0)
764 		rt0 = leaf;
765 
766 	/* Double check to make sure fn is not an intermediate node
767 	 * and fn->leaf does not point to its child's leaf
768 	 * (This might happen if all routes under fn are deleted from
769 	 * the tree and fib6_repair_tree() is called on the node.)
770 	 */
771 	key_plen = rt0->rt6i_dst.plen;
772 #ifdef CONFIG_IPV6_SUBTREES
773 	if (rt0->rt6i_src.plen)
774 		key_plen = rt0->rt6i_src.plen;
775 #endif
776 	if (fn->fn_bit != key_plen)
777 		return net->ipv6.ip6_null_entry;
778 
779 	match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict,
780 			     &do_rr);
781 
782 	if (do_rr) {
783 		struct rt6_info *next = rcu_dereference(rt0->rt6_next);
784 
785 		/* no entries matched; do round-robin */
786 		if (!next || next->rt6i_metric != rt0->rt6i_metric)
787 			next = leaf;
788 
789 		if (next != rt0) {
790 			spin_lock_bh(&leaf->rt6i_table->tb6_lock);
791 			/* make sure next is not being deleted from the tree */
792 			if (next->rt6i_node)
793 				rcu_assign_pointer(fn->rr_ptr, next);
794 			spin_unlock_bh(&leaf->rt6i_table->tb6_lock);
795 		}
796 	}
797 
798 	return match ? match : net->ipv6.ip6_null_entry;
799 }
800 
801 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
802 {
803 	return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
804 }
805 
806 #ifdef CONFIG_IPV6_ROUTE_INFO
807 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
808 		  const struct in6_addr *gwaddr)
809 {
810 	struct net *net = dev_net(dev);
811 	struct route_info *rinfo = (struct route_info *) opt;
812 	struct in6_addr prefix_buf, *prefix;
813 	unsigned int pref;
814 	unsigned long lifetime;
815 	struct rt6_info *rt;
816 
817 	if (len < sizeof(struct route_info)) {
818 		return -EINVAL;
819 	}
820 
821 	/* Sanity check for prefix_len and length */
822 	if (rinfo->length > 3) {
823 		return -EINVAL;
824 	} else if (rinfo->prefix_len > 128) {
825 		return -EINVAL;
826 	} else if (rinfo->prefix_len > 64) {
827 		if (rinfo->length < 2) {
828 			return -EINVAL;
829 		}
830 	} else if (rinfo->prefix_len > 0) {
831 		if (rinfo->length < 1) {
832 			return -EINVAL;
833 		}
834 	}
835 
836 	pref = rinfo->route_pref;
837 	if (pref == ICMPV6_ROUTER_PREF_INVALID)
838 		return -EINVAL;
839 
840 	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
841 
842 	if (rinfo->length == 3)
843 		prefix = (struct in6_addr *)rinfo->prefix;
844 	else {
845 		/* this function is safe */
846 		ipv6_addr_prefix(&prefix_buf,
847 				 (struct in6_addr *)rinfo->prefix,
848 				 rinfo->prefix_len);
849 		prefix = &prefix_buf;
850 	}
851 
852 	if (rinfo->prefix_len == 0)
853 		rt = rt6_get_dflt_router(gwaddr, dev);
854 	else
855 		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
856 					gwaddr, dev);
857 
858 	if (rt && !lifetime) {
859 		ip6_del_rt(rt);
860 		rt = NULL;
861 	}
862 
863 	if (!rt && lifetime)
864 		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
865 					dev, pref);
866 	else if (rt)
867 		rt->rt6i_flags = RTF_ROUTEINFO |
868 				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
869 
870 	if (rt) {
871 		if (!addrconf_finite_timeout(lifetime))
872 			rt6_clean_expires(rt);
873 		else
874 			rt6_set_expires(rt, jiffies + HZ * lifetime);
875 
876 		ip6_rt_put(rt);
877 	}
878 	return 0;
879 }
880 #endif
881 
882 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
883 					struct in6_addr *saddr)
884 {
885 	struct fib6_node *pn, *sn;
886 	while (1) {
887 		if (fn->fn_flags & RTN_TL_ROOT)
888 			return NULL;
889 		pn = rcu_dereference(fn->parent);
890 		sn = FIB6_SUBTREE(pn);
891 		if (sn && sn != fn)
892 			fn = fib6_lookup(sn, NULL, saddr);
893 		else
894 			fn = pn;
895 		if (fn->fn_flags & RTN_RTINFO)
896 			return fn;
897 	}
898 }
899 
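/* Try to take a reference on *prt; return true on success. On
 * failure, *prt is replaced with a held ip6_null_entry (when
 * null_fallback is set) or NULL, and false is returned.
 */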
900 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
901 			  bool null_fallback)
902 {
903 	struct rt6_info *rt = *prt;
904 
905 	if (dst_hold_safe(&rt->dst))
906 		return true;
907 	if (null_fallback) {
908 		rt = net->ipv6.ip6_null_entry;
909 		dst_hold(&rt->dst);
910 	} else {
911 		rt = NULL;
912 	}
913 	*prt = rt;
914 	return false;
915 }
916 
917 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
918 					     struct fib6_table *table,
919 					     struct flowi6 *fl6,
920 					     const struct sk_buff *skb,
921 					     int flags)
922 {
923 	struct rt6_info *rt, *rt_cache;
924 	struct fib6_node *fn;
925 
926 	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
927 		flags &= ~RT6_LOOKUP_F_IFACE;
928 
929 	rcu_read_lock();
930 	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
931 restart:
932 	rt = rcu_dereference(fn->leaf);
933 	if (!rt) {
934 		rt = net->ipv6.ip6_null_entry;
935 	} else {
936 		rt = rt6_device_match(net, rt, &fl6->saddr,
937 				      fl6->flowi6_oif, flags);
938 		if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
939 			rt = rt6_multipath_select(net, rt, fl6, fl6->flowi6_oif,
940 						  skb, flags);
941 	}
942 	if (rt == net->ipv6.ip6_null_entry) {
943 		fn = fib6_backtrack(fn, &fl6->saddr);
944 		if (fn)
945 			goto restart;
946 	}
947 	/* Search through exception table */
948 	rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
949 	if (rt_cache)
950 		rt = rt_cache;
951 
952 	if (ip6_hold_safe(net, &rt, true))
953 		dst_use_noref(&rt->dst, jiffies);
954 
955 	rcu_read_unlock();
956 
957 	trace_fib6_table_lookup(net, rt, table, fl6);
958 
959 	return rt;
960 
961 }
962 
963 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
964 				   const struct sk_buff *skb, int flags)
965 {
966 	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
967 }
968 EXPORT_SYMBOL_GPL(ip6_route_lookup);
969 
970 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
971 			    const struct in6_addr *saddr, int oif,
972 			    const struct sk_buff *skb, int strict)
973 {
974 	struct flowi6 fl6 = {
975 		.flowi6_oif = oif,
976 		.daddr = *daddr,
977 	};
978 	struct dst_entry *dst;
979 	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
980 
981 	if (saddr) {
982 		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
983 		flags |= RT6_LOOKUP_F_HAS_SADDR;
984 	}
985 
986 	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
987 	if (dst->error == 0)
988 		return (struct rt6_info *) dst;
989 
990 	dst_release(dst);
991 
992 	return NULL;
993 }
994 EXPORT_SYMBOL(rt6_lookup);
995 
996 /* ip6_ins_rt is called with table->tb6_lock NOT held.
997  * It takes a new route entry; if the addition fails for any reason,
998  * the route is released.
999  * The caller must hold a dst reference before calling it.
1000  */
1001 
1002 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
1003 			struct mx6_config *mxc,
1004 			struct netlink_ext_ack *extack)
1005 {
1006 	int err;
1007 	struct fib6_table *table;
1008 
1009 	table = rt->rt6i_table;
1010 	spin_lock_bh(&table->tb6_lock);
1011 	err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
1012 	spin_unlock_bh(&table->tb6_lock);
1013 
1014 	return err;
1015 }
1016 
1017 int ip6_ins_rt(struct rt6_info *rt)
1018 {
1019 	struct nl_info info = {	.nl_net = dev_net(rt->dst.dev), };
1020 	struct mx6_config mxc = { .mx = NULL, };
1021 
1022 	/* Hold dst to account for the reference from the fib6 tree */
1023 	dst_hold(&rt->dst);
1024 	return __ip6_ins_rt(rt, &info, &mxc, NULL);
1025 }
1026 
1027 /* called with rcu_read_lock() held */
1028 static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
1029 {
1030 	struct net_device *dev = rt->dst.dev;
1031 
1032 	if (rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) {
1033 		/* for copies of local routes, dst->dev needs to be: the
1034 		 * device itself if it is an L3 master device; the master
1035 		 * device if the device is enslaved; else the loopback device
1036 		 */
1037 		if (netif_is_l3_slave(dev) &&
1038 		    !rt6_need_strict(&rt->rt6i_dst.addr))
1039 			dev = l3mdev_master_dev_rcu(dev);
1040 		else if (!netif_is_l3_master(dev))
1041 			dev = dev_net(dev)->loopback_dev;
1042 		/* the remaining case is netif_is_l3_master(dev) being true,
1043 		 * in which case dev itself is what we want returned
1044 		 */
1045 	}
1046 
1047 	return dev;
1048 }
1049 
1050 static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
1051 					   const struct in6_addr *daddr,
1052 					   const struct in6_addr *saddr)
1053 {
1054 	struct net_device *dev;
1055 	struct rt6_info *rt;
1056 
1057 	/*
1058 	 *	Clone the route.
1059 	 */
1060 
1061 	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
1062 		ort = ort->from;
1063 
1064 	rcu_read_lock();
1065 	dev = ip6_rt_get_dev_rcu(ort);
1066 	rt = __ip6_dst_alloc(dev_net(dev), dev, 0);
1067 	rcu_read_unlock();
1068 	if (!rt)
1069 		return NULL;
1070 
1071 	ip6_rt_copy_init(rt, ort);
1072 	rt->rt6i_flags |= RTF_CACHE;
1073 	rt->rt6i_metric = 0;
1074 	rt->dst.flags |= DST_HOST;
1075 	rt->rt6i_dst.addr = *daddr;
1076 	rt->rt6i_dst.plen = 128;
1077 
1078 	if (!rt6_is_gw_or_nonexthop(ort)) {
1079 		if (ort->rt6i_dst.plen != 128 &&
1080 		    ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
1081 			rt->rt6i_flags |= RTF_ANYCAST;
1082 #ifdef CONFIG_IPV6_SUBTREES
1083 		if (rt->rt6i_src.plen && saddr) {
1084 			rt->rt6i_src.addr = *saddr;
1085 			rt->rt6i_src.plen = 128;
1086 		}
1087 #endif
1088 	}
1089 
1090 	return rt;
1091 }
1092 
1093 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
1094 {
1095 	struct net_device *dev;
1096 	struct rt6_info *pcpu_rt;
1097 
1098 	rcu_read_lock();
1099 	dev = ip6_rt_get_dev_rcu(rt);
1100 	pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags);
1101 	rcu_read_unlock();
1102 	if (!pcpu_rt)
1103 		return NULL;
1104 	ip6_rt_copy_init(pcpu_rt, rt);
1105 	pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
1106 	pcpu_rt->rt6i_flags |= RTF_PCPU;
1107 	return pcpu_rt;
1108 }
1109 
1110 /* It should be called with rcu_read_lock() acquired */
1111 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
1112 {
1113 	struct rt6_info *pcpu_rt, **p;
1114 
1115 	p = this_cpu_ptr(rt->rt6i_pcpu);
1116 	pcpu_rt = *p;
1117 
1118 	if (pcpu_rt && ip6_hold_safe(NULL, &pcpu_rt, false))
1119 		rt6_dst_from_metrics_check(pcpu_rt);
1120 
1121 	return pcpu_rt;
1122 }
1123 
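/* Allocate a per-cpu copy of rt and publish it in this cpu's
 * rt->rt6i_pcpu slot, which must currently be empty (hence the
 * BUG_ON). On allocation failure a held ip6_null_entry is returned
 * instead.
 */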
1124 static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
1125 {
1126 	struct rt6_info *pcpu_rt, *prev, **p;
1127 
1128 	pcpu_rt = ip6_rt_pcpu_alloc(rt);
1129 	if (!pcpu_rt) {
1130 		struct net *net = dev_net(rt->dst.dev);
1131 
1132 		dst_hold(&net->ipv6.ip6_null_entry->dst);
1133 		return net->ipv6.ip6_null_entry;
1134 	}
1135 
1136 	dst_hold(&pcpu_rt->dst);
1137 	p = this_cpu_ptr(rt->rt6i_pcpu);
1138 	prev = cmpxchg(p, NULL, pcpu_rt);
1139 	BUG_ON(prev);
1140 
1141 	rt6_dst_from_metrics_check(pcpu_rt);
1142 	return pcpu_rt;
1143 }
1144 
1145 /* exception hash table implementation
1146  */
1147 static DEFINE_SPINLOCK(rt6_exception_lock);
1148 
1149 /* Remove rt6_ex from hash table and free the memory
1150  * Caller must hold rt6_exception_lock
1151  */
1152 static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1153 				 struct rt6_exception *rt6_ex)
1154 {
1155 	struct net *net;
1156 
1157 	if (!bucket || !rt6_ex)
1158 		return;
1159 
1160 	net = dev_net(rt6_ex->rt6i->dst.dev);
1161 	rt6_ex->rt6i->rt6i_node = NULL;
1162 	hlist_del_rcu(&rt6_ex->hlist);
1163 	rt6_release(rt6_ex->rt6i);
1164 	kfree_rcu(rt6_ex, rcu);
1165 	WARN_ON_ONCE(!bucket->depth);
1166 	bucket->depth--;
1167 	net->ipv6.rt6_stats->fib_rt_cache--;
1168 }
1169 
1170 /* Remove oldest rt6_ex in bucket and free the memory
1171  * Caller must hold rt6_exception_lock
1172  */
1173 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1174 {
1175 	struct rt6_exception *rt6_ex, *oldest = NULL;
1176 
1177 	if (!bucket)
1178 		return;
1179 
1180 	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1181 		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1182 			oldest = rt6_ex;
1183 	}
1184 	rt6_remove_exception(bucket, oldest);
1185 }
1186 
1187 static u32 rt6_exception_hash(const struct in6_addr *dst,
1188 			      const struct in6_addr *src)
1189 {
1190 	static u32 seed __read_mostly;
1191 	u32 val;
1192 
1193 	net_get_random_once(&seed, sizeof(seed));
1194 	val = jhash(dst, sizeof(*dst), seed);
1195 
1196 #ifdef CONFIG_IPV6_SUBTREES
1197 	if (src)
1198 		val = jhash(src, sizeof(*src), val);
1199 #endif
1200 	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1201 }
1202 
1203 /* Helper function to find the cached rt in the hash table
1204  * and update bucket pointer to point to the bucket for this
1205  * (daddr, saddr) pair
1206  * Caller must hold rt6_exception_lock
1207  */
1208 static struct rt6_exception *
1209 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1210 			      const struct in6_addr *daddr,
1211 			      const struct in6_addr *saddr)
1212 {
1213 	struct rt6_exception *rt6_ex;
1214 	u32 hval;
1215 
1216 	if (!(*bucket) || !daddr)
1217 		return NULL;
1218 
1219 	hval = rt6_exception_hash(daddr, saddr);
1220 	*bucket += hval;
1221 
1222 	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1223 		struct rt6_info *rt6 = rt6_ex->rt6i;
1224 		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1225 
1226 #ifdef CONFIG_IPV6_SUBTREES
1227 		if (matched && saddr)
1228 			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1229 #endif
1230 		if (matched)
1231 			return rt6_ex;
1232 	}
1233 	return NULL;
1234 }
1235 
1236 /* Helper function to find the cached rt in the hash table
1237  * and update bucket pointer to point to the bucket for this
1238  * (daddr, saddr) pair
1239  * Caller must hold rcu_read_lock()
1240  */
1241 static struct rt6_exception *
1242 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1243 			 const struct in6_addr *daddr,
1244 			 const struct in6_addr *saddr)
1245 {
1246 	struct rt6_exception *rt6_ex;
1247 	u32 hval;
1248 
1249 	WARN_ON_ONCE(!rcu_read_lock_held());
1250 
1251 	if (!(*bucket) || !daddr)
1252 		return NULL;
1253 
1254 	hval = rt6_exception_hash(daddr, saddr);
1255 	*bucket += hval;
1256 
1257 	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1258 		struct rt6_info *rt6 = rt6_ex->rt6i;
1259 		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1260 
1261 #ifdef CONFIG_IPV6_SUBTREES
1262 		if (matched && saddr)
1263 			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1264 #endif
1265 		if (matched)
1266 			return rt6_ex;
1267 	}
1268 	return NULL;
1269 }
1270 
1271 static int rt6_insert_exception(struct rt6_info *nrt,
1272 				struct rt6_info *ort)
1273 {
1274 	struct net *net = dev_net(ort->dst.dev);
1275 	struct rt6_exception_bucket *bucket;
1276 	struct in6_addr *src_key = NULL;
1277 	struct rt6_exception *rt6_ex;
1278 	int err = 0;
1279 
1280 	/* ort can't be a cache or pcpu route */
1281 	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
1282 		ort = ort->from;
1283 	WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU));
1284 
1285 	spin_lock_bh(&rt6_exception_lock);
1286 
1287 	if (ort->exception_bucket_flushed) {
1288 		err = -EINVAL;
1289 		goto out;
1290 	}
1291 
1292 	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
1293 					lockdep_is_held(&rt6_exception_lock));
1294 	if (!bucket) {
1295 		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
1296 				 GFP_ATOMIC);
1297 		if (!bucket) {
1298 			err = -ENOMEM;
1299 			goto out;
1300 		}
1301 		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
1302 	}
1303 
1304 #ifdef CONFIG_IPV6_SUBTREES
1305 	/* rt6i_src.plen != 0 indicates ort is in subtree
1306 	 * and exception table is indexed by a hash of
1307 	 * both rt6i_dst and rt6i_src.
1308 	 * Otherwise, the exception table is indexed by
1309 	 * a hash of only rt6i_dst.
1310 	 */
1311 	if (ort->rt6i_src.plen)
1312 		src_key = &nrt->rt6i_src.addr;
1313 #endif
1314 
1315 	/* Update rt6i_prefsrc as it could be changed
1316 	 * in rt6_remove_prefsrc()
1317 	 */
1318 	nrt->rt6i_prefsrc = ort->rt6i_prefsrc;
1319 	/* rt6_mtu_change() might lower mtu on ort.
1320 	 * Only insert this exception route if its mtu
1321 	 * is less than ort's mtu value.
1322 	 */
1323 	if (nrt->rt6i_pmtu >= dst_mtu(&ort->dst)) {
1324 		err = -EINVAL;
1325 		goto out;
1326 	}
1327 
1328 	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
1329 					       src_key);
1330 	if (rt6_ex)
1331 		rt6_remove_exception(bucket, rt6_ex);
1332 
1333 	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
1334 	if (!rt6_ex) {
1335 		err = -ENOMEM;
1336 		goto out;
1337 	}
1338 	rt6_ex->rt6i = nrt;
1339 	rt6_ex->stamp = jiffies;
1340 	atomic_inc(&nrt->rt6i_ref);
1341 	nrt->rt6i_node = ort->rt6i_node;
1342 	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
1343 	bucket->depth++;
1344 	net->ipv6.rt6_stats->fib_rt_cache++;
1345 
1346 	if (bucket->depth > FIB6_MAX_DEPTH)
1347 		rt6_exception_remove_oldest(bucket);
1348 
1349 out:
1350 	spin_unlock_bh(&rt6_exception_lock);
1351 
1352 	/* Update fn->fn_sernum to invalidate all cached dst */
1353 	if (!err) {
1354 		spin_lock_bh(&ort->rt6i_table->tb6_lock);
1355 		fib6_update_sernum(ort);
1356 		spin_unlock_bh(&ort->rt6i_table->tb6_lock);
1357 		fib6_force_start_gc(net);
1358 	}
1359 
1360 	return err;
1361 }
1362 
1363 void rt6_flush_exceptions(struct rt6_info *rt)
1364 {
1365 	struct rt6_exception_bucket *bucket;
1366 	struct rt6_exception *rt6_ex;
1367 	struct hlist_node *tmp;
1368 	int i;
1369 
1370 	spin_lock_bh(&rt6_exception_lock);
1371 	/* Prevent rt6_insert_exception() from recreating the bucket list */
1372 	rt->exception_bucket_flushed = 1;
1373 
1374 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1375 				    lockdep_is_held(&rt6_exception_lock));
1376 	if (!bucket)
1377 		goto out;
1378 
1379 	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1380 		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1381 			rt6_remove_exception(bucket, rt6_ex);
1382 		WARN_ON_ONCE(bucket->depth);
1383 		bucket++;
1384 	}
1385 
1386 out:
1387 	spin_unlock_bh(&rt6_exception_lock);
1388 }
1389 
1390 /* Find the cached rt in the hash table inside the passed-in rt.
1391  * Caller has to hold rcu_read_lock()
1392  */
1393 static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
1394 					   struct in6_addr *daddr,
1395 					   struct in6_addr *saddr)
1396 {
1397 	struct rt6_exception_bucket *bucket;
1398 	struct in6_addr *src_key = NULL;
1399 	struct rt6_exception *rt6_ex;
1400 	struct rt6_info *res = NULL;
1401 
1402 	bucket = rcu_dereference(rt->rt6i_exception_bucket);
1403 
1404 #ifdef CONFIG_IPV6_SUBTREES
1405 	/* rt6i_src.plen != 0 indicates rt is in subtree
1406 	 * and exception table is indexed by a hash of
1407 	 * both rt6i_dst and rt6i_src.
1408 	 * Otherwise, the exception table is indexed by
1409 	 * a hash of only rt6i_dst.
1410 	 */
1411 	if (rt->rt6i_src.plen)
1412 		src_key = saddr;
1413 #endif
1414 	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
1415 
1416 	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1417 		res = rt6_ex->rt6i;
1418 
1419 	return res;
1420 }
1421 
1422 /* Remove the passed in cached rt from the hash table that contains it */
1423 int rt6_remove_exception_rt(struct rt6_info *rt)
1424 {
1425 	struct rt6_exception_bucket *bucket;
1426 	struct rt6_info *from = rt->from;
1427 	struct in6_addr *src_key = NULL;
1428 	struct rt6_exception *rt6_ex;
1429 	int err;
1430 
1431 	if (!from ||
1432 	    !(rt->rt6i_flags & RTF_CACHE))
1433 		return -EINVAL;
1434 
1435 	if (!rcu_access_pointer(from->rt6i_exception_bucket))
1436 		return -ENOENT;
1437 
1438 	spin_lock_bh(&rt6_exception_lock);
1439 	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1440 				    lockdep_is_held(&rt6_exception_lock));
1441 #ifdef CONFIG_IPV6_SUBTREES
1442 	/* rt6i_src.plen != 0 indicates 'from' is in subtree
1443 	 * and exception table is indexed by a hash of
1444 	 * both rt6i_dst and rt6i_src.
1445 	 * Otherwise, the exception table is indexed by
1446 	 * a hash of only rt6i_dst.
1447 	 */
1448 	if (from->rt6i_src.plen)
1449 		src_key = &rt->rt6i_src.addr;
1450 #endif
1451 	rt6_ex = __rt6_find_exception_spinlock(&bucket,
1452 					       &rt->rt6i_dst.addr,
1453 					       src_key);
1454 	if (rt6_ex) {
1455 		rt6_remove_exception(bucket, rt6_ex);
1456 		err = 0;
1457 	} else {
1458 		err = -ENOENT;
1459 	}
1460 
1461 	spin_unlock_bh(&rt6_exception_lock);
1462 	return err;
1463 }
1464 
1465 /* Find rt6_ex which contains the passed in rt cache and
1466  * refresh its stamp
1467  */
1468 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1469 {
1470 	struct rt6_exception_bucket *bucket;
1471 	struct rt6_info *from = rt->from;
1472 	struct in6_addr *src_key = NULL;
1473 	struct rt6_exception *rt6_ex;
1474 
1475 	if (!from ||
1476 	    !(rt->rt6i_flags & RTF_CACHE))
1477 		return;
1478 
1479 	rcu_read_lock();
1480 	bucket = rcu_dereference(from->rt6i_exception_bucket);
1481 
1482 #ifdef CONFIG_IPV6_SUBTREES
1483 	/* rt6i_src.plen != 0 indicates 'from' is in subtree
1484 	 * and exception table is indexed by a hash of
1485 	 * both rt6i_dst and rt6i_src.
1486 	 * Otherwise, the exception table is indexed by
1487 	 * a hash of only rt6i_dst.
1488 	 */
1489 	if (from->rt6i_src.plen)
1490 		src_key = &rt->rt6i_src.addr;
1491 #endif
1492 	rt6_ex = __rt6_find_exception_rcu(&bucket,
1493 					  &rt->rt6i_dst.addr,
1494 					  src_key);
1495 	if (rt6_ex)
1496 		rt6_ex->stamp = jiffies;
1497 
1498 	rcu_read_unlock();
1499 }
1500 
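/* Zero the cached rt6i_prefsrc of every exception route hanging off
 * rt (see rt6_remove_prefsrc()). Caller must hold rt6_exception_lock.
 */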
1501 static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt)
1502 {
1503 	struct rt6_exception_bucket *bucket;
1504 	struct rt6_exception *rt6_ex;
1505 	int i;
1506 
1507 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1508 					lockdep_is_held(&rt6_exception_lock));
1509 
1510 	if (bucket) {
1511 		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1512 			hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1513 				rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
1514 			}
1515 			bucket++;
1516 		}
1517 	}
1518 }
1519 
1520 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1521 					 struct rt6_info *rt, int mtu)
1522 {
1523 	/* If the new MTU is lower than the route PMTU, this new MTU will be the
1524 	 * lowest MTU in the path: always allow updating the route PMTU to
1525 	 * reflect PMTU decreases.
1526 	 *
1527 	 * If the new MTU is higher, and the route PMTU is equal to the local
1528 	 * MTU, this means the old MTU is the lowest in the path, so allow
1529 	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
1530 	 * handle this.
1531 	 */
1532 
1533 	if (dst_mtu(&rt->dst) >= mtu)
1534 		return true;
1535 
1536 	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
1537 		return true;
1538 
1539 	return false;
1540 }
1541 
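/* Walk rt's exception routes and update their cached PMTU to the new
 * device mtu wherever rt6_mtu_change_route_allowed() permits it.
 * Caller must hold rt6_exception_lock.
 */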
1542 static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
1543 				       struct rt6_info *rt, int mtu)
1544 {
1545 	struct rt6_exception_bucket *bucket;
1546 	struct rt6_exception *rt6_ex;
1547 	int i;
1548 
1549 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1550 					lockdep_is_held(&rt6_exception_lock));
1551 
1552 	if (!bucket)
1553 		return;
1554 
1555 	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1556 		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1557 			struct rt6_info *entry = rt6_ex->rt6i;
1558 
1559 			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
1560 			 * route), the metrics of its rt->dst.from have already
1561 			 * been updated.
1562 			 */
1563 			if (entry->rt6i_pmtu &&
1564 			    rt6_mtu_change_route_allowed(idev, entry, mtu))
1565 				entry->rt6i_pmtu = mtu;
1566 		}
1567 		bucket++;
1568 	}
1569 }
1570 
1571 #define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)
1572 
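/* Remove all cached gateway exceptions under rt whose gateway equals
 * the given address.
 */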
1573 static void rt6_exceptions_clean_tohost(struct rt6_info *rt,
1574 					struct in6_addr *gateway)
1575 {
1576 	struct rt6_exception_bucket *bucket;
1577 	struct rt6_exception *rt6_ex;
1578 	struct hlist_node *tmp;
1579 	int i;
1580 
1581 	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1582 		return;
1583 
1584 	spin_lock_bh(&rt6_exception_lock);
1585 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1586 				     lockdep_is_held(&rt6_exception_lock));
1587 
1588 	if (bucket) {
1589 		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1590 			hlist_for_each_entry_safe(rt6_ex, tmp,
1591 						  &bucket->chain, hlist) {
1592 				struct rt6_info *entry = rt6_ex->rt6i;
1593 
1594 				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
1595 				    RTF_CACHE_GATEWAY &&
1596 				    ipv6_addr_equal(gateway,
1597 						    &entry->rt6i_gateway)) {
1598 					rt6_remove_exception(bucket, rt6_ex);
1599 				}
1600 			}
1601 			bucket++;
1602 		}
1603 	}
1604 
1605 	spin_unlock_bh(&rt6_exception_lock);
1606 }
1607 
1608 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
1609 				      struct rt6_exception *rt6_ex,
1610 				      struct fib6_gc_args *gc_args,
1611 				      unsigned long now)
1612 {
1613 	struct rt6_info *rt = rt6_ex->rt6i;
1614 
1615 	/* We are pruning and obsoleting aged-out and non-gateway exceptions
1616 	 * even if others still hold references to them, so that on the next
1617 	 * dst_check() such references can be dropped.
1618 	 * RTF_EXPIRES exceptions - e.g. pmtu-generated ones - are pruned when
1619 	 * expired, independently of their age, as per RFC 8201 section 4.
1620 	 */
1621 	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
1622 		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
1623 			RT6_TRACE("aging clone %p\n", rt);
1624 			rt6_remove_exception(bucket, rt6_ex);
1625 			return;
1626 		}
1627 	} else if (time_after(jiffies, rt->dst.expires)) {
1628 		RT6_TRACE("purging expired route %p\n", rt);
1629 		rt6_remove_exception(bucket, rt6_ex);
1630 		return;
1631 	}
1632 
1633 	if (rt->rt6i_flags & RTF_GATEWAY) {
1634 		struct neighbour *neigh;
1635 		__u8 neigh_flags = 0;
1636 
1637 		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
1638 		if (neigh)
1639 			neigh_flags = neigh->flags;
1640 
1641 		if (!(neigh_flags & NTF_ROUTER)) {
1642 			RT6_TRACE("purging route %p via non-router but gateway\n",
1643 				  rt);
1644 			rt6_remove_exception(bucket, rt6_ex);
1645 			return;
1646 		}
1647 	}
1648 
1649 	gc_args->more++;
1650 }
1651 
1652 void rt6_age_exceptions(struct rt6_info *rt,
1653 			struct fib6_gc_args *gc_args,
1654 			unsigned long now)
1655 {
1656 	struct rt6_exception_bucket *bucket;
1657 	struct rt6_exception *rt6_ex;
1658 	struct hlist_node *tmp;
1659 	int i;
1660 
1661 	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1662 		return;
1663 
1664 	rcu_read_lock_bh();
1665 	spin_lock(&rt6_exception_lock);
1666 	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1667 				    lockdep_is_held(&rt6_exception_lock));
1668 
1669 	if (bucket) {
1670 		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1671 			hlist_for_each_entry_safe(rt6_ex, tmp,
1672 						  &bucket->chain, hlist) {
1673 				rt6_age_examine_exception(bucket, rt6_ex,
1674 							  gc_args, now);
1675 			}
1676 			bucket++;
1677 		}
1678 	}
1679 	spin_unlock(&rt6_exception_lock);
1680 	rcu_read_unlock_bh();
1681 }
1682 
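/* Core policy-routing lookup: select a route starting at table's root,
 * backtracking and, if necessary, retrying without
 * RT6_LOOKUP_F_REACHABLE. The result is a cached exception route, the
 * route itself if it is RTF_CACHE, an uncached RTF_CACHE clone (for
 * FLOWI_FLAG_KNOWN_NH without a gateway), or a per-cpu copy.
 */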
1683 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1684 			       int oif, struct flowi6 *fl6,
1685 			       const struct sk_buff *skb, int flags)
1686 {
1687 	struct fib6_node *fn, *saved_fn;
1688 	struct rt6_info *rt, *rt_cache;
1689 	int strict = 0;
1690 
1691 	strict |= flags & RT6_LOOKUP_F_IFACE;
1692 	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1693 	if (net->ipv6.devconf_all->forwarding == 0)
1694 		strict |= RT6_LOOKUP_F_REACHABLE;
1695 
1696 	rcu_read_lock();
1697 
1698 	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1699 	saved_fn = fn;
1700 
1701 	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1702 		oif = 0;
1703 
1704 redo_rt6_select:
1705 	rt = rt6_select(net, fn, oif, strict);
1706 	if (rt->rt6i_nsiblings)
1707 		rt = rt6_multipath_select(net, rt, fl6, oif, skb, strict);
1708 	if (rt == net->ipv6.ip6_null_entry) {
1709 		fn = fib6_backtrack(fn, &fl6->saddr);
1710 		if (fn)
1711 			goto redo_rt6_select;
1712 		else if (strict & RT6_LOOKUP_F_REACHABLE) {
1713 			/* also consider unreachable route */
1714 			strict &= ~RT6_LOOKUP_F_REACHABLE;
1715 			fn = saved_fn;
1716 			goto redo_rt6_select;
1717 		}
1718 	}
1719 
1720 	/* Search through exception table */
1721 	rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
1722 	if (rt_cache)
1723 		rt = rt_cache;
1724 
1725 	if (rt == net->ipv6.ip6_null_entry) {
1726 		rcu_read_unlock();
1727 		dst_hold(&rt->dst);
1728 		trace_fib6_table_lookup(net, rt, table, fl6);
1729 		return rt;
1730 	} else if (rt->rt6i_flags & RTF_CACHE) {
1731 		if (ip6_hold_safe(net, &rt, true)) {
1732 			dst_use_noref(&rt->dst, jiffies);
1733 			rt6_dst_from_metrics_check(rt);
1734 		}
1735 		rcu_read_unlock();
1736 		trace_fib6_table_lookup(net, rt, table, fl6);
1737 		return rt;
1738 	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1739 			    !(rt->rt6i_flags & RTF_GATEWAY))) {
1740 		/* Create an RTF_CACHE clone which will not be
1741 		 * owned by the fib6 tree.  It is for the special case where
1742 		 * the daddr in the skb during the neighbor look-up is different
1743 		 * from the fl6->daddr used to look up the route here.
1744 		 */
1745 
1746 		struct rt6_info *uncached_rt;
1747 
1748 		if (ip6_hold_safe(net, &rt, true)) {
1749 			dst_use_noref(&rt->dst, jiffies);
1750 		} else {
1751 			rcu_read_unlock();
1752 			uncached_rt = rt;
1753 			goto uncached_rt_out;
1754 		}
1755 		rcu_read_unlock();
1756 
1757 		uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
1758 		dst_release(&rt->dst);
1759 
1760 		if (uncached_rt) {
1761 			/* uncached_rt's refcnt is taken during ip6_rt_cache_alloc(),
1762 			 * so there is no need for another dst_hold()
1763 			 */
1764 			rt6_uncached_list_add(uncached_rt);
1765 			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
1766 		} else {
1767 			uncached_rt = net->ipv6.ip6_null_entry;
1768 			dst_hold(&uncached_rt->dst);
1769 		}
1770 
1771 uncached_rt_out:
1772 		trace_fib6_table_lookup(net, uncached_rt, table, fl6);
1773 		return uncached_rt;
1774 
1775 	} else {
1776 		/* Get a percpu copy */
1777 
1778 		struct rt6_info *pcpu_rt;
1779 
1780 		dst_use_noref(&rt->dst, jiffies);
1781 		local_bh_disable();
1782 		pcpu_rt = rt6_get_pcpu_route(rt);
1783 
1784 		if (!pcpu_rt) {
1785 			/* atomic_inc_not_zero() is needed when using rcu */
1786 			if (atomic_inc_not_zero(&rt->rt6i_ref)) {
1787 				/* No dst_hold() on rt is needed because grabbing
1788 				 * rt->rt6i_ref makes sure rt can't be released.
1789 				 */
1790 				pcpu_rt = rt6_make_pcpu_route(rt);
1791 				rt6_release(rt);
1792 			} else {
1793 				/* rt is already removed from tree */
1794 				pcpu_rt = net->ipv6.ip6_null_entry;
1795 				dst_hold(&pcpu_rt->dst);
1796 			}
1797 		}
1798 		local_bh_enable();
1799 		rcu_read_unlock();
1800 		trace_fib6_table_lookup(net, pcpu_rt, table, fl6);
1801 		return pcpu_rt;
1802 	}
1803 }
1804 EXPORT_SYMBOL_GPL(ip6_pol_route);
1805 
1806 static struct rt6_info *ip6_pol_route_input(struct net *net,
1807 					    struct fib6_table *table,
1808 					    struct flowi6 *fl6,
1809 					    const struct sk_buff *skb,
1810 					    int flags)
1811 {
1812 	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
1813 }
1814 
1815 struct dst_entry *ip6_route_input_lookup(struct net *net,
1816 					 struct net_device *dev,
1817 					 struct flowi6 *fl6,
1818 					 const struct sk_buff *skb,
1819 					 int flags)
1820 {
1821 	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1822 		flags |= RT6_LOOKUP_F_IFACE;
1823 
1824 	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
1825 }
1826 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1827 
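/* Extract the L3 keys used for multipath hashing. For ICMPv6 errors
 * the keys are taken from the offending packet's inner IPv6 header, so
 * the error is hashed onto the same path as the flow that triggered
 * it; pre-dissected flow keys are ignored in that case.
 */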
1828 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1829 				  struct flow_keys *keys,
1830 				  struct flow_keys *flkeys)
1831 {
1832 	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1833 	const struct ipv6hdr *key_iph = outer_iph;
1834 	struct flow_keys *_flkeys = flkeys;
1835 	const struct ipv6hdr *inner_iph;
1836 	const struct icmp6hdr *icmph;
1837 	struct ipv6hdr _inner_iph;
1838 
1839 	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1840 		goto out;
1841 
1842 	icmph = icmp6_hdr(skb);
1843 	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1844 	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1845 	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1846 	    icmph->icmp6_type != ICMPV6_PARAMPROB)
1847 		goto out;
1848 
1849 	inner_iph = skb_header_pointer(skb,
1850 				       skb_transport_offset(skb) + sizeof(*icmph),
1851 				       sizeof(_inner_iph), &_inner_iph);
1852 	if (!inner_iph)
1853 		goto out;
1854 
1855 	key_iph = inner_iph;
1856 	_flkeys = NULL;
1857 out:
1858 	if (_flkeys) {
1859 		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
1860 		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
1861 		keys->tags.flow_label = _flkeys->tags.flow_label;
1862 		keys->basic.ip_proto = _flkeys->basic.ip_proto;
1863 	} else {
1864 		keys->addrs.v6addrs.src = key_iph->saddr;
1865 		keys->addrs.v6addrs.dst = key_iph->daddr;
1866 		keys->tags.flow_label = ip6_flowinfo(key_iph);
1867 		keys->basic.ip_proto = key_iph->nexthdr;
1868 	}
1869 }
1870 
1871 /* if skb is set it will be used and fl6 can be NULL */
1872 u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
1873 		       const struct sk_buff *skb, struct flow_keys *flkeys)
1874 {
1875 	struct flow_keys hash_keys;
1876 	u32 mhash;
1877 
1878 	switch (ip6_multipath_hash_policy(net)) {
1879 	case 0:
1880 		memset(&hash_keys, 0, sizeof(hash_keys));
1881 		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1882 		if (skb) {
1883 			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
1884 		} else {
1885 			hash_keys.addrs.v6addrs.src = fl6->saddr;
1886 			hash_keys.addrs.v6addrs.dst = fl6->daddr;
1887 			hash_keys.tags.flow_label = (__force u32)fl6->flowlabel;
1888 			hash_keys.basic.ip_proto = fl6->flowi6_proto;
1889 		}
1890 		break;
1891 	case 1:
1892 		if (skb) {
1893 			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1894 			struct flow_keys keys;
1895 
1896 			/* short-circuit if we already have L4 hash present */
1897 			if (skb->l4_hash)
1898 				return skb_get_hash_raw(skb) >> 1;
1899 
1900 			memset(&hash_keys, 0, sizeof(hash_keys));
1901 
1902 			if (!flkeys) {
1903 				skb_flow_dissect_flow_keys(skb, &keys, flag);
1904 				flkeys = &keys;
1905 			}
1906 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1907 			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
1908 			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
1909 			hash_keys.ports.src = flkeys->ports.src;
1910 			hash_keys.ports.dst = flkeys->ports.dst;
1911 			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
1912 		} else {
1913 			memset(&hash_keys, 0, sizeof(hash_keys));
1914 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1915 			hash_keys.addrs.v6addrs.src = fl6->saddr;
1916 			hash_keys.addrs.v6addrs.dst = fl6->daddr;
1917 			hash_keys.ports.src = fl6->fl6_sport;
1918 			hash_keys.ports.dst = fl6->fl6_dport;
1919 			hash_keys.basic.ip_proto = fl6->flowi6_proto;
1920 		}
1921 		break;
1922 	}
1923 	mhash = flow_hash_from_keys(&hash_keys);
1924 
1925 	return mhash >> 1;
1926 }
1927 
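/* Illustration: with hash policy 0 (the default) only L3 fields are
 * used, so e.g. all TCP connections between the same two hosts hash
 * identically and share one sibling route; policy 1 mixes in the L4
 * ports so individual connections can spread across siblings.  The
 * final ">> 1" keeps the result within the 31-bit range that the
 * per-nexthop upper bounds are computed in.
 */
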
1928 void ip6_route_input(struct sk_buff *skb)
1929 {
1930 	const struct ipv6hdr *iph = ipv6_hdr(skb);
1931 	struct net *net = dev_net(skb->dev);
1932 	int flags = RT6_LOOKUP_F_HAS_SADDR;
1933 	struct ip_tunnel_info *tun_info;
1934 	struct flowi6 fl6 = {
1935 		.flowi6_iif = skb->dev->ifindex,
1936 		.daddr = iph->daddr,
1937 		.saddr = iph->saddr,
1938 		.flowlabel = ip6_flowinfo(iph),
1939 		.flowi6_mark = skb->mark,
1940 		.flowi6_proto = iph->nexthdr,
1941 	};
1942 	struct flow_keys *flkeys = NULL, _flkeys;
1943 
1944 	tun_info = skb_tunnel_info(skb);
1945 	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1946 		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
1947 
1948 	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
1949 		flkeys = &_flkeys;
1950 
1951 	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
1952 		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
1953 	skb_dst_drop(skb);
1954 	skb_dst_set(skb,
1955 		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
1956 }
1957 
1958 static struct rt6_info *ip6_pol_route_output(struct net *net,
1959 					     struct fib6_table *table,
1960 					     struct flowi6 *fl6,
1961 					     const struct sk_buff *skb,
1962 					     int flags)
1963 {
1964 	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
1965 }
1966 
1967 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
1968 					 struct flowi6 *fl6, int flags)
1969 {
1970 	bool any_src;
1971 
1972 	if (rt6_need_strict(&fl6->daddr)) {
1973 		struct dst_entry *dst;
1974 
1975 		dst = l3mdev_link_scope_lookup(net, fl6);
1976 		if (dst)
1977 			return dst;
1978 	}
1979 
1980 	fl6->flowi6_iif = LOOPBACK_IFINDEX;
1981 
1982 	any_src = ipv6_addr_any(&fl6->saddr);
1983 	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
1984 	    (fl6->flowi6_oif && any_src))
1985 		flags |= RT6_LOOKUP_F_IFACE;
1986 
1987 	if (!any_src)
1988 		flags |= RT6_LOOKUP_F_HAS_SADDR;
1989 	else if (sk)
1990 		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1991 
1992 	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
1993 }
1994 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
1995 
1996 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
1997 {
1998 	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1999 	struct net_device *loopback_dev = net->loopback_dev;
2000 	struct dst_entry *new = NULL;
2001 
2002 	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
2003 		       DST_OBSOLETE_DEAD, 0);
2004 	if (rt) {
2005 		rt6_info_init(rt);
2006 		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
2007 
2008 		new = &rt->dst;
2009 		new->__use = 1;
2010 		new->input = dst_discard;
2011 		new->output = dst_discard_out;
2012 
2013 		dst_copy_metrics(new, &ort->dst);
2014 
2015 		rt->rt6i_idev = in6_dev_get(loopback_dev);
2016 		rt->rt6i_gateway = ort->rt6i_gateway;
2017 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
2018 		rt->rt6i_metric = 0;
2019 
2020 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
2021 #ifdef CONFIG_IPV6_SUBTREES
2022 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
2023 #endif
2024 	}
2025 
2026 	dst_release(dst_orig);
2027 	return new ? new : ERR_PTR(-ENOMEM);
2028 }
2029 
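/* A blackhole route is a minimal clone of dst_orig that keeps its
 * metrics and addressing but silently discards every packet.  The
 * xfrm code uses it when traffic must be consumed for a while, e.g.
 * while IPsec state resolution for the real route is still pending.
 */
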
2030 /*
2031  *	Destination cache support functions
2032  */
2033 
2034 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
2035 {
2036 	if (rt->from &&
2037 	    dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(&rt->from->dst))
2038 		dst_init_metrics(&rt->dst, dst_metrics_ptr(&rt->from->dst), true);
2039 }
2040 
2041 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
2042 {
2043 	u32 rt_cookie = 0;
2044 
2045 	if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie)
2046 		return NULL;
2047 
2048 	if (rt6_check_expired(rt))
2049 		return NULL;
2050 
2051 	return &rt->dst;
2052 }
2053 
2054 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
2055 {
2056 	if (!__rt6_check_expired(rt) &&
2057 	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2058 	    rt6_check(rt->from, cookie))
2059 		return &rt->dst;
2060 	else
2061 		return NULL;
2062 }
2063 
2064 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
2065 {
2066 	struct rt6_info *rt;
2067 
2068 	rt = (struct rt6_info *) dst;
2069 
2070 	/* All IPV6 dsts are created with ->obsolete set to the value
2071 	 * DST_OBSOLETE_FORCE_CHK, which always forces validation calls
2072 	 * down into this function.
2073 	 */
2074 
2075 	rt6_dst_from_metrics_check(rt);
2076 
2077 	if (rt->rt6i_flags & RTF_PCPU ||
2078 	    (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from))
2079 		return rt6_dst_from_check(rt, cookie);
2080 	else
2081 		return rt6_check(rt, cookie);
2082 }
2083 
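/* Example: a connected socket caches its dst together with the fib6
 * sernum cookie that was current at lookup time.  Once a table change
 * (or ip6_link_failure() setting fn_sernum to -1) bumps the sernum,
 * rt6_get_cookie_safe() no longer matches the stored cookie, this
 * check returns NULL and the caller has to relookup the route.
 */
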
2084 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
2085 {
2086 	struct rt6_info *rt = (struct rt6_info *) dst;
2087 
2088 	if (rt) {
2089 		if (rt->rt6i_flags & RTF_CACHE) {
2090 			if (rt6_check_expired(rt)) {
2091 				ip6_del_rt(rt);
2092 				dst = NULL;
2093 			}
2094 		} else {
2095 			dst_release(dst);
2096 			dst = NULL;
2097 		}
2098 	}
2099 	return dst;
2100 }
2101 
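/* Negative advice comes from sockets that suspect their cached route
 * has gone bad, e.g. after repeated retransmission timeouts: expired
 * RTF_CACHE exception routes are deleted outright, while any other
 * dst is simply released so the next transmission triggers a fresh
 * route lookup.
 */
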
2102 static void ip6_link_failure(struct sk_buff *skb)
2103 {
2104 	struct rt6_info *rt;
2105 
2106 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2107 
2108 	rt = (struct rt6_info *) skb_dst(skb);
2109 	if (rt) {
2110 		if (rt->rt6i_flags & RTF_CACHE) {
2111 			if (dst_hold_safe(&rt->dst))
2112 				ip6_del_rt(rt);
2113 		} else {
2114 			struct fib6_node *fn;
2115 
2116 			rcu_read_lock();
2117 			fn = rcu_dereference(rt->rt6i_node);
2118 			if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2119 				fn->fn_sernum = -1;
2120 			rcu_read_unlock();
2121 		}
2122 	}
2123 }
2124 
2125 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2126 {
2127 	struct net *net = dev_net(rt->dst.dev);
2128 
2129 	rt->rt6i_flags |= RTF_MODIFIED;
2130 	rt->rt6i_pmtu = mtu;
2131 	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2132 }
2133 
2134 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2135 {
2136 	return !(rt->rt6i_flags & RTF_CACHE) &&
2137 		(rt->rt6i_flags & RTF_PCPU ||
2138 		 rcu_access_pointer(rt->rt6i_node));
2139 }
2140 
2141 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2142 				 const struct ipv6hdr *iph, u32 mtu)
2143 {
2144 	const struct in6_addr *daddr, *saddr;
2145 	struct rt6_info *rt6 = (struct rt6_info *)dst;
2146 
2147 	if (rt6->rt6i_flags & RTF_LOCAL)
2148 		return;
2149 
2150 	if (dst_metric_locked(dst, RTAX_MTU))
2151 		return;
2152 
2153 	if (iph) {
2154 		daddr = &iph->daddr;
2155 		saddr = &iph->saddr;
2156 	} else if (sk) {
2157 		daddr = &sk->sk_v6_daddr;
2158 		saddr = &inet6_sk(sk)->saddr;
2159 	} else {
2160 		daddr = NULL;
2161 		saddr = NULL;
2162 	}
2163 	dst_confirm_neigh(dst, daddr);
2164 	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2165 	if (mtu >= dst_mtu(dst))
2166 		return;
2167 
2168 	if (!rt6_cache_allowed_for_pmtu(rt6)) {
2169 		rt6_do_update_pmtu(rt6, mtu);
2170 		/* update rt6_ex->stamp for cache */
2171 		if (rt6->rt6i_flags & RTF_CACHE)
2172 			rt6_update_exception_stamp_rt(rt6);
2173 	} else if (daddr) {
2174 		struct rt6_info *nrt6;
2175 
2176 		nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
2177 		if (nrt6) {
2178 			rt6_do_update_pmtu(nrt6, mtu);
2179 			if (rt6_insert_exception(nrt6, rt6))
2180 				dst_release_immediate(&nrt6->dst);
2181 		}
2182 	}
2183 }
2184 
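/* Example: a Packet Too Big reporting an MTU of 1280 for a flow that
 * resolved to a plain FIB route does not modify that route.  Instead
 * a host-specific clone is allocated with ip6_rt_cache_alloc(), given
 * the learned PMTU and an ip6_rt_mtu_expires lifetime, and inserted
 * into the exception table; RTF_CACHE routes, by contrast, are
 * updated in place.
 */
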
2185 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2186 			       struct sk_buff *skb, u32 mtu)
2187 {
2188 	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2189 }
2190 
2191 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2192 		     int oif, u32 mark, kuid_t uid)
2193 {
2194 	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2195 	struct dst_entry *dst;
2196 	struct flowi6 fl6;
2197 
2198 	memset(&fl6, 0, sizeof(fl6));
2199 	fl6.flowi6_oif = oif;
2200 	fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
2201 	fl6.daddr = iph->daddr;
2202 	fl6.saddr = iph->saddr;
2203 	fl6.flowlabel = ip6_flowinfo(iph);
2204 	fl6.flowi6_uid = uid;
2205 
2206 	dst = ip6_route_output(net, NULL, &fl6);
2207 	if (!dst->error)
2208 		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2209 	dst_release(dst);
2210 }
2211 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2212 
2213 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2214 {
2215 	struct dst_entry *dst;
2216 
2217 	ip6_update_pmtu(skb, sock_net(sk), mtu,
2218 			sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);
2219 
2220 	dst = __sk_dst_get(sk);
2221 	if (!dst || !dst->obsolete ||
2222 	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2223 		return;
2224 
2225 	bh_lock_sock(sk);
2226 	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2227 		ip6_datagram_dst_update(sk, false);
2228 	bh_unlock_sock(sk);
2229 }
2230 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2231 
2232 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
2233 			   const struct flowi6 *fl6)
2234 {
2235 #ifdef CONFIG_IPV6_SUBTREES
2236 	struct ipv6_pinfo *np = inet6_sk(sk);
2237 #endif
2238 
2239 	ip6_dst_store(sk, dst,
2240 		      ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
2241 		      &sk->sk_v6_daddr : NULL,
2242 #ifdef CONFIG_IPV6_SUBTREES
2243 		      ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
2244 		      &np->saddr :
2245 #endif
2246 		      NULL);
2247 }
2248 
2249 /* Handle redirects */
2250 struct ip6rd_flowi {
2251 	struct flowi6 fl6;
2252 	struct in6_addr gateway;
2253 };
2254 
2255 static struct rt6_info *__ip6_route_redirect(struct net *net,
2256 					     struct fib6_table *table,
2257 					     struct flowi6 *fl6,
2258 					     const struct sk_buff *skb,
2259 					     int flags)
2260 {
2261 	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2262 	struct rt6_info *rt, *rt_cache;
2263 	struct fib6_node *fn;
2264 
2265 	/* Get the "current" route for this destination and
2266 	 * check if the redirect has come from an appropriate router.
2267 	 *
2268 	 * RFC 4861 specifies that redirects should only be
2269 	 * accepted if they come from the nexthop to the target.
2270 	 * Due to the way the routes are chosen, this notion
2271 	 * is a bit fuzzy and one might need to check all possible
2272 	 * routes.
2273 	 */
2274 
2275 	rcu_read_lock();
2276 	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2277 restart:
2278 	for_each_fib6_node_rt_rcu(fn) {
2279 		if (rt->rt6i_nh_flags & RTNH_F_DEAD)
2280 			continue;
2281 		if (rt6_check_expired(rt))
2282 			continue;
2283 		if (rt->dst.error)
2284 			break;
2285 		if (!(rt->rt6i_flags & RTF_GATEWAY))
2286 			continue;
2287 		if (fl6->flowi6_oif != rt->dst.dev->ifindex)
2288 			continue;
2289 		/* rt_cache's gateway might be different from its 'parent'
2290 		 * in the case of an ip redirect.
2291 		 * So we keep searching in the exception table if the gateway
2292 		 * is different.
2293 		 */
2294 		if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) {
2295 			rt_cache = rt6_find_cached_rt(rt,
2296 						      &fl6->daddr,
2297 						      &fl6->saddr);
2298 			if (rt_cache &&
2299 			    ipv6_addr_equal(&rdfl->gateway,
2300 					    &rt_cache->rt6i_gateway)) {
2301 				rt = rt_cache;
2302 				break;
2303 			}
2304 			continue;
2305 		}
2306 		break;
2307 	}
2308 
2309 	if (!rt)
2310 		rt = net->ipv6.ip6_null_entry;
2311 	else if (rt->dst.error) {
2312 		rt = net->ipv6.ip6_null_entry;
2313 		goto out;
2314 	}
2315 
2316 	if (rt == net->ipv6.ip6_null_entry) {
2317 		fn = fib6_backtrack(fn, &fl6->saddr);
2318 		if (fn)
2319 			goto restart;
2320 	}
2321 
2322 out:
2323 	ip6_hold_safe(net, &rt, true);
2324 
2325 	rcu_read_unlock();
2326 
2327 	trace_fib6_table_lookup(net, rt, table, fl6);
2328 	return rt;
2329 }
2330 
2331 static struct dst_entry *ip6_route_redirect(struct net *net,
2332 					    const struct flowi6 *fl6,
2333 					    const struct sk_buff *skb,
2334 					    const struct in6_addr *gateway)
2335 {
2336 	int flags = RT6_LOOKUP_F_HAS_SADDR;
2337 	struct ip6rd_flowi rdfl;
2338 
2339 	rdfl.fl6 = *fl6;
2340 	rdfl.gateway = *gateway;
2341 
2342 	return fib6_rule_lookup(net, &rdfl.fl6, skb,
2343 				flags, __ip6_route_redirect);
2344 }
2345 
2346 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2347 		  kuid_t uid)
2348 {
2349 	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2350 	struct dst_entry *dst;
2351 	struct flowi6 fl6;
2352 
2353 	memset(&fl6, 0, sizeof(fl6));
2354 	fl6.flowi6_iif = LOOPBACK_IFINDEX;
2355 	fl6.flowi6_oif = oif;
2356 	fl6.flowi6_mark = mark;
2357 	fl6.daddr = iph->daddr;
2358 	fl6.saddr = iph->saddr;
2359 	fl6.flowlabel = ip6_flowinfo(iph);
2360 	fl6.flowi6_uid = uid;
2361 
2362 	dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2363 	rt6_do_redirect(dst, NULL, skb);
2364 	dst_release(dst);
2365 }
2366 EXPORT_SYMBOL_GPL(ip6_redirect);
2367 
2368 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2369 			    u32 mark)
2370 {
2371 	const struct ipv6hdr *iph = ipv6_hdr(skb);
2372 	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2373 	struct dst_entry *dst;
2374 	struct flowi6 fl6;
2375 
2376 	memset(&fl6, 0, sizeof(fl6));
2377 	fl6.flowi6_iif = LOOPBACK_IFINDEX;
2378 	fl6.flowi6_oif = oif;
2379 	fl6.flowi6_mark = mark;
2380 	fl6.daddr = msg->dest;
2381 	fl6.saddr = iph->daddr;
2382 	fl6.flowi6_uid = sock_net_uid(net, NULL);
2383 
2384 	dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2385 	rt6_do_redirect(dst, NULL, skb);
2386 	dst_release(dst);
2387 }
2388 
2389 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2390 {
2391 	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2392 		     sk->sk_uid);
2393 }
2394 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2395 
2396 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2397 {
2398 	struct net_device *dev = dst->dev;
2399 	unsigned int mtu = dst_mtu(dst);
2400 	struct net *net = dev_net(dev);
2401 
2402 	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2403 
2404 	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2405 		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2406 
2407 	/*
2408 	 * The maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and the
2409 	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2410 	 * IPV6_MAXPLEN itself is also a valid value and means: "any MSS,
2411 	 * rely only on pmtu discovery"
2412 	 */
2413 	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2414 		mtu = IPV6_MAXPLEN;
2415 	return mtu;
2416 }
2417 
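/* Worked example: an Ethernet-like device with a 1500 byte MTU yields
 * an advertised MSS of 1500 - 40 (IPv6 header) - 20 (base TCP header)
 * = 1440 bytes, subject to the ip6_rt_min_advmss floor and the
 * IPV6_MAXPLEN cap above.
 */
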
2418 static unsigned int ip6_mtu(const struct dst_entry *dst)
2419 {
2420 	const struct rt6_info *rt = (const struct rt6_info *)dst;
2421 	unsigned int mtu = rt->rt6i_pmtu;
2422 	struct inet6_dev *idev;
2423 
2424 	if (mtu)
2425 		goto out;
2426 
2427 	mtu = dst_metric_raw(dst, RTAX_MTU);
2428 	if (mtu)
2429 		goto out;
2430 
2431 	mtu = IPV6_MIN_MTU;
2432 
2433 	rcu_read_lock();
2434 	idev = __in6_dev_get(dst->dev);
2435 	if (idev)
2436 		mtu = idev->cnf.mtu6;
2437 	rcu_read_unlock();
2438 
2439 out:
2440 	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2441 
2442 	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2443 }
2444 
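/* The effective MTU is thus resolved in order of preference: a PMTU
 * learned from ICMPv6 Packet Too Big messages, then an explicit
 * RTAX_MTU metric (e.g. a route configured with "mtu 1400"), then the
 * device's IPv6 MTU, finally reduced by any lightweight-tunnel
 * encapsulation headroom.
 */
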
2445 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2446 				  struct flowi6 *fl6)
2447 {
2448 	struct dst_entry *dst;
2449 	struct rt6_info *rt;
2450 	struct inet6_dev *idev = in6_dev_get(dev);
2451 	struct net *net = dev_net(dev);
2452 
2453 	if (unlikely(!idev))
2454 		return ERR_PTR(-ENODEV);
2455 
2456 	rt = ip6_dst_alloc(net, dev, 0);
2457 	if (unlikely(!rt)) {
2458 		in6_dev_put(idev);
2459 		dst = ERR_PTR(-ENOMEM);
2460 		goto out;
2461 	}
2462 
2463 	rt->dst.flags |= DST_HOST;
2464 	rt->dst.input = ip6_input;
2465 	rt->dst.output  = ip6_output;
2466 	rt->rt6i_gateway  = fl6->daddr;
2467 	rt->rt6i_dst.addr = fl6->daddr;
2468 	rt->rt6i_dst.plen = 128;
2469 	rt->rt6i_idev     = idev;
2470 	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2471 
2472 	/* Add this dst into uncached_list so that rt6_disable_ip() can
2473 	 * properly release the net_device
2474 	 */
2475 	rt6_uncached_list_add(rt);
2476 	atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2477 
2478 	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2479 
2480 out:
2481 	return dst;
2482 }
2483 
2484 static int ip6_dst_gc(struct dst_ops *ops)
2485 {
2486 	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2487 	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2488 	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2489 	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2490 	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2491 	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2492 	int entries;
2493 
2494 	entries = dst_entries_get_fast(ops);
2495 	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2496 	    entries <= rt_max_size)
2497 		goto out;
2498 
2499 	net->ipv6.ip6_rt_gc_expire++;
2500 	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2501 	entries = dst_entries_get_slow(ops);
2502 	if (entries < ops->gc_thresh)
2503 		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2504 out:
2505 	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2506 	return entries > rt_max_size;
2507 }
2508 
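/* Note the feedback loop: ip6_rt_gc_expire grows by one on every
 * forced collection and shrinks again via the exponential decay
 * "expire -= expire >> elasticity"; e.g. with the default elasticity
 * of 9, roughly expire/512 is removed per call.  Sustained dst
 * pressure therefore ramps collection up, while quiet periods wind
 * it back down.
 */
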
2509 static int ip6_convert_metrics(struct mx6_config *mxc,
2510 			       const struct fib6_config *cfg)
2511 {
2512 	struct net *net = cfg->fc_nlinfo.nl_net;
2513 	bool ecn_ca = false;
2514 	struct nlattr *nla;
2515 	int remaining;
2516 	u32 *mp;
2517 
2518 	if (!cfg->fc_mx)
2519 		return 0;
2520 
2521 	mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
2522 	if (unlikely(!mp))
2523 		return -ENOMEM;
2524 
2525 	nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
2526 		int type = nla_type(nla);
2527 		u32 val;
2528 
2529 		if (!type)
2530 			continue;
2531 		if (unlikely(type > RTAX_MAX))
2532 			goto err;
2533 
2534 		if (type == RTAX_CC_ALGO) {
2535 			char tmp[TCP_CA_NAME_MAX];
2536 
2537 			nla_strlcpy(tmp, nla, sizeof(tmp));
2538 			val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca);
2539 			if (val == TCP_CA_UNSPEC)
2540 				goto err;
2541 		} else {
2542 			val = nla_get_u32(nla);
2543 		}
2544 		if (type == RTAX_HOPLIMIT && val > 255)
2545 			val = 255;
2546 		if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
2547 			goto err;
2548 
2549 		mp[type - 1] = val;
2550 		__set_bit(type - 1, mxc->mx_valid);
2551 	}
2552 
2553 	if (ecn_ca) {
2554 		__set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
2555 		mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
2556 	}
2557 
2558 	mxc->mx = mp;
2559 	return 0;
2560  err:
2561 	kfree(mp);
2562 	return -EINVAL;
2563 }
2564 
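/* Example: a netlink request equivalent to
 *
 *	ip -6 route add 2001:db8::/64 dev eth0 mtu 1400 hoplimit 64
 *
 * carries RTAX_MTU and RTAX_HOPLIMIT attributes in cfg->fc_mx; they
 * land in mp[RTAX_MTU - 1] and mp[RTAX_HOPLIMIT - 1] with matching
 * bits set in mxc->mx_valid for __ip6_ins_rt() to apply.
 */
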
2565 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2566 					    struct fib6_config *cfg,
2567 					    const struct in6_addr *gw_addr,
2568 					    u32 tbid, int flags)
2569 {
2570 	struct flowi6 fl6 = {
2571 		.flowi6_oif = cfg->fc_ifindex,
2572 		.daddr = *gw_addr,
2573 		.saddr = cfg->fc_prefsrc,
2574 	};
2575 	struct fib6_table *table;
2576 	struct rt6_info *rt;
2577 
2578 	table = fib6_get_table(net, tbid);
2579 	if (!table)
2580 		return NULL;
2581 
2582 	if (!ipv6_addr_any(&cfg->fc_prefsrc))
2583 		flags |= RT6_LOOKUP_F_HAS_SADDR;
2584 
2585 	flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2586 	rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2587 
2588 	/* if table lookup failed, fall back to full lookup */
2589 	if (rt == net->ipv6.ip6_null_entry) {
2590 		ip6_rt_put(rt);
2591 		rt = NULL;
2592 	}
2593 
2594 	return rt;
2595 }
2596 
2597 static int ip6_route_check_nh_onlink(struct net *net,
2598 				     struct fib6_config *cfg,
2599 				     const struct net_device *dev,
2600 				     struct netlink_ext_ack *extack)
2601 {
2602 	u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2603 	const struct in6_addr *gw_addr = &cfg->fc_gateway;
2604 	u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2605 	struct rt6_info *grt;
2606 	int err;
2607 
2608 	err = 0;
2609 	grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2610 	if (grt) {
2611 		if (!grt->dst.error &&
2612 		    (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2613 			NL_SET_ERR_MSG(extack,
2614 				       "Nexthop has invalid gateway or device mismatch");
2615 			err = -EINVAL;
2616 		}
2617 
2618 		ip6_rt_put(grt);
2619 	}
2620 
2621 	return err;
2622 }
2623 
2624 static int ip6_route_check_nh(struct net *net,
2625 			      struct fib6_config *cfg,
2626 			      struct net_device **_dev,
2627 			      struct inet6_dev **idev)
2628 {
2629 	const struct in6_addr *gw_addr = &cfg->fc_gateway;
2630 	struct net_device *dev = _dev ? *_dev : NULL;
2631 	struct rt6_info *grt = NULL;
2632 	int err = -EHOSTUNREACH;
2633 
2634 	if (cfg->fc_table) {
2635 		int flags = RT6_LOOKUP_F_IFACE;
2636 
2637 		grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2638 					  cfg->fc_table, flags);
2639 		if (grt) {
2640 			if (grt->rt6i_flags & RTF_GATEWAY ||
2641 			    (dev && dev != grt->dst.dev)) {
2642 				ip6_rt_put(grt);
2643 				grt = NULL;
2644 			}
2645 		}
2646 	}
2647 
2648 	if (!grt)
2649 		grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
2650 
2651 	if (!grt)
2652 		goto out;
2653 
2654 	if (dev) {
2655 		if (dev != grt->dst.dev) {
2656 			ip6_rt_put(grt);
2657 			goto out;
2658 		}
2659 	} else {
2660 		*_dev = dev = grt->dst.dev;
2661 		*idev = grt->rt6i_idev;
2662 		dev_hold(dev);
2663 		in6_dev_hold(grt->rt6i_idev);
2664 	}
2665 
2666 	if (!(grt->rt6i_flags & RTF_GATEWAY))
2667 		err = 0;
2668 
2669 	ip6_rt_put(grt);
2670 
2671 out:
2672 	return err;
2673 }
2674 
2675 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
2676 			   struct net_device **_dev, struct inet6_dev **idev,
2677 			   struct netlink_ext_ack *extack)
2678 {
2679 	const struct in6_addr *gw_addr = &cfg->fc_gateway;
2680 	int gwa_type = ipv6_addr_type(gw_addr);
2681 	bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
2682 	const struct net_device *dev = *_dev;
2683 	bool need_addr_check = !dev;
2684 	int err = -EINVAL;
2685 
2686 	/* If gw_addr is local we will fail to detect this in case the
2687 	 * address is still TENTATIVE (DAD in progress): rt6_lookup()
2688 	 * will return the already-added prefix route via the interface
2689 	 * that the prefix route was assigned to, which might be non-loopback.
2690 	 */
2691 	if (dev &&
2692 	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2693 		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2694 		goto out;
2695 	}
2696 
2697 	if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
2698 		/* IPv6 strictly inhibits using non-link-local
2699 		 * addresses as the nexthop address.
2700 		 * Otherwise, the router will not be able to send redirects.
2701 		 * It is very good, but in some (rare!) circumstances
2702 		 * (SIT, PtP, NBMA NOARP links) it is handy to allow
2703 		 * some exceptions. --ANK
2704 		 * We allow IPv4-mapped nexthops to support RFC4798-type
2705 		 * addressing
2706 		 */
2707 		if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
2708 			NL_SET_ERR_MSG(extack, "Invalid gateway address");
2709 			goto out;
2710 		}
2711 
2712 		if (cfg->fc_flags & RTNH_F_ONLINK)
2713 			err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
2714 		else
2715 			err = ip6_route_check_nh(net, cfg, _dev, idev);
2716 
2717 		if (err)
2718 			goto out;
2719 	}
2720 
2721 	/* reload in case device was changed */
2722 	dev = *_dev;
2723 
2724 	err = -EINVAL;
2725 	if (!dev) {
2726 		NL_SET_ERR_MSG(extack, "Egress device not specified");
2727 		goto out;
2728 	} else if (dev->flags & IFF_LOOPBACK) {
2729 		NL_SET_ERR_MSG(extack,
2730 			       "Egress device can not be loopback device for this route");
2731 		goto out;
2732 	}
2733 
2734 	/* if we did not check gw_addr above, do so now that the
2735 	 * egress device has been resolved.
2736 	 */
2737 	if (need_addr_check &&
2738 	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2739 		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2740 		goto out;
2741 	}
2742 
2743 	err = 0;
2744 out:
2745 	return err;
2746 }
2747 
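/* Example of the IPv4-mapped exception admitted above (RFC 4798 style
 * softwires): a route such as
 *
 *	ip -6 route add 2001:db8::/32 via ::ffff:192.0.2.1 dev eth0
 *
 * uses a non-link-local, IPv4-mapped gateway; it passes the
 * IPV6_ADDR_MAPPED check and is then validated through the regular
 * nexthop lookup (or the onlink variant) like any other gateway.
 */
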
2748 static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
2749 					      struct netlink_ext_ack *extack)
2750 {
2751 	struct net *net = cfg->fc_nlinfo.nl_net;
2752 	struct rt6_info *rt = NULL;
2753 	struct net_device *dev = NULL;
2754 	struct inet6_dev *idev = NULL;
2755 	struct fib6_table *table;
2756 	int addr_type;
2757 	int err = -EINVAL;
2758 
2759 	/* RTF_PCPU is an internal flag; it cannot be set by userspace */
2760 	if (cfg->fc_flags & RTF_PCPU) {
2761 		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2762 		goto out;
2763 	}
2764 
2765 	/* RTF_CACHE is an internal flag; it cannot be set by userspace */
2766 	if (cfg->fc_flags & RTF_CACHE) {
2767 		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
2768 		goto out;
2769 	}
2770 
2771 	if (cfg->fc_dst_len > 128) {
2772 		NL_SET_ERR_MSG(extack, "Invalid prefix length");
2773 		goto out;
2774 	}
2775 	if (cfg->fc_src_len > 128) {
2776 		NL_SET_ERR_MSG(extack, "Invalid source address length");
2777 		goto out;
2778 	}
2779 #ifndef CONFIG_IPV6_SUBTREES
2780 	if (cfg->fc_src_len) {
2781 		NL_SET_ERR_MSG(extack,
2782 			       "Specifying source address requires IPV6_SUBTREES to be enabled");
2783 		goto out;
2784 	}
2785 #endif
2786 	if (cfg->fc_ifindex) {
2787 		err = -ENODEV;
2788 		dev = dev_get_by_index(net, cfg->fc_ifindex);
2789 		if (!dev)
2790 			goto out;
2791 		idev = in6_dev_get(dev);
2792 		if (!idev)
2793 			goto out;
2794 	}
2795 
2796 	if (cfg->fc_metric == 0)
2797 		cfg->fc_metric = IP6_RT_PRIO_USER;
2798 
2799 	if (cfg->fc_flags & RTNH_F_ONLINK) {
2800 		if (!dev) {
2801 			NL_SET_ERR_MSG(extack,
2802 				       "Nexthop device required for onlink");
2803 			err = -ENODEV;
2804 			goto out;
2805 		}
2806 
2807 		if (!(dev->flags & IFF_UP)) {
2808 			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
2809 			err = -ENETDOWN;
2810 			goto out;
2811 		}
2812 	}
2813 
2814 	err = -ENOBUFS;
2815 	if (cfg->fc_nlinfo.nlh &&
2816 	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
2817 		table = fib6_get_table(net, cfg->fc_table);
2818 		if (!table) {
2819 			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
2820 			table = fib6_new_table(net, cfg->fc_table);
2821 		}
2822 	} else {
2823 		table = fib6_new_table(net, cfg->fc_table);
2824 	}
2825 
2826 	if (!table)
2827 		goto out;
2828 
2829 	rt = ip6_dst_alloc(net, NULL,
2830 			   (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
2831 
2832 	if (!rt) {
2833 		err = -ENOMEM;
2834 		goto out;
2835 	}
2836 
2837 	if (cfg->fc_flags & RTF_EXPIRES)
2838 		rt6_set_expires(rt, jiffies +
2839 				clock_t_to_jiffies(cfg->fc_expires));
2840 	else
2841 		rt6_clean_expires(rt);
2842 
2843 	if (cfg->fc_protocol == RTPROT_UNSPEC)
2844 		cfg->fc_protocol = RTPROT_BOOT;
2845 	rt->rt6i_protocol = cfg->fc_protocol;
2846 
2847 	addr_type = ipv6_addr_type(&cfg->fc_dst);
2848 
2849 	if (addr_type & IPV6_ADDR_MULTICAST)
2850 		rt->dst.input = ip6_mc_input;
2851 	else if (cfg->fc_flags & RTF_LOCAL)
2852 		rt->dst.input = ip6_input;
2853 	else
2854 		rt->dst.input = ip6_forward;
2855 
2856 	rt->dst.output = ip6_output;
2857 
2858 	if (cfg->fc_encap) {
2859 		struct lwtunnel_state *lwtstate;
2860 
2861 		err = lwtunnel_build_state(cfg->fc_encap_type,
2862 					   cfg->fc_encap, AF_INET6, cfg,
2863 					   &lwtstate, extack);
2864 		if (err)
2865 			goto out;
2866 		rt->dst.lwtstate = lwtstate_get(lwtstate);
2867 		lwtunnel_set_redirect(&rt->dst);
2868 	}
2869 
2870 	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
2871 	rt->rt6i_dst.plen = cfg->fc_dst_len;
2872 	if (rt->rt6i_dst.plen == 128)
2873 		rt->dst.flags |= DST_HOST;
2874 
2875 #ifdef CONFIG_IPV6_SUBTREES
2876 	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
2877 	rt->rt6i_src.plen = cfg->fc_src_len;
2878 #endif
2879 
2880 	rt->rt6i_metric = cfg->fc_metric;
2881 	rt->rt6i_nh_weight = 1;
2882 
2883 	/* We cannot add true routes via loopback here;
2884 	   they would result in kernel looping. Promote them to reject routes.
2885 	 */
2886 	if ((cfg->fc_flags & RTF_REJECT) ||
2887 	    (dev && (dev->flags & IFF_LOOPBACK) &&
2888 	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
2889 	     !(cfg->fc_flags & RTF_LOCAL))) {
2890 		/* hold loopback dev/idev if we haven't done so. */
2891 		if (dev != net->loopback_dev) {
2892 			if (dev) {
2893 				dev_put(dev);
2894 				in6_dev_put(idev);
2895 			}
2896 			dev = net->loopback_dev;
2897 			dev_hold(dev);
2898 			idev = in6_dev_get(dev);
2899 			if (!idev) {
2900 				err = -ENODEV;
2901 				goto out;
2902 			}
2903 		}
2904 		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
2905 		switch (cfg->fc_type) {
2906 		case RTN_BLACKHOLE:
2907 			rt->dst.error = -EINVAL;
2908 			rt->dst.output = dst_discard_out;
2909 			rt->dst.input = dst_discard;
2910 			break;
2911 		case RTN_PROHIBIT:
2912 			rt->dst.error = -EACCES;
2913 			rt->dst.output = ip6_pkt_prohibit_out;
2914 			rt->dst.input = ip6_pkt_prohibit;
2915 			break;
2916 		case RTN_THROW:
2917 		case RTN_UNREACHABLE:
2918 		default:
2919 			rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
2920 					: (cfg->fc_type == RTN_UNREACHABLE)
2921 					? -EHOSTUNREACH : -ENETUNREACH;
2922 			rt->dst.output = ip6_pkt_discard_out;
2923 			rt->dst.input = ip6_pkt_discard;
2924 			break;
2925 		}
2926 		goto install_route;
2927 	}
2928 
2929 	if (cfg->fc_flags & RTF_GATEWAY) {
2930 		err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
2931 		if (err)
2932 			goto out;
2933 
2934 		rt->rt6i_gateway = cfg->fc_gateway;
2935 	}
2936 
2937 	err = -ENODEV;
2938 	if (!dev)
2939 		goto out;
2940 
2941 	if (idev->cnf.disable_ipv6) {
2942 		NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
2943 		err = -EACCES;
2944 		goto out;
2945 	}
2946 
2947 	if (!(dev->flags & IFF_UP)) {
2948 		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
2949 		err = -ENETDOWN;
2950 		goto out;
2951 	}
2952 
2953 	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
2954 		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
2955 			NL_SET_ERR_MSG(extack, "Invalid source address");
2956 			err = -EINVAL;
2957 			goto out;
2958 		}
2959 		rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
2960 		rt->rt6i_prefsrc.plen = 128;
2961 	} else
2962 		rt->rt6i_prefsrc.plen = 0;
2963 
2964 	rt->rt6i_flags = cfg->fc_flags;
2965 
2966 install_route:
2967 	if (!(rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
2968 	    !netif_carrier_ok(dev))
2969 		rt->rt6i_nh_flags |= RTNH_F_LINKDOWN;
2970 	rt->rt6i_nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
2971 	rt->dst.dev = dev;
2972 	rt->rt6i_idev = idev;
2973 	rt->rt6i_table = table;
2974 
2975 	cfg->fc_nlinfo.nl_net = dev_net(dev);
2976 
2977 	return rt;
2978 out:
2979 	if (dev)
2980 		dev_put(dev);
2981 	if (idev)
2982 		in6_dev_put(idev);
2983 	if (rt)
2984 		dst_release_immediate(&rt->dst);
2985 
2986 	return ERR_PTR(err);
2987 }
2988 
2989 int ip6_route_add(struct fib6_config *cfg,
2990 		  struct netlink_ext_ack *extack)
2991 {
2992 	struct mx6_config mxc = { .mx = NULL, };
2993 	struct rt6_info *rt;
2994 	int err;
2995 
2996 	rt = ip6_route_info_create(cfg, extack);
2997 	if (IS_ERR(rt)) {
2998 		err = PTR_ERR(rt);
2999 		rt = NULL;
3000 		goto out;
3001 	}
3002 
3003 	err = ip6_convert_metrics(&mxc, cfg);
3004 	if (err)
3005 		goto out;
3006 
3007 	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack);
3008 
3009 	kfree(mxc.mx);
3010 
3011 	return err;
3012 out:
3013 	if (rt)
3014 		dst_release_immediate(&rt->dst);
3015 
3016 	return err;
3017 }
3018 
3019 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
3020 {
3021 	int err;
3022 	struct fib6_table *table;
3023 	struct net *net = dev_net(rt->dst.dev);
3024 
3025 	if (rt == net->ipv6.ip6_null_entry) {
3026 		err = -ENOENT;
3027 		goto out;
3028 	}
3029 
3030 	table = rt->rt6i_table;
3031 	spin_lock_bh(&table->tb6_lock);
3032 	err = fib6_del(rt, info);
3033 	spin_unlock_bh(&table->tb6_lock);
3034 
3035 out:
3036 	ip6_rt_put(rt);
3037 	return err;
3038 }
3039 
3040 int ip6_del_rt(struct rt6_info *rt)
3041 {
3042 	struct nl_info info = {
3043 		.nl_net = dev_net(rt->dst.dev),
3044 	};
3045 	return __ip6_del_rt(rt, &info);
3046 }
3047 
3048 static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
3049 {
3050 	struct nl_info *info = &cfg->fc_nlinfo;
3051 	struct net *net = info->nl_net;
3052 	struct sk_buff *skb = NULL;
3053 	struct fib6_table *table;
3054 	int err = -ENOENT;
3055 
3056 	if (rt == net->ipv6.ip6_null_entry)
3057 		goto out_put;
3058 	table = rt->rt6i_table;
3059 	spin_lock_bh(&table->tb6_lock);
3060 
3061 	if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
3062 		struct rt6_info *sibling, *next_sibling;
3063 
3064 		/* prefer to send a single notification with all hops */
3065 		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3066 		if (skb) {
3067 			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3068 
3069 			if (rt6_fill_node(net, skb, rt,
3070 					  NULL, NULL, 0, RTM_DELROUTE,
3071 					  info->portid, seq, 0) < 0) {
3072 				kfree_skb(skb);
3073 				skb = NULL;
3074 			} else
3075 				info->skip_notify = 1;
3076 		}
3077 
3078 		list_for_each_entry_safe(sibling, next_sibling,
3079 					 &rt->rt6i_siblings,
3080 					 rt6i_siblings) {
3081 			err = fib6_del(sibling, info);
3082 			if (err)
3083 				goto out_unlock;
3084 		}
3085 	}
3086 
3087 	err = fib6_del(rt, info);
3088 out_unlock:
3089 	spin_unlock_bh(&table->tb6_lock);
3090 out_put:
3091 	ip6_rt_put(rt);
3092 
3093 	if (skb) {
3094 		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3095 			    info->nlh, gfp_any());
3096 	}
3097 	return err;
3098 }
3099 
3100 static int ip6_route_del(struct fib6_config *cfg,
3101 			 struct netlink_ext_ack *extack)
3102 {
3103 	struct rt6_info *rt, *rt_cache;
3104 	struct fib6_table *table;
3105 	struct fib6_node *fn;
3106 	int err = -ESRCH;
3107 
3108 	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3109 	if (!table) {
3110 		NL_SET_ERR_MSG(extack, "FIB table does not exist");
3111 		return err;
3112 	}
3113 
3114 	rcu_read_lock();
3115 
3116 	fn = fib6_locate(&table->tb6_root,
3117 			 &cfg->fc_dst, cfg->fc_dst_len,
3118 			 &cfg->fc_src, cfg->fc_src_len,
3119 			 !(cfg->fc_flags & RTF_CACHE));
3120 
3121 	if (fn) {
3122 		for_each_fib6_node_rt_rcu(fn) {
3123 			if (cfg->fc_flags & RTF_CACHE) {
3124 				rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
3125 							      &cfg->fc_src);
3126 				if (!rt_cache)
3127 					continue;
3128 				rt = rt_cache;
3129 			}
3130 			if (cfg->fc_ifindex &&
3131 			    (!rt->dst.dev ||
3132 			     rt->dst.dev->ifindex != cfg->fc_ifindex))
3133 				continue;
3134 			if (cfg->fc_flags & RTF_GATEWAY &&
3135 			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3136 				continue;
3137 			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
3138 				continue;
3139 			if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
3140 				continue;
3141 			if (!dst_hold_safe(&rt->dst))
3142 				break;
3143 			rcu_read_unlock();
3144 
3145 			/* if a gateway was specified, delete only that one hop */
3146 			if (cfg->fc_flags & RTF_GATEWAY)
3147 				return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3148 
3149 			return __ip6_del_rt_siblings(rt, cfg);
3150 		}
3151 	}
3152 	rcu_read_unlock();
3153 
3154 	return err;
3155 }
3156 
3157 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3158 {
3159 	struct netevent_redirect netevent;
3160 	struct rt6_info *rt, *nrt = NULL;
3161 	struct ndisc_options ndopts;
3162 	struct inet6_dev *in6_dev;
3163 	struct neighbour *neigh;
3164 	struct rd_msg *msg;
3165 	int optlen, on_link;
3166 	u8 *lladdr;
3167 
3168 	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3169 	optlen -= sizeof(*msg);
3170 
3171 	if (optlen < 0) {
3172 		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3173 		return;
3174 	}
3175 
3176 	msg = (struct rd_msg *)icmp6_hdr(skb);
3177 
3178 	if (ipv6_addr_is_multicast(&msg->dest)) {
3179 		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3180 		return;
3181 	}
3182 
3183 	on_link = 0;
3184 	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3185 		on_link = 1;
3186 	} else if (ipv6_addr_type(&msg->target) !=
3187 		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3188 		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3189 		return;
3190 	}
3191 
3192 	in6_dev = __in6_dev_get(skb->dev);
3193 	if (!in6_dev)
3194 		return;
3195 	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3196 		return;
3197 
3198 	/* RFC2461 8.1:
3199 	 *	The IP source address of the Redirect MUST be the same as the current
3200 	 *	first-hop router for the specified ICMP Destination Address.
3201 	 */
3202 
3203 	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3204 		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3205 		return;
3206 	}
3207 
3208 	lladdr = NULL;
3209 	if (ndopts.nd_opts_tgt_lladdr) {
3210 		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3211 					     skb->dev);
3212 		if (!lladdr) {
3213 			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3214 			return;
3215 		}
3216 	}
3217 
3218 	rt = (struct rt6_info *) dst;
3219 	if (rt->rt6i_flags & RTF_REJECT) {
3220 		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3221 		return;
3222 	}
3223 
3224 	/* Redirect received -> path was valid.
3225 	 * Look, redirects are sent only in response to data packets,
3226 	 * so that this nexthop apparently is reachable. --ANK
3227 	 */
3228 	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3229 
3230 	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3231 	if (!neigh)
3232 		return;
3233 
3234 	/*
3235 	 *	We have finally decided to accept it.
3236 	 */
3237 
3238 	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3239 		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
3240 		     NEIGH_UPDATE_F_OVERRIDE|
3241 		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3242 				     NEIGH_UPDATE_F_ISROUTER)),
3243 		     NDISC_REDIRECT, &ndopts);
3244 
3245 	nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
3246 	if (!nrt)
3247 		goto out;
3248 
3249 	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3250 	if (on_link)
3251 		nrt->rt6i_flags &= ~RTF_GATEWAY;
3252 
3253 	nrt->rt6i_protocol = RTPROT_REDIRECT;
3254 	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3255 
3256 	/* No need to remove rt from the exception table if rt is
3257 	 * a cached route because rt6_insert_exception()
3258 	 * takes care of it
3259 	 */
3260 	if (rt6_insert_exception(nrt, rt)) {
3261 		dst_release_immediate(&nrt->dst);
3262 		goto out;
3263 	}
3264 
3265 	netevent.old = &rt->dst;
3266 	netevent.new = &nrt->dst;
3267 	netevent.daddr = &msg->dest;
3268 	netevent.neigh = neigh;
3269 	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3270 
3271 out:
3272 	neigh_release(neigh);
3273 }
3274 
3275 /*
3276  *	Misc support functions
3277  */
3278 
3279 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
3280 {
3281 	BUG_ON(from->from);
3282 
3283 	rt->rt6i_flags &= ~RTF_EXPIRES;
3284 	dst_hold(&from->dst);
3285 	rt->from = from;
3286 	dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
3287 }
3288 
3289 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
3290 {
3291 	rt->dst.input = ort->dst.input;
3292 	rt->dst.output = ort->dst.output;
3293 	rt->rt6i_dst = ort->rt6i_dst;
3294 	rt->dst.error = ort->dst.error;
3295 	rt->rt6i_idev = ort->rt6i_idev;
3296 	if (rt->rt6i_idev)
3297 		in6_dev_hold(rt->rt6i_idev);
3298 	rt->dst.lastuse = jiffies;
3299 	rt->rt6i_gateway = ort->rt6i_gateway;
3300 	rt->rt6i_flags = ort->rt6i_flags;
3301 	rt6_set_from(rt, ort);
3302 	rt->rt6i_metric = ort->rt6i_metric;
3303 #ifdef CONFIG_IPV6_SUBTREES
3304 	rt->rt6i_src = ort->rt6i_src;
3305 #endif
3306 	rt->rt6i_prefsrc = ort->rt6i_prefsrc;
3307 	rt->rt6i_table = ort->rt6i_table;
3308 	rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
3309 }
3310 
3311 #ifdef CONFIG_IPV6_ROUTE_INFO
3312 static struct rt6_info *rt6_get_route_info(struct net *net,
3313 					   const struct in6_addr *prefix, int prefixlen,
3314 					   const struct in6_addr *gwaddr,
3315 					   struct net_device *dev)
3316 {
3317 	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3318 	int ifindex = dev->ifindex;
3319 	struct fib6_node *fn;
3320 	struct rt6_info *rt = NULL;
3321 	struct fib6_table *table;
3322 
3323 	table = fib6_get_table(net, tb_id);
3324 	if (!table)
3325 		return NULL;
3326 
3327 	rcu_read_lock();
3328 	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3329 	if (!fn)
3330 		goto out;
3331 
3332 	for_each_fib6_node_rt_rcu(fn) {
3333 		if (rt->dst.dev->ifindex != ifindex)
3334 			continue;
3335 		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3336 			continue;
3337 		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
3338 			continue;
3339 		ip6_hold_safe(NULL, &rt, false);
3340 		break;
3341 	}
3342 out:
3343 	rcu_read_unlock();
3344 	return rt;
3345 }
3346 
3347 static struct rt6_info *rt6_add_route_info(struct net *net,
3348 					   const struct in6_addr *prefix, int prefixlen,
3349 					   const struct in6_addr *gwaddr,
3350 					   struct net_device *dev,
3351 					   unsigned int pref)
3352 {
3353 	struct fib6_config cfg = {
3354 		.fc_metric	= IP6_RT_PRIO_USER,
3355 		.fc_ifindex	= dev->ifindex,
3356 		.fc_dst_len	= prefixlen,
3357 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3358 				  RTF_UP | RTF_PREF(pref),
3359 		.fc_protocol = RTPROT_RA,
3360 		.fc_nlinfo.portid = 0,
3361 		.fc_nlinfo.nlh = NULL,
3362 		.fc_nlinfo.nl_net = net,
3363 	};
3364 
3365 	cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3366 	cfg.fc_dst = *prefix;
3367 	cfg.fc_gateway = *gwaddr;
3368 
3369 	/* We should treat it as a default route if prefix length is 0. */
3370 	if (!prefixlen)
3371 		cfg.fc_flags |= RTF_DEFAULT;
3372 
3373 	ip6_route_add(&cfg, NULL);
3374 
3375 	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3376 }
3377 #endif
3378 
3379 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
3380 {
3381 	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3382 	struct rt6_info *rt;
3383 	struct fib6_table *table;
3384 
3385 	table = fib6_get_table(dev_net(dev), tb_id);
3386 	if (!table)
3387 		return NULL;
3388 
3389 	rcu_read_lock();
3390 	for_each_fib6_node_rt_rcu(&table->tb6_root) {
3391 		if (dev == rt->dst.dev &&
3392 		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3393 		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
3394 			break;
3395 	}
3396 	if (rt)
3397 		ip6_hold_safe(NULL, &rt, false);
3398 	rcu_read_unlock();
3399 	return rt;
3400 }
3401 
3402 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
3403 				     struct net_device *dev,
3404 				     unsigned int pref)
3405 {
3406 	struct fib6_config cfg = {
3407 		.fc_table	= l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3408 		.fc_metric	= IP6_RT_PRIO_USER,
3409 		.fc_ifindex	= dev->ifindex,
3410 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3411 				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3412 		.fc_protocol = RTPROT_RA,
3413 		.fc_nlinfo.portid = 0,
3414 		.fc_nlinfo.nlh = NULL,
3415 		.fc_nlinfo.nl_net = dev_net(dev),
3416 	};
3417 
3418 	cfg.fc_gateway = *gwaddr;
3419 
3420 	if (!ip6_route_add(&cfg, NULL)) {
3421 		struct fib6_table *table;
3422 
3423 		table = fib6_get_table(dev_net(dev), cfg.fc_table);
3424 		if (table)
3425 			table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3426 	}
3427 
3428 	return rt6_get_dflt_router(gwaddr, dev);
3429 }
3430 
3431 static void __rt6_purge_dflt_routers(struct fib6_table *table)
3432 {
3433 	struct rt6_info *rt;
3434 
3435 restart:
3436 	rcu_read_lock();
3437 	for_each_fib6_node_rt_rcu(&table->tb6_root) {
3438 		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3439 		    (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
3440 			if (dst_hold_safe(&rt->dst)) {
3441 				rcu_read_unlock();
3442 				ip6_del_rt(rt);
3443 			} else {
3444 				rcu_read_unlock();
3445 			}
3446 			goto restart;
3447 		}
3448 	}
3449 	rcu_read_unlock();
3450 
3451 	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3452 }
3453 
3454 void rt6_purge_dflt_routers(struct net *net)
3455 {
3456 	struct fib6_table *table;
3457 	struct hlist_head *head;
3458 	unsigned int h;
3459 
3460 	rcu_read_lock();
3461 
3462 	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3463 		head = &net->ipv6.fib_table_hash[h];
3464 		hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3465 			if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3466 				__rt6_purge_dflt_routers(table);
3467 		}
3468 	}
3469 
3470 	rcu_read_unlock();
3471 }
3472 
3473 static void rtmsg_to_fib6_config(struct net *net,
3474 				 struct in6_rtmsg *rtmsg,
3475 				 struct fib6_config *cfg)
3476 {
3477 	memset(cfg, 0, sizeof(*cfg));
3478 
3479 	cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3480 			 : RT6_TABLE_MAIN;
3481 	cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3482 	cfg->fc_metric = rtmsg->rtmsg_metric;
3483 	cfg->fc_expires = rtmsg->rtmsg_info;
3484 	cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3485 	cfg->fc_src_len = rtmsg->rtmsg_src_len;
3486 	cfg->fc_flags = rtmsg->rtmsg_flags;
3487 
3488 	cfg->fc_nlinfo.nl_net = net;
3489 
3490 	cfg->fc_dst = rtmsg->rtmsg_dst;
3491 	cfg->fc_src = rtmsg->rtmsg_src;
3492 	cfg->fc_gateway = rtmsg->rtmsg_gateway;
3493 }
3494 
3495 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3496 {
3497 	struct fib6_config cfg;
3498 	struct in6_rtmsg rtmsg;
3499 	int err;
3500 
3501 	switch (cmd) {
3502 	case SIOCADDRT:		/* Add a route */
3503 	case SIOCDELRT:		/* Delete a route */
3504 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3505 			return -EPERM;
3506 		err = copy_from_user(&rtmsg, arg,
3507 				     sizeof(struct in6_rtmsg));
3508 		if (err)
3509 			return -EFAULT;
3510 
3511 		rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3512 
3513 		rtnl_lock();
3514 		switch (cmd) {
3515 		case SIOCADDRT:
3516 			err = ip6_route_add(&cfg, NULL);
3517 			break;
3518 		case SIOCDELRT:
3519 			err = ip6_route_del(&cfg, NULL);
3520 			break;
3521 		default:
3522 			err = -EINVAL;
3523 		}
3524 		rtnl_unlock();
3525 
3526 		return err;
3527 	}
3528 
3529 	return -EINVAL;
3530 }
3531 
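/* Example of the legacy interface handled above: a route(8)-style
 * caller fills a struct in6_rtmsg and issues the ioctl on an AF_INET6
 * socket, e.g.
 *
 *	struct in6_rtmsg rtmsg = { 0 };
 *	rtmsg.rtmsg_dst_len = 64;
 *	rtmsg.rtmsg_ifindex = if_nametoindex("eth0");
 *	rtmsg.rtmsg_flags = RTF_UP;
 *	ioctl(fd, SIOCADDRT, &rtmsg);
 *
 * rtmsg_to_fib6_config() translates this into the same fib6_config
 * that the netlink path builds before calling ip6_route_add().
 */
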
3532 /*
3533  *	Drop the packet on the floor
3534  */
3535 
3536 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3537 {
3538 	int type;
3539 	struct dst_entry *dst = skb_dst(skb);
3540 	switch (ipstats_mib_noroutes) {
3541 	case IPSTATS_MIB_INNOROUTES:
3542 		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3543 		if (type == IPV6_ADDR_ANY) {
3544 			IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3545 				      IPSTATS_MIB_INADDRERRORS);
3546 			break;
3547 		}
3548 		/* FALLTHROUGH */
3549 	case IPSTATS_MIB_OUTNOROUTES:
3550 		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3551 			      ipstats_mib_noroutes);
3552 		break;
3553 	}
3554 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3555 	kfree_skb(skb);
3556 	return 0;
3557 }
3558 
3559 static int ip6_pkt_discard(struct sk_buff *skb)
3560 {
3561 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3562 }
3563 
3564 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3565 {
3566 	skb->dev = skb_dst(skb)->dev;
3567 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3568 }
3569 
3570 static int ip6_pkt_prohibit(struct sk_buff *skb)
3571 {
3572 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3573 }
3574 
3575 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3576 {
3577 	skb->dev = skb_dst(skb)->dev;
3578 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3579 }
3580 
3581 /*
3582  *	Allocate a dst for local (unicast / anycast) address.
3583  */
3584 
3585 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
3586 				    const struct in6_addr *addr,
3587 				    bool anycast)
3588 {
3589 	u32 tb_id;
3590 	struct net *net = dev_net(idev->dev);
3591 	struct net_device *dev = idev->dev;
3592 	struct rt6_info *rt;
3593 
3594 	rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
3595 	if (!rt)
3596 		return ERR_PTR(-ENOMEM);
3597 
3598 	in6_dev_hold(idev);
3599 
3600 	rt->dst.flags |= DST_HOST;
3601 	rt->dst.input = ip6_input;
3602 	rt->dst.output = ip6_output;
3603 	rt->rt6i_idev = idev;
3604 
3605 	rt->rt6i_protocol = RTPROT_KERNEL;
3606 	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
3607 	if (anycast)
3608 		rt->rt6i_flags |= RTF_ANYCAST;
3609 	else
3610 		rt->rt6i_flags |= RTF_LOCAL;
3611 
3612 	rt->rt6i_gateway  = *addr;
3613 	rt->rt6i_dst.addr = *addr;
3614 	rt->rt6i_dst.plen = 128;
3615 	tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3616 	rt->rt6i_table = fib6_get_table(net, tb_id);
3617 
3618 	return rt;
3619 }
3620 
3621 /* remove deleted ip from prefsrc entries */
3622 struct arg_dev_net_ip {
3623 	struct net_device *dev;
3624 	struct net *net;
3625 	struct in6_addr *addr;
3626 };
3627 
3628 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
3629 {
3630 	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3631 	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3632 	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3633 
3634 	if (((void *)rt->dst.dev == dev || !dev) &&
3635 	    rt != net->ipv6.ip6_null_entry &&
3636 	    ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
3637 		spin_lock_bh(&rt6_exception_lock);
3638 		/* remove prefsrc entry */
3639 		rt->rt6i_prefsrc.plen = 0;
3640 		/* need to update cache as well */
3641 		rt6_exceptions_remove_prefsrc(rt);
3642 		spin_unlock_bh(&rt6_exception_lock);
3643 	}
3644 	return 0;
3645 }
3646 
3647 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3648 {
3649 	struct net *net = dev_net(ifp->idev->dev);
3650 	struct arg_dev_net_ip adni = {
3651 		.dev = ifp->idev->dev,
3652 		.net = net,
3653 		.addr = &ifp->addr,
3654 	};
3655 	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3656 }
3657 
3658 #define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3659 
3660 /* Remove routers and update dst entries when a gateway turns into a host. */
3661 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
3662 {
3663 	struct in6_addr *gateway = (struct in6_addr *)arg;
3664 
3665 	if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3666 	    ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
3667 		return -1;
3668 	}
3669 
3670 	/* Further clean up cached routes in exception table.
3671 	 * This is needed because a cached route may have a different
3672 	 * gateway than its 'parent' in the case of an ip redirect.
3673 	 */
3674 	rt6_exceptions_clean_tohost(rt, gateway);
3675 
3676 	return 0;
3677 }
3678 
3679 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3680 {
3681 	fib6_clean_all(net, fib6_clean_tohost, gateway);
3682 }
3683 
3684 struct arg_netdev_event {
3685 	const struct net_device *dev;
3686 	union {
3687 		unsigned int nh_flags;
3688 		unsigned long event;
3689 	};
3690 };
3691 
3692 static struct rt6_info *rt6_multipath_first_sibling(const struct rt6_info *rt)
3693 {
3694 	struct rt6_info *iter;
3695 	struct fib6_node *fn;
3696 
3697 	fn = rcu_dereference_protected(rt->rt6i_node,
3698 			lockdep_is_held(&rt->rt6i_table->tb6_lock));
3699 	iter = rcu_dereference_protected(fn->leaf,
3700 			lockdep_is_held(&rt->rt6i_table->tb6_lock));
3701 	while (iter) {
3702 		if (iter->rt6i_metric == rt->rt6i_metric &&
3703 		    rt6_qualify_for_ecmp(iter))
3704 			return iter;
3705 		iter = rcu_dereference_protected(iter->rt6_next,
3706 				lockdep_is_held(&rt->rt6i_table->tb6_lock));
3707 	}
3708 
3709 	return NULL;
3710 }
3711 
3712 static bool rt6_is_dead(const struct rt6_info *rt)
3713 {
3714 	if (rt->rt6i_nh_flags & RTNH_F_DEAD ||
3715 	    (rt->rt6i_nh_flags & RTNH_F_LINKDOWN &&
3716 	     rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
3717 		return true;
3718 
3719 	return false;
3720 }
3721 
3722 static int rt6_multipath_total_weight(const struct rt6_info *rt)
3723 {
3724 	struct rt6_info *iter;
3725 	int total = 0;
3726 
3727 	if (!rt6_is_dead(rt))
3728 		total += rt->rt6i_nh_weight;
3729 
3730 	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) {
3731 		if (!rt6_is_dead(iter))
3732 			total += iter->rt6i_nh_weight;
3733 	}
3734 
3735 	return total;
3736 }
3737 
3738 static void rt6_upper_bound_set(struct rt6_info *rt, int *weight, int total)
3739 {
3740 	int upper_bound = -1;
3741 
3742 	if (!rt6_is_dead(rt)) {
3743 		*weight += rt->rt6i_nh_weight;
3744 		upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3745 						    total) - 1;
3746 	}
3747 	atomic_set(&rt->rt6i_nh_upper_bound, upper_bound);
3748 }
3749 
3750 static void rt6_multipath_upper_bound_set(struct rt6_info *rt, int total)
3751 {
3752 	struct rt6_info *iter;
3753 	int weight = 0;
3754 
3755 	rt6_upper_bound_set(rt, &weight, total);
3756 
3757 	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3758 		rt6_upper_bound_set(iter, &weight, total);
3759 }
3760 
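/* Worked example: two siblings with weights 1 and 2 give total = 3.
 * The running bounds become roughly (1 << 31) / 3 - 1 for the first
 * nexthop and (3 << 31) / 3 - 1 = 2^31 - 1 for the second, so a
 * 31-bit flow hash selects the first with probability ~1/3 and the
 * second with ~2/3.  Dead nexthops keep an upper bound of -1 and are
 * never selected.
 */
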
3761 void rt6_multipath_rebalance(struct rt6_info *rt)
3762 {
3763 	struct rt6_info *first;
3764 	int total;
3765 
3766 	/* If the entire multipath route was marked for flushing,
3767 	 * there is no need to rebalance upon the removal of every
3768 	 * sibling route.
3769 	 */
3770 	if (!rt->rt6i_nsiblings || rt->should_flush)
3771 		return;
3772 
3773 	/* During lookup routes are evaluated in order, so we need to
3774 	 * make sure upper bounds are assigned from the first sibling
3775 	 * onwards.
3776 	 */
3777 	first = rt6_multipath_first_sibling(rt);
3778 	if (WARN_ON_ONCE(!first))
3779 		return;
3780 
3781 	total = rt6_multipath_total_weight(first);
3782 	rt6_multipath_upper_bound_set(first, total);
3783 }
3784 
3785 static int fib6_ifup(struct rt6_info *rt, void *p_arg)
3786 {
3787 	const struct arg_netdev_event *arg = p_arg;
3788 	const struct net *net = dev_net(arg->dev);
3789 
3790 	if (rt != net->ipv6.ip6_null_entry && rt->dst.dev == arg->dev) {
3791 		rt->rt6i_nh_flags &= ~arg->nh_flags;
3792 		fib6_update_sernum_upto_root(dev_net(rt->dst.dev), rt);
3793 		rt6_multipath_rebalance(rt);
3794 	}
3795 
3796 	return 0;
3797 }
3798 
3799 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
3800 {
3801 	struct arg_netdev_event arg = {
3802 		.dev = dev,
3803 		{
3804 			.nh_flags = nh_flags,
3805 		},
3806 	};
3807 
3808 	if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
3809 		arg.nh_flags |= RTNH_F_LINKDOWN;
3810 
3811 	fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
3812 }
3813 
3814 static bool rt6_multipath_uses_dev(const struct rt6_info *rt,
3815 				   const struct net_device *dev)
3816 {
3817 	struct rt6_info *iter;
3818 
3819 	if (rt->dst.dev == dev)
3820 		return true;
3821 	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3822 		if (iter->dst.dev == dev)
3823 			return true;
3824 
3825 	return false;
3826 }
3827 
3828 static void rt6_multipath_flush(struct rt6_info *rt)
3829 {
3830 	struct rt6_info *iter;
3831 
3832 	rt->should_flush = 1;
3833 	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3834 		iter->should_flush = 1;
3835 }
3836 
3837 static unsigned int rt6_multipath_dead_count(const struct rt6_info *rt,
3838 					     const struct net_device *down_dev)
3839 {
3840 	struct rt6_info *iter;
3841 	unsigned int dead = 0;
3842 
3843 	if (rt->dst.dev == down_dev || rt->rt6i_nh_flags & RTNH_F_DEAD)
3844 		dead++;
3845 	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3846 		if (iter->dst.dev == down_dev ||
3847 		    iter->rt6i_nh_flags & RTNH_F_DEAD)
3848 			dead++;
3849 
3850 	return dead;
3851 }
3852 
3853 static void rt6_multipath_nh_flags_set(struct rt6_info *rt,
3854 				       const struct net_device *dev,
3855 				       unsigned int nh_flags)
3856 {
3857 	struct rt6_info *iter;
3858 
3859 	if (rt->dst.dev == dev)
3860 		rt->rt6i_nh_flags |= nh_flags;
3861 	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3862 		if (iter->dst.dev == dev)
3863 			iter->rt6i_nh_flags |= nh_flags;
3864 }
3865 
3866 /* called with write lock held for table with rt */
3867 static int fib6_ifdown(struct rt6_info *rt, void *p_arg)
3868 {
3869 	const struct arg_netdev_event *arg = p_arg;
3870 	const struct net_device *dev = arg->dev;
3871 	const struct net *net = dev_net(dev);
3872 
3873 	if (rt == net->ipv6.ip6_null_entry)
3874 		return 0;
3875 
3876 	switch (arg->event) {
3877 	case NETDEV_UNREGISTER:
3878 		return rt->dst.dev == dev ? -1 : 0;
3879 	case NETDEV_DOWN:
3880 		if (rt->should_flush)
3881 			return -1;
3882 		if (!rt->rt6i_nsiblings)
3883 			return rt->dst.dev == dev ? -1 : 0;
3884 		if (rt6_multipath_uses_dev(rt, dev)) {
3885 			unsigned int count;
3886 
3887 			count = rt6_multipath_dead_count(rt, dev);
3888 			if (rt->rt6i_nsiblings + 1 == count) {
3889 				rt6_multipath_flush(rt);
3890 				return -1;
3891 			}
3892 			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
3893 						   RTNH_F_LINKDOWN);
3894 			fib6_update_sernum(rt);
3895 			rt6_multipath_rebalance(rt);
3896 		}
3897 		return -2;
3898 	case NETDEV_CHANGE:
3899 		if (rt->dst.dev != dev ||
3900 		    rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST))
3901 			break;
3902 		rt->rt6i_nh_flags |= RTNH_F_LINKDOWN;
3903 		rt6_multipath_rebalance(rt);
3904 		break;
3905 	}
3906 
3907 	return 0;
3908 }
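
/* Editor's note -- a hedged summary of the callback contract above
 * (see fib6_clean_node() for the authoritative handling): returning 0
 * keeps the route, -1 asks the fib walker to delete it, and -2 is the
 * multipath case where the nexthop flags were updated in place and the
 * remaining siblings can be skipped.  Note that rt6i_nsiblings counts
 * the *other* members of the group, so rt->rt6i_nsiblings + 1 is the
 * total number of nexthops; only when the dead count reaches that
 * total is the whole route flushed.
 */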
3909 
3910 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
3911 {
3912 	struct arg_netdev_event arg = {
3913 		.dev = dev,
3914 		{
3915 			.event = event,
3916 		},
3917 	};
3918 
3919 	fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
3920 }
3921 
3922 void rt6_disable_ip(struct net_device *dev, unsigned long event)
3923 {
3924 	rt6_sync_down_dev(dev, event);
3925 	rt6_uncached_list_flush_dev(dev_net(dev), dev);
3926 	neigh_ifdown(&nd_tbl, dev);
3927 }
3928 
3929 struct rt6_mtu_change_arg {
3930 	struct net_device *dev;
3931 	unsigned int mtu;
3932 };
3933 
3934 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
3935 {
3936 	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
3937 	struct inet6_dev *idev;
3938 
3939 	/* In IPv6, PMTU discovery is not optional,
3940 	   so locking RTAX_MTU cannot disable it.
3941 	   We still use this lock to block changes
3942 	   caused by addrconf/ndisc.
3943 	*/
3944 
3945 	idev = __in6_dev_get(arg->dev);
3946 	if (!idev)
3947 		return 0;
3948 
3949 	/* When the device MTU is increased administratively, there is
3950 	   no way to discover an IPv6 PMTU increase, so the PMTU must be
3951 	   raised here.  RFC 1981 does not cover administrative MTU
3952 	   increases, so updating on increase is a MUST (e.g. jumbo frames).
3953 	 */
3954 	if (rt->dst.dev == arg->dev &&
3955 	    !dst_metric_locked(&rt->dst, RTAX_MTU)) {
3956 		spin_lock_bh(&rt6_exception_lock);
3957 		if (dst_metric_raw(&rt->dst, RTAX_MTU) &&
3958 		    rt6_mtu_change_route_allowed(idev, rt, arg->mtu))
3959 			dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
3960 		rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
3961 		spin_unlock_bh(&rt6_exception_lock);
3962 	}
3963 	return 0;
3964 }
3965 
3966 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
3967 {
3968 	struct rt6_mtu_change_arg arg = {
3969 		.dev = dev,
3970 		.mtu = mtu,
3971 	};
3972 
3973 	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
3974 }
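
/* Editor's note -- usage sketch.  This walker is driven from the
 * addrconf NETDEV_CHANGEMTU handling, so an administrative change
 * such as (illustrative values)
 *
 *	ip link set dev eth0 mtu 9000
 *
 * ends up in rt6_mtu_change(dev, 9000), which rewrites RTAX_MTU on
 * every unlocked route over the device and lets
 * rt6_exceptions_update_pmtu() adjust the cached PMTU exceptions.
 */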
3975 
3976 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
3977 	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
3978 	[RTA_PREFSRC]		= { .len = sizeof(struct in6_addr) },
3979 	[RTA_OIF]               = { .type = NLA_U32 },
3980 	[RTA_IIF]		= { .type = NLA_U32 },
3981 	[RTA_PRIORITY]          = { .type = NLA_U32 },
3982 	[RTA_METRICS]           = { .type = NLA_NESTED },
3983 	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
3984 	[RTA_PREF]              = { .type = NLA_U8 },
3985 	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
3986 	[RTA_ENCAP]		= { .type = NLA_NESTED },
3987 	[RTA_EXPIRES]		= { .type = NLA_U32 },
3988 	[RTA_UID]		= { .type = NLA_U32 },
3989 	[RTA_MARK]		= { .type = NLA_U32 },
3990 	[RTA_TABLE]		= { .type = NLA_U32 },
3991 };
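
/* Editor's note -- in the policy above, entries carrying only a .len
 * (RTA_GATEWAY, RTA_PREFSRC, RTA_MULTIPATH) are unspecified-type
 * attributes validated against a minimum payload length, whereas the
 * typed entries (NLA_U32, NLA_U8, NLA_U16) get their length check from
 * the type itself; NLA_NESTED payloads are parsed by their consumers.
 */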
3992 
3993 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
3994 			      struct fib6_config *cfg,
3995 			      struct netlink_ext_ack *extack)
3996 {
3997 	struct rtmsg *rtm;
3998 	struct nlattr *tb[RTA_MAX+1];
3999 	unsigned int pref;
4000 	int err;
4001 
4002 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4003 			  NULL);
4004 	if (err < 0)
4005 		goto errout;
4006 
4007 	err = -EINVAL;
4008 	rtm = nlmsg_data(nlh);
4009 	memset(cfg, 0, sizeof(*cfg));
4010 
4011 	cfg->fc_table = rtm->rtm_table;
4012 	cfg->fc_dst_len = rtm->rtm_dst_len;
4013 	cfg->fc_src_len = rtm->rtm_src_len;
4014 	cfg->fc_flags = RTF_UP;
4015 	cfg->fc_protocol = rtm->rtm_protocol;
4016 	cfg->fc_type = rtm->rtm_type;
4017 
4018 	if (rtm->rtm_type == RTN_UNREACHABLE ||
4019 	    rtm->rtm_type == RTN_BLACKHOLE ||
4020 	    rtm->rtm_type == RTN_PROHIBIT ||
4021 	    rtm->rtm_type == RTN_THROW)
4022 		cfg->fc_flags |= RTF_REJECT;
4023 
4024 	if (rtm->rtm_type == RTN_LOCAL)
4025 		cfg->fc_flags |= RTF_LOCAL;
4026 
4027 	if (rtm->rtm_flags & RTM_F_CLONED)
4028 		cfg->fc_flags |= RTF_CACHE;
4029 
4030 	cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4031 
4032 	cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
4033 	cfg->fc_nlinfo.nlh = nlh;
4034 	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
4035 
4036 	if (tb[RTA_GATEWAY]) {
4037 		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4038 		cfg->fc_flags |= RTF_GATEWAY;
4039 	}
4040 
4041 	if (tb[RTA_DST]) {
4042 		int plen = (rtm->rtm_dst_len + 7) >> 3;
4043 
4044 		if (nla_len(tb[RTA_DST]) < plen)
4045 			goto errout;
4046 
4047 		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
4048 	}
4049 
4050 	if (tb[RTA_SRC]) {
4051 		int plen = (rtm->rtm_src_len + 7) >> 3;
4052 
4053 		if (nla_len(tb[RTA_SRC]) < plen)
4054 			goto errout;
4055 
4056 		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
4057 	}
4058 
4059 	if (tb[RTA_PREFSRC])
4060 		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4061 
4062 	if (tb[RTA_OIF])
4063 		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4064 
4065 	if (tb[RTA_PRIORITY])
4066 		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4067 
4068 	if (tb[RTA_METRICS]) {
4069 		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4070 		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
4071 	}
4072 
4073 	if (tb[RTA_TABLE])
4074 		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4075 
4076 	if (tb[RTA_MULTIPATH]) {
4077 		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4078 		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4079 
4080 		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4081 						     cfg->fc_mp_len, extack);
4082 		if (err < 0)
4083 			goto errout;
4084 	}
4085 
4086 	if (tb[RTA_PREF]) {
4087 		pref = nla_get_u8(tb[RTA_PREF]);
4088 		if (pref != ICMPV6_ROUTER_PREF_LOW &&
4089 		    pref != ICMPV6_ROUTER_PREF_HIGH)
4090 			pref = ICMPV6_ROUTER_PREF_MEDIUM;
4091 		cfg->fc_flags |= RTF_PREF(pref);
4092 	}
4093 
4094 	if (tb[RTA_ENCAP])
4095 		cfg->fc_encap = tb[RTA_ENCAP];
4096 
4097 	if (tb[RTA_ENCAP_TYPE]) {
4098 		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4099 
4100 		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4101 		if (err < 0)
4102 			goto errout;
4103 	}
4104 
4105 	if (tb[RTA_EXPIRES]) {
4106 		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4107 
4108 		if (addrconf_finite_timeout(timeout)) {
4109 			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4110 			cfg->fc_flags |= RTF_EXPIRES;
4111 		}
4112 	}
4113 
4114 	err = 0;
4115 errout:
4116 	return err;
4117 }
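
/* Editor's note -- worked example of the mapping above.  A request
 * such as (illustrative only)
 *
 *	ip -6 route add 2001:db8::/64 via fe80::1 dev eth0 metric 1024
 *
 * arrives as RTM_NEWROUTE with rtm_dst_len = 64 plus the attributes
 * RTA_DST (2001:db8::), RTA_GATEWAY (fe80::1, which also sets
 * RTF_GATEWAY), RTA_OIF (eth0's ifindex) and RTA_PRIORITY (1024),
 * filling fc_dst/fc_dst_len, fc_gateway, fc_ifindex and fc_metric
 * respectively.
 */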
4118 
4119 struct rt6_nh {
4120 	struct rt6_info *rt6_info;
4121 	struct fib6_config r_cfg;
4122 	struct mx6_config mxc;
4123 	struct list_head next;
4124 };
4125 
4126 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
4127 {
4128 	struct rt6_nh *nh;
4129 
4130 	list_for_each_entry(nh, rt6_nh_list, next) {
4131 		pr_warn("multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
4132 		        &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
4133 		        nh->r_cfg.fc_ifindex);
4134 	}
4135 }
4136 
4137 static int ip6_route_info_append(struct list_head *rt6_nh_list,
4138 				 struct rt6_info *rt, struct fib6_config *r_cfg)
4139 {
4140 	struct rt6_nh *nh;
4141 	int err = -EEXIST;
4142 
4143 	list_for_each_entry(nh, rt6_nh_list, next) {
4144 		/* check if rt6_info already exists */
4145 		if (rt6_duplicate_nexthop(nh->rt6_info, rt))
4146 			return err;
4147 	}
4148 
4149 	nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4150 	if (!nh)
4151 		return -ENOMEM;
4152 	nh->rt6_info = rt;
4153 	err = ip6_convert_metrics(&nh->mxc, r_cfg);
4154 	if (err) {
4155 		kfree(nh);
4156 		return err;
4157 	}
4158 	memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4159 	list_add_tail(&nh->next, rt6_nh_list);
4160 
4161 	return 0;
4162 }
4163 
4164 static void ip6_route_mpath_notify(struct rt6_info *rt,
4165 				   struct rt6_info *rt_last,
4166 				   struct nl_info *info,
4167 				   __u16 nlflags)
4168 {
4169 	/* If this is an APPEND route, rt points to the first route
4170 	 * inserted and rt_last to the last. Userspace wants a consistent
4171 	 * dump of the route which starts at the first nexthop. Since
4172 	 * sibling routes are always added at the end of the list, find
4173 	 * the first sibling of the last route appended.
4174 	 */
4175 	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
4176 		rt = list_first_entry(&rt_last->rt6i_siblings,
4177 				      struct rt6_info,
4178 				      rt6i_siblings);
4179 	}
4180 
4181 	if (rt)
4182 		inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4183 }
4184 
4185 static int ip6_route_multipath_add(struct fib6_config *cfg,
4186 				   struct netlink_ext_ack *extack)
4187 {
4188 	struct rt6_info *rt_notif = NULL, *rt_last = NULL;
4189 	struct nl_info *info = &cfg->fc_nlinfo;
4190 	struct fib6_config r_cfg;
4191 	struct rtnexthop *rtnh;
4192 	struct rt6_info *rt;
4193 	struct rt6_nh *err_nh;
4194 	struct rt6_nh *nh, *nh_safe;
4195 	__u16 nlflags;
4196 	int remaining;
4197 	int attrlen;
4198 	int err = 1;
4199 	int nhn = 0;
4200 	int replace = (cfg->fc_nlinfo.nlh &&
4201 		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4202 	LIST_HEAD(rt6_nh_list);
4203 
4204 	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4205 	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4206 		nlflags |= NLM_F_APPEND;
4207 
4208 	remaining = cfg->fc_mp_len;
4209 	rtnh = (struct rtnexthop *)cfg->fc_mp;
4210 
4211 	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
4212 	 * rt6_info structs per nexthop
4213 	 */
4214 	while (rtnh_ok(rtnh, remaining)) {
4215 		memcpy(&r_cfg, cfg, sizeof(*cfg));
4216 		if (rtnh->rtnh_ifindex)
4217 			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4218 
4219 		attrlen = rtnh_attrlen(rtnh);
4220 		if (attrlen > 0) {
4221 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4222 
4223 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4224 			if (nla) {
4225 				r_cfg.fc_gateway = nla_get_in6_addr(nla);
4226 				r_cfg.fc_flags |= RTF_GATEWAY;
4227 			}
4228 			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4229 			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4230 			if (nla)
4231 				r_cfg.fc_encap_type = nla_get_u16(nla);
4232 		}
4233 
4234 		r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4235 		rt = ip6_route_info_create(&r_cfg, extack);
4236 		if (IS_ERR(rt)) {
4237 			err = PTR_ERR(rt);
4238 			rt = NULL;
4239 			goto cleanup;
4240 		}
4241 
4242 		rt->rt6i_nh_weight = rtnh->rtnh_hops + 1;
4243 
4244 		err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
4245 		if (err) {
4246 			dst_release_immediate(&rt->dst);
4247 			goto cleanup;
4248 		}
4249 
4250 		rtnh = rtnh_next(rtnh, &remaining);
4251 	}
4252 
4253 	/* For add and replace, send one notification with all nexthops.
4254 	 * Skip the notification in fib6_add_rt2node and send one with
4255 	 * the full route when done
4256 	 */
4257 	info->skip_notify = 1;
4258 
4259 	err_nh = NULL;
4260 	list_for_each_entry(nh, &rt6_nh_list, next) {
4261 		rt_last = nh->rt6_info;
4262 		err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack);
4263 		/* save reference to first route for notification */
4264 		if (!rt_notif && !err)
4265 			rt_notif = nh->rt6_info;
4266 
4267 		/* nh->rt6_info is used or freed at this point, reset to NULL*/
4268 		nh->rt6_info = NULL;
4269 		if (err) {
4270 			if (replace && nhn)
4271 				ip6_print_replace_route_err(&rt6_nh_list);
4272 			err_nh = nh;
4273 			goto add_errout;
4274 		}
4275 
4276 		/* Because each route is added like a single route, we remove
4277 		 * these flags after the first nexthop. If there is a
4278 		 * collision, we have already failed to add the first nexthop:
4279 		 * fib6_add_rt2node() has rejected it. When replacing, the old
4280 		 * nexthops have been replaced by the first new one; the rest
4281 		 * should be appended to it.
4282 		 */
4283 		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4284 						     NLM_F_REPLACE);
4285 		nhn++;
4286 	}
4287 
4288 	/* success ... tell user about new route */
4289 	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4290 	goto cleanup;
4291 
4292 add_errout:
4293 	/* Send a notification for the routes that were added so that
4294 	 * the delete notifications sent by ip6_route_del are
4295 	 * coherent.
4296 	 */
4297 	if (rt_notif)
4298 		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4299 
4300 	/* Delete routes that were already added */
4301 	list_for_each_entry(nh, &rt6_nh_list, next) {
4302 		if (err_nh == nh)
4303 			break;
4304 		ip6_route_del(&nh->r_cfg, extack);
4305 	}
4306 
4307 cleanup:
4308 	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4309 		if (nh->rt6_info)
4310 			dst_release_immediate(&nh->rt6_info->dst);
4311 		kfree(nh->mxc.mx);
4312 		list_del(&nh->next);
4313 		kfree(nh);
4314 	}
4315 
4316 	return err;
4317 }
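
/* Editor's note -- layout of the RTA_MULTIPATH payload walked above
 * (a sketch; see struct rtnexthop in <linux/rtnetlink.h>):
 *
 *	+--------------------+
 *	| struct rtnexthop   |  rtnh_len covers this header plus the
 *	+--------------------+  nested attributes that follow
 *	| RTA_GATEWAY, ...   |  optional per-nexthop attributes
 *	+--------------------+
 *	| struct rtnexthop   |  next entry; rtnh_next() advances by
 *	| ...                |  rtnh_len, rtnh_ok() bounds the walk
 *	+--------------------+
 *
 * rtnh_hops encodes weight - 1 on the wire, hence the
 * rtnh->rtnh_hops + 1 stored into rt6i_nh_weight above.
 */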
4318 
4319 static int ip6_route_multipath_del(struct fib6_config *cfg,
4320 				   struct netlink_ext_ack *extack)
4321 {
4322 	struct fib6_config r_cfg;
4323 	struct rtnexthop *rtnh;
4324 	int remaining;
4325 	int attrlen;
4326 	int err = 1, last_err = 0;
4327 
4328 	remaining = cfg->fc_mp_len;
4329 	rtnh = (struct rtnexthop *)cfg->fc_mp;
4330 
4331 	/* Parse a Multipath Entry */
4332 	while (rtnh_ok(rtnh, remaining)) {
4333 		memcpy(&r_cfg, cfg, sizeof(*cfg));
4334 		if (rtnh->rtnh_ifindex)
4335 			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4336 
4337 		attrlen = rtnh_attrlen(rtnh);
4338 		if (attrlen > 0) {
4339 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4340 
4341 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4342 			if (nla) {
4343 				nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4344 				r_cfg.fc_flags |= RTF_GATEWAY;
4345 			}
4346 		}
4347 		err = ip6_route_del(&r_cfg, extack);
4348 		if (err)
4349 			last_err = err;
4350 
4351 		rtnh = rtnh_next(rtnh, &remaining);
4352 	}
4353 
4354 	return last_err;
4355 }
4356 
4357 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4358 			      struct netlink_ext_ack *extack)
4359 {
4360 	struct fib6_config cfg;
4361 	int err;
4362 
4363 	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4364 	if (err < 0)
4365 		return err;
4366 
4367 	if (cfg.fc_mp)
4368 		return ip6_route_multipath_del(&cfg, extack);
4369 	else {
4370 		cfg.fc_delete_all_nh = 1;
4371 		return ip6_route_del(&cfg, extack);
4372 	}
4373 }
4374 
4375 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4376 			      struct netlink_ext_ack *extack)
4377 {
4378 	struct fib6_config cfg;
4379 	int err;
4380 
4381 	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4382 	if (err < 0)
4383 		return err;
4384 
4385 	if (cfg.fc_mp)
4386 		return ip6_route_multipath_add(&cfg, extack);
4387 	else
4388 		return ip6_route_add(&cfg, extack);
4389 }
4390 
4391 static size_t rt6_nlmsg_size(struct rt6_info *rt)
4392 {
4393 	int nexthop_len = 0;
4394 
4395 	if (rt->rt6i_nsiblings) {
4396 		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
4397 			    + NLA_ALIGN(sizeof(struct rtnexthop))
4398 			    + nla_total_size(16) /* RTA_GATEWAY */
4399 			    + lwtunnel_get_encap_size(rt->dst.lwtstate);
4400 
4401 		nexthop_len *= rt->rt6i_nsiblings;
4402 	}
4403 
4404 	return NLMSG_ALIGN(sizeof(struct rtmsg))
4405 	       + nla_total_size(16) /* RTA_SRC */
4406 	       + nla_total_size(16) /* RTA_DST */
4407 	       + nla_total_size(16) /* RTA_GATEWAY */
4408 	       + nla_total_size(16) /* RTA_PREFSRC */
4409 	       + nla_total_size(4) /* RTA_TABLE */
4410 	       + nla_total_size(4) /* RTA_IIF */
4411 	       + nla_total_size(4) /* RTA_OIF */
4412 	       + nla_total_size(4) /* RTA_PRIORITY */
4413 	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4414 	       + nla_total_size(sizeof(struct rta_cacheinfo))
4415 	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4416 	       + nla_total_size(1) /* RTA_PREF */
4417 	       + lwtunnel_get_encap_size(rt->dst.lwtstate)
4418 	       + nexthop_len;
4419 }
4420 
4421 static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
4422 			    unsigned int *flags, bool skip_oif)
4423 {
4424 	if (rt->rt6i_nh_flags & RTNH_F_DEAD)
4425 		*flags |= RTNH_F_DEAD;
4426 
4427 	if (rt->rt6i_nh_flags & RTNH_F_LINKDOWN) {
4428 		*flags |= RTNH_F_LINKDOWN;
4429 		if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
4430 			*flags |= RTNH_F_DEAD;
4431 	}
4432 
4433 	if (rt->rt6i_flags & RTF_GATEWAY) {
4434 		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
4435 			goto nla_put_failure;
4436 	}
4437 
4438 	*flags |= (rt->rt6i_nh_flags & RTNH_F_ONLINK);
4439 	if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD)
4440 		*flags |= RTNH_F_OFFLOAD;
4441 
4442 	/* not needed for multipath encoding because it has an rtnexthop struct */
4443 	if (!skip_oif && rt->dst.dev &&
4444 	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
4445 		goto nla_put_failure;
4446 
4447 	if (rt->dst.lwtstate &&
4448 	    lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
4449 		goto nla_put_failure;
4450 
4451 	return 0;
4452 
4453 nla_put_failure:
4454 	return -EMSGSIZE;
4455 }
4456 
4457 /* add multipath next hop */
4458 static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
4459 {
4460 	struct rtnexthop *rtnh;
4461 	unsigned int flags = 0;
4462 
4463 	rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4464 	if (!rtnh)
4465 		goto nla_put_failure;
4466 
4467 	rtnh->rtnh_hops = rt->rt6i_nh_weight - 1;
4468 	rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;
4469 
4470 	if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4471 		goto nla_put_failure;
4472 
4473 	rtnh->rtnh_flags = flags;
4474 
4475 	/* length of rtnetlink header + attributes */
4476 	rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
4477 
4478 	return 0;
4479 
4480 nla_put_failure:
4481 	return -EMSGSIZE;
4482 }
4483 
4484 static int rt6_fill_node(struct net *net,
4485 			 struct sk_buff *skb, struct rt6_info *rt,
4486 			 struct in6_addr *dst, struct in6_addr *src,
4487 			 int iif, int type, u32 portid, u32 seq,
4488 			 unsigned int flags)
4489 {
4490 	u32 metrics[RTAX_MAX];
4491 	struct rtmsg *rtm;
4492 	struct nlmsghdr *nlh;
4493 	long expires;
4494 	u32 table;
4495 
4496 	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4497 	if (!nlh)
4498 		return -EMSGSIZE;
4499 
4500 	rtm = nlmsg_data(nlh);
4501 	rtm->rtm_family = AF_INET6;
4502 	rtm->rtm_dst_len = rt->rt6i_dst.plen;
4503 	rtm->rtm_src_len = rt->rt6i_src.plen;
4504 	rtm->rtm_tos = 0;
4505 	if (rt->rt6i_table)
4506 		table = rt->rt6i_table->tb6_id;
4507 	else
4508 		table = RT6_TABLE_UNSPEC;
4509 	rtm->rtm_table = table;
4510 	if (nla_put_u32(skb, RTA_TABLE, table))
4511 		goto nla_put_failure;
4512 	if (rt->rt6i_flags & RTF_REJECT) {
4513 		switch (rt->dst.error) {
4514 		case -EINVAL:
4515 			rtm->rtm_type = RTN_BLACKHOLE;
4516 			break;
4517 		case -EACCES:
4518 			rtm->rtm_type = RTN_PROHIBIT;
4519 			break;
4520 		case -EAGAIN:
4521 			rtm->rtm_type = RTN_THROW;
4522 			break;
4523 		default:
4524 			rtm->rtm_type = RTN_UNREACHABLE;
4525 			break;
4526 		}
4527 	} else if (rt->rt6i_flags & RTF_LOCAL)
4529 		rtm->rtm_type = RTN_LOCAL;
4530 	else if (rt->rt6i_flags & RTF_ANYCAST)
4531 		rtm->rtm_type = RTN_ANYCAST;
4532 	else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
4533 		rtm->rtm_type = RTN_LOCAL;
4534 	else
4535 		rtm->rtm_type = RTN_UNICAST;
4536 	rtm->rtm_flags = 0;
4537 	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4538 	rtm->rtm_protocol = rt->rt6i_protocol;
4539 
4540 	if (rt->rt6i_flags & RTF_CACHE)
4541 		rtm->rtm_flags |= RTM_F_CLONED;
4542 
4543 	if (dst) {
4544 		if (nla_put_in6_addr(skb, RTA_DST, dst))
4545 			goto nla_put_failure;
4546 		rtm->rtm_dst_len = 128;
4547 	} else if (rtm->rtm_dst_len)
4548 		if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
4549 			goto nla_put_failure;
4550 #ifdef CONFIG_IPV6_SUBTREES
4551 	if (src) {
4552 		if (nla_put_in6_addr(skb, RTA_SRC, src))
4553 			goto nla_put_failure;
4554 		rtm->rtm_src_len = 128;
4555 	} else if (rtm->rtm_src_len &&
4556 		   nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
4557 		goto nla_put_failure;
4558 #endif
4559 	if (iif) {
4560 #ifdef CONFIG_IPV6_MROUTE
4561 		if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
4562 			int err = ip6mr_get_route(net, skb, rtm, portid);
4563 
4564 			if (err == 0)
4565 				return 0;
4566 			if (err < 0)
4567 				goto nla_put_failure;
4568 		} else
4569 #endif
4570 			if (nla_put_u32(skb, RTA_IIF, iif))
4571 				goto nla_put_failure;
4572 	} else if (dst) {
4573 		struct in6_addr saddr_buf;
4574 		if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
4575 		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4576 			goto nla_put_failure;
4577 	}
4578 
4579 	if (rt->rt6i_prefsrc.plen) {
4580 		struct in6_addr saddr_buf;
4581 		saddr_buf = rt->rt6i_prefsrc.addr;
4582 		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4583 			goto nla_put_failure;
4584 	}
4585 
4586 	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
4587 	if (rt->rt6i_pmtu)
4588 		metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
4589 	if (rtnetlink_put_metrics(skb, metrics) < 0)
4590 		goto nla_put_failure;
4591 
4592 	if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
4593 		goto nla_put_failure;
4594 
4595 	/* For multipath routes, walk the siblings list and add
4596 	 * each as a nexthop within RTA_MULTIPATH.
4597 	 */
4598 	if (rt->rt6i_nsiblings) {
4599 		struct rt6_info *sibling, *next_sibling;
4600 		struct nlattr *mp;
4601 
4602 		mp = nla_nest_start(skb, RTA_MULTIPATH);
4603 		if (!mp)
4604 			goto nla_put_failure;
4605 
4606 		if (rt6_add_nexthop(skb, rt) < 0)
4607 			goto nla_put_failure;
4608 
4609 		list_for_each_entry_safe(sibling, next_sibling,
4610 					 &rt->rt6i_siblings, rt6i_siblings) {
4611 			if (rt6_add_nexthop(skb, sibling) < 0)
4612 				goto nla_put_failure;
4613 		}
4614 
4615 		nla_nest_end(skb, mp);
4616 	} else {
4617 		if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
4618 			goto nla_put_failure;
4619 	}
4620 
4621 	expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
4622 
4623 	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
4624 		goto nla_put_failure;
4625 
4626 	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
4627 		goto nla_put_failure;
4628 
4630 	nlmsg_end(skb, nlh);
4631 	return 0;
4632 
4633 nla_put_failure:
4634 	nlmsg_cancel(skb, nlh);
4635 	return -EMSGSIZE;
4636 }
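
/* Editor's note -- rough shape of a message produced above for a
 * unicast route (sketch): a struct rtmsg header followed by RTA_TABLE,
 * RTA_DST (and RTA_SRC under CONFIG_IPV6_SUBTREES), RTA_IIF or
 * RTA_PREFSRC depending on the query, nested RTA_METRICS,
 * RTA_PRIORITY, then either RTA_GATEWAY/RTA_OIF for a single nexthop
 * or a nested RTA_MULTIPATH for sibling routes, and finally the
 * cacheinfo block and RTA_PREF.
 */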
4637 
4638 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
4639 {
4640 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4641 	struct net *net = arg->net;
4642 
4643 	if (rt == net->ipv6.ip6_null_entry)
4644 		return 0;
4645 
4646 	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4647 		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4648 
4649 		/* user wants prefix routes only */
4650 		if (rtm->rtm_flags & RTM_F_PREFIX &&
4651 		    !(rt->rt6i_flags & RTF_PREFIX_RT)) {
4652 			/* success since this is not a prefix route */
4653 			return 1;
4654 		}
4655 	}
4656 
4657 	return rt6_fill_node(net,
4658 		     arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
4659 		     NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
4660 		     NLM_F_MULTI);
4661 }
4662 
4663 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4664 			      struct netlink_ext_ack *extack)
4665 {
4666 	struct net *net = sock_net(in_skb->sk);
4667 	struct nlattr *tb[RTA_MAX+1];
4668 	int err, iif = 0, oif = 0;
4669 	struct dst_entry *dst;
4670 	struct rt6_info *rt;
4671 	struct sk_buff *skb;
4672 	struct rtmsg *rtm;
4673 	struct flowi6 fl6;
4674 	bool fibmatch;
4675 
4676 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4677 			  extack);
4678 	if (err < 0)
4679 		goto errout;
4680 
4681 	err = -EINVAL;
4682 	memset(&fl6, 0, sizeof(fl6));
4683 	rtm = nlmsg_data(nlh);
4684 	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4685 	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4686 
4687 	if (tb[RTA_SRC]) {
4688 		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4689 			goto errout;
4690 
4691 		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4692 	}
4693 
4694 	if (tb[RTA_DST]) {
4695 		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4696 			goto errout;
4697 
4698 		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4699 	}
4700 
4701 	if (tb[RTA_IIF])
4702 		iif = nla_get_u32(tb[RTA_IIF]);
4703 
4704 	if (tb[RTA_OIF])
4705 		oif = nla_get_u32(tb[RTA_OIF]);
4706 
4707 	if (tb[RTA_MARK])
4708 		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4709 
4710 	if (tb[RTA_UID])
4711 		fl6.flowi6_uid = make_kuid(current_user_ns(),
4712 					   nla_get_u32(tb[RTA_UID]));
4713 	else
4714 		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4715 
4716 	if (iif) {
4717 		struct net_device *dev;
4718 		int flags = 0;
4719 
4720 		rcu_read_lock();
4721 
4722 		dev = dev_get_by_index_rcu(net, iif);
4723 		if (!dev) {
4724 			rcu_read_unlock();
4725 			err = -ENODEV;
4726 			goto errout;
4727 		}
4728 
4729 		fl6.flowi6_iif = iif;
4730 
4731 		if (!ipv6_addr_any(&fl6.saddr))
4732 			flags |= RT6_LOOKUP_F_HAS_SADDR;
4733 
4734 		dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
4735 
4736 		rcu_read_unlock();
4737 	} else {
4738 		fl6.flowi6_oif = oif;
4739 
4740 		dst = ip6_route_output(net, NULL, &fl6);
4741 	}
4742 
4744 	rt = container_of(dst, struct rt6_info, dst);
4745 	if (rt->dst.error) {
4746 		err = rt->dst.error;
4747 		ip6_rt_put(rt);
4748 		goto errout;
4749 	}
4750 
4751 	if (rt == net->ipv6.ip6_null_entry) {
4752 		err = rt->dst.error;
4753 		ip6_rt_put(rt);
4754 		goto errout;
4755 	}
4756 
4757 	if (fibmatch && rt->from) {
4758 		struct rt6_info *ort = rt->from;
4759 
4760 		dst_hold(&ort->dst);
4761 		ip6_rt_put(rt);
4762 		rt = ort;
4763 	}
4764 
4765 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
4766 	if (!skb) {
4767 		ip6_rt_put(rt);
4768 		err = -ENOBUFS;
4769 		goto errout;
4770 	}
4771 
4772 	skb_dst_set(skb, &rt->dst);
4773 	if (fibmatch)
4774 		err = rt6_fill_node(net, skb, rt, NULL, NULL, iif,
4775 				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4776 				    nlh->nlmsg_seq, 0);
4777 	else
4778 		err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
4779 				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4780 				    nlh->nlmsg_seq, 0);
4781 	if (err < 0) {
4782 		kfree_skb(skb);
4783 		goto errout;
4784 	}
4785 
4786 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
4787 errout:
4788 	return err;
4789 }
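
/* Editor's note -- userspace view of the handler above (illustrative):
 *
 *	ip -6 route get 2001:db8::1
 *	ip -6 route get fibmatch 2001:db8::1
 *
 * The first form reports the dst entry the lookup actually produced;
 * the second sets RTM_F_FIB_MATCH, so the code walks back through
 * rt->from and reports the FIB entry that matched instead.
 */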
4790 
4791 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
4792 		     unsigned int nlm_flags)
4793 {
4794 	struct sk_buff *skb;
4795 	struct net *net = info->nl_net;
4796 	u32 seq;
4797 	int err;
4798 
4799 	err = -ENOBUFS;
4800 	seq = info->nlh ? info->nlh->nlmsg_seq : 0;
4801 
4802 	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4803 	if (!skb)
4804 		goto errout;
4805 
4806 	err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
4807 				event, info->portid, seq, nlm_flags);
4808 	if (err < 0) {
4809 		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4810 		WARN_ON(err == -EMSGSIZE);
4811 		kfree_skb(skb);
4812 		goto errout;
4813 	}
4814 	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4815 		    info->nlh, gfp_any());
4816 	return;
4817 errout:
4818 	if (err < 0)
4819 		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
4820 }
4821 
4822 static int ip6_route_dev_notify(struct notifier_block *this,
4823 				unsigned long event, void *ptr)
4824 {
4825 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4826 	struct net *net = dev_net(dev);
4827 
4828 	if (!(dev->flags & IFF_LOOPBACK))
4829 		return NOTIFY_OK;
4830 
4831 	if (event == NETDEV_REGISTER) {
4832 		net->ipv6.ip6_null_entry->dst.dev = dev;
4833 		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
4834 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4835 		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
4836 		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
4837 		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
4838 		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
4839 #endif
4840 	 } else if (event == NETDEV_UNREGISTER &&
4841 		    dev->reg_state != NETREG_UNREGISTERED) {
4842 		/* NETDEV_UNREGISTER can be fired multiple times by
4843 		 * netdev_wait_allrefs(). Make sure we only call this once.
4844 		 */
4845 		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
4846 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4847 		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
4848 		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
4849 #endif
4850 	}
4851 
4852 	return NOTIFY_OK;
4853 }
4854 
4855 /*
4856  *	/proc
4857  */
4858 
4859 #ifdef CONFIG_PROC_FS
4860 
4861 static const struct file_operations ipv6_route_proc_fops = {
4862 	.open		= ipv6_route_open,
4863 	.read		= seq_read,
4864 	.llseek		= seq_lseek,
4865 	.release	= seq_release_net,
4866 };
4867 
4868 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
4869 {
4870 	struct net *net = (struct net *)seq->private;
4871 	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
4872 		   net->ipv6.rt6_stats->fib_nodes,
4873 		   net->ipv6.rt6_stats->fib_route_nodes,
4874 		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
4875 		   net->ipv6.rt6_stats->fib_rt_entries,
4876 		   net->ipv6.rt6_stats->fib_rt_cache,
4877 		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
4878 		   net->ipv6.rt6_stats->fib_discarded_routes);
4879 
4880 	return 0;
4881 }
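
/* Editor's note -- the seven hex fields printed above are, in order:
 * fib_nodes, fib_route_nodes, fib_rt_alloc, fib_rt_entries,
 * fib_rt_cache, the slow-path dst entry count, and
 * fib_discarded_routes.
 */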
4882 
4883 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
4884 {
4885 	return single_open_net(inode, file, rt6_stats_seq_show);
4886 }
4887 
4888 static const struct file_operations rt6_stats_seq_fops = {
4889 	.open	 = rt6_stats_seq_open,
4890 	.read	 = seq_read,
4891 	.llseek	 = seq_lseek,
4892 	.release = single_release_net,
4893 };
4894 #endif	/* CONFIG_PROC_FS */
4895 
4896 #ifdef CONFIG_SYSCTL
4897 
4898 static int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
4900 			      void __user *buffer, size_t *lenp, loff_t *ppos)
4901 {
4902 	struct net *net;
4903 	int delay;
4904 	if (!write)
4905 		return -EINVAL;
4906 
4907 	net = (struct net *)ctl->extra1;
4908 	delay = net->ipv6.sysctl.flush_delay;
4909 	proc_dointvec(ctl, write, buffer, lenp, ppos);
4910 	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
4911 	return 0;
4912 }
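
/* Editor's note -- usage sketch: the handler above is write-only, e.g.
 *
 *	echo 1 > /proc/sys/net/ipv6/route/flush
 *
 * triggers fib6_run_gc().  Note that it samples flush_delay *before*
 * proc_dointvec() stores the newly written value, so the run uses the
 * previously configured delay.
 */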
4913 
4914 struct ctl_table ipv6_route_table_template[] = {
4915 	{
4916 		.procname	=	"flush",
4917 		.data		=	&init_net.ipv6.sysctl.flush_delay,
4918 		.maxlen		=	sizeof(int),
4919 		.mode		=	0200,
4920 		.proc_handler	=	ipv6_sysctl_rtcache_flush
4921 	},
4922 	{
4923 		.procname	=	"gc_thresh",
4924 		.data		=	&ip6_dst_ops_template.gc_thresh,
4925 		.maxlen		=	sizeof(int),
4926 		.mode		=	0644,
4927 		.proc_handler	=	proc_dointvec,
4928 	},
4929 	{
4930 		.procname	=	"max_size",
4931 		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
4932 		.maxlen		=	sizeof(int),
4933 		.mode		=	0644,
4934 		.proc_handler	=	proc_dointvec,
4935 	},
4936 	{
4937 		.procname	=	"gc_min_interval",
4938 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
4939 		.maxlen		=	sizeof(int),
4940 		.mode		=	0644,
4941 		.proc_handler	=	proc_dointvec_jiffies,
4942 	},
4943 	{
4944 		.procname	=	"gc_timeout",
4945 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
4946 		.maxlen		=	sizeof(int),
4947 		.mode		=	0644,
4948 		.proc_handler	=	proc_dointvec_jiffies,
4949 	},
4950 	{
4951 		.procname	=	"gc_interval",
4952 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
4953 		.maxlen		=	sizeof(int),
4954 		.mode		=	0644,
4955 		.proc_handler	=	proc_dointvec_jiffies,
4956 	},
4957 	{
4958 		.procname	=	"gc_elasticity",
4959 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
4960 		.maxlen		=	sizeof(int),
4961 		.mode		=	0644,
4962 		.proc_handler	=	proc_dointvec,
4963 	},
4964 	{
4965 		.procname	=	"mtu_expires",
4966 		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
4967 		.maxlen		=	sizeof(int),
4968 		.mode		=	0644,
4969 		.proc_handler	=	proc_dointvec_jiffies,
4970 	},
4971 	{
4972 		.procname	=	"min_adv_mss",
4973 		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
4974 		.maxlen		=	sizeof(int),
4975 		.mode		=	0644,
4976 		.proc_handler	=	proc_dointvec,
4977 	},
4978 	{
4979 		.procname	=	"gc_min_interval_ms",
4980 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
4981 		.maxlen		=	sizeof(int),
4982 		.mode		=	0644,
4983 		.proc_handler	=	proc_dointvec_ms_jiffies,
4984 	},
4985 	{ }
4986 };
4987 
4988 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
4989 {
4990 	struct ctl_table *table;
4991 
4992 	table = kmemdup(ipv6_route_table_template,
4993 			sizeof(ipv6_route_table_template),
4994 			GFP_KERNEL);
4995 
4996 	if (table) {
4997 		table[0].data = &net->ipv6.sysctl.flush_delay;
4998 		table[0].extra1 = net;
4999 		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
5000 		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
5001 		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5002 		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
5003 		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
5004 		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
5005 		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
5006 		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
5007 		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5008 
5009 		/* Don't export sysctls to unprivileged users */
5010 		if (net->user_ns != &init_user_ns)
5011 			table[0].procname = NULL;
5012 	}
5013 
5014 	return table;
5015 }
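
/* Editor's note: the table[N].data fixups above are positionally
 * coupled to ipv6_route_table_template; adding or reordering template
 * entries requires updating these indices in lockstep.
 */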
5016 #endif
5017 
5018 static int __net_init ip6_route_net_init(struct net *net)
5019 {
5020 	int ret = -ENOMEM;
5021 
5022 	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
5023 	       sizeof(net->ipv6.ip6_dst_ops));
5024 
5025 	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
5026 		goto out_ip6_dst_ops;
5027 
5028 	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
5029 					   sizeof(*net->ipv6.ip6_null_entry),
5030 					   GFP_KERNEL);
5031 	if (!net->ipv6.ip6_null_entry)
5032 		goto out_ip6_dst_entries;
5033 	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5034 	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
5035 			 ip6_template_metrics, true);
5036 
5037 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5038 	net->ipv6.fib6_has_custom_rules = false;
5039 	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
5040 					       sizeof(*net->ipv6.ip6_prohibit_entry),
5041 					       GFP_KERNEL);
5042 	if (!net->ipv6.ip6_prohibit_entry)
5043 		goto out_ip6_null_entry;
5044 	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5045 	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
5046 			 ip6_template_metrics, true);
5047 
5048 	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
5049 					       sizeof(*net->ipv6.ip6_blk_hole_entry),
5050 					       GFP_KERNEL);
5051 	if (!net->ipv6.ip6_blk_hole_entry)
5052 		goto out_ip6_prohibit_entry;
5053 	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5054 	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
5055 			 ip6_template_metrics, true);
5056 #endif
5057 
5058 	net->ipv6.sysctl.flush_delay = 0;
5059 	net->ipv6.sysctl.ip6_rt_max_size = 4096;
5060 	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
5061 	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
5062 	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
5063 	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
5064 	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
5065 	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
5066 
5067 	net->ipv6.ip6_rt_gc_expire = 30*HZ;
5068 
5069 	ret = 0;
5070 out:
5071 	return ret;
5072 
5073 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5074 out_ip6_prohibit_entry:
5075 	kfree(net->ipv6.ip6_prohibit_entry);
5076 out_ip6_null_entry:
5077 	kfree(net->ipv6.ip6_null_entry);
5078 #endif
5079 out_ip6_dst_entries:
5080 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5081 out_ip6_dst_ops:
5082 	goto out;
5083 }
5084 
5085 static void __net_exit ip6_route_net_exit(struct net *net)
5086 {
5087 	kfree(net->ipv6.ip6_null_entry);
5088 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5089 	kfree(net->ipv6.ip6_prohibit_entry);
5090 	kfree(net->ipv6.ip6_blk_hole_entry);
5091 #endif
5092 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5093 }
5094 
5095 static int __net_init ip6_route_net_init_late(struct net *net)
5096 {
5097 #ifdef CONFIG_PROC_FS
5098 	proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
5099 	proc_create("rt6_stats", 0444, net->proc_net, &rt6_stats_seq_fops);
5100 #endif
5101 	return 0;
5102 }
5103 
5104 static void __net_exit ip6_route_net_exit_late(struct net *net)
5105 {
5106 #ifdef CONFIG_PROC_FS
5107 	remove_proc_entry("ipv6_route", net->proc_net);
5108 	remove_proc_entry("rt6_stats", net->proc_net);
5109 #endif
5110 }
5111 
5112 static struct pernet_operations ip6_route_net_ops = {
5113 	.init = ip6_route_net_init,
5114 	.exit = ip6_route_net_exit,
5115 };
5116 
5117 static int __net_init ipv6_inetpeer_init(struct net *net)
5118 {
5119 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5120 
5121 	if (!bp)
5122 		return -ENOMEM;
5123 	inet_peer_base_init(bp);
5124 	net->ipv6.peers = bp;
5125 	return 0;
5126 }
5127 
5128 static void __net_exit ipv6_inetpeer_exit(struct net *net)
5129 {
5130 	struct inet_peer_base *bp = net->ipv6.peers;
5131 
5132 	net->ipv6.peers = NULL;
5133 	inetpeer_invalidate_tree(bp);
5134 	kfree(bp);
5135 }
5136 
5137 static struct pernet_operations ipv6_inetpeer_ops = {
5138 	.init	=	ipv6_inetpeer_init,
5139 	.exit	=	ipv6_inetpeer_exit,
5140 };
5141 
5142 static struct pernet_operations ip6_route_net_late_ops = {
5143 	.init = ip6_route_net_init_late,
5144 	.exit = ip6_route_net_exit_late,
5145 };
5146 
5147 static struct notifier_block ip6_route_dev_notifier = {
5148 	.notifier_call = ip6_route_dev_notify,
5149 	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
5150 };
5151 
5152 void __init ip6_route_init_special_entries(void)
5153 {
5154 	/* The loopback device is registered before this code runs, so
5155 	 * the loopback reference in rt6_info is not taken automatically;
5156 	 * take it manually for init_net. */
5157 	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
5158 	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5159 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5160 	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
5161 	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5162 	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
5163 	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5164 #endif
5165 }
5166 
5167 int __init ip6_route_init(void)
5168 {
5169 	int ret;
5170 	int cpu;
5171 
5172 	ret = -ENOMEM;
5173 	ip6_dst_ops_template.kmem_cachep =
5174 		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
5175 				  SLAB_HWCACHE_ALIGN, NULL);
5176 	if (!ip6_dst_ops_template.kmem_cachep)
5177 		goto out;
5178 
5179 	ret = dst_entries_init(&ip6_dst_blackhole_ops);
5180 	if (ret)
5181 		goto out_kmem_cache;
5182 
5183 	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
5184 	if (ret)
5185 		goto out_dst_entries;
5186 
5187 	ret = register_pernet_subsys(&ip6_route_net_ops);
5188 	if (ret)
5189 		goto out_register_inetpeer;
5190 
5191 	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
5192 
5193 	ret = fib6_init();
5194 	if (ret)
5195 		goto out_register_subsys;
5196 
5197 	ret = xfrm6_init();
5198 	if (ret)
5199 		goto out_fib6_init;
5200 
5201 	ret = fib6_rules_init();
5202 	if (ret)
5203 		goto xfrm6_init;
5204 
5205 	ret = register_pernet_subsys(&ip6_route_net_late_ops);
5206 	if (ret)
5207 		goto fib6_rules_init;
5208 
5209 	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
5210 				   inet6_rtm_newroute, NULL, 0);
5211 	if (ret < 0)
5212 		goto out_register_late_subsys;
5213 
5214 	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
5215 				   inet6_rtm_delroute, NULL, 0);
5216 	if (ret < 0)
5217 		goto out_register_late_subsys;
5218 
5219 	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
5220 				   inet6_rtm_getroute, NULL,
5221 				   RTNL_FLAG_DOIT_UNLOCKED);
5222 	if (ret < 0)
5223 		goto out_register_late_subsys;
5224 
5225 	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
5226 	if (ret)
5227 		goto out_register_late_subsys;
5228 
5229 	for_each_possible_cpu(cpu) {
5230 		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
5231 
5232 		INIT_LIST_HEAD(&ul->head);
5233 		spin_lock_init(&ul->lock);
5234 	}
5235 
5236 out:
5237 	return ret;
5238 
5239 out_register_late_subsys:
5240 	rtnl_unregister_all(PF_INET6);
5241 	unregister_pernet_subsys(&ip6_route_net_late_ops);
5242 fib6_rules_init:
5243 	fib6_rules_cleanup();
5244 xfrm6_init:
5245 	xfrm6_fini();
5246 out_fib6_init:
5247 	fib6_gc_cleanup();
5248 out_register_subsys:
5249 	unregister_pernet_subsys(&ip6_route_net_ops);
5250 out_register_inetpeer:
5251 	unregister_pernet_subsys(&ipv6_inetpeer_ops);
5252 out_dst_entries:
5253 	dst_entries_destroy(&ip6_dst_blackhole_ops);
5254 out_kmem_cache:
5255 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5256 	goto out;
5257 }
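
/* Editor's note -- the unwinding above is the usual kernel goto
 * ladder: each label releases exactly what was set up before the
 * failure point, in reverse order, and ip6_route_cleanup() below
 * mirrors the same teardown for module unload.
 */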
5258 
5259 void ip6_route_cleanup(void)
5260 {
5261 	unregister_netdevice_notifier(&ip6_route_dev_notifier);
5262 	unregister_pernet_subsys(&ip6_route_net_late_ops);
5263 	fib6_rules_cleanup();
5264 	xfrm6_fini();
5265 	fib6_gc_cleanup();
5266 	unregister_pernet_subsys(&ipv6_inetpeer_ops);
5267 	unregister_pernet_subsys(&ip6_route_net_ops);
5268 	dst_entries_destroy(&ip6_dst_blackhole_ops);
5269 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5270 }
5271