/*
 *	Linux INET6 implementation
 *	FIB front-end.
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 */

/*	Changes:
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		reachable.  otherwise, round-robin the list.
 *	Ville Nuorvala
 *		Fixed routing subtrees.
 */

#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/net_namespace.h>
#include <net/snmp.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/tcp.h>
#include <linux/rtnetlink.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/nexthop.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
#include <trace/events/fib6.h>

#include <linux/uaccess.h>

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

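/* Return values used by rt6_check_neigh() and rt6_score_route(): all
 * negative values denote a failed neighbour check.  RT6_NUD_FAIL_DO_RR
 * asks the caller to round-robin to the next router, RT6_NUD_FAIL_PROBE
 * flags a gateway worth probing, and RT6_NUD_FAIL_HARD rejects the
 * route outright.
 */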
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_SUCCEED = 1
};

static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
static unsigned int	 ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void		ip6_dst_destroy(struct dst_entry *);
static void		ip6_dst_ifdown(struct dst_entry *,
				       struct net_device *dev, int how);
static int		 ip6_dst_gc(struct dst_ops *ops);

static int		ip6_pkt_discard(struct sk_buff *skb);
static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int		ip6_pkt_prohibit(struct sk_buff *skb);
static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void		ip6_link_failure(struct sk_buff *skb);
static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static void		rt6_dst_from_metrics_check(struct rt6_info *rt);
static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
static size_t rt6_nlmsg_size(struct rt6_info *rt);
static int rt6_fill_node(struct net *net,
			 struct sk_buff *skb, struct rt6_info *rt,
			 struct in6_addr *dst, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags);
static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr);

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct rt6_info *rt6_add_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev,
					   unsigned int pref);
static struct rt6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev);
#endif

struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);

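/* The uncached list tracks rt6_info entries that are not in the fib6
 * tree (e.g. the RTF_CACHE clones created for FLOWI_FLAG_KNOWN_NH flows
 * in ip6_pol_route()).  Entries are kept on per-cpu lists so that
 * rt6_uncached_list_flush_dev() can re-point them at the loopback
 * device when their netdev goes away.
 */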
void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}

void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}

static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}

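/* Copy-on-write handling for dst metrics: per-cpu copies write through
 * to the metrics of their parent route, RTF_CACHE clones keep their
 * inherited metrics read-only, and all other routes get a private copy
 * via dst_cow_metrics_generic().
 */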
static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
{
	return dst_metrics_write_ptr(&rt->from->dst);
}

static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	struct rt6_info *rt = (struct rt6_info *)dst;

	if (rt->rt6i_flags & RTF_PCPU)
		return rt6_pcpu_cow_metrics(rt);
	else if (rt->rt6i_flags & RTF_CACHE)
		return NULL;
	else
		return dst_cow_metrics_generic(dst, old);
}

static inline const void *choose_neigh_daddr(struct rt6_info *rt,
					     struct sk_buff *skb,
					     const void *daddr)
{
	struct in6_addr *p = &rt->rt6i_gateway;

	if (!ipv6_addr_any(p))
		return (const void *) p;
	else if (skb)
		return &ipv6_hdr(skb)->daddr;
	return daddr;
}

static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
					  struct sk_buff *skb,
					  const void *daddr)
{
	struct rt6_info *rt = (struct rt6_info *) dst;
	struct neighbour *n;

	daddr = choose_neigh_daddr(rt, skb, daddr);
	n = __ipv6_neigh_lookup(dst->dev, daddr);
	if (n)
		return n;
	return neigh_create(&nd_tbl, daddr, dst->dev);
}

static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(rt, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}

static struct dst_ops ip6_dst_ops_template = {
	.family			=	AF_INET6,
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
	.default_advmss		=	ip6_default_advmss,
	.mtu			=	ip6_mtu,
	.cow_metrics		=	ipv6_cow_metrics,
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
	.redirect		=	rt6_do_redirect,
	.local_out		=	__ip6_local_out,
	.neigh_lookup		=	ip6_neigh_lookup,
	.confirm_neigh		=	ip6_confirm_neigh,
};

static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}

static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}

static struct dst_ops ip6_dst_blackhole_ops = {
	.family			=	AF_INET6,
	.destroy		=	ip6_dst_destroy,
	.check			=	ip6_dst_check,
	.mtu			=	ip6_blackhole_mtu,
	.default_advmss		=	ip6_default_advmss,
	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
	.redirect		=	ip6_rt_blackhole_redirect,
	.cow_metrics		=	dst_cow_metrics_generic,
	.neigh_lookup		=	ip6_neigh_lookup,
};

static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};

static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol  = RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol  = RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol  = RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

#endif

static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_siblings);
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}

/* allocate dst with ip6_dst_ops */
static struct rt6_info *__ip6_dst_alloc(struct net *net,
					struct net_device *dev,
					int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}

struct rt6_info *ip6_dst_alloc(struct net *net,
			       struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);

	if (rt) {
		rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
		if (!rt->rt6i_pcpu) {
			dst_release_immediate(&rt->dst);
			return NULL;
		}
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);

static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct rt6_exception_bucket *bucket;
	struct rt6_info *from = rt->from;
	struct inet6_dev *idev;

	dst_destroy_metrics_generic(dst);
	free_percpu(rt->rt6i_pcpu);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1);
	if (bucket) {
		rt->rt6i_exception_bucket = NULL;
		kfree(bucket);
	}

	rt->from = NULL;
	dst_release(&from->dst);
}

static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}

static bool __rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES)
		return time_after(jiffies, rt->dst.expires);
	else
		return false;
}

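/* Unlike __rt6_check_expired(), this also treats a cached clone as
 * expired when its parent route ('from') has been obsoleted or has
 * itself expired.
 */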
static bool rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (rt->from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			rt6_check_expired(rt->from);
	}
	return false;
}

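/* Hash-threshold multipath selection: each sibling route carries an
 * upper bound in rt6i_nh_upper_bound, and the first nexthop whose bound
 * is >= the flow hash is chosen.
 */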
static struct rt6_info *rt6_multipath_select(const struct net *net,
					     struct rt6_info *match,
					     struct flowi6 *fl6, int oif,
					     const struct sk_buff *skb,
					     int strict)
{
	struct rt6_info *sibling, *next_sibling;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * a case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	if (fl6->mp_hash <= atomic_read(&match->rt6i_nh_upper_bound))
		return match;

	list_for_each_entry_safe(sibling, next_sibling, &match->rt6i_siblings,
				 rt6i_siblings) {
		if (fl6->mp_hash > atomic_read(&sibling->rt6i_nh_upper_bound))
			continue;
		if (rt6_score_route(sibling, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

	return match;
}

/*
 *	Route lookup. rcu_read_lock() should be held.
 */

static inline struct rt6_info *rt6_device_match(struct net *net,
						    struct rt6_info *rt,
						    const struct in6_addr *saddr,
						    int oif,
						    int flags)
{
	struct rt6_info *local = NULL;
	struct rt6_info *sprt;

	if (!oif && ipv6_addr_any(saddr) && !(rt->rt6i_nh_flags & RTNH_F_DEAD))
		return rt;

	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->rt6_next)) {
		struct net_device *dev = sprt->dst.dev;

		if (sprt->rt6i_nh_flags & RTNH_F_DEAD)
			continue;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
			if (dev->flags & IFF_LOOPBACK) {
				if (!sprt->rt6i_idev ||
				    sprt->rt6i_idev->dev->ifindex != oif) {
					if (flags & RT6_LOOKUP_F_IFACE)
						continue;
					if (local &&
					    local->rt6i_idev->dev->ifindex == oif)
						continue;
				}
				local = sprt;
			}
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif) {
		if (local)
			return local;

		if (flags & RT6_LOOKUP_F_IFACE)
			return net->ipv6.ip6_null_entry;
	}

	return rt->rt6i_nh_flags & RTNH_F_DEAD ? net->ipv6.ip6_null_entry : rt;
}

#ifdef CONFIG_IPV6_ROUTER_PREF
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};

static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}

static void rt6_probe(struct rt6_info *rt)
{
	struct __rt6_probe_work *work;
	struct neighbour *neigh;
	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
		return;
	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		if (neigh->nud_state & NUD_VALID)
			goto out;

		work = NULL;
		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated +
			       rt->rt6i_idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = rt->rt6i_gateway;
		dev_hold(rt->dst.dev);
		work->dev = rt->dst.dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
#else
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif

/*
 * Default Router Selection (RFC 2461 6.3.6)
 */
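/* Score the device match: 2 for an exact match on oif (or no oif
 * constraint), 1 for a loopback route whose inet6_dev belongs to oif,
 * 0 otherwise.
 */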
static inline int rt6_check_dev(struct rt6_info *rt, int oif)
{
	struct net_device *dev = rt->dst.dev;

	if (!oif || dev->ifindex == oif)
		return 2;
	if ((dev->flags & IFF_LOOPBACK) &&
	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
		return 1;
	return 0;
}

static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
{
	struct neighbour *neigh;
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;

	if (rt->rt6i_flags & RTF_NONEXTHOP ||
	    !(rt->rt6i_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}

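/* Combine the device score (low two bits) with the decoded router
 * preference (shifted above it) and, under RT6_LOOKUP_F_REACHABLE,
 * propagate any negative rt6_nud_state from rt6_check_neigh().
 */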
static int rt6_score_route(struct rt6_info *rt, int oif,
			   int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);

		if (n < 0)
			return n;
	}
	return m;
}

static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
				   int *mpri, struct rt6_info *match,
				   bool *do_rr)
{
	int m;
	bool match_do_rr = false;
	struct inet6_dev *idev = rt->rt6i_idev;

	if (rt->rt6i_nh_flags & RTNH_F_DEAD)
		goto out;

	if (idev->cnf.ignore_routes_with_linkdown &&
	    rt->rt6i_nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (rt6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}

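/* Scan the routes sharing 'metric' starting at rr_head, wrapping around
 * from 'leaf' back to rr_head.  Routes with a different metric
 * (collected in 'cont') are only considered if nothing matched.
 */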
static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
				     struct rt6_info *leaf,
				     struct rt6_info *rr_head,
				     u32 metric, int oif, int strict,
				     bool *do_rr)
{
	struct rt6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	for (rt = rr_head; rt; rt = rcu_dereference(rt->rt6_next)) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	for (rt = leaf; rt && rt != rr_head;
	     rt = rcu_dereference(rt->rt6_next)) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	for (rt = cont; rt; rt = rcu_dereference(rt->rt6_next))
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}

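/* Pick the best route in fn for (oif, strict) and, when find_match()
 * requested round-robin, advance fn->rr_ptr so the next lookup starts
 * from the following sibling.
 */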
static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn,
				   int oif, int strict)
{
	struct rt6_info *leaf = rcu_dereference(fn->leaf);
	struct rt6_info *match, *rt0;
	bool do_rr = false;
	int key_plen;

	if (!leaf || leaf == net->ipv6.ip6_null_entry)
		return net->ipv6.ip6_null_entry;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not point to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->rt6i_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->rt6i_src.plen)
		key_plen = rt0->rt6i_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.ip6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct rt6_info *next = rcu_dereference(rt0->rt6_next);

		/* no entries matched; do round-robin */
		if (!next || next->rt6i_metric != rt0->rt6i_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->rt6i_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->rt6i_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->rt6i_table->tb6_lock);
		}
	}

	return match ? match : net->ipv6.ip6_null_entry;
}

static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
{
	return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
}

#ifdef CONFIG_IPV6_ROUTE_INFO
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			rt6_clean_expires(rt);
		else
			rt6_set_expires(rt, jiffies + HZ * lifetime);

		ip6_rt_put(rt);
	}
	return 0;
}
#endif

static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;

	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}

static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
			  bool null_fallback)
{
	struct rt6_info *rt = *prt;

	if (dst_hold_safe(&rt->dst))
		return true;
	if (null_fallback) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = NULL;
	}
	*prt = rt;
	return false;
}

static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct rt6_info *rt, *rt_cache;
	struct fib6_node *fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	rt = rcu_dereference(fn->leaf);
	if (!rt) {
		rt = net->ipv6.ip6_null_entry;
	} else {
		rt = rt6_device_match(net, rt, &fl6->saddr,
				      fl6->flowi6_oif, flags);
		if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
			rt = rt6_multipath_select(net, rt, fl6, fl6->flowi6_oif,
						  skb, flags);
	}
	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}
	/* Search through exception table */
	rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
	if (rt_cache)
		rt = rt_cache;

	if (ip6_hold_safe(net, &rt, true))
		dst_use_noref(&rt->dst, jiffies);

	rcu_read_unlock();

	trace_fib6_table_lookup(net, rt, table, fl6);

	return rt;
}

struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);

struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif,
			    const struct sk_buff *skb, int strict)
{
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
	};
	struct dst_entry *dst;
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

	if (saddr) {
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	dst_release(dst);

	return NULL;
}
EXPORT_SYMBOL(rt6_lookup);

/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason,
 * the route is released.
 * Caller must hold dst before calling it.
 */

static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
			struct mx6_config *mxc,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	table = rt->rt6i_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
	spin_unlock_bh(&table->tb6_lock);

	return err;
}

int ip6_ins_rt(struct rt6_info *rt)
{
	struct nl_info info = {	.nl_net = dev_net(rt->dst.dev), };
	struct mx6_config mxc = { .mx = NULL, };

	/* Hold dst to account for the reference from the fib6 tree */
	dst_hold(&rt->dst);
	return __ip6_ins_rt(rt, &info, &mxc, NULL);
}

/* called with rcu_lock held */
static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
{
	struct net_device *dev = rt->dst.dev;

	if (rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) {
		/* for copies of local routes, dst->dev needs to be the
		 * device itself if it is a master device, the master
		 * device if the device is enslaved, and the loopback
		 * device by default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&rt->rt6i_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* the remaining case is netif_is_l3_master(dev) being
		 * true, in which case dev itself is what we want
		 */
	}

	return dev;
}

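/* Clone 'ort' into a host (/128) RTF_CACHE entry for daddr (and, with
 * subtrees, saddr).  The clone lives outside the fib6 tree.
 */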
static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
		ort = ort->from;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(ort);
	rt = __ip6_dst_alloc(dev_net(dev), dev, 0);
	rcu_read_unlock();
	if (!rt)
		return NULL;

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->rt6i_metric = 0;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		if (ort->rt6i_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}

static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
{
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(rt);
	pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags);
	rcu_read_unlock();
	if (!pcpu_rt)
		return NULL;
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}

/* It should be called with rcu_read_lock() acquired */
static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

	if (pcpu_rt && ip6_hold_safe(NULL, &pcpu_rt, false))
		rt6_dst_from_metrics_check(pcpu_rt);

	return pcpu_rt;
}

static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		struct net *net = dev_net(rt->dst.dev);

		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	rt6_dst_from_metrics_check(pcpu_rt);
	return pcpu_rt;
}

/* Exception hash table implementation: RTF_CACHE routes created by
 * redirects and PMTU updates are kept in per-route hash buckets,
 * keyed by destination (and, with subtrees, source) address.
 */
static DEFINE_SPINLOCK(rt6_exception_lock);

/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	rt6_ex->rt6i->rt6i_node = NULL;
	hlist_del_rcu(&rt6_ex->hlist);
	rt6_release(rt6_ex->rt6i);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
	net->ipv6.rt6_stats->fib_rt_cache--;
}

/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
	struct rt6_exception *rt6_ex, *oldest = NULL;

	if (!bucket)
		return;

	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
			oldest = rt6_ex;
	}
	rt6_remove_exception(bucket, oldest);
}

static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

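/* Insert nrt as a cached exception of ort: allocate the bucket array on
 * first use, replace any existing exception with the same keys, cap the
 * bucket depth at FIB6_MAX_DEPTH by evicting the oldest entry, and bump
 * the tree sernum so that stale cached dsts get invalidated.
 */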
static int rt6_insert_exception(struct rt6_info *nrt,
				struct rt6_info *ort)
{
	struct net *net = dev_net(ort->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	/* ort can't be a cache or pcpu route */
	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
		ort = ort->from;
	WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU));

	spin_lock_bh(&rt6_exception_lock);

	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->rt6i_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif

	/* Update rt6i_prefsrc as it could be changed
	 * in rt6_remove_prefsrc()
	 */
	nrt->rt6i_prefsrc = ort->rt6i_prefsrc;
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (nrt->rt6i_pmtu >= dst_mtu(&ort->dst)) {
		err = -EINVAL;
		goto out;
	}

	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	atomic_inc(&nrt->rt6i_ref);
	nrt->rt6i_node = ort->rt6i_node;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&ort->rt6i_table->tb6_lock);
		fib6_update_sernum(ort);
		spin_unlock_bh(&ort->rt6i_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}

void rt6_flush_exceptions(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() from recreating the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}

/* Find the cached rt in the hash table inside the passed-in rt
 * Caller has to hold rcu_read_lock()
 */
static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

	bucket = rcu_dereference(rt->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (rt->rt6i_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

	return res;
}

/* Remove the passed-in cached rt from the hash table that contains it */
int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_info *from = rt->from;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err;

	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->rt6i_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}

/* Find the rt6_ex which contains the passed-in rt cache and
 * refresh its stamp
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_info *from = rt->from;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;

	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return;

	rcu_read_lock();
	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->rt6i_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

	rcu_read_unlock();
}

static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
				rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
			}
			bucket++;
		}
	}
}

static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
					 struct rt6_info *rt, int mtu)
{
	/* If the new MTU is lower than the route PMTU, this new MTU will be the
	 * lowest MTU in the path: always allow updating the route PMTU to
	 * reflect PMTU decreases.
	 *
	 * If the new MTU is higher, and the route PMTU is equal to the local
	 * MTU, this means the old MTU is the lowest in the path, so allow
	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
	 * handle this.
	 */

	if (dst_mtu(&rt->dst) >= mtu)
		return true;

	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
		return true;

	return false;
}

static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct rt6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->dst.from have already
			 * been updated.
			 */
			if (entry->rt6i_pmtu &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				entry->rt6i_pmtu = mtu;
		}
		bucket++;
	}
}

#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

static void rt6_exceptions_clean_tohost(struct rt6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				     lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}

static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non-gateway exceptions
	 * even if others still hold references to them, so that on the next
	 * dst_check() such references can be dropped.
	 * RTF_EXPIRES exceptions - e.g. pmtu-generated ones - are pruned when
	 * expired, independently of their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
			neigh_flags = neigh->flags;

		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}

void rt6_age_exceptions(struct rt6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}

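/* Core policy lookup.  The result is one of: the null entry, an
 * RTF_CACHE exception hit, a one-off uncached clone (for
 * FLOWI_FLAG_KNOWN_NH flows without a gateway), or a per-cpu copy of
 * the fib6 route.
 */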
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_node *fn, *saved_fn;
	struct rt6_info *rt, *rt_cache;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	rt = rt6_select(net, fn, oif, strict);
	if (rt->rt6i_nsiblings)
		rt = rt6_multipath_select(net, rt, fl6, oif, skb, strict);
	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	/* Search through exception table */
	rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
	if (rt_cache)
		rt = rt_cache;

	if (rt == net->ipv6.ip6_null_entry) {
		rcu_read_unlock();
		dst_hold(&rt->dst);
		trace_fib6_table_lookup(net, rt, table, fl6);
		return rt;
	} else if (rt->rt6i_flags & RTF_CACHE) {
		if (ip6_hold_safe(net, &rt, true)) {
			dst_use_noref(&rt->dst, jiffies);
			rt6_dst_from_metrics_check(rt);
		}
		rcu_read_unlock();
		trace_fib6_table_lookup(net, rt, table, fl6);
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(rt->rt6i_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */

		struct rt6_info *uncached_rt;

		if (ip6_hold_safe(net, &rt, true)) {
			dst_use_noref(&rt->dst, jiffies);
		} else {
			rcu_read_unlock();
			uncached_rt = rt;
			goto uncached_rt_out;
		}
		rcu_read_unlock();

		uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
		dst_release(&rt->dst);

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

uncached_rt_out:
		trace_fib6_table_lookup(net, uncached_rt, table, fl6);
		return uncached_rt;

	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		dst_use_noref(&rt->dst, jiffies);
		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(rt);

		if (!pcpu_rt) {
			/* atomic_inc_not_zero() is needed when using rcu */
			if (atomic_inc_not_zero(&rt->rt6i_ref)) {
				/* No dst_hold() on rt is needed because grabbing
				 * rt->rt6i_ref makes sure rt can't be released.
				 */
				pcpu_rt = rt6_make_pcpu_route(rt);
				rt6_release(rt);
			} else {
				/* rt is already removed from tree */
				pcpu_rt = net->ipv6.ip6_null_entry;
				dst_hold(&pcpu_rt->dst);
			}
		}
		local_bh_enable();
		rcu_read_unlock();
		trace_fib6_table_lookup(net, pcpu_rt, table, fl6);
		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);

static struct rt6_info *ip6_pol_route_input(struct net *net,
					    struct fib6_table *table,
					    struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
}

struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
					 struct flowi6 *fl6,
					 const struct sk_buff *skb,
					 int flags)
{
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
}
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);

static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys,
				  struct flow_keys *flkeys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	struct flow_keys *_flkeys = flkeys;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;
	struct icmp6hdr _icmph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
				   sizeof(_icmph), &_icmph);
	if (!icmph)
		goto out;

	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
	_flkeys = NULL;
out:
	if (_flkeys) {
		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
		keys->tags.flow_label = _flkeys->tags.flow_label;
		keys->basic.ip_proto = _flkeys->basic.ip_proto;
	} else {
		keys->addrs.v6addrs.src = key_iph->saddr;
		keys->addrs.v6addrs.dst = key_iph->daddr;
		keys->tags.flow_label = ip6_flowinfo(key_iph);
		keys->basic.ip_proto = key_iph->nexthdr;
	}
}

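/* Multipath hash policies: 0 hashes the L3 header (addresses, flow
 * label, protocol), 1 hashes the L4 five-tuple.  For ICMPv6 errors the
 * offending (inner) header is used, so errors follow the same path as
 * the flow that triggered them.  The result is reduced to 31 bits.
 */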
1876 /* if skb is set it will be used and fl6 can be NULL */
1877 u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
1878 		       const struct sk_buff *skb, struct flow_keys *flkeys)
1879 {
1880 	struct flow_keys hash_keys;
1881 	u32 mhash;
1882 
1883 	switch (ip6_multipath_hash_policy(net)) {
1884 	case 0:
1885 		memset(&hash_keys, 0, sizeof(hash_keys));
1886 		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1887 		if (skb) {
1888 			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
1889 		} else {
1890 			hash_keys.addrs.v6addrs.src = fl6->saddr;
1891 			hash_keys.addrs.v6addrs.dst = fl6->daddr;
1892 			hash_keys.tags.flow_label = (__force u32)fl6->flowlabel;
1893 			hash_keys.basic.ip_proto = fl6->flowi6_proto;
1894 		}
1895 		break;
1896 	case 1:
1897 		if (skb) {
1898 			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1899 			struct flow_keys keys;
1900 
1901 			/* short-circuit if we already have L4 hash present */
1902 			if (skb->l4_hash)
1903 				return skb_get_hash_raw(skb) >> 1;
1904 
1905 			memset(&hash_keys, 0, sizeof(hash_keys));
1906 
1907                         if (!flkeys) {
1908 				skb_flow_dissect_flow_keys(skb, &keys, flag);
1909 				flkeys = &keys;
1910 			}
1911 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1912 			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
1913 			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
1914 			hash_keys.ports.src = flkeys->ports.src;
1915 			hash_keys.ports.dst = flkeys->ports.dst;
1916 			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
1917 		} else {
1918 			memset(&hash_keys, 0, sizeof(hash_keys));
1919 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1920 			hash_keys.addrs.v6addrs.src = fl6->saddr;
1921 			hash_keys.addrs.v6addrs.dst = fl6->daddr;
1922 			hash_keys.ports.src = fl6->fl6_sport;
1923 			hash_keys.ports.dst = fl6->fl6_dport;
1924 			hash_keys.basic.ip_proto = fl6->flowi6_proto;
1925 		}
1926 		break;
1927 	}
1928 	mhash = flow_hash_from_keys(&hash_keys);
1929 
1930 	return mhash >> 1;
1931 }
1932 
1933 void ip6_route_input(struct sk_buff *skb)
1934 {
1935 	const struct ipv6hdr *iph = ipv6_hdr(skb);
1936 	struct net *net = dev_net(skb->dev);
1937 	int flags = RT6_LOOKUP_F_HAS_SADDR;
1938 	struct ip_tunnel_info *tun_info;
1939 	struct flowi6 fl6 = {
1940 		.flowi6_iif = skb->dev->ifindex,
1941 		.daddr = iph->daddr,
1942 		.saddr = iph->saddr,
1943 		.flowlabel = ip6_flowinfo(iph),
1944 		.flowi6_mark = skb->mark,
1945 		.flowi6_proto = iph->nexthdr,
1946 	};
1947 	struct flow_keys *flkeys = NULL, _flkeys;
1948 
1949 	tun_info = skb_tunnel_info(skb);
1950 	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1951 		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
1952 
1953 	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
1954 		flkeys = &_flkeys;
1955 
1956 	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
1957 		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
1958 	skb_dst_drop(skb);
1959 	skb_dst_set(skb,
1960 		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
1961 }
1962 
1963 static struct rt6_info *ip6_pol_route_output(struct net *net,
1964 					     struct fib6_table *table,
1965 					     struct flowi6 *fl6,
1966 					     const struct sk_buff *skb,
1967 					     int flags)
1968 {
1969 	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
1970 }
1971 
1972 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
1973 					 struct flowi6 *fl6, int flags)
1974 {
1975 	bool any_src;
1976 
1977 	if (rt6_need_strict(&fl6->daddr)) {
1978 		struct dst_entry *dst;
1979 
1980 		dst = l3mdev_link_scope_lookup(net, fl6);
1981 		if (dst)
1982 			return dst;
1983 	}
1984 
1985 	fl6->flowi6_iif = LOOPBACK_IFINDEX;
1986 
1987 	any_src = ipv6_addr_any(&fl6->saddr);
1988 	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
1989 	    (fl6->flowi6_oif && any_src))
1990 		flags |= RT6_LOOKUP_F_IFACE;
1991 
1992 	if (!any_src)
1993 		flags |= RT6_LOOKUP_F_HAS_SADDR;
1994 	else if (sk)
1995 		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1996 
1997 	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
1998 }
1999 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
2000 
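/* Clone @dst_orig into a standalone blackhole dst: metrics and route
 * keys are copied, but input/output are wired to dst_discard*() so
 * anything sent through it is silently dropped. Used, for example, on
 * the xfrm path while packets must be held back until state resolves.
 */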
2001 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2002 {
2003 	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
2004 	struct net_device *loopback_dev = net->loopback_dev;
2005 	struct dst_entry *new = NULL;
2006 
2007 	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
2008 		       DST_OBSOLETE_DEAD, 0);
2009 	if (rt) {
2010 		rt6_info_init(rt);
2011 		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
2012 
2013 		new = &rt->dst;
2014 		new->__use = 1;
2015 		new->input = dst_discard;
2016 		new->output = dst_discard_out;
2017 
2018 		dst_copy_metrics(new, &ort->dst);
2019 
2020 		rt->rt6i_idev = in6_dev_get(loopback_dev);
2021 		rt->rt6i_gateway = ort->rt6i_gateway;
2022 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
2023 		rt->rt6i_metric = 0;
2024 
2025 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
2026 #ifdef CONFIG_IPV6_SUBTREES
2027 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
2028 #endif
2029 	}
2030 
2031 	dst_release(dst_orig);
2032 	return new ? new : ERR_PTR(-ENOMEM);
2033 }
2034 
2035 /*
2036  *	Destination cache support functions
2037  */
2038 
2039 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
2040 {
2041 	if (rt->from &&
2042 	    dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(&rt->from->dst))
2043 		dst_init_metrics(&rt->dst, dst_metrics_ptr(&rt->from->dst), true);
2044 }
2045 
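/* Cookie validation for dsts cached in sockets: a dst is still usable
 * only if the fib6 tree generation (sernum) captured at lookup time
 * still matches and the route has not expired. ip6_dst_check() below
 * routes cache clones through their parent ('from') entry.
 */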
2046 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
2047 {
2048 	u32 rt_cookie = 0;
2049 
2050 	if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie)
2051 		return NULL;
2052 
2053 	if (rt6_check_expired(rt))
2054 		return NULL;
2055 
2056 	return &rt->dst;
2057 }
2058 
2059 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
2060 {
2061 	if (!__rt6_check_expired(rt) &&
2062 	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2063 	    rt6_check(rt->from, cookie))
2064 		return &rt->dst;
2065 	else
2066 		return NULL;
2067 }
2068 
2069 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
2070 {
2071 	struct rt6_info *rt;
2072 
2073 	rt = (struct rt6_info *) dst;
2074 
2075 	/* All IPV6 dsts are created with ->obsolete set to the value
2076 	 * DST_OBSOLETE_FORCE_CHK, which always forces validation calls
2077 	 * down into this function.
2078 	 */
2079 
2080 	rt6_dst_from_metrics_check(rt);
2081 
2082 	if (rt->rt6i_flags & RTF_PCPU ||
2083 	    (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from))
2084 		return rt6_dst_from_check(rt, cookie);
2085 	else
2086 		return rt6_check(rt, cookie);
2087 }
2088 
2089 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
2090 {
2091 	struct rt6_info *rt = (struct rt6_info *) dst;
2092 
2093 	if (rt) {
2094 		if (rt->rt6i_flags & RTF_CACHE) {
2095 			if (rt6_check_expired(rt)) {
2096 				ip6_del_rt(rt);
2097 				dst = NULL;
2098 			}
2099 		} else {
2100 			dst_release(dst);
2101 			dst = NULL;
2102 		}
2103 	}
2104 	return dst;
2105 }
2106 
2107 static void ip6_link_failure(struct sk_buff *skb)
2108 {
2109 	struct rt6_info *rt;
2110 
2111 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2112 
2113 	rt = (struct rt6_info *) skb_dst(skb);
2114 	if (rt) {
2115 		if (rt->rt6i_flags & RTF_CACHE) {
2116 			if (dst_hold_safe(&rt->dst))
2117 				ip6_del_rt(rt);
2118 		} else {
2119 			struct fib6_node *fn;
2120 
2121 			rcu_read_lock();
2122 			fn = rcu_dereference(rt->rt6i_node);
2123 			if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2124 				fn->fn_sernum = -1;
2125 			rcu_read_unlock();
2126 		}
2127 	}
2128 }
2129 
2130 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2131 {
2132 	struct net *net = dev_net(rt->dst.dev);
2133 
2134 	rt->rt6i_flags |= RTF_MODIFIED;
2135 	rt->rt6i_pmtu = mtu;
2136 	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2137 }
2138 
2139 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2140 {
2141 	return !(rt->rt6i_flags & RTF_CACHE) &&
2142 		(rt->rt6i_flags & RTF_PCPU ||
2143 		 rcu_access_pointer(rt->rt6i_node));
2144 }
2145 
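/* Record a smaller PMTU for one destination. A route that is already
 * a cache clone is updated in place; for a fib/pcpu route an RTF_CACHE
 * exception clone is allocated and inserted instead, so the reduced
 * MTU stays scoped to this destination. The value is clamped to at
 * least IPV6_MIN_MTU (1280) and ignored if it would not shrink the
 * current path MTU.
 */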
2146 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2147 				 const struct ipv6hdr *iph, u32 mtu)
2148 {
2149 	const struct in6_addr *daddr, *saddr;
2150 	struct rt6_info *rt6 = (struct rt6_info *)dst;
2151 
2152 	if (rt6->rt6i_flags & RTF_LOCAL)
2153 		return;
2154 
2155 	if (dst_metric_locked(dst, RTAX_MTU))
2156 		return;
2157 
2158 	if (iph) {
2159 		daddr = &iph->daddr;
2160 		saddr = &iph->saddr;
2161 	} else if (sk) {
2162 		daddr = &sk->sk_v6_daddr;
2163 		saddr = &inet6_sk(sk)->saddr;
2164 	} else {
2165 		daddr = NULL;
2166 		saddr = NULL;
2167 	}
2168 	dst_confirm_neigh(dst, daddr);
2169 	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2170 	if (mtu >= dst_mtu(dst))
2171 		return;
2172 
2173 	if (!rt6_cache_allowed_for_pmtu(rt6)) {
2174 		rt6_do_update_pmtu(rt6, mtu);
2175 		/* update rt6_ex->stamp for cache */
2176 		if (rt6->rt6i_flags & RTF_CACHE)
2177 			rt6_update_exception_stamp_rt(rt6);
2178 	} else if (daddr) {
2179 		struct rt6_info *nrt6;
2180 
2181 		nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
2182 		if (nrt6) {
2183 			rt6_do_update_pmtu(nrt6, mtu);
2184 			if (rt6_insert_exception(nrt6, rt6))
2185 				dst_release_immediate(&nrt6->dst);
2186 		}
2187 	}
2188 }
2189 
2190 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2191 			       struct sk_buff *skb, u32 mtu)
2192 {
2193 	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2194 }
2195 
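/* Update the PMTU for the flow described by the header at skb->data;
 * used e.g. by tunnel drivers when a Packet Too Big indication arrives
 * for a packet we originated. Note that @mtu is passed in network byte
 * order (hence the ntohl() below).
 */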
2196 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2197 		     int oif, u32 mark, kuid_t uid)
2198 {
2199 	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2200 	struct dst_entry *dst;
2201 	struct flowi6 fl6;
2202 
2203 	memset(&fl6, 0, sizeof(fl6));
2204 	fl6.flowi6_oif = oif;
2205 	fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
2206 	fl6.daddr = iph->daddr;
2207 	fl6.saddr = iph->saddr;
2208 	fl6.flowlabel = ip6_flowinfo(iph);
2209 	fl6.flowi6_uid = uid;
2210 
2211 	dst = ip6_route_output(net, NULL, &fl6);
2212 	if (!dst->error)
2213 		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2214 	dst_release(dst);
2215 }
2216 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2217 
2218 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2219 {
2220 	struct dst_entry *dst;
2221 
2222 	ip6_update_pmtu(skb, sock_net(sk), mtu,
2223 			sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);
2224 
2225 	dst = __sk_dst_get(sk);
2226 	if (!dst || !dst->obsolete ||
2227 	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2228 		return;
2229 
2230 	bh_lock_sock(sk);
2231 	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2232 		ip6_datagram_dst_update(sk, false);
2233 	bh_unlock_sock(sk);
2234 }
2235 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2236 
2237 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
2238 			   const struct flowi6 *fl6)
2239 {
2240 #ifdef CONFIG_IPV6_SUBTREES
2241 	struct ipv6_pinfo *np = inet6_sk(sk);
2242 #endif
2243 
2244 	ip6_dst_store(sk, dst,
2245 		      ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
2246 		      &sk->sk_v6_daddr : NULL,
2247 #ifdef CONFIG_IPV6_SUBTREES
2248 		      ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
2249 		      &np->saddr :
2250 #endif
2251 		      NULL);
2252 }
2253 
2254 /* Handle redirects */
2255 struct ip6rd_flowi {
2256 	struct flowi6 fl6;
2257 	struct in6_addr gateway;
2258 };
2259 
2260 static struct rt6_info *__ip6_route_redirect(struct net *net,
2261 					     struct fib6_table *table,
2262 					     struct flowi6 *fl6,
2263 					     const struct sk_buff *skb,
2264 					     int flags)
2265 {
2266 	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2267 	struct rt6_info *rt, *rt_cache;
2268 	struct fib6_node *fn;
2269 
2270 	/* Get the "current" route for this destination and
2271 	 * check if the redirect has come from the appropriate router.
2272 	 *
2273 	 * RFC 4861 specifies that redirects should only be
2274 	 * accepted if they come from the nexthop to the target.
2275 	 * Due to the way the routes are chosen, this notion
2276 	 * is a bit fuzzy and one might need to check all possible
2277 	 * routes.
2278 	 */
2279 
2280 	rcu_read_lock();
2281 	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2282 restart:
2283 	for_each_fib6_node_rt_rcu(fn) {
2284 		if (rt->rt6i_nh_flags & RTNH_F_DEAD)
2285 			continue;
2286 		if (rt6_check_expired(rt))
2287 			continue;
2288 		if (rt->dst.error)
2289 			break;
2290 		if (!(rt->rt6i_flags & RTF_GATEWAY))
2291 			continue;
2292 		if (fl6->flowi6_oif != rt->dst.dev->ifindex)
2293 			continue;
2294 		/* rt_cache's gateway might be different from its 'parent'
2295 		 * in the case of an ip redirect.
2296 		 * So we keep searching in the exception table if the gateway
2297 		 * is different.
2298 		 */
2299 		if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) {
2300 			rt_cache = rt6_find_cached_rt(rt,
2301 						      &fl6->daddr,
2302 						      &fl6->saddr);
2303 			if (rt_cache &&
2304 			    ipv6_addr_equal(&rdfl->gateway,
2305 					    &rt_cache->rt6i_gateway)) {
2306 				rt = rt_cache;
2307 				break;
2308 			}
2309 			continue;
2310 		}
2311 		break;
2312 	}
2313 
2314 	if (!rt)
2315 		rt = net->ipv6.ip6_null_entry;
2316 	else if (rt->dst.error) {
2317 		rt = net->ipv6.ip6_null_entry;
2318 		goto out;
2319 	}
2320 
2321 	if (rt == net->ipv6.ip6_null_entry) {
2322 		fn = fib6_backtrack(fn, &fl6->saddr);
2323 		if (fn)
2324 			goto restart;
2325 	}
2326 
2327 out:
2328 	ip6_hold_safe(net, &rt, true);
2329 
2330 	rcu_read_unlock();
2331 
2332 	trace_fib6_table_lookup(net, rt, table, fl6);
2333 	return rt;
2334 }
2335 
2336 static struct dst_entry *ip6_route_redirect(struct net *net,
2337 					    const struct flowi6 *fl6,
2338 					    const struct sk_buff *skb,
2339 					    const struct in6_addr *gateway)
2340 {
2341 	int flags = RT6_LOOKUP_F_HAS_SADDR;
2342 	struct ip6rd_flowi rdfl;
2343 
2344 	rdfl.fl6 = *fl6;
2345 	rdfl.gateway = *gateway;
2346 
2347 	return fib6_rule_lookup(net, &rdfl.fl6, skb,
2348 				flags, __ip6_route_redirect);
2349 }
2350 
2351 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2352 		  kuid_t uid)
2353 {
2354 	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2355 	struct dst_entry *dst;
2356 	struct flowi6 fl6;
2357 
2358 	memset(&fl6, 0, sizeof(fl6));
2359 	fl6.flowi6_iif = LOOPBACK_IFINDEX;
2360 	fl6.flowi6_oif = oif;
2361 	fl6.flowi6_mark = mark;
2362 	fl6.daddr = iph->daddr;
2363 	fl6.saddr = iph->saddr;
2364 	fl6.flowlabel = ip6_flowinfo(iph);
2365 	fl6.flowi6_uid = uid;
2366 
2367 	dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2368 	rt6_do_redirect(dst, NULL, skb);
2369 	dst_release(dst);
2370 }
2371 EXPORT_SYMBOL_GPL(ip6_redirect);
2372 
2373 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2374 			    u32 mark)
2375 {
2376 	const struct ipv6hdr *iph = ipv6_hdr(skb);
2377 	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2378 	struct dst_entry *dst;
2379 	struct flowi6 fl6;
2380 
2381 	memset(&fl6, 0, sizeof(fl6));
2382 	fl6.flowi6_iif = LOOPBACK_IFINDEX;
2383 	fl6.flowi6_oif = oif;
2384 	fl6.flowi6_mark = mark;
2385 	fl6.daddr = msg->dest;
2386 	fl6.saddr = iph->daddr;
2387 	fl6.flowi6_uid = sock_net_uid(net, NULL);
2388 
2389 	dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2390 	rt6_do_redirect(dst, NULL, skb);
2391 	dst_release(dst);
2392 }
2393 
2394 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2395 {
2396 	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2397 		     sk->sk_uid);
2398 }
2399 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2400 
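/* Default advertised MSS: path MTU minus the fixed IPv6 and TCP
 * headers, floored by the ip6_rt_min_advmss sysctl. E.g. a 1500 byte
 * Ethernet MTU yields 1500 - 40 - 20 = 1440.
 */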
2401 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2402 {
2403 	struct net_device *dev = dst->dev;
2404 	unsigned int mtu = dst_mtu(dst);
2405 	struct net *net = dev_net(dev);
2406 
2407 	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2408 
2409 	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2410 		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2411 
2412 	/*
2413 	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2414 	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2415 	 * IPV6_MAXPLEN is also valid and means: "any MSS,
2416 	 * rely only on pmtu discovery"
2417 	 */
2418 	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2419 		mtu = IPV6_MAXPLEN;
2420 	return mtu;
2421 }
2422 
2423 static unsigned int ip6_mtu(const struct dst_entry *dst)
2424 {
2425 	const struct rt6_info *rt = (const struct rt6_info *)dst;
2426 	unsigned int mtu = rt->rt6i_pmtu;
2427 	struct inet6_dev *idev;
2428 
2429 	if (mtu)
2430 		goto out;
2431 
2432 	mtu = dst_metric_raw(dst, RTAX_MTU);
2433 	if (mtu)
2434 		goto out;
2435 
2436 	mtu = IPV6_MIN_MTU;
2437 
2438 	rcu_read_lock();
2439 	idev = __in6_dev_get(dst->dev);
2440 	if (idev)
2441 		mtu = idev->cnf.mtu6;
2442 	rcu_read_unlock();
2443 
2444 out:
2445 	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2446 
2447 	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2448 }
2449 
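/* Allocate a standalone host route for an ICMPv6/ND transmit. The dst
 * never enters the fib; it is tracked on uncached_list (so the device
 * reference can be dropped on ifdown) and run through xfrm_lookup()
 * before being returned.
 */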
2450 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2451 				  struct flowi6 *fl6)
2452 {
2453 	struct dst_entry *dst;
2454 	struct rt6_info *rt;
2455 	struct inet6_dev *idev = in6_dev_get(dev);
2456 	struct net *net = dev_net(dev);
2457 
2458 	if (unlikely(!idev))
2459 		return ERR_PTR(-ENODEV);
2460 
2461 	rt = ip6_dst_alloc(net, dev, 0);
2462 	if (unlikely(!rt)) {
2463 		in6_dev_put(idev);
2464 		dst = ERR_PTR(-ENOMEM);
2465 		goto out;
2466 	}
2467 
2468 	rt->dst.flags |= DST_HOST;
2469 	rt->dst.input = ip6_input;
2470 	rt->dst.output  = ip6_output;
2471 	rt->rt6i_gateway  = fl6->daddr;
2472 	rt->rt6i_dst.addr = fl6->daddr;
2473 	rt->rt6i_dst.plen = 128;
2474 	rt->rt6i_idev     = idev;
2475 	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2476 
2477 	/* Add this dst into uncached_list so that rt6_disable_ip() can
2478 	 * properly release the net_device
2479 	 */
2480 	rt6_uncached_list_add(rt);
2481 	atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2482 
2483 	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2484 
2485 out:
2486 	return dst;
2487 }
2488 
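/* dst garbage collection, invoked from dst_alloc() once gc_thresh is
 * crossed. GC is skipped while the last run is younger than
 * ip6_rt_gc_min_interval and the table is within ip6_rt_max_size;
 * otherwise fib6_run_gc() runs with an expiry that is decayed by
 * ip6_rt_gc_elasticity on every pass. A non-zero return makes the
 * pending dst allocation fail.
 */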
2489 static int ip6_dst_gc(struct dst_ops *ops)
2490 {
2491 	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2492 	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2493 	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2494 	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2495 	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2496 	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2497 	int entries;
2498 
2499 	entries = dst_entries_get_fast(ops);
2500 	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2501 	    entries <= rt_max_size)
2502 		goto out;
2503 
2504 	net->ipv6.ip6_rt_gc_expire++;
2505 	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2506 	entries = dst_entries_get_slow(ops);
2507 	if (entries < ops->gc_thresh)
2508 		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2509 out:
2510 	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2511 	return entries > rt_max_size;
2512 }
2513 
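/* Convert the RTA_METRICS nest from a netlink route request into the
 * RTAX_*-indexed array carried by mx6_config. RTAX_CC_ALGO arrives as
 * a congestion-control name and is mapped to a key (possibly setting
 * the ECN-CA feature bit); RTAX_HOPLIMIT is capped at 255. On success
 * the caller owns mxc->mx and must kfree() it.
 */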
2514 static int ip6_convert_metrics(struct mx6_config *mxc,
2515 			       const struct fib6_config *cfg)
2516 {
2517 	struct net *net = cfg->fc_nlinfo.nl_net;
2518 	bool ecn_ca = false;
2519 	struct nlattr *nla;
2520 	int remaining;
2521 	u32 *mp;
2522 
2523 	if (!cfg->fc_mx)
2524 		return 0;
2525 
2526 	mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
2527 	if (unlikely(!mp))
2528 		return -ENOMEM;
2529 
2530 	nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
2531 		int type = nla_type(nla);
2532 		u32 val;
2533 
2534 		if (!type)
2535 			continue;
2536 		if (unlikely(type > RTAX_MAX))
2537 			goto err;
2538 
2539 		if (type == RTAX_CC_ALGO) {
2540 			char tmp[TCP_CA_NAME_MAX];
2541 
2542 			nla_strlcpy(tmp, nla, sizeof(tmp));
2543 			val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca);
2544 			if (val == TCP_CA_UNSPEC)
2545 				goto err;
2546 		} else {
2547 			val = nla_get_u32(nla);
2548 		}
2549 		if (type == RTAX_HOPLIMIT && val > 255)
2550 			val = 255;
2551 		if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
2552 			goto err;
2553 
2554 		mp[type - 1] = val;
2555 		__set_bit(type - 1, mxc->mx_valid);
2556 	}
2557 
2558 	if (ecn_ca) {
2559 		__set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
2560 		mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
2561 	}
2562 
2563 	mxc->mx = mp;
2564 	return 0;
2565  err:
2566 	kfree(mp);
2567 	return -EINVAL;
2568 }
2569 
2570 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2571 					    struct fib6_config *cfg,
2572 					    const struct in6_addr *gw_addr,
2573 					    u32 tbid, int flags)
2574 {
2575 	struct flowi6 fl6 = {
2576 		.flowi6_oif = cfg->fc_ifindex,
2577 		.daddr = *gw_addr,
2578 		.saddr = cfg->fc_prefsrc,
2579 	};
2580 	struct fib6_table *table;
2581 	struct rt6_info *rt;
2582 
2583 	table = fib6_get_table(net, tbid);
2584 	if (!table)
2585 		return NULL;
2586 
2587 	if (!ipv6_addr_any(&cfg->fc_prefsrc))
2588 		flags |= RT6_LOOKUP_F_HAS_SADDR;
2589 
2590 	flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2591 	rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2592 
2593 	/* if table lookup failed, fall back to full lookup */
2594 	if (rt == net->ipv6.ip6_null_entry) {
2595 		ip6_rt_put(rt);
2596 		rt = NULL;
2597 	}
2598 
2599 	return rt;
2600 }
2601 
2602 static int ip6_route_check_nh_onlink(struct net *net,
2603 				     struct fib6_config *cfg,
2604 				     const struct net_device *dev,
2605 				     struct netlink_ext_ack *extack)
2606 {
2607 	u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2608 	const struct in6_addr *gw_addr = &cfg->fc_gateway;
2609 	u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2610 	struct rt6_info *grt;
2611 	int err;
2612 
2613 	err = 0;
2614 	grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2615 	if (grt) {
2616 		if (!grt->dst.error &&
2617 		    (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2618 			NL_SET_ERR_MSG(extack,
2619 				       "Nexthop has invalid gateway or device mismatch");
2620 			err = -EINVAL;
2621 		}
2622 
2623 		ip6_rt_put(grt);
2624 	}
2625 
2626 	return err;
2627 }
2628 
2629 static int ip6_route_check_nh(struct net *net,
2630 			      struct fib6_config *cfg,
2631 			      struct net_device **_dev,
2632 			      struct inet6_dev **idev)
2633 {
2634 	const struct in6_addr *gw_addr = &cfg->fc_gateway;
2635 	struct net_device *dev = _dev ? *_dev : NULL;
2636 	struct rt6_info *grt = NULL;
2637 	int err = -EHOSTUNREACH;
2638 
2639 	if (cfg->fc_table) {
2640 		int flags = RT6_LOOKUP_F_IFACE;
2641 
2642 		grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2643 					  cfg->fc_table, flags);
2644 		if (grt) {
2645 			if (grt->rt6i_flags & RTF_GATEWAY ||
2646 			    (dev && dev != grt->dst.dev)) {
2647 				ip6_rt_put(grt);
2648 				grt = NULL;
2649 			}
2650 		}
2651 	}
2652 
2653 	if (!grt)
2654 		grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
2655 
2656 	if (!grt)
2657 		goto out;
2658 
2659 	if (dev) {
2660 		if (dev != grt->dst.dev) {
2661 			ip6_rt_put(grt);
2662 			goto out;
2663 		}
2664 	} else {
2665 		*_dev = dev = grt->dst.dev;
2666 		*idev = grt->rt6i_idev;
2667 		dev_hold(dev);
2668 		in6_dev_hold(grt->rt6i_idev);
2669 	}
2670 
2671 	if (!(grt->rt6i_flags & RTF_GATEWAY))
2672 		err = 0;
2673 
2674 	ip6_rt_put(grt);
2675 
2676 out:
2677 	return err;
2678 }
2679 
2680 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
2681 			   struct net_device **_dev, struct inet6_dev **idev,
2682 			   struct netlink_ext_ack *extack)
2683 {
2684 	const struct in6_addr *gw_addr = &cfg->fc_gateway;
2685 	int gwa_type = ipv6_addr_type(gw_addr);
2686 	bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
2687 	const struct net_device *dev = *_dev;
2688 	bool need_addr_check = !dev;
2689 	int err = -EINVAL;
2690 
2691 	/* if gw_addr is local we will fail to detect this in case the
2692 	 * address is still TENTATIVE (DAD in progress). rt6_lookup() will
2693 	 * return the already-added prefix route via the interface that the
2694 	 * prefix route was assigned to, which might be non-loopback.
2695 	 */
2696 	if (dev &&
2697 	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2698 		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2699 		goto out;
2700 	}
2701 
2702 	if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
2703 		/* IPv6 strictly inhibits using non-link-local
2704 		 * addresses as nexthop addresses.
2705 		 * Otherwise, the router will not be able to send redirects.
2706 		 * It is very good, but in some (rare!) circumstances
2707 		 * (SIT, PtP, NBMA NOARP links) it is handy to allow
2708 		 * some exceptions. --ANK
2709 		 * We allow IPv4-mapped nexthops to support RFC4798-type
2710 		 * addressing
2711 		 */
2712 		if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
2713 			NL_SET_ERR_MSG(extack, "Invalid gateway address");
2714 			goto out;
2715 		}
2716 
2717 		if (cfg->fc_flags & RTNH_F_ONLINK)
2718 			err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
2719 		else
2720 			err = ip6_route_check_nh(net, cfg, _dev, idev);
2721 
2722 		if (err)
2723 			goto out;
2724 	}
2725 
2726 	/* reload in case device was changed */
2727 	dev = *_dev;
2728 
2729 	err = -EINVAL;
2730 	if (!dev) {
2731 		NL_SET_ERR_MSG(extack, "Egress device not specified");
2732 		goto out;
2733 	} else if (dev->flags & IFF_LOOPBACK) {
2734 		NL_SET_ERR_MSG(extack,
2735 			       "Egress device can not be loopback device for this route");
2736 		goto out;
2737 	}
2738 
2739 	/* if we did not check gw_addr above, do so now that the
2740 	 * egress device has been resolved.
2741 	 */
2742 	if (need_addr_check &&
2743 	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2744 		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2745 		goto out;
2746 	}
2747 
2748 	err = 0;
2749 out:
2750 	return err;
2751 }
2752 
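/* Build an rt6_info from a fib6_config without inserting it into the
 * fib; the caller (e.g. ip6_route_add() below) attaches metrics and
 * performs the insert via __ip6_ins_rt(). Validation failures are
 * reported through @extack and returned as an ERR_PTR().
 */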
2753 static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
2754 					      struct netlink_ext_ack *extack)
2755 {
2756 	struct net *net = cfg->fc_nlinfo.nl_net;
2757 	struct rt6_info *rt = NULL;
2758 	struct net_device *dev = NULL;
2759 	struct inet6_dev *idev = NULL;
2760 	struct fib6_table *table;
2761 	int addr_type;
2762 	int err = -EINVAL;
2763 
2764 	/* RTF_PCPU is an internal flag; can not be set by userspace */
2765 	if (cfg->fc_flags & RTF_PCPU) {
2766 		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2767 		goto out;
2768 	}
2769 
2770 	/* RTF_CACHE is an internal flag; can not be set by userspace */
2771 	if (cfg->fc_flags & RTF_CACHE) {
2772 		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
2773 		goto out;
2774 	}
2775 
2776 	if (cfg->fc_dst_len > 128) {
2777 		NL_SET_ERR_MSG(extack, "Invalid prefix length");
2778 		goto out;
2779 	}
2780 	if (cfg->fc_src_len > 128) {
2781 		NL_SET_ERR_MSG(extack, "Invalid source address length");
2782 		goto out;
2783 	}
2784 #ifndef CONFIG_IPV6_SUBTREES
2785 	if (cfg->fc_src_len) {
2786 		NL_SET_ERR_MSG(extack,
2787 			       "Specifying source address requires IPV6_SUBTREES to be enabled");
2788 		goto out;
2789 	}
2790 #endif
2791 	if (cfg->fc_ifindex) {
2792 		err = -ENODEV;
2793 		dev = dev_get_by_index(net, cfg->fc_ifindex);
2794 		if (!dev)
2795 			goto out;
2796 		idev = in6_dev_get(dev);
2797 		if (!idev)
2798 			goto out;
2799 	}
2800 
2801 	if (cfg->fc_metric == 0)
2802 		cfg->fc_metric = IP6_RT_PRIO_USER;
2803 
2804 	if (cfg->fc_flags & RTNH_F_ONLINK) {
2805 		if (!dev) {
2806 			NL_SET_ERR_MSG(extack,
2807 				       "Nexthop device required for onlink");
2808 			err = -ENODEV;
2809 			goto out;
2810 		}
2811 
2812 		if (!(dev->flags & IFF_UP)) {
2813 			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
2814 			err = -ENETDOWN;
2815 			goto out;
2816 		}
2817 	}
2818 
2819 	err = -ENOBUFS;
2820 	if (cfg->fc_nlinfo.nlh &&
2821 	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
2822 		table = fib6_get_table(net, cfg->fc_table);
2823 		if (!table) {
2824 			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
2825 			table = fib6_new_table(net, cfg->fc_table);
2826 		}
2827 	} else {
2828 		table = fib6_new_table(net, cfg->fc_table);
2829 	}
2830 
2831 	if (!table)
2832 		goto out;
2833 
2834 	rt = ip6_dst_alloc(net, NULL,
2835 			   (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
2836 
2837 	if (!rt) {
2838 		err = -ENOMEM;
2839 		goto out;
2840 	}
2841 
2842 	if (cfg->fc_flags & RTF_EXPIRES)
2843 		rt6_set_expires(rt, jiffies +
2844 				clock_t_to_jiffies(cfg->fc_expires));
2845 	else
2846 		rt6_clean_expires(rt);
2847 
2848 	if (cfg->fc_protocol == RTPROT_UNSPEC)
2849 		cfg->fc_protocol = RTPROT_BOOT;
2850 	rt->rt6i_protocol = cfg->fc_protocol;
2851 
2852 	addr_type = ipv6_addr_type(&cfg->fc_dst);
2853 
2854 	if (addr_type & IPV6_ADDR_MULTICAST)
2855 		rt->dst.input = ip6_mc_input;
2856 	else if (cfg->fc_flags & RTF_LOCAL)
2857 		rt->dst.input = ip6_input;
2858 	else
2859 		rt->dst.input = ip6_forward;
2860 
2861 	rt->dst.output = ip6_output;
2862 
2863 	if (cfg->fc_encap) {
2864 		struct lwtunnel_state *lwtstate;
2865 
2866 		err = lwtunnel_build_state(cfg->fc_encap_type,
2867 					   cfg->fc_encap, AF_INET6, cfg,
2868 					   &lwtstate, extack);
2869 		if (err)
2870 			goto out;
2871 		rt->dst.lwtstate = lwtstate_get(lwtstate);
2872 		lwtunnel_set_redirect(&rt->dst);
2873 	}
2874 
2875 	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
2876 	rt->rt6i_dst.plen = cfg->fc_dst_len;
2877 	if (rt->rt6i_dst.plen == 128)
2878 		rt->dst.flags |= DST_HOST;
2879 
2880 #ifdef CONFIG_IPV6_SUBTREES
2881 	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
2882 	rt->rt6i_src.plen = cfg->fc_src_len;
2883 #endif
2884 
2885 	rt->rt6i_metric = cfg->fc_metric;
2886 	rt->rt6i_nh_weight = 1;
2887 
2888 	/* We cannot add true routes via loopback here;
2889 	   they would result in kernel looping. Promote them to reject routes.
2890 	 */
2891 	if ((cfg->fc_flags & RTF_REJECT) ||
2892 	    (dev && (dev->flags & IFF_LOOPBACK) &&
2893 	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
2894 	     !(cfg->fc_flags & RTF_LOCAL))) {
2895 		/* hold loopback dev/idev if we haven't done so. */
2896 		if (dev != net->loopback_dev) {
2897 			if (dev) {
2898 				dev_put(dev);
2899 				in6_dev_put(idev);
2900 			}
2901 			dev = net->loopback_dev;
2902 			dev_hold(dev);
2903 			idev = in6_dev_get(dev);
2904 			if (!idev) {
2905 				err = -ENODEV;
2906 				goto out;
2907 			}
2908 		}
2909 		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
2910 		switch (cfg->fc_type) {
2911 		case RTN_BLACKHOLE:
2912 			rt->dst.error = -EINVAL;
2913 			rt->dst.output = dst_discard_out;
2914 			rt->dst.input = dst_discard;
2915 			break;
2916 		case RTN_PROHIBIT:
2917 			rt->dst.error = -EACCES;
2918 			rt->dst.output = ip6_pkt_prohibit_out;
2919 			rt->dst.input = ip6_pkt_prohibit;
2920 			break;
2921 		case RTN_THROW:
2922 		case RTN_UNREACHABLE:
2923 		default:
2924 			rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
2925 					: (cfg->fc_type == RTN_UNREACHABLE)
2926 					? -EHOSTUNREACH : -ENETUNREACH;
2927 			rt->dst.output = ip6_pkt_discard_out;
2928 			rt->dst.input = ip6_pkt_discard;
2929 			break;
2930 		}
2931 		goto install_route;
2932 	}
2933 
2934 	if (cfg->fc_flags & RTF_GATEWAY) {
2935 		err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
2936 		if (err)
2937 			goto out;
2938 
2939 		rt->rt6i_gateway = cfg->fc_gateway;
2940 	}
2941 
2942 	err = -ENODEV;
2943 	if (!dev)
2944 		goto out;
2945 
2946 	if (idev->cnf.disable_ipv6) {
2947 		NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
2948 		err = -EACCES;
2949 		goto out;
2950 	}
2951 
2952 	if (!(dev->flags & IFF_UP)) {
2953 		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
2954 		err = -ENETDOWN;
2955 		goto out;
2956 	}
2957 
2958 	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
2959 		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
2960 			NL_SET_ERR_MSG(extack, "Invalid source address");
2961 			err = -EINVAL;
2962 			goto out;
2963 		}
2964 		rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
2965 		rt->rt6i_prefsrc.plen = 128;
2966 	} else
2967 		rt->rt6i_prefsrc.plen = 0;
2968 
2969 	rt->rt6i_flags = cfg->fc_flags;
2970 
2971 install_route:
2972 	if (!(rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
2973 	    !netif_carrier_ok(dev))
2974 		rt->rt6i_nh_flags |= RTNH_F_LINKDOWN;
2975 	rt->rt6i_nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
2976 	rt->dst.dev = dev;
2977 	rt->rt6i_idev = idev;
2978 	rt->rt6i_table = table;
2979 
2980 	cfg->fc_nlinfo.nl_net = dev_net(dev);
2981 
2982 	return rt;
2983 out:
2984 	if (dev)
2985 		dev_put(dev);
2986 	if (idev)
2987 		in6_dev_put(idev);
2988 	if (rt)
2989 		dst_release_immediate(&rt->dst);
2990 
2991 	return ERR_PTR(err);
2992 }
2993 
2994 int ip6_route_add(struct fib6_config *cfg,
2995 		  struct netlink_ext_ack *extack)
2996 {
2997 	struct mx6_config mxc = { .mx = NULL, };
2998 	struct rt6_info *rt;
2999 	int err;
3000 
3001 	rt = ip6_route_info_create(cfg, extack);
3002 	if (IS_ERR(rt)) {
3003 		err = PTR_ERR(rt);
3004 		rt = NULL;
3005 		goto out;
3006 	}
3007 
3008 	err = ip6_convert_metrics(&mxc, cfg);
3009 	if (err)
3010 		goto out;
3011 
3012 	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack);
3013 
3014 	kfree(mxc.mx);
3015 
3016 	return err;
3017 out:
3018 	if (rt)
3019 		dst_release_immediate(&rt->dst);
3020 
3021 	return err;
3022 }
3023 
3024 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
3025 {
3026 	int err;
3027 	struct fib6_table *table;
3028 	struct net *net = dev_net(rt->dst.dev);
3029 
3030 	if (rt == net->ipv6.ip6_null_entry) {
3031 		err = -ENOENT;
3032 		goto out;
3033 	}
3034 
3035 	table = rt->rt6i_table;
3036 	spin_lock_bh(&table->tb6_lock);
3037 	err = fib6_del(rt, info);
3038 	spin_unlock_bh(&table->tb6_lock);
3039 
3040 out:
3041 	ip6_rt_put(rt);
3042 	return err;
3043 }
3044 
3045 int ip6_del_rt(struct rt6_info *rt)
3046 {
3047 	struct nl_info info = {
3048 		.nl_net = dev_net(rt->dst.dev),
3049 	};
3050 	return __ip6_del_rt(rt, &info);
3051 }
3052 
3053 static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
3054 {
3055 	struct nl_info *info = &cfg->fc_nlinfo;
3056 	struct net *net = info->nl_net;
3057 	struct sk_buff *skb = NULL;
3058 	struct fib6_table *table;
3059 	int err = -ENOENT;
3060 
3061 	if (rt == net->ipv6.ip6_null_entry)
3062 		goto out_put;
3063 	table = rt->rt6i_table;
3064 	spin_lock_bh(&table->tb6_lock);
3065 
3066 	if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
3067 		struct rt6_info *sibling, *next_sibling;
3068 
3069 		/* prefer to send a single notification with all hops */
3070 		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3071 		if (skb) {
3072 			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3073 
3074 			if (rt6_fill_node(net, skb, rt,
3075 					  NULL, NULL, 0, RTM_DELROUTE,
3076 					  info->portid, seq, 0) < 0) {
3077 				kfree_skb(skb);
3078 				skb = NULL;
3079 			} else
3080 				info->skip_notify = 1;
3081 		}
3082 
3083 		list_for_each_entry_safe(sibling, next_sibling,
3084 					 &rt->rt6i_siblings,
3085 					 rt6i_siblings) {
3086 			err = fib6_del(sibling, info);
3087 			if (err)
3088 				goto out_unlock;
3089 		}
3090 	}
3091 
3092 	err = fib6_del(rt, info);
3093 out_unlock:
3094 	spin_unlock_bh(&table->tb6_lock);
3095 out_put:
3096 	ip6_rt_put(rt);
3097 
3098 	if (skb) {
3099 		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3100 			    info->nlh, gfp_any());
3101 	}
3102 	return err;
3103 }
3104 
3105 static int ip6_route_del(struct fib6_config *cfg,
3106 			 struct netlink_ext_ack *extack)
3107 {
3108 	struct rt6_info *rt, *rt_cache;
3109 	struct fib6_table *table;
3110 	struct fib6_node *fn;
3111 	int err = -ESRCH;
3112 
3113 	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3114 	if (!table) {
3115 		NL_SET_ERR_MSG(extack, "FIB table does not exist");
3116 		return err;
3117 	}
3118 
3119 	rcu_read_lock();
3120 
3121 	fn = fib6_locate(&table->tb6_root,
3122 			 &cfg->fc_dst, cfg->fc_dst_len,
3123 			 &cfg->fc_src, cfg->fc_src_len,
3124 			 !(cfg->fc_flags & RTF_CACHE));
3125 
3126 	if (fn) {
3127 		for_each_fib6_node_rt_rcu(fn) {
3128 			if (cfg->fc_flags & RTF_CACHE) {
3129 				rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
3130 							      &cfg->fc_src);
3131 				if (!rt_cache)
3132 					continue;
3133 				rt = rt_cache;
3134 			}
3135 			if (cfg->fc_ifindex &&
3136 			    (!rt->dst.dev ||
3137 			     rt->dst.dev->ifindex != cfg->fc_ifindex))
3138 				continue;
3139 			if (cfg->fc_flags & RTF_GATEWAY &&
3140 			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3141 				continue;
3142 			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
3143 				continue;
3144 			if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
3145 				continue;
3146 			if (!dst_hold_safe(&rt->dst))
3147 				break;
3148 			rcu_read_unlock();
3149 
3150 			/* if a gateway was specified, only delete that one hop */
3151 			if (cfg->fc_flags & RTF_GATEWAY)
3152 				return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3153 
3154 			return __ip6_del_rt_siblings(rt, cfg);
3155 		}
3156 	}
3157 	rcu_read_unlock();
3158 
3159 	return err;
3160 }
3161 
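/* Process an incoming ICMPv6 redirect (RFC 4861, section 8): validate
 * the message and its ND options, update the neighbour cache for the
 * new first hop, then install an RTF_CACHE exception clone whose
 * gateway is msg->target (the gateway flag is cleared again for
 * on-link redirects, where target == destination).
 */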
3162 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3163 {
3164 	struct netevent_redirect netevent;
3165 	struct rt6_info *rt, *nrt = NULL;
3166 	struct ndisc_options ndopts;
3167 	struct inet6_dev *in6_dev;
3168 	struct neighbour *neigh;
3169 	struct rd_msg *msg;
3170 	int optlen, on_link;
3171 	u8 *lladdr;
3172 
3173 	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3174 	optlen -= sizeof(*msg);
3175 
3176 	if (optlen < 0) {
3177 		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3178 		return;
3179 	}
3180 
3181 	msg = (struct rd_msg *)icmp6_hdr(skb);
3182 
3183 	if (ipv6_addr_is_multicast(&msg->dest)) {
3184 		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3185 		return;
3186 	}
3187 
3188 	on_link = 0;
3189 	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3190 		on_link = 1;
3191 	} else if (ipv6_addr_type(&msg->target) !=
3192 		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3193 		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3194 		return;
3195 	}
3196 
3197 	in6_dev = __in6_dev_get(skb->dev);
3198 	if (!in6_dev)
3199 		return;
3200 	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3201 		return;
3202 
3203 	/* RFC2461 8.1:
3204 	 *	The IP source address of the Redirect MUST be the same as the current
3205 	 *	first-hop router for the specified ICMP Destination Address.
3206 	 */
3207 
3208 	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3209 		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3210 		return;
3211 	}
3212 
3213 	lladdr = NULL;
3214 	if (ndopts.nd_opts_tgt_lladdr) {
3215 		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3216 					     skb->dev);
3217 		if (!lladdr) {
3218 			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3219 			return;
3220 		}
3221 	}
3222 
3223 	rt = (struct rt6_info *) dst;
3224 	if (rt->rt6i_flags & RTF_REJECT) {
3225 		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3226 		return;
3227 	}
3228 
3229 	/* Redirect received -> path was valid.
3230 	 * Look, redirects are sent only in response to data packets,
3231 	 * so this nexthop is apparently reachable. --ANK
3232 	 */
3233 	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3234 
3235 	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3236 	if (!neigh)
3237 		return;
3238 
3239 	/*
3240 	 *	We have finally decided to accept it.
3241 	 */
3242 
3243 	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3244 		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
3245 		     NEIGH_UPDATE_F_OVERRIDE|
3246 		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3247 				     NEIGH_UPDATE_F_ISROUTER)),
3248 		     NDISC_REDIRECT, &ndopts);
3249 
3250 	nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
3251 	if (!nrt)
3252 		goto out;
3253 
3254 	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3255 	if (on_link)
3256 		nrt->rt6i_flags &= ~RTF_GATEWAY;
3257 
3258 	nrt->rt6i_protocol = RTPROT_REDIRECT;
3259 	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3260 
3261 	/* No need to remove rt from the exception table if rt is
3262 	 * a cached route because rt6_insert_exception() will
3263 	 * take care of it
3264 	 */
3265 	if (rt6_insert_exception(nrt, rt)) {
3266 		dst_release_immediate(&nrt->dst);
3267 		goto out;
3268 	}
3269 
3270 	netevent.old = &rt->dst;
3271 	netevent.new = &nrt->dst;
3272 	netevent.daddr = &msg->dest;
3273 	netevent.neigh = neigh;
3274 	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3275 
3276 out:
3277 	neigh_release(neigh);
3278 }
3279 
3280 /*
3281  *	Misc support functions
3282  */
3283 
3284 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
3285 {
3286 	BUG_ON(from->from);
3287 
3288 	rt->rt6i_flags &= ~RTF_EXPIRES;
3289 	dst_hold(&from->dst);
3290 	rt->from = from;
3291 	dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
3292 }
3293 
3294 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
3295 {
3296 	rt->dst.input = ort->dst.input;
3297 	rt->dst.output = ort->dst.output;
3298 	rt->rt6i_dst = ort->rt6i_dst;
3299 	rt->dst.error = ort->dst.error;
3300 	rt->rt6i_idev = ort->rt6i_idev;
3301 	if (rt->rt6i_idev)
3302 		in6_dev_hold(rt->rt6i_idev);
3303 	rt->dst.lastuse = jiffies;
3304 	rt->rt6i_gateway = ort->rt6i_gateway;
3305 	rt->rt6i_flags = ort->rt6i_flags;
3306 	rt6_set_from(rt, ort);
3307 	rt->rt6i_metric = ort->rt6i_metric;
3308 #ifdef CONFIG_IPV6_SUBTREES
3309 	rt->rt6i_src = ort->rt6i_src;
3310 #endif
3311 	rt->rt6i_prefsrc = ort->rt6i_prefsrc;
3312 	rt->rt6i_table = ort->rt6i_table;
3313 	rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
3314 }
3315 
3316 #ifdef CONFIG_IPV6_ROUTE_INFO
3317 static struct rt6_info *rt6_get_route_info(struct net *net,
3318 					   const struct in6_addr *prefix, int prefixlen,
3319 					   const struct in6_addr *gwaddr,
3320 					   struct net_device *dev)
3321 {
3322 	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3323 	int ifindex = dev->ifindex;
3324 	struct fib6_node *fn;
3325 	struct rt6_info *rt = NULL;
3326 	struct fib6_table *table;
3327 
3328 	table = fib6_get_table(net, tb_id);
3329 	if (!table)
3330 		return NULL;
3331 
3332 	rcu_read_lock();
3333 	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3334 	if (!fn)
3335 		goto out;
3336 
3337 	for_each_fib6_node_rt_rcu(fn) {
3338 		if (rt->dst.dev->ifindex != ifindex)
3339 			continue;
3340 		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3341 			continue;
3342 		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
3343 			continue;
3344 		ip6_hold_safe(NULL, &rt, false);
3345 		break;
3346 	}
3347 out:
3348 	rcu_read_unlock();
3349 	return rt;
3350 }
3351 
3352 static struct rt6_info *rt6_add_route_info(struct net *net,
3353 					   const struct in6_addr *prefix, int prefixlen,
3354 					   const struct in6_addr *gwaddr,
3355 					   struct net_device *dev,
3356 					   unsigned int pref)
3357 {
3358 	struct fib6_config cfg = {
3359 		.fc_metric	= IP6_RT_PRIO_USER,
3360 		.fc_ifindex	= dev->ifindex,
3361 		.fc_dst_len	= prefixlen,
3362 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3363 				  RTF_UP | RTF_PREF(pref),
3364 		.fc_protocol = RTPROT_RA,
3365 		.fc_nlinfo.portid = 0,
3366 		.fc_nlinfo.nlh = NULL,
3367 		.fc_nlinfo.nl_net = net,
3368 	};
3369 
3370 	cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
3371 	cfg.fc_dst = *prefix;
3372 	cfg.fc_gateway = *gwaddr;
3373 
3374 	/* We should treat it as a default route if prefix length is 0. */
3375 	if (!prefixlen)
3376 		cfg.fc_flags |= RTF_DEFAULT;
3377 
3378 	ip6_route_add(&cfg, NULL);
3379 
3380 	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3381 }
3382 #endif
3383 
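/* Default routers learned from Router Advertisements live in
 * RT6_TABLE_DFLT (or the l3mdev table) and are flagged
 * RTF_ADDRCONF | RTF_DEFAULT; the helpers below look such a router up,
 * add one, and purge them again, e.g. when accept_ra changes.
 */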
3384 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
3385 {
3386 	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3387 	struct rt6_info *rt;
3388 	struct fib6_table *table;
3389 
3390 	table = fib6_get_table(dev_net(dev), tb_id);
3391 	if (!table)
3392 		return NULL;
3393 
3394 	rcu_read_lock();
3395 	for_each_fib6_node_rt_rcu(&table->tb6_root) {
3396 		if (dev == rt->dst.dev &&
3397 		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3398 		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
3399 			break;
3400 	}
3401 	if (rt)
3402 		ip6_hold_safe(NULL, &rt, false);
3403 	rcu_read_unlock();
3404 	return rt;
3405 }
3406 
3407 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
3408 				     struct net_device *dev,
3409 				     unsigned int pref)
3410 {
3411 	struct fib6_config cfg = {
3412 		.fc_table	= l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3413 		.fc_metric	= IP6_RT_PRIO_USER,
3414 		.fc_ifindex	= dev->ifindex,
3415 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3416 				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3417 		.fc_protocol = RTPROT_RA,
3418 		.fc_nlinfo.portid = 0,
3419 		.fc_nlinfo.nlh = NULL,
3420 		.fc_nlinfo.nl_net = dev_net(dev),
3421 	};
3422 
3423 	cfg.fc_gateway = *gwaddr;
3424 
3425 	if (!ip6_route_add(&cfg, NULL)) {
3426 		struct fib6_table *table;
3427 
3428 		table = fib6_get_table(dev_net(dev), cfg.fc_table);
3429 		if (table)
3430 			table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3431 	}
3432 
3433 	return rt6_get_dflt_router(gwaddr, dev);
3434 }
3435 
3436 static void __rt6_purge_dflt_routers(struct fib6_table *table)
3437 {
3438 	struct rt6_info *rt;
3439 
3440 restart:
3441 	rcu_read_lock();
3442 	for_each_fib6_node_rt_rcu(&table->tb6_root) {
3443 		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3444 		    (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
3445 			if (dst_hold_safe(&rt->dst)) {
3446 				rcu_read_unlock();
3447 				ip6_del_rt(rt);
3448 			} else {
3449 				rcu_read_unlock();
3450 			}
3451 			goto restart;
3452 		}
3453 	}
3454 	rcu_read_unlock();
3455 
3456 	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3457 }
3458 
3459 void rt6_purge_dflt_routers(struct net *net)
3460 {
3461 	struct fib6_table *table;
3462 	struct hlist_head *head;
3463 	unsigned int h;
3464 
3465 	rcu_read_lock();
3466 
3467 	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3468 		head = &net->ipv6.fib_table_hash[h];
3469 		hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3470 			if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3471 				__rt6_purge_dflt_routers(table);
3472 		}
3473 	}
3474 
3475 	rcu_read_unlock();
3476 }
3477 
3478 static void rtmsg_to_fib6_config(struct net *net,
3479 				 struct in6_rtmsg *rtmsg,
3480 				 struct fib6_config *cfg)
3481 {
3482 	memset(cfg, 0, sizeof(*cfg));
3483 
3484 	cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3485 			 : RT6_TABLE_MAIN;
3486 	cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3487 	cfg->fc_metric = rtmsg->rtmsg_metric;
3488 	cfg->fc_expires = rtmsg->rtmsg_info;
3489 	cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3490 	cfg->fc_src_len = rtmsg->rtmsg_src_len;
3491 	cfg->fc_flags = rtmsg->rtmsg_flags;
3492 
3493 	cfg->fc_nlinfo.nl_net = net;
3494 
3495 	cfg->fc_dst = rtmsg->rtmsg_dst;
3496 	cfg->fc_src = rtmsg->rtmsg_src;
3497 	cfg->fc_gateway = rtmsg->rtmsg_gateway;
3498 }
3499 
3500 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
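/* Legacy route ioctls. A minimal userspace sketch (illustrative only,
 * not part of this file) of adding a route through this path, assuming
 * an AF_INET6 datagram socket in fd:
 *
 *	struct in6_rtmsg rm = {};
 *
 *	inet_pton(AF_INET6, "2001:db8::", &rm.rtmsg_dst);
 *	rm.rtmsg_dst_len = 64;
 *	rm.rtmsg_ifindex = if_nametoindex("eth0");
 *	rm.rtmsg_flags = RTF_UP;
 *	rm.rtmsg_metric = 1;
 *	ioctl(fd, SIOCADDRT, &rm);
 *
 * New code should prefer rtnetlink (RTM_NEWROUTE/RTM_DELROUTE).
 */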
3501 {
3502 	struct fib6_config cfg;
3503 	struct in6_rtmsg rtmsg;
3504 	int err;
3505 
3506 	switch (cmd) {
3507 	case SIOCADDRT:		/* Add a route */
3508 	case SIOCDELRT:		/* Delete a route */
3509 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3510 			return -EPERM;
3511 		err = copy_from_user(&rtmsg, arg,
3512 				     sizeof(struct in6_rtmsg));
3513 		if (err)
3514 			return -EFAULT;
3515 
3516 		rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3517 
3518 		rtnl_lock();
3519 		switch (cmd) {
3520 		case SIOCADDRT:
3521 			err = ip6_route_add(&cfg, NULL);
3522 			break;
3523 		case SIOCDELRT:
3524 			err = ip6_route_del(&cfg, NULL);
3525 			break;
3526 		default:
3527 			err = -EINVAL;
3528 		}
3529 		rtnl_unlock();
3530 
3531 		return err;
3532 	}
3533 
3534 	return -EINVAL;
3535 }
3536 
3537 /*
3538  *	Drop the packet on the floor
3539  */
3540 
3541 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3542 {
3543 	int type;
3544 	struct dst_entry *dst = skb_dst(skb);
3545 	switch (ipstats_mib_noroutes) {
3546 	case IPSTATS_MIB_INNOROUTES:
3547 		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3548 		if (type == IPV6_ADDR_ANY) {
3549 			IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3550 				      IPSTATS_MIB_INADDRERRORS);
3551 			break;
3552 		}
3553 		/* FALLTHROUGH */
3554 	case IPSTATS_MIB_OUTNOROUTES:
3555 		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3556 			      ipstats_mib_noroutes);
3557 		break;
3558 	}
3559 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3560 	kfree_skb(skb);
3561 	return 0;
3562 }
3563 
3564 static int ip6_pkt_discard(struct sk_buff *skb)
3565 {
3566 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3567 }
3568 
3569 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3570 {
3571 	skb->dev = skb_dst(skb)->dev;
3572 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3573 }
3574 
3575 static int ip6_pkt_prohibit(struct sk_buff *skb)
3576 {
3577 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3578 }
3579 
3580 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3581 {
3582 	skb->dev = skb_dst(skb)->dev;
3583 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3584 }
3585 
3586 /*
3587  *	Allocate a dst for local (unicast / anycast) address.
3588  */
3589 
3590 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
3591 				    const struct in6_addr *addr,
3592 				    bool anycast)
3593 {
3594 	u32 tb_id;
3595 	struct net *net = dev_net(idev->dev);
3596 	struct net_device *dev = idev->dev;
3597 	struct rt6_info *rt;
3598 
3599 	rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
3600 	if (!rt)
3601 		return ERR_PTR(-ENOMEM);
3602 
3603 	in6_dev_hold(idev);
3604 
3605 	rt->dst.flags |= DST_HOST;
3606 	rt->dst.input = ip6_input;
3607 	rt->dst.output = ip6_output;
3608 	rt->rt6i_idev = idev;
3609 
3610 	rt->rt6i_protocol = RTPROT_KERNEL;
3611 	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
3612 	if (anycast)
3613 		rt->rt6i_flags |= RTF_ANYCAST;
3614 	else
3615 		rt->rt6i_flags |= RTF_LOCAL;
3616 
3617 	rt->rt6i_gateway  = *addr;
3618 	rt->rt6i_dst.addr = *addr;
3619 	rt->rt6i_dst.plen = 128;
3620 	tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3621 	rt->rt6i_table = fib6_get_table(net, tb_id);
3622 
3623 	return rt;
3624 }
3625 
3626 /* remove deleted ip from prefsrc entries */
3627 struct arg_dev_net_ip {
3628 	struct net_device *dev;
3629 	struct net *net;
3630 	struct in6_addr *addr;
3631 };
3632 
3633 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
3634 {
3635 	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3636 	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3637 	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3638 
3639 	if (((void *)rt->dst.dev == dev || !dev) &&
3640 	    rt != net->ipv6.ip6_null_entry &&
3641 	    ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
3642 		spin_lock_bh(&rt6_exception_lock);
3643 		/* remove prefsrc entry */
3644 		rt->rt6i_prefsrc.plen = 0;
3645 		/* need to update cache as well */
3646 		rt6_exceptions_remove_prefsrc(rt);
3647 		spin_unlock_bh(&rt6_exception_lock);
3648 	}
3649 	return 0;
3650 }
3651 
3652 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3653 {
3654 	struct net *net = dev_net(ifp->idev->dev);
3655 	struct arg_dev_net_ip adni = {
3656 		.dev = ifp->idev->dev,
3657 		.net = net,
3658 		.addr = &ifp->addr,
3659 	};
3660 	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3661 }
3662 
3663 #define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3664 
3665 /* Remove routers and update dst entries when a gateway turns into a host. */
3666 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
3667 {
3668 	struct in6_addr *gateway = (struct in6_addr *)arg;
3669 
3670 	if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3671 	    ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
3672 		return -1;
3673 	}
3674 
3675 	/* Further clean up cached routes in exception table.
3676 	 * This is needed because cached route may have a different
3677 	 * gateway than its 'parent' in the case of an ip redirect.
3678 	 */
3679 	rt6_exceptions_clean_tohost(rt, gateway);
3680 
3681 	return 0;
3682 }
3683 
3684 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3685 {
3686 	fib6_clean_all(net, fib6_clean_tohost, gateway);
3687 }
3688 
3689 struct arg_netdev_event {
3690 	const struct net_device *dev;
3691 	union {
3692 		unsigned int nh_flags;
3693 		unsigned long event;
3694 	};
3695 };
3696 
3697 static struct rt6_info *rt6_multipath_first_sibling(const struct rt6_info *rt)
3698 {
3699 	struct rt6_info *iter;
3700 	struct fib6_node *fn;
3701 
3702 	fn = rcu_dereference_protected(rt->rt6i_node,
3703 			lockdep_is_held(&rt->rt6i_table->tb6_lock));
3704 	iter = rcu_dereference_protected(fn->leaf,
3705 			lockdep_is_held(&rt->rt6i_table->tb6_lock));
3706 	while (iter) {
3707 		if (iter->rt6i_metric == rt->rt6i_metric &&
3708 		    rt6_qualify_for_ecmp(iter))
3709 			return iter;
3710 		iter = rcu_dereference_protected(iter->rt6_next,
3711 				lockdep_is_held(&rt->rt6i_table->tb6_lock));
3712 	}
3713 
3714 	return NULL;
3715 }
3716 
3717 static bool rt6_is_dead(const struct rt6_info *rt)
3718 {
3719 	if (rt->rt6i_nh_flags & RTNH_F_DEAD ||
3720 	    (rt->rt6i_nh_flags & RTNH_F_LINKDOWN &&
3721 	     rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
3722 		return true;
3723 
3724 	return false;
3725 }
3726 
3727 static int rt6_multipath_total_weight(const struct rt6_info *rt)
3728 {
3729 	struct rt6_info *iter;
3730 	int total = 0;
3731 
3732 	if (!rt6_is_dead(rt))
3733 		total += rt->rt6i_nh_weight;
3734 
3735 	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) {
3736 		if (!rt6_is_dead(iter))
3737 			total += iter->rt6i_nh_weight;
3738 	}
3739 
3740 	return total;
3741 }
3742 
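/* Hash upper bound for one nexthop: with W the cumulative weight so
 * far and T the total, the bound is round((W << 31) / T) - 1, giving
 * each live nexthop a share of the 31-bit hash space proportional to
 * its weight. E.g. weights 1 and 2 (T = 3) yield bounds 0x2aaaaaaa and
 * 0x7fffffff, so roughly one third of flows map to the first nexthop.
 * Dead nexthops get -1 and are never selected.
 */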
3743 static void rt6_upper_bound_set(struct rt6_info *rt, int *weight, int total)
3744 {
3745 	int upper_bound = -1;
3746 
3747 	if (!rt6_is_dead(rt)) {
3748 		*weight += rt->rt6i_nh_weight;
3749 		upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3750 						    total) - 1;
3751 	}
3752 	atomic_set(&rt->rt6i_nh_upper_bound, upper_bound);
3753 }
3754 
3755 static void rt6_multipath_upper_bound_set(struct rt6_info *rt, int total)
3756 {
3757 	struct rt6_info *iter;
3758 	int weight = 0;
3759 
3760 	rt6_upper_bound_set(rt, &weight, total);
3761 
3762 	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3763 		rt6_upper_bound_set(iter, &weight, total);
3764 }
3765 
3766 void rt6_multipath_rebalance(struct rt6_info *rt)
3767 {
3768 	struct rt6_info *first;
3769 	int total;
3770 
3771 	/* In case the entire multipath route was marked for flushing,
3772 	 * then there is no need to rebalance upon the removal of every
3773 	 * sibling route.
3774 	 */
3775 	if (!rt->rt6i_nsiblings || rt->should_flush)
3776 		return;
3777 
3778 	/* During lookup routes are evaluated in order, so we need to
3779 	 * make sure upper bounds are assigned from the first sibling
3780 	 * onwards.
3781 	 */
3782 	first = rt6_multipath_first_sibling(rt);
3783 	if (WARN_ON_ONCE(!first))
3784 		return;
3785 
3786 	total = rt6_multipath_total_weight(first);
3787 	rt6_multipath_upper_bound_set(first, total);
3788 }
3789 
3790 static int fib6_ifup(struct rt6_info *rt, void *p_arg)
3791 {
3792 	const struct arg_netdev_event *arg = p_arg;
3793 	const struct net *net = dev_net(arg->dev);
3794 
3795 	if (rt != net->ipv6.ip6_null_entry && rt->dst.dev == arg->dev) {
3796 		rt->rt6i_nh_flags &= ~arg->nh_flags;
3797 		fib6_update_sernum_upto_root(dev_net(rt->dst.dev), rt);
3798 		rt6_multipath_rebalance(rt);
3799 	}
3800 
3801 	return 0;
3802 }
3803 
3804 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
3805 {
3806 	struct arg_netdev_event arg = {
3807 		.dev = dev,
3808 		{
3809 			.nh_flags = nh_flags,
3810 		},
3811 	};
3812 
3813 	if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
3814 		arg.nh_flags |= RTNH_F_LINKDOWN;
3815 
3816 	fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
3817 }
3818 
3819 static bool rt6_multipath_uses_dev(const struct rt6_info *rt,
3820 				   const struct net_device *dev)
3821 {
3822 	struct rt6_info *iter;
3823 
3824 	if (rt->dst.dev == dev)
3825 		return true;
3826 	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3827 		if (iter->dst.dev == dev)
3828 			return true;
3829 
3830 	return false;
3831 }
3832 
3833 static void rt6_multipath_flush(struct rt6_info *rt)
3834 {
3835 	struct rt6_info *iter;
3836 
3837 	rt->should_flush = 1;
3838 	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3839 		iter->should_flush = 1;
3840 }
3841 
3842 static unsigned int rt6_multipath_dead_count(const struct rt6_info *rt,
3843 					     const struct net_device *down_dev)
3844 {
3845 	struct rt6_info *iter;
3846 	unsigned int dead = 0;
3847 
3848 	if (rt->dst.dev == down_dev || rt->rt6i_nh_flags & RTNH_F_DEAD)
3849 		dead++;
3850 	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3851 		if (iter->dst.dev == down_dev ||
3852 		    iter->rt6i_nh_flags & RTNH_F_DEAD)
3853 			dead++;
3854 
3855 	return dead;
3856 }
3857 
3858 static void rt6_multipath_nh_flags_set(struct rt6_info *rt,
3859 				       const struct net_device *dev,
3860 				       unsigned int nh_flags)
3861 {
3862 	struct rt6_info *iter;
3863 
3864 	if (rt->dst.dev == dev)
3865 		rt->rt6i_nh_flags |= nh_flags;
3866 	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3867 		if (iter->dst.dev == dev)
3868 			iter->rt6i_nh_flags |= nh_flags;
3869 }
3870 
3871 	/* Called with the write lock held for the table containing rt.  Returning -1 deletes the route, -2 skips its remaining siblings, 0 keeps it. */
3872 static int fib6_ifdown(struct rt6_info *rt, void *p_arg)
3873 {
3874 	const struct arg_netdev_event *arg = p_arg;
3875 	const struct net_device *dev = arg->dev;
3876 	const struct net *net = dev_net(dev);
3877 
3878 	if (rt == net->ipv6.ip6_null_entry)
3879 		return 0;
3880 
3881 	switch (arg->event) {
3882 	case NETDEV_UNREGISTER:
3883 		return rt->dst.dev == dev ? -1 : 0;
3884 	case NETDEV_DOWN:
3885 		if (rt->should_flush)
3886 			return -1;
3887 		if (!rt->rt6i_nsiblings)
3888 			return rt->dst.dev == dev ? -1 : 0;
3889 		if (rt6_multipath_uses_dev(rt, dev)) {
3890 			unsigned int count;
3891 
3892 			count = rt6_multipath_dead_count(rt, dev);
3893 			if (rt->rt6i_nsiblings + 1 == count) {
3894 				rt6_multipath_flush(rt);
3895 				return -1;
3896 			}
3897 			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
3898 						   RTNH_F_LINKDOWN);
3899 			fib6_update_sernum(rt);
3900 			rt6_multipath_rebalance(rt);
3901 		}
3902 		return -2;
3903 	case NETDEV_CHANGE:
3904 		if (rt->dst.dev != dev ||
3905 		    rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST))
3906 			break;
3907 		rt->rt6i_nh_flags |= RTNH_F_LINKDOWN;
3908 		rt6_multipath_rebalance(rt);
3909 		break;
3910 	}
3911 
3912 	return 0;
3913 }
3914 
3915 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
3916 {
3917 	struct arg_netdev_event arg = {
3918 		.dev = dev,
3919 		{
3920 			.event = event,
3921 		},
3922 	};
3923 
3924 	fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
3925 }
3926 
3927 void rt6_disable_ip(struct net_device *dev, unsigned long event)
3928 {
3929 	rt6_sync_down_dev(dev, event);
3930 	rt6_uncached_list_flush_dev(dev_net(dev), dev);
3931 	neigh_ifdown(&nd_tbl, dev);
3932 }
3933 
3934 struct rt6_mtu_change_arg {
3935 	struct net_device *dev;
3936 	unsigned int mtu;
3937 };
3938 
3939 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
3940 {
3941 	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
3942 	struct inet6_dev *idev;
3943 
3944 	/* In IPv6, PMTU discovery is mandatory,
3945 	   so the RTAX_MTU lock cannot be used to disable it.
3946 	   We still use this lock to block MTU changes
3947 	   caused by addrconf/ndisc.
3948 	*/
3949 
3950 	idev = __in6_dev_get(arg->dev);
3951 	if (!idev)
3952 		return 0;
3953 
3954 	/* When the MTU is raised administratively, there is no way to
3955 	   discover the resulting IPv6 PMTU increase, so it has to be
3956 	   updated here.  RFC 1981 does not cover administrative MTU
3957 	   increases (e.g. enabling jumbo frames), so updating on increase is a MUST.
3958 	 */
3959 	if (rt->dst.dev == arg->dev &&
3960 	    !dst_metric_locked(&rt->dst, RTAX_MTU)) {
3961 		spin_lock_bh(&rt6_exception_lock);
3962 		if (dst_metric_raw(&rt->dst, RTAX_MTU) &&
3963 		    rt6_mtu_change_route_allowed(idev, rt, arg->mtu))
3964 			dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
3965 		rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
3966 		spin_unlock_bh(&rt6_exception_lock);
3967 	}
3968 	return 0;
3969 }
3970 
3971 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
3972 {
3973 	struct rt6_mtu_change_arg arg = {
3974 		.dev = dev,
3975 		.mtu = mtu,
3976 	};
3977 
3978 	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
3979 }
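
/* Example trigger: "ip link set dev eth0 mtu 9000" raises the device
 * MTU and reaches this function through addrconf's NETDEV_CHANGEMTU
 * notifier handling; the walk above then updates the cached MTU metric
 * and the PMTU exceptions of every route on the device.
 */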
3980 
3981 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
3982 	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
3983 	[RTA_PREFSRC]		= { .len = sizeof(struct in6_addr) },
3984 	[RTA_OIF]               = { .type = NLA_U32 },
3985 	[RTA_IIF]		= { .type = NLA_U32 },
3986 	[RTA_PRIORITY]          = { .type = NLA_U32 },
3987 	[RTA_METRICS]           = { .type = NLA_NESTED },
3988 	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
3989 	[RTA_PREF]              = { .type = NLA_U8 },
3990 	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
3991 	[RTA_ENCAP]		= { .type = NLA_NESTED },
3992 	[RTA_EXPIRES]		= { .type = NLA_U32 },
3993 	[RTA_UID]		= { .type = NLA_U32 },
3994 	[RTA_MARK]		= { .type = NLA_U32 },
3995 	[RTA_TABLE]		= { .type = NLA_U32 },
3996 };
3997 
3998 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
3999 			      struct fib6_config *cfg,
4000 			      struct netlink_ext_ack *extack)
4001 {
4002 	struct rtmsg *rtm;
4003 	struct nlattr *tb[RTA_MAX+1];
4004 	unsigned int pref;
4005 	int err;
4006 
4007 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4008 			  NULL);
4009 	if (err < 0)
4010 		goto errout;
4011 
4012 	err = -EINVAL;
4013 	rtm = nlmsg_data(nlh);
4014 	memset(cfg, 0, sizeof(*cfg));
4015 
4016 	cfg->fc_table = rtm->rtm_table;
4017 	cfg->fc_dst_len = rtm->rtm_dst_len;
4018 	cfg->fc_src_len = rtm->rtm_src_len;
4019 	cfg->fc_flags = RTF_UP;
4020 	cfg->fc_protocol = rtm->rtm_protocol;
4021 	cfg->fc_type = rtm->rtm_type;
4022 
4023 	if (rtm->rtm_type == RTN_UNREACHABLE ||
4024 	    rtm->rtm_type == RTN_BLACKHOLE ||
4025 	    rtm->rtm_type == RTN_PROHIBIT ||
4026 	    rtm->rtm_type == RTN_THROW)
4027 		cfg->fc_flags |= RTF_REJECT;
4028 
4029 	if (rtm->rtm_type == RTN_LOCAL)
4030 		cfg->fc_flags |= RTF_LOCAL;
4031 
4032 	if (rtm->rtm_flags & RTM_F_CLONED)
4033 		cfg->fc_flags |= RTF_CACHE;
4034 
4035 	cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4036 
4037 	cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
4038 	cfg->fc_nlinfo.nlh = nlh;
4039 	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
4040 
4041 	if (tb[RTA_GATEWAY]) {
4042 		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4043 		cfg->fc_flags |= RTF_GATEWAY;
4044 	}
4045 
4046 	if (tb[RTA_DST]) {
4047 		int plen = (rtm->rtm_dst_len + 7) >> 3;
4048 
4049 		if (nla_len(tb[RTA_DST]) < plen)
4050 			goto errout;
4051 
4052 		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
4053 	}
4054 
4055 	if (tb[RTA_SRC]) {
4056 		int plen = (rtm->rtm_src_len + 7) >> 3;
4057 
4058 		if (nla_len(tb[RTA_SRC]) < plen)
4059 			goto errout;
4060 
4061 		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
4062 	}
4063 
4064 	if (tb[RTA_PREFSRC])
4065 		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4066 
4067 	if (tb[RTA_OIF])
4068 		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4069 
4070 	if (tb[RTA_PRIORITY])
4071 		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4072 
4073 	if (tb[RTA_METRICS]) {
4074 		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4075 		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
4076 	}
4077 
4078 	if (tb[RTA_TABLE])
4079 		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4080 
4081 	if (tb[RTA_MULTIPATH]) {
4082 		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4083 		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4084 
4085 		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4086 						     cfg->fc_mp_len, extack);
4087 		if (err < 0)
4088 			goto errout;
4089 	}
4090 
4091 	if (tb[RTA_PREF]) {
4092 		pref = nla_get_u8(tb[RTA_PREF]);
4093 		if (pref != ICMPV6_ROUTER_PREF_LOW &&
4094 		    pref != ICMPV6_ROUTER_PREF_HIGH)
4095 			pref = ICMPV6_ROUTER_PREF_MEDIUM;
4096 		cfg->fc_flags |= RTF_PREF(pref);
4097 	}
4098 
4099 	if (tb[RTA_ENCAP])
4100 		cfg->fc_encap = tb[RTA_ENCAP];
4101 
4102 	if (tb[RTA_ENCAP_TYPE]) {
4103 		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4104 
4105 		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4106 		if (err < 0)
4107 			goto errout;
4108 	}
4109 
4110 	if (tb[RTA_EXPIRES]) {
4111 		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4112 
4113 		if (addrconf_finite_timeout(timeout)) {
4114 			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4115 			cfg->fc_flags |= RTF_EXPIRES;
4116 		}
4117 	}
4118 
4119 	err = 0;
4120 errout:
4121 	return err;
4122 }
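
/* A minimal userspace sketch of the message parsed above, kept in a
 * comment because it is not kernel code; it is the rough equivalent of
 * "ip -6 route add 2001:db8::/64 dev <ifindex 2>" (the fixed ifindex
 * and the omitted error handling are illustrative assumptions):
 *
 *	struct {
 *		struct nlmsghdr nlh;
 *		struct rtmsg rtm;
 *		char attrs[128];
 *	} req = {
 *		.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg)),
 *		.nlh.nlmsg_type = RTM_NEWROUTE,
 *		.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL,
 *		.rtm.rtm_family = AF_INET6,
 *		.rtm.rtm_dst_len = 64,
 *		.rtm.rtm_table = RT_TABLE_MAIN,
 *		.rtm.rtm_protocol = RTPROT_STATIC,
 *		.rtm.rtm_type = RTN_UNICAST,
 *	};
 *	struct in6_addr dst = IN6ADDR_ANY_INIT;	(then fill in 2001:db8::)
 *	struct rtattr *rta;
 *
 *	rta = (struct rtattr *)((char *)&req + NLMSG_ALIGN(req.nlh.nlmsg_len));
 *	rta->rta_type = RTA_DST;
 *	rta->rta_len = RTA_LENGTH(sizeof(dst));
 *	memcpy(RTA_DATA(rta), &dst, sizeof(dst));
 *	req.nlh.nlmsg_len = NLMSG_ALIGN(req.nlh.nlmsg_len) +
 *			    RTA_ALIGN(rta->rta_len);
 *
 *	(append a u32 RTA_OIF attribute of value 2 the same way, then
 *	 send the buffer over a socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE))
 *
 * Note that for RTA_DST the parser above only insists on the first
 * (rtm_dst_len + 7) / 8 bytes being present.
 */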
4123 
4124 struct rt6_nh {
4125 	struct rt6_info *rt6_info;
4126 	struct fib6_config r_cfg;
4127 	struct mx6_config mxc;
4128 	struct list_head next;
4129 };
4130 
4131 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
4132 {
4133 	struct rt6_nh *nh;
4134 
4135 	list_for_each_entry(nh, rt6_nh_list, next) {
4136 		pr_warn("multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
4137 		        &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
4138 		        nh->r_cfg.fc_ifindex);
4139 	}
4140 }
4141 
4142 static int ip6_route_info_append(struct list_head *rt6_nh_list,
4143 				 struct rt6_info *rt, struct fib6_config *r_cfg)
4144 {
4145 	struct rt6_nh *nh;
4146 	int err = -EEXIST;
4147 
4148 	list_for_each_entry(nh, rt6_nh_list, next) {
4149 		/* check if rt6_info already exists */
4150 		if (rt6_duplicate_nexthop(nh->rt6_info, rt))
4151 			return err;
4152 	}
4153 
4154 	nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4155 	if (!nh)
4156 		return -ENOMEM;
4157 	nh->rt6_info = rt;
4158 	err = ip6_convert_metrics(&nh->mxc, r_cfg);
4159 	if (err) {
4160 		kfree(nh);
4161 		return err;
4162 	}
4163 	memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4164 	list_add_tail(&nh->next, rt6_nh_list);
4165 
4166 	return 0;
4167 }
4168 
4169 static void ip6_route_mpath_notify(struct rt6_info *rt,
4170 				   struct rt6_info *rt_last,
4171 				   struct nl_info *info,
4172 				   __u16 nlflags)
4173 {
4174 	/* If this is an APPEND route, rt points to the first route
4175 	 * inserted and rt_last to the last.  Userspace wants a consistent
4176 	 * dump of the route which starts at the first nexthop.  Since
4177 	 * sibling routes are always added at the end of the list, find
4178 	 * the first sibling of the last route appended.
4179 	 */
4180 	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
4181 		rt = list_first_entry(&rt_last->rt6i_siblings,
4182 				      struct rt6_info,
4183 				      rt6i_siblings);
4184 	}
4185 
4186 	if (rt)
4187 		inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4188 }
4189 
4190 static int ip6_route_multipath_add(struct fib6_config *cfg,
4191 				   struct netlink_ext_ack *extack)
4192 {
4193 	struct rt6_info *rt_notif = NULL, *rt_last = NULL;
4194 	struct nl_info *info = &cfg->fc_nlinfo;
4195 	struct fib6_config r_cfg;
4196 	struct rtnexthop *rtnh;
4197 	struct rt6_info *rt;
4198 	struct rt6_nh *err_nh;
4199 	struct rt6_nh *nh, *nh_safe;
4200 	__u16 nlflags;
4201 	int remaining;
4202 	int attrlen;
4203 	int err = 1;
4204 	int nhn = 0;
4205 	int replace = (cfg->fc_nlinfo.nlh &&
4206 		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4207 	LIST_HEAD(rt6_nh_list);
4208 
4209 	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4210 	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4211 		nlflags |= NLM_F_APPEND;
4212 
4213 	remaining = cfg->fc_mp_len;
4214 	rtnh = (struct rtnexthop *)cfg->fc_mp;
4215 
4216 	/* Parse each multipath entry and build a list (rt6_nh_list) with
4217 	 * one rt6_info struct per nexthop.
4218 	 */
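	/* Wire layout of the RTA_MULTIPATH payload walked below, roughly:
	 *
	 *	struct rtnexthop  (rtnh_ifindex, rtnh_flags,
	 *			   rtnh_hops = weight - 1)
	 *		RTA_GATEWAY		(optional per-nexthop gateway)
	 *		RTA_ENCAP_TYPE / RTA_ENCAP (optional lwtunnel encap)
	 *	struct rtnexthop
	 *		...
	 *
	 * rtnh_ok()/rtnh_next() iterate over the entries; rtnh_attrs() and
	 * rtnh_attrlen() expose each entry's nested attributes.
	 */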
4219 	while (rtnh_ok(rtnh, remaining)) {
4220 		memcpy(&r_cfg, cfg, sizeof(*cfg));
4221 		if (rtnh->rtnh_ifindex)
4222 			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4223 
4224 		attrlen = rtnh_attrlen(rtnh);
4225 		if (attrlen > 0) {
4226 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4227 
4228 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4229 			if (nla) {
4230 				r_cfg.fc_gateway = nla_get_in6_addr(nla);
4231 				r_cfg.fc_flags |= RTF_GATEWAY;
4232 			}
4233 			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4234 			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4235 			if (nla)
4236 				r_cfg.fc_encap_type = nla_get_u16(nla);
4237 		}
4238 
4239 		r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4240 		rt = ip6_route_info_create(&r_cfg, extack);
4241 		if (IS_ERR(rt)) {
4242 			err = PTR_ERR(rt);
4243 			rt = NULL;
4244 			goto cleanup;
4245 		}
4246 
4247 		rt->rt6i_nh_weight = rtnh->rtnh_hops + 1;
4248 
4249 		err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
4250 		if (err) {
4251 			dst_release_immediate(&rt->dst);
4252 			goto cleanup;
4253 		}
4254 
4255 		rtnh = rtnh_next(rtnh, &remaining);
4256 	}
4257 
4258 	/* For add and replace, send one notification with all nexthops.
4259 	 * Skip the notification in fib6_add_rt2node and send one with
4260 	 * the full route when done.
4261 	 */
4262 	info->skip_notify = 1;
4263 
4264 	err_nh = NULL;
4265 	list_for_each_entry(nh, &rt6_nh_list, next) {
4266 		rt_last = nh->rt6_info;
4267 		err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack);
4268 		/* save reference to first route for notification */
4269 		if (!rt_notif && !err)
4270 			rt_notif = nh->rt6_info;
4271 
4272 		/* nh->rt6_info is used or freed at this point, reset to NULL */
4273 		nh->rt6_info = NULL;
4274 		if (err) {
4275 			if (replace && nhn)
4276 				ip6_print_replace_route_err(&rt6_nh_list);
4277 			err_nh = nh;
4278 			goto add_errout;
4279 		}
4280 
4281 		/* Because each nexthop is added as if it were a single
4282 		 * route, we remove these flags after the first one: on a
4283 		 * collision we have already failed to add the first nexthop,
4284 		 * since fib6_add_rt2node() rejected it; when replacing, the
4285 		 * old nexthops have been replaced by the first new one, and
4286 		 * the rest should be appended to it.
4287 		 */
4288 		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4289 						     NLM_F_REPLACE);
4290 		nhn++;
4291 	}
4292 
4293 	/* success ... tell user about new route */
4294 	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4295 	goto cleanup;
4296 
4297 add_errout:
4298 	/* send notification for routes that were added so that
4299 	 * the delete notifications sent by ip6_route_del are
4300 	 * coherent
4301 	 */
4302 	if (rt_notif)
4303 		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4304 
4305 	/* Delete routes that were already added */
4306 	list_for_each_entry(nh, &rt6_nh_list, next) {
4307 		if (err_nh == nh)
4308 			break;
4309 		ip6_route_del(&nh->r_cfg, extack);
4310 	}
4311 
4312 cleanup:
4313 	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4314 		if (nh->rt6_info)
4315 			dst_release_immediate(&nh->rt6_info->dst);
4316 		kfree(nh->mxc.mx);
4317 		list_del(&nh->next);
4318 		kfree(nh);
4319 	}
4320 
4321 	return err;
4322 }
4323 
4324 static int ip6_route_multipath_del(struct fib6_config *cfg,
4325 				   struct netlink_ext_ack *extack)
4326 {
4327 	struct fib6_config r_cfg;
4328 	struct rtnexthop *rtnh;
4329 	int remaining;
4330 	int attrlen;
4331 	int err = 1, last_err = 0;
4332 
4333 	remaining = cfg->fc_mp_len;
4334 	rtnh = (struct rtnexthop *)cfg->fc_mp;
4335 
4336 	/* Parse a Multipath Entry */
4337 	while (rtnh_ok(rtnh, remaining)) {
4338 		memcpy(&r_cfg, cfg, sizeof(*cfg));
4339 		if (rtnh->rtnh_ifindex)
4340 			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4341 
4342 		attrlen = rtnh_attrlen(rtnh);
4343 		if (attrlen > 0) {
4344 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4345 
4346 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4347 			if (nla) {
4348 				nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4349 				r_cfg.fc_flags |= RTF_GATEWAY;
4350 			}
4351 		}
4352 		err = ip6_route_del(&r_cfg, extack);
4353 		if (err)
4354 			last_err = err;
4355 
4356 		rtnh = rtnh_next(rtnh, &remaining);
4357 	}
4358 
4359 	return last_err;
4360 }
4361 
4362 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4363 			      struct netlink_ext_ack *extack)
4364 {
4365 	struct fib6_config cfg;
4366 	int err;
4367 
4368 	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4369 	if (err < 0)
4370 		return err;
4371 
4372 	if (cfg.fc_mp)
4373 		return ip6_route_multipath_del(&cfg, extack);
4374 	else {
4375 		cfg.fc_delete_all_nh = 1;
4376 		return ip6_route_del(&cfg, extack);
4377 	}
4378 }
4379 
4380 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4381 			      struct netlink_ext_ack *extack)
4382 {
4383 	struct fib6_config cfg;
4384 	int err;
4385 
4386 	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4387 	if (err < 0)
4388 		return err;
4389 
4390 	if (cfg.fc_mp)
4391 		return ip6_route_multipath_add(&cfg, extack);
4392 	else
4393 		return ip6_route_add(&cfg, extack);
4394 }
4395 
4396 static size_t rt6_nlmsg_size(struct rt6_info *rt)
4397 {
4398 	int nexthop_len = 0;
4399 
4400 	if (rt->rt6i_nsiblings) {
4401 		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
4402 			    + NLA_ALIGN(sizeof(struct rtnexthop))
4403 			    + nla_total_size(16) /* RTA_GATEWAY */
4404 			    + lwtunnel_get_encap_size(rt->dst.lwtstate);
4405 
4406 		nexthop_len *= rt->rt6i_nsiblings;
4407 	}
4408 
4409 	return NLMSG_ALIGN(sizeof(struct rtmsg))
4410 	       + nla_total_size(16) /* RTA_SRC */
4411 	       + nla_total_size(16) /* RTA_DST */
4412 	       + nla_total_size(16) /* RTA_GATEWAY */
4413 	       + nla_total_size(16) /* RTA_PREFSRC */
4414 	       + nla_total_size(4) /* RTA_TABLE */
4415 	       + nla_total_size(4) /* RTA_IIF */
4416 	       + nla_total_size(4) /* RTA_OIF */
4417 	       + nla_total_size(4) /* RTA_PRIORITY */
4418 	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4419 	       + nla_total_size(sizeof(struct rta_cacheinfo))
4420 	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4421 	       + nla_total_size(1) /* RTA_PREF */
4422 	       + lwtunnel_get_encap_size(rt->dst.lwtstate)
4423 	       + nexthop_len;
4424 }
4425 
4426 static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
4427 			    unsigned int *flags, bool skip_oif)
4428 {
4429 	if (rt->rt6i_nh_flags & RTNH_F_DEAD)
4430 		*flags |= RTNH_F_DEAD;
4431 
4432 	if (rt->rt6i_nh_flags & RTNH_F_LINKDOWN) {
4433 		*flags |= RTNH_F_LINKDOWN;
4434 		if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
4435 			*flags |= RTNH_F_DEAD;
4436 	}
4437 
4438 	if (rt->rt6i_flags & RTF_GATEWAY) {
4439 		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
4440 			goto nla_put_failure;
4441 	}
4442 
4443 	*flags |= (rt->rt6i_nh_flags & RTNH_F_ONLINK);
4444 	if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD)
4445 		*flags |= RTNH_F_OFFLOAD;
4446 
4447 	/* not needed for multipath encoding because it has a rtnexthop struct */
4448 	if (!skip_oif && rt->dst.dev &&
4449 	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
4450 		goto nla_put_failure;
4451 
4452 	if (rt->dst.lwtstate &&
4453 	    lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
4454 		goto nla_put_failure;
4455 
4456 	return 0;
4457 
4458 nla_put_failure:
4459 	return -EMSGSIZE;
4460 }
4461 
4462 /* add multipath next hop */
4463 static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
4464 {
4465 	struct rtnexthop *rtnh;
4466 	unsigned int flags = 0;
4467 
4468 	rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4469 	if (!rtnh)
4470 		goto nla_put_failure;
4471 
4472 	rtnh->rtnh_hops = rt->rt6i_nh_weight - 1;
4473 	rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;
4474 
4475 	if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4476 		goto nla_put_failure;
4477 
4478 	rtnh->rtnh_flags = flags;
4479 
4480 	/* length of rtnetlink header + attributes */
4481 	rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
4482 
4483 	return 0;
4484 
4485 nla_put_failure:
4486 	return -EMSGSIZE;
4487 }
4488 
4489 static int rt6_fill_node(struct net *net,
4490 			 struct sk_buff *skb, struct rt6_info *rt,
4491 			 struct in6_addr *dst, struct in6_addr *src,
4492 			 int iif, int type, u32 portid, u32 seq,
4493 			 unsigned int flags)
4494 {
4495 	u32 metrics[RTAX_MAX];
4496 	struct rtmsg *rtm;
4497 	struct nlmsghdr *nlh;
4498 	long expires;
4499 	u32 table;
4500 
4501 	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4502 	if (!nlh)
4503 		return -EMSGSIZE;
4504 
4505 	rtm = nlmsg_data(nlh);
4506 	rtm->rtm_family = AF_INET6;
4507 	rtm->rtm_dst_len = rt->rt6i_dst.plen;
4508 	rtm->rtm_src_len = rt->rt6i_src.plen;
4509 	rtm->rtm_tos = 0;
4510 	if (rt->rt6i_table)
4511 		table = rt->rt6i_table->tb6_id;
4512 	else
4513 		table = RT6_TABLE_UNSPEC;
4514 	rtm->rtm_table = table;
4515 	if (nla_put_u32(skb, RTA_TABLE, table))
4516 		goto nla_put_failure;
4517 	if (rt->rt6i_flags & RTF_REJECT) {
4518 		switch (rt->dst.error) {
4519 		case -EINVAL:
4520 			rtm->rtm_type = RTN_BLACKHOLE;
4521 			break;
4522 		case -EACCES:
4523 			rtm->rtm_type = RTN_PROHIBIT;
4524 			break;
4525 		case -EAGAIN:
4526 			rtm->rtm_type = RTN_THROW;
4527 			break;
4528 		default:
4529 			rtm->rtm_type = RTN_UNREACHABLE;
4530 			break;
4531 		}
4532 	}
4533 	else if (rt->rt6i_flags & RTF_LOCAL)
4534 		rtm->rtm_type = RTN_LOCAL;
4535 	else if (rt->rt6i_flags & RTF_ANYCAST)
4536 		rtm->rtm_type = RTN_ANYCAST;
4537 	else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
4538 		rtm->rtm_type = RTN_LOCAL;
4539 	else
4540 		rtm->rtm_type = RTN_UNICAST;
4541 	rtm->rtm_flags = 0;
4542 	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4543 	rtm->rtm_protocol = rt->rt6i_protocol;
4544 
4545 	if (rt->rt6i_flags & RTF_CACHE)
4546 		rtm->rtm_flags |= RTM_F_CLONED;
4547 
4548 	if (dst) {
4549 		if (nla_put_in6_addr(skb, RTA_DST, dst))
4550 			goto nla_put_failure;
4551 		rtm->rtm_dst_len = 128;
4552 	} else if (rtm->rtm_dst_len)
4553 		if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
4554 			goto nla_put_failure;
4555 #ifdef CONFIG_IPV6_SUBTREES
4556 	if (src) {
4557 		if (nla_put_in6_addr(skb, RTA_SRC, src))
4558 			goto nla_put_failure;
4559 		rtm->rtm_src_len = 128;
4560 	} else if (rtm->rtm_src_len &&
4561 		   nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
4562 		goto nla_put_failure;
4563 #endif
4564 	if (iif) {
4565 #ifdef CONFIG_IPV6_MROUTE
4566 		if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
4567 			int err = ip6mr_get_route(net, skb, rtm, portid);
4568 
4569 			if (err == 0)
4570 				return 0;
4571 			if (err < 0)
4572 				goto nla_put_failure;
4573 		} else
4574 #endif
4575 			if (nla_put_u32(skb, RTA_IIF, iif))
4576 				goto nla_put_failure;
4577 	} else if (dst) {
4578 		struct in6_addr saddr_buf;
4579 		if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
4580 		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4581 			goto nla_put_failure;
4582 	}
4583 
4584 	if (rt->rt6i_prefsrc.plen) {
4585 		struct in6_addr saddr_buf;
4586 		saddr_buf = rt->rt6i_prefsrc.addr;
4587 		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4588 			goto nla_put_failure;
4589 	}
4590 
4591 	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
4592 	if (rt->rt6i_pmtu)
4593 		metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
4594 	if (rtnetlink_put_metrics(skb, metrics) < 0)
4595 		goto nla_put_failure;
4596 
4597 	if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
4598 		goto nla_put_failure;
4599 
4600 	/* For multipath routes, walk the siblings list and add
4601 	 * each as a nexthop within RTA_MULTIPATH.
4602 	 */
4603 	if (rt->rt6i_nsiblings) {
4604 		struct rt6_info *sibling, *next_sibling;
4605 		struct nlattr *mp;
4606 
4607 		mp = nla_nest_start(skb, RTA_MULTIPATH);
4608 		if (!mp)
4609 			goto nla_put_failure;
4610 
4611 		if (rt6_add_nexthop(skb, rt) < 0)
4612 			goto nla_put_failure;
4613 
4614 		list_for_each_entry_safe(sibling, next_sibling,
4615 					 &rt->rt6i_siblings, rt6i_siblings) {
4616 			if (rt6_add_nexthop(skb, sibling) < 0)
4617 				goto nla_put_failure;
4618 		}
4619 
4620 		nla_nest_end(skb, mp);
4621 	} else {
4622 		if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
4623 			goto nla_put_failure;
4624 	}
4625 
4626 	expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
4627 
4628 	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
4629 		goto nla_put_failure;
4630 
4631 	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
4632 		goto nla_put_failure;
4633 
4634 
4635 	nlmsg_end(skb, nlh);
4636 	return 0;
4637 
4638 nla_put_failure:
4639 	nlmsg_cancel(skb, nlh);
4640 	return -EMSGSIZE;
4641 }
4642 
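/* Called for each route by the fib6 walker when servicing a route dump
 * ("ip -6 route show"); a negative return tells the walker the frame is
 * full, a positive one skips the route.
 */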
4643 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
4644 {
4645 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4646 	struct net *net = arg->net;
4647 
4648 	if (rt == net->ipv6.ip6_null_entry)
4649 		return 0;
4650 
4651 	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4652 		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4653 
4654 		/* user wants prefix routes only */
4655 		if (rtm->rtm_flags & RTM_F_PREFIX &&
4656 		    !(rt->rt6i_flags & RTF_PREFIX_RT)) {
4657 			/* success since this is not a prefix route */
4658 			/* skip it, reporting success: this is not a prefix route */
4659 		}
4660 	}
4661 
4662 	return rt6_fill_node(net,
4663 		     arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
4664 		     NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
4665 		     NLM_F_MULTI);
4666 }
4667 
4668 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4669 			      struct netlink_ext_ack *extack)
4670 {
4671 	struct net *net = sock_net(in_skb->sk);
4672 	struct nlattr *tb[RTA_MAX+1];
4673 	int err, iif = 0, oif = 0;
4674 	struct dst_entry *dst;
4675 	struct rt6_info *rt;
4676 	struct sk_buff *skb;
4677 	struct rtmsg *rtm;
4678 	struct flowi6 fl6;
4679 	bool fibmatch;
4680 
4681 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4682 			  extack);
4683 	if (err < 0)
4684 		goto errout;
4685 
4686 	err = -EINVAL;
4687 	memset(&fl6, 0, sizeof(fl6));
4688 	rtm = nlmsg_data(nlh);
4689 	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4690 	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4691 
4692 	if (tb[RTA_SRC]) {
4693 		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4694 			goto errout;
4695 
4696 		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4697 	}
4698 
4699 	if (tb[RTA_DST]) {
4700 		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4701 			goto errout;
4702 
4703 		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4704 	}
4705 
4706 	if (tb[RTA_IIF])
4707 		iif = nla_get_u32(tb[RTA_IIF]);
4708 
4709 	if (tb[RTA_OIF])
4710 		oif = nla_get_u32(tb[RTA_OIF]);
4711 
4712 	if (tb[RTA_MARK])
4713 		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4714 
4715 	if (tb[RTA_UID])
4716 		fl6.flowi6_uid = make_kuid(current_user_ns(),
4717 					   nla_get_u32(tb[RTA_UID]));
4718 	else
4719 		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4720 
4721 	if (iif) {
4722 		struct net_device *dev;
4723 		int flags = 0;
4724 
4725 		rcu_read_lock();
4726 
4727 		dev = dev_get_by_index_rcu(net, iif);
4728 		if (!dev) {
4729 			rcu_read_unlock();
4730 			err = -ENODEV;
4731 			goto errout;
4732 		}
4733 
4734 		fl6.flowi6_iif = iif;
4735 
4736 		if (!ipv6_addr_any(&fl6.saddr))
4737 			flags |= RT6_LOOKUP_F_HAS_SADDR;
4738 
4739 		dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
4740 
4741 		rcu_read_unlock();
4742 	} else {
4743 		fl6.flowi6_oif = oif;
4744 
4745 		dst = ip6_route_output(net, NULL, &fl6);
4746 	}
4747 
4748 
4749 	rt = container_of(dst, struct rt6_info, dst);
4750 	if (rt->dst.error) {
4751 		err = rt->dst.error;
4752 		ip6_rt_put(rt);
4753 		goto errout;
4754 	}
4755 
4756 	if (rt == net->ipv6.ip6_null_entry) {
4757 		err = rt->dst.error;
4758 		ip6_rt_put(rt);
4759 		goto errout;
4760 	}
4761 
4762 	if (fibmatch && rt->from) {
4763 		struct rt6_info *ort = rt->from;
4764 
4765 		dst_hold(&ort->dst);
4766 		ip6_rt_put(rt);
4767 		rt = ort;
4768 	}
4769 
4770 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
4771 	if (!skb) {
4772 		ip6_rt_put(rt);
4773 		err = -ENOBUFS;
4774 		goto errout;
4775 	}
4776 
4777 	skb_dst_set(skb, &rt->dst);
4778 	if (fibmatch)
4779 		err = rt6_fill_node(net, skb, rt, NULL, NULL, iif,
4780 				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4781 				    nlh->nlmsg_seq, 0);
4782 	else
4783 		err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
4784 				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4785 				    nlh->nlmsg_seq, 0);
4786 	if (err < 0) {
4787 		kfree_skb(skb);
4788 		goto errout;
4789 	}
4790 
4791 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
4792 errout:
4793 	return err;
4794 }
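
/* Example: "ip -6 route get 2001:db8::1" maps onto an RTM_GETROUTE
 * request with RTA_DST set; adding "fibmatch" sets RTM_F_FIB_MATCH so
 * that the FIB entry the lookup matched (rt->from) is reported instead
 * of the dst cache clone.
 */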
4795 
4796 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
4797 		     unsigned int nlm_flags)
4798 {
4799 	struct sk_buff *skb;
4800 	struct net *net = info->nl_net;
4801 	u32 seq;
4802 	int err;
4803 
4804 	err = -ENOBUFS;
4805 	seq = info->nlh ? info->nlh->nlmsg_seq : 0;
4806 
4807 	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4808 	if (!skb)
4809 		goto errout;
4810 
4811 	err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
4812 				event, info->portid, seq, nlm_flags);
4813 	if (err < 0) {
4814 		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4815 		WARN_ON(err == -EMSGSIZE);
4816 		kfree_skb(skb);
4817 		goto errout;
4818 	}
4819 	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4820 		    info->nlh, gfp_any());
4821 	return;
4822 errout:
4823 	if (err < 0)
4824 		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
4825 }
4826 
4827 static int ip6_route_dev_notify(struct notifier_block *this,
4828 				unsigned long event, void *ptr)
4829 {
4830 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4831 	struct net *net = dev_net(dev);
4832 
4833 	if (!(dev->flags & IFF_LOOPBACK))
4834 		return NOTIFY_OK;
4835 
4836 	if (event == NETDEV_REGISTER) {
4837 		net->ipv6.ip6_null_entry->dst.dev = dev;
4838 		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
4839 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4840 		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
4841 		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
4842 		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
4843 		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
4844 #endif
4845 	 } else if (event == NETDEV_UNREGISTER &&
4846 		    dev->reg_state != NETREG_UNREGISTERED) {
4847 		/* NETDEV_UNREGISTER can be fired multiple times by
4848 		 * netdev_wait_allrefs().  Make sure we only call this once.
4849 		 */
4850 		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
4851 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4852 		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
4853 		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
4854 #endif
4855 	}
4856 
4857 	return NOTIFY_OK;
4858 }
4859 
4860 /*
4861  *	/proc
4862  */
4863 
4864 #ifdef CONFIG_PROC_FS
4865 
4866 static const struct file_operations ipv6_route_proc_fops = {
4867 	.open		= ipv6_route_open,
4868 	.read		= seq_read,
4869 	.llseek		= seq_lseek,
4870 	.release	= seq_release_net,
4871 };
4872 
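/* Emits the seven hex columns of /proc/net/rt6_stats, in order:
 * fib_nodes, fib_route_nodes, fib_rt_alloc, fib_rt_entries,
 * fib_rt_cache, the current number of dst entries, and
 * fib_discarded_routes.
 */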
4873 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
4874 {
4875 	struct net *net = (struct net *)seq->private;
4876 	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
4877 		   net->ipv6.rt6_stats->fib_nodes,
4878 		   net->ipv6.rt6_stats->fib_route_nodes,
4879 		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
4880 		   net->ipv6.rt6_stats->fib_rt_entries,
4881 		   net->ipv6.rt6_stats->fib_rt_cache,
4882 		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
4883 		   net->ipv6.rt6_stats->fib_discarded_routes);
4884 
4885 	return 0;
4886 }
4887 
4888 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
4889 {
4890 	return single_open_net(inode, file, rt6_stats_seq_show);
4891 }
4892 
4893 static const struct file_operations rt6_stats_seq_fops = {
4894 	.open	 = rt6_stats_seq_open,
4895 	.read	 = seq_read,
4896 	.llseek	 = seq_lseek,
4897 	.release = single_release_net,
4898 };
4899 #endif	/* CONFIG_PROC_FS */
4900 
4901 #ifdef CONFIG_SYSCTL
4902 
4903 static
4904 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
4905 			      void __user *buffer, size_t *lenp, loff_t *ppos)
4906 {
4907 	struct net *net;
4908 	int delay;
4909 	if (!write)
4910 		return -EINVAL;
4911 
4912 	net = (struct net *)ctl->extra1;
4913 	delay = net->ipv6.sysctl.flush_delay;
4914 	proc_dointvec(ctl, write, buffer, lenp, ppos);
4915 	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
4916 	return 0;
4917 }
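
/* Example: "echo 1 > /proc/sys/net/ipv6/route/flush" kicks an immediate
 * fib6_run_gc() pass; note the delay passed to the GC is the
 * flush_delay value read before the newly written one is stored.
 */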
4918 
4919 struct ctl_table ipv6_route_table_template[] = {
4920 	{
4921 		.procname	=	"flush",
4922 		.data		=	&init_net.ipv6.sysctl.flush_delay,
4923 		.maxlen		=	sizeof(int),
4924 		.mode		=	0200,
4925 		.proc_handler	=	ipv6_sysctl_rtcache_flush
4926 	},
4927 	{
4928 		.procname	=	"gc_thresh",
4929 		.data		=	&ip6_dst_ops_template.gc_thresh,
4930 		.maxlen		=	sizeof(int),
4931 		.mode		=	0644,
4932 		.proc_handler	=	proc_dointvec,
4933 	},
4934 	{
4935 		.procname	=	"max_size",
4936 		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
4937 		.maxlen		=	sizeof(int),
4938 		.mode		=	0644,
4939 		.proc_handler	=	proc_dointvec,
4940 	},
4941 	{
4942 		.procname	=	"gc_min_interval",
4943 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
4944 		.maxlen		=	sizeof(int),
4945 		.mode		=	0644,
4946 		.proc_handler	=	proc_dointvec_jiffies,
4947 	},
4948 	{
4949 		.procname	=	"gc_timeout",
4950 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
4951 		.maxlen		=	sizeof(int),
4952 		.mode		=	0644,
4953 		.proc_handler	=	proc_dointvec_jiffies,
4954 	},
4955 	{
4956 		.procname	=	"gc_interval",
4957 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
4958 		.maxlen		=	sizeof(int),
4959 		.mode		=	0644,
4960 		.proc_handler	=	proc_dointvec_jiffies,
4961 	},
4962 	{
4963 		.procname	=	"gc_elasticity",
4964 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
4965 		.maxlen		=	sizeof(int),
4966 		.mode		=	0644,
4967 		.proc_handler	=	proc_dointvec,
4968 	},
4969 	{
4970 		.procname	=	"mtu_expires",
4971 		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
4972 		.maxlen		=	sizeof(int),
4973 		.mode		=	0644,
4974 		.proc_handler	=	proc_dointvec_jiffies,
4975 	},
4976 	{
4977 		.procname	=	"min_adv_mss",
4978 		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
4979 		.maxlen		=	sizeof(int),
4980 		.mode		=	0644,
4981 		.proc_handler	=	proc_dointvec,
4982 	},
4983 	{
4984 		.procname	=	"gc_min_interval_ms",
4985 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
4986 		.maxlen		=	sizeof(int),
4987 		.mode		=	0644,
4988 		.proc_handler	=	proc_dointvec_ms_jiffies,
4989 	},
4990 	{ }
4991 };
4992 
4993 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
4994 {
4995 	struct ctl_table *table;
4996 
4997 	table = kmemdup(ipv6_route_table_template,
4998 			sizeof(ipv6_route_table_template),
4999 			GFP_KERNEL);
5000 
5001 	if (table) {
5002 		table[0].data = &net->ipv6.sysctl.flush_delay;
5003 		table[0].extra1 = net;
5004 		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
5005 		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
5006 		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5007 		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
5008 		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
5009 		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
5010 		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
5011 		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
5012 		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5013 
5014 		/* Don't export sysctls to unprivileged users */
5015 		if (net->user_ns != &init_user_ns)
5016 			table[0].procname = NULL;
5017 	}
5018 
5019 	return table;
5020 }
5021 #endif
5022 
5023 static int __net_init ip6_route_net_init(struct net *net)
5024 {
5025 	int ret = -ENOMEM;
5026 
5027 	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
5028 	       sizeof(net->ipv6.ip6_dst_ops));
5029 
5030 	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
5031 		goto out_ip6_dst_ops;
5032 
5033 	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
5034 					   sizeof(*net->ipv6.ip6_null_entry),
5035 					   GFP_KERNEL);
5036 	if (!net->ipv6.ip6_null_entry)
5037 		goto out_ip6_dst_entries;
5038 	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5039 	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
5040 			 ip6_template_metrics, true);
5041 
5042 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5043 	net->ipv6.fib6_has_custom_rules = false;
5044 	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
5045 					       sizeof(*net->ipv6.ip6_prohibit_entry),
5046 					       GFP_KERNEL);
5047 	if (!net->ipv6.ip6_prohibit_entry)
5048 		goto out_ip6_null_entry;
5049 	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5050 	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
5051 			 ip6_template_metrics, true);
5052 
5053 	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
5054 					       sizeof(*net->ipv6.ip6_blk_hole_entry),
5055 					       GFP_KERNEL);
5056 	if (!net->ipv6.ip6_blk_hole_entry)
5057 		goto out_ip6_prohibit_entry;
5058 	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5059 	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
5060 			 ip6_template_metrics, true);
5061 #endif
5062 
5063 	net->ipv6.sysctl.flush_delay = 0;
5064 	net->ipv6.sysctl.ip6_rt_max_size = 4096;
5065 	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
5066 	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
5067 	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
5068 	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
5069 	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
5070 	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
5071 
5072 	net->ipv6.ip6_rt_gc_expire = 30*HZ;
5073 
5074 	ret = 0;
5075 out:
5076 	return ret;
5077 
5078 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5079 out_ip6_prohibit_entry:
5080 	kfree(net->ipv6.ip6_prohibit_entry);
5081 out_ip6_null_entry:
5082 	kfree(net->ipv6.ip6_null_entry);
5083 #endif
5084 out_ip6_dst_entries:
5085 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5086 out_ip6_dst_ops:
5087 	goto out;
5088 }
5089 
5090 static void __net_exit ip6_route_net_exit(struct net *net)
5091 {
5092 	kfree(net->ipv6.ip6_null_entry);
5093 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5094 	kfree(net->ipv6.ip6_prohibit_entry);
5095 	kfree(net->ipv6.ip6_blk_hole_entry);
5096 #endif
5097 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5098 }
5099 
5100 static int __net_init ip6_route_net_init_late(struct net *net)
5101 {
5102 #ifdef CONFIG_PROC_FS
5103 	proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
5104 	proc_create("rt6_stats", 0444, net->proc_net, &rt6_stats_seq_fops);
5105 #endif
5106 	return 0;
5107 }
5108 
5109 static void __net_exit ip6_route_net_exit_late(struct net *net)
5110 {
5111 #ifdef CONFIG_PROC_FS
5112 	remove_proc_entry("ipv6_route", net->proc_net);
5113 	remove_proc_entry("rt6_stats", net->proc_net);
5114 #endif
5115 }
5116 
5117 static struct pernet_operations ip6_route_net_ops = {
5118 	.init = ip6_route_net_init,
5119 	.exit = ip6_route_net_exit,
5120 };
5121 
5122 static int __net_init ipv6_inetpeer_init(struct net *net)
5123 {
5124 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5125 
5126 	if (!bp)
5127 		return -ENOMEM;
5128 	inet_peer_base_init(bp);
5129 	net->ipv6.peers = bp;
5130 	return 0;
5131 }
5132 
5133 static void __net_exit ipv6_inetpeer_exit(struct net *net)
5134 {
5135 	struct inet_peer_base *bp = net->ipv6.peers;
5136 
5137 	net->ipv6.peers = NULL;
5138 	inetpeer_invalidate_tree(bp);
5139 	kfree(bp);
5140 }
5141 
5142 static struct pernet_operations ipv6_inetpeer_ops = {
5143 	.init	=	ipv6_inetpeer_init,
5144 	.exit	=	ipv6_inetpeer_exit,
5145 };
5146 
5147 static struct pernet_operations ip6_route_net_late_ops = {
5148 	.init = ip6_route_net_init_late,
5149 	.exit = ip6_route_net_exit_late,
5150 };
5151 
5152 static struct notifier_block ip6_route_dev_notifier = {
5153 	.notifier_call = ip6_route_dev_notify,
5154 	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
5155 };
5156 
5157 void __init ip6_route_init_special_entries(void)
5158 {
5159 	/* The loopback device is registered before this portion of code
5160 	 * runs, so the loopback reference in rt6_info is not taken; do it
5161 	 * manually for init_net */
5162 	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
5163 	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5164 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5165 	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
5166 	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5167 	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
5168 	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5169 #endif
5170 }
5171 
5172 int __init ip6_route_init(void)
5173 {
5174 	int ret;
5175 	int cpu;
5176 
5177 	ret = -ENOMEM;
5178 	ip6_dst_ops_template.kmem_cachep =
5179 		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
5180 				  SLAB_HWCACHE_ALIGN, NULL);
5181 	if (!ip6_dst_ops_template.kmem_cachep)
5182 		goto out;
5183 
5184 	ret = dst_entries_init(&ip6_dst_blackhole_ops);
5185 	if (ret)
5186 		goto out_kmem_cache;
5187 
5188 	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
5189 	if (ret)
5190 		goto out_dst_entries;
5191 
5192 	ret = register_pernet_subsys(&ip6_route_net_ops);
5193 	if (ret)
5194 		goto out_register_inetpeer;
5195 
5196 	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
5197 
5198 	ret = fib6_init();
5199 	if (ret)
5200 		goto out_register_subsys;
5201 
5202 	ret = xfrm6_init();
5203 	if (ret)
5204 		goto out_fib6_init;
5205 
5206 	ret = fib6_rules_init();
5207 	if (ret)
5208 		goto xfrm6_init;
5209 
5210 	ret = register_pernet_subsys(&ip6_route_net_late_ops);
5211 	if (ret)
5212 		goto fib6_rules_init;
5213 
5214 	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
5215 				   inet6_rtm_newroute, NULL, 0);
5216 	if (ret < 0)
5217 		goto out_register_late_subsys;
5218 
5219 	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
5220 				   inet6_rtm_delroute, NULL, 0);
5221 	if (ret < 0)
5222 		goto out_register_late_subsys;
5223 
5224 	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
5225 				   inet6_rtm_getroute, NULL,
5226 				   RTNL_FLAG_DOIT_UNLOCKED);
5227 	if (ret < 0)
5228 		goto out_register_late_subsys;
5229 
5230 	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
5231 	if (ret)
5232 		goto out_register_late_subsys;
5233 
5234 	for_each_possible_cpu(cpu) {
5235 		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
5236 
5237 		INIT_LIST_HEAD(&ul->head);
5238 		spin_lock_init(&ul->lock);
5239 	}
5240 
5241 out:
5242 	return ret;
5243 
5244 out_register_late_subsys:
5245 	rtnl_unregister_all(PF_INET6);
5246 	unregister_pernet_subsys(&ip6_route_net_late_ops);
5247 fib6_rules_init:
5248 	fib6_rules_cleanup();
5249 xfrm6_init:
5250 	xfrm6_fini();
5251 out_fib6_init:
5252 	fib6_gc_cleanup();
5253 out_register_subsys:
5254 	unregister_pernet_subsys(&ip6_route_net_ops);
5255 out_register_inetpeer:
5256 	unregister_pernet_subsys(&ipv6_inetpeer_ops);
5257 out_dst_entries:
5258 	dst_entries_destroy(&ip6_dst_blackhole_ops);
5259 out_kmem_cache:
5260 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5261 	goto out;
5262 }
5263 
5264 void ip6_route_cleanup(void)
5265 {
5266 	unregister_netdevice_notifier(&ip6_route_dev_notifier);
5267 	unregister_pernet_subsys(&ip6_route_net_late_ops);
5268 	fib6_rules_cleanup();
5269 	xfrm6_fini();
5270 	fib6_gc_cleanup();
5271 	unregister_pernet_subsys(&ipv6_inetpeer_ops);
5272 	unregister_pernet_subsys(&ip6_route_net_ops);
5273 	dst_entries_destroy(&ip6_dst_blackhole_ops);
5274 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5275 }
5276