/*
 *	Linux INET6 implementation
 *	FIB front-end.
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 */

/*	Changes:
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		reachable.  otherwise, round-robin the list.
 *	Ville Nuorvala
 *		Fixed routing subtrees.
 */

#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/net_namespace.h>
#include <net/snmp.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/tcp.h>
#include <linux/rtnetlink.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/nexthop.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
#include <trace/events/fib6.h>

#include <linux/uaccess.h>

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

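/* Neighbour-reachability verdicts used when scoring candidate routes.
 * The negative values are failures of increasing severity: FAIL_DO_RR
 * asks the caller to fall back to round-robin router selection,
 * FAIL_PROBE keeps the route only as a last resort, and FAIL_HARD
 * rejects it outright (see rt6_check_neigh() and find_match() below).
 */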
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_SUCCEED = 1
};

static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
static unsigned int	 ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void		ip6_dst_destroy(struct dst_entry *);
static void		ip6_dst_ifdown(struct dst_entry *,
				       struct net_device *dev, int how);
static int		 ip6_dst_gc(struct dst_ops *ops);

static int		ip6_pkt_discard(struct sk_buff *skb);
static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int		ip6_pkt_prohibit(struct sk_buff *skb);
static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void		ip6_link_failure(struct sk_buff *skb);
static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
static size_t rt6_nlmsg_size(struct fib6_info *rt);
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags);
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr);

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct fib6_info *rt6_add_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev,
					   unsigned int pref);
static struct fib6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev);
#endif

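/* Per-cpu lists tracking rt6_info entries that live outside the fib6
 * tree (e.g. the uncached RTF_CACHE clones created in ip6_pol_route()),
 * so that rt6_uncached_list_flush_dev() can re-point them at the
 * loopback device when their original device disappears.
 */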
struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);

void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}

void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}

static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}

static inline const void *choose_neigh_daddr(const struct in6_addr *p,
					     struct sk_buff *skb,
					     const void *daddr)
{
	if (!ipv6_addr_any(p))
		return (const void *) p;
	else if (skb)
		return &ipv6_hdr(skb)->daddr;
	return daddr;
}

struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
				   struct net_device *dev,
				   struct sk_buff *skb,
				   const void *daddr)
{
	struct neighbour *n;

	daddr = choose_neigh_daddr(gw, skb, daddr);
	n = __ipv6_neigh_lookup(dev, daddr);
	if (n)
		return n;
	return neigh_create(&nd_tbl, daddr, dev);
}

static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
					      struct sk_buff *skb,
					      const void *daddr)
{
	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);

	return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
}

static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}

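/* dst_ops for ordinary rt6_info entries.  The blackhole variant below
 * shares most handlers but turns update_pmtu and redirect into no-ops,
 * so a dst can keep routing packets while ignoring external updates.
 */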
static struct dst_ops ip6_dst_ops_template = {
	.family			=	AF_INET6,
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
	.default_advmss		=	ip6_default_advmss,
	.mtu			=	ip6_mtu,
	.cow_metrics		=	dst_cow_metrics_generic,
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
	.redirect		=	rt6_do_redirect,
	.local_out		=	__ip6_local_out,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
	.confirm_neigh		=	ip6_confirm_neigh,
};

static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}

static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}

static struct dst_ops ip6_dst_blackhole_ops = {
	.family			=	AF_INET6,
	.destroy		=	ip6_dst_destroy,
	.check			=	ip6_dst_check,
	.mtu			=	ip6_blackhole_mtu,
	.default_advmss		=	ip6_default_advmss,
	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
	.redirect		=	ip6_rt_blackhole_redirect,
	.cow_metrics		=	dst_cow_metrics_generic,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
};

static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};

static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol  = RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= ATOMIC_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};

static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#endif

static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}

/* allocate dst with ip6_dst_ops */
struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);

static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from = rt->from;
	struct inet6_dev *idev;

	dst_destroy_metrics_generic(dst);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	rt->from = NULL;
	fib6_info_release(from);
}

static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}

static bool __rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES)
		return time_after(jiffies, rt->dst.expires);
	else
		return false;
}

static bool rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (rt->from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(rt->from);
	}
	return false;
}

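/* Hash-threshold multipath selection (in the spirit of RFC 2992): the
 * flow hash is compared against each sibling's nh_upper_bound, and the
 * first nexthop whose bound the hash does not exceed is chosen.  With
 * two equal-weight siblings, for example, each owns roughly half of
 * the hash space.
 */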
static struct fib6_info *rt6_multipath_select(const struct net *net,
					      struct fib6_info *match,
					      struct flowi6 *fl6, int oif,
					      const struct sk_buff *skb,
					      int strict)
{
	struct fib6_info *sibling, *next_sibling;

	/* We might have already computed the hash for ICMPv6 errors. In
	 * such a case it will always be non-zero. Otherwise now is the
	 * time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
		return match;

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		if (rt6_score_route(sibling, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

	return match;
}

/*
 *	Route lookup. rcu_read_lock() should be held.
 */

static inline struct fib6_info *rt6_device_match(struct net *net,
						 struct fib6_info *rt,
						 const struct in6_addr *saddr,
						 int oif,
						 int flags)
{
	struct fib6_info *local = NULL;
	struct fib6_info *sprt;

	if (!oif && ipv6_addr_any(saddr) &&
	    !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
		return rt;

	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->rt6_next)) {
		const struct net_device *dev = sprt->fib6_nh.nh_dev;

		if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
			continue;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
			if (dev->flags & IFF_LOOPBACK) {
				if (!sprt->fib6_idev ||
				    sprt->fib6_idev->dev->ifindex != oif) {
					if (flags & RT6_LOOKUP_F_IFACE)
						continue;
					if (local &&
					    local->fib6_idev->dev->ifindex == oif)
						continue;
				}
				local = sprt;
			}
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif) {
		if (local)
			return local;

		if (flags & RT6_LOOKUP_F_IFACE)
			return net->ipv6.fib6_null_entry;
	}

	return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
}

#ifdef CONFIG_IPV6_ROUTER_PREF
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};

static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}

static void rt6_probe(struct fib6_info *rt)
{
	struct __rt6_probe_work *work;
	const struct in6_addr *nh_gw;
	struct neighbour *neigh;
	struct net_device *dev;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now; however, we need to check whether it
	 * really is -- aka Router Reachability Probing.
	 *
	 * Router Reachability Probes MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
		return;

	nh_gw = &rt->fib6_nh.nh_gw;
	dev = rt->fib6_nh.nh_dev;
	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
	if (neigh) {
		if (neigh->nud_state & NUD_VALID)
			goto out;

		work = NULL;
		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated +
			       rt->fib6_idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = *nh_gw;
		dev_hold(dev);
		work->dev = dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
#else
static inline void rt6_probe(struct fib6_info *rt)
{
}
#endif

/*
 * Default Router Selection (RFC 2461 6.3.6)
 */
static inline int rt6_check_dev(struct fib6_info *rt, int oif)
{
	const struct net_device *dev = rt->fib6_nh.nh_dev;

	if (!oif || dev->ifindex == oif)
		return 2;
	if ((dev->flags & IFF_LOOPBACK) &&
	    rt->fib6_idev && rt->fib6_idev->dev->ifindex == oif)
		return 1;
	return 0;
}

static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
{
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	if (rt->fib6_flags & RTF_NONEXTHOP ||
	    !(rt->fib6_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
					  &rt->fib6_nh.nh_gw);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}

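/* Score a candidate route: the device match from rt6_check_dev() (0, 1
 * or 2) is combined with the RA-advertised router preference shifted
 * left by two bits, so e.g. a high-preference route on the requested
 * oif scores 2 | (pref << 2).  Negative values propagate the
 * rt6_nud_state failure codes from rt6_check_neigh().
 */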
static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
	return m;
}

static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
				    int *mpri, struct fib6_info *match,
				    bool *do_rr)
{
	int m;
	bool match_do_rr = false;
	struct inet6_dev *idev = rt->fib6_idev;

	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		goto out;

	if (idev->cnf.ignore_routes_with_linkdown &&
	    rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (fib6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}

static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
				      struct fib6_info *leaf,
				      struct fib6_info *rr_head,
				      u32 metric, int oif, int strict,
				      bool *do_rr)
{
	struct fib6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	for (rt = rr_head; rt; rt = rcu_dereference(rt->rt6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	for (rt = leaf; rt && rt != rr_head;
	     rt = rcu_dereference(rt->rt6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	for (rt = cont; rt; rt = rcu_dereference(rt->rt6_next))
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}

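/* Select the best route from the node's leaf list.  Candidates sharing
 * rr_head's metric are scored first; when find_match() reports
 * RT6_NUD_FAIL_DO_RR, rr_ptr is advanced so that successive lookups
 * round-robin among routers of equal preference (see the changelog
 * note at the top of this file).
 */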
static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
				    int oif, int strict)
{
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *match, *rt0;
	bool do_rr = false;
	int key_plen;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		return net->ipv6.fib6_null_entry;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not point to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.fib6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct fib6_info *next = rcu_dereference(rt0->rt6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->fib6_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->fib6_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
		}
	}

	return match ? match : net->ipv6.fib6_null_entry;
}

static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
{
	return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
}

#ifdef CONFIG_IPV6_ROUTE_INFO
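/* Handle a Route Information Option received in a Router Advertisement
 * (RFC 4191): validate the encoded prefix length against the option
 * length, then add, refresh or delete the corresponding RTF_ROUTEINFO
 * route according to the advertised lifetime and preference.
 */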
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	if (rt && !lifetime) {
		ip6_del_rt(net, rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
		else
			fib6_set_expires(rt, jiffies + HZ * lifetime);

		fib6_info_release(rt);
	}
	return 0;
}
#endif

/*
 *	Misc support functions
 */

/* called with rcu_read_lock held */
static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
{
	struct net_device *dev = rt->fib6_nh.nh_dev;

	if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
		/* for copies of local routes, dst->dev needs to be the
		 * device itself if it is a master device, the master
		 * device if the device is enslaved, and the loopback
		 * device as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&rt->fib6_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* the last case is netif_is_l3_master(dev) being true,
		 * in which case we want dev itself to be returned
		 */
	}

	return dev;
}

static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]	= 0,
	[RTN_UNICAST]	= 0,
	[RTN_LOCAL]	= 0,
	[RTN_BROADCAST]	= 0,
	[RTN_ANYCAST]	= 0,
	[RTN_MULTICAST]	= 0,
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_NAT]	= -EINVAL,
	[RTN_XRESOLVE]	= -EINVAL,
};

static int ip6_rt_type_to_error(u8 fib6_type)
{
	return fib6_prop[fib6_type];
}

static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
{
	unsigned short flags = 0;

	if (rt->dst_nocount)
		flags |= DST_NOCOUNT;
	if (rt->dst_nopolicy)
		flags |= DST_NOPOLICY;
	if (rt->dst_host)
		flags |= DST_HOST;

	return flags;
}

static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
{
	rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);

	switch (ort->fib6_type) {
	case RTN_BLACKHOLE:
		rt->dst.output = dst_discard_out;
		rt->dst.input = dst_discard;
		break;
	case RTN_PROHIBIT:
		rt->dst.output = ip6_pkt_prohibit_out;
		rt->dst.input = ip6_pkt_prohibit;
		break;
	case RTN_THROW:
	case RTN_UNREACHABLE:
	default:
		rt->dst.output = ip6_pkt_discard_out;
		rt->dst.input = ip6_pkt_discard;
		break;
	}
}

static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
{
	rt->dst.flags |= fib6_info_dst_flags(ort);

	if (ort->fib6_flags & RTF_REJECT) {
		ip6_rt_init_dst_reject(rt, ort);
		return;
	}

	rt->dst.error = 0;
	rt->dst.output = ip6_output;

	if (ort->fib6_type == RTN_LOCAL) {
		rt->dst.input = ip6_input;
	} else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
		rt->dst.input = ip6_mc_input;
	} else {
		rt->dst.input = ip6_forward;
	}

	if (ort->fib6_nh.nh_lwtstate) {
		rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
		lwtunnel_set_redirect(&rt->dst);
	}

	rt->dst.lastuse = jiffies;
}

static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
	rt->rt6i_flags &= ~RTF_EXPIRES;
	fib6_info_hold(from);
	rt->from = from;
	dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true);
	if (from->fib6_metrics != &dst_default_metrics) {
		rt->dst._metrics |= DST_METRICS_REFCOUNTED;
		refcount_inc(&from->fib6_metrics->refcnt);
	}
}

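/* Initialise a rt6_info clone (cache or percpu copy) from its parent
 * fib6_info: copy addresses, flags and lwtunnel state, take idev and
 * metrics references, and tie rt->from to the parent so later expiry
 * and invalidation checks can consult it.
 */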
static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
{
	ip6_rt_init_dst(rt, ort);

	rt->rt6i_dst = ort->fib6_dst;
	rt->rt6i_idev = ort->fib6_idev;
	if (rt->rt6i_idev)
		in6_dev_hold(rt->rt6i_idev);
	rt->rt6i_gateway = ort->fib6_nh.nh_gw;
	rt->rt6i_flags = ort->fib6_flags;
	rt6_set_from(rt, ort);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->fib6_src;
#endif
	rt->rt6i_prefsrc = ort->fib6_prefsrc;
	rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
}

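/* Walk back up the trie (and out of any source-routed subtree) until a
 * node carrying usable route info (RTN_RTINFO) is found, or give up at
 * the tree root.  Used after a lookup lands on fib6_null_entry.
 */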
static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}

static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
			  bool null_fallback)
{
	struct rt6_info *rt = *prt;

	if (dst_hold_safe(&rt->dst))
		return true;
	if (null_fallback) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = NULL;
	}
	*prt = rt;
	return false;
}

/* called with rcu_read_lock held */
static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev = rt->fib6_nh.nh_dev;
	struct rt6_info *nrt;

	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
	if (nrt)
		ip6_rt_copy_init(nrt, rt);

	return nrt;
}

static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct fib6_info *f6i;
	struct fib6_node *fn;
	struct rt6_info *rt;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	f6i = rcu_dereference(fn->leaf);
	if (!f6i) {
		f6i = net->ipv6.fib6_null_entry;
	} else {
		f6i = rt6_device_match(net, f6i, &fl6->saddr,
				      fl6->flowi6_oif, flags);
		if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
			f6i = rt6_multipath_select(net, f6i, fl6,
						   fl6->flowi6_oif, skb, flags);
	}
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

	/* Search through the exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);
	} else if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = ip6_create_rt_rcu(f6i);
		if (!rt) {
			rt = net->ipv6.ip6_null_entry;
			dst_hold(&rt->dst);
		}
	}

	rcu_read_unlock();

	trace_fib6_table_lookup(net, rt, table, fl6);

	return rt;
}

struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);

struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif,
			    const struct sk_buff *skb, int strict)
{
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
	};
	struct dst_entry *dst;
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

	if (saddr) {
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	dst_release(dst);

	return NULL;
}
EXPORT_SYMBOL(rt6_lookup);

/* ip6_ins_rt is called without table->tb6_lock held; it takes the lock
 * itself.  It takes a new route entry; if the addition fails for any
 * reason, the route is released.
 * The caller must hold a reference before calling it.
 */

static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, extack);
	spin_unlock_bh(&table->tb6_lock);

	return err;
}

int ip6_ins_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = {	.nl_net = net, };

	return __ip6_ins_rt(rt, &info, NULL);
}

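/* Allocate an RTF_CACHE clone of ort for a specific (daddr[, saddr])
 * pair; the /128 destination makes it a host route suitable for
 * carrying per-destination state such as a learned PMTU.
 */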
static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(ort);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	rcu_read_unlock();
	if (!rt)
		return NULL;

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		if (ort->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}

static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(rt);
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
	rcu_read_unlock();
	if (!pcpu_rt)
		return NULL;
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}

/* It should be called with rcu_read_lock() acquired */
static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

	if (pcpu_rt)
		ip6_hold_safe(NULL, &pcpu_rt, false);

	return pcpu_rt;
}

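/* Install a freshly allocated percpu clone.  Writers for a given CPU
 * slot run with BHs disabled (see the local_bh_disable() in
 * ip6_pol_route()), so only this context can fill the slot; hence the
 * BUG_ON() if cmpxchg() ever observes a previous value.
 */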
static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	return pcpu_rt;
}

/* exception hash table implementation */
static DEFINE_SPINLOCK(rt6_exception_lock);

/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	hlist_del_rcu(&rt6_ex->hlist);
	dst_release(&rt6_ex->rt6i->dst);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
	net->ipv6.rt6_stats->fib_rt_cache--;
}

/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
	struct rt6_exception *rt6_ex, *oldest = NULL;

	if (!bucket)
		return;

	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
			oldest = rt6_ex;
	}
	rt6_remove_exception(bucket, oldest);
}

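/* Hash (daddr[, saddr]) into one of the FIB6_EXCEPTION_BUCKET_SIZE
 * buckets.  The per-boot random seed keeps the bucket distribution
 * unpredictable to remote hosts.
 */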
static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

static unsigned int fib6_mtu(const struct fib6_info *rt)
{
	unsigned int mtu;

	mtu = rt->fib6_pmtu ? : rt->fib6_idev->cnf.mtu6;
	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
}

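/* Add nrt as an exception (e.g. a PMTU or redirect clone) of ort.  On
 * success the table sernum is bumped so stale dst entries fail their
 * next ip6_dst_check(); a bucket deeper than FIB6_MAX_DEPTH evicts its
 * oldest entry.
 */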
static int rt6_insert_exception(struct rt6_info *nrt,
				struct fib6_info *ort)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif

	/* Update rt6i_prefsrc as it could be changed
	 * in rt6_remove_prefsrc()
	 */
	nrt->rt6i_prefsrc = ort->fib6_prefsrc;
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
		err = -EINVAL;
		goto out;
	}

	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&ort->fib6_table->tb6_lock);
		fib6_update_sernum(net, ort);
		spin_unlock_bh(&ort->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}

void rt6_flush_exceptions(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() from recreating the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}

/* Find the cached rt in the hash table inside the passed-in rt
 * Caller has to hold rcu_read_lock()
 */
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

	bucket = rcu_dereference(rt->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (rt->fib6_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

	return res;
}

/* Remove the passed in cached rt from the hash table that contains it */
static int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct fib6_info *from = rt->from;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err;

	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}

/* Find the rt6_ex which contains the passed-in rt cache and
 * refresh its stamp
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct fib6_info *from = rt->from;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;

	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return;

	rcu_read_lock();
	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

	rcu_read_unlock();
}

static void rt6_exceptions_remove_prefsrc(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
				rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
			}
			bucket++;
		}
	}
}

static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
					 struct rt6_info *rt, int mtu)
{
	/* If the new MTU is lower than the route PMTU, this new MTU will be the
	 * lowest MTU in the path: always allow updating the route PMTU to
	 * reflect PMTU decreases.
	 *
	 * If the new MTU is higher, and the route PMTU is equal to the local
	 * MTU, this means the old MTU is the lowest in the path, so allow
	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
	 * handle this.
	 */

	if (dst_mtu(&rt->dst) >= mtu)
		return true;

	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
		return true;

	return false;
}

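/* Propagate a device MTU change to all cached exception routes hanging
 * off rt, subject to the rules in rt6_mtu_change_route_allowed() above.
 */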
static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct fib6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
		}
		bucket++;
	}
}

#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				     lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}

static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* We are pruning and obsoleting aged-out and non-gateway
	 * exceptions even if others still hold references to them, so
	 * that on the next dst_check() such references can be dropped.
	 * RTF_EXPIRES exceptions - e.g. PMTU-generated ones - are pruned
	 * once expired, independently of their aging, as per RFC 8201
	 * section 4.
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
			neigh_flags = neigh->flags;

		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}

void rt6_age_exceptions(struct fib6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}

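/* Core policy-routing lookup: walk the table for the best fib6_info,
 * then return a dst in one of three forms - a cached exception route,
 * an uncached RTF_CACHE clone (for FLOWI_FLAG_KNOWN_NH lookups without
 * a gateway), or the per-cpu copy of the matched route.
 */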
1789 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1790 			       int oif, struct flowi6 *fl6,
1791 			       const struct sk_buff *skb, int flags)
1792 {
1793 	struct fib6_node *fn, *saved_fn;
1794 	struct fib6_info *f6i;
1795 	struct rt6_info *rt;
1796 	int strict = 0;
1797 
1798 	strict |= flags & RT6_LOOKUP_F_IFACE;
1799 	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1800 	if (net->ipv6.devconf_all->forwarding == 0)
1801 		strict |= RT6_LOOKUP_F_REACHABLE;
1802 
1803 	rcu_read_lock();
1804 
1805 	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1806 	saved_fn = fn;
1807 
1808 	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1809 		oif = 0;
1810 
1811 redo_rt6_select:
1812 	f6i = rt6_select(net, fn, oif, strict);
1813 	if (f6i->fib6_nsiblings)
1814 		f6i = rt6_multipath_select(net, f6i, fl6, oif, skb, strict);
1815 	if (f6i == net->ipv6.fib6_null_entry) {
1816 		fn = fib6_backtrack(fn, &fl6->saddr);
1817 		if (fn)
1818 			goto redo_rt6_select;
1819 		else if (strict & RT6_LOOKUP_F_REACHABLE) {
1820 			/* also consider unreachable route */
1821 			strict &= ~RT6_LOOKUP_F_REACHABLE;
1822 			fn = saved_fn;
1823 			goto redo_rt6_select;
1824 		}
1825 	}
1826 
1827 	if (f6i == net->ipv6.fib6_null_entry) {
1828 		rt = net->ipv6.ip6_null_entry;
1829 		rcu_read_unlock();
1830 		dst_hold(&rt->dst);
1831 		trace_fib6_table_lookup(net, rt, table, fl6);
1832 		return rt;
1833 	}
1834 
1835 	/*Search through exception table */
1836 	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
1837 	if (rt) {
1838 		if (ip6_hold_safe(net, &rt, true))
1839 			dst_use_noref(&rt->dst, jiffies);
1840 
1841 		rcu_read_unlock();
1842 		trace_fib6_table_lookup(net, rt, table, fl6);
1843 		return rt;
1844 	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1845 			    !(f6i->fib6_flags & RTF_GATEWAY))) {
1846 		/* Create a RTF_CACHE clone which will not be
1847 		 * owned by the fib6 tree.  It is for the special case where
1848 		 * the daddr in the skb during the neighbor look-up is different
1849 		 * from the fl6->daddr used to look-up route here.
1850 		 */
1851 
1852 		struct rt6_info *uncached_rt;
1853 
1854 		fib6_info_hold(f6i);
1855 		rcu_read_unlock();
1856 
1857 		uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);
1858 		fib6_info_release(f6i);
1859 
1860 		if (uncached_rt) {
1861 			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1862 			 * No need for another dst_hold()
1863 			 */
1864 			rt6_uncached_list_add(uncached_rt);
1865 			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
1866 		} else {
1867 			uncached_rt = net->ipv6.ip6_null_entry;
1868 			dst_hold(&uncached_rt->dst);
1869 		}
1870 
1871 		trace_fib6_table_lookup(net, uncached_rt, table, fl6);
1872 		return uncached_rt;
1873 
1874 	} else {
1875 		/* Get a percpu copy */
1876 
1877 		struct rt6_info *pcpu_rt;
1878 
1879 		local_bh_disable();
1880 		pcpu_rt = rt6_get_pcpu_route(f6i);
1881 
1882 		if (!pcpu_rt)
1883 			pcpu_rt = rt6_make_pcpu_route(net, f6i);
1884 
1885 		local_bh_enable();
1886 		rcu_read_unlock();
1887 		trace_fib6_table_lookup(net, pcpu_rt, table, fl6);
1888 		return pcpu_rt;
1889 	}
1890 }
1891 EXPORT_SYMBOL_GPL(ip6_pol_route);
1892 
1893 static struct rt6_info *ip6_pol_route_input(struct net *net,
1894 					    struct fib6_table *table,
1895 					    struct flowi6 *fl6,
1896 					    const struct sk_buff *skb,
1897 					    int flags)
1898 {
1899 	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
1900 }
1901 
1902 struct dst_entry *ip6_route_input_lookup(struct net *net,
1903 					 struct net_device *dev,
1904 					 struct flowi6 *fl6,
1905 					 const struct sk_buff *skb,
1906 					 int flags)
1907 {
1908 	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1909 		flags |= RT6_LOOKUP_F_IFACE;
1910 
1911 	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
1912 }
1913 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1914 
1915 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1916 				  struct flow_keys *keys,
1917 				  struct flow_keys *flkeys)
1918 {
1919 	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1920 	const struct ipv6hdr *key_iph = outer_iph;
1921 	struct flow_keys *_flkeys = flkeys;
1922 	const struct ipv6hdr *inner_iph;
1923 	const struct icmp6hdr *icmph;
1924 	struct ipv6hdr _inner_iph;
1925 
1926 	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1927 		goto out;
1928 
1929 	icmph = icmp6_hdr(skb);
1930 	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1931 	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1932 	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1933 	    icmph->icmp6_type != ICMPV6_PARAMPROB)
1934 		goto out;
1935 
1936 	inner_iph = skb_header_pointer(skb,
1937 				       skb_transport_offset(skb) + sizeof(*icmph),
1938 				       sizeof(_inner_iph), &_inner_iph);
1939 	if (!inner_iph)
1940 		goto out;
1941 
1942 	key_iph = inner_iph;
1943 	_flkeys = NULL;
1944 out:
1945 	if (_flkeys) {
1946 		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
1947 		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
1948 		keys->tags.flow_label = _flkeys->tags.flow_label;
1949 		keys->basic.ip_proto = _flkeys->basic.ip_proto;
1950 	} else {
1951 		keys->addrs.v6addrs.src = key_iph->saddr;
1952 		keys->addrs.v6addrs.dst = key_iph->daddr;
1953 		keys->tags.flow_label = ip6_flowinfo(key_iph);
1954 		keys->basic.ip_proto = key_iph->nexthdr;
1955 	}
1956 }
1957 
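/* Hashing on the inner (offending) header of ICMPv6 errors makes an
 * error packet hash like the flow that triggered it, so Packet Too Big
 * and friends follow the same multipath nexthop as the original flow.
 * The pre-dissected _flkeys are dropped in that case because they
 * describe the outer (error) packet, not the embedded one.
 */
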
1958 /* If skb is set it will be used and fl6 can be NULL. */
1959 u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
1960 		       const struct sk_buff *skb, struct flow_keys *flkeys)
1961 {
1962 	struct flow_keys hash_keys;
1963 	u32 mhash;
1964 
1965 	switch (ip6_multipath_hash_policy(net)) {
1966 	case 0:
1967 		memset(&hash_keys, 0, sizeof(hash_keys));
1968 		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1969 		if (skb) {
1970 			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
1971 		} else {
1972 			hash_keys.addrs.v6addrs.src = fl6->saddr;
1973 			hash_keys.addrs.v6addrs.dst = fl6->daddr;
1974 			hash_keys.tags.flow_label = (__force u32)fl6->flowlabel;
1975 			hash_keys.basic.ip_proto = fl6->flowi6_proto;
1976 		}
1977 		break;
1978 	case 1:
1979 		if (skb) {
1980 			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1981 			struct flow_keys keys;
1982 
1983 			/* short-circuit if an L4 hash is already present */
1984 			if (skb->l4_hash)
1985 				return skb_get_hash_raw(skb) >> 1;
1986 
1987 			memset(&hash_keys, 0, sizeof(hash_keys));
1988 
1989 			if (!flkeys) {
1990 				skb_flow_dissect_flow_keys(skb, &keys, flag);
1991 				flkeys = &keys;
1992 			}
1993 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1994 			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
1995 			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
1996 			hash_keys.ports.src = flkeys->ports.src;
1997 			hash_keys.ports.dst = flkeys->ports.dst;
1998 			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
1999 		} else {
2000 			memset(&hash_keys, 0, sizeof(hash_keys));
2001 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2002 			hash_keys.addrs.v6addrs.src = fl6->saddr;
2003 			hash_keys.addrs.v6addrs.dst = fl6->daddr;
2004 			hash_keys.ports.src = fl6->fl6_sport;
2005 			hash_keys.ports.dst = fl6->fl6_dport;
2006 			hash_keys.basic.ip_proto = fl6->flowi6_proto;
2007 		}
2008 		break;
2009 	}
2010 	mhash = flow_hash_from_keys(&hash_keys);
2011 
2012 	return mhash >> 1;
2013 }
2014 
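/* Policy 0 above hashes on L3 fields (addresses, flow label, protocol);
 * policy 1 hashes on the L4 five-tuple.  The policy comes from the
 * net.ipv6.fib_multipath_hash_policy sysctl, e.g.:
 *
 *	sysctl -w net.ipv6.fib_multipath_hash_policy=1
 *
 * The final ">> 1" keeps the result within 31 bits so it can be
 * compared against the nexthop upper bounds computed in
 * rt6_upper_bound_set() below.
 */
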
2015 void ip6_route_input(struct sk_buff *skb)
2016 {
2017 	const struct ipv6hdr *iph = ipv6_hdr(skb);
2018 	struct net *net = dev_net(skb->dev);
2019 	int flags = RT6_LOOKUP_F_HAS_SADDR;
2020 	struct ip_tunnel_info *tun_info;
2021 	struct flowi6 fl6 = {
2022 		.flowi6_iif = skb->dev->ifindex,
2023 		.daddr = iph->daddr,
2024 		.saddr = iph->saddr,
2025 		.flowlabel = ip6_flowinfo(iph),
2026 		.flowi6_mark = skb->mark,
2027 		.flowi6_proto = iph->nexthdr,
2028 	};
2029 	struct flow_keys *flkeys = NULL, _flkeys;
2030 
2031 	tun_info = skb_tunnel_info(skb);
2032 	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2033 		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
2034 
2035 	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
2036 		flkeys = &_flkeys;
2037 
2038 	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
2039 		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
2040 	skb_dst_drop(skb);
2041 	skb_dst_set(skb,
2042 		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
2043 }
2044 
2045 static struct rt6_info *ip6_pol_route_output(struct net *net,
2046 					     struct fib6_table *table,
2047 					     struct flowi6 *fl6,
2048 					     const struct sk_buff *skb,
2049 					     int flags)
2050 {
2051 	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
2052 }
2053 
2054 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
2055 					 struct flowi6 *fl6, int flags)
2056 {
2057 	bool any_src;
2058 
2059 	if (rt6_need_strict(&fl6->daddr)) {
2060 		struct dst_entry *dst;
2061 
2062 		dst = l3mdev_link_scope_lookup(net, fl6);
2063 		if (dst)
2064 			return dst;
2065 	}
2066 
2067 	fl6->flowi6_iif = LOOPBACK_IFINDEX;
2068 
2069 	any_src = ipv6_addr_any(&fl6->saddr);
2070 	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
2071 	    (fl6->flowi6_oif && any_src))
2072 		flags |= RT6_LOOKUP_F_IFACE;
2073 
2074 	if (!any_src)
2075 		flags |= RT6_LOOKUP_F_HAS_SADDR;
2076 	else if (sk)
2077 		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
2078 
2079 	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
2080 }
2081 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
2082 
2083 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2084 {
2085 	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
2086 	struct net_device *loopback_dev = net->loopback_dev;
2087 	struct dst_entry *new = NULL;
2088 
2089 	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
2090 		       DST_OBSOLETE_DEAD, 0);
2091 	if (rt) {
2092 		rt6_info_init(rt);
2093 		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
2094 
2095 		new = &rt->dst;
2096 		new->__use = 1;
2097 		new->input = dst_discard;
2098 		new->output = dst_discard_out;
2099 
2100 		dst_copy_metrics(new, &ort->dst);
2101 
2102 		rt->rt6i_idev = in6_dev_get(loopback_dev);
2103 		rt->rt6i_gateway = ort->rt6i_gateway;
2104 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
2105 
2106 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
2107 #ifdef CONFIG_IPV6_SUBTREES
2108 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
2109 #endif
2110 	}
2111 
2112 	dst_release(dst_orig);
2113 	return new ? new : ERR_PTR(-ENOMEM);
2114 }
2115 
2116 /*
2117  *	Destination cache support functions
2118  */
2119 
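/* Cached dsts are validated lazily: each dst carries a cookie (a
 * snapshot of its fib6 node's sernum) and ->check() compares it with
 * the current tree state, so any FIB change invalidates every cached
 * dst without walking them.  A typical caller pattern, as in
 * ip6_sk_update_pmtu() below:
 *
 *	dst = __sk_dst_get(sk);
 *	if (dst && dst->obsolete &&
 *	    !dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
 *		... re-do the route lookup ...
 */
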
2120 static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2121 {
2122 	u32 rt_cookie = 0;
2123 
2124 	if ((f6i && !rt6_get_cookie_safe(f6i, &rt_cookie)) ||
2125 	     rt_cookie != cookie)
2126 		return false;
2127 
2128 	if (fib6_check_expired(f6i))
2129 		return false;
2130 
2131 	return true;
2132 }
2133 
2134 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
2135 {
2136 	u32 rt_cookie = 0;
2137 
2138 	if ((rt->from && !rt6_get_cookie_safe(rt->from, &rt_cookie)) ||
2139 	    rt_cookie != cookie)
2140 		return NULL;
2141 
2142 	if (rt6_check_expired(rt))
2143 		return NULL;
2144 
2145 	return &rt->dst;
2146 }
2147 
2148 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
2149 {
2150 	if (!__rt6_check_expired(rt) &&
2151 	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2152 	    fib6_check(rt->from, cookie))
2153 		return &rt->dst;
2154 	else
2155 		return NULL;
2156 }
2157 
2158 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
2159 {
2160 	struct rt6_info *rt;
2161 
2162 	rt = (struct rt6_info *) dst;
2163 
2164 	/* All IPv6 dsts are created with ->obsolete set to the value
2165 	 * DST_OBSOLETE_FORCE_CHK, which forces validation calls down
2166 	 * into this function always.
2167 	 */
2168 
2169 	if (rt->rt6i_flags & RTF_PCPU ||
2170 	    (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from))
2171 		return rt6_dst_from_check(rt, cookie);
2172 	else
2173 		return rt6_check(rt, cookie);
2174 }
2175 
2176 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
2177 {
2178 	struct rt6_info *rt = (struct rt6_info *) dst;
2179 
2180 	if (rt) {
2181 		if (rt->rt6i_flags & RTF_CACHE) {
2182 			if (rt6_check_expired(rt)) {
2183 				rt6_remove_exception_rt(rt);
2184 				dst = NULL;
2185 			}
2186 		} else {
2187 			dst_release(dst);
2188 			dst = NULL;
2189 		}
2190 	}
2191 	return dst;
2192 }
2193 
2194 static void ip6_link_failure(struct sk_buff *skb)
2195 {
2196 	struct rt6_info *rt;
2197 
2198 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2199 
2200 	rt = (struct rt6_info *) skb_dst(skb);
2201 	if (rt) {
2202 		if (rt->rt6i_flags & RTF_CACHE) {
2203 			if (dst_hold_safe(&rt->dst))
2204 				rt6_remove_exception_rt(rt);
2205 		} else if (rt->from) {
2206 			struct fib6_node *fn;
2207 
2208 			rcu_read_lock();
2209 			fn = rcu_dereference(rt->from->fib6_node);
2210 			if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2211 				fn->fn_sernum = -1;
2212 			rcu_read_unlock();
2213 		}
2214 	}
2215 }
2216 
2217 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2218 {
2219 	struct net *net = dev_net(rt->dst.dev);
2220 
2221 	dst_metric_set(&rt->dst, RTAX_MTU, mtu);
2222 	rt->rt6i_flags |= RTF_MODIFIED;
2223 	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2224 }
2225 
2226 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2227 {
2228 	return !(rt->rt6i_flags & RTF_CACHE) &&
2229 		(rt->rt6i_flags & RTF_PCPU || rt->from);
2230 }
2231 
2232 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2233 				 const struct ipv6hdr *iph, u32 mtu)
2234 {
2235 	const struct in6_addr *daddr, *saddr;
2236 	struct rt6_info *rt6 = (struct rt6_info *)dst;
2237 
2238 	if (rt6->rt6i_flags & RTF_LOCAL)
2239 		return;
2240 
2241 	if (dst_metric_locked(dst, RTAX_MTU))
2242 		return;
2243 
2244 	if (iph) {
2245 		daddr = &iph->daddr;
2246 		saddr = &iph->saddr;
2247 	} else if (sk) {
2248 		daddr = &sk->sk_v6_daddr;
2249 		saddr = &inet6_sk(sk)->saddr;
2250 	} else {
2251 		daddr = NULL;
2252 		saddr = NULL;
2253 	}
2254 	dst_confirm_neigh(dst, daddr);
2255 	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2256 	if (mtu >= dst_mtu(dst))
2257 		return;
2258 
2259 	if (!rt6_cache_allowed_for_pmtu(rt6)) {
2260 		rt6_do_update_pmtu(rt6, mtu);
2261 		/* update rt6_ex->stamp for cache */
2262 		if (rt6->rt6i_flags & RTF_CACHE)
2263 			rt6_update_exception_stamp_rt(rt6);
2264 	} else if (daddr) {
2265 		struct rt6_info *nrt6;
2266 
2267 		nrt6 = ip6_rt_cache_alloc(rt6->from, daddr, saddr);
2268 		if (nrt6) {
2269 			rt6_do_update_pmtu(nrt6, mtu);
2270 			if (rt6_insert_exception(nrt6, rt6->from))
2271 				dst_release_immediate(&nrt6->dst);
2272 		}
2273 	}
2274 }
2275 
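/* Two update strategies above: an existing RTF_CACHE exception has its
 * RTAX_MTU metric rewritten in place (and its exception stamp
 * refreshed), while a shared entry gets a new RTF_CACHE exception
 * clone of rt6->from, so the learned PMTU never modifies the shared
 * fib entry.  Note that a learned PMTU below IPV6_MIN_MTU (1280) is
 * first rounded up to that minimum.
 */
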
2276 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2277 			       struct sk_buff *skb, u32 mtu)
2278 {
2279 	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2280 }
2281 
2282 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2283 		     int oif, u32 mark, kuid_t uid)
2284 {
2285 	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2286 	struct dst_entry *dst;
2287 	struct flowi6 fl6;
2288 
2289 	memset(&fl6, 0, sizeof(fl6));
2290 	fl6.flowi6_oif = oif;
2291 	fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
2292 	fl6.daddr = iph->daddr;
2293 	fl6.saddr = iph->saddr;
2294 	fl6.flowlabel = ip6_flowinfo(iph);
2295 	fl6.flowi6_uid = uid;
2296 
2297 	dst = ip6_route_output(net, NULL, &fl6);
2298 	if (!dst->error)
2299 		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2300 	dst_release(dst);
2301 }
2302 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2303 
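/* Hypothetical caller sketch (not from this file): a tunnel driver
 * propagating a too-big error from its underlay would do roughly
 *
 *	ip6_update_pmtu(skb, dev_net(dev), htonl(new_mtu),
 *			0, 0, sock_net_uid(dev_net(dev), NULL));
 *
 * i.e. pass the mtu in network byte order; a mark of 0 falls back to
 * IP6_REPLY_MARK() above.
 */
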
2304 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2305 {
2306 	struct dst_entry *dst;
2307 
2308 	ip6_update_pmtu(skb, sock_net(sk), mtu,
2309 			sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);
2310 
2311 	dst = __sk_dst_get(sk);
2312 	if (!dst || !dst->obsolete ||
2313 	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2314 		return;
2315 
2316 	bh_lock_sock(sk);
2317 	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2318 		ip6_datagram_dst_update(sk, false);
2319 	bh_unlock_sock(sk);
2320 }
2321 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2322 
2323 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
2324 			   const struct flowi6 *fl6)
2325 {
2326 #ifdef CONFIG_IPV6_SUBTREES
2327 	struct ipv6_pinfo *np = inet6_sk(sk);
2328 #endif
2329 
2330 	ip6_dst_store(sk, dst,
2331 		      ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
2332 		      &sk->sk_v6_daddr : NULL,
2333 #ifdef CONFIG_IPV6_SUBTREES
2334 		      ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
2335 		      &np->saddr :
2336 #endif
2337 		      NULL);
2338 }
2339 
2340 /* Handle redirects */
2341 struct ip6rd_flowi {
2342 	struct flowi6 fl6;
2343 	struct in6_addr gateway;
2344 };
2345 
2346 static struct rt6_info *__ip6_route_redirect(struct net *net,
2347 					     struct fib6_table *table,
2348 					     struct flowi6 *fl6,
2349 					     const struct sk_buff *skb,
2350 					     int flags)
2351 {
2352 	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2353 	struct rt6_info *ret = NULL, *rt_cache;
2354 	struct fib6_info *rt;
2355 	struct fib6_node *fn;
2356 
2357 	/* Get the "current" route for this destination and
2358 	 * check if the redirect has come from the appropriate router.
2359 	 *
2360 	 * RFC 4861 specifies that redirects should only be
2361 	 * accepted if they come from the nexthop to the target.
2362 	 * Due to the way the routes are chosen, this notion
2363 	 * is a bit fuzzy and one might need to check all possible
2364 	 * routes.
2365 	 */
2366 
2367 	rcu_read_lock();
2368 	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2369 restart:
2370 	for_each_fib6_node_rt_rcu(fn) {
2371 		if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
2372 			continue;
2373 		if (fib6_check_expired(rt))
2374 			continue;
2375 		if (rt->fib6_flags & RTF_REJECT)
2376 			break;
2377 		if (!(rt->fib6_flags & RTF_GATEWAY))
2378 			continue;
2379 		if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
2380 			continue;
2381 		/* rt_cache's gateway might be different from its 'parent'
2382 		 * in the case of an IP redirect.
2383 		 * So we keep searching in the exception table if the gateway
2384 		 * is different.
2385 		 */
2386 		if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) {
2387 			rt_cache = rt6_find_cached_rt(rt,
2388 						      &fl6->daddr,
2389 						      &fl6->saddr);
2390 			if (rt_cache &&
2391 			    ipv6_addr_equal(&rdfl->gateway,
2392 					    &rt_cache->rt6i_gateway)) {
2393 				ret = rt_cache;
2394 				break;
2395 			}
2396 			continue;
2397 		}
2398 		break;
2399 	}
2400 
2401 	if (!rt)
2402 		rt = net->ipv6.fib6_null_entry;
2403 	else if (rt->fib6_flags & RTF_REJECT) {
2404 		ret = net->ipv6.ip6_null_entry;
2405 		goto out;
2406 	}
2407 
2408 	if (rt == net->ipv6.fib6_null_entry) {
2409 		fn = fib6_backtrack(fn, &fl6->saddr);
2410 		if (fn)
2411 			goto restart;
2412 	}
2413 
2414 out:
2415 	if (ret)
2416 		dst_hold(&ret->dst);
2417 	else
2418 		ret = ip6_create_rt_rcu(rt);
2419 
2420 	rcu_read_unlock();
2421 
2422 	trace_fib6_table_lookup(net, ret, table, fl6);
2423 	return ret;
2424 }
2425 
2426 static struct dst_entry *ip6_route_redirect(struct net *net,
2427 					    const struct flowi6 *fl6,
2428 					    const struct sk_buff *skb,
2429 					    const struct in6_addr *gateway)
2430 {
2431 	int flags = RT6_LOOKUP_F_HAS_SADDR;
2432 	struct ip6rd_flowi rdfl;
2433 
2434 	rdfl.fl6 = *fl6;
2435 	rdfl.gateway = *gateway;
2436 
2437 	return fib6_rule_lookup(net, &rdfl.fl6, skb,
2438 				flags, __ip6_route_redirect);
2439 }
2440 
2441 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2442 		  kuid_t uid)
2443 {
2444 	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2445 	struct dst_entry *dst;
2446 	struct flowi6 fl6;
2447 
2448 	memset(&fl6, 0, sizeof(fl6));
2449 	fl6.flowi6_iif = LOOPBACK_IFINDEX;
2450 	fl6.flowi6_oif = oif;
2451 	fl6.flowi6_mark = mark;
2452 	fl6.daddr = iph->daddr;
2453 	fl6.saddr = iph->saddr;
2454 	fl6.flowlabel = ip6_flowinfo(iph);
2455 	fl6.flowi6_uid = uid;
2456 
2457 	dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2458 	rt6_do_redirect(dst, NULL, skb);
2459 	dst_release(dst);
2460 }
2461 EXPORT_SYMBOL_GPL(ip6_redirect);
2462 
2463 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2464 			    u32 mark)
2465 {
2466 	const struct ipv6hdr *iph = ipv6_hdr(skb);
2467 	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2468 	struct dst_entry *dst;
2469 	struct flowi6 fl6;
2470 
2471 	memset(&fl6, 0, sizeof(fl6));
2472 	fl6.flowi6_iif = LOOPBACK_IFINDEX;
2473 	fl6.flowi6_oif = oif;
2474 	fl6.flowi6_mark = mark;
2475 	fl6.daddr = msg->dest;
2476 	fl6.saddr = iph->daddr;
2477 	fl6.flowi6_uid = sock_net_uid(net, NULL);
2478 
2479 	dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2480 	rt6_do_redirect(dst, NULL, skb);
2481 	dst_release(dst);
2482 }
2483 
2484 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2485 {
2486 	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2487 		     sk->sk_uid);
2488 }
2489 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2490 
2491 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2492 {
2493 	struct net_device *dev = dst->dev;
2494 	unsigned int mtu = dst_mtu(dst);
2495 	struct net *net = dev_net(dev);
2496 
2497 	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2498 
2499 	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2500 		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2501 
2502 	/*
2503 	 * The maximal non-jumbo IPv6 payload is IPV6_MAXPLEN, and the
2504 	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2505 	 * IPV6_MAXPLEN is also valid and means: "any MSS,
2506 	 * rely only on PMTU discovery".
2507 	 */
2508 	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2509 		mtu = IPV6_MAXPLEN;
2510 	return mtu;
2511 }
2512 
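/* Worked example: for a standard Ethernet dst_mtu() of 1500, advmss is
 * 1500 - 40 (ipv6hdr) - 20 (tcphdr) = 1440, unless ip6_rt_min_advmss
 * raises it or the IPV6_MAXPLEN cap applies (only for jumbo-sized MTUs).
 */
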
2513 static unsigned int ip6_mtu(const struct dst_entry *dst)
2514 {
2515 	struct inet6_dev *idev;
2516 	unsigned int mtu;
2517 
2518 	mtu = dst_metric_raw(dst, RTAX_MTU);
2519 	if (mtu)
2520 		goto out;
2521 
2522 	mtu = IPV6_MIN_MTU;
2523 
2524 	rcu_read_lock();
2525 	idev = __in6_dev_get(dst->dev);
2526 	if (idev)
2527 		mtu = idev->cnf.mtu6;
2528 	rcu_read_unlock();
2529 
2530 out:
2531 	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2532 
2533 	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2534 }
2535 
2536 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2537 				  struct flowi6 *fl6)
2538 {
2539 	struct dst_entry *dst;
2540 	struct rt6_info *rt;
2541 	struct inet6_dev *idev = in6_dev_get(dev);
2542 	struct net *net = dev_net(dev);
2543 
2544 	if (unlikely(!idev))
2545 		return ERR_PTR(-ENODEV);
2546 
2547 	rt = ip6_dst_alloc(net, dev, 0);
2548 	if (unlikely(!rt)) {
2549 		in6_dev_put(idev);
2550 		dst = ERR_PTR(-ENOMEM);
2551 		goto out;
2552 	}
2553 
2554 	rt->dst.flags |= DST_HOST;
2555 	rt->dst.input = ip6_input;
2556 	rt->dst.output  = ip6_output;
2557 	rt->rt6i_gateway  = fl6->daddr;
2558 	rt->rt6i_dst.addr = fl6->daddr;
2559 	rt->rt6i_dst.plen = 128;
2560 	rt->rt6i_idev     = idev;
2561 	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2562 
2563 	/* Add this dst into uncached_list so that rt6_disable_ip() can
2564 	 * properly release the net_device.
2565 	 */
2566 	rt6_uncached_list_add(rt);
2567 	atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2568 
2569 	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2570 
2571 out:
2572 	return dst;
2573 }
2574 
2575 static int ip6_dst_gc(struct dst_ops *ops)
2576 {
2577 	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2578 	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2579 	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2580 	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2581 	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2582 	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2583 	int entries;
2584 
2585 	entries = dst_entries_get_fast(ops);
2586 	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2587 	    entries <= rt_max_size)
2588 		goto out;
2589 
2590 	net->ipv6.ip6_rt_gc_expire++;
2591 	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2592 	entries = dst_entries_get_slow(ops);
2593 	if (entries < ops->gc_thresh)
2594 		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2595 out:
2596 	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2597 	return entries > rt_max_size;
2598 }
2599 
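/* ip6_rt_gc_expire adapts GC aggressiveness: it grows by one on every
 * forced pass, decays geometrically at the end (expire -= expire >>
 * elasticity; with the default gc_elasticity of 9 each call keeps
 * 511/512 of it), and snaps back to half of gc_timeout once the entry
 * count falls below gc_thresh.
 */
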
2600 static int ip6_convert_metrics(struct net *net, struct fib6_info *rt,
2601 			       struct fib6_config *cfg)
2602 {
2603 	int err = 0;
2604 
2605 	if (cfg->fc_mx) {
2606 		rt->fib6_metrics = kzalloc(sizeof(*rt->fib6_metrics),
2607 					   GFP_KERNEL);
2608 		if (unlikely(!rt->fib6_metrics))
2609 			return -ENOMEM;
2610 
2611 		refcount_set(&rt->fib6_metrics->refcnt, 1);
2612 
2613 		err = ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len,
2614 					 rt->fib6_metrics->metrics);
2615 	}
2616 
2617 	return err;
2618 }
2619 
2620 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2621 					    struct fib6_config *cfg,
2622 					    const struct in6_addr *gw_addr,
2623 					    u32 tbid, int flags)
2624 {
2625 	struct flowi6 fl6 = {
2626 		.flowi6_oif = cfg->fc_ifindex,
2627 		.daddr = *gw_addr,
2628 		.saddr = cfg->fc_prefsrc,
2629 	};
2630 	struct fib6_table *table;
2631 	struct rt6_info *rt;
2632 
2633 	table = fib6_get_table(net, tbid);
2634 	if (!table)
2635 		return NULL;
2636 
2637 	if (!ipv6_addr_any(&cfg->fc_prefsrc))
2638 		flags |= RT6_LOOKUP_F_HAS_SADDR;
2639 
2640 	flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2641 	rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2642 
2643 	/* if table lookup failed, fall back to full lookup */
2644 	if (rt == net->ipv6.ip6_null_entry) {
2645 		ip6_rt_put(rt);
2646 		rt = NULL;
2647 	}
2648 
2649 	return rt;
2650 }
2651 
2652 static int ip6_route_check_nh_onlink(struct net *net,
2653 				     struct fib6_config *cfg,
2654 				     const struct net_device *dev,
2655 				     struct netlink_ext_ack *extack)
2656 {
2657 	u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2658 	const struct in6_addr *gw_addr = &cfg->fc_gateway;
2659 	u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2660 	struct rt6_info *grt;
2661 	int err;
2662 
2663 	err = 0;
2664 	grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2665 	if (grt) {
2666 		if (!grt->dst.error &&
2667 		    (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2668 			NL_SET_ERR_MSG(extack,
2669 				       "Nexthop has invalid gateway or device mismatch");
2670 			err = -EINVAL;
2671 		}
2672 
2673 		ip6_rt_put(grt);
2674 	}
2675 
2676 	return err;
2677 }
2678 
2679 static int ip6_route_check_nh(struct net *net,
2680 			      struct fib6_config *cfg,
2681 			      struct net_device **_dev,
2682 			      struct inet6_dev **idev)
2683 {
2684 	const struct in6_addr *gw_addr = &cfg->fc_gateway;
2685 	struct net_device *dev = _dev ? *_dev : NULL;
2686 	struct rt6_info *grt = NULL;
2687 	int err = -EHOSTUNREACH;
2688 
2689 	if (cfg->fc_table) {
2690 		int flags = RT6_LOOKUP_F_IFACE;
2691 
2692 		grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2693 					  cfg->fc_table, flags);
2694 		if (grt) {
2695 			if (grt->rt6i_flags & RTF_GATEWAY ||
2696 			    (dev && dev != grt->dst.dev)) {
2697 				ip6_rt_put(grt);
2698 				grt = NULL;
2699 			}
2700 		}
2701 	}
2702 
2703 	if (!grt)
2704 		grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
2705 
2706 	if (!grt)
2707 		goto out;
2708 
2709 	if (dev) {
2710 		if (dev != grt->dst.dev) {
2711 			ip6_rt_put(grt);
2712 			goto out;
2713 		}
2714 	} else {
2715 		*_dev = dev = grt->dst.dev;
2716 		*idev = grt->rt6i_idev;
2717 		dev_hold(dev);
2718 		in6_dev_hold(grt->rt6i_idev);
2719 	}
2720 
2721 	if (!(grt->rt6i_flags & RTF_GATEWAY))
2722 		err = 0;
2723 
2724 	ip6_rt_put(grt);
2725 
2726 out:
2727 	return err;
2728 }
2729 
2730 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
2731 			   struct net_device **_dev, struct inet6_dev **idev,
2732 			   struct netlink_ext_ack *extack)
2733 {
2734 	const struct in6_addr *gw_addr = &cfg->fc_gateway;
2735 	int gwa_type = ipv6_addr_type(gw_addr);
2736 	bool skip_dev = !(gwa_type & IPV6_ADDR_LINKLOCAL);
2737 	const struct net_device *dev = *_dev;
2738 	bool need_addr_check = !dev;
2739 	int err = -EINVAL;
2740 
2741 	/* if gw_addr is local we will fail to detect this in case the
2742 	 * address is still TENTATIVE (DAD in progress). rt6_lookup()
2743 	 * will return the already-added prefix route via the interface
2744 	 * the prefix route was assigned to, which might be non-loopback.
2745 	 */
2746 	if (dev &&
2747 	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2748 		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2749 		goto out;
2750 	}
2751 
2752 		/* IPv6 strictly inhibits using non-link-local
2753 		 * addresses as nexthop addresses.
2754 		 * Otherwise, a router will not be able to send redirects.
2755 		 * It is very good, but in some (rare!) circumstances
2756 		 * (SIT, PtP, NBMA NOARP links) it is handy to allow
2757 		 * some exceptions. --ANK
2758 		 * We allow IPv4-mapped nexthops to support RFC 4798-style
2759 		 * addressing.
2760 		 * addressing
2761 		 */
2762 		if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
2763 			NL_SET_ERR_MSG(extack, "Invalid gateway address");
2764 			goto out;
2765 		}
2766 
2767 		if (cfg->fc_flags & RTNH_F_ONLINK)
2768 			err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
2769 		else
2770 			err = ip6_route_check_nh(net, cfg, _dev, idev);
2771 
2772 		if (err)
2773 			goto out;
2774 	}
2775 
2776 	/* reload in case device was changed */
2777 	dev = *_dev;
2778 
2779 	err = -EINVAL;
2780 	if (!dev) {
2781 		NL_SET_ERR_MSG(extack, "Egress device not specified");
2782 		goto out;
2783 	} else if (dev->flags & IFF_LOOPBACK) {
2784 		NL_SET_ERR_MSG(extack,
2785 			       "Egress device can not be loopback device for this route");
2786 		goto out;
2787 	}
2788 
2789 	/* if we did not check gw_addr above, do so now that the
2790 	 * egress device has been resolved.
2791 	 */
2792 	if (need_addr_check &&
2793 	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2794 		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2795 		goto out;
2796 	}
2797 
2798 	err = 0;
2799 out:
2800 	return err;
2801 }
2802 
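/* Example (iproute2): a non-link-local gateway normally has to resolve
 * through the FIB via ip6_route_check_nh() above, but the "onlink"
 * flag asserts that the gateway is reachable on the link, so only the
 * ip6_route_check_nh_onlink() sanity checks run instead:
 *
 *	ip -6 route add 2001:db8:1::/64 via 2001:db8:ffff::1 dev eth0 onlink
 */
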
2803 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
2804 					      gfp_t gfp_flags,
2805 					      struct netlink_ext_ack *extack)
2806 {
2807 	struct net *net = cfg->fc_nlinfo.nl_net;
2808 	struct fib6_info *rt = NULL;
2809 	struct net_device *dev = NULL;
2810 	struct inet6_dev *idev = NULL;
2811 	struct fib6_table *table;
2812 	int addr_type;
2813 	int err = -EINVAL;
2814 
2815 	/* RTF_PCPU is an internal flag; can not be set by userspace */
2816 	if (cfg->fc_flags & RTF_PCPU) {
2817 		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2818 		goto out;
2819 	}
2820 
2821 	/* RTF_CACHE is an internal flag; can not be set by userspace */
2822 	if (cfg->fc_flags & RTF_CACHE) {
2823 		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
2824 		goto out;
2825 	}
2826 
2827 	if (cfg->fc_type > RTN_MAX) {
2828 		NL_SET_ERR_MSG(extack, "Invalid route type");
2829 		goto out;
2830 	}
2831 
2832 	if (cfg->fc_dst_len > 128) {
2833 		NL_SET_ERR_MSG(extack, "Invalid prefix length");
2834 		goto out;
2835 	}
2836 	if (cfg->fc_src_len > 128) {
2837 		NL_SET_ERR_MSG(extack, "Invalid source address length");
2838 		goto out;
2839 	}
2840 #ifndef CONFIG_IPV6_SUBTREES
2841 	if (cfg->fc_src_len) {
2842 		NL_SET_ERR_MSG(extack,
2843 			       "Specifying source address requires IPV6_SUBTREES to be enabled");
2844 		goto out;
2845 	}
2846 #endif
2847 	if (cfg->fc_ifindex) {
2848 		err = -ENODEV;
2849 		dev = dev_get_by_index(net, cfg->fc_ifindex);
2850 		if (!dev)
2851 			goto out;
2852 		idev = in6_dev_get(dev);
2853 		if (!idev)
2854 			goto out;
2855 	}
2856 
2857 	if (cfg->fc_metric == 0)
2858 		cfg->fc_metric = IP6_RT_PRIO_USER;
2859 
2860 	if (cfg->fc_flags & RTNH_F_ONLINK) {
2861 		if (!dev) {
2862 			NL_SET_ERR_MSG(extack,
2863 				       "Nexthop device required for onlink");
2864 			err = -ENODEV;
2865 			goto out;
2866 		}
2867 
2868 		if (!(dev->flags & IFF_UP)) {
2869 			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
2870 			err = -ENETDOWN;
2871 			goto out;
2872 		}
2873 	}
2874 
2875 	err = -ENOBUFS;
2876 	if (cfg->fc_nlinfo.nlh &&
2877 	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
2878 		table = fib6_get_table(net, cfg->fc_table);
2879 		if (!table) {
2880 			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
2881 			table = fib6_new_table(net, cfg->fc_table);
2882 		}
2883 	} else {
2884 		table = fib6_new_table(net, cfg->fc_table);
2885 	}
2886 
2887 	if (!table)
2888 		goto out;
2889 
2890 	err = -ENOMEM;
2891 	rt = fib6_info_alloc(gfp_flags);
2892 	if (!rt)
2893 		goto out;
2894 
2895 	if (cfg->fc_flags & RTF_ADDRCONF)
2896 		rt->dst_nocount = true;
2897 
2898 	err = ip6_convert_metrics(net, rt, cfg);
2899 	if (err < 0)
2900 		goto out;
2901 
2902 	if (cfg->fc_flags & RTF_EXPIRES)
2903 		fib6_set_expires(rt, jiffies +
2904 				clock_t_to_jiffies(cfg->fc_expires));
2905 	else
2906 		fib6_clean_expires(rt);
2907 
2908 	if (cfg->fc_protocol == RTPROT_UNSPEC)
2909 		cfg->fc_protocol = RTPROT_BOOT;
2910 	rt->fib6_protocol = cfg->fc_protocol;
2911 
2912 	addr_type = ipv6_addr_type(&cfg->fc_dst);
2913 
2914 	if (cfg->fc_encap) {
2915 		struct lwtunnel_state *lwtstate;
2916 
2917 		err = lwtunnel_build_state(cfg->fc_encap_type,
2918 					   cfg->fc_encap, AF_INET6, cfg,
2919 					   &lwtstate, extack);
2920 		if (err)
2921 			goto out;
2922 		rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
2923 	}
2924 
2925 	ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
2926 	rt->fib6_dst.plen = cfg->fc_dst_len;
2927 	if (rt->fib6_dst.plen == 128)
2928 		rt->dst_host = true;
2929 
2930 #ifdef CONFIG_IPV6_SUBTREES
2931 	ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
2932 	rt->fib6_src.plen = cfg->fc_src_len;
2933 #endif
2934 
2935 	rt->fib6_metric = cfg->fc_metric;
2936 	rt->fib6_nh.nh_weight = 1;
2937 
2938 	rt->fib6_type = cfg->fc_type;
2939 
2940 	/* We cannot add true routes via loopback here;
2941 	 * they would result in kernel looping. Promote them to reject routes.
2942 	 */
2943 	if ((cfg->fc_flags & RTF_REJECT) ||
2944 	    (dev && (dev->flags & IFF_LOOPBACK) &&
2945 	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
2946 	     !(cfg->fc_flags & RTF_LOCAL))) {
2947 		/* hold loopback dev/idev if we haven't done so. */
2948 		if (dev != net->loopback_dev) {
2949 			if (dev) {
2950 				dev_put(dev);
2951 				in6_dev_put(idev);
2952 			}
2953 			dev = net->loopback_dev;
2954 			dev_hold(dev);
2955 			idev = in6_dev_get(dev);
2956 			if (!idev) {
2957 				err = -ENODEV;
2958 				goto out;
2959 			}
2960 		}
2961 		rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP;
2962 		goto install_route;
2963 	}
2964 
2965 	if (cfg->fc_flags & RTF_GATEWAY) {
2966 		err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
2967 		if (err)
2968 			goto out;
2969 
2970 		rt->fib6_nh.nh_gw = cfg->fc_gateway;
2971 	}
2972 
2973 	err = -ENODEV;
2974 	if (!dev)
2975 		goto out;
2976 
2977 	if (idev->cnf.disable_ipv6) {
2978 		NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
2979 		err = -EACCES;
2980 		goto out;
2981 	}
2982 
2983 	if (!(dev->flags & IFF_UP)) {
2984 		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
2985 		err = -ENETDOWN;
2986 		goto out;
2987 	}
2988 
2989 	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
2990 		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
2991 			NL_SET_ERR_MSG(extack, "Invalid source address");
2992 			err = -EINVAL;
2993 			goto out;
2994 		}
2995 		rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
2996 		rt->fib6_prefsrc.plen = 128;
2997 	} else
2998 		rt->fib6_prefsrc.plen = 0;
2999 
3000 	rt->fib6_flags = cfg->fc_flags;
3001 
3002 install_route:
3003 	if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
3004 	    !netif_carrier_ok(dev))
3005 		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
3006 	rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
3007 	rt->fib6_nh.nh_dev = dev;
3008 	rt->fib6_idev = idev;
3009 	rt->fib6_table = table;
3010 
3011 	cfg->fc_nlinfo.nl_net = dev_net(dev);
3012 
3013 	return rt;
3014 out:
3015 	if (dev)
3016 		dev_put(dev);
3017 	if (idev)
3018 		in6_dev_put(idev);
3019 
3020 	fib6_info_release(rt);
3021 	return ERR_PTR(err);
3022 }
3023 
3024 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3025 		  struct netlink_ext_ack *extack)
3026 {
3027 	struct fib6_info *rt;
3028 	int err;
3029 
3030 	rt = ip6_route_info_create(cfg, gfp_flags, extack);
3031 	if (IS_ERR(rt))
3032 		return PTR_ERR(rt);
3033 
3034 	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3035 	fib6_info_release(rt);
3036 
3037 	return err;
3038 }
3039 
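/* Minimal in-kernel sketch (illustrative only; not a caller in this
 * file) of inserting a plain device route, much like addrconf does:
 *
 *	struct fib6_config cfg = {
 *		.fc_table	= RT6_TABLE_MAIN,
 *		.fc_metric	= IP6_RT_PRIO_USER,
 *		.fc_ifindex	= dev->ifindex,
 *		.fc_dst		= prefix,
 *		.fc_dst_len	= 64,
 *		.fc_flags	= RTF_UP,
 *		.fc_nlinfo.nl_net = net,
 *	};
 *	err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
 *
 * ip6_route_info_create() validates and fills in defaults, and
 * __ip6_ins_rt() links the resulting fib6_info into the tree.
 */
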
3040 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3041 {
3042 	struct net *net = info->nl_net;
3043 	struct fib6_table *table;
3044 	int err;
3045 
3046 	if (rt == net->ipv6.fib6_null_entry) {
3047 		err = -ENOENT;
3048 		goto out;
3049 	}
3050 
3051 	table = rt->fib6_table;
3052 	spin_lock_bh(&table->tb6_lock);
3053 	err = fib6_del(rt, info);
3054 	spin_unlock_bh(&table->tb6_lock);
3055 
3056 out:
3057 	fib6_info_release(rt);
3058 	return err;
3059 }
3060 
3061 int ip6_del_rt(struct net *net, struct fib6_info *rt)
3062 {
3063 	struct nl_info info = { .nl_net = net };
3064 
3065 	return __ip6_del_rt(rt, &info);
3066 }
3067 
3068 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
3069 {
3070 	struct nl_info *info = &cfg->fc_nlinfo;
3071 	struct net *net = info->nl_net;
3072 	struct sk_buff *skb = NULL;
3073 	struct fib6_table *table;
3074 	int err = -ENOENT;
3075 
3076 	if (rt == net->ipv6.fib6_null_entry)
3077 		goto out_put;
3078 	table = rt->fib6_table;
3079 	spin_lock_bh(&table->tb6_lock);
3080 
3081 	if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
3082 		struct fib6_info *sibling, *next_sibling;
3083 
3084 		/* prefer to send a single notification with all hops */
3085 		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3086 		if (skb) {
3087 			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3088 
3089 			if (rt6_fill_node(net, skb, rt, NULL,
3090 					  NULL, NULL, 0, RTM_DELROUTE,
3091 					  info->portid, seq, 0) < 0) {
3092 				kfree_skb(skb);
3093 				skb = NULL;
3094 			} else
3095 				info->skip_notify = 1;
3096 		}
3097 
3098 		list_for_each_entry_safe(sibling, next_sibling,
3099 					 &rt->fib6_siblings,
3100 					 fib6_siblings) {
3101 			err = fib6_del(sibling, info);
3102 			if (err)
3103 				goto out_unlock;
3104 		}
3105 	}
3106 
3107 	err = fib6_del(rt, info);
3108 out_unlock:
3109 	spin_unlock_bh(&table->tb6_lock);
3110 out_put:
3111 	fib6_info_release(rt);
3112 
3113 	if (skb) {
3114 		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3115 			    info->nlh, gfp_any());
3116 	}
3117 	return err;
3118 }
3119 
3120 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3121 {
3122 	int rc = -ESRCH;
3123 
3124 	if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3125 		goto out;
3126 
3127 	if (cfg->fc_flags & RTF_GATEWAY &&
3128 	    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3129 		goto out;
3130 	if (dst_hold_safe(&rt->dst))
3131 		rc = rt6_remove_exception_rt(rt);
3132 out:
3133 	return rc;
3134 }
3135 
3136 static int ip6_route_del(struct fib6_config *cfg,
3137 			 struct netlink_ext_ack *extack)
3138 {
3139 	struct rt6_info *rt_cache;
3140 	struct fib6_table *table;
3141 	struct fib6_info *rt;
3142 	struct fib6_node *fn;
3143 	int err = -ESRCH;
3144 
3145 	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3146 	if (!table) {
3147 		NL_SET_ERR_MSG(extack, "FIB table does not exist");
3148 		return err;
3149 	}
3150 
3151 	rcu_read_lock();
3152 
3153 	fn = fib6_locate(&table->tb6_root,
3154 			 &cfg->fc_dst, cfg->fc_dst_len,
3155 			 &cfg->fc_src, cfg->fc_src_len,
3156 			 !(cfg->fc_flags & RTF_CACHE));
3157 
3158 	if (fn) {
3159 		for_each_fib6_node_rt_rcu(fn) {
3160 			if (cfg->fc_flags & RTF_CACHE) {
3161 				int rc;
3162 
3163 				rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
3164 							      &cfg->fc_src);
3165 				if (rt_cache) {
3166 					rc = ip6_del_cached_rt(rt_cache, cfg);
3167 					if (rc != -ESRCH)
3168 						return rc;
3169 				}
3170 				continue;
3171 			}
3172 			if (cfg->fc_ifindex &&
3173 			    (!rt->fib6_nh.nh_dev ||
3174 			     rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex))
3175 				continue;
3176 			if (cfg->fc_flags & RTF_GATEWAY &&
3177 			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
3178 				continue;
3179 			if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
3180 				continue;
3181 			if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
3182 				continue;
3183 			fib6_info_hold(rt);
3184 			rcu_read_unlock();
3185 
3186 			/* if a gateway was specified, only delete the one hop */
3187 			if (cfg->fc_flags & RTF_GATEWAY)
3188 				return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3189 
3190 			return __ip6_del_rt_siblings(rt, cfg);
3191 		}
3192 	}
3193 	rcu_read_unlock();
3194 
3195 	return err;
3196 }
3197 
3198 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3199 {
3200 	struct netevent_redirect netevent;
3201 	struct rt6_info *rt, *nrt = NULL;
3202 	struct ndisc_options ndopts;
3203 	struct inet6_dev *in6_dev;
3204 	struct neighbour *neigh;
3205 	struct rd_msg *msg;
3206 	int optlen, on_link;
3207 	u8 *lladdr;
3208 
3209 	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3210 	optlen -= sizeof(*msg);
3211 
3212 	if (optlen < 0) {
3213 		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3214 		return;
3215 	}
3216 
3217 	msg = (struct rd_msg *)icmp6_hdr(skb);
3218 
3219 	if (ipv6_addr_is_multicast(&msg->dest)) {
3220 		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3221 		return;
3222 	}
3223 
3224 	on_link = 0;
3225 	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3226 		on_link = 1;
3227 	} else if (ipv6_addr_type(&msg->target) !=
3228 		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3229 		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3230 		return;
3231 	}
3232 
3233 	in6_dev = __in6_dev_get(skb->dev);
3234 	if (!in6_dev)
3235 		return;
3236 	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3237 		return;
3238 
3239 	/* RFC2461 8.1:
3240 	 *	The IP source address of the Redirect MUST be the same as the current
3241 	 *	first-hop router for the specified ICMP Destination Address.
3242 	 */
3243 
3244 	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3245 		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3246 		return;
3247 	}
3248 
3249 	lladdr = NULL;
3250 	if (ndopts.nd_opts_tgt_lladdr) {
3251 		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3252 					     skb->dev);
3253 		if (!lladdr) {
3254 			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3255 			return;
3256 		}
3257 	}
3258 
3259 	rt = (struct rt6_info *) dst;
3260 	if (rt->rt6i_flags & RTF_REJECT) {
3261 		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3262 		return;
3263 	}
3264 
3265 	/* Redirect received -> path was valid.
3266 	 * Look, redirects are sent only in response to data packets,
3267 	 * so this nexthop is apparently reachable. --ANK
3268 	 */
3269 	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3270 
3271 	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3272 	if (!neigh)
3273 		return;
3274 
3275 	/*
3276 	 *	We have finally decided to accept it.
3277 	 */
3278 
3279 	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3280 		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
3281 		     NEIGH_UPDATE_F_OVERRIDE|
3282 		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3283 				     NEIGH_UPDATE_F_ISROUTER)),
3284 		     NDISC_REDIRECT, &ndopts);
3285 
3286 	nrt = ip6_rt_cache_alloc(rt->from, &msg->dest, NULL);
3287 	if (!nrt)
3288 		goto out;
3289 
3290 	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3291 	if (on_link)
3292 		nrt->rt6i_flags &= ~RTF_GATEWAY;
3293 
3294 	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3295 
3296 	/* No need to remove rt from the exception table if rt is
3297 	 * a cached route because rt6_insert_exception() will
3298 	 * take care of it.
3299 	 */
3300 	if (rt6_insert_exception(nrt, rt->from)) {
3301 		dst_release_immediate(&nrt->dst);
3302 		goto out;
3303 	}
3304 
3305 	netevent.old = &rt->dst;
3306 	netevent.new = &nrt->dst;
3307 	netevent.daddr = &msg->dest;
3308 	netevent.neigh = neigh;
3309 	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3310 
3311 out:
3312 	neigh_release(neigh);
3313 }
3314 
3315 #ifdef CONFIG_IPV6_ROUTE_INFO
3316 static struct fib6_info *rt6_get_route_info(struct net *net,
3317 					   const struct in6_addr *prefix, int prefixlen,
3318 					   const struct in6_addr *gwaddr,
3319 					   struct net_device *dev)
3320 {
3321 	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3322 	int ifindex = dev->ifindex;
3323 	struct fib6_node *fn;
3324 	struct fib6_info *rt = NULL;
3325 	struct fib6_table *table;
3326 
3327 	table = fib6_get_table(net, tb_id);
3328 	if (!table)
3329 		return NULL;
3330 
3331 	rcu_read_lock();
3332 	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3333 	if (!fn)
3334 		goto out;
3335 
3336 	for_each_fib6_node_rt_rcu(fn) {
3337 		if (rt->fib6_nh.nh_dev->ifindex != ifindex)
3338 			continue;
3339 		if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3340 			continue;
3341 		if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
3342 			continue;
3343 		fib6_info_hold(rt);
3344 		break;
3345 	}
3346 out:
3347 	rcu_read_unlock();
3348 	return rt;
3349 }
3350 
3351 static struct fib6_info *rt6_add_route_info(struct net *net,
3352 					   const struct in6_addr *prefix, int prefixlen,
3353 					   const struct in6_addr *gwaddr,
3354 					   struct net_device *dev,
3355 					   unsigned int pref)
3356 {
3357 	struct fib6_config cfg = {
3358 		.fc_metric	= IP6_RT_PRIO_USER,
3359 		.fc_ifindex	= dev->ifindex,
3360 		.fc_dst_len	= prefixlen,
3361 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3362 				  RTF_UP | RTF_PREF(pref),
3363 		.fc_protocol = RTPROT_RA,
3364 		.fc_type = RTN_UNICAST,
3365 		.fc_nlinfo.portid = 0,
3366 		.fc_nlinfo.nlh = NULL,
3367 		.fc_nlinfo.nl_net = net,
3368 	};
3369 
3370 	cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3371 	cfg.fc_dst = *prefix;
3372 	cfg.fc_gateway = *gwaddr;
3373 
3374 	/* We should treat it as a default route if prefix length is 0. */
3375 	if (!prefixlen)
3376 		cfg.fc_flags |= RTF_DEFAULT;
3377 
3378 	ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3379 
3380 	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3381 }
3382 #endif
3383 
3384 struct fib6_info *rt6_get_dflt_router(struct net *net,
3385 				     const struct in6_addr *addr,
3386 				     struct net_device *dev)
3387 {
3388 	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3389 	struct fib6_info *rt;
3390 	struct fib6_table *table;
3391 
3392 	table = fib6_get_table(net, tb_id);
3393 	if (!table)
3394 		return NULL;
3395 
3396 	rcu_read_lock();
3397 	for_each_fib6_node_rt_rcu(&table->tb6_root) {
3398 		if (dev == rt->fib6_nh.nh_dev &&
3399 		    ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3400 		    ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
3401 			break;
3402 	}
3403 	if (rt)
3404 		fib6_info_hold(rt);
3405 	rcu_read_unlock();
3406 	return rt;
3407 }
3408 
3409 struct fib6_info *rt6_add_dflt_router(struct net *net,
3410 				     const struct in6_addr *gwaddr,
3411 				     struct net_device *dev,
3412 				     unsigned int pref)
3413 {
3414 	struct fib6_config cfg = {
3415 		.fc_table	= l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3416 		.fc_metric	= IP6_RT_PRIO_USER,
3417 		.fc_ifindex	= dev->ifindex,
3418 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3419 				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3420 		.fc_protocol = RTPROT_RA,
3421 		.fc_type = RTN_UNICAST,
3422 		.fc_nlinfo.portid = 0,
3423 		.fc_nlinfo.nlh = NULL,
3424 		.fc_nlinfo.nl_net = net,
3425 	};
3426 
3427 	cfg.fc_gateway = *gwaddr;
3428 
3429 	if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3430 		struct fib6_table *table;
3431 
3432 		table = fib6_get_table(dev_net(dev), cfg.fc_table);
3433 		if (table)
3434 			table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3435 	}
3436 
3437 	return rt6_get_dflt_router(net, gwaddr, dev);
3438 }
3439 
3440 static void __rt6_purge_dflt_routers(struct net *net,
3441 				     struct fib6_table *table)
3442 {
3443 	struct fib6_info *rt;
3444 
3445 restart:
3446 	rcu_read_lock();
3447 	for_each_fib6_node_rt_rcu(&table->tb6_root) {
3448 		if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3449 		    (!rt->fib6_idev || rt->fib6_idev->cnf.accept_ra != 2)) {
3450 			fib6_info_hold(rt);
3451 			rcu_read_unlock();
3452 			ip6_del_rt(net, rt);
3453 			goto restart;
3454 		}
3455 	}
3456 	rcu_read_unlock();
3457 
3458 	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3459 }
3460 
3461 void rt6_purge_dflt_routers(struct net *net)
3462 {
3463 	struct fib6_table *table;
3464 	struct hlist_head *head;
3465 	unsigned int h;
3466 
3467 	rcu_read_lock();
3468 
3469 	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3470 		head = &net->ipv6.fib_table_hash[h];
3471 		hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3472 			if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3473 				__rt6_purge_dflt_routers(net, table);
3474 		}
3475 	}
3476 
3477 	rcu_read_unlock();
3478 }
3479 
3480 static void rtmsg_to_fib6_config(struct net *net,
3481 				 struct in6_rtmsg *rtmsg,
3482 				 struct fib6_config *cfg)
3483 {
3484 	memset(cfg, 0, sizeof(*cfg));
3485 
3486 	cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3487 			 : RT6_TABLE_MAIN;
3488 	cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3489 	cfg->fc_metric = rtmsg->rtmsg_metric;
3490 	cfg->fc_expires = rtmsg->rtmsg_info;
3491 	cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3492 	cfg->fc_src_len = rtmsg->rtmsg_src_len;
3493 	cfg->fc_flags = rtmsg->rtmsg_flags;
3494 	cfg->fc_type = rtmsg->rtmsg_type;
3495 
3496 	cfg->fc_nlinfo.nl_net = net;
3497 
3498 	cfg->fc_dst = rtmsg->rtmsg_dst;
3499 	cfg->fc_src = rtmsg->rtmsg_src;
3500 	cfg->fc_gateway = rtmsg->rtmsg_gateway;
3501 }
3502 
3503 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3504 {
3505 	struct fib6_config cfg;
3506 	struct in6_rtmsg rtmsg;
3507 	int err;
3508 
3509 	switch (cmd) {
3510 	case SIOCADDRT:		/* Add a route */
3511 	case SIOCDELRT:		/* Delete a route */
3512 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3513 			return -EPERM;
3514 		err = copy_from_user(&rtmsg, arg,
3515 				     sizeof(struct in6_rtmsg));
3516 		if (err)
3517 			return -EFAULT;
3518 
3519 		rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3520 
3521 		rtnl_lock();
3522 		switch (cmd) {
3523 		case SIOCADDRT:
3524 			err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3525 			break;
3526 		case SIOCDELRT:
3527 			err = ip6_route_del(&cfg, NULL);
3528 			break;
3529 		default:
3530 			err = -EINVAL;
3531 		}
3532 		rtnl_unlock();
3533 
3534 		return err;
3535 	}
3536 
3537 	return -EINVAL;
3538 }
3539 
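/* Userspace sketch (hypothetical) of the legacy ioctl path handled
 * above, as used by the old route(8) tool:
 *
 *	struct in6_rtmsg rt = {
 *		.rtmsg_dst	= prefix,
 *		.rtmsg_dst_len	= 64,
 *		.rtmsg_metric	= 1,
 *		.rtmsg_flags	= RTF_UP,
 *		.rtmsg_ifindex	= if_nametoindex("eth0"),
 *	};
 *	int fd = socket(AF_INET6, SOCK_DGRAM, 0);
 *	ioctl(fd, SIOCADDRT, &rt);
 *
 * New code should prefer RTM_NEWROUTE over rtnetlink.
 */
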
3540 /*
3541  *	Drop the packet on the floor
3542  */
3543 
3544 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3545 {
3546 	int type;
3547 	struct dst_entry *dst = skb_dst(skb);
3548 	switch (ipstats_mib_noroutes) {
3549 	case IPSTATS_MIB_INNOROUTES:
3550 		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3551 		if (type == IPV6_ADDR_ANY) {
3552 			IP6_INC_STATS(dev_net(dst->dev),
3553 				      __in6_dev_get_safely(skb->dev),
3554 				      IPSTATS_MIB_INADDRERRORS);
3555 			break;
3556 		}
3557 		/* FALLTHROUGH */
3558 	case IPSTATS_MIB_OUTNOROUTES:
3559 		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3560 			      ipstats_mib_noroutes);
3561 		break;
3562 	}
3563 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3564 	kfree_skb(skb);
3565 	return 0;
3566 }
3567 
3568 static int ip6_pkt_discard(struct sk_buff *skb)
3569 {
3570 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3571 }
3572 
3573 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3574 {
3575 	skb->dev = skb_dst(skb)->dev;
3576 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3577 }
3578 
3579 static int ip6_pkt_prohibit(struct sk_buff *skb)
3580 {
3581 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3582 }
3583 
3584 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3585 {
3586 	skb->dev = skb_dst(skb)->dev;
3587 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3588 }
3589 
3590 /*
3591  *	Allocate a dst for local (unicast / anycast) address.
3592  */
3593 
3594 struct fib6_info *addrconf_dst_alloc(struct net *net,
3595 				    struct inet6_dev *idev,
3596 				    const struct in6_addr *addr,
3597 				    bool anycast, gfp_t gfp_flags)
3598 {
3599 	u32 tb_id;
3600 	struct net_device *dev = idev->dev;
3601 	struct fib6_info *rt;
3602 
3603 	rt = fib6_info_alloc(gfp_flags);
3604 	if (!rt)
3605 		return ERR_PTR(-ENOMEM);
3606 
3607 	rt->dst_nocount = true;
3608 
3609 	in6_dev_hold(idev);
3610 	rt->fib6_idev = idev;
3611 
3612 	rt->dst_host = true;
3613 	rt->fib6_protocol = RTPROT_KERNEL;
3614 	rt->fib6_flags = RTF_UP | RTF_NONEXTHOP;
3615 	if (anycast) {
3616 		rt->fib6_type = RTN_ANYCAST;
3617 		rt->fib6_flags |= RTF_ANYCAST;
3618 	} else {
3619 		rt->fib6_type = RTN_LOCAL;
3620 		rt->fib6_flags |= RTF_LOCAL;
3621 	}
3622 
3623 	rt->fib6_nh.nh_gw = *addr;
3624 	dev_hold(dev);
3625 	rt->fib6_nh.nh_dev = dev;
3626 	rt->fib6_dst.addr = *addr;
3627 	rt->fib6_dst.plen = 128;
3628 	tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3629 	rt->fib6_table = fib6_get_table(net, tb_id);
3630 
3631 	return rt;
3632 }
3633 
3634 /* remove a deleted IP from prefsrc entries */
3635 struct arg_dev_net_ip {
3636 	struct net_device *dev;
3637 	struct net *net;
3638 	struct in6_addr *addr;
3639 };
3640 
3641 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3642 {
3643 	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3644 	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3645 	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3646 
3647 	if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
3648 	    rt != net->ipv6.fib6_null_entry &&
3649 	    ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3650 		spin_lock_bh(&rt6_exception_lock);
3651 		/* remove prefsrc entry */
3652 		rt->fib6_prefsrc.plen = 0;
3653 		/* need to update cache as well */
3654 		rt6_exceptions_remove_prefsrc(rt);
3655 		spin_unlock_bh(&rt6_exception_lock);
3656 	}
3657 	return 0;
3658 }
3659 
3660 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3661 {
3662 	struct net *net = dev_net(ifp->idev->dev);
3663 	struct arg_dev_net_ip adni = {
3664 		.dev = ifp->idev->dev,
3665 		.net = net,
3666 		.addr = &ifp->addr,
3667 	};
3668 	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3669 }
3670 
3671 #define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3672 
3673 /* Remove routers and update dst entries when a gateway turns into a host. */
3674 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3675 {
3676 	struct in6_addr *gateway = (struct in6_addr *)arg;
3677 
3678 	if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3679 	    ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
3680 		return -1;
3681 	}
3682 
3683 	/* Further clean up cached routes in exception table.
3684 	 * This is needed because a cached route may have a different
3685 	 * gateway than its 'parent' in the case of an IP redirect.
3686 	 */
3687 	rt6_exceptions_clean_tohost(rt, gateway);
3688 
3689 	return 0;
3690 }
3691 
3692 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3693 {
3694 	fib6_clean_all(net, fib6_clean_tohost, gateway);
3695 }
3696 
3697 struct arg_netdev_event {
3698 	const struct net_device *dev;
3699 	union {
3700 		unsigned int nh_flags;
3701 		unsigned long event;
3702 	};
3703 };
3704 
3705 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
3706 {
3707 	struct fib6_info *iter;
3708 	struct fib6_node *fn;
3709 
3710 	fn = rcu_dereference_protected(rt->fib6_node,
3711 			lockdep_is_held(&rt->fib6_table->tb6_lock));
3712 	iter = rcu_dereference_protected(fn->leaf,
3713 			lockdep_is_held(&rt->fib6_table->tb6_lock));
3714 	while (iter) {
3715 		if (iter->fib6_metric == rt->fib6_metric &&
3716 		    rt6_qualify_for_ecmp(iter))
3717 			return iter;
3718 		iter = rcu_dereference_protected(iter->rt6_next,
3719 				lockdep_is_held(&rt->fib6_table->tb6_lock));
3720 	}
3721 
3722 	return NULL;
3723 }
3724 
3725 static bool rt6_is_dead(const struct fib6_info *rt)
3726 {
3727 	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
3728 	    (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
3729 	     rt->fib6_idev->cnf.ignore_routes_with_linkdown))
3730 		return true;
3731 
3732 	return false;
3733 }
3734 
3735 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3736 {
3737 	struct fib6_info *iter;
3738 	int total = 0;
3739 
3740 	if (!rt6_is_dead(rt))
3741 		total += rt->fib6_nh.nh_weight;
3742 
3743 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3744 		if (!rt6_is_dead(iter))
3745 			total += iter->fib6_nh.nh_weight;
3746 	}
3747 
3748 	return total;
3749 }
3750 
3751 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
3752 {
3753 	int upper_bound = -1;
3754 
3755 	if (!rt6_is_dead(rt)) {
3756 		*weight += rt->fib6_nh.nh_weight;
3757 		upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3758 						    total) - 1;
3759 	}
3760 	atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound);
3761 }
3762 
3763 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3764 {
3765 	struct fib6_info *iter;
3766 	int weight = 0;
3767 
3768 	rt6_upper_bound_set(rt, &weight, total);
3769 
3770 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3771 		rt6_upper_bound_set(iter, &weight, total);
3772 }
3773 
3774 void rt6_multipath_rebalance(struct fib6_info *rt)
3775 {
3776 	struct fib6_info *first;
3777 	int total;
3778 
3779 	/* In case the entire multipath route was marked for flushing,
3780 	 * then there is no need to rebalance upon the removal of every
3781 	 * sibling route.
3782 	 */
3783 	if (!rt->fib6_nsiblings || rt->should_flush)
3784 		return;
3785 
3786 	/* During lookup routes are evaluated in order, so we need to
3787 	 * make sure upper bounds are assigned from the first sibling
3788 	 * onwards.
3789 	 */
3790 	first = rt6_multipath_first_sibling(rt);
3791 	if (WARN_ON_ONCE(!first))
3792 		return;
3793 
3794 	total = rt6_multipath_total_weight(first);
3795 	rt6_multipath_upper_bound_set(first, total);
3796 }
3797 
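/* Worked example of the hash-threshold math in rt6_upper_bound_set():
 * with two live nexthops of weights 1 and 2, total = 3 and the
 * cumulative upper bounds are round(1 * 2^31 / 3) - 1 and
 * round(3 * 2^31 / 3) - 1 = 2^31 - 1.  A flow whose 31-bit mp_hash is
 * <= the first bound takes the first nexthop, giving the intended 1:2
 * split; a dead nexthop keeps bound -1 and is never selected.
 */
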
3798 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
3799 {
3800 	const struct arg_netdev_event *arg = p_arg;
3801 	struct net *net = dev_net(arg->dev);
3802 
3803 	if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
3804 		rt->fib6_nh.nh_flags &= ~arg->nh_flags;
3805 		fib6_update_sernum_upto_root(net, rt);
3806 		rt6_multipath_rebalance(rt);
3807 	}
3808 
3809 	return 0;
3810 }
3811 
3812 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
3813 {
3814 	struct arg_netdev_event arg = {
3815 		.dev = dev,
3816 		{
3817 			.nh_flags = nh_flags,
3818 		},
3819 	};
3820 
3821 	if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
3822 		arg.nh_flags |= RTNH_F_LINKDOWN;
3823 
3824 	fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
3825 }
3826 
3827 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
3828 				   const struct net_device *dev)
3829 {
3830 	struct fib6_info *iter;
3831 
3832 	if (rt->fib6_nh.nh_dev == dev)
3833 		return true;
3834 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3835 		if (iter->fib6_nh.nh_dev == dev)
3836 			return true;
3837 
3838 	return false;
3839 }
3840 
3841 static void rt6_multipath_flush(struct fib6_info *rt)
3842 {
3843 	struct fib6_info *iter;
3844 
3845 	rt->should_flush = 1;
3846 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3847 		iter->should_flush = 1;
3848 }
3849 
3850 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
3851 					     const struct net_device *down_dev)
3852 {
3853 	struct fib6_info *iter;
3854 	unsigned int dead = 0;
3855 
3856 	if (rt->fib6_nh.nh_dev == down_dev ||
3857 	    rt->fib6_nh.nh_flags & RTNH_F_DEAD)
3858 		dead++;
3859 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3860 		if (iter->fib6_nh.nh_dev == down_dev ||
3861 		    iter->fib6_nh.nh_flags & RTNH_F_DEAD)
3862 			dead++;
3863 
3864 	return dead;
3865 }
3866 
3867 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
3868 				       const struct net_device *dev,
3869 				       unsigned int nh_flags)
3870 {
3871 	struct fib6_info *iter;
3872 
3873 	if (rt->fib6_nh.nh_dev == dev)
3874 		rt->fib6_nh.nh_flags |= nh_flags;
3875 	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3876 		if (iter->fib6_nh.nh_dev == dev)
3877 			iter->fib6_nh.nh_flags |= nh_flags;
3878 }
3879 
3880 /* called with write lock held for table with rt */
3881 static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
3882 {
3883 	const struct arg_netdev_event *arg = p_arg;
3884 	const struct net_device *dev = arg->dev;
3885 	struct net *net = dev_net(dev);
3886 
3887 	if (rt == net->ipv6.fib6_null_entry)
3888 		return 0;
3889 
3890 	switch (arg->event) {
3891 	case NETDEV_UNREGISTER:
3892 		return rt->fib6_nh.nh_dev == dev ? -1 : 0;
3893 	case NETDEV_DOWN:
3894 		if (rt->should_flush)
3895 			return -1;
3896 		if (!rt->fib6_nsiblings)
3897 			return rt->fib6_nh.nh_dev == dev ? -1 : 0;
3898 		if (rt6_multipath_uses_dev(rt, dev)) {
3899 			unsigned int count;
3900 
3901 			count = rt6_multipath_dead_count(rt, dev);
3902 			if (rt->fib6_nsiblings + 1 == count) {
3903 				rt6_multipath_flush(rt);
3904 				return -1;
3905 			}
3906 			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
3907 						   RTNH_F_LINKDOWN);
3908 			fib6_update_sernum(net, rt);
3909 			rt6_multipath_rebalance(rt);
3910 		}
3911 		return -2;
3912 	case NETDEV_CHANGE:
3913 		if (rt->fib6_nh.nh_dev != dev ||
3914 		    rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
3915 			break;
3916 		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
3917 		rt6_multipath_rebalance(rt);
3918 		break;
3919 	}
3920 
3921 	return 0;
3922 }
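
/*
 * Editor's note on the return contract above, as consumed by the fib6
 * walker (see fib6_clean_node() in ip6_fib.c):
 *	 0  - keep this route
 *	-1  - delete this route from the table
 *	-2  - multipath route already handled in place; the walker may
 *	      skip the remaining siblings
 */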
3923 
3924 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
3925 {
3926 	struct arg_netdev_event arg = {
3927 		.dev = dev,
3928 		{
3929 			.event = event,
3930 		},
3931 	};
3932 
3933 	fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
3934 }
3935 
3936 void rt6_disable_ip(struct net_device *dev, unsigned long event)
3937 {
3938 	rt6_sync_down_dev(dev, event);
3939 	rt6_uncached_list_flush_dev(dev_net(dev), dev);
3940 	neigh_ifdown(&nd_tbl, dev);
3941 }
3942 
3943 struct rt6_mtu_change_arg {
3944 	struct net_device *dev;
3945 	unsigned int mtu;
3946 };
3947 
3948 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
3949 {
3950 	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
3951 	struct inet6_dev *idev;
3952 
3953 	/* In IPv6, PMTU discovery is not optional,
3954 	   so the RTAX_MTU lock cannot disable it.
3955 	   We still use this lock to block changes
3956 	   caused by addrconf/ndisc.
3957 	*/
3958 
3959 	idev = __in6_dev_get(arg->dev);
3960 	if (!idev)
3961 		return 0;
3962 
3963 	/* An administrative MTU increase cannot be discovered by
3964 	   IPv6 PMTU discovery, so the PMTU must be updated here.
3965 	   Since RFC 1981 doesn't cover administrative MTU increases,
3966 	   updating the PMTU on increase is a MUST (e.g. jumbo frames).
3967 	 */
3968 	if (rt->fib6_nh.nh_dev == arg->dev &&
3969 	    !fib6_metric_locked(rt, RTAX_MTU)) {
3970 		u32 mtu = rt->fib6_pmtu;
3971 
3972 		if (mtu >= arg->mtu ||
3973 		    (mtu < arg->mtu && mtu == idev->cnf.mtu6))
3974 			fib6_metric_set(rt, RTAX_MTU, arg->mtu);
3975 
3976 		spin_lock_bh(&rt6_exception_lock);
3977 		rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
3978 		spin_unlock_bh(&rt6_exception_lock);
3979 	}
3980 	return 0;
3981 }
3982 
3983 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
3984 {
3985 	struct rt6_mtu_change_arg arg = {
3986 		.dev = dev,
3987 		.mtu = mtu,
3988 	};
3989 
3990 	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
3991 }
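
/*
 * Example (editor's sketch): the usual trigger is a netdev notifier,
 * e.g. in addrconf (illustrative placement):
 *
 *	case NETDEV_CHANGEMTU:
 *		rt6_mtu_change(dev, dev->mtu);
 *		break;
 */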
3992 
3993 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
3994 	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
3995 	[RTA_OIF]               = { .type = NLA_U32 },
3996 	[RTA_IIF]		= { .type = NLA_U32 },
3997 	[RTA_PRIORITY]          = { .type = NLA_U32 },
3998 	[RTA_METRICS]           = { .type = NLA_NESTED },
3999 	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
4000 	[RTA_PREF]              = { .type = NLA_U8 },
4001 	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
4002 	[RTA_ENCAP]		= { .type = NLA_NESTED },
4003 	[RTA_EXPIRES]		= { .type = NLA_U32 },
4004 	[RTA_UID]		= { .type = NLA_U32 },
4005 	[RTA_MARK]		= { .type = NLA_U32 },
4006 };
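
/*
 * Editor's note: attributes absent from this policy (e.g. RTA_DST,
 * RTA_SRC, RTA_PREFSRC, RTA_TABLE) are still accepted by nlmsg_parse()
 * below without type/length validation; rtm_to_fib6_config()
 * length-checks RTA_DST and RTA_SRC by hand against the prefix lengths
 * in the rtmsg header.
 */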
4007 
4008 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
4009 			      struct fib6_config *cfg,
4010 			      struct netlink_ext_ack *extack)
4011 {
4012 	struct rtmsg *rtm;
4013 	struct nlattr *tb[RTA_MAX+1];
4014 	unsigned int pref;
4015 	int err;
4016 
4017 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4018 			  NULL);
4019 	if (err < 0)
4020 		goto errout;
4021 
4022 	err = -EINVAL;
4023 	rtm = nlmsg_data(nlh);
4024 	memset(cfg, 0, sizeof(*cfg));
4025 
4026 	cfg->fc_table = rtm->rtm_table;
4027 	cfg->fc_dst_len = rtm->rtm_dst_len;
4028 	cfg->fc_src_len = rtm->rtm_src_len;
4029 	cfg->fc_flags = RTF_UP;
4030 	cfg->fc_protocol = rtm->rtm_protocol;
4031 	cfg->fc_type = rtm->rtm_type;
4032 
4033 	if (rtm->rtm_type == RTN_UNREACHABLE ||
4034 	    rtm->rtm_type == RTN_BLACKHOLE ||
4035 	    rtm->rtm_type == RTN_PROHIBIT ||
4036 	    rtm->rtm_type == RTN_THROW)
4037 		cfg->fc_flags |= RTF_REJECT;
4038 
4039 	if (rtm->rtm_type == RTN_LOCAL)
4040 		cfg->fc_flags |= RTF_LOCAL;
4041 
4042 	if (rtm->rtm_flags & RTM_F_CLONED)
4043 		cfg->fc_flags |= RTF_CACHE;
4044 
4045 	cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4046 
4047 	cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
4048 	cfg->fc_nlinfo.nlh = nlh;
4049 	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
4050 
4051 	if (tb[RTA_GATEWAY]) {
4052 		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4053 		cfg->fc_flags |= RTF_GATEWAY;
4054 	}
4055 
4056 	if (tb[RTA_DST]) {
4057 		int plen = (rtm->rtm_dst_len + 7) >> 3;
4058 
4059 		if (nla_len(tb[RTA_DST]) < plen)
4060 			goto errout;
4061 
4062 		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
4063 	}
4064 
4065 	if (tb[RTA_SRC]) {
4066 		int plen = (rtm->rtm_src_len + 7) >> 3;
4067 
4068 		if (nla_len(tb[RTA_SRC]) < plen)
4069 			goto errout;
4070 
4071 		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
4072 	}
4073 
4074 	if (tb[RTA_PREFSRC])
4075 		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4076 
4077 	if (tb[RTA_OIF])
4078 		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4079 
4080 	if (tb[RTA_PRIORITY])
4081 		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4082 
4083 	if (tb[RTA_METRICS]) {
4084 		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4085 		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
4086 	}
4087 
4088 	if (tb[RTA_TABLE])
4089 		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4090 
4091 	if (tb[RTA_MULTIPATH]) {
4092 		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4093 		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4094 
4095 		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4096 						     cfg->fc_mp_len, extack);
4097 		if (err < 0)
4098 			goto errout;
4099 	}
4100 
4101 	if (tb[RTA_PREF]) {
4102 		pref = nla_get_u8(tb[RTA_PREF]);
4103 		if (pref != ICMPV6_ROUTER_PREF_LOW &&
4104 		    pref != ICMPV6_ROUTER_PREF_HIGH)
4105 			pref = ICMPV6_ROUTER_PREF_MEDIUM;
4106 		cfg->fc_flags |= RTF_PREF(pref);
4107 	}
4108 
4109 	if (tb[RTA_ENCAP])
4110 		cfg->fc_encap = tb[RTA_ENCAP];
4111 
4112 	if (tb[RTA_ENCAP_TYPE]) {
4113 		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4114 
4115 		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4116 		if (err < 0)
4117 			goto errout;
4118 	}
4119 
4120 	if (tb[RTA_EXPIRES]) {
4121 		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4122 
4123 		if (addrconf_finite_timeout(timeout)) {
4124 			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4125 			cfg->fc_flags |= RTF_EXPIRES;
4126 		}
4127 	}
4128 
4129 	err = 0;
4130 errout:
4131 	return err;
4132 }
4133 
4134 struct rt6_nh {
4135 	struct fib6_info *fib6_info;
4136 	struct fib6_config r_cfg;
4137 	struct list_head next;
4138 };
4139 
4140 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
4141 {
4142 	struct rt6_nh *nh;
4143 
4144 	list_for_each_entry(nh, rt6_nh_list, next) {
4145 		pr_warn("multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
4146 		        &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
4147 		        nh->r_cfg.fc_ifindex);
4148 	}
4149 }
4150 
4151 static int ip6_route_info_append(struct net *net,
4152 				 struct list_head *rt6_nh_list,
4153 				 struct fib6_info *rt,
4154 				 struct fib6_config *r_cfg)
4155 {
4156 	struct rt6_nh *nh;
4157 	int err = -EEXIST;
4158 
4159 	list_for_each_entry(nh, rt6_nh_list, next) {
4160 		/* check if fib6_info already exists */
4161 		if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4162 			return err;
4163 	}
4164 
4165 	nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4166 	if (!nh)
4167 		return -ENOMEM;
4168 	nh->fib6_info = rt;
4169 	err = ip6_convert_metrics(net, rt, r_cfg);
4170 	if (err) {
4171 		kfree(nh);
4172 		return err;
4173 	}
4174 	memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4175 	list_add_tail(&nh->next, rt6_nh_list);
4176 
4177 	return 0;
4178 }
4179 
4180 static void ip6_route_mpath_notify(struct fib6_info *rt,
4181 				   struct fib6_info *rt_last,
4182 				   struct nl_info *info,
4183 				   __u16 nlflags)
4184 {
4185 	/* If this is an APPEND route, then rt points to the first route
4186 	 * inserted and rt_last points to the last route inserted. Userspace
4187 	 * wants a consistent dump of the route which starts at the first
4188 	 * nexthop. Since sibling routes are always added at the end of
4189 	 * the list, find the first sibling of the last route appended.
4190 	 */
4191 	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4192 		rt = list_first_entry(&rt_last->fib6_siblings,
4193 				      struct fib6_info,
4194 				      fib6_siblings);
4195 	}
4196 
4197 	if (rt)
4198 		inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4199 }
4200 
4201 static int ip6_route_multipath_add(struct fib6_config *cfg,
4202 				   struct netlink_ext_ack *extack)
4203 {
4204 	struct fib6_info *rt_notif = NULL, *rt_last = NULL;
4205 	struct nl_info *info = &cfg->fc_nlinfo;
4206 	struct fib6_config r_cfg;
4207 	struct rtnexthop *rtnh;
4208 	struct fib6_info *rt;
4209 	struct rt6_nh *err_nh;
4210 	struct rt6_nh *nh, *nh_safe;
4211 	__u16 nlflags;
4212 	int remaining;
4213 	int attrlen;
4214 	int err = 1;
4215 	int nhn = 0;
4216 	int replace = (cfg->fc_nlinfo.nlh &&
4217 		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4218 	LIST_HEAD(rt6_nh_list);
4219 
4220 	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4221 	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4222 		nlflags |= NLM_F_APPEND;
4223 
4224 	remaining = cfg->fc_mp_len;
4225 	rtnh = (struct rtnexthop *)cfg->fc_mp;
4226 
4227 	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
4228 	 * fib6_info structs per nexthop
4229 	 */
4230 	while (rtnh_ok(rtnh, remaining)) {
4231 		memcpy(&r_cfg, cfg, sizeof(*cfg));
4232 		if (rtnh->rtnh_ifindex)
4233 			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4234 
4235 		attrlen = rtnh_attrlen(rtnh);
4236 		if (attrlen > 0) {
4237 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4238 
4239 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4240 			if (nla) {
4241 				r_cfg.fc_gateway = nla_get_in6_addr(nla);
4242 				r_cfg.fc_flags |= RTF_GATEWAY;
4243 			}
4244 			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4245 			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4246 			if (nla)
4247 				r_cfg.fc_encap_type = nla_get_u16(nla);
4248 		}
4249 
4250 		r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4251 		rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
4252 		if (IS_ERR(rt)) {
4253 			err = PTR_ERR(rt);
4254 			rt = NULL;
4255 			goto cleanup;
4256 		}
4257 
4258 		rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;
4259 
4260 		err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
4261 					    rt, &r_cfg);
4262 		if (err) {
4263 			fib6_info_release(rt);
4264 			goto cleanup;
4265 		}
4266 
4267 		rtnh = rtnh_next(rtnh, &remaining);
4268 	}
4269 
4270 	/* for add and replace send one notification with all nexthops.
4271 	 * Skip the notification in fib6_add_rt2node and send one with
4272 	 * the full route when done
4273 	 */
4274 	info->skip_notify = 1;
4275 
4276 	err_nh = NULL;
4277 	list_for_each_entry(nh, &rt6_nh_list, next) {
4278 		rt_last = nh->fib6_info;
4279 		err = __ip6_ins_rt(nh->fib6_info, info, extack);
4280 		fib6_info_release(nh->fib6_info);
4281 
4282 		/* save reference to first route for notification */
4283 		if (!rt_notif && !err)
4284 			rt_notif = nh->fib6_info;
4285 
4286 		/* nh->fib6_info is used or freed at this point, reset to NULL */
4287 		nh->fib6_info = NULL;
4288 		if (err) {
4289 			if (replace && nhn)
4290 				ip6_print_replace_route_err(&rt6_nh_list);
4291 			err_nh = nh;
4292 			goto add_errout;
4293 		}
4294 
4295 		/* Because each route is added like a single route, we remove
4296 		 * these flags after the first nexthop: if there is a collision,
4297 		 * we have already failed to add the first nexthop:
4298 		 * fib6_add_rt2node() has rejected it; when replacing, the old
4299 		 * nexthops have been replaced by the first new one, and the
4300 		 * rest should be added to it.
4301 		 */
4302 		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4303 						     NLM_F_REPLACE);
4304 		nhn++;
4305 	}
4306 
4307 	/* success ... tell user about new route */
4308 	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4309 	goto cleanup;
4310 
4311 add_errout:
4312 	/* send notification for routes that were added so that
4313 	 * the delete notifications sent by ip6_route_del are
4314 	 * coherent
4315 	 */
4316 	if (rt_notif)
4317 		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4318 
4319 	/* Delete routes that were already added */
4320 	list_for_each_entry(nh, &rt6_nh_list, next) {
4321 		if (err_nh == nh)
4322 			break;
4323 		ip6_route_del(&nh->r_cfg, extack);
4324 	}
4325 
4326 cleanup:
4327 	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4328 		if (nh->fib6_info)
4329 			fib6_info_release(nh->fib6_info);
4330 		list_del(&nh->next);
4331 		kfree(nh);
4332 	}
4333 
4334 	return err;
4335 }
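
/*
 * Editor's note: on the wire rtnh_hops carries weight - 1, hence the
 * "+ 1" when nh_weight is filled in above; iproute2's "weight N" maps
 * to rtnh_hops = N - 1.
 */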
4336 
4337 static int ip6_route_multipath_del(struct fib6_config *cfg,
4338 				   struct netlink_ext_ack *extack)
4339 {
4340 	struct fib6_config r_cfg;
4341 	struct rtnexthop *rtnh;
4342 	int remaining;
4343 	int attrlen;
4344 	int err = 1, last_err = 0;
4345 
4346 	remaining = cfg->fc_mp_len;
4347 	rtnh = (struct rtnexthop *)cfg->fc_mp;
4348 
4349 	/* Parse a Multipath Entry */
4350 	while (rtnh_ok(rtnh, remaining)) {
4351 		memcpy(&r_cfg, cfg, sizeof(*cfg));
4352 		if (rtnh->rtnh_ifindex)
4353 			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4354 
4355 		attrlen = rtnh_attrlen(rtnh);
4356 		if (attrlen > 0) {
4357 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4358 
4359 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4360 			if (nla) {
4361 				nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4362 				r_cfg.fc_flags |= RTF_GATEWAY;
4363 			}
4364 		}
4365 		err = ip6_route_del(&r_cfg, extack);
4366 		if (err)
4367 			last_err = err;
4368 
4369 		rtnh = rtnh_next(rtnh, &remaining);
4370 	}
4371 
4372 	return last_err;
4373 }
4374 
4375 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4376 			      struct netlink_ext_ack *extack)
4377 {
4378 	struct fib6_config cfg;
4379 	int err;
4380 
4381 	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4382 	if (err < 0)
4383 		return err;
4384 
4385 	if (cfg.fc_mp)
4386 		return ip6_route_multipath_del(&cfg, extack);
4387 	else {
4388 		cfg.fc_delete_all_nh = 1;
4389 		return ip6_route_del(&cfg, extack);
4390 	}
4391 }
4392 
4393 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4394 			      struct netlink_ext_ack *extack)
4395 {
4396 	struct fib6_config cfg;
4397 	int err;
4398 
4399 	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4400 	if (err < 0)
4401 		return err;
4402 
4403 	if (cfg.fc_mp)
4404 		return ip6_route_multipath_add(&cfg, extack);
4405 	else
4406 		return ip6_route_add(&cfg, GFP_KERNEL, extack);
4407 }
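
/*
 * Example (editor's sketch): how userspace reaches these handlers via
 * iproute2 (shell, illustrative addresses):
 *
 *	# single nexthop -> ip6_route_add()
 *	ip -6 route add 2001:db8::/64 via fe80::1 dev eth0
 *
 *	# RTA_MULTIPATH -> ip6_route_multipath_add()
 *	ip -6 route add 2001:db8::/64 \
 *		nexthop via fe80::1 dev eth0 weight 1 \
 *		nexthop via fe80::2 dev eth1 weight 3
 */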
4408 
4409 static size_t rt6_nlmsg_size(struct fib6_info *rt)
4410 {
4411 	int nexthop_len = 0;
4412 
4413 	if (rt->fib6_nsiblings) {
4414 		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
4415 			    + NLA_ALIGN(sizeof(struct rtnexthop))
4416 			    + nla_total_size(16) /* RTA_GATEWAY */
4417 			    + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);
4418 
4419 		nexthop_len *= rt->fib6_nsiblings;
4420 	}
4421 
4422 	return NLMSG_ALIGN(sizeof(struct rtmsg))
4423 	       + nla_total_size(16) /* RTA_SRC */
4424 	       + nla_total_size(16) /* RTA_DST */
4425 	       + nla_total_size(16) /* RTA_GATEWAY */
4426 	       + nla_total_size(16) /* RTA_PREFSRC */
4427 	       + nla_total_size(4) /* RTA_TABLE */
4428 	       + nla_total_size(4) /* RTA_IIF */
4429 	       + nla_total_size(4) /* RTA_OIF */
4430 	       + nla_total_size(4) /* RTA_PRIORITY */
4431 	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4432 	       + nla_total_size(sizeof(struct rta_cacheinfo))
4433 	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4434 	       + nla_total_size(1) /* RTA_PREF */
4435 	       + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
4436 	       + nexthop_len;
4437 }
4438 
4439 static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
4440 			    unsigned int *flags, bool skip_oif)
4441 {
4442 	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
4443 		*flags |= RTNH_F_DEAD;
4444 
4445 	if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
4446 		*flags |= RTNH_F_LINKDOWN;
4447 		if (rt->fib6_idev->cnf.ignore_routes_with_linkdown)
4448 			*flags |= RTNH_F_DEAD;
4449 	}
4450 
4451 	if (rt->fib6_flags & RTF_GATEWAY) {
4452 		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
4453 			goto nla_put_failure;
4454 	}
4455 
4456 	*flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
4457 	if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
4458 		*flags |= RTNH_F_OFFLOAD;
4459 
4460 	/* not needed for multipath encoding because it has a rtnexthop struct */
4461 	if (!skip_oif && rt->fib6_nh.nh_dev &&
4462 	    nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
4463 		goto nla_put_failure;
4464 
4465 	if (rt->fib6_nh.nh_lwtstate &&
4466 	    lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
4467 		goto nla_put_failure;
4468 
4469 	return 0;
4470 
4471 nla_put_failure:
4472 	return -EMSGSIZE;
4473 }
4474 
4475 /* add multipath next hop */
4476 static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
4477 {
4478 	const struct net_device *dev = rt->fib6_nh.nh_dev;
4479 	struct rtnexthop *rtnh;
4480 	unsigned int flags = 0;
4481 
4482 	rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4483 	if (!rtnh)
4484 		goto nla_put_failure;
4485 
4486 	rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
4487 	rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;
4488 
4489 	if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4490 		goto nla_put_failure;
4491 
4492 	rtnh->rtnh_flags = flags;
4493 
4494 	/* length of rtnetlink header + attributes */
4495 	rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
4496 
4497 	return 0;
4498 
4499 nla_put_failure:
4500 	return -EMSGSIZE;
4501 }
4502 
4503 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
4504 			 struct fib6_info *rt, struct dst_entry *dst,
4505 			 struct in6_addr *dest, struct in6_addr *src,
4506 			 int iif, int type, u32 portid, u32 seq,
4507 			 unsigned int flags)
4508 {
4509 	struct rtmsg *rtm;
4510 	struct nlmsghdr *nlh;
4511 	long expires = 0;
4512 	u32 *pmetrics;
4513 	u32 table;
4514 
4515 	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4516 	if (!nlh)
4517 		return -EMSGSIZE;
4518 
4519 	rtm = nlmsg_data(nlh);
4520 	rtm->rtm_family = AF_INET6;
4521 	rtm->rtm_dst_len = rt->fib6_dst.plen;
4522 	rtm->rtm_src_len = rt->fib6_src.plen;
4523 	rtm->rtm_tos = 0;
4524 	if (rt->fib6_table)
4525 		table = rt->fib6_table->tb6_id;
4526 	else
4527 		table = RT6_TABLE_UNSPEC;
4528 	rtm->rtm_table = table;
4529 	if (nla_put_u32(skb, RTA_TABLE, table))
4530 		goto nla_put_failure;
4531 
4532 	rtm->rtm_type = rt->fib6_type;
4533 	rtm->rtm_flags = 0;
4534 	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4535 	rtm->rtm_protocol = rt->fib6_protocol;
4536 
4537 	if (rt->fib6_flags & RTF_CACHE)
4538 		rtm->rtm_flags |= RTM_F_CLONED;
4539 
4540 	if (dest) {
4541 		if (nla_put_in6_addr(skb, RTA_DST, dest))
4542 			goto nla_put_failure;
4543 		rtm->rtm_dst_len = 128;
4544 	} else if (rtm->rtm_dst_len)
4545 		if (nla_put_in6_addr(skb, RTA_DST, &rt->fib6_dst.addr))
4546 			goto nla_put_failure;
4547 #ifdef CONFIG_IPV6_SUBTREES
4548 	if (src) {
4549 		if (nla_put_in6_addr(skb, RTA_SRC, src))
4550 			goto nla_put_failure;
4551 		rtm->rtm_src_len = 128;
4552 	} else if (rtm->rtm_src_len &&
4553 		   nla_put_in6_addr(skb, RTA_SRC, &rt->fib6_src.addr))
4554 		goto nla_put_failure;
4555 #endif
4556 	if (iif) {
4557 #ifdef CONFIG_IPV6_MROUTE
4558 		if (ipv6_addr_is_multicast(&rt->fib6_dst.addr)) {
4559 			int err = ip6mr_get_route(net, skb, rtm, portid);
4560 
4561 			if (err == 0)
4562 				return 0;
4563 			if (err < 0)
4564 				goto nla_put_failure;
4565 		} else
4566 #endif
4567 			if (nla_put_u32(skb, RTA_IIF, iif))
4568 				goto nla_put_failure;
4569 	} else if (dest) {
4570 		struct in6_addr saddr_buf;
4571 		if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
4572 		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4573 			goto nla_put_failure;
4574 	}
4575 
4576 	if (rt->fib6_prefsrc.plen) {
4577 		struct in6_addr saddr_buf;
4578 		saddr_buf = rt->fib6_prefsrc.addr;
4579 		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4580 			goto nla_put_failure;
4581 	}
4582 
4583 	pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
4584 	if (rtnetlink_put_metrics(skb, pmetrics) < 0)
4585 		goto nla_put_failure;
4586 
4587 	if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
4588 		goto nla_put_failure;
4589 
4590 	/* For multipath routes, walk the siblings list and add
4591 	 * each as a nexthop within RTA_MULTIPATH.
4592 	 */
4593 	if (rt->fib6_nsiblings) {
4594 		struct fib6_info *sibling, *next_sibling;
4595 		struct nlattr *mp;
4596 
4597 		mp = nla_nest_start(skb, RTA_MULTIPATH);
4598 		if (!mp)
4599 			goto nla_put_failure;
4600 
4601 		if (rt6_add_nexthop(skb, rt) < 0)
4602 			goto nla_put_failure;
4603 
4604 		list_for_each_entry_safe(sibling, next_sibling,
4605 					 &rt->fib6_siblings, fib6_siblings) {
4606 			if (rt6_add_nexthop(skb, sibling) < 0)
4607 				goto nla_put_failure;
4608 		}
4609 
4610 		nla_nest_end(skb, mp);
4611 	} else {
4612 		if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
4613 			goto nla_put_failure;
4614 	}
4615 
4616 	if (rt->fib6_flags & RTF_EXPIRES) {
4617 		expires = dst ? dst->expires : rt->expires;
4618 		expires -= jiffies;
4619 	}
4620 
4621 	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
4622 		goto nla_put_failure;
4623 
4624 	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->fib6_flags)))
4625 		goto nla_put_failure;
4626 
4627 
4629 	return 0;
4630 
4631 nla_put_failure:
4632 	nlmsg_cancel(skb, nlh);
4633 	return -EMSGSIZE;
4634 }
4635 
4636 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4637 {
4638 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4639 	struct net *net = arg->net;
4640 
4641 	if (rt == net->ipv6.fib6_null_entry)
4642 		return 0;
4643 
4644 	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4645 		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4646 
4647 		/* user wants prefix routes only */
4648 		if (rtm->rtm_flags & RTM_F_PREFIX &&
4649 		    !(rt->fib6_flags & RTF_PREFIX_RT)) {
4650 			/* success since this is not a prefix route */
4651 			return 1;
4652 		}
4653 	}
4654 
4655 	return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4656 			     RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4657 			     arg->cb->nlh->nlmsg_seq, NLM_F_MULTI);
4658 }
4659 
4660 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4661 			      struct netlink_ext_ack *extack)
4662 {
4663 	struct net *net = sock_net(in_skb->sk);
4664 	struct nlattr *tb[RTA_MAX+1];
4665 	int err, iif = 0, oif = 0;
4666 	struct dst_entry *dst;
4667 	struct rt6_info *rt;
4668 	struct sk_buff *skb;
4669 	struct rtmsg *rtm;
4670 	struct flowi6 fl6;
4671 	bool fibmatch;
4672 
4673 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4674 			  extack);
4675 	if (err < 0)
4676 		goto errout;
4677 
4678 	err = -EINVAL;
4679 	memset(&fl6, 0, sizeof(fl6));
4680 	rtm = nlmsg_data(nlh);
4681 	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4682 	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4683 
4684 	if (tb[RTA_SRC]) {
4685 		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4686 			goto errout;
4687 
4688 		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4689 	}
4690 
4691 	if (tb[RTA_DST]) {
4692 		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4693 			goto errout;
4694 
4695 		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4696 	}
4697 
4698 	if (tb[RTA_IIF])
4699 		iif = nla_get_u32(tb[RTA_IIF]);
4700 
4701 	if (tb[RTA_OIF])
4702 		oif = nla_get_u32(tb[RTA_OIF]);
4703 
4704 	if (tb[RTA_MARK])
4705 		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4706 
4707 	if (tb[RTA_UID])
4708 		fl6.flowi6_uid = make_kuid(current_user_ns(),
4709 					   nla_get_u32(tb[RTA_UID]));
4710 	else
4711 		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4712 
4713 	if (iif) {
4714 		struct net_device *dev;
4715 		int flags = 0;
4716 
4717 		rcu_read_lock();
4718 
4719 		dev = dev_get_by_index_rcu(net, iif);
4720 		if (!dev) {
4721 			rcu_read_unlock();
4722 			err = -ENODEV;
4723 			goto errout;
4724 		}
4725 
4726 		fl6.flowi6_iif = iif;
4727 
4728 		if (!ipv6_addr_any(&fl6.saddr))
4729 			flags |= RT6_LOOKUP_F_HAS_SADDR;
4730 
4731 		dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
4732 
4733 		rcu_read_unlock();
4734 	} else {
4735 		fl6.flowi6_oif = oif;
4736 
4737 		dst = ip6_route_output(net, NULL, &fl6);
4738 	}
4739 
4741 	rt = container_of(dst, struct rt6_info, dst);
4742 	if (rt->dst.error) {
4743 		err = rt->dst.error;
4744 		ip6_rt_put(rt);
4745 		goto errout;
4746 	}
4747 
4748 	if (rt == net->ipv6.ip6_null_entry) {
4749 		err = rt->dst.error;
4750 		ip6_rt_put(rt);
4751 		goto errout;
4752 	}
4753 
4754 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
4755 	if (!skb) {
4756 		ip6_rt_put(rt);
4757 		err = -ENOBUFS;
4758 		goto errout;
4759 	}
4760 
4761 	skb_dst_set(skb, &rt->dst);
4762 	if (fibmatch)
4763 		err = rt6_fill_node(net, skb, rt->from, NULL, NULL, NULL, iif,
4764 				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4765 				    nlh->nlmsg_seq, 0);
4766 	else
4767 		err = rt6_fill_node(net, skb, rt->from, dst,
4768 				    &fl6.daddr, &fl6.saddr, iif, RTM_NEWROUTE,
4769 				    NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
4770 				    0);
4771 	if (err < 0) {
4772 		kfree_skb(skb);
4773 		goto errout;
4774 	}
4775 
4776 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
4777 errout:
4778 	return err;
4779 }
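
/*
 * Example (editor's sketch): this handler backs "ip -6 route get"
 * (shell, illustrative address):
 *
 *	ip -6 route get 2001:db8::1
 *	ip -6 route get fibmatch 2001:db8::1	# sets RTM_F_FIB_MATCH
 */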
4780 
4781 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
4782 		     unsigned int nlm_flags)
4783 {
4784 	struct sk_buff *skb;
4785 	struct net *net = info->nl_net;
4786 	u32 seq;
4787 	int err;
4788 
4789 	err = -ENOBUFS;
4790 	seq = info->nlh ? info->nlh->nlmsg_seq : 0;
4791 
4792 	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4793 	if (!skb)
4794 		goto errout;
4795 
4796 	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
4797 			    event, info->portid, seq, nlm_flags);
4798 	if (err < 0) {
4799 		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4800 		WARN_ON(err == -EMSGSIZE);
4801 		kfree_skb(skb);
4802 		goto errout;
4803 	}
4804 	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4805 		    info->nlh, gfp_any());
4806 	return;
4807 errout:
4808 	if (err < 0)
4809 		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
4810 }
4811 
4812 static int ip6_route_dev_notify(struct notifier_block *this,
4813 				unsigned long event, void *ptr)
4814 {
4815 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4816 	struct net *net = dev_net(dev);
4817 
4818 	if (!(dev->flags & IFF_LOOPBACK))
4819 		return NOTIFY_OK;
4820 
4821 	if (event == NETDEV_REGISTER) {
4822 		net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
4823 		net->ipv6.fib6_null_entry->fib6_idev = in6_dev_get(dev);
4824 		net->ipv6.ip6_null_entry->dst.dev = dev;
4825 		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
4826 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4827 		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
4828 		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
4829 		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
4830 		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
4831 #endif
4832 	 } else if (event == NETDEV_UNREGISTER &&
4833 		    dev->reg_state != NETREG_UNREGISTERED) {
4834 		/* NETDEV_UNREGISTER can be fired multiple times by
4835 		 * netdev_wait_allrefs(). Make sure we only call this once.
4836 		 */
4837 		in6_dev_put_clear(&net->ipv6.fib6_null_entry->fib6_idev);
4838 		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
4839 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4840 		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
4841 		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
4842 #endif
4843 	}
4844 
4845 	return NOTIFY_OK;
4846 }
4847 
4848 /*
4849  *	/proc
4850  */
4851 
4852 #ifdef CONFIG_PROC_FS
4853 
4854 static const struct file_operations ipv6_route_proc_fops = {
4855 	.open		= ipv6_route_open,
4856 	.read		= seq_read,
4857 	.llseek		= seq_lseek,
4858 	.release	= seq_release_net,
4859 };
4860 
4861 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
4862 {
4863 	struct net *net = (struct net *)seq->private;
4864 	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
4865 		   net->ipv6.rt6_stats->fib_nodes,
4866 		   net->ipv6.rt6_stats->fib_route_nodes,
4867 		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
4868 		   net->ipv6.rt6_stats->fib_rt_entries,
4869 		   net->ipv6.rt6_stats->fib_rt_cache,
4870 		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
4871 		   net->ipv6.rt6_stats->fib_discarded_routes);
4872 
4873 	return 0;
4874 }
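
/*
 * Editor's note: the seven hex fields of /proc/net/rt6_stats are, in
 * order: fib_nodes, fib_route_nodes, fib_rt_alloc, fib_rt_entries,
 * fib_rt_cache, allocated dst entries, fib_discarded_routes.
 */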
4875 
4876 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
4877 {
4878 	return single_open_net(inode, file, rt6_stats_seq_show);
4879 }
4880 
4881 static const struct file_operations rt6_stats_seq_fops = {
4882 	.open	 = rt6_stats_seq_open,
4883 	.read	 = seq_read,
4884 	.llseek	 = seq_lseek,
4885 	.release = single_release_net,
4886 };
4887 #endif	/* CONFIG_PROC_FS */
4888 
4889 #ifdef CONFIG_SYSCTL
4890 
4891 static
4892 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
4893 			      void __user *buffer, size_t *lenp, loff_t *ppos)
4894 {
4895 	struct net *net;
4896 	int delay;
4897 	if (!write)
4898 		return -EINVAL;
4899 
4900 	net = (struct net *)ctl->extra1;
4901 	delay = net->ipv6.sysctl.flush_delay;
4902 	proc_dointvec(ctl, write, buffer, lenp, ppos);
4903 	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
4904 	return 0;
4905 }
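
/*
 * Example (editor's sketch): this is a write-only knob; writing any
 * integer triggers a GC run (shell, illustrative):
 *
 *	echo 0 > /proc/sys/net/ipv6/route/flush
 *
 * Note the handler snapshots flush_delay before proc_dointvec() stores
 * the new value, so the GC run uses the previously configured delay.
 */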
4906 
4907 struct ctl_table ipv6_route_table_template[] = {
4908 	{
4909 		.procname	=	"flush",
4910 		.data		=	&init_net.ipv6.sysctl.flush_delay,
4911 		.maxlen		=	sizeof(int),
4912 		.mode		=	0200,
4913 		.proc_handler	=	ipv6_sysctl_rtcache_flush
4914 	},
4915 	{
4916 		.procname	=	"gc_thresh",
4917 		.data		=	&ip6_dst_ops_template.gc_thresh,
4918 		.maxlen		=	sizeof(int),
4919 		.mode		=	0644,
4920 		.proc_handler	=	proc_dointvec,
4921 	},
4922 	{
4923 		.procname	=	"max_size",
4924 		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
4925 		.maxlen		=	sizeof(int),
4926 		.mode		=	0644,
4927 		.proc_handler	=	proc_dointvec,
4928 	},
4929 	{
4930 		.procname	=	"gc_min_interval",
4931 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
4932 		.maxlen		=	sizeof(int),
4933 		.mode		=	0644,
4934 		.proc_handler	=	proc_dointvec_jiffies,
4935 	},
4936 	{
4937 		.procname	=	"gc_timeout",
4938 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
4939 		.maxlen		=	sizeof(int),
4940 		.mode		=	0644,
4941 		.proc_handler	=	proc_dointvec_jiffies,
4942 	},
4943 	{
4944 		.procname	=	"gc_interval",
4945 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
4946 		.maxlen		=	sizeof(int),
4947 		.mode		=	0644,
4948 		.proc_handler	=	proc_dointvec_jiffies,
4949 	},
4950 	{
4951 		.procname	=	"gc_elasticity",
4952 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
4953 		.maxlen		=	sizeof(int),
4954 		.mode		=	0644,
4955 		.proc_handler	=	proc_dointvec,
4956 	},
4957 	{
4958 		.procname	=	"mtu_expires",
4959 		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
4960 		.maxlen		=	sizeof(int),
4961 		.mode		=	0644,
4962 		.proc_handler	=	proc_dointvec_jiffies,
4963 	},
4964 	{
4965 		.procname	=	"min_adv_mss",
4966 		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
4967 		.maxlen		=	sizeof(int),
4968 		.mode		=	0644,
4969 		.proc_handler	=	proc_dointvec,
4970 	},
4971 	{
4972 		.procname	=	"gc_min_interval_ms",
4973 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
4974 		.maxlen		=	sizeof(int),
4975 		.mode		=	0644,
4976 		.proc_handler	=	proc_dointvec_ms_jiffies,
4977 	},
4978 	{ }
4979 };
4980 
4981 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
4982 {
4983 	struct ctl_table *table;
4984 
4985 	table = kmemdup(ipv6_route_table_template,
4986 			sizeof(ipv6_route_table_template),
4987 			GFP_KERNEL);
4988 
4989 	if (table) {
4990 		table[0].data = &net->ipv6.sysctl.flush_delay;
4991 		table[0].extra1 = net;
4992 		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
4993 		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
4994 		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
4995 		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
4996 		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
4997 		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
4998 		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
4999 		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
5000 		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5001 
5002 		/* Don't export sysctls to unprivileged users */
5003 		if (net->user_ns != &init_user_ns)
5004 			table[0].procname = NULL;
5005 	}
5006 
5007 	return table;
5008 }
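
/*
 * Editor's note: the table[N].data fixups above are positional; adding
 * an entry to ipv6_route_table_template requires renumbering these
 * indices to match.
 */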
5009 #endif
5010 
5011 static int __net_init ip6_route_net_init(struct net *net)
5012 {
5013 	int ret = -ENOMEM;
5014 
5015 	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
5016 	       sizeof(net->ipv6.ip6_dst_ops));
5017 
5018 	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
5019 		goto out_ip6_dst_ops;
5020 
5021 	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
5022 					    sizeof(*net->ipv6.fib6_null_entry),
5023 					    GFP_KERNEL);
5024 	if (!net->ipv6.fib6_null_entry)
5025 		goto out_ip6_dst_entries;
5026 
5027 	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
5028 					   sizeof(*net->ipv6.ip6_null_entry),
5029 					   GFP_KERNEL);
5030 	if (!net->ipv6.ip6_null_entry)
5031 		goto out_fib6_null_entry;
5032 	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5033 	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
5034 			 ip6_template_metrics, true);
5035 
5036 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5037 	net->ipv6.fib6_has_custom_rules = false;
5038 	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
5039 					       sizeof(*net->ipv6.ip6_prohibit_entry),
5040 					       GFP_KERNEL);
5041 	if (!net->ipv6.ip6_prohibit_entry)
5042 		goto out_ip6_null_entry;
5043 	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5044 	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
5045 			 ip6_template_metrics, true);
5046 
5047 	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
5048 					       sizeof(*net->ipv6.ip6_blk_hole_entry),
5049 					       GFP_KERNEL);
5050 	if (!net->ipv6.ip6_blk_hole_entry)
5051 		goto out_ip6_prohibit_entry;
5052 	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5053 	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
5054 			 ip6_template_metrics, true);
5055 #endif
5056 
5057 	net->ipv6.sysctl.flush_delay = 0;
5058 	net->ipv6.sysctl.ip6_rt_max_size = 4096;
5059 	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
5060 	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
5061 	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
5062 	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
5063 	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
5064 	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
5065 
5066 	net->ipv6.ip6_rt_gc_expire = 30*HZ;
5067 
5068 	ret = 0;
5069 out:
5070 	return ret;
5071 
5072 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5073 out_ip6_prohibit_entry:
5074 	kfree(net->ipv6.ip6_prohibit_entry);
5075 out_ip6_null_entry:
5076 	kfree(net->ipv6.ip6_null_entry);
5077 #endif
5078 out_fib6_null_entry:
5079 	kfree(net->ipv6.fib6_null_entry);
5080 out_ip6_dst_entries:
5081 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5082 out_ip6_dst_ops:
5083 	goto out;
5084 }
5085 
5086 static void __net_exit ip6_route_net_exit(struct net *net)
5087 {
5088 	kfree(net->ipv6.fib6_null_entry);
5089 	kfree(net->ipv6.ip6_null_entry);
5090 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5091 	kfree(net->ipv6.ip6_prohibit_entry);
5092 	kfree(net->ipv6.ip6_blk_hole_entry);
5093 #endif
5094 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5095 }
5096 
5097 static int __net_init ip6_route_net_init_late(struct net *net)
5098 {
5099 #ifdef CONFIG_PROC_FS
5100 	proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
5101 	proc_create("rt6_stats", 0444, net->proc_net, &rt6_stats_seq_fops);
5102 #endif
5103 	return 0;
5104 }
5105 
5106 static void __net_exit ip6_route_net_exit_late(struct net *net)
5107 {
5108 #ifdef CONFIG_PROC_FS
5109 	remove_proc_entry("ipv6_route", net->proc_net);
5110 	remove_proc_entry("rt6_stats", net->proc_net);
5111 #endif
5112 }
5113 
5114 static struct pernet_operations ip6_route_net_ops = {
5115 	.init = ip6_route_net_init,
5116 	.exit = ip6_route_net_exit,
5117 };
5118 
5119 static int __net_init ipv6_inetpeer_init(struct net *net)
5120 {
5121 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5122 
5123 	if (!bp)
5124 		return -ENOMEM;
5125 	inet_peer_base_init(bp);
5126 	net->ipv6.peers = bp;
5127 	return 0;
5128 }
5129 
5130 static void __net_exit ipv6_inetpeer_exit(struct net *net)
5131 {
5132 	struct inet_peer_base *bp = net->ipv6.peers;
5133 
5134 	net->ipv6.peers = NULL;
5135 	inetpeer_invalidate_tree(bp);
5136 	kfree(bp);
5137 }
5138 
5139 static struct pernet_operations ipv6_inetpeer_ops = {
5140 	.init	=	ipv6_inetpeer_init,
5141 	.exit	=	ipv6_inetpeer_exit,
5142 };
5143 
5144 static struct pernet_operations ip6_route_net_late_ops = {
5145 	.init = ip6_route_net_init_late,
5146 	.exit = ip6_route_net_exit_late,
5147 };
5148 
5149 static struct notifier_block ip6_route_dev_notifier = {
5150 	.notifier_call = ip6_route_dev_notify,
5151 	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
5152 };
5153 
5154 void __init ip6_route_init_special_entries(void)
5155 {
5156 	/* The loopback device is registered before this portion of code
5157 	 * runs, so the loopback reference in rt6_info is not taken
5158 	 * automatically; take it manually for init_net. */
5159 	init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
5160 	init_net.ipv6.fib6_null_entry->fib6_idev = in6_dev_get(init_net.loopback_dev);
5161 	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
5162 	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5163 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5164 	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
5165 	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5166 	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
5167 	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5168 #endif
5169 }
5170 
5171 int __init ip6_route_init(void)
5172 {
5173 	int ret;
5174 	int cpu;
5175 
5176 	ret = -ENOMEM;
5177 	ip6_dst_ops_template.kmem_cachep =
5178 		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
5179 				  SLAB_HWCACHE_ALIGN, NULL);
5180 	if (!ip6_dst_ops_template.kmem_cachep)
5181 		goto out;
5182 
5183 	ret = dst_entries_init(&ip6_dst_blackhole_ops);
5184 	if (ret)
5185 		goto out_kmem_cache;
5186 
5187 	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
5188 	if (ret)
5189 		goto out_dst_entries;
5190 
5191 	ret = register_pernet_subsys(&ip6_route_net_ops);
5192 	if (ret)
5193 		goto out_register_inetpeer;
5194 
5195 	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
5196 
5197 	ret = fib6_init();
5198 	if (ret)
5199 		goto out_register_subsys;
5200 
5201 	ret = xfrm6_init();
5202 	if (ret)
5203 		goto out_fib6_init;
5204 
5205 	ret = fib6_rules_init();
5206 	if (ret)
5207 		goto xfrm6_init;
5208 
5209 	ret = register_pernet_subsys(&ip6_route_net_late_ops);
5210 	if (ret)
5211 		goto fib6_rules_init;
5212 
5213 	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
5214 				   inet6_rtm_newroute, NULL, 0);
5215 	if (ret < 0)
5216 		goto out_register_late_subsys;
5217 
5218 	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
5219 				   inet6_rtm_delroute, NULL, 0);
5220 	if (ret < 0)
5221 		goto out_register_late_subsys;
5222 
5223 	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
5224 				   inet6_rtm_getroute, NULL,
5225 				   RTNL_FLAG_DOIT_UNLOCKED);
5226 	if (ret < 0)
5227 		goto out_register_late_subsys;
5228 
5229 	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
5230 	if (ret)
5231 		goto out_register_late_subsys;
5232 
5233 	for_each_possible_cpu(cpu) {
5234 		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
5235 
5236 		INIT_LIST_HEAD(&ul->head);
5237 		spin_lock_init(&ul->lock);
5238 	}
5239 
5240 out:
5241 	return ret;
5242 
5243 out_register_late_subsys:
5244 	rtnl_unregister_all(PF_INET6);
5245 	unregister_pernet_subsys(&ip6_route_net_late_ops);
5246 fib6_rules_init:
5247 	fib6_rules_cleanup();
5248 xfrm6_init:
5249 	xfrm6_fini();
5250 out_fib6_init:
5251 	fib6_gc_cleanup();
5252 out_register_subsys:
5253 	unregister_pernet_subsys(&ip6_route_net_ops);
5254 out_register_inetpeer:
5255 	unregister_pernet_subsys(&ipv6_inetpeer_ops);
5256 out_dst_entries:
5257 	dst_entries_destroy(&ip6_dst_blackhole_ops);
5258 out_kmem_cache:
5259 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5260 	goto out;
5261 }
5262 
5263 void ip6_route_cleanup(void)
5264 {
5265 	unregister_netdevice_notifier(&ip6_route_dev_notifier);
5266 	unregister_pernet_subsys(&ip6_route_net_late_ops);
5267 	fib6_rules_cleanup();
5268 	xfrm6_fini();
5269 	fib6_gc_cleanup();
5270 	unregister_pernet_subsys(&ipv6_inetpeer_ops);
5271 	unregister_pernet_subsys(&ip6_route_net_ops);
5272 	dst_entries_destroy(&ip6_dst_blackhole_ops);
5273 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5274 }
5275