xref: /openbmc/linux/net/ipv6/route.c (revision a8da474e)
1 /*
2  *	Linux INET6 implementation
3  *	FIB front-end.
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13 
14 /*	Changes:
15  *
16  *	YOSHIFUJI Hideaki @USAGI
17  *		reworked default router selection.
18  *		- respect outgoing interface
19  *		- select from (probably) reachable routers (i.e.
20  *		routers in REACHABLE, STALE, DELAY or PROBE states).
21  *		- always select the same router if it is (probably)
22  *		reachable.  otherwise, round-robin the list.
23  *	Ville Nuorvala
24  *		Fixed routing subtrees.
25  */
26 
27 #define pr_fmt(fmt) "IPv6: " fmt
28 
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <net/net_namespace.h>
48 #include <net/snmp.h>
49 #include <net/ipv6.h>
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
52 #include <net/ndisc.h>
53 #include <net/addrconf.h>
54 #include <net/tcp.h>
55 #include <linux/rtnetlink.h>
56 #include <net/dst.h>
57 #include <net/dst_metadata.h>
58 #include <net/xfrm.h>
59 #include <net/netevent.h>
60 #include <net/netlink.h>
61 #include <net/nexthop.h>
62 #include <net/lwtunnel.h>
63 #include <net/ip_tunnels.h>
64 #include <net/l3mdev.h>
65 
66 #include <asm/uaccess.h>
67 
68 #ifdef CONFIG_SYSCTL
69 #include <linux/sysctl.h>
70 #endif
71 
/*
 * Result of checking the neighbour entry of a route's gateway.
 * Every failure value is negative; RT6_NUD_SUCCEED is the only
 * positive one.
 */
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD	= -3,
	RT6_NUD_FAIL_PROBE	= -2,
	RT6_NUD_FAIL_DO_RR	= -1,
	RT6_NUD_SUCCEED		= 1,
};
78 
79 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
80 static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
81 static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
82 static unsigned int	 ip6_mtu(const struct dst_entry *dst);
83 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
84 static void		ip6_dst_destroy(struct dst_entry *);
85 static void		ip6_dst_ifdown(struct dst_entry *,
86 				       struct net_device *dev, int how);
87 static int		 ip6_dst_gc(struct dst_ops *ops);
88 
89 static int		ip6_pkt_discard(struct sk_buff *skb);
90 static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
91 static int		ip6_pkt_prohibit(struct sk_buff *skb);
92 static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
93 static void		ip6_link_failure(struct sk_buff *skb);
94 static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
95 					   struct sk_buff *skb, u32 mtu);
96 static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
97 					struct sk_buff *skb);
98 static void		rt6_dst_from_metrics_check(struct rt6_info *rt);
99 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
100 
101 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Route-information helpers; rt6_route_rcv() calls them before they are defined. */
static struct rt6_info *
rt6_add_route_info(struct net *net, const struct in6_addr *prefix,
		   int prefixlen, const struct in6_addr *gwaddr, int ifindex,
		   unsigned int pref);
static struct rt6_info *
rt6_get_route_info(struct net *net, const struct in6_addr *prefix,
		   int prefixlen, const struct in6_addr *gwaddr, int ifindex);
109 #endif
110 
111 struct uncached_list {
112 	spinlock_t		lock;
113 	struct list_head	head;
114 };
115 
116 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
117 
118 static void rt6_uncached_list_add(struct rt6_info *rt)
119 {
120 	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
121 
122 	rt->dst.flags |= DST_NOCACHE;
123 	rt->rt6i_uncached_list = ul;
124 
125 	spin_lock_bh(&ul->lock);
126 	list_add_tail(&rt->rt6i_uncached, &ul->head);
127 	spin_unlock_bh(&ul->lock);
128 }
129 
130 static void rt6_uncached_list_del(struct rt6_info *rt)
131 {
132 	if (!list_empty(&rt->rt6i_uncached)) {
133 		struct uncached_list *ul = rt->rt6i_uncached_list;
134 
135 		spin_lock_bh(&ul->lock);
136 		list_del(&rt->rt6i_uncached);
137 		spin_unlock_bh(&ul->lock);
138 	}
139 }
140 
141 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
142 {
143 	struct net_device *loopback_dev = net->loopback_dev;
144 	int cpu;
145 
146 	if (dev == loopback_dev)
147 		return;
148 
149 	for_each_possible_cpu(cpu) {
150 		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
151 		struct rt6_info *rt;
152 
153 		spin_lock_bh(&ul->lock);
154 		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
155 			struct inet6_dev *rt_idev = rt->rt6i_idev;
156 			struct net_device *rt_dev = rt->dst.dev;
157 
158 			if (rt_idev->dev == dev) {
159 				rt->rt6i_idev = in6_dev_get(loopback_dev);
160 				in6_dev_put(rt_idev);
161 			}
162 
163 			if (rt_dev == dev) {
164 				rt->dst.dev = loopback_dev;
165 				dev_hold(rt->dst.dev);
166 				dev_put(rt_dev);
167 			}
168 		}
169 		spin_unlock_bh(&ul->lock);
170 	}
171 }
172 
173 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
174 {
175 	return dst_metrics_write_ptr(rt->dst.from);
176 }
177 
178 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
179 {
180 	struct rt6_info *rt = (struct rt6_info *)dst;
181 
182 	if (rt->rt6i_flags & RTF_PCPU)
183 		return rt6_pcpu_cow_metrics(rt);
184 	else if (rt->rt6i_flags & RTF_CACHE)
185 		return NULL;
186 	else
187 		return dst_cow_metrics_generic(dst, old);
188 }
189 
190 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
191 					     struct sk_buff *skb,
192 					     const void *daddr)
193 {
194 	struct in6_addr *p = &rt->rt6i_gateway;
195 
196 	if (!ipv6_addr_any(p))
197 		return (const void *) p;
198 	else if (skb)
199 		return &ipv6_hdr(skb)->daddr;
200 	return daddr;
201 }
202 
203 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
204 					  struct sk_buff *skb,
205 					  const void *daddr)
206 {
207 	struct rt6_info *rt = (struct rt6_info *) dst;
208 	struct neighbour *n;
209 
210 	daddr = choose_neigh_daddr(rt, skb, daddr);
211 	n = __ipv6_neigh_lookup(dst->dev, daddr);
212 	if (n)
213 		return n;
214 	return neigh_create(&nd_tbl, daddr, dst->dev);
215 }
216 
217 static struct dst_ops ip6_dst_ops_template = {
218 	.family			=	AF_INET6,
219 	.gc			=	ip6_dst_gc,
220 	.gc_thresh		=	1024,
221 	.check			=	ip6_dst_check,
222 	.default_advmss		=	ip6_default_advmss,
223 	.mtu			=	ip6_mtu,
224 	.cow_metrics		=	ipv6_cow_metrics,
225 	.destroy		=	ip6_dst_destroy,
226 	.ifdown			=	ip6_dst_ifdown,
227 	.negative_advice	=	ip6_negative_advice,
228 	.link_failure		=	ip6_link_failure,
229 	.update_pmtu		=	ip6_rt_update_pmtu,
230 	.redirect		=	rt6_do_redirect,
231 	.local_out		=	__ip6_local_out,
232 	.neigh_lookup		=	ip6_neigh_lookup,
233 };
234 
235 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
236 {
237 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
238 
239 	return mtu ? : dst->dev->mtu;
240 }
241 
242 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
243 					 struct sk_buff *skb, u32 mtu)
244 {
245 }
246 
/* dst_ops->redirect for blackhole dsts: deliberately does nothing. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst,
				      struct sock *sk,
				      struct sk_buff *skb)
{
	/* empty on purpose */
}
251 
252 static struct dst_ops ip6_dst_blackhole_ops = {
253 	.family			=	AF_INET6,
254 	.destroy		=	ip6_dst_destroy,
255 	.check			=	ip6_dst_check,
256 	.mtu			=	ip6_blackhole_mtu,
257 	.default_advmss		=	ip6_default_advmss,
258 	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
259 	.redirect		=	ip6_rt_blackhole_redirect,
260 	.cow_metrics		=	dst_cow_metrics_generic,
261 	.neigh_lookup		=	ip6_neigh_lookup,
262 };
263 
264 static const u32 ip6_template_metrics[RTAX_MAX] = {
265 	[RTAX_HOPLIMIT - 1] = 0,
266 };
267 
268 static const struct rt6_info ip6_null_entry_template = {
269 	.dst = {
270 		.__refcnt	= ATOMIC_INIT(1),
271 		.__use		= 1,
272 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
273 		.error		= -ENETUNREACH,
274 		.input		= ip6_pkt_discard,
275 		.output		= ip6_pkt_discard_out,
276 	},
277 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
278 	.rt6i_protocol  = RTPROT_KERNEL,
279 	.rt6i_metric	= ~(u32) 0,
280 	.rt6i_ref	= ATOMIC_INIT(1),
281 };
282 
283 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
284 
285 static const struct rt6_info ip6_prohibit_entry_template = {
286 	.dst = {
287 		.__refcnt	= ATOMIC_INIT(1),
288 		.__use		= 1,
289 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
290 		.error		= -EACCES,
291 		.input		= ip6_pkt_prohibit,
292 		.output		= ip6_pkt_prohibit_out,
293 	},
294 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
295 	.rt6i_protocol  = RTPROT_KERNEL,
296 	.rt6i_metric	= ~(u32) 0,
297 	.rt6i_ref	= ATOMIC_INIT(1),
298 };
299 
300 static const struct rt6_info ip6_blk_hole_entry_template = {
301 	.dst = {
302 		.__refcnt	= ATOMIC_INIT(1),
303 		.__use		= 1,
304 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
305 		.error		= -EINVAL,
306 		.input		= dst_discard,
307 		.output		= dst_discard_out,
308 	},
309 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
310 	.rt6i_protocol  = RTPROT_KERNEL,
311 	.rt6i_metric	= ~(u32) 0,
312 	.rt6i_ref	= ATOMIC_INIT(1),
313 };
314 
315 #endif
316 
317 static void rt6_info_init(struct rt6_info *rt)
318 {
319 	struct dst_entry *dst = &rt->dst;
320 
321 	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
322 	INIT_LIST_HEAD(&rt->rt6i_siblings);
323 	INIT_LIST_HEAD(&rt->rt6i_uncached);
324 }
325 
326 /* allocate dst with ip6_dst_ops */
327 static struct rt6_info *__ip6_dst_alloc(struct net *net,
328 					struct net_device *dev,
329 					int flags)
330 {
331 	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
332 					0, DST_OBSOLETE_FORCE_CHK, flags);
333 
334 	if (rt)
335 		rt6_info_init(rt);
336 
337 	return rt;
338 }
339 
340 static struct rt6_info *ip6_dst_alloc(struct net *net,
341 				      struct net_device *dev,
342 				      int flags)
343 {
344 	struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
345 
346 	if (rt) {
347 		rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
348 		if (rt->rt6i_pcpu) {
349 			int cpu;
350 
351 			for_each_possible_cpu(cpu) {
352 				struct rt6_info **p;
353 
354 				p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
355 				/* no one shares rt */
356 				*p =  NULL;
357 			}
358 		} else {
359 			dst_destroy((struct dst_entry *)rt);
360 			return NULL;
361 		}
362 	}
363 
364 	return rt;
365 }
366 
367 static void ip6_dst_destroy(struct dst_entry *dst)
368 {
369 	struct rt6_info *rt = (struct rt6_info *)dst;
370 	struct dst_entry *from = dst->from;
371 	struct inet6_dev *idev;
372 
373 	dst_destroy_metrics_generic(dst);
374 	free_percpu(rt->rt6i_pcpu);
375 	rt6_uncached_list_del(rt);
376 
377 	idev = rt->rt6i_idev;
378 	if (idev) {
379 		rt->rt6i_idev = NULL;
380 		in6_dev_put(idev);
381 	}
382 
383 	dst->from = NULL;
384 	dst_release(from);
385 }
386 
387 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
388 			   int how)
389 {
390 	struct rt6_info *rt = (struct rt6_info *)dst;
391 	struct inet6_dev *idev = rt->rt6i_idev;
392 	struct net_device *loopback_dev =
393 		dev_net(dev)->loopback_dev;
394 
395 	if (dev != loopback_dev) {
396 		if (idev && idev->dev == dev) {
397 			struct inet6_dev *loopback_idev =
398 				in6_dev_get(loopback_dev);
399 			if (loopback_idev) {
400 				rt->rt6i_idev = loopback_idev;
401 				in6_dev_put(idev);
402 			}
403 		}
404 	}
405 }
406 
407 static bool __rt6_check_expired(const struct rt6_info *rt)
408 {
409 	if (rt->rt6i_flags & RTF_EXPIRES)
410 		return time_after(jiffies, rt->dst.expires);
411 	else
412 		return false;
413 }
414 
415 static bool rt6_check_expired(const struct rt6_info *rt)
416 {
417 	if (rt->rt6i_flags & RTF_EXPIRES) {
418 		if (time_after(jiffies, rt->dst.expires))
419 			return true;
420 	} else if (rt->dst.from) {
421 		return rt6_check_expired((struct rt6_info *) rt->dst.from);
422 	}
423 	return false;
424 }
425 
426 /* Multipath route selection:
427  *   Hash based function using packet header and flowlabel.
428  * Adapted from fib_info_hashfn()
429  */
430 static int rt6_info_hash_nhsfn(unsigned int candidate_count,
431 			       const struct flowi6 *fl6)
432 {
433 	return get_hash_from_flowi6(fl6) % candidate_count;
434 }
435 
436 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
437 					     struct flowi6 *fl6, int oif,
438 					     int strict)
439 {
440 	struct rt6_info *sibling, *next_sibling;
441 	int route_choosen;
442 
443 	route_choosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6);
444 	/* Don't change the route, if route_choosen == 0
445 	 * (siblings does not include ourself)
446 	 */
447 	if (route_choosen)
448 		list_for_each_entry_safe(sibling, next_sibling,
449 				&match->rt6i_siblings, rt6i_siblings) {
450 			route_choosen--;
451 			if (route_choosen == 0) {
452 				if (rt6_score_route(sibling, oif, strict) < 0)
453 					break;
454 				match = sibling;
455 				break;
456 			}
457 		}
458 	return match;
459 }
460 
461 /*
462  *	Route lookup. Any table->tb6_lock is implied.
463  */
464 
465 static inline struct rt6_info *rt6_device_match(struct net *net,
466 						    struct rt6_info *rt,
467 						    const struct in6_addr *saddr,
468 						    int oif,
469 						    int flags)
470 {
471 	struct rt6_info *local = NULL;
472 	struct rt6_info *sprt;
473 
474 	if (!oif && ipv6_addr_any(saddr))
475 		goto out;
476 
477 	for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
478 		struct net_device *dev = sprt->dst.dev;
479 
480 		if (oif) {
481 			if (dev->ifindex == oif)
482 				return sprt;
483 			if (dev->flags & IFF_LOOPBACK) {
484 				if (!sprt->rt6i_idev ||
485 				    sprt->rt6i_idev->dev->ifindex != oif) {
486 					if (flags & RT6_LOOKUP_F_IFACE)
487 						continue;
488 					if (local &&
489 					    local->rt6i_idev->dev->ifindex == oif)
490 						continue;
491 				}
492 				local = sprt;
493 			}
494 		} else {
495 			if (ipv6_chk_addr(net, saddr, dev,
496 					  flags & RT6_LOOKUP_F_IFACE))
497 				return sprt;
498 		}
499 	}
500 
501 	if (oif) {
502 		if (local)
503 			return local;
504 
505 		if (flags & RT6_LOOKUP_F_IFACE)
506 			return net->ipv6.ip6_null_entry;
507 	}
508 out:
509 	return rt;
510 }
511 
512 #ifdef CONFIG_IPV6_ROUTER_PREF
513 struct __rt6_probe_work {
514 	struct work_struct work;
515 	struct in6_addr target;
516 	struct net_device *dev;
517 };
518 
519 static void rt6_probe_deferred(struct work_struct *w)
520 {
521 	struct in6_addr mcaddr;
522 	struct __rt6_probe_work *work =
523 		container_of(w, struct __rt6_probe_work, work);
524 
525 	addrconf_addr_solict_mult(&work->target, &mcaddr);
526 	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, NULL);
527 	dev_put(work->dev);
528 	kfree(work);
529 }
530 
531 static void rt6_probe(struct rt6_info *rt)
532 {
533 	struct __rt6_probe_work *work;
534 	struct neighbour *neigh;
535 	/*
536 	 * Okay, this does not seem to be appropriate
537 	 * for now, however, we need to check if it
538 	 * is really so; aka Router Reachability Probing.
539 	 *
540 	 * Router Reachability Probe MUST be rate-limited
541 	 * to no more than one per minute.
542 	 */
543 	if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
544 		return;
545 	rcu_read_lock_bh();
546 	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
547 	if (neigh) {
548 		if (neigh->nud_state & NUD_VALID)
549 			goto out;
550 
551 		work = NULL;
552 		write_lock(&neigh->lock);
553 		if (!(neigh->nud_state & NUD_VALID) &&
554 		    time_after(jiffies,
555 			       neigh->updated +
556 			       rt->rt6i_idev->cnf.rtr_probe_interval)) {
557 			work = kmalloc(sizeof(*work), GFP_ATOMIC);
558 			if (work)
559 				__neigh_set_probe_once(neigh);
560 		}
561 		write_unlock(&neigh->lock);
562 	} else {
563 		work = kmalloc(sizeof(*work), GFP_ATOMIC);
564 	}
565 
566 	if (work) {
567 		INIT_WORK(&work->work, rt6_probe_deferred);
568 		work->target = rt->rt6i_gateway;
569 		dev_hold(rt->dst.dev);
570 		work->dev = rt->dst.dev;
571 		schedule_work(&work->work);
572 	}
573 
574 out:
575 	rcu_read_unlock_bh();
576 }
577 #else
/* Without CONFIG_IPV6_ROUTER_PREF probing is a no-op. */
static inline void rt6_probe(struct rt6_info *rt)
{
	/* intentionally empty */
}
581 #endif
582 
583 /*
584  * Default Router Selection (RFC 2461 6.3.6)
585  */
586 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
587 {
588 	struct net_device *dev = rt->dst.dev;
589 	if (!oif || dev->ifindex == oif)
590 		return 2;
591 	if ((dev->flags & IFF_LOOPBACK) &&
592 	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
593 		return 1;
594 	return 0;
595 }
596 
597 static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
598 {
599 	struct neighbour *neigh;
600 	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
601 
602 	if (rt->rt6i_flags & RTF_NONEXTHOP ||
603 	    !(rt->rt6i_flags & RTF_GATEWAY))
604 		return RT6_NUD_SUCCEED;
605 
606 	rcu_read_lock_bh();
607 	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
608 	if (neigh) {
609 		read_lock(&neigh->lock);
610 		if (neigh->nud_state & NUD_VALID)
611 			ret = RT6_NUD_SUCCEED;
612 #ifdef CONFIG_IPV6_ROUTER_PREF
613 		else if (!(neigh->nud_state & NUD_FAILED))
614 			ret = RT6_NUD_SUCCEED;
615 		else
616 			ret = RT6_NUD_FAIL_PROBE;
617 #endif
618 		read_unlock(&neigh->lock);
619 	} else {
620 		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
621 		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
622 	}
623 	rcu_read_unlock_bh();
624 
625 	return ret;
626 }
627 
628 static int rt6_score_route(struct rt6_info *rt, int oif,
629 			   int strict)
630 {
631 	int m;
632 
633 	m = rt6_check_dev(rt, oif);
634 	if (!m && (strict & RT6_LOOKUP_F_IFACE))
635 		return RT6_NUD_FAIL_HARD;
636 #ifdef CONFIG_IPV6_ROUTER_PREF
637 	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
638 #endif
639 	if (strict & RT6_LOOKUP_F_REACHABLE) {
640 		int n = rt6_check_neigh(rt);
641 		if (n < 0)
642 			return n;
643 	}
644 	return m;
645 }
646 
647 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
648 				   int *mpri, struct rt6_info *match,
649 				   bool *do_rr)
650 {
651 	int m;
652 	bool match_do_rr = false;
653 	struct inet6_dev *idev = rt->rt6i_idev;
654 	struct net_device *dev = rt->dst.dev;
655 
656 	if (dev && !netif_carrier_ok(dev) &&
657 	    idev->cnf.ignore_routes_with_linkdown)
658 		goto out;
659 
660 	if (rt6_check_expired(rt))
661 		goto out;
662 
663 	m = rt6_score_route(rt, oif, strict);
664 	if (m == RT6_NUD_FAIL_DO_RR) {
665 		match_do_rr = true;
666 		m = 0; /* lowest valid score */
667 	} else if (m == RT6_NUD_FAIL_HARD) {
668 		goto out;
669 	}
670 
671 	if (strict & RT6_LOOKUP_F_REACHABLE)
672 		rt6_probe(rt);
673 
674 	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
675 	if (m > *mpri) {
676 		*do_rr = match_do_rr;
677 		*mpri = m;
678 		match = rt;
679 	}
680 out:
681 	return match;
682 }
683 
684 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
685 				     struct rt6_info *rr_head,
686 				     u32 metric, int oif, int strict,
687 				     bool *do_rr)
688 {
689 	struct rt6_info *rt, *match, *cont;
690 	int mpri = -1;
691 
692 	match = NULL;
693 	cont = NULL;
694 	for (rt = rr_head; rt; rt = rt->dst.rt6_next) {
695 		if (rt->rt6i_metric != metric) {
696 			cont = rt;
697 			break;
698 		}
699 
700 		match = find_match(rt, oif, strict, &mpri, match, do_rr);
701 	}
702 
703 	for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) {
704 		if (rt->rt6i_metric != metric) {
705 			cont = rt;
706 			break;
707 		}
708 
709 		match = find_match(rt, oif, strict, &mpri, match, do_rr);
710 	}
711 
712 	if (match || !cont)
713 		return match;
714 
715 	for (rt = cont; rt; rt = rt->dst.rt6_next)
716 		match = find_match(rt, oif, strict, &mpri, match, do_rr);
717 
718 	return match;
719 }
720 
721 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
722 {
723 	struct rt6_info *match, *rt0;
724 	struct net *net;
725 	bool do_rr = false;
726 
727 	rt0 = fn->rr_ptr;
728 	if (!rt0)
729 		fn->rr_ptr = rt0 = fn->leaf;
730 
731 	match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict,
732 			     &do_rr);
733 
734 	if (do_rr) {
735 		struct rt6_info *next = rt0->dst.rt6_next;
736 
737 		/* no entries matched; do round-robin */
738 		if (!next || next->rt6i_metric != rt0->rt6i_metric)
739 			next = fn->leaf;
740 
741 		if (next != rt0)
742 			fn->rr_ptr = next;
743 	}
744 
745 	net = dev_net(rt0->dst.dev);
746 	return match ? match : net->ipv6.ip6_null_entry;
747 }
748 
749 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
750 {
751 	return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
752 }
753 
754 #ifdef CONFIG_IPV6_ROUTE_INFO
755 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
756 		  const struct in6_addr *gwaddr)
757 {
758 	struct net *net = dev_net(dev);
759 	struct route_info *rinfo = (struct route_info *) opt;
760 	struct in6_addr prefix_buf, *prefix;
761 	unsigned int pref;
762 	unsigned long lifetime;
763 	struct rt6_info *rt;
764 
765 	if (len < sizeof(struct route_info)) {
766 		return -EINVAL;
767 	}
768 
769 	/* Sanity check for prefix_len and length */
770 	if (rinfo->length > 3) {
771 		return -EINVAL;
772 	} else if (rinfo->prefix_len > 128) {
773 		return -EINVAL;
774 	} else if (rinfo->prefix_len > 64) {
775 		if (rinfo->length < 2) {
776 			return -EINVAL;
777 		}
778 	} else if (rinfo->prefix_len > 0) {
779 		if (rinfo->length < 1) {
780 			return -EINVAL;
781 		}
782 	}
783 
784 	pref = rinfo->route_pref;
785 	if (pref == ICMPV6_ROUTER_PREF_INVALID)
786 		return -EINVAL;
787 
788 	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
789 
790 	if (rinfo->length == 3)
791 		prefix = (struct in6_addr *)rinfo->prefix;
792 	else {
793 		/* this function is safe */
794 		ipv6_addr_prefix(&prefix_buf,
795 				 (struct in6_addr *)rinfo->prefix,
796 				 rinfo->prefix_len);
797 		prefix = &prefix_buf;
798 	}
799 
800 	if (rinfo->prefix_len == 0)
801 		rt = rt6_get_dflt_router(gwaddr, dev);
802 	else
803 		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
804 					gwaddr, dev->ifindex);
805 
806 	if (rt && !lifetime) {
807 		ip6_del_rt(rt);
808 		rt = NULL;
809 	}
810 
811 	if (!rt && lifetime)
812 		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
813 					pref);
814 	else if (rt)
815 		rt->rt6i_flags = RTF_ROUTEINFO |
816 				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
817 
818 	if (rt) {
819 		if (!addrconf_finite_timeout(lifetime))
820 			rt6_clean_expires(rt);
821 		else
822 			rt6_set_expires(rt, jiffies + HZ * lifetime);
823 
824 		ip6_rt_put(rt);
825 	}
826 	return 0;
827 }
828 #endif
829 
830 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
831 					struct in6_addr *saddr)
832 {
833 	struct fib6_node *pn;
834 	while (1) {
835 		if (fn->fn_flags & RTN_TL_ROOT)
836 			return NULL;
837 		pn = fn->parent;
838 		if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)
839 			fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr);
840 		else
841 			fn = pn;
842 		if (fn->fn_flags & RTN_RTINFO)
843 			return fn;
844 	}
845 }
846 
847 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
848 					     struct fib6_table *table,
849 					     struct flowi6 *fl6, int flags)
850 {
851 	struct fib6_node *fn;
852 	struct rt6_info *rt;
853 
854 	read_lock_bh(&table->tb6_lock);
855 	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
856 restart:
857 	rt = fn->leaf;
858 	rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
859 	if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
860 		rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
861 	if (rt == net->ipv6.ip6_null_entry) {
862 		fn = fib6_backtrack(fn, &fl6->saddr);
863 		if (fn)
864 			goto restart;
865 	}
866 	dst_use(&rt->dst, jiffies);
867 	read_unlock_bh(&table->tb6_lock);
868 	return rt;
869 
870 }
871 
872 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
873 				    int flags)
874 {
875 	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
876 }
877 EXPORT_SYMBOL_GPL(ip6_route_lookup);
878 
879 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
880 			    const struct in6_addr *saddr, int oif, int strict)
881 {
882 	struct flowi6 fl6 = {
883 		.flowi6_oif = oif,
884 		.daddr = *daddr,
885 	};
886 	struct dst_entry *dst;
887 	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
888 
889 	if (saddr) {
890 		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
891 		flags |= RT6_LOOKUP_F_HAS_SADDR;
892 	}
893 
894 	dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
895 	if (dst->error == 0)
896 		return (struct rt6_info *) dst;
897 
898 	dst_release(dst);
899 
900 	return NULL;
901 }
902 EXPORT_SYMBOL(rt6_lookup);
903 
/* ip6_ins_rt is called with table->tb6_lock NOT held.
   It takes a new route entry; if the addition fails for any reason, the
   route is freed. In any case, if the caller does not hold a reference
   to it, it may be destroyed.
 */
909 
910 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
911 			struct mx6_config *mxc)
912 {
913 	int err;
914 	struct fib6_table *table;
915 
916 	table = rt->rt6i_table;
917 	write_lock_bh(&table->tb6_lock);
918 	err = fib6_add(&table->tb6_root, rt, info, mxc);
919 	write_unlock_bh(&table->tb6_lock);
920 
921 	return err;
922 }
923 
924 int ip6_ins_rt(struct rt6_info *rt)
925 {
926 	struct nl_info info = {	.nl_net = dev_net(rt->dst.dev), };
927 	struct mx6_config mxc = { .mx = NULL, };
928 
929 	return __ip6_ins_rt(rt, &info, &mxc);
930 }
931 
932 static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
933 					   const struct in6_addr *daddr,
934 					   const struct in6_addr *saddr)
935 {
936 	struct rt6_info *rt;
937 
938 	/*
939 	 *	Clone the route.
940 	 */
941 
942 	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
943 		ort = (struct rt6_info *)ort->dst.from;
944 
945 	rt = __ip6_dst_alloc(dev_net(ort->dst.dev), ort->dst.dev, 0);
946 
947 	if (!rt)
948 		return NULL;
949 
950 	ip6_rt_copy_init(rt, ort);
951 	rt->rt6i_flags |= RTF_CACHE;
952 	rt->rt6i_metric = 0;
953 	rt->dst.flags |= DST_HOST;
954 	rt->rt6i_dst.addr = *daddr;
955 	rt->rt6i_dst.plen = 128;
956 
957 	if (!rt6_is_gw_or_nonexthop(ort)) {
958 		if (ort->rt6i_dst.plen != 128 &&
959 		    ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
960 			rt->rt6i_flags |= RTF_ANYCAST;
961 #ifdef CONFIG_IPV6_SUBTREES
962 		if (rt->rt6i_src.plen && saddr) {
963 			rt->rt6i_src.addr = *saddr;
964 			rt->rt6i_src.plen = 128;
965 		}
966 #endif
967 	}
968 
969 	return rt;
970 }
971 
972 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
973 {
974 	struct rt6_info *pcpu_rt;
975 
976 	pcpu_rt = __ip6_dst_alloc(dev_net(rt->dst.dev),
977 				  rt->dst.dev, rt->dst.flags);
978 
979 	if (!pcpu_rt)
980 		return NULL;
981 	ip6_rt_copy_init(pcpu_rt, rt);
982 	pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
983 	pcpu_rt->rt6i_flags |= RTF_PCPU;
984 	return pcpu_rt;
985 }
986 
987 /* It should be called with read_lock_bh(&tb6_lock) acquired */
988 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
989 {
990 	struct rt6_info *pcpu_rt, **p;
991 
992 	p = this_cpu_ptr(rt->rt6i_pcpu);
993 	pcpu_rt = *p;
994 
995 	if (pcpu_rt) {
996 		dst_hold(&pcpu_rt->dst);
997 		rt6_dst_from_metrics_check(pcpu_rt);
998 	}
999 	return pcpu_rt;
1000 }
1001 
1002 static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
1003 {
1004 	struct fib6_table *table = rt->rt6i_table;
1005 	struct rt6_info *pcpu_rt, *prev, **p;
1006 
1007 	pcpu_rt = ip6_rt_pcpu_alloc(rt);
1008 	if (!pcpu_rt) {
1009 		struct net *net = dev_net(rt->dst.dev);
1010 
1011 		dst_hold(&net->ipv6.ip6_null_entry->dst);
1012 		return net->ipv6.ip6_null_entry;
1013 	}
1014 
1015 	read_lock_bh(&table->tb6_lock);
1016 	if (rt->rt6i_pcpu) {
1017 		p = this_cpu_ptr(rt->rt6i_pcpu);
1018 		prev = cmpxchg(p, NULL, pcpu_rt);
1019 		if (prev) {
1020 			/* If someone did it before us, return prev instead */
1021 			dst_destroy(&pcpu_rt->dst);
1022 			pcpu_rt = prev;
1023 		}
1024 	} else {
1025 		/* rt has been removed from the fib6 tree
1026 		 * before we have a chance to acquire the read_lock.
1027 		 * In this case, don't brother to create a pcpu rt
1028 		 * since rt is going away anyway.  The next
1029 		 * dst_check() will trigger a re-lookup.
1030 		 */
1031 		dst_destroy(&pcpu_rt->dst);
1032 		pcpu_rt = rt;
1033 	}
1034 	dst_hold(&pcpu_rt->dst);
1035 	rt6_dst_from_metrics_check(pcpu_rt);
1036 	read_unlock_bh(&table->tb6_lock);
1037 	return pcpu_rt;
1038 }
1039 
1040 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
1041 				      struct flowi6 *fl6, int flags)
1042 {
1043 	struct fib6_node *fn, *saved_fn;
1044 	struct rt6_info *rt;
1045 	int strict = 0;
1046 
1047 	strict |= flags & RT6_LOOKUP_F_IFACE;
1048 	if (net->ipv6.devconf_all->forwarding == 0)
1049 		strict |= RT6_LOOKUP_F_REACHABLE;
1050 
1051 	read_lock_bh(&table->tb6_lock);
1052 
1053 	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1054 	saved_fn = fn;
1055 
1056 	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1057 		oif = 0;
1058 
1059 redo_rt6_select:
1060 	rt = rt6_select(fn, oif, strict);
1061 	if (rt->rt6i_nsiblings)
1062 		rt = rt6_multipath_select(rt, fl6, oif, strict);
1063 	if (rt == net->ipv6.ip6_null_entry) {
1064 		fn = fib6_backtrack(fn, &fl6->saddr);
1065 		if (fn)
1066 			goto redo_rt6_select;
1067 		else if (strict & RT6_LOOKUP_F_REACHABLE) {
1068 			/* also consider unreachable route */
1069 			strict &= ~RT6_LOOKUP_F_REACHABLE;
1070 			fn = saved_fn;
1071 			goto redo_rt6_select;
1072 		}
1073 	}
1074 
1075 
1076 	if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
1077 		dst_use(&rt->dst, jiffies);
1078 		read_unlock_bh(&table->tb6_lock);
1079 
1080 		rt6_dst_from_metrics_check(rt);
1081 		return rt;
1082 	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1083 			    !(rt->rt6i_flags & RTF_GATEWAY))) {
1084 		/* Create a RTF_CACHE clone which will not be
1085 		 * owned by the fib6 tree.  It is for the special case where
1086 		 * the daddr in the skb during the neighbor look-up is different
1087 		 * from the fl6->daddr used to look-up route here.
1088 		 */
1089 
1090 		struct rt6_info *uncached_rt;
1091 
1092 		dst_use(&rt->dst, jiffies);
1093 		read_unlock_bh(&table->tb6_lock);
1094 
1095 		uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
1096 		dst_release(&rt->dst);
1097 
1098 		if (uncached_rt)
1099 			rt6_uncached_list_add(uncached_rt);
1100 		else
1101 			uncached_rt = net->ipv6.ip6_null_entry;
1102 
1103 		dst_hold(&uncached_rt->dst);
1104 		return uncached_rt;
1105 
1106 	} else {
1107 		/* Get a percpu copy */
1108 
1109 		struct rt6_info *pcpu_rt;
1110 
1111 		rt->dst.lastuse = jiffies;
1112 		rt->dst.__use++;
1113 		pcpu_rt = rt6_get_pcpu_route(rt);
1114 
1115 		if (pcpu_rt) {
1116 			read_unlock_bh(&table->tb6_lock);
1117 		} else {
1118 			/* We have to do the read_unlock first
1119 			 * because rt6_make_pcpu_route() may trigger
1120 			 * ip6_dst_gc() which will take the write_lock.
1121 			 */
1122 			dst_hold(&rt->dst);
1123 			read_unlock_bh(&table->tb6_lock);
1124 			pcpu_rt = rt6_make_pcpu_route(rt);
1125 			dst_release(&rt->dst);
1126 		}
1127 
1128 		return pcpu_rt;
1129 
1130 	}
1131 }
1132 
1133 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
1134 					    struct flowi6 *fl6, int flags)
1135 {
1136 	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
1137 }
1138 
1139 static struct dst_entry *ip6_route_input_lookup(struct net *net,
1140 						struct net_device *dev,
1141 						struct flowi6 *fl6, int flags)
1142 {
1143 	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1144 		flags |= RT6_LOOKUP_F_IFACE;
1145 
1146 	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
1147 }
1148 
1149 void ip6_route_input(struct sk_buff *skb)
1150 {
1151 	const struct ipv6hdr *iph = ipv6_hdr(skb);
1152 	struct net *net = dev_net(skb->dev);
1153 	int flags = RT6_LOOKUP_F_HAS_SADDR;
1154 	struct ip_tunnel_info *tun_info;
1155 	struct flowi6 fl6 = {
1156 		.flowi6_iif = l3mdev_fib_oif(skb->dev),
1157 		.daddr = iph->daddr,
1158 		.saddr = iph->saddr,
1159 		.flowlabel = ip6_flowinfo(iph),
1160 		.flowi6_mark = skb->mark,
1161 		.flowi6_proto = iph->nexthdr,
1162 	};
1163 
1164 	tun_info = skb_tunnel_info(skb);
1165 	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1166 		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
1167 	skb_dst_drop(skb);
1168 	skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
1169 }
1170 
1171 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
1172 					     struct flowi6 *fl6, int flags)
1173 {
1174 	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
1175 }
1176 
1177 struct dst_entry *ip6_route_output(struct net *net, const struct sock *sk,
1178 				    struct flowi6 *fl6)
1179 {
1180 	struct dst_entry *dst;
1181 	int flags = 0;
1182 	bool any_src;
1183 
1184 	dst = l3mdev_rt6_dst_by_oif(net, fl6);
1185 	if (dst)
1186 		return dst;
1187 
1188 	fl6->flowi6_iif = LOOPBACK_IFINDEX;
1189 
1190 	any_src = ipv6_addr_any(&fl6->saddr);
1191 	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
1192 	    (fl6->flowi6_oif && any_src))
1193 		flags |= RT6_LOOKUP_F_IFACE;
1194 
1195 	if (!any_src)
1196 		flags |= RT6_LOOKUP_F_HAS_SADDR;
1197 	else if (sk)
1198 		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1199 
1200 	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
1201 }
1202 EXPORT_SYMBOL(ip6_route_output);
1203 
1204 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
1205 {
1206 	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1207 	struct dst_entry *new = NULL;
1208 
1209 	rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, DST_OBSOLETE_NONE, 0);
1210 	if (rt) {
1211 		rt6_info_init(rt);
1212 
1213 		new = &rt->dst;
1214 		new->__use = 1;
1215 		new->input = dst_discard;
1216 		new->output = dst_discard_out;
1217 
1218 		dst_copy_metrics(new, &ort->dst);
1219 		rt->rt6i_idev = ort->rt6i_idev;
1220 		if (rt->rt6i_idev)
1221 			in6_dev_hold(rt->rt6i_idev);
1222 
1223 		rt->rt6i_gateway = ort->rt6i_gateway;
1224 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
1225 		rt->rt6i_metric = 0;
1226 
1227 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1228 #ifdef CONFIG_IPV6_SUBTREES
1229 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1230 #endif
1231 
1232 		dst_free(new);
1233 	}
1234 
1235 	dst_release(dst_orig);
1236 	return new ? new : ERR_PTR(-ENOMEM);
1237 }
1238 
1239 /*
1240  *	Destination cache support functions
1241  */
1242 
1243 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1244 {
1245 	if (rt->dst.from &&
1246 	    dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
1247 		dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
1248 }
1249 
1250 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1251 {
1252 	if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie))
1253 		return NULL;
1254 
1255 	if (rt6_check_expired(rt))
1256 		return NULL;
1257 
1258 	return &rt->dst;
1259 }
1260 
1261 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1262 {
1263 	if (!__rt6_check_expired(rt) &&
1264 	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1265 	    rt6_check((struct rt6_info *)(rt->dst.from), cookie))
1266 		return &rt->dst;
1267 	else
1268 		return NULL;
1269 }
1270 
1271 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1272 {
1273 	struct rt6_info *rt;
1274 
1275 	rt = (struct rt6_info *) dst;
1276 
1277 	/* All IPV6 dsts are created with ->obsolete set to the value
1278 	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1279 	 * into this function always.
1280 	 */
1281 
1282 	rt6_dst_from_metrics_check(rt);
1283 
1284 	if (rt->rt6i_flags & RTF_PCPU ||
1285 	    (unlikely(dst->flags & DST_NOCACHE) && rt->dst.from))
1286 		return rt6_dst_from_check(rt, cookie);
1287 	else
1288 		return rt6_check(rt, cookie);
1289 }
1290 
1291 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1292 {
1293 	struct rt6_info *rt = (struct rt6_info *) dst;
1294 
1295 	if (rt) {
1296 		if (rt->rt6i_flags & RTF_CACHE) {
1297 			if (rt6_check_expired(rt)) {
1298 				ip6_del_rt(rt);
1299 				dst = NULL;
1300 			}
1301 		} else {
1302 			dst_release(dst);
1303 			dst = NULL;
1304 		}
1305 	}
1306 	return dst;
1307 }
1308 
1309 static void ip6_link_failure(struct sk_buff *skb)
1310 {
1311 	struct rt6_info *rt;
1312 
1313 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1314 
1315 	rt = (struct rt6_info *) skb_dst(skb);
1316 	if (rt) {
1317 		if (rt->rt6i_flags & RTF_CACHE) {
1318 			dst_hold(&rt->dst);
1319 			ip6_del_rt(rt);
1320 		} else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) {
1321 			rt->rt6i_node->fn_sernum = -1;
1322 		}
1323 	}
1324 }
1325 
1326 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
1327 {
1328 	struct net *net = dev_net(rt->dst.dev);
1329 
1330 	rt->rt6i_flags |= RTF_MODIFIED;
1331 	rt->rt6i_pmtu = mtu;
1332 	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
1333 }
1334 
1335 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
1336 {
1337 	return !(rt->rt6i_flags & RTF_CACHE) &&
1338 		(rt->rt6i_flags & RTF_PCPU || rt->rt6i_node);
1339 }
1340 
1341 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
1342 				 const struct ipv6hdr *iph, u32 mtu)
1343 {
1344 	struct rt6_info *rt6 = (struct rt6_info *)dst;
1345 
1346 	if (rt6->rt6i_flags & RTF_LOCAL)
1347 		return;
1348 
1349 	dst_confirm(dst);
1350 	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
1351 	if (mtu >= dst_mtu(dst))
1352 		return;
1353 
1354 	if (!rt6_cache_allowed_for_pmtu(rt6)) {
1355 		rt6_do_update_pmtu(rt6, mtu);
1356 	} else {
1357 		const struct in6_addr *daddr, *saddr;
1358 		struct rt6_info *nrt6;
1359 
1360 		if (iph) {
1361 			daddr = &iph->daddr;
1362 			saddr = &iph->saddr;
1363 		} else if (sk) {
1364 			daddr = &sk->sk_v6_daddr;
1365 			saddr = &inet6_sk(sk)->saddr;
1366 		} else {
1367 			return;
1368 		}
1369 		nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
1370 		if (nrt6) {
1371 			rt6_do_update_pmtu(nrt6, mtu);
1372 
1373 			/* ip6_ins_rt(nrt6) will bump the
1374 			 * rt6->rt6i_node->fn_sernum
1375 			 * which will fail the next rt6_check() and
1376 			 * invalidate the sk->sk_dst_cache.
1377 			 */
1378 			ip6_ins_rt(nrt6);
1379 		}
1380 	}
1381 }
1382 
1383 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1384 			       struct sk_buff *skb, u32 mtu)
1385 {
1386 	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
1387 }
1388 
1389 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
1390 		     int oif, u32 mark)
1391 {
1392 	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1393 	struct dst_entry *dst;
1394 	struct flowi6 fl6;
1395 
1396 	memset(&fl6, 0, sizeof(fl6));
1397 	fl6.flowi6_oif = oif;
1398 	fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
1399 	fl6.daddr = iph->daddr;
1400 	fl6.saddr = iph->saddr;
1401 	fl6.flowlabel = ip6_flowinfo(iph);
1402 
1403 	dst = ip6_route_output(net, NULL, &fl6);
1404 	if (!dst->error)
1405 		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
1406 	dst_release(dst);
1407 }
1408 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
1409 
1410 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
1411 {
1412 	ip6_update_pmtu(skb, sock_net(sk), mtu,
1413 			sk->sk_bound_dev_if, sk->sk_mark);
1414 }
1415 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
1416 
/* Handle redirects */

/* Flow key used for redirect route lookups.  __ip6_route_redirect()
 * receives a struct flowi6 pointer and casts it back to this type, so
 * fl6 has to remain the first member.
 */
struct ip6rd_flowi {
	struct flowi6 fl6;		/* flow being looked up */
	struct in6_addr gateway;	/* compared with rt6i_gateway of candidate routes */
};
1422 
1423 static struct rt6_info *__ip6_route_redirect(struct net *net,
1424 					     struct fib6_table *table,
1425 					     struct flowi6 *fl6,
1426 					     int flags)
1427 {
1428 	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1429 	struct rt6_info *rt;
1430 	struct fib6_node *fn;
1431 
1432 	/* Get the "current" route for this destination and
1433 	 * check if the redirect has come from approriate router.
1434 	 *
1435 	 * RFC 4861 specifies that redirects should only be
1436 	 * accepted if they come from the nexthop to the target.
1437 	 * Due to the way the routes are chosen, this notion
1438 	 * is a bit fuzzy and one might need to check all possible
1439 	 * routes.
1440 	 */
1441 
1442 	read_lock_bh(&table->tb6_lock);
1443 	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1444 restart:
1445 	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1446 		if (rt6_check_expired(rt))
1447 			continue;
1448 		if (rt->dst.error)
1449 			break;
1450 		if (!(rt->rt6i_flags & RTF_GATEWAY))
1451 			continue;
1452 		if (fl6->flowi6_oif != rt->dst.dev->ifindex)
1453 			continue;
1454 		if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1455 			continue;
1456 		break;
1457 	}
1458 
1459 	if (!rt)
1460 		rt = net->ipv6.ip6_null_entry;
1461 	else if (rt->dst.error) {
1462 		rt = net->ipv6.ip6_null_entry;
1463 		goto out;
1464 	}
1465 
1466 	if (rt == net->ipv6.ip6_null_entry) {
1467 		fn = fib6_backtrack(fn, &fl6->saddr);
1468 		if (fn)
1469 			goto restart;
1470 	}
1471 
1472 out:
1473 	dst_hold(&rt->dst);
1474 
1475 	read_unlock_bh(&table->tb6_lock);
1476 
1477 	return rt;
1478 };
1479 
1480 static struct dst_entry *ip6_route_redirect(struct net *net,
1481 					const struct flowi6 *fl6,
1482 					const struct in6_addr *gateway)
1483 {
1484 	int flags = RT6_LOOKUP_F_HAS_SADDR;
1485 	struct ip6rd_flowi rdfl;
1486 
1487 	rdfl.fl6 = *fl6;
1488 	rdfl.gateway = *gateway;
1489 
1490 	return fib6_rule_lookup(net, &rdfl.fl6,
1491 				flags, __ip6_route_redirect);
1492 }
1493 
1494 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark)
1495 {
1496 	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1497 	struct dst_entry *dst;
1498 	struct flowi6 fl6;
1499 
1500 	memset(&fl6, 0, sizeof(fl6));
1501 	fl6.flowi6_iif = LOOPBACK_IFINDEX;
1502 	fl6.flowi6_oif = oif;
1503 	fl6.flowi6_mark = mark;
1504 	fl6.daddr = iph->daddr;
1505 	fl6.saddr = iph->saddr;
1506 	fl6.flowlabel = ip6_flowinfo(iph);
1507 
1508 	dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
1509 	rt6_do_redirect(dst, NULL, skb);
1510 	dst_release(dst);
1511 }
1512 EXPORT_SYMBOL_GPL(ip6_redirect);
1513 
1514 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
1515 			    u32 mark)
1516 {
1517 	const struct ipv6hdr *iph = ipv6_hdr(skb);
1518 	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
1519 	struct dst_entry *dst;
1520 	struct flowi6 fl6;
1521 
1522 	memset(&fl6, 0, sizeof(fl6));
1523 	fl6.flowi6_iif = LOOPBACK_IFINDEX;
1524 	fl6.flowi6_oif = oif;
1525 	fl6.flowi6_mark = mark;
1526 	fl6.daddr = msg->dest;
1527 	fl6.saddr = iph->daddr;
1528 
1529 	dst = ip6_route_redirect(net, &fl6, &iph->saddr);
1530 	rt6_do_redirect(dst, NULL, skb);
1531 	dst_release(dst);
1532 }
1533 
1534 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
1535 {
1536 	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark);
1537 }
1538 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
1539 
1540 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1541 {
1542 	struct net_device *dev = dst->dev;
1543 	unsigned int mtu = dst_mtu(dst);
1544 	struct net *net = dev_net(dev);
1545 
1546 	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1547 
1548 	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1549 		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1550 
1551 	/*
1552 	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1553 	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1554 	 * IPV6_MAXPLEN is also valid and means: "any MSS,
1555 	 * rely only on pmtu discovery"
1556 	 */
1557 	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1558 		mtu = IPV6_MAXPLEN;
1559 	return mtu;
1560 }
1561 
1562 static unsigned int ip6_mtu(const struct dst_entry *dst)
1563 {
1564 	const struct rt6_info *rt = (const struct rt6_info *)dst;
1565 	unsigned int mtu = rt->rt6i_pmtu;
1566 	struct inet6_dev *idev;
1567 
1568 	if (mtu)
1569 		goto out;
1570 
1571 	mtu = dst_metric_raw(dst, RTAX_MTU);
1572 	if (mtu)
1573 		goto out;
1574 
1575 	mtu = IPV6_MIN_MTU;
1576 
1577 	rcu_read_lock();
1578 	idev = __in6_dev_get(dst->dev);
1579 	if (idev)
1580 		mtu = idev->cnf.mtu6;
1581 	rcu_read_unlock();
1582 
1583 out:
1584 	return min_t(unsigned int, mtu, IP6_MAX_MTU);
1585 }
1586 
/* Singly linked list, chained through dst_entry.next, of the dst entries
 * created by icmp6_dst_alloc().  Entries are unlinked and freed by
 * icmp6_dst_gc() and icmp6_clean_all().
 */
static struct dst_entry *icmp6_dst_gc_list;
/* Serialises every access to icmp6_dst_gc_list. */
static DEFINE_SPINLOCK(icmp6_dst_lock);
1589 
1590 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1591 				  struct flowi6 *fl6)
1592 {
1593 	struct dst_entry *dst;
1594 	struct rt6_info *rt;
1595 	struct inet6_dev *idev = in6_dev_get(dev);
1596 	struct net *net = dev_net(dev);
1597 
1598 	if (unlikely(!idev))
1599 		return ERR_PTR(-ENODEV);
1600 
1601 	rt = ip6_dst_alloc(net, dev, 0);
1602 	if (unlikely(!rt)) {
1603 		in6_dev_put(idev);
1604 		dst = ERR_PTR(-ENOMEM);
1605 		goto out;
1606 	}
1607 
1608 	rt->dst.flags |= DST_HOST;
1609 	rt->dst.output  = ip6_output;
1610 	atomic_set(&rt->dst.__refcnt, 1);
1611 	rt->rt6i_gateway  = fl6->daddr;
1612 	rt->rt6i_dst.addr = fl6->daddr;
1613 	rt->rt6i_dst.plen = 128;
1614 	rt->rt6i_idev     = idev;
1615 	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
1616 
1617 	spin_lock_bh(&icmp6_dst_lock);
1618 	rt->dst.next = icmp6_dst_gc_list;
1619 	icmp6_dst_gc_list = &rt->dst;
1620 	spin_unlock_bh(&icmp6_dst_lock);
1621 
1622 	fib6_force_start_gc(net);
1623 
1624 	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
1625 
1626 out:
1627 	return dst;
1628 }
1629 
1630 int icmp6_dst_gc(void)
1631 {
1632 	struct dst_entry *dst, **pprev;
1633 	int more = 0;
1634 
1635 	spin_lock_bh(&icmp6_dst_lock);
1636 	pprev = &icmp6_dst_gc_list;
1637 
1638 	while ((dst = *pprev) != NULL) {
1639 		if (!atomic_read(&dst->__refcnt)) {
1640 			*pprev = dst->next;
1641 			dst_free(dst);
1642 		} else {
1643 			pprev = &dst->next;
1644 			++more;
1645 		}
1646 	}
1647 
1648 	spin_unlock_bh(&icmp6_dst_lock);
1649 
1650 	return more;
1651 }
1652 
1653 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1654 			    void *arg)
1655 {
1656 	struct dst_entry *dst, **pprev;
1657 
1658 	spin_lock_bh(&icmp6_dst_lock);
1659 	pprev = &icmp6_dst_gc_list;
1660 	while ((dst = *pprev) != NULL) {
1661 		struct rt6_info *rt = (struct rt6_info *) dst;
1662 		if (func(rt, arg)) {
1663 			*pprev = dst->next;
1664 			dst_free(dst);
1665 		} else {
1666 			pprev = &dst->next;
1667 		}
1668 	}
1669 	spin_unlock_bh(&icmp6_dst_lock);
1670 }
1671 
1672 static int ip6_dst_gc(struct dst_ops *ops)
1673 {
1674 	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1675 	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1676 	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1677 	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1678 	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1679 	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1680 	int entries;
1681 
1682 	entries = dst_entries_get_fast(ops);
1683 	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
1684 	    entries <= rt_max_size)
1685 		goto out;
1686 
1687 	net->ipv6.ip6_rt_gc_expire++;
1688 	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
1689 	entries = dst_entries_get_slow(ops);
1690 	if (entries < ops->gc_thresh)
1691 		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1692 out:
1693 	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1694 	return entries > rt_max_size;
1695 }
1696 
1697 static int ip6_convert_metrics(struct mx6_config *mxc,
1698 			       const struct fib6_config *cfg)
1699 {
1700 	bool ecn_ca = false;
1701 	struct nlattr *nla;
1702 	int remaining;
1703 	u32 *mp;
1704 
1705 	if (!cfg->fc_mx)
1706 		return 0;
1707 
1708 	mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1709 	if (unlikely(!mp))
1710 		return -ENOMEM;
1711 
1712 	nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1713 		int type = nla_type(nla);
1714 		u32 val;
1715 
1716 		if (!type)
1717 			continue;
1718 		if (unlikely(type > RTAX_MAX))
1719 			goto err;
1720 
1721 		if (type == RTAX_CC_ALGO) {
1722 			char tmp[TCP_CA_NAME_MAX];
1723 
1724 			nla_strlcpy(tmp, nla, sizeof(tmp));
1725 			val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
1726 			if (val == TCP_CA_UNSPEC)
1727 				goto err;
1728 		} else {
1729 			val = nla_get_u32(nla);
1730 		}
1731 		if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
1732 			goto err;
1733 
1734 		mp[type - 1] = val;
1735 		__set_bit(type - 1, mxc->mx_valid);
1736 	}
1737 
1738 	if (ecn_ca) {
1739 		__set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
1740 		mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
1741 	}
1742 
1743 	mxc->mx = mp;
1744 	return 0;
1745  err:
1746 	kfree(mp);
1747 	return -EINVAL;
1748 }
1749 
1750 static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg)
1751 {
1752 	struct net *net = cfg->fc_nlinfo.nl_net;
1753 	struct rt6_info *rt = NULL;
1754 	struct net_device *dev = NULL;
1755 	struct inet6_dev *idev = NULL;
1756 	struct fib6_table *table;
1757 	int addr_type;
1758 	int err = -EINVAL;
1759 
1760 	if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1761 		goto out;
1762 #ifndef CONFIG_IPV6_SUBTREES
1763 	if (cfg->fc_src_len)
1764 		goto out;
1765 #endif
1766 	if (cfg->fc_ifindex) {
1767 		err = -ENODEV;
1768 		dev = dev_get_by_index(net, cfg->fc_ifindex);
1769 		if (!dev)
1770 			goto out;
1771 		idev = in6_dev_get(dev);
1772 		if (!idev)
1773 			goto out;
1774 	}
1775 
1776 	if (cfg->fc_metric == 0)
1777 		cfg->fc_metric = IP6_RT_PRIO_USER;
1778 
1779 	err = -ENOBUFS;
1780 	if (cfg->fc_nlinfo.nlh &&
1781 	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1782 		table = fib6_get_table(net, cfg->fc_table);
1783 		if (!table) {
1784 			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
1785 			table = fib6_new_table(net, cfg->fc_table);
1786 		}
1787 	} else {
1788 		table = fib6_new_table(net, cfg->fc_table);
1789 	}
1790 
1791 	if (!table)
1792 		goto out;
1793 
1794 	rt = ip6_dst_alloc(net, NULL,
1795 			   (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
1796 
1797 	if (!rt) {
1798 		err = -ENOMEM;
1799 		goto out;
1800 	}
1801 
1802 	if (cfg->fc_flags & RTF_EXPIRES)
1803 		rt6_set_expires(rt, jiffies +
1804 				clock_t_to_jiffies(cfg->fc_expires));
1805 	else
1806 		rt6_clean_expires(rt);
1807 
1808 	if (cfg->fc_protocol == RTPROT_UNSPEC)
1809 		cfg->fc_protocol = RTPROT_BOOT;
1810 	rt->rt6i_protocol = cfg->fc_protocol;
1811 
1812 	addr_type = ipv6_addr_type(&cfg->fc_dst);
1813 
1814 	if (addr_type & IPV6_ADDR_MULTICAST)
1815 		rt->dst.input = ip6_mc_input;
1816 	else if (cfg->fc_flags & RTF_LOCAL)
1817 		rt->dst.input = ip6_input;
1818 	else
1819 		rt->dst.input = ip6_forward;
1820 
1821 	rt->dst.output = ip6_output;
1822 
1823 	if (cfg->fc_encap) {
1824 		struct lwtunnel_state *lwtstate;
1825 
1826 		err = lwtunnel_build_state(dev, cfg->fc_encap_type,
1827 					   cfg->fc_encap, AF_INET6, cfg,
1828 					   &lwtstate);
1829 		if (err)
1830 			goto out;
1831 		rt->dst.lwtstate = lwtstate_get(lwtstate);
1832 		if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
1833 			rt->dst.lwtstate->orig_output = rt->dst.output;
1834 			rt->dst.output = lwtunnel_output;
1835 		}
1836 		if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
1837 			rt->dst.lwtstate->orig_input = rt->dst.input;
1838 			rt->dst.input = lwtunnel_input;
1839 		}
1840 	}
1841 
1842 	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1843 	rt->rt6i_dst.plen = cfg->fc_dst_len;
1844 	if (rt->rt6i_dst.plen == 128)
1845 		rt->dst.flags |= DST_HOST;
1846 
1847 #ifdef CONFIG_IPV6_SUBTREES
1848 	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1849 	rt->rt6i_src.plen = cfg->fc_src_len;
1850 #endif
1851 
1852 	rt->rt6i_metric = cfg->fc_metric;
1853 
1854 	/* We cannot add true routes via loopback here,
1855 	   they would result in kernel looping; promote them to reject routes
1856 	 */
1857 	if ((cfg->fc_flags & RTF_REJECT) ||
1858 	    (dev && (dev->flags & IFF_LOOPBACK) &&
1859 	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
1860 	     !(cfg->fc_flags & RTF_LOCAL))) {
1861 		/* hold loopback dev/idev if we haven't done so. */
1862 		if (dev != net->loopback_dev) {
1863 			if (dev) {
1864 				dev_put(dev);
1865 				in6_dev_put(idev);
1866 			}
1867 			dev = net->loopback_dev;
1868 			dev_hold(dev);
1869 			idev = in6_dev_get(dev);
1870 			if (!idev) {
1871 				err = -ENODEV;
1872 				goto out;
1873 			}
1874 		}
1875 		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1876 		switch (cfg->fc_type) {
1877 		case RTN_BLACKHOLE:
1878 			rt->dst.error = -EINVAL;
1879 			rt->dst.output = dst_discard_out;
1880 			rt->dst.input = dst_discard;
1881 			break;
1882 		case RTN_PROHIBIT:
1883 			rt->dst.error = -EACCES;
1884 			rt->dst.output = ip6_pkt_prohibit_out;
1885 			rt->dst.input = ip6_pkt_prohibit;
1886 			break;
1887 		case RTN_THROW:
1888 		case RTN_UNREACHABLE:
1889 		default:
1890 			rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
1891 					: (cfg->fc_type == RTN_UNREACHABLE)
1892 					? -EHOSTUNREACH : -ENETUNREACH;
1893 			rt->dst.output = ip6_pkt_discard_out;
1894 			rt->dst.input = ip6_pkt_discard;
1895 			break;
1896 		}
1897 		goto install_route;
1898 	}
1899 
1900 	if (cfg->fc_flags & RTF_GATEWAY) {
1901 		const struct in6_addr *gw_addr;
1902 		int gwa_type;
1903 
1904 		gw_addr = &cfg->fc_gateway;
1905 		gwa_type = ipv6_addr_type(gw_addr);
1906 
1907 		/* if gw_addr is local we will fail to detect this in case
1908 		 * address is still TENTATIVE (DAD in progress). rt6_lookup()
1909 		 * will return already-added prefix route via interface that
1910 		 * prefix route was assigned to, which might be non-loopback.
1911 		 */
1912 		err = -EINVAL;
1913 		if (ipv6_chk_addr_and_flags(net, gw_addr,
1914 					    gwa_type & IPV6_ADDR_LINKLOCAL ?
1915 					    dev : NULL, 0, 0))
1916 			goto out;
1917 
1918 		rt->rt6i_gateway = *gw_addr;
1919 
1920 		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1921 			struct rt6_info *grt;
1922 
1923 			/* IPv6 strictly inhibits using not link-local
1924 			   addresses as nexthop address.
1925 			   Otherwise, router will not able to send redirects.
1926 			   It is very good, but in some (rare!) circumstances
1927 			   (SIT, PtP, NBMA NOARP links) it is handy to allow
1928 			   some exceptions. --ANK
1929 			 */
1930 			if (!(gwa_type & IPV6_ADDR_UNICAST))
1931 				goto out;
1932 
1933 			grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1934 
1935 			err = -EHOSTUNREACH;
1936 			if (!grt)
1937 				goto out;
1938 			if (dev) {
1939 				if (dev != grt->dst.dev) {
1940 					ip6_rt_put(grt);
1941 					goto out;
1942 				}
1943 			} else {
1944 				dev = grt->dst.dev;
1945 				idev = grt->rt6i_idev;
1946 				dev_hold(dev);
1947 				in6_dev_hold(grt->rt6i_idev);
1948 			}
1949 			if (!(grt->rt6i_flags & RTF_GATEWAY))
1950 				err = 0;
1951 			ip6_rt_put(grt);
1952 
1953 			if (err)
1954 				goto out;
1955 		}
1956 		err = -EINVAL;
1957 		if (!dev || (dev->flags & IFF_LOOPBACK))
1958 			goto out;
1959 	}
1960 
1961 	err = -ENODEV;
1962 	if (!dev)
1963 		goto out;
1964 
1965 	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1966 		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1967 			err = -EINVAL;
1968 			goto out;
1969 		}
1970 		rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
1971 		rt->rt6i_prefsrc.plen = 128;
1972 	} else
1973 		rt->rt6i_prefsrc.plen = 0;
1974 
1975 	rt->rt6i_flags = cfg->fc_flags;
1976 
1977 install_route:
1978 	rt->dst.dev = dev;
1979 	rt->rt6i_idev = idev;
1980 	rt->rt6i_table = table;
1981 
1982 	cfg->fc_nlinfo.nl_net = dev_net(dev);
1983 
1984 	return rt;
1985 out:
1986 	if (dev)
1987 		dev_put(dev);
1988 	if (idev)
1989 		in6_dev_put(idev);
1990 	if (rt)
1991 		dst_free(&rt->dst);
1992 
1993 	return ERR_PTR(err);
1994 }
1995 
1996 int ip6_route_add(struct fib6_config *cfg)
1997 {
1998 	struct mx6_config mxc = { .mx = NULL, };
1999 	struct rt6_info *rt;
2000 	int err;
2001 
2002 	rt = ip6_route_info_create(cfg);
2003 	if (IS_ERR(rt)) {
2004 		err = PTR_ERR(rt);
2005 		rt = NULL;
2006 		goto out;
2007 	}
2008 
2009 	err = ip6_convert_metrics(&mxc, cfg);
2010 	if (err)
2011 		goto out;
2012 
2013 	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc);
2014 
2015 	kfree(mxc.mx);
2016 
2017 	return err;
2018 out:
2019 	if (rt)
2020 		dst_free(&rt->dst);
2021 
2022 	return err;
2023 }
2024 
2025 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
2026 {
2027 	int err;
2028 	struct fib6_table *table;
2029 	struct net *net = dev_net(rt->dst.dev);
2030 
2031 	if (rt == net->ipv6.ip6_null_entry ||
2032 	    rt->dst.flags & DST_NOCACHE) {
2033 		err = -ENOENT;
2034 		goto out;
2035 	}
2036 
2037 	table = rt->rt6i_table;
2038 	write_lock_bh(&table->tb6_lock);
2039 	err = fib6_del(rt, info);
2040 	write_unlock_bh(&table->tb6_lock);
2041 
2042 out:
2043 	ip6_rt_put(rt);
2044 	return err;
2045 }
2046 
2047 int ip6_del_rt(struct rt6_info *rt)
2048 {
2049 	struct nl_info info = {
2050 		.nl_net = dev_net(rt->dst.dev),
2051 	};
2052 	return __ip6_del_rt(rt, &info);
2053 }
2054 
2055 static int ip6_route_del(struct fib6_config *cfg)
2056 {
2057 	struct fib6_table *table;
2058 	struct fib6_node *fn;
2059 	struct rt6_info *rt;
2060 	int err = -ESRCH;
2061 
2062 	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
2063 	if (!table)
2064 		return err;
2065 
2066 	read_lock_bh(&table->tb6_lock);
2067 
2068 	fn = fib6_locate(&table->tb6_root,
2069 			 &cfg->fc_dst, cfg->fc_dst_len,
2070 			 &cfg->fc_src, cfg->fc_src_len);
2071 
2072 	if (fn) {
2073 		for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2074 			if ((rt->rt6i_flags & RTF_CACHE) &&
2075 			    !(cfg->fc_flags & RTF_CACHE))
2076 				continue;
2077 			if (cfg->fc_ifindex &&
2078 			    (!rt->dst.dev ||
2079 			     rt->dst.dev->ifindex != cfg->fc_ifindex))
2080 				continue;
2081 			if (cfg->fc_flags & RTF_GATEWAY &&
2082 			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
2083 				continue;
2084 			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
2085 				continue;
2086 			dst_hold(&rt->dst);
2087 			read_unlock_bh(&table->tb6_lock);
2088 
2089 			return __ip6_del_rt(rt, &cfg->fc_nlinfo);
2090 		}
2091 	}
2092 	read_unlock_bh(&table->tb6_lock);
2093 
2094 	return err;
2095 }
2096 
2097 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
2098 {
2099 	struct netevent_redirect netevent;
2100 	struct rt6_info *rt, *nrt = NULL;
2101 	struct ndisc_options ndopts;
2102 	struct inet6_dev *in6_dev;
2103 	struct neighbour *neigh;
2104 	struct rd_msg *msg;
2105 	int optlen, on_link;
2106 	u8 *lladdr;
2107 
2108 	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
2109 	optlen -= sizeof(*msg);
2110 
2111 	if (optlen < 0) {
2112 		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
2113 		return;
2114 	}
2115 
2116 	msg = (struct rd_msg *)icmp6_hdr(skb);
2117 
2118 	if (ipv6_addr_is_multicast(&msg->dest)) {
2119 		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
2120 		return;
2121 	}
2122 
2123 	on_link = 0;
2124 	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
2125 		on_link = 1;
2126 	} else if (ipv6_addr_type(&msg->target) !=
2127 		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
2128 		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
2129 		return;
2130 	}
2131 
2132 	in6_dev = __in6_dev_get(skb->dev);
2133 	if (!in6_dev)
2134 		return;
2135 	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
2136 		return;
2137 
2138 	/* RFC2461 8.1:
2139 	 *	The IP source address of the Redirect MUST be the same as the current
2140 	 *	first-hop router for the specified ICMP Destination Address.
2141 	 */
2142 
2143 	if (!ndisc_parse_options(msg->opt, optlen, &ndopts)) {
2144 		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
2145 		return;
2146 	}
2147 
2148 	lladdr = NULL;
2149 	if (ndopts.nd_opts_tgt_lladdr) {
2150 		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
2151 					     skb->dev);
2152 		if (!lladdr) {
2153 			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
2154 			return;
2155 		}
2156 	}
2157 
2158 	rt = (struct rt6_info *) dst;
2159 	if (rt->rt6i_flags & RTF_REJECT) {
2160 		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
2161 		return;
2162 	}
2163 
2164 	/* Redirect received -> path was valid.
2165 	 * Look, redirects are sent only in response to data packets,
2166 	 * so that this nexthop apparently is reachable. --ANK
2167 	 */
2168 	dst_confirm(&rt->dst);
2169 
2170 	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
2171 	if (!neigh)
2172 		return;
2173 
2174 	/*
2175 	 *	We have finally decided to accept it.
2176 	 */
2177 
2178 	neigh_update(neigh, lladdr, NUD_STALE,
2179 		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
2180 		     NEIGH_UPDATE_F_OVERRIDE|
2181 		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
2182 				     NEIGH_UPDATE_F_ISROUTER))
2183 		     );
2184 
2185 	nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
2186 	if (!nrt)
2187 		goto out;
2188 
2189 	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
2190 	if (on_link)
2191 		nrt->rt6i_flags &= ~RTF_GATEWAY;
2192 
2193 	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
2194 
2195 	if (ip6_ins_rt(nrt))
2196 		goto out;
2197 
2198 	netevent.old = &rt->dst;
2199 	netevent.new = &nrt->dst;
2200 	netevent.daddr = &msg->dest;
2201 	netevent.neigh = neigh;
2202 	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
2203 
2204 	if (rt->rt6i_flags & RTF_CACHE) {
2205 		rt = (struct rt6_info *) dst_clone(&rt->dst);
2206 		ip6_del_rt(rt);
2207 	}
2208 
2209 out:
2210 	neigh_release(neigh);
2211 }
2212 
2213 /*
2214  *	Misc support functions
2215  */
2216 
2217 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
2218 {
2219 	BUG_ON(from->dst.from);
2220 
2221 	rt->rt6i_flags &= ~RTF_EXPIRES;
2222 	dst_hold(&from->dst);
2223 	rt->dst.from = &from->dst;
2224 	dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
2225 }
2226 
2227 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
2228 {
2229 	rt->dst.input = ort->dst.input;
2230 	rt->dst.output = ort->dst.output;
2231 	rt->rt6i_dst = ort->rt6i_dst;
2232 	rt->dst.error = ort->dst.error;
2233 	rt->rt6i_idev = ort->rt6i_idev;
2234 	if (rt->rt6i_idev)
2235 		in6_dev_hold(rt->rt6i_idev);
2236 	rt->dst.lastuse = jiffies;
2237 	rt->rt6i_gateway = ort->rt6i_gateway;
2238 	rt->rt6i_flags = ort->rt6i_flags;
2239 	rt6_set_from(rt, ort);
2240 	rt->rt6i_metric = ort->rt6i_metric;
2241 #ifdef CONFIG_IPV6_SUBTREES
2242 	rt->rt6i_src = ort->rt6i_src;
2243 #endif
2244 	rt->rt6i_prefsrc = ort->rt6i_prefsrc;
2245 	rt->rt6i_table = ort->rt6i_table;
2246 	rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
2247 }
2248 
2249 #ifdef CONFIG_IPV6_ROUTE_INFO
2250 static struct rt6_info *rt6_get_route_info(struct net *net,
2251 					   const struct in6_addr *prefix, int prefixlen,
2252 					   const struct in6_addr *gwaddr, int ifindex)
2253 {
2254 	struct fib6_node *fn;
2255 	struct rt6_info *rt = NULL;
2256 	struct fib6_table *table;
2257 
2258 	table = fib6_get_table(net, RT6_TABLE_INFO);
2259 	if (!table)
2260 		return NULL;
2261 
2262 	read_lock_bh(&table->tb6_lock);
2263 	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0);
2264 	if (!fn)
2265 		goto out;
2266 
2267 	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2268 		if (rt->dst.dev->ifindex != ifindex)
2269 			continue;
2270 		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
2271 			continue;
2272 		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
2273 			continue;
2274 		dst_hold(&rt->dst);
2275 		break;
2276 	}
2277 out:
2278 	read_unlock_bh(&table->tb6_lock);
2279 	return rt;
2280 }
2281 
2282 static struct rt6_info *rt6_add_route_info(struct net *net,
2283 					   const struct in6_addr *prefix, int prefixlen,
2284 					   const struct in6_addr *gwaddr, int ifindex,
2285 					   unsigned int pref)
2286 {
2287 	struct fib6_config cfg = {
2288 		.fc_metric	= IP6_RT_PRIO_USER,
2289 		.fc_ifindex	= ifindex,
2290 		.fc_dst_len	= prefixlen,
2291 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
2292 				  RTF_UP | RTF_PREF(pref),
2293 		.fc_nlinfo.portid = 0,
2294 		.fc_nlinfo.nlh = NULL,
2295 		.fc_nlinfo.nl_net = net,
2296 	};
2297 
2298 	cfg.fc_table = l3mdev_fib_table_by_index(net, ifindex) ? : RT6_TABLE_INFO;
2299 	cfg.fc_dst = *prefix;
2300 	cfg.fc_gateway = *gwaddr;
2301 
2302 	/* We should treat it as a default route if prefix length is 0. */
2303 	if (!prefixlen)
2304 		cfg.fc_flags |= RTF_DEFAULT;
2305 
2306 	ip6_route_add(&cfg);
2307 
2308 	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
2309 }
2310 #endif
2311 
2312 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
2313 {
2314 	struct rt6_info *rt;
2315 	struct fib6_table *table;
2316 
2317 	table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
2318 	if (!table)
2319 		return NULL;
2320 
2321 	read_lock_bh(&table->tb6_lock);
2322 	for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2323 		if (dev == rt->dst.dev &&
2324 		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
2325 		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
2326 			break;
2327 	}
2328 	if (rt)
2329 		dst_hold(&rt->dst);
2330 	read_unlock_bh(&table->tb6_lock);
2331 	return rt;
2332 }
2333 
2334 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
2335 				     struct net_device *dev,
2336 				     unsigned int pref)
2337 {
2338 	struct fib6_config cfg = {
2339 		.fc_table	= l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
2340 		.fc_metric	= IP6_RT_PRIO_USER,
2341 		.fc_ifindex	= dev->ifindex,
2342 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
2343 				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
2344 		.fc_nlinfo.portid = 0,
2345 		.fc_nlinfo.nlh = NULL,
2346 		.fc_nlinfo.nl_net = dev_net(dev),
2347 	};
2348 
2349 	cfg.fc_gateway = *gwaddr;
2350 
2351 	ip6_route_add(&cfg);
2352 
2353 	return rt6_get_dflt_router(gwaddr, dev);
2354 }
2355 
2356 void rt6_purge_dflt_routers(struct net *net)
2357 {
2358 	struct rt6_info *rt;
2359 	struct fib6_table *table;
2360 
2361 	/* NOTE: Keep consistent with rt6_get_dflt_router */
2362 	table = fib6_get_table(net, RT6_TABLE_DFLT);
2363 	if (!table)
2364 		return;
2365 
2366 restart:
2367 	read_lock_bh(&table->tb6_lock);
2368 	for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2369 		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
2370 		    (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
2371 			dst_hold(&rt->dst);
2372 			read_unlock_bh(&table->tb6_lock);
2373 			ip6_del_rt(rt);
2374 			goto restart;
2375 		}
2376 	}
2377 	read_unlock_bh(&table->tb6_lock);
2378 }
2379 
2380 static void rtmsg_to_fib6_config(struct net *net,
2381 				 struct in6_rtmsg *rtmsg,
2382 				 struct fib6_config *cfg)
2383 {
2384 	memset(cfg, 0, sizeof(*cfg));
2385 
2386 	cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
2387 			 : RT6_TABLE_MAIN;
2388 	cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
2389 	cfg->fc_metric = rtmsg->rtmsg_metric;
2390 	cfg->fc_expires = rtmsg->rtmsg_info;
2391 	cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
2392 	cfg->fc_src_len = rtmsg->rtmsg_src_len;
2393 	cfg->fc_flags = rtmsg->rtmsg_flags;
2394 
2395 	cfg->fc_nlinfo.nl_net = net;
2396 
2397 	cfg->fc_dst = rtmsg->rtmsg_dst;
2398 	cfg->fc_src = rtmsg->rtmsg_src;
2399 	cfg->fc_gateway = rtmsg->rtmsg_gateway;
2400 }
2401 
2402 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
2403 {
2404 	struct fib6_config cfg;
2405 	struct in6_rtmsg rtmsg;
2406 	int err;
2407 
2408 	switch (cmd) {
2409 	case SIOCADDRT:		/* Add a route */
2410 	case SIOCDELRT:		/* Delete a route */
2411 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
2412 			return -EPERM;
2413 		err = copy_from_user(&rtmsg, arg,
2414 				     sizeof(struct in6_rtmsg));
2415 		if (err)
2416 			return -EFAULT;
2417 
2418 		rtmsg_to_fib6_config(net, &rtmsg, &cfg);
2419 
2420 		rtnl_lock();
2421 		switch (cmd) {
2422 		case SIOCADDRT:
2423 			err = ip6_route_add(&cfg);
2424 			break;
2425 		case SIOCDELRT:
2426 			err = ip6_route_del(&cfg);
2427 			break;
2428 		default:
2429 			err = -EINVAL;
2430 		}
2431 		rtnl_unlock();
2432 
2433 		return err;
2434 	}
2435 
2436 	return -EINVAL;
2437 }
2438 
2439 /*
2440  *	Drop the packet on the floor
2441  */
2442 
2443 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2444 {
2445 	int type;
2446 	struct dst_entry *dst = skb_dst(skb);
2447 	switch (ipstats_mib_noroutes) {
2448 	case IPSTATS_MIB_INNOROUTES:
2449 		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2450 		if (type == IPV6_ADDR_ANY) {
2451 			IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2452 				      IPSTATS_MIB_INADDRERRORS);
2453 			break;
2454 		}
2455 		/* FALLTHROUGH */
2456 	case IPSTATS_MIB_OUTNOROUTES:
2457 		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2458 			      ipstats_mib_noroutes);
2459 		break;
2460 	}
2461 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2462 	kfree_skb(skb);
2463 	return 0;
2464 }
2465 
2466 static int ip6_pkt_discard(struct sk_buff *skb)
2467 {
2468 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2469 }
2470 
2471 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
2472 {
2473 	skb->dev = skb_dst(skb)->dev;
2474 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2475 }
2476 
2477 static int ip6_pkt_prohibit(struct sk_buff *skb)
2478 {
2479 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2480 }
2481 
2482 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
2483 {
2484 	skb->dev = skb_dst(skb)->dev;
2485 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2486 }
2487 
2488 /*
2489  *	Allocate a dst for local (unicast / anycast) address.
2490  */
2491 
2492 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2493 				    const struct in6_addr *addr,
2494 				    bool anycast)
2495 {
2496 	u32 tb_id;
2497 	struct net *net = dev_net(idev->dev);
2498 	struct rt6_info *rt = ip6_dst_alloc(net, net->loopback_dev,
2499 					    DST_NOCOUNT);
2500 	if (!rt)
2501 		return ERR_PTR(-ENOMEM);
2502 
2503 	in6_dev_hold(idev);
2504 
2505 	rt->dst.flags |= DST_HOST;
2506 	rt->dst.input = ip6_input;
2507 	rt->dst.output = ip6_output;
2508 	rt->rt6i_idev = idev;
2509 
2510 	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2511 	if (anycast)
2512 		rt->rt6i_flags |= RTF_ANYCAST;
2513 	else
2514 		rt->rt6i_flags |= RTF_LOCAL;
2515 
2516 	rt->rt6i_gateway  = *addr;
2517 	rt->rt6i_dst.addr = *addr;
2518 	rt->rt6i_dst.plen = 128;
2519 	tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
2520 	rt->rt6i_table = fib6_get_table(net, tb_id);
2521 	rt->dst.flags |= DST_NOCACHE;
2522 
2523 	atomic_set(&rt->dst.__refcnt, 1);
2524 
2525 	return rt;
2526 }
2527 
2528 int ip6_route_get_saddr(struct net *net,
2529 			struct rt6_info *rt,
2530 			const struct in6_addr *daddr,
2531 			unsigned int prefs,
2532 			struct in6_addr *saddr)
2533 {
2534 	struct inet6_dev *idev =
2535 		rt ? ip6_dst_idev((struct dst_entry *)rt) : NULL;
2536 	int err = 0;
2537 	if (rt && rt->rt6i_prefsrc.plen)
2538 		*saddr = rt->rt6i_prefsrc.addr;
2539 	else
2540 		err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2541 					 daddr, prefs, saddr);
2542 	return err;
2543 }
2544 
/* remove deleted ip from prefsrc entries */
/* Argument bundle handed to fib6_remove_prefsrc() through fib6_clean_all(). */
struct arg_dev_net_ip {
	struct net_device *dev;	/* only routes on this device; NULL matches any */
	struct net *net;	/* namespace, used to skip its ip6_null_entry */
	struct in6_addr *addr;	/* address compared against rt6i_prefsrc.addr */
};
2551 
2552 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2553 {
2554 	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2555 	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2556 	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2557 
2558 	if (((void *)rt->dst.dev == dev || !dev) &&
2559 	    rt != net->ipv6.ip6_null_entry &&
2560 	    ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2561 		/* remove prefsrc entry */
2562 		rt->rt6i_prefsrc.plen = 0;
2563 	}
2564 	return 0;
2565 }
2566 
2567 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2568 {
2569 	struct net *net = dev_net(ifp->idev->dev);
2570 	struct arg_dev_net_ip adni = {
2571 		.dev = ifp->idev->dev,
2572 		.net = net,
2573 		.addr = &ifp->addr,
2574 	};
2575 	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
2576 }
2577 
/* Flag combinations tested (all bits must be set) by fib6_clean_tohost(). */
#define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)
2580 
2581 /* Remove routers and update dst entries when gateway turn into host. */
2582 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
2583 {
2584 	struct in6_addr *gateway = (struct in6_addr *)arg;
2585 
2586 	if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) ||
2587 	     ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) &&
2588 	     ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
2589 		return -1;
2590 	}
2591 	return 0;
2592 }
2593 
2594 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
2595 {
2596 	fib6_clean_all(net, fib6_clean_tohost, gateway);
2597 }
2598 
/* Argument bundle handed to fib6_ifdown() by rt6_ifdown(). */
struct arg_dev_net {
	struct net_device *dev;	/* device going down; NULL matches every device */
	struct net *net;	/* namespace, used to skip its ip6_null_entry */
};
2603 
2604 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2605 {
2606 	const struct arg_dev_net *adn = arg;
2607 	const struct net_device *dev = adn->dev;
2608 
2609 	if ((rt->dst.dev == dev || !dev) &&
2610 	    rt != adn->net->ipv6.ip6_null_entry)
2611 		return -1;
2612 
2613 	return 0;
2614 }
2615 
2616 void rt6_ifdown(struct net *net, struct net_device *dev)
2617 {
2618 	struct arg_dev_net adn = {
2619 		.dev = dev,
2620 		.net = net,
2621 	};
2622 
2623 	fib6_clean_all(net, fib6_ifdown, &adn);
2624 	icmp6_clean_all(fib6_ifdown, &adn);
2625 	if (dev)
2626 		rt6_uncached_list_flush_dev(net, dev);
2627 }
2628 
/* Argument bundle handed to rt6_mtu_change_route() by rt6_mtu_change(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* the new MTU value */
};
2633 
2634 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2635 {
2636 	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2637 	struct inet6_dev *idev;
2638 
2639 	/* In IPv6 pmtu discovery is not optional,
2640 	   so that RTAX_MTU lock cannot disable it.
2641 	   We still use this lock to block changes
2642 	   caused by addrconf/ndisc.
2643 	*/
2644 
2645 	idev = __in6_dev_get(arg->dev);
2646 	if (!idev)
2647 		return 0;
2648 
2649 	/* For administrative MTU increase, there is no way to discover
2650 	   IPv6 PMTU increase, so PMTU increase should be updated here.
2651 	   Since RFC 1981 doesn't include administrative MTU increase
2652 	   update PMTU increase is a MUST. (i.e. jumbo frame)
2653 	 */
2654 	/*
2655 	   If new MTU is less than route PMTU, this new MTU will be the
2656 	   lowest MTU in the path, update the route PMTU to reflect PMTU
2657 	   decreases; if new MTU is greater than route PMTU, and the
2658 	   old MTU is the lowest MTU in the path, update the route PMTU
2659 	   to reflect the increase. In this case if the other nodes' MTU
2660 	   also have the lowest MTU, TOO BIG MESSAGE will be lead to
2661 	   PMTU discouvery.
2662 	 */
2663 	if (rt->dst.dev == arg->dev &&
2664 	    !dst_metric_locked(&rt->dst, RTAX_MTU)) {
2665 		if (rt->rt6i_flags & RTF_CACHE) {
2666 			/* For RTF_CACHE with rt6i_pmtu == 0
2667 			 * (i.e. a redirected route),
2668 			 * the metrics of its rt->dst.from has already
2669 			 * been updated.
2670 			 */
2671 			if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu)
2672 				rt->rt6i_pmtu = arg->mtu;
2673 		} else if (dst_mtu(&rt->dst) >= arg->mtu ||
2674 			   (dst_mtu(&rt->dst) < arg->mtu &&
2675 			    dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
2676 			dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2677 		}
2678 	}
2679 	return 0;
2680 }
2681 
2682 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
2683 {
2684 	struct rt6_mtu_change_arg arg = {
2685 		.dev = dev,
2686 		.mtu = mtu,
2687 	};
2688 
2689 	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
2690 }
2691 
2692 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2693 	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2694 	[RTA_OIF]               = { .type = NLA_U32 },
2695 	[RTA_IIF]		= { .type = NLA_U32 },
2696 	[RTA_PRIORITY]          = { .type = NLA_U32 },
2697 	[RTA_METRICS]           = { .type = NLA_NESTED },
2698 	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
2699 	[RTA_PREF]              = { .type = NLA_U8 },
2700 	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
2701 	[RTA_ENCAP]		= { .type = NLA_NESTED },
2702 };
2703 
2704 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2705 			      struct fib6_config *cfg)
2706 {
2707 	struct rtmsg *rtm;
2708 	struct nlattr *tb[RTA_MAX+1];
2709 	unsigned int pref;
2710 	int err;
2711 
2712 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2713 	if (err < 0)
2714 		goto errout;
2715 
2716 	err = -EINVAL;
2717 	rtm = nlmsg_data(nlh);
2718 	memset(cfg, 0, sizeof(*cfg));
2719 
2720 	cfg->fc_table = rtm->rtm_table;
2721 	cfg->fc_dst_len = rtm->rtm_dst_len;
2722 	cfg->fc_src_len = rtm->rtm_src_len;
2723 	cfg->fc_flags = RTF_UP;
2724 	cfg->fc_protocol = rtm->rtm_protocol;
2725 	cfg->fc_type = rtm->rtm_type;
2726 
2727 	if (rtm->rtm_type == RTN_UNREACHABLE ||
2728 	    rtm->rtm_type == RTN_BLACKHOLE ||
2729 	    rtm->rtm_type == RTN_PROHIBIT ||
2730 	    rtm->rtm_type == RTN_THROW)
2731 		cfg->fc_flags |= RTF_REJECT;
2732 
2733 	if (rtm->rtm_type == RTN_LOCAL)
2734 		cfg->fc_flags |= RTF_LOCAL;
2735 
2736 	if (rtm->rtm_flags & RTM_F_CLONED)
2737 		cfg->fc_flags |= RTF_CACHE;
2738 
2739 	cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
2740 	cfg->fc_nlinfo.nlh = nlh;
2741 	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2742 
2743 	if (tb[RTA_GATEWAY]) {
2744 		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
2745 		cfg->fc_flags |= RTF_GATEWAY;
2746 	}
2747 
2748 	if (tb[RTA_DST]) {
2749 		int plen = (rtm->rtm_dst_len + 7) >> 3;
2750 
2751 		if (nla_len(tb[RTA_DST]) < plen)
2752 			goto errout;
2753 
2754 		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2755 	}
2756 
2757 	if (tb[RTA_SRC]) {
2758 		int plen = (rtm->rtm_src_len + 7) >> 3;
2759 
2760 		if (nla_len(tb[RTA_SRC]) < plen)
2761 			goto errout;
2762 
2763 		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2764 	}
2765 
2766 	if (tb[RTA_PREFSRC])
2767 		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
2768 
2769 	if (tb[RTA_OIF])
2770 		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2771 
2772 	if (tb[RTA_PRIORITY])
2773 		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2774 
2775 	if (tb[RTA_METRICS]) {
2776 		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2777 		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2778 	}
2779 
2780 	if (tb[RTA_TABLE])
2781 		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2782 
2783 	if (tb[RTA_MULTIPATH]) {
2784 		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
2785 		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
2786 	}
2787 
2788 	if (tb[RTA_PREF]) {
2789 		pref = nla_get_u8(tb[RTA_PREF]);
2790 		if (pref != ICMPV6_ROUTER_PREF_LOW &&
2791 		    pref != ICMPV6_ROUTER_PREF_HIGH)
2792 			pref = ICMPV6_ROUTER_PREF_MEDIUM;
2793 		cfg->fc_flags |= RTF_PREF(pref);
2794 	}
2795 
2796 	if (tb[RTA_ENCAP])
2797 		cfg->fc_encap = tb[RTA_ENCAP];
2798 
2799 	if (tb[RTA_ENCAP_TYPE])
2800 		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
2801 
2802 	err = 0;
2803 errout:
2804 	return err;
2805 }
2806 
/* One nexthop of a multipath request, queued by ip6_route_info_append(). */
struct rt6_nh {
	struct rt6_info *rt6_info;	/* route built for this nexthop; set to NULL once handed to __ip6_ins_rt() */
	struct fib6_config r_cfg;	/* per-nexthop copy of the request config */
	struct mx6_config mxc;		/* metrics converted from r_cfg */
	struct list_head next;		/* link in the caller's nexthop list */
};
2813 
2814 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
2815 {
2816 	struct rt6_nh *nh;
2817 
2818 	list_for_each_entry(nh, rt6_nh_list, next) {
2819 		pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6 nexthop %pI6 ifi %d\n",
2820 		        &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
2821 		        nh->r_cfg.fc_ifindex);
2822 	}
2823 }
2824 
2825 static int ip6_route_info_append(struct list_head *rt6_nh_list,
2826 				 struct rt6_info *rt, struct fib6_config *r_cfg)
2827 {
2828 	struct rt6_nh *nh;
2829 	struct rt6_info *rtnh;
2830 	int err = -EEXIST;
2831 
2832 	list_for_each_entry(nh, rt6_nh_list, next) {
2833 		/* check if rt6_info already exists */
2834 		rtnh = nh->rt6_info;
2835 
2836 		if (rtnh->dst.dev == rt->dst.dev &&
2837 		    rtnh->rt6i_idev == rt->rt6i_idev &&
2838 		    ipv6_addr_equal(&rtnh->rt6i_gateway,
2839 				    &rt->rt6i_gateway))
2840 			return err;
2841 	}
2842 
2843 	nh = kzalloc(sizeof(*nh), GFP_KERNEL);
2844 	if (!nh)
2845 		return -ENOMEM;
2846 	nh->rt6_info = rt;
2847 	err = ip6_convert_metrics(&nh->mxc, r_cfg);
2848 	if (err) {
2849 		kfree(nh);
2850 		return err;
2851 	}
2852 	memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
2853 	list_add_tail(&nh->next, rt6_nh_list);
2854 
2855 	return 0;
2856 }
2857 
2858 static int ip6_route_multipath_add(struct fib6_config *cfg)
2859 {
2860 	struct fib6_config r_cfg;
2861 	struct rtnexthop *rtnh;
2862 	struct rt6_info *rt;
2863 	struct rt6_nh *err_nh;
2864 	struct rt6_nh *nh, *nh_safe;
2865 	int remaining;
2866 	int attrlen;
2867 	int err = 1;
2868 	int nhn = 0;
2869 	int replace = (cfg->fc_nlinfo.nlh &&
2870 		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
2871 	LIST_HEAD(rt6_nh_list);
2872 
2873 	remaining = cfg->fc_mp_len;
2874 	rtnh = (struct rtnexthop *)cfg->fc_mp;
2875 
2876 	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
2877 	 * rt6_info structs per nexthop
2878 	 */
2879 	while (rtnh_ok(rtnh, remaining)) {
2880 		memcpy(&r_cfg, cfg, sizeof(*cfg));
2881 		if (rtnh->rtnh_ifindex)
2882 			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
2883 
2884 		attrlen = rtnh_attrlen(rtnh);
2885 		if (attrlen > 0) {
2886 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
2887 
2888 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
2889 			if (nla) {
2890 				r_cfg.fc_gateway = nla_get_in6_addr(nla);
2891 				r_cfg.fc_flags |= RTF_GATEWAY;
2892 			}
2893 			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
2894 			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
2895 			if (nla)
2896 				r_cfg.fc_encap_type = nla_get_u16(nla);
2897 		}
2898 
2899 		rt = ip6_route_info_create(&r_cfg);
2900 		if (IS_ERR(rt)) {
2901 			err = PTR_ERR(rt);
2902 			rt = NULL;
2903 			goto cleanup;
2904 		}
2905 
2906 		err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
2907 		if (err) {
2908 			dst_free(&rt->dst);
2909 			goto cleanup;
2910 		}
2911 
2912 		rtnh = rtnh_next(rtnh, &remaining);
2913 	}
2914 
2915 	err_nh = NULL;
2916 	list_for_each_entry(nh, &rt6_nh_list, next) {
2917 		err = __ip6_ins_rt(nh->rt6_info, &cfg->fc_nlinfo, &nh->mxc);
2918 		/* nh->rt6_info is used or freed at this point, reset to NULL*/
2919 		nh->rt6_info = NULL;
2920 		if (err) {
2921 			if (replace && nhn)
2922 				ip6_print_replace_route_err(&rt6_nh_list);
2923 			err_nh = nh;
2924 			goto add_errout;
2925 		}
2926 
2927 		/* Because each route is added like a single route we remove
2928 		 * these flags after the first nexthop: if there is a collision,
2929 		 * we have already failed to add the first nexthop:
2930 		 * fib6_add_rt2node() has rejected it; when replacing, old
2931 		 * nexthops have been replaced by first new, the rest should
2932 		 * be added to it.
2933 		 */
2934 		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
2935 						     NLM_F_REPLACE);
2936 		nhn++;
2937 	}
2938 
2939 	goto cleanup;
2940 
2941 add_errout:
2942 	/* Delete routes that were already added */
2943 	list_for_each_entry(nh, &rt6_nh_list, next) {
2944 		if (err_nh == nh)
2945 			break;
2946 		ip6_route_del(&nh->r_cfg);
2947 	}
2948 
2949 cleanup:
2950 	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
2951 		if (nh->rt6_info)
2952 			dst_free(&nh->rt6_info->dst);
2953 		kfree(nh->mxc.mx);
2954 		list_del(&nh->next);
2955 		kfree(nh);
2956 	}
2957 
2958 	return err;
2959 }
2960 
2961 static int ip6_route_multipath_del(struct fib6_config *cfg)
2962 {
2963 	struct fib6_config r_cfg;
2964 	struct rtnexthop *rtnh;
2965 	int remaining;
2966 	int attrlen;
2967 	int err = 1, last_err = 0;
2968 
2969 	remaining = cfg->fc_mp_len;
2970 	rtnh = (struct rtnexthop *)cfg->fc_mp;
2971 
2972 	/* Parse a Multipath Entry */
2973 	while (rtnh_ok(rtnh, remaining)) {
2974 		memcpy(&r_cfg, cfg, sizeof(*cfg));
2975 		if (rtnh->rtnh_ifindex)
2976 			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
2977 
2978 		attrlen = rtnh_attrlen(rtnh);
2979 		if (attrlen > 0) {
2980 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
2981 
2982 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
2983 			if (nla) {
2984 				nla_memcpy(&r_cfg.fc_gateway, nla, 16);
2985 				r_cfg.fc_flags |= RTF_GATEWAY;
2986 			}
2987 		}
2988 		err = ip6_route_del(&r_cfg);
2989 		if (err)
2990 			last_err = err;
2991 
2992 		rtnh = rtnh_next(rtnh, &remaining);
2993 	}
2994 
2995 	return last_err;
2996 }
2997 
2998 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh)
2999 {
3000 	struct fib6_config cfg;
3001 	int err;
3002 
3003 	err = rtm_to_fib6_config(skb, nlh, &cfg);
3004 	if (err < 0)
3005 		return err;
3006 
3007 	if (cfg.fc_mp)
3008 		return ip6_route_multipath_del(&cfg);
3009 	else
3010 		return ip6_route_del(&cfg);
3011 }
3012 
3013 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
3014 {
3015 	struct fib6_config cfg;
3016 	int err;
3017 
3018 	err = rtm_to_fib6_config(skb, nlh, &cfg);
3019 	if (err < 0)
3020 		return err;
3021 
3022 	if (cfg.fc_mp)
3023 		return ip6_route_multipath_add(&cfg);
3024 	else
3025 		return ip6_route_add(&cfg);
3026 }
3027 
3028 static inline size_t rt6_nlmsg_size(struct rt6_info *rt)
3029 {
3030 	return NLMSG_ALIGN(sizeof(struct rtmsg))
3031 	       + nla_total_size(16) /* RTA_SRC */
3032 	       + nla_total_size(16) /* RTA_DST */
3033 	       + nla_total_size(16) /* RTA_GATEWAY */
3034 	       + nla_total_size(16) /* RTA_PREFSRC */
3035 	       + nla_total_size(4) /* RTA_TABLE */
3036 	       + nla_total_size(4) /* RTA_IIF */
3037 	       + nla_total_size(4) /* RTA_OIF */
3038 	       + nla_total_size(4) /* RTA_PRIORITY */
3039 	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
3040 	       + nla_total_size(sizeof(struct rta_cacheinfo))
3041 	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
3042 	       + nla_total_size(1) /* RTA_PREF */
3043 	       + lwtunnel_get_encap_size(rt->dst.lwtstate);
3044 }
3045 
3046 static int rt6_fill_node(struct net *net,
3047 			 struct sk_buff *skb, struct rt6_info *rt,
3048 			 struct in6_addr *dst, struct in6_addr *src,
3049 			 int iif, int type, u32 portid, u32 seq,
3050 			 int prefix, int nowait, unsigned int flags)
3051 {
3052 	u32 metrics[RTAX_MAX];
3053 	struct rtmsg *rtm;
3054 	struct nlmsghdr *nlh;
3055 	long expires;
3056 	u32 table;
3057 
3058 	if (prefix) {	/* user wants prefix routes only */
3059 		if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
3060 			/* success since this is not a prefix route */
3061 			return 1;
3062 		}
3063 	}
3064 
3065 	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
3066 	if (!nlh)
3067 		return -EMSGSIZE;
3068 
3069 	rtm = nlmsg_data(nlh);
3070 	rtm->rtm_family = AF_INET6;
3071 	rtm->rtm_dst_len = rt->rt6i_dst.plen;
3072 	rtm->rtm_src_len = rt->rt6i_src.plen;
3073 	rtm->rtm_tos = 0;
3074 	if (rt->rt6i_table)
3075 		table = rt->rt6i_table->tb6_id;
3076 	else
3077 		table = RT6_TABLE_UNSPEC;
3078 	rtm->rtm_table = table;
3079 	if (nla_put_u32(skb, RTA_TABLE, table))
3080 		goto nla_put_failure;
3081 	if (rt->rt6i_flags & RTF_REJECT) {
3082 		switch (rt->dst.error) {
3083 		case -EINVAL:
3084 			rtm->rtm_type = RTN_BLACKHOLE;
3085 			break;
3086 		case -EACCES:
3087 			rtm->rtm_type = RTN_PROHIBIT;
3088 			break;
3089 		case -EAGAIN:
3090 			rtm->rtm_type = RTN_THROW;
3091 			break;
3092 		default:
3093 			rtm->rtm_type = RTN_UNREACHABLE;
3094 			break;
3095 		}
3096 	}
3097 	else if (rt->rt6i_flags & RTF_LOCAL)
3098 		rtm->rtm_type = RTN_LOCAL;
3099 	else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
3100 		rtm->rtm_type = RTN_LOCAL;
3101 	else
3102 		rtm->rtm_type = RTN_UNICAST;
3103 	rtm->rtm_flags = 0;
3104 	if (!netif_carrier_ok(rt->dst.dev)) {
3105 		rtm->rtm_flags |= RTNH_F_LINKDOWN;
3106 		if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
3107 			rtm->rtm_flags |= RTNH_F_DEAD;
3108 	}
3109 	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
3110 	rtm->rtm_protocol = rt->rt6i_protocol;
3111 	if (rt->rt6i_flags & RTF_DYNAMIC)
3112 		rtm->rtm_protocol = RTPROT_REDIRECT;
3113 	else if (rt->rt6i_flags & RTF_ADDRCONF) {
3114 		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ROUTEINFO))
3115 			rtm->rtm_protocol = RTPROT_RA;
3116 		else
3117 			rtm->rtm_protocol = RTPROT_KERNEL;
3118 	}
3119 
3120 	if (rt->rt6i_flags & RTF_CACHE)
3121 		rtm->rtm_flags |= RTM_F_CLONED;
3122 
3123 	if (dst) {
3124 		if (nla_put_in6_addr(skb, RTA_DST, dst))
3125 			goto nla_put_failure;
3126 		rtm->rtm_dst_len = 128;
3127 	} else if (rtm->rtm_dst_len)
3128 		if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
3129 			goto nla_put_failure;
3130 #ifdef CONFIG_IPV6_SUBTREES
3131 	if (src) {
3132 		if (nla_put_in6_addr(skb, RTA_SRC, src))
3133 			goto nla_put_failure;
3134 		rtm->rtm_src_len = 128;
3135 	} else if (rtm->rtm_src_len &&
3136 		   nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
3137 		goto nla_put_failure;
3138 #endif
3139 	if (iif) {
3140 #ifdef CONFIG_IPV6_MROUTE
3141 		if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
3142 			int err = ip6mr_get_route(net, skb, rtm, nowait);
3143 			if (err <= 0) {
3144 				if (!nowait) {
3145 					if (err == 0)
3146 						return 0;
3147 					goto nla_put_failure;
3148 				} else {
3149 					if (err == -EMSGSIZE)
3150 						goto nla_put_failure;
3151 				}
3152 			}
3153 		} else
3154 #endif
3155 			if (nla_put_u32(skb, RTA_IIF, iif))
3156 				goto nla_put_failure;
3157 	} else if (dst) {
3158 		struct in6_addr saddr_buf;
3159 		if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
3160 		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
3161 			goto nla_put_failure;
3162 	}
3163 
3164 	if (rt->rt6i_prefsrc.plen) {
3165 		struct in6_addr saddr_buf;
3166 		saddr_buf = rt->rt6i_prefsrc.addr;
3167 		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
3168 			goto nla_put_failure;
3169 	}
3170 
3171 	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
3172 	if (rt->rt6i_pmtu)
3173 		metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
3174 	if (rtnetlink_put_metrics(skb, metrics) < 0)
3175 		goto nla_put_failure;
3176 
3177 	if (rt->rt6i_flags & RTF_GATEWAY) {
3178 		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
3179 			goto nla_put_failure;
3180 	}
3181 
3182 	if (rt->dst.dev &&
3183 	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
3184 		goto nla_put_failure;
3185 	if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
3186 		goto nla_put_failure;
3187 
3188 	expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
3189 
3190 	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
3191 		goto nla_put_failure;
3192 
3193 	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
3194 		goto nla_put_failure;
3195 
3196 	lwtunnel_fill_encap(skb, rt->dst.lwtstate);
3197 
3198 	nlmsg_end(skb, nlh);
3199 	return 0;
3200 
3201 nla_put_failure:
3202 	nlmsg_cancel(skb, nlh);
3203 	return -EMSGSIZE;
3204 }
3205 
3206 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
3207 {
3208 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
3209 	int prefix;
3210 
3211 	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
3212 		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
3213 		prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
3214 	} else
3215 		prefix = 0;
3216 
3217 	return rt6_fill_node(arg->net,
3218 		     arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
3219 		     NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
3220 		     prefix, 0, NLM_F_MULTI);
3221 }
3222 
3223 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
3224 {
3225 	struct net *net = sock_net(in_skb->sk);
3226 	struct nlattr *tb[RTA_MAX+1];
3227 	struct rt6_info *rt;
3228 	struct sk_buff *skb;
3229 	struct rtmsg *rtm;
3230 	struct flowi6 fl6;
3231 	int err, iif = 0, oif = 0;
3232 
3233 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
3234 	if (err < 0)
3235 		goto errout;
3236 
3237 	err = -EINVAL;
3238 	memset(&fl6, 0, sizeof(fl6));
3239 
3240 	if (tb[RTA_SRC]) {
3241 		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
3242 			goto errout;
3243 
3244 		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
3245 	}
3246 
3247 	if (tb[RTA_DST]) {
3248 		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
3249 			goto errout;
3250 
3251 		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
3252 	}
3253 
3254 	if (tb[RTA_IIF])
3255 		iif = nla_get_u32(tb[RTA_IIF]);
3256 
3257 	if (tb[RTA_OIF])
3258 		oif = nla_get_u32(tb[RTA_OIF]);
3259 
3260 	if (tb[RTA_MARK])
3261 		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
3262 
3263 	if (iif) {
3264 		struct net_device *dev;
3265 		int flags = 0;
3266 
3267 		dev = __dev_get_by_index(net, iif);
3268 		if (!dev) {
3269 			err = -ENODEV;
3270 			goto errout;
3271 		}
3272 
3273 		fl6.flowi6_iif = iif;
3274 
3275 		if (!ipv6_addr_any(&fl6.saddr))
3276 			flags |= RT6_LOOKUP_F_HAS_SADDR;
3277 
3278 		rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6,
3279 							       flags);
3280 	} else {
3281 		fl6.flowi6_oif = oif;
3282 
3283 		if (netif_index_is_l3_master(net, oif)) {
3284 			fl6.flowi6_flags = FLOWI_FLAG_L3MDEV_SRC |
3285 					   FLOWI_FLAG_SKIP_NH_OIF;
3286 		}
3287 
3288 		rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
3289 	}
3290 
3291 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3292 	if (!skb) {
3293 		ip6_rt_put(rt);
3294 		err = -ENOBUFS;
3295 		goto errout;
3296 	}
3297 
3298 	/* Reserve room for dummy headers, this skb can pass
3299 	   through good chunk of routing engine.
3300 	 */
3301 	skb_reset_mac_header(skb);
3302 	skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
3303 
3304 	skb_dst_set(skb, &rt->dst);
3305 
3306 	err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
3307 			    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
3308 			    nlh->nlmsg_seq, 0, 0, 0);
3309 	if (err < 0) {
3310 		kfree_skb(skb);
3311 		goto errout;
3312 	}
3313 
3314 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3315 errout:
3316 	return err;
3317 }
3318 
3319 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
3320 		     unsigned int nlm_flags)
3321 {
3322 	struct sk_buff *skb;
3323 	struct net *net = info->nl_net;
3324 	u32 seq;
3325 	int err;
3326 
3327 	err = -ENOBUFS;
3328 	seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3329 
3330 	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3331 	if (!skb)
3332 		goto errout;
3333 
3334 	err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
3335 				event, info->portid, seq, 0, 0, nlm_flags);
3336 	if (err < 0) {
3337 		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
3338 		WARN_ON(err == -EMSGSIZE);
3339 		kfree_skb(skb);
3340 		goto errout;
3341 	}
3342 	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3343 		    info->nlh, gfp_any());
3344 	return;
3345 errout:
3346 	if (err < 0)
3347 		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
3348 }
3349 
3350 static int ip6_route_dev_notify(struct notifier_block *this,
3351 				unsigned long event, void *ptr)
3352 {
3353 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
3354 	struct net *net = dev_net(dev);
3355 
3356 	if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
3357 		net->ipv6.ip6_null_entry->dst.dev = dev;
3358 		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
3359 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3360 		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
3361 		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
3362 		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
3363 		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
3364 #endif
3365 	}
3366 
3367 	return NOTIFY_OK;
3368 }
3369 
3370 /*
3371  *	/proc
3372  */
3373 
3374 #ifdef CONFIG_PROC_FS
3375 
3376 static const struct file_operations ipv6_route_proc_fops = {
3377 	.owner		= THIS_MODULE,
3378 	.open		= ipv6_route_open,
3379 	.read		= seq_read,
3380 	.llseek		= seq_lseek,
3381 	.release	= seq_release_net,
3382 };
3383 
3384 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
3385 {
3386 	struct net *net = (struct net *)seq->private;
3387 	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
3388 		   net->ipv6.rt6_stats->fib_nodes,
3389 		   net->ipv6.rt6_stats->fib_route_nodes,
3390 		   net->ipv6.rt6_stats->fib_rt_alloc,
3391 		   net->ipv6.rt6_stats->fib_rt_entries,
3392 		   net->ipv6.rt6_stats->fib_rt_cache,
3393 		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
3394 		   net->ipv6.rt6_stats->fib_discarded_routes);
3395 
3396 	return 0;
3397 }
3398 
3399 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
3400 {
3401 	return single_open_net(inode, file, rt6_stats_seq_show);
3402 }
3403 
3404 static const struct file_operations rt6_stats_seq_fops = {
3405 	.owner	 = THIS_MODULE,
3406 	.open	 = rt6_stats_seq_open,
3407 	.read	 = seq_read,
3408 	.llseek	 = seq_lseek,
3409 	.release = single_release_net,
3410 };
3411 #endif	/* CONFIG_PROC_FS */
3412 
3413 #ifdef CONFIG_SYSCTL
3414 
3415 static
3416 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
3417 			      void __user *buffer, size_t *lenp, loff_t *ppos)
3418 {
3419 	struct net *net;
3420 	int delay;
3421 	if (!write)
3422 		return -EINVAL;
3423 
3424 	net = (struct net *)ctl->extra1;
3425 	delay = net->ipv6.sysctl.flush_delay;
3426 	proc_dointvec(ctl, write, buffer, lenp, ppos);
3427 	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
3428 	return 0;
3429 }
3430 
3431 struct ctl_table ipv6_route_table_template[] = {
3432 	{
3433 		.procname	=	"flush",
3434 		.data		=	&init_net.ipv6.sysctl.flush_delay,
3435 		.maxlen		=	sizeof(int),
3436 		.mode		=	0200,
3437 		.proc_handler	=	ipv6_sysctl_rtcache_flush
3438 	},
3439 	{
3440 		.procname	=	"gc_thresh",
3441 		.data		=	&ip6_dst_ops_template.gc_thresh,
3442 		.maxlen		=	sizeof(int),
3443 		.mode		=	0644,
3444 		.proc_handler	=	proc_dointvec,
3445 	},
3446 	{
3447 		.procname	=	"max_size",
3448 		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
3449 		.maxlen		=	sizeof(int),
3450 		.mode		=	0644,
3451 		.proc_handler	=	proc_dointvec,
3452 	},
3453 	{
3454 		.procname	=	"gc_min_interval",
3455 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3456 		.maxlen		=	sizeof(int),
3457 		.mode		=	0644,
3458 		.proc_handler	=	proc_dointvec_jiffies,
3459 	},
3460 	{
3461 		.procname	=	"gc_timeout",
3462 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
3463 		.maxlen		=	sizeof(int),
3464 		.mode		=	0644,
3465 		.proc_handler	=	proc_dointvec_jiffies,
3466 	},
3467 	{
3468 		.procname	=	"gc_interval",
3469 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
3470 		.maxlen		=	sizeof(int),
3471 		.mode		=	0644,
3472 		.proc_handler	=	proc_dointvec_jiffies,
3473 	},
3474 	{
3475 		.procname	=	"gc_elasticity",
3476 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
3477 		.maxlen		=	sizeof(int),
3478 		.mode		=	0644,
3479 		.proc_handler	=	proc_dointvec,
3480 	},
3481 	{
3482 		.procname	=	"mtu_expires",
3483 		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
3484 		.maxlen		=	sizeof(int),
3485 		.mode		=	0644,
3486 		.proc_handler	=	proc_dointvec_jiffies,
3487 	},
3488 	{
3489 		.procname	=	"min_adv_mss",
3490 		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
3491 		.maxlen		=	sizeof(int),
3492 		.mode		=	0644,
3493 		.proc_handler	=	proc_dointvec,
3494 	},
3495 	{
3496 		.procname	=	"gc_min_interval_ms",
3497 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3498 		.maxlen		=	sizeof(int),
3499 		.mode		=	0644,
3500 		.proc_handler	=	proc_dointvec_ms_jiffies,
3501 	},
3502 	{ }
3503 };
3504 
3505 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
3506 {
3507 	struct ctl_table *table;
3508 
3509 	table = kmemdup(ipv6_route_table_template,
3510 			sizeof(ipv6_route_table_template),
3511 			GFP_KERNEL);
3512 
3513 	if (table) {
3514 		table[0].data = &net->ipv6.sysctl.flush_delay;
3515 		table[0].extra1 = net;
3516 		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
3517 		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
3518 		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3519 		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
3520 		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
3521 		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
3522 		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
3523 		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
3524 		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3525 
3526 		/* Don't export sysctls to unprivileged users */
3527 		if (net->user_ns != &init_user_ns)
3528 			table[0].procname = NULL;
3529 	}
3530 
3531 	return table;
3532 }
3533 #endif
3534 
3535 static int __net_init ip6_route_net_init(struct net *net)
3536 {
3537 	int ret = -ENOMEM;
3538 
3539 	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
3540 	       sizeof(net->ipv6.ip6_dst_ops));
3541 
3542 	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
3543 		goto out_ip6_dst_ops;
3544 
3545 	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
3546 					   sizeof(*net->ipv6.ip6_null_entry),
3547 					   GFP_KERNEL);
3548 	if (!net->ipv6.ip6_null_entry)
3549 		goto out_ip6_dst_entries;
3550 	net->ipv6.ip6_null_entry->dst.path =
3551 		(struct dst_entry *)net->ipv6.ip6_null_entry;
3552 	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3553 	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
3554 			 ip6_template_metrics, true);
3555 
3556 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3557 	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
3558 					       sizeof(*net->ipv6.ip6_prohibit_entry),
3559 					       GFP_KERNEL);
3560 	if (!net->ipv6.ip6_prohibit_entry)
3561 		goto out_ip6_null_entry;
3562 	net->ipv6.ip6_prohibit_entry->dst.path =
3563 		(struct dst_entry *)net->ipv6.ip6_prohibit_entry;
3564 	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3565 	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
3566 			 ip6_template_metrics, true);
3567 
3568 	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
3569 					       sizeof(*net->ipv6.ip6_blk_hole_entry),
3570 					       GFP_KERNEL);
3571 	if (!net->ipv6.ip6_blk_hole_entry)
3572 		goto out_ip6_prohibit_entry;
3573 	net->ipv6.ip6_blk_hole_entry->dst.path =
3574 		(struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
3575 	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3576 	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
3577 			 ip6_template_metrics, true);
3578 #endif
3579 
3580 	net->ipv6.sysctl.flush_delay = 0;
3581 	net->ipv6.sysctl.ip6_rt_max_size = 4096;
3582 	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
3583 	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
3584 	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
3585 	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
3586 	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
3587 	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
3588 
3589 	net->ipv6.ip6_rt_gc_expire = 30*HZ;
3590 
3591 	ret = 0;
3592 out:
3593 	return ret;
3594 
3595 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3596 out_ip6_prohibit_entry:
3597 	kfree(net->ipv6.ip6_prohibit_entry);
3598 out_ip6_null_entry:
3599 	kfree(net->ipv6.ip6_null_entry);
3600 #endif
3601 out_ip6_dst_entries:
3602 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
3603 out_ip6_dst_ops:
3604 	goto out;
3605 }
3606 
3607 static void __net_exit ip6_route_net_exit(struct net *net)
3608 {
3609 	kfree(net->ipv6.ip6_null_entry);
3610 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3611 	kfree(net->ipv6.ip6_prohibit_entry);
3612 	kfree(net->ipv6.ip6_blk_hole_entry);
3613 #endif
3614 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
3615 }
3616 
3617 static int __net_init ip6_route_net_init_late(struct net *net)
3618 {
3619 #ifdef CONFIG_PROC_FS
3620 	proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
3621 	proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
3622 #endif
3623 	return 0;
3624 }
3625 
3626 static void __net_exit ip6_route_net_exit_late(struct net *net)
3627 {
3628 #ifdef CONFIG_PROC_FS
3629 	remove_proc_entry("ipv6_route", net->proc_net);
3630 	remove_proc_entry("rt6_stats", net->proc_net);
3631 #endif
3632 }
3633 
3634 static struct pernet_operations ip6_route_net_ops = {
3635 	.init = ip6_route_net_init,
3636 	.exit = ip6_route_net_exit,
3637 };
3638 
3639 static int __net_init ipv6_inetpeer_init(struct net *net)
3640 {
3641 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3642 
3643 	if (!bp)
3644 		return -ENOMEM;
3645 	inet_peer_base_init(bp);
3646 	net->ipv6.peers = bp;
3647 	return 0;
3648 }
3649 
3650 static void __net_exit ipv6_inetpeer_exit(struct net *net)
3651 {
3652 	struct inet_peer_base *bp = net->ipv6.peers;
3653 
3654 	net->ipv6.peers = NULL;
3655 	inetpeer_invalidate_tree(bp);
3656 	kfree(bp);
3657 }
3658 
3659 static struct pernet_operations ipv6_inetpeer_ops = {
3660 	.init	=	ipv6_inetpeer_init,
3661 	.exit	=	ipv6_inetpeer_exit,
3662 };
3663 
3664 static struct pernet_operations ip6_route_net_late_ops = {
3665 	.init = ip6_route_net_init_late,
3666 	.exit = ip6_route_net_exit_late,
3667 };
3668 
3669 static struct notifier_block ip6_route_dev_notifier = {
3670 	.notifier_call = ip6_route_dev_notify,
3671 	.priority = 0,
3672 };
3673 
3674 int __init ip6_route_init(void)
3675 {
3676 	int ret;
3677 	int cpu;
3678 
3679 	ret = -ENOMEM;
3680 	ip6_dst_ops_template.kmem_cachep =
3681 		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
3682 				  SLAB_HWCACHE_ALIGN, NULL);
3683 	if (!ip6_dst_ops_template.kmem_cachep)
3684 		goto out;
3685 
3686 	ret = dst_entries_init(&ip6_dst_blackhole_ops);
3687 	if (ret)
3688 		goto out_kmem_cache;
3689 
3690 	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
3691 	if (ret)
3692 		goto out_dst_entries;
3693 
3694 	ret = register_pernet_subsys(&ip6_route_net_ops);
3695 	if (ret)
3696 		goto out_register_inetpeer;
3697 
3698 	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
3699 
3700 	/* Registering of the loopback is done before this portion of code,
3701 	 * the loopback reference in rt6_info will not be taken, do it
3702 	 * manually for init_net */
3703 	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
3704 	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3705   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3706 	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
3707 	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3708 	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
3709 	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3710   #endif
3711 	ret = fib6_init();
3712 	if (ret)
3713 		goto out_register_subsys;
3714 
3715 	ret = xfrm6_init();
3716 	if (ret)
3717 		goto out_fib6_init;
3718 
3719 	ret = fib6_rules_init();
3720 	if (ret)
3721 		goto xfrm6_init;
3722 
3723 	ret = register_pernet_subsys(&ip6_route_net_late_ops);
3724 	if (ret)
3725 		goto fib6_rules_init;
3726 
3727 	ret = -ENOBUFS;
3728 	if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
3729 	    __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
3730 	    __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
3731 		goto out_register_late_subsys;
3732 
3733 	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
3734 	if (ret)
3735 		goto out_register_late_subsys;
3736 
3737 	for_each_possible_cpu(cpu) {
3738 		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
3739 
3740 		INIT_LIST_HEAD(&ul->head);
3741 		spin_lock_init(&ul->lock);
3742 	}
3743 
3744 out:
3745 	return ret;
3746 
3747 out_register_late_subsys:
3748 	unregister_pernet_subsys(&ip6_route_net_late_ops);
3749 fib6_rules_init:
3750 	fib6_rules_cleanup();
3751 xfrm6_init:
3752 	xfrm6_fini();
3753 out_fib6_init:
3754 	fib6_gc_cleanup();
3755 out_register_subsys:
3756 	unregister_pernet_subsys(&ip6_route_net_ops);
3757 out_register_inetpeer:
3758 	unregister_pernet_subsys(&ipv6_inetpeer_ops);
3759 out_dst_entries:
3760 	dst_entries_destroy(&ip6_dst_blackhole_ops);
3761 out_kmem_cache:
3762 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3763 	goto out;
3764 }
3765 
3766 void ip6_route_cleanup(void)
3767 {
3768 	unregister_netdevice_notifier(&ip6_route_dev_notifier);
3769 	unregister_pernet_subsys(&ip6_route_net_late_ops);
3770 	fib6_rules_cleanup();
3771 	xfrm6_fini();
3772 	fib6_gc_cleanup();
3773 	unregister_pernet_subsys(&ipv6_inetpeer_ops);
3774 	unregister_pernet_subsys(&ip6_route_net_ops);
3775 	dst_entries_destroy(&ip6_dst_blackhole_ops);
3776 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3777 }
3778