xref: /openbmc/linux/net/ipv6/route.c (revision 029f7f3b8701cc7aca8bdb31f0c7edd6a479e357)
1 /*
2  *	Linux INET6 implementation
3  *	FIB front-end.
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13 
14 /*	Changes:
15  *
16  *	YOSHIFUJI Hideaki @USAGI
17  *		reworked default router selection.
18  *		- respect outgoing interface
19  *		- select from (probably) reachable routers (i.e.
20  *		routers in REACHABLE, STALE, DELAY or PROBE states).
21  *		- always select the same router if it is (probably)
22  *		reachable.  otherwise, round-robin the list.
23  *	Ville Nuorvala
24  *		Fixed routing subtrees.
25  */
26 
27 #define pr_fmt(fmt) "IPv6: " fmt
28 
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <net/net_namespace.h>
48 #include <net/snmp.h>
49 #include <net/ipv6.h>
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
52 #include <net/ndisc.h>
53 #include <net/addrconf.h>
54 #include <net/tcp.h>
55 #include <linux/rtnetlink.h>
56 #include <net/dst.h>
57 #include <net/dst_metadata.h>
58 #include <net/xfrm.h>
59 #include <net/netevent.h>
60 #include <net/netlink.h>
61 #include <net/nexthop.h>
62 #include <net/lwtunnel.h>
63 #include <net/ip_tunnels.h>
64 #include <net/l3mdev.h>
65 #include <trace/events/fib6.h>
66 
67 #include <asm/uaccess.h>
68 
69 #ifdef CONFIG_SYSCTL
70 #include <linux/sysctl.h>
71 #endif
72 
/* Neighbour reachability verdicts used when scoring candidate routes.
 * Negative values are failures of increasing severity; see
 * rt6_score_route() and find_match() for how each verdict is handled.
 */
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,		/* route unusable, skip it entirely */
	RT6_NUD_FAIL_PROBE = -2,	/* neighbour known to be in NUD_FAILED */
	RT6_NUD_FAIL_DO_RR = -1,	/* neighbour unknown: trigger round-robin */
	RT6_NUD_SUCCEED = 1
};
79 
80 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
81 static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
82 static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
83 static unsigned int	 ip6_mtu(const struct dst_entry *dst);
84 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
85 static void		ip6_dst_destroy(struct dst_entry *);
86 static void		ip6_dst_ifdown(struct dst_entry *,
87 				       struct net_device *dev, int how);
88 static int		 ip6_dst_gc(struct dst_ops *ops);
89 
90 static int		ip6_pkt_discard(struct sk_buff *skb);
91 static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
92 static int		ip6_pkt_prohibit(struct sk_buff *skb);
93 static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
94 static void		ip6_link_failure(struct sk_buff *skb);
95 static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
96 					   struct sk_buff *skb, u32 mtu);
97 static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
98 					struct sk_buff *skb);
99 static void		rt6_dst_from_metrics_check(struct rt6_info *rt);
100 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
101 
102 #ifdef CONFIG_IPV6_ROUTE_INFO
103 static struct rt6_info *rt6_add_route_info(struct net *net,
104 					   const struct in6_addr *prefix, int prefixlen,
105 					   const struct in6_addr *gwaddr, int ifindex,
106 					   unsigned int pref);
107 static struct rt6_info *rt6_get_route_info(struct net *net,
108 					   const struct in6_addr *prefix, int prefixlen,
109 					   const struct in6_addr *gwaddr, int ifindex);
110 #endif
111 
/* Per-cpu list of routes that live outside the fib6 tree (DST_NOCACHE
 * clones).  When a device is unregistered these entries are re-homed to
 * the loopback device; see rt6_uncached_list_flush_dev().
 */
struct uncached_list {
	spinlock_t		lock;	/* protects head */
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
118 
/* Register @rt on this cpu's uncached list so it can be found (and its
 * device references re-homed) if its output device goes away later.
 */
static void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->dst.flags |= DST_NOCACHE;
	/* remember the owning list: the route may be freed on another cpu */
	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}
130 
131 static void rt6_uncached_list_del(struct rt6_info *rt)
132 {
133 	if (!list_empty(&rt->rt6i_uncached)) {
134 		struct uncached_list *ul = rt->rt6i_uncached_list;
135 
136 		spin_lock_bh(&ul->lock);
137 		list_del(&rt->rt6i_uncached);
138 		spin_unlock_bh(&ul->lock);
139 	}
140 }
141 
/* @dev is being unregistered: walk every cpu's uncached list and move
 * all references to @dev (both the inet6_dev and the dst device) over
 * to the namespace loopback device, which outlives every other device
 * in the namespace.
 */
static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	/* loopback itself is only torn down together with the netns */
	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				/* take the loopback ref before dropping @dev's */
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}
173 
/* Per-cpu clones write their metrics through to the parent route
 * (dst.from) rather than keeping a private copy.
 */
static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
{
	return dst_metrics_write_ptr(rt->dst.from);
}
178 
179 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
180 {
181 	struct rt6_info *rt = (struct rt6_info *)dst;
182 
183 	if (rt->rt6i_flags & RTF_PCPU)
184 		return rt6_pcpu_cow_metrics(rt);
185 	else if (rt->rt6i_flags & RTF_CACHE)
186 		return NULL;
187 	else
188 		return dst_cow_metrics_generic(dst, old);
189 }
190 
191 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
192 					     struct sk_buff *skb,
193 					     const void *daddr)
194 {
195 	struct in6_addr *p = &rt->rt6i_gateway;
196 
197 	if (!ipv6_addr_any(p))
198 		return (const void *) p;
199 	else if (skb)
200 		return &ipv6_hdr(skb)->daddr;
201 	return daddr;
202 }
203 
/* dst_ops->neigh_lookup hook: find, or create on demand, the neighbour
 * entry for the next hop of @dst.  The lookup key is picked by
 * choose_neigh_daddr().
 */
static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
					  struct sk_buff *skb,
					  const void *daddr)
{
	struct rt6_info *rt = (struct rt6_info *) dst;
	struct neighbour *n;

	daddr = choose_neigh_daddr(rt, skb, daddr);
	n = __ipv6_neigh_lookup(dst->dev, daddr);
	if (n)
		return n;
	/* no cached entry: create one in the IPv6 neighbour table */
	return neigh_create(&nd_tbl, daddr, dst->dev);
}
217 
/* dst_ops for ordinary IPv6 routes; this template is copied into each
 * network namespace (net->ipv6.ip6_dst_ops) at init time.
 */
static struct dst_ops ip6_dst_ops_template = {
	.family			=	AF_INET6,
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
	.default_advmss		=	ip6_default_advmss,
	.mtu			=	ip6_mtu,
	.cow_metrics		=	ipv6_cow_metrics,
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
	.redirect		=	rt6_do_redirect,
	.local_out		=	__ip6_local_out,
	.neigh_lookup		=	ip6_neigh_lookup,
};
235 
236 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
237 {
238 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
239 
240 	return mtu ? : dst->dev->mtu;
241 }
242 
/* Blackhole dsts carry no usable next hop, so PMTU updates and
 * redirects are deliberately ignored.
 */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}

static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}

/* dst_ops for blackhole copies of routes (see ip6_dst_blackhole users);
 * mostly shares the regular handlers but stubs out pmtu/redirect.
 */
static struct dst_ops ip6_dst_blackhole_ops = {
	.family			=	AF_INET6,
	.destroy		=	ip6_dst_destroy,
	.check			=	ip6_dst_check,
	.mtu			=	ip6_blackhole_mtu,
	.default_advmss		=	ip6_default_advmss,
	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
	.redirect		=	ip6_rt_blackhole_redirect,
	.cow_metrics		=	dst_cow_metrics_generic,
	.neigh_lookup		=	ip6_neigh_lookup,
};
264 
/* Metrics template used for the special reject entries; all defaults. */
static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};

/* Template for the per-netns "null" route: matched when nothing else
 * is, it discards traffic with -ENETUNREACH.  rt6i_metric is the
 * maximum so any real route sorts before it.
 */
static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol  = RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};
283 
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

/* Template for the policy-routing "prohibit" entry: rejects traffic
 * with -EACCES (administratively prohibited).
 */
static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol  = RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

/* Template for the policy-routing "blackhole" entry: silently discards
 * traffic (generic dst_discard handlers, error -EINVAL).
 */
static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol  = RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

#endif
317 
/* Zero every rt6_info field that follows the embedded dst_entry
 * (dst_alloc() already initialized the dst itself), then set up the
 * sibling and uncached list heads.
 */
static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	/* "dst + 1" is the first byte after the embedded dst_entry */
	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_siblings);
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}
326 
/* Allocate a bare rt6_info with this netns' ip6_dst_ops and initialize
 * it; returns NULL on allocation failure.  DST_OBSOLETE_FORCE_CHK makes
 * every user revalidate the dst via dst_check().
 */
static struct rt6_info *__ip6_dst_alloc(struct net *net,
					struct net_device *dev,
					int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					0, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt)
		rt6_info_init(rt);

	return rt;
}
340 
/* Allocate a rt6_info together with its per-cpu clone-pointer array.
 * On percpu allocation failure the partially built route is destroyed
 * and NULL is returned.
 */
static struct rt6_info *ip6_dst_alloc(struct net *net,
				      struct net_device *dev,
				      int flags)
{
	struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);

	if (rt) {
		rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
		if (rt->rt6i_pcpu) {
			int cpu;

			/* start with no per-cpu clone on any cpu */
			for_each_possible_cpu(cpu) {
				struct rt6_info **p;

				p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
				/* no one shares rt */
				*p =  NULL;
			}
		} else {
			dst_destroy((struct dst_entry *)rt);
			return NULL;
		}
	}

	return rt;
}
367 
/* dst_ops->destroy hook: release everything a rt6_info holds - the
 * metrics, the per-cpu clone array, uncached-list membership, the
 * inet6_dev reference and the parent ("from") dst reference.
 */
static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct dst_entry *from = dst->from;
	struct inet6_dev *idev;

	dst_destroy_metrics_generic(dst);
	free_percpu(rt->rt6i_pcpu);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	/* clear the parent pointer before dropping its reference */
	dst->from = NULL;
	dst_release(from);
}
387 
388 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
389 			   int how)
390 {
391 	struct rt6_info *rt = (struct rt6_info *)dst;
392 	struct inet6_dev *idev = rt->rt6i_idev;
393 	struct net_device *loopback_dev =
394 		dev_net(dev)->loopback_dev;
395 
396 	if (dev != loopback_dev) {
397 		if (idev && idev->dev == dev) {
398 			struct inet6_dev *loopback_idev =
399 				in6_dev_get(loopback_dev);
400 			if (loopback_idev) {
401 				rt->rt6i_idev = loopback_idev;
402 				in6_dev_put(idev);
403 			}
404 		}
405 	}
406 }
407 
408 static bool __rt6_check_expired(const struct rt6_info *rt)
409 {
410 	if (rt->rt6i_flags & RTF_EXPIRES)
411 		return time_after(jiffies, rt->dst.expires);
412 	else
413 		return false;
414 }
415 
/* True when @rt has expired, either directly (RTF_EXPIRES) or through
 * the route it was cloned from (recurses along dst.from).
 */
static bool rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (rt->dst.from) {
		/* clones inherit expiry from their parent */
		return rt6_check_expired((struct rt6_info *) rt->dst.from);
	}
	return false;
}
426 
/* Multipath route selection:
 *   Hash based function using packet header and flowlabel.
 * Adapted from fib_info_hashfn()
 *
 * Returns an index in [0, candidate_count) identifying which of the
 * equal-cost candidates this flow maps to.
 */
static int rt6_info_hash_nhsfn(unsigned int candidate_count,
			       const struct flowi6 *fl6)
{
	return get_hash_from_flowi6(fl6) % candidate_count;
}
436 
437 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
438 					     struct flowi6 *fl6, int oif,
439 					     int strict)
440 {
441 	struct rt6_info *sibling, *next_sibling;
442 	int route_choosen;
443 
444 	route_choosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6);
445 	/* Don't change the route, if route_choosen == 0
446 	 * (siblings does not include ourself)
447 	 */
448 	if (route_choosen)
449 		list_for_each_entry_safe(sibling, next_sibling,
450 				&match->rt6i_siblings, rt6i_siblings) {
451 			route_choosen--;
452 			if (route_choosen == 0) {
453 				if (rt6_score_route(sibling, oif, strict) < 0)
454 					break;
455 				match = sibling;
456 				break;
457 			}
458 		}
459 	return match;
460 }
461 
/*
 *	Route lookup. Any table->tb6_lock is implied.
 */

/* Walk the leaf chain starting at @rt and pick the entry best matching
 * the requested output interface @oif (or, with no oif, the source
 * address @saddr).  Loopback routes are remembered as a fallback.
 * Returns the null entry when RT6_LOOKUP_F_IFACE demands an interface
 * match that no entry satisfies; returns @rt unchanged when there is
 * nothing to match against.
 */
static inline struct rt6_info *rt6_device_match(struct net *net,
						    struct rt6_info *rt,
						    const struct in6_addr *saddr,
						    int oif,
						    int flags)
{
	struct rt6_info *local = NULL;
	struct rt6_info *sprt;

	/* nothing to constrain the choice: keep the head entry */
	if (!oif && ipv6_addr_any(saddr))
		goto out;

	for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
		struct net_device *dev = sprt->dst.dev;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
			if (dev->flags & IFF_LOOPBACK) {
				if (!sprt->rt6i_idev ||
				    sprt->rt6i_idev->dev->ifindex != oif) {
					if (flags & RT6_LOOKUP_F_IFACE)
						continue;
					/* keep an earlier local whose idev
					 * already matched oif
					 */
					if (local &&
					    local->rt6i_idev->dev->ifindex == oif)
						continue;
				}
				local = sprt;
			}
		} else {
			/* match by source address ownership instead */
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif) {
		if (local)
			return local;

		/* strict interface match required but none found */
		if (flags & RT6_LOOKUP_F_IFACE)
			return net->ipv6.ip6_null_entry;
	}
out:
	return rt;
}
512 
#ifdef CONFIG_IPV6_ROUTER_PREF
/* Deferred neighbour-solicitation request; holds a device reference
 * until the work item runs.
 */
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;		/* gateway address to solicit */
	struct net_device *dev;
};

/* Workqueue callback: send the neighbour solicitation scheduled by
 * rt6_probe(), then drop the device reference and free the request.
 */
static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, NULL);
	dev_put(work->dev);
	kfree(work);
}

/* Router Reachability Probing (RFC 4191): schedule a neighbour
 * solicitation towards @rt's gateway when its reachability is unknown
 * or stale.  The actual send is deferred to process context.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct __rt6_probe_work *work;
	struct neighbour *neigh;
	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
		return;
	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		if (neigh->nud_state & NUD_VALID)
			goto out;

		work = NULL;
		write_lock(&neigh->lock);
		/* rate limit: only probe once per rtr_probe_interval */
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated +
			       rt->rt6i_idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else {
		/* no neighbour entry at all: always worth probing */
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = rt->rt6i_gateway;
		dev_hold(rt->dst.dev);
		work->dev = rt->dst.dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
#else
/* Without CONFIG_IPV6_ROUTER_PREF, reachability probing is disabled. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
583 
584 /*
585  * Default Router Selection (RFC 2461 6.3.6)
586  */
587 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
588 {
589 	struct net_device *dev = rt->dst.dev;
590 	if (!oif || dev->ifindex == oif)
591 		return 2;
592 	if ((dev->flags & IFF_LOOPBACK) &&
593 	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
594 		return 1;
595 	return 0;
596 }
597 
/* Classify the reachability of @rt's next hop for route scoring.
 * Routes without a gateway trivially succeed.  With router-preference
 * support, an existing-but-failed neighbour yields RT6_NUD_FAIL_PROBE
 * and a missing one still succeeds (probing will sort it out); without
 * it, a missing neighbour triggers round-robin (RT6_NUD_FAIL_DO_RR).
 */
static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
{
	struct neighbour *neigh;
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;

	if (rt->rt6i_flags & RTF_NONEXTHOP ||
	    !(rt->rt6i_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}
628 
/* Combined route score: device match (low bits) plus, when available,
 * the decoded router preference.  Returns a negative rt6_nud_state
 * value when the route is unusable under @strict.
 */
static int rt6_score_route(struct rt6_info *rt, int oif,
			   int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	/* router preference occupies bits above the device score */
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
	return m;
}
647 
/* Consider @rt as a candidate default router: score it and, if the
 * score beats the best so far (*mpri), make it the new @match.  Expired
 * routes and link-down devices (when configured to be ignored) are
 * skipped.  *do_rr is set when a round-robin rotation should happen.
 */
static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
				   int *mpri, struct rt6_info *match,
				   bool *do_rr)
{
	int m;
	bool match_do_rr = false;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *dev = rt->dst.dev;

	if (dev && !netif_carrier_ok(dev) &&
	    idev->cnf.ignore_routes_with_linkdown)
		goto out;

	if (rt6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}
684 
/* Scan the leaf list of @fn for the best route with the given @metric,
 * starting from the round-robin head @rr_head and wrapping around to
 * the list start.  Routes with a different metric ("cont") are only
 * considered when nothing at the primary metric matched.
 */
static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
				     struct rt6_info *rr_head,
				     u32 metric, int oif, int strict,
				     bool *do_rr)
{
	struct rt6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	/* first pass: rr_head to the end of the metric group */
	for (rt = rr_head; rt; rt = rt->dst.rt6_next) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	/* second pass: list start up to rr_head (wrap-around) */
	for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	/* fall back to routes with other (worse) metrics */
	for (rt = cont; rt; rt = rt->dst.rt6_next)
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}
721 
/* Default router selection (RFC 4191): pick the best route in @fn's
 * leaf list, rotating the round-robin pointer (fn->rr_ptr) when
 * find_rr_leaf() asked for it.  Falls back to the null entry when no
 * route is usable.
 */
static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
{
	struct rt6_info *match, *rt0;
	struct net *net;
	bool do_rr = false;

	rt0 = fn->rr_ptr;
	if (!rt0)
		fn->rr_ptr = rt0 = fn->leaf;

	match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct rt6_info *next = rt0->dst.rt6_next;

		/* no entries matched; do round-robin */
		if (!next || next->rt6i_metric != rt0->rt6i_metric)
			next = fn->leaf;

		if (next != rt0)
			fn->rr_ptr = next;
	}

	net = dev_net(rt0->dst.dev);
	return match ? match : net->ipv6.ip6_null_entry;
}
749 
750 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
751 {
752 	return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
753 }
754 
#ifdef CONFIG_IPV6_ROUTE_INFO
/* Process a Route Information option (RFC 4191) received in a Router
 * Advertisement from @gwaddr on @dev: validate it, then add, update or
 * delete the corresponding RTF_ROUTEINFO route.  Returns 0 on success
 * or -EINVAL on a malformed option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		/* prefixes longer than 64 bits need at least 2 option units */
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	/* a zero-length prefix is a default-router announcement */
	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev->ifindex);

	/* zero lifetime withdraws an existing route */
	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
					pref);
	else if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			rt6_clean_expires(rt);
		else
			rt6_set_expires(rt, jiffies + HZ * lifetime);

		ip6_rt_put(rt);
	}
	return 0;
}
#endif
830 
/* Walk back up the fib6 tree from @fn looking for the next node that
 * carries routing information (RTN_RTINFO), descending into source
 * subtrees along the way.  Returns NULL at the tree root.
 */
static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = fn->parent;
		/* re-enter the parent's source subtree unless we just
		 * came out of it
		 */
		if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)
			fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}
847 
/* Lightweight policy-rule lookup callback (no cloning, no pcpu copies):
 * find the best leaf in @table for the flow, backtracking up the tree
 * when only the null entry matches.  Returns a held rt6_info (possibly
 * the null entry, never NULL).
 */
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6, int flags)
{
	struct fib6_node *fn;
	struct rt6_info *rt;

	read_lock_bh(&table->tb6_lock);
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	rt = fn->leaf;
	rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
	/* ECMP selection only applies when no output device was forced */
	if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
		rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}
	/* take a reference and update usage stats before unlocking */
	dst_use(&rt->dst, jiffies);
	read_unlock_bh(&table->tb6_lock);

	trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);

	return rt;

}
875 
/* Public entry point for the lightweight lookup above, going through
 * the policy-routing rules.  Returns a held dst_entry.
 */
struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				    int flags)
{
	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);
882 
883 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
884 			    const struct in6_addr *saddr, int oif, int strict)
885 {
886 	struct flowi6 fl6 = {
887 		.flowi6_oif = oif,
888 		.daddr = *daddr,
889 	};
890 	struct dst_entry *dst;
891 	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
892 
893 	if (saddr) {
894 		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
895 		flags |= RT6_LOOKUP_F_HAS_SADDR;
896 	}
897 
898 	dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
899 	if (dst->error == 0)
900 		return (struct rt6_info *) dst;
901 
902 	dst_release(dst);
903 
904 	return NULL;
905 }
906 EXPORT_SYMBOL(rt6_lookup);
907 
/* ip6_ins_rt is called with FREE table->tb6_lock.
   It takes new route entry, the addition fails by any reason the
   route is freed. In any case, if caller does not hold it, it may
   be destroyed.
 */

/* Insert @rt into its table's fib6 tree under the table write lock.
 * Returns 0 on success or a negative errno from fib6_add().
 */
static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
			struct mx6_config *mxc)
{
	int err;
	struct fib6_table *table;

	table = rt->rt6i_table;
	write_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, mxc);
	write_unlock_bh(&table->tb6_lock);

	return err;
}
927 
/* Insert @rt with default netlink info and no explicit metrics. */
int ip6_ins_rt(struct rt6_info *rt)
{
	struct nl_info info = {	.nl_net = dev_net(rt->dst.dev), };
	struct mx6_config mxc = { .mx = NULL, };

	return __ip6_ins_rt(rt, &info, &mxc);
}
935 
/* Create an RTF_CACHE host-route (/128) clone of @ort for @daddr (and,
 * with subtrees, @saddr).  If @ort is itself a clone, the clone is made
 * from its parent instead.  Returns NULL on allocation failure.
 */
static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	/* clones must be taken from the original tree route */
	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
		ort = (struct rt6_info *)ort->dst.from;

	rt = __ip6_dst_alloc(dev_net(ort->dst.dev), ort->dst.dev, 0);

	if (!rt)
		return NULL;

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->rt6i_metric = 0;
	rt->dst.flags |= DST_HOST;
	/* narrow the clone to a host route for the actual destination */
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		if (ort->rt6i_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}
975 
/* Allocate an RTF_PCPU per-cpu clone of @rt (same device and dst
 * flags).  Returns NULL on allocation failure.
 */
static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
{
	struct rt6_info *pcpu_rt;

	pcpu_rt = __ip6_dst_alloc(dev_net(rt->dst.dev),
				  rt->dst.dev, rt->dst.flags);

	if (!pcpu_rt)
		return NULL;
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}
990 
/* It should be called with read_lock_bh(&tb6_lock) acquired */

/* Return this cpu's existing clone of @rt with a reference taken, or
 * NULL if no clone has been created on this cpu yet.
 */
static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

	if (pcpu_rt) {
		dst_hold(&pcpu_rt->dst);
		rt6_dst_from_metrics_check(pcpu_rt);
	}
	return pcpu_rt;
}
1005 
/* Create and install this cpu's clone of @rt.  Called without the table
 * lock held (the allocation may trigger GC which takes the write lock).
 * Returns a held route: the new clone, a clone installed concurrently
 * by another context, @rt itself if it left the tree meanwhile, or the
 * null entry on allocation failure.
 */
static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
{
	struct fib6_table *table = rt->rt6i_table;
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		struct net *net = dev_net(rt->dst.dev);

		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	read_lock_bh(&table->tb6_lock);
	if (rt->rt6i_pcpu) {
		p = this_cpu_ptr(rt->rt6i_pcpu);
		/* install atomically; lose gracefully to a racing writer */
		prev = cmpxchg(p, NULL, pcpu_rt);
		if (prev) {
			/* If someone did it before us, return prev instead */
			dst_destroy(&pcpu_rt->dst);
			pcpu_rt = prev;
		}
	} else {
		/* rt has been removed from the fib6 tree
		 * before we have a chance to acquire the read_lock.
		 * In this case, don't brother to create a pcpu rt
		 * since rt is going away anyway.  The next
		 * dst_check() will trigger a re-lookup.
		 */
		dst_destroy(&pcpu_rt->dst);
		pcpu_rt = rt;
	}
	dst_hold(&pcpu_rt->dst);
	rt6_dst_from_metrics_check(pcpu_rt);
	read_unlock_bh(&table->tb6_lock);
	return pcpu_rt;
}
1043 
/* Full policy route lookup for input/output paths.  Finds the best
 * route in @table for the flow (backtracking, and retrying without the
 * reachability requirement when needed), then returns one of:
 *   - the tree route itself (null entry or RTF_CACHE clone),
 *   - a freshly made uncached RTF_CACHE clone (FLOWI_FLAG_KNOWN_NH on a
 *     gateway-less route, where the neighbour daddr may differ), or
 *   - this cpu's per-cpu clone.
 * The returned route is always held.
 */
static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
				      struct flowi6 *fl6, int flags)
{
	struct fib6_node *fn, *saved_fn;
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	/* hosts (non-routers) must prefer reachable default routers */
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	read_lock_bh(&table->tb6_lock);

	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	rt = rt6_select(fn, oif, strict);
	if (rt->rt6i_nsiblings)
		rt = rt6_multipath_select(rt, fl6, oif, strict);
	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}


	if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
		dst_use(&rt->dst, jiffies);
		read_unlock_bh(&table->tb6_lock);

		rt6_dst_from_metrics_check(rt);

		trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(rt->rt6i_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */

		struct rt6_info *uncached_rt;

		dst_use(&rt->dst, jiffies);
		read_unlock_bh(&table->tb6_lock);

		uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
		dst_release(&rt->dst);

		if (uncached_rt)
			rt6_uncached_list_add(uncached_rt);
		else
			uncached_rt = net->ipv6.ip6_null_entry;

		dst_hold(&uncached_rt->dst);

		trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6);
		return uncached_rt;

	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		rt->dst.lastuse = jiffies;
		rt->dst.__use++;
		pcpu_rt = rt6_get_pcpu_route(rt);

		if (pcpu_rt) {
			read_unlock_bh(&table->tb6_lock);
		} else {
			/* We have to do the read_unlock first
			 * because rt6_make_pcpu_route() may trigger
			 * ip6_dst_gc() which will take the write_lock.
			 */
			dst_hold(&rt->dst);
			read_unlock_bh(&table->tb6_lock);
			pcpu_rt = rt6_make_pcpu_route(rt);
			dst_release(&rt->dst);
		}

		trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6);
		return pcpu_rt;

	}
}
1141 
/* fib6_rule_lookup() callback for input-path lookups: resolve using the
 * packet's incoming interface (flowi6_iif).
 */
static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
					    struct flowi6 *fl6, int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
}
1147 
1148 static struct dst_entry *ip6_route_input_lookup(struct net *net,
1149 						struct net_device *dev,
1150 						struct flowi6 *fl6, int flags)
1151 {
1152 	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1153 		flags |= RT6_LOOKUP_F_IFACE;
1154 
1155 	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
1156 }
1157 
/* Attach a route (dst) to an incoming skb based on its IPv6 header.
 * Builds a flow key from the packet, propagates a collect-metadata
 * tunnel id if present, and replaces any existing skb dst.
 */
void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		/* honor L3 master device (VRF) for the lookup */
		.flowi6_iif = l3mdev_fib_oif(skb->dev),
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};

	/* carry the RX tunnel key into the flow for metadata-based routing */
	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
	skb_dst_drop(skb);
	skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
}
1179 
/* fib6_rule_lookup() callback for output-path lookups: resolve using the
 * flow's outgoing interface (flowi6_oif).
 */
static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
					     struct flowi6 *fl6, int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
}
1185 
/* Resolve an output route for a locally generated flow.
 *
 * @sk may be NULL.  An L3 master device (VRF) lookup short-circuits the
 * normal path.  Lookup flags are derived from the socket binding and the
 * flow's source address:
 *  - F_IFACE: strict interface match when the socket is bound to a
 *    device, the destination requires it, or an oif was given without a
 *    source address;
 *  - F_HAS_SADDR: a concrete source address was supplied;
 *  - otherwise, socket source-address preferences are applied.
 */
struct dst_entry *ip6_route_output(struct net *net, const struct sock *sk,
				    struct flowi6 *fl6)
{
	struct dst_entry *dst;
	int flags = 0;
	bool any_src;

	dst = l3mdev_rt6_dst_by_oif(net, fl6);
	if (dst)
		return dst;

	/* output flows originate locally */
	fl6->flowi6_iif = LOOPBACK_IFINDEX;

	any_src = ipv6_addr_any(&fl6->saddr);
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
	    (fl6->flowi6_oif && any_src))
		flags |= RT6_LOOKUP_F_IFACE;

	if (!any_src)
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	else if (sk)
		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);

	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
}
EXPORT_SYMBOL(ip6_route_output);
1212 
/* Clone @dst_orig into a blackhole route that discards all traffic but
 * keeps the original's metrics, idev and addressing — used e.g. by xfrm
 * when a real route must be temporarily replaced.
 *
 * Consumes a reference on @dst_orig.  Returns the new dst or
 * ERR_PTR(-ENOMEM).
 */
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, DST_OBSOLETE_NONE, 0);
	if (rt) {
		rt6_info_init(rt);

		new = &rt->dst;
		new->__use = 1;
		/* silently drop everything in both directions */
		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);
		rt->rt6i_idev = ort->rt6i_idev;
		if (rt->rt6i_idev)
			in6_dev_hold(rt->rt6i_idev);

		rt->rt6i_gateway = ort->rt6i_gateway;
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
		rt->rt6i_metric = 0;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif

		/* NOTE(review): presumably marks the entry for release once
		 * the refcount taken by dst_alloc() drops — confirm against
		 * dst_free() semantics for non-cached entries.
		 */
		dst_free(new);
	}

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}
1247 
1248 /*
1249  *	Destination cache support functions
1250  */
1251 
/* If this route borrows metrics from its parent (dst.from) and the
 * parent's metrics pointer has since changed, re-point ours at the
 * parent's current (read-only) metrics.
 */
static void rt6_dst_from_metrics_check(struct rt6_info *rt)
{
	if (rt->dst.from &&
	    dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
		dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
}
1258 
1259 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1260 {
1261 	if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie))
1262 		return NULL;
1263 
1264 	if (rt6_check_expired(rt))
1265 		return NULL;
1266 
1267 	return &rt->dst;
1268 }
1269 
1270 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1271 {
1272 	if (!__rt6_check_expired(rt) &&
1273 	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1274 	    rt6_check((struct rt6_info *)(rt->dst.from), cookie))
1275 		return &rt->dst;
1276 	else
1277 		return NULL;
1278 }
1279 
/* dst_ops->check callback: decide whether a cached dst is still valid.
 * Per-cpu copies and uncached clones are validated through their parent
 * route; plain fib entries are validated directly.
 */
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rt6_info *rt;

	rt = (struct rt6_info *) dst;

	/* All IPV6 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 */

	rt6_dst_from_metrics_check(rt);

	if (rt->rt6i_flags & RTF_PCPU ||
	    (unlikely(dst->flags & DST_NOCACHE) && rt->dst.from))
		return rt6_dst_from_check(rt, cookie);
	else
		return rt6_check(rt, cookie);
}
1299 
/* dst_ops->negative_advice callback: the caller reports the route
 * performed badly.  Expired cache clones are deleted from the tree;
 * non-cache routes are simply released.  Returning NULL tells the
 * caller to drop its cached reference.
 */
static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *) dst;

	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			if (rt6_check_expired(rt)) {
				/* ip6_del_rt() consumes the caller's ref */
				ip6_del_rt(rt);
				dst = NULL;
			}
		} else {
			dst_release(dst);
			dst = NULL;
		}
	}
	return dst;
}
1317 
/* dst_ops->link_failure callback: report the failure to the sender via
 * ICMPv6 and invalidate the route the skb was using — cache clones are
 * deleted outright; default routes have their tree node's serial number
 * poisoned so subsequent rt6_check() calls fail.
 */
static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			/* take a ref for ip6_del_rt() to consume */
			dst_hold(&rt->dst);
			ip6_del_rt(rt);
		} else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) {
			rt->rt6i_node->fn_sernum = -1;
		}
	}
}
1334 
/* Record a learned path MTU on @rt and (re)arm its expiry so the
 * clamped MTU ages out after the configured interval.
 */
static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
{
	struct net *net = dev_net(rt->dst.dev);

	rt->rt6i_flags |= RTF_MODIFIED;
	rt->rt6i_pmtu = mtu;
	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
}
1343 
1344 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
1345 {
1346 	return !(rt->rt6i_flags & RTF_CACHE) &&
1347 		(rt->rt6i_flags & RTF_PCPU || rt->rt6i_node);
1348 }
1349 
/* Apply a path-MTU update to @dst for the flow identified by @iph (or,
 * failing that, by @sk's addresses).  Updates smaller than IPV6_MIN_MTU
 * are clamped up; updates not smaller than the current MTU are ignored.
 * Routes that must not be modified in place get a RTF_CACHE clone
 * carrying the new MTU instead.
 */
static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu)
{
	struct rt6_info *rt6 = (struct rt6_info *)dst;

	/* local routes never need PMTU clamping */
	if (rt6->rt6i_flags & RTF_LOCAL)
		return;

	dst_confirm(dst);
	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
	if (mtu >= dst_mtu(dst))
		return;

	if (!rt6_cache_allowed_for_pmtu(rt6)) {
		rt6_do_update_pmtu(rt6, mtu);
	} else {
		const struct in6_addr *daddr, *saddr;
		struct rt6_info *nrt6;

		/* pick flow addresses from the packet header, else the
		 * socket; with neither we cannot identify the path
		 */
		if (iph) {
			daddr = &iph->daddr;
			saddr = &iph->saddr;
		} else if (sk) {
			daddr = &sk->sk_v6_daddr;
			saddr = &inet6_sk(sk)->saddr;
		} else {
			return;
		}
		nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
		if (nrt6) {
			rt6_do_update_pmtu(nrt6, mtu);

			/* ip6_ins_rt(nrt6) will bump the
			 * rt6->rt6i_node->fn_sernum
			 * which will fail the next rt6_check() and
			 * invalidate the sk->sk_dst_cache.
			 */
			ip6_ins_rt(nrt6);
		}
	}
}
1391 
/* dst_ops->update_pmtu callback: extract the IPv6 header from @skb (if
 * any) and forward to __ip6_rt_update_pmtu().
 */
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu)
{
	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
}
1397 
/* Update the path MTU for the flow described by the (inner) IPv6 header
 * at @skb->data — typically called from ICMP error handling.  @mtu is
 * network byte order.  A zero @mark falls back to the netns reply-mark
 * policy for skb->mark.
 */
void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
		     int oif, u32 mark)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
	fl6.daddr = iph->daddr;
	fl6.saddr = iph->saddr;
	fl6.flowlabel = ip6_flowinfo(iph);

	dst = ip6_route_output(net, NULL, &fl6);
	if (!dst->error)
		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_update_pmtu);
1418 
/* Socket-flavored PMTU update: derive netns, bound device and fwmark
 * from @sk and delegate to ip6_update_pmtu().
 */
void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
	ip6_update_pmtu(skb, sock_net(sk), mtu,
			sk->sk_bound_dev_if, sk->sk_mark);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
1425 
/* Handle redirects */
struct ip6rd_flowi {
	/* fl6 must stay first: __ip6_route_redirect() receives a
	 * struct flowi6 * and casts it back to ip6rd_flowi.
	 */
	struct flowi6 fl6;
	struct in6_addr gateway;	/* router that sent the redirect */
};
1431 
/* fib6_rule_lookup() callback used when processing an ICMPv6 redirect:
 * find the route currently used for the destination whose nexthop is
 * the router that sent the redirect (rdfl->gateway).  Returns a held
 * route, or the held null entry if nothing matches.
 */
static struct rt6_info *__ip6_route_redirect(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     int flags)
{
	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
	struct rt6_info *rt;
	struct fib6_node *fn;

	/* Get the "current" route for this destination and
	 * check if the redirect has come from appropriate router.
	 *
	 * RFC 4861 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routes.
	 */

	read_lock_bh(&table->tb6_lock);
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
		if (rt6_check_expired(rt))
			continue;
		if (rt->dst.error)
			break;
		if (!(rt->rt6i_flags & RTF_GATEWAY))
			continue;
		if (fl6->flowi6_oif != rt->dst.dev->ifindex)
			continue;
		if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
			continue;
		break;
	}

	if (!rt)
		rt = net->ipv6.ip6_null_entry;
	else if (rt->dst.error) {
		rt = net->ipv6.ip6_null_entry;
		goto out;
	}

	/* nothing at this node: back up the tree and retry */
	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

out:
	dst_hold(&rt->dst);

	read_unlock_bh(&table->tb6_lock);

	trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
	return rt;
};
1489 
1490 static struct dst_entry *ip6_route_redirect(struct net *net,
1491 					const struct flowi6 *fl6,
1492 					const struct in6_addr *gateway)
1493 {
1494 	int flags = RT6_LOOKUP_F_HAS_SADDR;
1495 	struct ip6rd_flowi rdfl;
1496 
1497 	rdfl.fl6 = *fl6;
1498 	rdfl.gateway = *gateway;
1499 
1500 	return fib6_rule_lookup(net, &rdfl.fl6,
1501 				flags, __ip6_route_redirect);
1502 }
1503 
/* Process an ICMPv6 redirect for the flow described by the (inner)
 * IPv6 header at @skb->data: find the affected route and apply the
 * redirect from the outer packet's source (the redirecting router).
 */
void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_iif = LOOPBACK_IFINDEX;
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark;
	fl6.daddr = iph->daddr;
	fl6.saddr = iph->saddr;
	fl6.flowlabel = ip6_flowinfo(iph);

	dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_redirect);
1523 
/* Variant of ip6_redirect() for redirect messages that do not carry the
 * offending packet's header: reconstruct the flow from the redirect
 * message itself (destination from the rd_msg, source = our address the
 * router sent the redirect to).
 */
void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
			    u32 mark)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
	struct dst_entry *dst;
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_iif = LOOPBACK_IFINDEX;
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark;
	fl6.daddr = msg->dest;
	fl6.saddr = iph->daddr;

	dst = ip6_route_redirect(net, &fl6, &iph->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}
1543 
/* Socket-flavored redirect handling: derive netns, bound device and
 * fwmark from @sk and delegate to ip6_redirect().
 */
void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark);
}
EXPORT_SYMBOL_GPL(ip6_sk_redirect);
1549 
1550 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1551 {
1552 	struct net_device *dev = dst->dev;
1553 	unsigned int mtu = dst_mtu(dst);
1554 	struct net *net = dev_net(dev);
1555 
1556 	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1557 
1558 	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1559 		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1560 
1561 	/*
1562 	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1563 	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1564 	 * IPV6_MAXPLEN is also valid and means: "any MSS,
1565 	 * rely only on pmtu discovery"
1566 	 */
1567 	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1568 		mtu = IPV6_MAXPLEN;
1569 	return mtu;
1570 }
1571 
1572 static unsigned int ip6_mtu(const struct dst_entry *dst)
1573 {
1574 	const struct rt6_info *rt = (const struct rt6_info *)dst;
1575 	unsigned int mtu = rt->rt6i_pmtu;
1576 	struct inet6_dev *idev;
1577 
1578 	if (mtu)
1579 		goto out;
1580 
1581 	mtu = dst_metric_raw(dst, RTAX_MTU);
1582 	if (mtu)
1583 		goto out;
1584 
1585 	mtu = IPV6_MIN_MTU;
1586 
1587 	rcu_read_lock();
1588 	idev = __in6_dev_get(dst->dev);
1589 	if (idev)
1590 		mtu = idev->cnf.mtu6;
1591 	rcu_read_unlock();
1592 
1593 out:
1594 	return min_t(unsigned int, mtu, IP6_MAX_MTU);
1595 }
1596 
1597 static struct dst_entry *icmp6_dst_gc_list;
1598 static DEFINE_SPINLOCK(icmp6_dst_lock);
1599 
/* Allocate a standalone host route for sending an ICMPv6 packet.  The
 * entry is not inserted into the fib tree; instead it is chained onto
 * icmp6_dst_gc_list and reclaimed by icmp6_dst_gc() once its refcount
 * drops.  Returns the (possibly xfrm-transformed) dst or an ERR_PTR.
 */
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
				  struct flowi6 *fl6)
{
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct inet6_dev *idev = in6_dev_get(dev);
	struct net *net = dev_net(dev);

	if (unlikely(!idev))
		return ERR_PTR(-ENODEV);

	rt = ip6_dst_alloc(net, dev, 0);
	if (unlikely(!rt)) {
		in6_dev_put(idev);
		dst = ERR_PTR(-ENOMEM);
		goto out;
	}

	rt->dst.flags |= DST_HOST;
	rt->dst.output  = ip6_output;
	/* caller owns the initial reference */
	atomic_set(&rt->dst.__refcnt, 1);
	rt->rt6i_gateway  = fl6->daddr;
	rt->rt6i_dst.addr = fl6->daddr;
	rt->rt6i_dst.plen = 128;
	rt->rt6i_idev     = idev;
	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);

	/* chain onto the dedicated GC list for later reclaim */
	spin_lock_bh(&icmp6_dst_lock);
	rt->dst.next = icmp6_dst_gc_list;
	icmp6_dst_gc_list = &rt->dst;
	spin_unlock_bh(&icmp6_dst_lock);

	fib6_force_start_gc(net);

	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);

out:
	return dst;
}
1639 
/* Reclaim unreferenced entries from icmp6_dst_gc_list.  Returns the
 * number of entries that are still referenced and remain on the list.
 */
int icmp6_dst_gc(void)
{
	struct dst_entry *dst, **pprev;
	int more = 0;

	spin_lock_bh(&icmp6_dst_lock);
	pprev = &icmp6_dst_gc_list;

	/* pprev always points at the link to the current entry, so an
	 * unlink is a single pointer store
	 */
	while ((dst = *pprev) != NULL) {
		if (!atomic_read(&dst->__refcnt)) {
			*pprev = dst->next;
			dst_free(dst);
		} else {
			pprev = &dst->next;
			++more;
		}
	}

	spin_unlock_bh(&icmp6_dst_lock);

	return more;
}
1662 
/* Walk icmp6_dst_gc_list and unlink/free every entry for which
 * @func(rt, @arg) returns non-zero (e.g. routes on a departing device).
 */
static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
			    void *arg)
{
	struct dst_entry *dst, **pprev;

	spin_lock_bh(&icmp6_dst_lock);
	pprev = &icmp6_dst_gc_list;
	while ((dst = *pprev) != NULL) {
		struct rt6_info *rt = (struct rt6_info *) dst;
		if (func(rt, arg)) {
			*pprev = dst->next;
			dst_free(dst);
		} else {
			pprev = &dst->next;
		}
	}
	spin_unlock_bh(&icmp6_dst_lock);
}
1681 
/* dst_ops->gc callback: run fib6 garbage collection when the dst entry
 * count exceeds ip6_rt_max_size or the minimum GC interval has elapsed.
 * ip6_rt_gc_expire grows on each pass (more aggressive aging) and
 * decays by 1/2^elasticity afterwards.  Returns non-zero if the table
 * is still over the size limit (allocation should fail).
 */
static int ip6_dst_gc(struct dst_ops *ops)
{
	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
	int entries;

	entries = dst_entries_get_fast(ops);
	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
	    entries <= rt_max_size)
		goto out;

	net->ipv6.ip6_rt_gc_expire++;
	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
	entries = dst_entries_get_slow(ops);
	if (entries < ops->gc_thresh)
		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
out:
	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
	return entries > rt_max_size;
}
1706 
1707 static int ip6_convert_metrics(struct mx6_config *mxc,
1708 			       const struct fib6_config *cfg)
1709 {
1710 	bool ecn_ca = false;
1711 	struct nlattr *nla;
1712 	int remaining;
1713 	u32 *mp;
1714 
1715 	if (!cfg->fc_mx)
1716 		return 0;
1717 
1718 	mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1719 	if (unlikely(!mp))
1720 		return -ENOMEM;
1721 
1722 	nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1723 		int type = nla_type(nla);
1724 		u32 val;
1725 
1726 		if (!type)
1727 			continue;
1728 		if (unlikely(type > RTAX_MAX))
1729 			goto err;
1730 
1731 		if (type == RTAX_CC_ALGO) {
1732 			char tmp[TCP_CA_NAME_MAX];
1733 
1734 			nla_strlcpy(tmp, nla, sizeof(tmp));
1735 			val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
1736 			if (val == TCP_CA_UNSPEC)
1737 				goto err;
1738 		} else {
1739 			val = nla_get_u32(nla);
1740 		}
1741 		if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
1742 			goto err;
1743 
1744 		mp[type - 1] = val;
1745 		__set_bit(type - 1, mxc->mx_valid);
1746 	}
1747 
1748 	if (ecn_ca) {
1749 		__set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
1750 		mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
1751 	}
1752 
1753 	mxc->mx = mp;
1754 	return 0;
1755  err:
1756 	kfree(mp);
1757 	return -EINVAL;
1758 }
1759 
/* Build (but do not insert) a struct rt6_info from a netlink/ioctl
 * route configuration.  Validates the config, resolves the output
 * device and routing table, sets up lwtunnel state, gateway and
 * prefsrc, and promotes loopback/reject routes appropriately.
 *
 * Returns the new route (holding references on dev/idev) or an
 * ERR_PTR.  On error all acquired references are dropped.
 */
static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg)
{
	struct net *net = cfg->fc_nlinfo.nl_net;
	struct rt6_info *rt = NULL;
	struct net_device *dev = NULL;
	struct inet6_dev *idev = NULL;
	struct fib6_table *table;
	int addr_type;
	int err = -EINVAL;

	if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
		goto out;
#ifndef CONFIG_IPV6_SUBTREES
	/* source-based routing needs subtree support */
	if (cfg->fc_src_len)
		goto out;
#endif
	if (cfg->fc_ifindex) {
		err = -ENODEV;
		dev = dev_get_by_index(net, cfg->fc_ifindex);
		if (!dev)
			goto out;
		idev = in6_dev_get(dev);
		if (!idev)
			goto out;
	}

	if (cfg->fc_metric == 0)
		cfg->fc_metric = IP6_RT_PRIO_USER;

	err = -ENOBUFS;
	if (cfg->fc_nlinfo.nlh &&
	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
		table = fib6_get_table(net, cfg->fc_table);
		if (!table) {
			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
			table = fib6_new_table(net, cfg->fc_table);
		}
	} else {
		table = fib6_new_table(net, cfg->fc_table);
	}

	if (!table)
		goto out;

	rt = ip6_dst_alloc(net, NULL,
			   (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);

	if (!rt) {
		err = -ENOMEM;
		goto out;
	}

	if (cfg->fc_flags & RTF_EXPIRES)
		rt6_set_expires(rt, jiffies +
				clock_t_to_jiffies(cfg->fc_expires));
	else
		rt6_clean_expires(rt);

	if (cfg->fc_protocol == RTPROT_UNSPEC)
		cfg->fc_protocol = RTPROT_BOOT;
	rt->rt6i_protocol = cfg->fc_protocol;

	addr_type = ipv6_addr_type(&cfg->fc_dst);

	/* choose the input handler from the destination class */
	if (addr_type & IPV6_ADDR_MULTICAST)
		rt->dst.input = ip6_mc_input;
	else if (cfg->fc_flags & RTF_LOCAL)
		rt->dst.input = ip6_input;
	else
		rt->dst.input = ip6_forward;

	rt->dst.output = ip6_output;

	if (cfg->fc_encap) {
		struct lwtunnel_state *lwtstate;

		err = lwtunnel_build_state(dev, cfg->fc_encap_type,
					   cfg->fc_encap, AF_INET6, cfg,
					   &lwtstate);
		if (err)
			goto out;
		rt->dst.lwtstate = lwtstate_get(lwtstate);
		/* let the tunnel wrap the original input/output hooks */
		if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
			rt->dst.lwtstate->orig_output = rt->dst.output;
			rt->dst.output = lwtunnel_output;
		}
		if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
			rt->dst.lwtstate->orig_input = rt->dst.input;
			rt->dst.input = lwtunnel_input;
		}
	}

	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
	rt->rt6i_dst.plen = cfg->fc_dst_len;
	if (rt->rt6i_dst.plen == 128)
		rt->dst.flags |= DST_HOST;

#ifdef CONFIG_IPV6_SUBTREES
	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
	rt->rt6i_src.plen = cfg->fc_src_len;
#endif

	rt->rt6i_metric = cfg->fc_metric;

	/* We cannot add true routes via loopback here,
	   they would result in kernel looping; promote them to reject routes
	 */
	if ((cfg->fc_flags & RTF_REJECT) ||
	    (dev && (dev->flags & IFF_LOOPBACK) &&
	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
	     !(cfg->fc_flags & RTF_LOCAL))) {
		/* hold loopback dev/idev if we haven't done so. */
		if (dev != net->loopback_dev) {
			if (dev) {
				dev_put(dev);
				in6_dev_put(idev);
			}
			dev = net->loopback_dev;
			dev_hold(dev);
			idev = in6_dev_get(dev);
			if (!idev) {
				err = -ENODEV;
				goto out;
			}
		}
		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
		/* error code / handlers depend on the reject flavor */
		switch (cfg->fc_type) {
		case RTN_BLACKHOLE:
			rt->dst.error = -EINVAL;
			rt->dst.output = dst_discard_out;
			rt->dst.input = dst_discard;
			break;
		case RTN_PROHIBIT:
			rt->dst.error = -EACCES;
			rt->dst.output = ip6_pkt_prohibit_out;
			rt->dst.input = ip6_pkt_prohibit;
			break;
		case RTN_THROW:
		case RTN_UNREACHABLE:
		default:
			rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
					: (cfg->fc_type == RTN_UNREACHABLE)
					? -EHOSTUNREACH : -ENETUNREACH;
			rt->dst.output = ip6_pkt_discard_out;
			rt->dst.input = ip6_pkt_discard;
			break;
		}
		goto install_route;
	}

	if (cfg->fc_flags & RTF_GATEWAY) {
		const struct in6_addr *gw_addr;
		int gwa_type;

		gw_addr = &cfg->fc_gateway;
		gwa_type = ipv6_addr_type(gw_addr);

		/* if gw_addr is local we will fail to detect this in case
		 * address is still TENTATIVE (DAD in progress). rt6_lookup()
		 * will return already-added prefix route via interface that
		 * prefix route was assigned to, which might be non-loopback.
		 */
		err = -EINVAL;
		if (ipv6_chk_addr_and_flags(net, gw_addr,
					    gwa_type & IPV6_ADDR_LINKLOCAL ?
					    dev : NULL, 0, 0))
			goto out;

		rt->rt6i_gateway = *gw_addr;

		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
			struct rt6_info *grt;

			/* IPv6 strictly inhibits using not link-local
			   addresses as nexthop address.
			   Otherwise, router will not able to send redirects.
			   It is very good, but in some (rare!) circumstances
			   (SIT, PtP, NBMA NOARP links) it is handy to allow
			   some exceptions. --ANK
			 */
			if (!(gwa_type & IPV6_ADDR_UNICAST))
				goto out;

			grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);

			err = -EHOSTUNREACH;
			if (!grt)
				goto out;
			if (dev) {
				if (dev != grt->dst.dev) {
					ip6_rt_put(grt);
					goto out;
				}
			} else {
				/* adopt device/idev from the gateway route */
				dev = grt->dst.dev;
				idev = grt->rt6i_idev;
				dev_hold(dev);
				in6_dev_hold(grt->rt6i_idev);
			}
			if (!(grt->rt6i_flags & RTF_GATEWAY))
				err = 0;
			ip6_rt_put(grt);

			if (err)
				goto out;
		}
		err = -EINVAL;
		if (!dev || (dev->flags & IFF_LOOPBACK))
			goto out;
	}

	err = -ENODEV;
	if (!dev)
		goto out;

	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
		/* preferred source must be one of the device's addresses */
		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
			err = -EINVAL;
			goto out;
		}
		rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
		rt->rt6i_prefsrc.plen = 128;
	} else
		rt->rt6i_prefsrc.plen = 0;

	rt->rt6i_flags = cfg->fc_flags;

install_route:
	/* references on dev/idev are transferred into the route */
	rt->dst.dev = dev;
	rt->rt6i_idev = idev;
	rt->rt6i_table = table;

	cfg->fc_nlinfo.nl_net = dev_net(dev);

	return rt;
out:
	if (dev)
		dev_put(dev);
	if (idev)
		in6_dev_put(idev);
	if (rt)
		dst_free(&rt->dst);

	return ERR_PTR(err);
}
2005 
/* Create a route from @cfg and insert it into the fib tree, applying
 * any supplied metrics.  On success the tree owns the route; on failure
 * the partially built route is freed.  Returns 0 or a negative errno.
 */
int ip6_route_add(struct fib6_config *cfg)
{
	struct mx6_config mxc = { .mx = NULL, };
	struct rt6_info *rt;
	int err;

	rt = ip6_route_info_create(cfg);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto out;
	}

	err = ip6_convert_metrics(&mxc, cfg);
	if (err)
		goto out;

	/* insertion consumes rt on both success and failure */
	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc);

	kfree(mxc.mx);

	return err;
out:
	if (rt)
		dst_free(&rt->dst);

	return err;
}
2034 
/* Remove @rt from its fib table under the table write lock, notifying
 * via @info.  The null entry and uncached (DST_NOCACHE) routes cannot
 * be deleted.  Consumes the caller's reference on @rt in all cases.
 */
static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
{
	int err;
	struct fib6_table *table;
	struct net *net = dev_net(rt->dst.dev);

	if (rt == net->ipv6.ip6_null_entry ||
	    rt->dst.flags & DST_NOCACHE) {
		err = -ENOENT;
		goto out;
	}

	table = rt->rt6i_table;
	write_lock_bh(&table->tb6_lock);
	err = fib6_del(rt, info);
	write_unlock_bh(&table->tb6_lock);

out:
	ip6_rt_put(rt);
	return err;
}
2056 
/* Delete @rt with default notification info (netns derived from the
 * route's device).  Consumes the caller's reference on @rt.
 */
int ip6_del_rt(struct rt6_info *rt)
{
	struct nl_info info = {
		.nl_net = dev_net(rt->dst.dev),
	};
	return __ip6_del_rt(rt, &info);
}
2064 
/* Delete the first fib entry matching @cfg (prefix, and optionally
 * ifindex, gateway and metric).  Cache clones are skipped unless
 * RTF_CACHE was requested.  Returns 0 on success, -ESRCH if no match.
 */
static int ip6_route_del(struct fib6_config *cfg)
{
	struct fib6_table *table;
	struct fib6_node *fn;
	struct rt6_info *rt;
	int err = -ESRCH;

	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
	if (!table)
		return err;

	read_lock_bh(&table->tb6_lock);

	fn = fib6_locate(&table->tb6_root,
			 &cfg->fc_dst, cfg->fc_dst_len,
			 &cfg->fc_src, cfg->fc_src_len);

	if (fn) {
		for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
			if ((rt->rt6i_flags & RTF_CACHE) &&
			    !(cfg->fc_flags & RTF_CACHE))
				continue;
			if (cfg->fc_ifindex &&
			    (!rt->dst.dev ||
			     rt->dst.dev->ifindex != cfg->fc_ifindex))
				continue;
			if (cfg->fc_flags & RTF_GATEWAY &&
			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
				continue;
			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
				continue;
			/* hold rt across the unlock; __ip6_del_rt consumes it */
			dst_hold(&rt->dst);
			read_unlock_bh(&table->tb6_lock);

			return __ip6_del_rt(rt, &cfg->fc_nlinfo);
		}
	}
	read_unlock_bh(&table->tb6_lock);

	return err;
}
2106 
/* Process a validated ICMPv6 Redirect (RFC 4861 section 8) against the
 * route @dst that carried the offending traffic: sanity-check the
 * message and its ND options, update the neighbour cache for the new
 * first hop, install an RTF_CACHE clone pointing at it, fire a netevent
 * and remove the superseded cache entry.
 */
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct netevent_redirect netevent;
	struct rt6_info *rt, *nrt = NULL;
	struct ndisc_options ndopts;
	struct inet6_dev *in6_dev;
	struct neighbour *neigh;
	struct rd_msg *msg;
	int optlen, on_link;
	u8 *lladdr;

	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
	optlen -= sizeof(*msg);

	if (optlen < 0) {
		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
		return;
	}

	msg = (struct rd_msg *)icmp6_hdr(skb);

	if (ipv6_addr_is_multicast(&msg->dest)) {
		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
		return;
	}

	/* target == dest means the destination itself is on-link */
	on_link = 0;
	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
		on_link = 1;
	} else if (ipv6_addr_type(&msg->target) !=
		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
		return;
	}

	in6_dev = __in6_dev_get(skb->dev);
	if (!in6_dev)
		return;
	/* routers and interfaces with accept_redirects off ignore these */
	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
		return;

	/* RFC2461 8.1:
	 *	The IP source address of the Redirect MUST be the same as the current
	 *	first-hop router for the specified ICMP Destination Address.
	 */

	if (!ndisc_parse_options(msg->opt, optlen, &ndopts)) {
		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
		return;
	}

	lladdr = NULL;
	if (ndopts.nd_opts_tgt_lladdr) {
		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
					     skb->dev);
		if (!lladdr) {
			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
			return;
		}
	}

	rt = (struct rt6_info *) dst;
	if (rt->rt6i_flags & RTF_REJECT) {
		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
		return;
	}

	/* Redirect received -> path was valid.
	 * Look, redirects are sent only in response to data packets,
	 * so that this nexthop apparently is reachable. --ANK
	 */
	dst_confirm(&rt->dst);

	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
	if (!neigh)
		return;

	/*
	 *	We have finally decided to accept it.
	 */

	neigh_update(neigh, lladdr, NUD_STALE,
		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
		     NEIGH_UPDATE_F_OVERRIDE|
		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
				     NEIGH_UPDATE_F_ISROUTER))
		     );

	nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
	if (!nrt)
		goto out;

	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
	if (on_link)
		nrt->rt6i_flags &= ~RTF_GATEWAY;

	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;

	if (ip6_ins_rt(nrt))
		goto out;

	netevent.old = &rt->dst;
	netevent.new = &nrt->dst;
	netevent.daddr = &msg->dest;
	netevent.neigh = neigh;
	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);

	/* the new clone supersedes any old cache entry */
	if (rt->rt6i_flags & RTF_CACHE) {
		rt = (struct rt6_info *) dst_clone(&rt->dst);
		ip6_del_rt(rt);
	}

out:
	neigh_release(neigh);
}
2222 
2223 /*
2224  *	Misc support functions
2225  */
2226 
/* Link @rt to its parent @from: take a reference on the parent's dst,
 * record it in rt->dst.from and share the parent's (read-only)
 * metrics.  @from must not itself be derived from another route.
 */
static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
{
	BUG_ON(from->dst.from);

	/* derived routes inherit the parent's lifetime */
	rt->rt6i_flags &= ~RTF_EXPIRES;
	dst_hold(&from->dst);
	rt->dst.from = &from->dst;
	dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
}
2236 
/* Initialize @rt as a copy of @ort: duplicate handlers, addressing,
 * flags and table linkage, taking references on the shared idev and
 * lwtunnel state, and tie @rt to @ort via rt6_set_from().
 */
static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
{
	rt->dst.input = ort->dst.input;
	rt->dst.output = ort->dst.output;
	rt->rt6i_dst = ort->rt6i_dst;
	rt->dst.error = ort->dst.error;
	rt->rt6i_idev = ort->rt6i_idev;
	if (rt->rt6i_idev)
		in6_dev_hold(rt->rt6i_idev);
	rt->dst.lastuse = jiffies;
	rt->rt6i_gateway = ort->rt6i_gateway;
	rt->rt6i_flags = ort->rt6i_flags;
	rt6_set_from(rt, ort);
	rt->rt6i_metric = ort->rt6i_metric;
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->rt6i_src;
#endif
	rt->rt6i_prefsrc = ort->rt6i_prefsrc;
	rt->rt6i_table = ort->rt6i_table;
	rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
}
2258 
2259 #ifdef CONFIG_IPV6_ROUTE_INFO
2260 static struct rt6_info *rt6_get_route_info(struct net *net,
2261 					   const struct in6_addr *prefix, int prefixlen,
2262 					   const struct in6_addr *gwaddr, int ifindex)
2263 {
2264 	struct fib6_node *fn;
2265 	struct rt6_info *rt = NULL;
2266 	struct fib6_table *table;
2267 
2268 	table = fib6_get_table(net, RT6_TABLE_INFO);
2269 	if (!table)
2270 		return NULL;
2271 
2272 	read_lock_bh(&table->tb6_lock);
2273 	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0);
2274 	if (!fn)
2275 		goto out;
2276 
2277 	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2278 		if (rt->dst.dev->ifindex != ifindex)
2279 			continue;
2280 		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
2281 			continue;
2282 		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
2283 			continue;
2284 		dst_hold(&rt->dst);
2285 		break;
2286 	}
2287 out:
2288 	read_unlock_bh(&table->tb6_lock);
2289 	return rt;
2290 }
2291 
2292 static struct rt6_info *rt6_add_route_info(struct net *net,
2293 					   const struct in6_addr *prefix, int prefixlen,
2294 					   const struct in6_addr *gwaddr, int ifindex,
2295 					   unsigned int pref)
2296 {
2297 	struct fib6_config cfg = {
2298 		.fc_metric	= IP6_RT_PRIO_USER,
2299 		.fc_ifindex	= ifindex,
2300 		.fc_dst_len	= prefixlen,
2301 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
2302 				  RTF_UP | RTF_PREF(pref),
2303 		.fc_nlinfo.portid = 0,
2304 		.fc_nlinfo.nlh = NULL,
2305 		.fc_nlinfo.nl_net = net,
2306 	};
2307 
2308 	cfg.fc_table = l3mdev_fib_table_by_index(net, ifindex) ? : RT6_TABLE_INFO;
2309 	cfg.fc_dst = *prefix;
2310 	cfg.fc_gateway = *gwaddr;
2311 
2312 	/* We should treat it as a default route if prefix length is 0. */
2313 	if (!prefixlen)
2314 		cfg.fc_flags |= RTF_DEFAULT;
2315 
2316 	ip6_route_add(&cfg);
2317 
2318 	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
2319 }
2320 #endif
2321 
2322 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
2323 {
2324 	struct rt6_info *rt;
2325 	struct fib6_table *table;
2326 
2327 	table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
2328 	if (!table)
2329 		return NULL;
2330 
2331 	read_lock_bh(&table->tb6_lock);
2332 	for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2333 		if (dev == rt->dst.dev &&
2334 		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
2335 		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
2336 			break;
2337 	}
2338 	if (rt)
2339 		dst_hold(&rt->dst);
2340 	read_unlock_bh(&table->tb6_lock);
2341 	return rt;
2342 }
2343 
2344 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
2345 				     struct net_device *dev,
2346 				     unsigned int pref)
2347 {
2348 	struct fib6_config cfg = {
2349 		.fc_table	= l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
2350 		.fc_metric	= IP6_RT_PRIO_USER,
2351 		.fc_ifindex	= dev->ifindex,
2352 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
2353 				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
2354 		.fc_nlinfo.portid = 0,
2355 		.fc_nlinfo.nlh = NULL,
2356 		.fc_nlinfo.nl_net = dev_net(dev),
2357 	};
2358 
2359 	cfg.fc_gateway = *gwaddr;
2360 
2361 	ip6_route_add(&cfg);
2362 
2363 	return rt6_get_dflt_router(gwaddr, dev);
2364 }
2365 
/* Delete all RA-learned routes from the default-router table, except on
 * interfaces configured with accept_ra == 2 (accept RAs even when
 * forwarding).
 *
 * NOTE(review): only RT6_TABLE_DFLT is scanned; default routes placed in
 * an l3mdev (VRF) table by rt6_add_dflt_router() would not be purged
 * here — confirm whether that matters for the VRF configuration in use.
 */
void rt6_purge_dflt_routers(struct net *net)
{
	struct rt6_info *rt;
	struct fib6_table *table;

	/* NOTE: Keep consistent with rt6_get_dflt_router */
	table = fib6_get_table(net, RT6_TABLE_DFLT);
	if (!table)
		return;

restart:
	read_lock_bh(&table->tb6_lock);
	for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
		    (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
			/* ip6_del_rt() takes the table lock itself, so
			 * drop it, delete, and rescan from the start.
			 */
			dst_hold(&rt->dst);
			read_unlock_bh(&table->tb6_lock);
			ip6_del_rt(rt);
			goto restart;
		}
	}
	read_unlock_bh(&table->tb6_lock);
}
2389 
2390 static void rtmsg_to_fib6_config(struct net *net,
2391 				 struct in6_rtmsg *rtmsg,
2392 				 struct fib6_config *cfg)
2393 {
2394 	memset(cfg, 0, sizeof(*cfg));
2395 
2396 	cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
2397 			 : RT6_TABLE_MAIN;
2398 	cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
2399 	cfg->fc_metric = rtmsg->rtmsg_metric;
2400 	cfg->fc_expires = rtmsg->rtmsg_info;
2401 	cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
2402 	cfg->fc_src_len = rtmsg->rtmsg_src_len;
2403 	cfg->fc_flags = rtmsg->rtmsg_flags;
2404 
2405 	cfg->fc_nlinfo.nl_net = net;
2406 
2407 	cfg->fc_dst = rtmsg->rtmsg_dst;
2408 	cfg->fc_src = rtmsg->rtmsg_src;
2409 	cfg->fc_gateway = rtmsg->rtmsg_gateway;
2410 }
2411 
2412 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
2413 {
2414 	struct fib6_config cfg;
2415 	struct in6_rtmsg rtmsg;
2416 	int err;
2417 
2418 	switch (cmd) {
2419 	case SIOCADDRT:		/* Add a route */
2420 	case SIOCDELRT:		/* Delete a route */
2421 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
2422 			return -EPERM;
2423 		err = copy_from_user(&rtmsg, arg,
2424 				     sizeof(struct in6_rtmsg));
2425 		if (err)
2426 			return -EFAULT;
2427 
2428 		rtmsg_to_fib6_config(net, &rtmsg, &cfg);
2429 
2430 		rtnl_lock();
2431 		switch (cmd) {
2432 		case SIOCADDRT:
2433 			err = ip6_route_add(&cfg);
2434 			break;
2435 		case SIOCDELRT:
2436 			err = ip6_route_del(&cfg);
2437 			break;
2438 		default:
2439 			err = -EINVAL;
2440 		}
2441 		rtnl_unlock();
2442 
2443 		return err;
2444 	}
2445 
2446 	return -EINVAL;
2447 }
2448 
2449 /*
2450  *	Drop the packet on the floor
2451  */
2452 
/* Drop @skb, bump the appropriate SNMP counter and send an ICMPv6
 * destination-unreachable with the given @code back to the sender.
 * Always consumes the skb and returns 0.
 */
static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
{
	int type;
	struct dst_entry *dst = skb_dst(skb);
	switch (ipstats_mib_noroutes) {
	case IPSTATS_MIB_INNOROUTES:
		/* Packets to the unspecified address are accounted as
		 * address errors, not routing failures.
		 */
		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
		if (type == IPV6_ADDR_ANY) {
			IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
				      IPSTATS_MIB_INADDRERRORS);
			break;
		}
		/* FALLTHROUGH */
	case IPSTATS_MIB_OUTNOROUTES:
		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
			      ipstats_mib_noroutes);
		break;
	}
	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
	kfree_skb(skb);
	return 0;
}
2475 
/* dst.input handler for blackhole routes on the input path. */
static int ip6_pkt_discard(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}
2480 
/* dst.output handler for blackhole routes on the output path. */
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
}
2486 
/* dst.input handler for prohibit routes on the input path. */
static int ip6_pkt_prohibit(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}
2491 
/* dst.output handler for prohibit routes on the output path. */
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
}
2497 
2498 /*
2499  *	Allocate a dst for local (unicast / anycast) address.
2500  */
2501 
/* Allocate a host (/128) route for a local unicast or anycast address.
 *
 * The route's device is the loopback device; @idev identifies the
 * interface the address belongs to.  Returns the new rt6_info with one
 * reference held, or ERR_PTR(-ENOMEM).
 */
struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
				    const struct in6_addr *addr,
				    bool anycast)
{
	u32 tb_id;
	struct net *net = dev_net(idev->dev);
	struct rt6_info *rt = ip6_dst_alloc(net, net->loopback_dev,
					    DST_NOCOUNT);
	if (!rt)
		return ERR_PTR(-ENOMEM);

	/* Reference dropped when the route is destroyed. */
	in6_dev_hold(idev);

	rt->dst.flags |= DST_HOST;
	rt->dst.input = ip6_input;
	rt->dst.output = ip6_output;
	rt->rt6i_idev = idev;

	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
	if (anycast)
		rt->rt6i_flags |= RTF_ANYCAST;
	else
		rt->rt6i_flags |= RTF_LOCAL;

	rt->rt6i_gateway  = *addr;
	rt->rt6i_dst.addr = *addr;
	rt->rt6i_dst.plen = 128;
	/* Local routes go into the VRF table when the device is enslaved. */
	tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
	rt->rt6i_table = fib6_get_table(net, tb_id);
	rt->dst.flags |= DST_NOCACHE;

	/* Caller's reference. */
	atomic_set(&rt->dst.__refcnt, 1);

	return rt;
}
2537 
2538 int ip6_route_get_saddr(struct net *net,
2539 			struct rt6_info *rt,
2540 			const struct in6_addr *daddr,
2541 			unsigned int prefs,
2542 			struct in6_addr *saddr)
2543 {
2544 	struct inet6_dev *idev =
2545 		rt ? ip6_dst_idev((struct dst_entry *)rt) : NULL;
2546 	int err = 0;
2547 	if (rt && rt->rt6i_prefsrc.plen)
2548 		*saddr = rt->rt6i_prefsrc.addr;
2549 	else
2550 		err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2551 					 daddr, prefs, saddr);
2552 	return err;
2553 }
2554 
2555 /* remove deleted ip from prefsrc entries */
struct arg_dev_net_ip {
	struct net_device *dev;	/* only match routes on this dev (NULL: any) */
	struct net *net;	/* namespace being walked */
	struct in6_addr *addr;	/* preferred-source address being removed */
};
2561 
2562 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2563 {
2564 	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2565 	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2566 	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2567 
2568 	if (((void *)rt->dst.dev == dev || !dev) &&
2569 	    rt != net->ipv6.ip6_null_entry &&
2570 	    ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2571 		/* remove prefsrc entry */
2572 		rt->rt6i_prefsrc.plen = 0;
2573 	}
2574 	return 0;
2575 }
2576 
2577 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2578 {
2579 	struct net *net = dev_net(ifp->idev->dev);
2580 	struct arg_dev_net_ip adni = {
2581 		.dev = ifp->idev->dev,
2582 		.net = net,
2583 		.addr = &ifp->addr,
2584 	};
2585 	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
2586 }
2587 
2588 #define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
2589 #define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)
2590 
2591 /* Remove routers and update dst entries when gateway turn into host. */
2592 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
2593 {
2594 	struct in6_addr *gateway = (struct in6_addr *)arg;
2595 
2596 	if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) ||
2597 	     ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) &&
2598 	     ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
2599 		return -1;
2600 	}
2601 	return 0;
2602 }
2603 
/* Clean up routes through @gateway when it stops acting as a router;
 * entries flagged by fib6_clean_tohost() are removed from the tables.
 */
void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
{
	fib6_clean_all(net, fib6_clean_tohost, gateway);
}
2608 
/* Argument bundle for the fib6_ifdown() walk. */
struct arg_dev_net {
	struct net_device *dev;	/* device going down (NULL: match all) */
	struct net *net;	/* namespace being walked */
};
2613 
2614 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2615 {
2616 	const struct arg_dev_net *adn = arg;
2617 	const struct net_device *dev = adn->dev;
2618 
2619 	if ((rt->dst.dev == dev || !dev) &&
2620 	    rt != adn->net->ipv6.ip6_null_entry)
2621 		return -1;
2622 
2623 	return 0;
2624 }
2625 
/* Remove all routes referencing @dev (all devices when @dev is NULL)
 * from the fib trees, the ICMP rate-limit dst cache, and the uncached
 * route list.
 */
void rt6_ifdown(struct net *net, struct net_device *dev)
{
	struct arg_dev_net adn = {
		.dev = dev,
		.net = net,
	};

	fib6_clean_all(net, fib6_ifdown, &adn);
	icmp6_clean_all(fib6_ifdown, &adn);
	if (dev)
		rt6_uncached_list_flush_dev(net, dev);
}
2638 
/* Argument bundle for the rt6_mtu_change_route() walk. */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* new device MTU */
};
2643 
/* fib6_clean_all() callback: propagate a device MTU change into route
 * MTU metrics where appropriate.  Always returns 0 (routes are kept).
 */
static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
{
	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
	struct inet6_dev *idev;

	/* In IPv6 pmtu discovery is not optional,
	   so that RTAX_MTU lock cannot disable it.
	   We still use this lock to block changes
	   caused by addrconf/ndisc.
	*/

	idev = __in6_dev_get(arg->dev);
	if (!idev)
		return 0;

	/* For administrative MTU increase, there is no way to discover
	   IPv6 PMTU increase, so PMTU increase should be updated here.
	   Since RFC 1981 doesn't include administrative MTU increase
	   update PMTU increase is a MUST. (i.e. jumbo frame)
	 */
	/*
	   If new MTU is less than route PMTU, this new MTU will be the
	   lowest MTU in the path, update the route PMTU to reflect PMTU
	   decreases; if new MTU is greater than route PMTU, and the
	   old MTU is the lowest MTU in the path, update the route PMTU
	   to reflect the increase. In this case if the other nodes' MTU
	   also have the lowest MTU, TOO BIG MESSAGE will be lead to
	   PMTU discovery.
	 */
	if (rt->dst.dev == arg->dev &&
	    !dst_metric_locked(&rt->dst, RTAX_MTU)) {
		if (rt->rt6i_flags & RTF_CACHE) {
			/* For RTF_CACHE with rt6i_pmtu == 0
			 * (i.e. a redirected route),
			 * the metrics of its rt->dst.from has already
			 * been updated.
			 */
			if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu)
				rt->rt6i_pmtu = arg->mtu;
		} else if (dst_mtu(&rt->dst) >= arg->mtu ||
			   (dst_mtu(&rt->dst) < arg->mtu &&
			    dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
			dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
		}
	}
	return 0;
}
2691 
/* Walk all routes in @dev's namespace and update their MTU metrics after
 * @dev's MTU changed to @mtu.
 */
void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
{
	struct rt6_mtu_change_arg arg = {
		.dev = dev,
		.mtu = mtu,
	};

	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
}
2701 
/* Netlink attribute validation policy for RTM_{NEW,DEL,GET}ROUTE. */
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
	[RTA_OIF]               = { .type = NLA_U32 },
	[RTA_IIF]		= { .type = NLA_U32 },
	[RTA_PRIORITY]          = { .type = NLA_U32 },
	[RTA_METRICS]           = { .type = NLA_NESTED },
	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
	[RTA_PREF]              = { .type = NLA_U8 },
	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
	[RTA_ENCAP]		= { .type = NLA_NESTED },
};
2713 
/* Parse an RTM_NEWROUTE/RTM_DELROUTE message into a fib6_config.
 *
 * Returns 0 on success, or a negative errno on a malformed message.
 * Note: fc_mx/fc_mp point into the netlink message; @skb must stay
 * alive while *cfg is used.
 */
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct fib6_config *cfg)
{
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	unsigned int pref;
	int err;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	rtm = nlmsg_data(nlh);
	memset(cfg, 0, sizeof(*cfg));

	cfg->fc_table = rtm->rtm_table;
	cfg->fc_dst_len = rtm->rtm_dst_len;
	cfg->fc_src_len = rtm->rtm_src_len;
	cfg->fc_flags = RTF_UP;
	cfg->fc_protocol = rtm->rtm_protocol;
	cfg->fc_type = rtm->rtm_type;

	/* Reject-style route types all map to RTF_REJECT; the specific
	 * type is kept in fc_type.
	 */
	if (rtm->rtm_type == RTN_UNREACHABLE ||
	    rtm->rtm_type == RTN_BLACKHOLE ||
	    rtm->rtm_type == RTN_PROHIBIT ||
	    rtm->rtm_type == RTN_THROW)
		cfg->fc_flags |= RTF_REJECT;

	if (rtm->rtm_type == RTN_LOCAL)
		cfg->fc_flags |= RTF_LOCAL;

	if (rtm->rtm_flags & RTM_F_CLONED)
		cfg->fc_flags |= RTF_CACHE;

	cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
	cfg->fc_nlinfo.nlh = nlh;
	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);

	if (tb[RTA_GATEWAY]) {
		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
		cfg->fc_flags |= RTF_GATEWAY;
	}

	if (tb[RTA_DST]) {
		/* Only rtm_dst_len bits of the address are significant. */
		int plen = (rtm->rtm_dst_len + 7) >> 3;

		if (nla_len(tb[RTA_DST]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
	}

	if (tb[RTA_SRC]) {
		int plen = (rtm->rtm_src_len + 7) >> 3;

		if (nla_len(tb[RTA_SRC]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
	}

	if (tb[RTA_PREFSRC])
		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);

	if (tb[RTA_OIF])
		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_PRIORITY])
		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

	if (tb[RTA_METRICS]) {
		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
	}

	if (tb[RTA_TABLE])
		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

	if (tb[RTA_MULTIPATH]) {
		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
	}

	if (tb[RTA_PREF]) {
		/* Unknown preference values fall back to medium. */
		pref = nla_get_u8(tb[RTA_PREF]);
		if (pref != ICMPV6_ROUTER_PREF_LOW &&
		    pref != ICMPV6_ROUTER_PREF_HIGH)
			pref = ICMPV6_ROUTER_PREF_MEDIUM;
		cfg->fc_flags |= RTF_PREF(pref);
	}

	if (tb[RTA_ENCAP])
		cfg->fc_encap = tb[RTA_ENCAP];

	if (tb[RTA_ENCAP_TYPE])
		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);

	err = 0;
errout:
	return err;
}
2816 
/* One parsed nexthop of an RTA_MULTIPATH request. */
struct rt6_nh {
	struct rt6_info *rt6_info;	/* route built for this nexthop */
	struct fib6_config r_cfg;	/* per-nexthop config copy */
	struct mx6_config mxc;		/* converted metrics */
	struct list_head next;		/* link in rt6_nh_list */
};
2823 
2824 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
2825 {
2826 	struct rt6_nh *nh;
2827 
2828 	list_for_each_entry(nh, rt6_nh_list, next) {
2829 		pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6 nexthop %pI6 ifi %d\n",
2830 		        &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
2831 		        nh->r_cfg.fc_ifindex);
2832 	}
2833 }
2834 
/* Append nexthop @rt to @rt6_nh_list, converting the config's metrics.
 *
 * Returns 0 on success, -EEXIST when an equivalent nexthop (same device,
 * idev and gateway) is already on the list, or -ENOMEM / metric
 * conversion errors.  On success the list takes over @rt.
 */
static int ip6_route_info_append(struct list_head *rt6_nh_list,
				 struct rt6_info *rt, struct fib6_config *r_cfg)
{
	struct rt6_nh *nh;
	struct rt6_info *rtnh;
	int err = -EEXIST;

	list_for_each_entry(nh, rt6_nh_list, next) {
		/* check if rt6_info already exists */
		rtnh = nh->rt6_info;

		if (rtnh->dst.dev == rt->dst.dev &&
		    rtnh->rt6i_idev == rt->rt6i_idev &&
		    ipv6_addr_equal(&rtnh->rt6i_gateway,
				    &rt->rt6i_gateway))
			return err;
	}

	nh = kzalloc(sizeof(*nh), GFP_KERNEL);
	if (!nh)
		return -ENOMEM;
	nh->rt6_info = rt;
	err = ip6_convert_metrics(&nh->mxc, r_cfg);
	if (err) {
		kfree(nh);
		return err;
	}
	memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
	list_add_tail(&nh->next, rt6_nh_list);

	return 0;
}
2867 
/* Add (or replace) a multipath route: build one rt6_info per nexthop in
 * RTA_MULTIPATH, then insert them one by one.  On a mid-insert failure,
 * the nexthops already installed are deleted again.
 *
 * Returns 0 on success or a negative errno.
 */
static int ip6_route_multipath_add(struct fib6_config *cfg)
{
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	struct rt6_info *rt;
	struct rt6_nh *err_nh;
	struct rt6_nh *nh, *nh_safe;
	int remaining;
	int attrlen;
	int err = 1;
	int nhn = 0;
	int replace = (cfg->fc_nlinfo.nlh &&
		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
	LIST_HEAD(rt6_nh_list);

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
	 * rt6_info structs per nexthop
	 */
	while (rtnh_ok(rtnh, remaining)) {
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				r_cfg.fc_gateway = nla_get_in6_addr(nla);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
			if (nla)
				r_cfg.fc_encap_type = nla_get_u16(nla);
		}

		rt = ip6_route_info_create(&r_cfg);
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
			rt = NULL;
			goto cleanup;
		}

		err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
		if (err) {
			dst_free(&rt->dst);
			goto cleanup;
		}

		rtnh = rtnh_next(rtnh, &remaining);
	}

	err_nh = NULL;
	list_for_each_entry(nh, &rt6_nh_list, next) {
		err = __ip6_ins_rt(nh->rt6_info, &cfg->fc_nlinfo, &nh->mxc);
		/* nh->rt6_info is used or freed at this point, reset to NULL*/
		nh->rt6_info = NULL;
		if (err) {
			if (replace && nhn)
				ip6_print_replace_route_err(&rt6_nh_list);
			err_nh = nh;
			goto add_errout;
		}

		/* Because each route is added like a single route we remove
		 * these flags after the first nexthop: if there is a collision,
		 * we have already failed to add the first nexthop:
		 * fib6_add_rt2node() has rejected it; when replacing, old
		 * nexthops have been replaced by first new, the rest should
		 * be added to it.
		 */
		/* NOTE(review): dereferences fc_nlinfo.nlh unconditionally;
		 * holds for the netlink path (rtm_to_fib6_config() sets it)
		 * — confirm for any other callers.
		 */
		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
						     NLM_F_REPLACE);
		nhn++;
	}

	goto cleanup;

add_errout:
	/* Delete routes that were already added */
	list_for_each_entry(nh, &rt6_nh_list, next) {
		if (err_nh == nh)
			break;
		ip6_route_del(&nh->r_cfg);
	}

cleanup:
	/* Free any rt6_info not consumed by insertion, plus list nodes. */
	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
		if (nh->rt6_info)
			dst_free(&nh->rt6_info->dst);
		kfree(nh->mxc.mx);
		list_del(&nh->next);
		kfree(nh);
	}

	return err;
}
2970 
/* Delete each nexthop of a multipath route individually.  All nexthops
 * are attempted; the last failing errno (or 0) is returned.
 */
static int ip6_route_multipath_del(struct fib6_config *cfg)
{
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	int remaining;
	int attrlen;
	int err = 1, last_err = 0;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry */
	while (rtnh_ok(rtnh, remaining)) {
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				nla_memcpy(&r_cfg.fc_gateway, nla, 16);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
		}
		err = ip6_route_del(&r_cfg);
		if (err)
			last_err = err;

		rtnh = rtnh_next(rtnh, &remaining);
	}

	return last_err;
}
3007 
3008 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh)
3009 {
3010 	struct fib6_config cfg;
3011 	int err;
3012 
3013 	err = rtm_to_fib6_config(skb, nlh, &cfg);
3014 	if (err < 0)
3015 		return err;
3016 
3017 	if (cfg.fc_mp)
3018 		return ip6_route_multipath_del(&cfg);
3019 	else
3020 		return ip6_route_del(&cfg);
3021 }
3022 
3023 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
3024 {
3025 	struct fib6_config cfg;
3026 	int err;
3027 
3028 	err = rtm_to_fib6_config(skb, nlh, &cfg);
3029 	if (err < 0)
3030 		return err;
3031 
3032 	if (cfg.fc_mp)
3033 		return ip6_route_multipath_add(&cfg);
3034 	else
3035 		return ip6_route_add(&cfg);
3036 }
3037 
/* Worst-case netlink message size for one route notification; must cover
 * everything rt6_fill_node() may emit.
 */
static inline size_t rt6_nlmsg_size(struct rt6_info *rt)
{
	return NLMSG_ALIGN(sizeof(struct rtmsg))
	       + nla_total_size(16) /* RTA_SRC */
	       + nla_total_size(16) /* RTA_DST */
	       + nla_total_size(16) /* RTA_GATEWAY */
	       + nla_total_size(16) /* RTA_PREFSRC */
	       + nla_total_size(4) /* RTA_TABLE */
	       + nla_total_size(4) /* RTA_IIF */
	       + nla_total_size(4) /* RTA_OIF */
	       + nla_total_size(4) /* RTA_PRIORITY */
	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
	       + nla_total_size(sizeof(struct rta_cacheinfo))
	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
	       + nla_total_size(1) /* RTA_PREF */
	       + lwtunnel_get_encap_size(rt->dst.lwtstate);
}
3055 
/* Build one RTM_NEWROUTE/RTM_DELROUTE message for @rt into @skb.
 *
 * @dst/@src: addresses from a RTM_GETROUTE query (NULL for dumps).
 * @iif: input interface of the query, 0 for none.
 * @prefix: when set, only prefix routes are reported (returns 1 for
 *	    non-prefix routes — counted as success, nothing emitted).
 * @nowait: passed to ip6mr_get_route() for multicast resolution.
 *
 * Returns 0 on success, 1 when filtered out, -EMSGSIZE when @skb is
 * too small (message is cancelled).
 */
static int rt6_fill_node(struct net *net,
			 struct sk_buff *skb, struct rt6_info *rt,
			 struct in6_addr *dst, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 int prefix, int nowait, unsigned int flags)
{
	u32 metrics[RTAX_MAX];
	struct rtmsg *rtm;
	struct nlmsghdr *nlh;
	long expires;
	u32 table;

	if (prefix) {	/* user wants prefix routes only */
		if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
			/* success since this is not a prefix route */
			return 1;
		}
	}

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt->rt6i_dst.plen;
	rtm->rtm_src_len = rt->rt6i_src.plen;
	rtm->rtm_tos = 0;
	if (rt->rt6i_table)
		table = rt->rt6i_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	rtm->rtm_table = table;
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;
	/* Map the route's flags/error back to a netlink route type. */
	if (rt->rt6i_flags & RTF_REJECT) {
		switch (rt->dst.error) {
		case -EINVAL:
			rtm->rtm_type = RTN_BLACKHOLE;
			break;
		case -EACCES:
			rtm->rtm_type = RTN_PROHIBIT;
			break;
		case -EAGAIN:
			rtm->rtm_type = RTN_THROW;
			break;
		default:
			rtm->rtm_type = RTN_UNREACHABLE;
			break;
		}
	}
	else if (rt->rt6i_flags & RTF_LOCAL)
		rtm->rtm_type = RTN_LOCAL;
	else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
		rtm->rtm_type = RTN_LOCAL;
	else
		rtm->rtm_type = RTN_UNICAST;
	rtm->rtm_flags = 0;
	if (!netif_carrier_ok(rt->dst.dev)) {
		rtm->rtm_flags |= RTNH_F_LINKDOWN;
		if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
			rtm->rtm_flags |= RTNH_F_DEAD;
	}
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->rt6i_protocol;
	if (rt->rt6i_flags & RTF_DYNAMIC)
		rtm->rtm_protocol = RTPROT_REDIRECT;
	else if (rt->rt6i_flags & RTF_ADDRCONF) {
		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ROUTEINFO))
			rtm->rtm_protocol = RTPROT_RA;
		else
			rtm->rtm_protocol = RTPROT_KERNEL;
	}

	if (rt->rt6i_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	if (dst) {
		if (nla_put_in6_addr(skb, RTA_DST, dst))
			goto nla_put_failure;
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
			goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		if (nla_put_in6_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
		goto nla_put_failure;
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		/* Multicast destinations are resolved via the mroute code. */
		if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
			int err = ip6mr_get_route(net, skb, rtm, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
				}
			}
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
	} else if (dst) {
		struct in6_addr saddr_buf;
		if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	if (rt->rt6i_prefsrc.plen) {
		struct in6_addr saddr_buf;
		saddr_buf = rt->rt6i_prefsrc.addr;
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	/* A cached per-destination PMTU overrides the MTU metric. */
	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
	if (rt->rt6i_pmtu)
		metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
	if (rtnetlink_put_metrics(skb, metrics) < 0)
		goto nla_put_failure;

	if (rt->rt6i_flags & RTF_GATEWAY) {
		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
			goto nla_put_failure;
	}

	if (rt->dst.dev &&
	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
		goto nla_put_failure;
	if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
		goto nla_put_failure;

	expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;

	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
		goto nla_put_failure;

	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
		goto nla_put_failure;

	lwtunnel_fill_encap(skb, rt->dst.lwtstate);

	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
3215 
3216 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
3217 {
3218 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
3219 	int prefix;
3220 
3221 	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
3222 		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
3223 		prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
3224 	} else
3225 		prefix = 0;
3226 
3227 	return rt6_fill_node(arg->net,
3228 		     arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
3229 		     NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
3230 		     prefix, 0, NLM_F_MULTI);
3231 }
3232 
/* RTM_GETROUTE handler: resolve the route for the queried flow (input
 * path when RTA_IIF is given, output path otherwise) and unicast the
 * answer back to the requester.
 */
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	struct rt6_info *rt;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi6 fl6;
	int err, iif = 0, oif = 0;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	memset(&fl6, 0, sizeof(fl6));

	if (tb[RTA_SRC]) {
		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
			goto errout;

		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
	}

	if (tb[RTA_DST]) {
		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
			goto errout;

		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
	}

	if (tb[RTA_IIF])
		iif = nla_get_u32(tb[RTA_IIF]);

	if (tb[RTA_OIF])
		oif = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_MARK])
		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);

	if (iif) {
		/* Simulate reception on the given interface. */
		struct net_device *dev;
		int flags = 0;

		dev = __dev_get_by_index(net, iif);
		if (!dev) {
			err = -ENODEV;
			goto errout;
		}

		fl6.flowi6_iif = iif;

		if (!ipv6_addr_any(&fl6.saddr))
			flags |= RT6_LOOKUP_F_HAS_SADDR;

		rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6,
							       flags);
	} else {
		fl6.flowi6_oif = oif;

		/* Queries on an L3 master device (VRF) are steered into
		 * the device's table.
		 */
		if (netif_index_is_l3_master(net, oif)) {
			fl6.flowi6_flags = FLOWI_FLAG_L3MDEV_SRC |
					   FLOWI_FLAG_SKIP_NH_OIF;
		}

		rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
	}

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb) {
		ip6_rt_put(rt);
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers, this skb can pass
	   through good chunk of routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));

	/* skb now owns the route reference; kfree_skb releases it. */
	skb_dst_set(skb, &rt->dst);

	err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
			    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
			    nlh->nlmsg_seq, 0, 0, 0);
	if (err < 0) {
		kfree_skb(skb);
		goto errout;
	}

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;
}
3328 
3329 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
3330 		     unsigned int nlm_flags)
3331 {
3332 	struct sk_buff *skb;
3333 	struct net *net = info->nl_net;
3334 	u32 seq;
3335 	int err;
3336 
3337 	err = -ENOBUFS;
3338 	seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3339 
3340 	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3341 	if (!skb)
3342 		goto errout;
3343 
3344 	err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
3345 				event, info->portid, seq, 0, 0, nlm_flags);
3346 	if (err < 0) {
3347 		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
3348 		WARN_ON(err == -EMSGSIZE);
3349 		kfree_skb(skb);
3350 		goto errout;
3351 	}
3352 	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3353 		    info->nlh, gfp_any());
3354 	return;
3355 errout:
3356 	if (err < 0)
3357 		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
3358 }
3359 
3360 static int ip6_route_dev_notify(struct notifier_block *this,
3361 				unsigned long event, void *ptr)
3362 {
3363 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
3364 	struct net *net = dev_net(dev);
3365 
3366 	if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
3367 		net->ipv6.ip6_null_entry->dst.dev = dev;
3368 		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
3369 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3370 		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
3371 		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
3372 		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
3373 		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
3374 #endif
3375 	}
3376 
3377 	return NOTIFY_OK;
3378 }
3379 
3380 /*
3381  *	/proc
3382  */
3383 
3384 #ifdef CONFIG_PROC_FS
3385 
/* File operations for the per-netns "ipv6_route" proc entry created in
 * ip6_route_net_init_late(); seq_file based.
 */
static const struct file_operations ipv6_route_proc_fops = {
	.owner		= THIS_MODULE,
	.open		= ipv6_route_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};
3393 
3394 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
3395 {
3396 	struct net *net = (struct net *)seq->private;
3397 	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
3398 		   net->ipv6.rt6_stats->fib_nodes,
3399 		   net->ipv6.rt6_stats->fib_route_nodes,
3400 		   net->ipv6.rt6_stats->fib_rt_alloc,
3401 		   net->ipv6.rt6_stats->fib_rt_entries,
3402 		   net->ipv6.rt6_stats->fib_rt_cache,
3403 		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
3404 		   net->ipv6.rt6_stats->fib_discarded_routes);
3405 
3406 	return 0;
3407 }
3408 
/* Open handler for /proc/net/rt6_stats: netns-aware single_open. */
static int rt6_stats_seq_open(struct inode *inode, struct file *file)
{
	return single_open_net(inode, file, rt6_stats_seq_show);
}
3413 
/* File operations for the per-netns "rt6_stats" proc entry created in
 * ip6_route_net_init_late().
 */
static const struct file_operations rt6_stats_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt6_stats_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = single_release_net,
};
3421 #endif	/* CONFIG_PROC_FS */
3422 
3423 #ifdef CONFIG_SYSCTL
3424 
3425 static
3426 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
3427 			      void __user *buffer, size_t *lenp, loff_t *ppos)
3428 {
3429 	struct net *net;
3430 	int delay;
3431 	if (!write)
3432 		return -EINVAL;
3433 
3434 	net = (struct net *)ctl->extra1;
3435 	delay = net->ipv6.sysctl.flush_delay;
3436 	proc_dointvec(ctl, write, buffer, lenp, ppos);
3437 	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
3438 	return 0;
3439 }
3440 
/* Template for the per-netns sysctl table under net/ipv6/route/.
 * ipv6_route_sysctl_init() kmemdup()s this array and rewrites each
 * entry's .data pointer by numeric index, so the entry order here and
 * the indices there must stay in sync.  The trailing empty entry is
 * the table terminator.
 */
struct ctl_table ipv6_route_table_template[] = {
	{
		.procname	=	"flush",
		.data		=	&init_net.ipv6.sysctl.flush_delay,
		.maxlen		=	sizeof(int),
		.mode		=	0200,
		.proc_handler	=	ipv6_sysctl_rtcache_flush
	},
	{
		.procname	=	"gc_thresh",
		.data		=	&ip6_dst_ops_template.gc_thresh,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"max_size",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_min_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_timeout",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_elasticity",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"mtu_expires",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"min_adv_mss",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		/* Same backing value as gc_min_interval, exposed in ms. */
		.procname	=	"gc_min_interval_ms",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_ms_jiffies,
	},
	{ }
};
3514 
/* Clone ipv6_route_table_template for namespace @net and repoint each
 * entry's .data at the per-netns storage.  The numeric indices below
 * must match the entry order of ipv6_route_table_template.
 *
 * Returns the new table, or NULL if the allocation failed (the caller
 * must handle NULL).  Caller owns the returned memory.
 */
struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);

	if (table) {
		table[0].data = &net->ipv6.sysctl.flush_delay;
		table[0].extra1 = net;	/* consumed by ipv6_sysctl_rtcache_flush() */
		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;

		/* Don't export sysctls to unprivileged users */
		/* NOTE: NULLing the first procname terminates the table
		 * early, hiding every entry — presumably intentional for
		 * non-init user namespaces.
		 */
		if (net->user_ns != &init_user_ns)
			table[0].procname = NULL;
	}

	return table;
}
3543 #endif
3544 
/* Per-netns setup for the IPv6 routing engine.
 *
 * Copies the dst_ops template into the namespace, clones the special
 * "null" route entry (plus "prohibit" and "blackhole" when
 * CONFIG_IPV6_MULTIPLE_TABLES is set) from the file-scope templates,
 * and seeds the per-netns sysctl defaults.
 *
 * Returns 0 on success or -ENOMEM; failures unwind through the goto
 * chain in reverse order of acquisition.
 */
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	/* Each special entry's dst.path points back at itself and its
	 * dst.ops at this namespace's dst_ops.
	 */
	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_ip6_dst_entries;
	net->ipv6.ip6_null_entry->dst.path =
		(struct dst_entry *)net->ipv6.ip6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.path =
		(struct dst_entry *)net->ipv6.ip6_prohibit_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.path =
		(struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	/* Default sysctl values; exposed per-netns via
	 * ipv6_route_sysctl_init().
	 */
	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}
3616 
/* Per-netns teardown: free the special route entries cloned in
 * ip6_route_net_init() and release the dst entries counter.
 */
static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}
3626 
3627 static int __net_init ip6_route_net_init_late(struct net *net)
3628 {
3629 #ifdef CONFIG_PROC_FS
3630 	proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
3631 	proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
3632 #endif
3633 	return 0;
3634 }
3635 
/* Remove the per-netns proc entries created by
 * ip6_route_net_init_late().
 */
static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}
3643 
/* Core per-netns lifecycle; registered early in ip6_route_init(). */
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};
3648 
3649 static int __net_init ipv6_inetpeer_init(struct net *net)
3650 {
3651 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3652 
3653 	if (!bp)
3654 		return -ENOMEM;
3655 	inet_peer_base_init(bp);
3656 	net->ipv6.peers = bp;
3657 	return 0;
3658 }
3659 
/* Tear down the per-netns inetpeer base: detach it from the namespace
 * first, then invalidate the tree and free the base itself.
 */
static void __net_exit ipv6_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv6.peers;

	net->ipv6.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}
3668 
/* Per-netns inetpeer storage lifecycle. */
static struct pernet_operations ipv6_inetpeer_ops = {
	.init	=	ipv6_inetpeer_init,
	.exit	=	ipv6_inetpeer_exit,
};
3673 
/* Late per-netns lifecycle (proc entries); registered in
 * ip6_route_init() after fib6 rules are set up.
 */
static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};
3678 
/* Netdevice notifier keeping the per-netns special route entries bound
 * to the loopback device; see ip6_route_dev_notify().
 */
static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = 0,
};
3683 
/* Module init for the IPv6 routing subsystem.
 *
 * Ordering matters: slab cache and dst-entry counters first, then the
 * pernet ops, then fib6 / xfrm6 / fib6-rules, then the late pernet
 * ops, netlink handlers and the netdevice notifier.  Errors unwind in
 * reverse through the goto chain.  Returns 0 or a negative errno.
 */
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	/* The blackhole ops share the regular rt6_info slab cache. */
	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	/* Registering of the loopback is done before this portion of code,
	 * the loopback reference in rt6_info will not be taken, do it
	 * manually for init_net */
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #endif
	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	/* rtnetlink handlers for route add/delete/query. */
	ret = -ENOBUFS;
	if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
	    __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
	    __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	/* Initialise the per-cpu uncached-route lists. */
	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

out_register_late_subsys:
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}
3775 
/* Module teardown: undo ip6_route_init() in reverse order. */
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}
3788