xref: /openbmc/linux/net/ipv6/route.c (revision 82003e04)
1 /*
2  *	Linux INET6 implementation
3  *	FIB front-end.
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13 
14 /*	Changes:
15  *
16  *	YOSHIFUJI Hideaki @USAGI
17  *		reworked default router selection.
18  *		- respect outgoing interface
19  *		- select from (probably) reachable routers (i.e.
20  *		routers in REACHABLE, STALE, DELAY or PROBE states).
21  *		- always select the same router if it is (probably)
22  *		reachable.  otherwise, round-robin the list.
23  *	Ville Nuorvala
24  *		Fixed routing subtrees.
25  */
26 
27 #define pr_fmt(fmt) "IPv6: " fmt
28 
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <net/net_namespace.h>
48 #include <net/snmp.h>
49 #include <net/ipv6.h>
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
52 #include <net/ndisc.h>
53 #include <net/addrconf.h>
54 #include <net/tcp.h>
55 #include <linux/rtnetlink.h>
56 #include <net/dst.h>
57 #include <net/dst_metadata.h>
58 #include <net/xfrm.h>
59 #include <net/netevent.h>
60 #include <net/netlink.h>
61 #include <net/nexthop.h>
62 #include <net/lwtunnel.h>
63 #include <net/ip_tunnels.h>
64 #include <net/l3mdev.h>
65 #include <trace/events/fib6.h>
66 
67 #include <asm/uaccess.h>
68 
69 #ifdef CONFIG_SYSCTL
70 #include <linux/sysctl.h>
71 #endif
72 
/* Result codes produced by rt6_check_neigh().  Every failure code is
 * negative, which is what rt6_score_route() relies on when it forwards
 * them to its caller.
 */
enum rt6_nud_state {
	RT6_NUD_SUCCEED = 1,
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_HARD = -3
};
79 
80 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
81 static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
82 static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
83 static unsigned int	 ip6_mtu(const struct dst_entry *dst);
84 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
85 static void		ip6_dst_destroy(struct dst_entry *);
86 static void		ip6_dst_ifdown(struct dst_entry *,
87 				       struct net_device *dev, int how);
88 static int		 ip6_dst_gc(struct dst_ops *ops);
89 
90 static int		ip6_pkt_discard(struct sk_buff *skb);
91 static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
92 static int		ip6_pkt_prohibit(struct sk_buff *skb);
93 static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
94 static void		ip6_link_failure(struct sk_buff *skb);
95 static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
96 					   struct sk_buff *skb, u32 mtu);
97 static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
98 					struct sk_buff *skb);
99 static void		rt6_dst_from_metrics_check(struct rt6_info *rt);
100 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
101 
102 #ifdef CONFIG_IPV6_ROUTE_INFO
103 static struct rt6_info *rt6_add_route_info(struct net *net,
104 					   const struct in6_addr *prefix, int prefixlen,
105 					   const struct in6_addr *gwaddr,
106 					   struct net_device *dev,
107 					   unsigned int pref);
108 static struct rt6_info *rt6_get_route_info(struct net *net,
109 					   const struct in6_addr *prefix, int prefixlen,
110 					   const struct in6_addr *gwaddr,
111 					   struct net_device *dev);
112 #endif
113 
114 struct uncached_list {
115 	spinlock_t		lock;
116 	struct list_head	head;
117 };
118 
119 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
120 
121 static void rt6_uncached_list_add(struct rt6_info *rt)
122 {
123 	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
124 
125 	rt->dst.flags |= DST_NOCACHE;
126 	rt->rt6i_uncached_list = ul;
127 
128 	spin_lock_bh(&ul->lock);
129 	list_add_tail(&rt->rt6i_uncached, &ul->head);
130 	spin_unlock_bh(&ul->lock);
131 }
132 
133 static void rt6_uncached_list_del(struct rt6_info *rt)
134 {
135 	if (!list_empty(&rt->rt6i_uncached)) {
136 		struct uncached_list *ul = rt->rt6i_uncached_list;
137 
138 		spin_lock_bh(&ul->lock);
139 		list_del(&rt->rt6i_uncached);
140 		spin_unlock_bh(&ul->lock);
141 	}
142 }
143 
144 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
145 {
146 	struct net_device *loopback_dev = net->loopback_dev;
147 	int cpu;
148 
149 	if (dev == loopback_dev)
150 		return;
151 
152 	for_each_possible_cpu(cpu) {
153 		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
154 		struct rt6_info *rt;
155 
156 		spin_lock_bh(&ul->lock);
157 		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
158 			struct inet6_dev *rt_idev = rt->rt6i_idev;
159 			struct net_device *rt_dev = rt->dst.dev;
160 
161 			if (rt_idev->dev == dev) {
162 				rt->rt6i_idev = in6_dev_get(loopback_dev);
163 				in6_dev_put(rt_idev);
164 			}
165 
166 			if (rt_dev == dev) {
167 				rt->dst.dev = loopback_dev;
168 				dev_hold(rt->dst.dev);
169 				dev_put(rt_dev);
170 			}
171 		}
172 		spin_unlock_bh(&ul->lock);
173 	}
174 }
175 
176 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
177 {
178 	return dst_metrics_write_ptr(rt->dst.from);
179 }
180 
181 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
182 {
183 	struct rt6_info *rt = (struct rt6_info *)dst;
184 
185 	if (rt->rt6i_flags & RTF_PCPU)
186 		return rt6_pcpu_cow_metrics(rt);
187 	else if (rt->rt6i_flags & RTF_CACHE)
188 		return NULL;
189 	else
190 		return dst_cow_metrics_generic(dst, old);
191 }
192 
193 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
194 					     struct sk_buff *skb,
195 					     const void *daddr)
196 {
197 	struct in6_addr *p = &rt->rt6i_gateway;
198 
199 	if (!ipv6_addr_any(p))
200 		return (const void *) p;
201 	else if (skb)
202 		return &ipv6_hdr(skb)->daddr;
203 	return daddr;
204 }
205 
206 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
207 					  struct sk_buff *skb,
208 					  const void *daddr)
209 {
210 	struct rt6_info *rt = (struct rt6_info *) dst;
211 	struct neighbour *n;
212 
213 	daddr = choose_neigh_daddr(rt, skb, daddr);
214 	n = __ipv6_neigh_lookup(dst->dev, daddr);
215 	if (n)
216 		return n;
217 	return neigh_create(&nd_tbl, daddr, dst->dev);
218 }
219 
220 static struct dst_ops ip6_dst_ops_template = {
221 	.family			=	AF_INET6,
222 	.gc			=	ip6_dst_gc,
223 	.gc_thresh		=	1024,
224 	.check			=	ip6_dst_check,
225 	.default_advmss		=	ip6_default_advmss,
226 	.mtu			=	ip6_mtu,
227 	.cow_metrics		=	ipv6_cow_metrics,
228 	.destroy		=	ip6_dst_destroy,
229 	.ifdown			=	ip6_dst_ifdown,
230 	.negative_advice	=	ip6_negative_advice,
231 	.link_failure		=	ip6_link_failure,
232 	.update_pmtu		=	ip6_rt_update_pmtu,
233 	.redirect		=	rt6_do_redirect,
234 	.local_out		=	__ip6_local_out,
235 	.neigh_lookup		=	ip6_neigh_lookup,
236 };
237 
238 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
239 {
240 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
241 
242 	return mtu ? : dst->dev->mtu;
243 }
244 
245 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
246 					 struct sk_buff *skb, u32 mtu)
247 {
248 }
249 
/* redirect callback for blackhole dsts: intentionally does nothing. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst,
				      struct sock *sk,
				      struct sk_buff *skb)
{
	/* Redirects are ignored here. */
}
254 
255 static struct dst_ops ip6_dst_blackhole_ops = {
256 	.family			=	AF_INET6,
257 	.destroy		=	ip6_dst_destroy,
258 	.check			=	ip6_dst_check,
259 	.mtu			=	ip6_blackhole_mtu,
260 	.default_advmss		=	ip6_default_advmss,
261 	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
262 	.redirect		=	ip6_rt_blackhole_redirect,
263 	.cow_metrics		=	dst_cow_metrics_generic,
264 	.neigh_lookup		=	ip6_neigh_lookup,
265 };
266 
267 static const u32 ip6_template_metrics[RTAX_MAX] = {
268 	[RTAX_HOPLIMIT - 1] = 0,
269 };
270 
271 static const struct rt6_info ip6_null_entry_template = {
272 	.dst = {
273 		.__refcnt	= ATOMIC_INIT(1),
274 		.__use		= 1,
275 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
276 		.error		= -ENETUNREACH,
277 		.input		= ip6_pkt_discard,
278 		.output		= ip6_pkt_discard_out,
279 	},
280 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
281 	.rt6i_protocol  = RTPROT_KERNEL,
282 	.rt6i_metric	= ~(u32) 0,
283 	.rt6i_ref	= ATOMIC_INIT(1),
284 };
285 
286 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
287 
288 static const struct rt6_info ip6_prohibit_entry_template = {
289 	.dst = {
290 		.__refcnt	= ATOMIC_INIT(1),
291 		.__use		= 1,
292 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
293 		.error		= -EACCES,
294 		.input		= ip6_pkt_prohibit,
295 		.output		= ip6_pkt_prohibit_out,
296 	},
297 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
298 	.rt6i_protocol  = RTPROT_KERNEL,
299 	.rt6i_metric	= ~(u32) 0,
300 	.rt6i_ref	= ATOMIC_INIT(1),
301 };
302 
303 static const struct rt6_info ip6_blk_hole_entry_template = {
304 	.dst = {
305 		.__refcnt	= ATOMIC_INIT(1),
306 		.__use		= 1,
307 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
308 		.error		= -EINVAL,
309 		.input		= dst_discard,
310 		.output		= dst_discard_out,
311 	},
312 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
313 	.rt6i_protocol  = RTPROT_KERNEL,
314 	.rt6i_metric	= ~(u32) 0,
315 	.rt6i_ref	= ATOMIC_INIT(1),
316 };
317 
318 #endif
319 
320 static void rt6_info_init(struct rt6_info *rt)
321 {
322 	struct dst_entry *dst = &rt->dst;
323 
324 	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
325 	INIT_LIST_HEAD(&rt->rt6i_siblings);
326 	INIT_LIST_HEAD(&rt->rt6i_uncached);
327 }
328 
329 /* allocate dst with ip6_dst_ops */
330 static struct rt6_info *__ip6_dst_alloc(struct net *net,
331 					struct net_device *dev,
332 					int flags)
333 {
334 	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
335 					0, DST_OBSOLETE_FORCE_CHK, flags);
336 
337 	if (rt)
338 		rt6_info_init(rt);
339 
340 	return rt;
341 }
342 
343 struct rt6_info *ip6_dst_alloc(struct net *net,
344 			       struct net_device *dev,
345 			       int flags)
346 {
347 	struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
348 
349 	if (rt) {
350 		rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
351 		if (rt->rt6i_pcpu) {
352 			int cpu;
353 
354 			for_each_possible_cpu(cpu) {
355 				struct rt6_info **p;
356 
357 				p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
358 				/* no one shares rt */
359 				*p =  NULL;
360 			}
361 		} else {
362 			dst_destroy((struct dst_entry *)rt);
363 			return NULL;
364 		}
365 	}
366 
367 	return rt;
368 }
369 EXPORT_SYMBOL(ip6_dst_alloc);
370 
371 static void ip6_dst_destroy(struct dst_entry *dst)
372 {
373 	struct rt6_info *rt = (struct rt6_info *)dst;
374 	struct dst_entry *from = dst->from;
375 	struct inet6_dev *idev;
376 
377 	dst_destroy_metrics_generic(dst);
378 	free_percpu(rt->rt6i_pcpu);
379 	rt6_uncached_list_del(rt);
380 
381 	idev = rt->rt6i_idev;
382 	if (idev) {
383 		rt->rt6i_idev = NULL;
384 		in6_dev_put(idev);
385 	}
386 
387 	dst->from = NULL;
388 	dst_release(from);
389 }
390 
391 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
392 			   int how)
393 {
394 	struct rt6_info *rt = (struct rt6_info *)dst;
395 	struct inet6_dev *idev = rt->rt6i_idev;
396 	struct net_device *loopback_dev =
397 		dev_net(dev)->loopback_dev;
398 
399 	if (dev != loopback_dev) {
400 		if (idev && idev->dev == dev) {
401 			struct inet6_dev *loopback_idev =
402 				in6_dev_get(loopback_dev);
403 			if (loopback_idev) {
404 				rt->rt6i_idev = loopback_idev;
405 				in6_dev_put(idev);
406 			}
407 		}
408 	}
409 }
410 
411 static bool __rt6_check_expired(const struct rt6_info *rt)
412 {
413 	if (rt->rt6i_flags & RTF_EXPIRES)
414 		return time_after(jiffies, rt->dst.expires);
415 	else
416 		return false;
417 }
418 
419 static bool rt6_check_expired(const struct rt6_info *rt)
420 {
421 	if (rt->rt6i_flags & RTF_EXPIRES) {
422 		if (time_after(jiffies, rt->dst.expires))
423 			return true;
424 	} else if (rt->dst.from) {
425 		return rt6_check_expired((struct rt6_info *) rt->dst.from);
426 	}
427 	return false;
428 }
429 
430 /* Multipath route selection:
431  *   Hash based function using packet header and flowlabel.
432  * Adapted from fib_info_hashfn()
433  */
434 static int rt6_info_hash_nhsfn(unsigned int candidate_count,
435 			       const struct flowi6 *fl6)
436 {
437 	return get_hash_from_flowi6(fl6) % candidate_count;
438 }
439 
440 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
441 					     struct flowi6 *fl6, int oif,
442 					     int strict)
443 {
444 	struct rt6_info *sibling, *next_sibling;
445 	int route_choosen;
446 
447 	route_choosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6);
448 	/* Don't change the route, if route_choosen == 0
449 	 * (siblings does not include ourself)
450 	 */
451 	if (route_choosen)
452 		list_for_each_entry_safe(sibling, next_sibling,
453 				&match->rt6i_siblings, rt6i_siblings) {
454 			route_choosen--;
455 			if (route_choosen == 0) {
456 				if (rt6_score_route(sibling, oif, strict) < 0)
457 					break;
458 				match = sibling;
459 				break;
460 			}
461 		}
462 	return match;
463 }
464 
465 /*
466  *	Route lookup. Any table->tb6_lock is implied.
467  */
468 
469 static inline struct rt6_info *rt6_device_match(struct net *net,
470 						    struct rt6_info *rt,
471 						    const struct in6_addr *saddr,
472 						    int oif,
473 						    int flags)
474 {
475 	struct rt6_info *local = NULL;
476 	struct rt6_info *sprt;
477 
478 	if (!oif && ipv6_addr_any(saddr))
479 		goto out;
480 
481 	for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
482 		struct net_device *dev = sprt->dst.dev;
483 
484 		if (oif) {
485 			if (dev->ifindex == oif)
486 				return sprt;
487 			if (dev->flags & IFF_LOOPBACK) {
488 				if (!sprt->rt6i_idev ||
489 				    sprt->rt6i_idev->dev->ifindex != oif) {
490 					if (flags & RT6_LOOKUP_F_IFACE)
491 						continue;
492 					if (local &&
493 					    local->rt6i_idev->dev->ifindex == oif)
494 						continue;
495 				}
496 				local = sprt;
497 			}
498 		} else {
499 			if (ipv6_chk_addr(net, saddr, dev,
500 					  flags & RT6_LOOKUP_F_IFACE))
501 				return sprt;
502 		}
503 	}
504 
505 	if (oif) {
506 		if (local)
507 			return local;
508 
509 		if (flags & RT6_LOOKUP_F_IFACE)
510 			return net->ipv6.ip6_null_entry;
511 	}
512 out:
513 	return rt;
514 }
515 
516 #ifdef CONFIG_IPV6_ROUTER_PREF
517 struct __rt6_probe_work {
518 	struct work_struct work;
519 	struct in6_addr target;
520 	struct net_device *dev;
521 };
522 
523 static void rt6_probe_deferred(struct work_struct *w)
524 {
525 	struct in6_addr mcaddr;
526 	struct __rt6_probe_work *work =
527 		container_of(w, struct __rt6_probe_work, work);
528 
529 	addrconf_addr_solict_mult(&work->target, &mcaddr);
530 	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL);
531 	dev_put(work->dev);
532 	kfree(work);
533 }
534 
535 static void rt6_probe(struct rt6_info *rt)
536 {
537 	struct __rt6_probe_work *work;
538 	struct neighbour *neigh;
539 	/*
540 	 * Okay, this does not seem to be appropriate
541 	 * for now, however, we need to check if it
542 	 * is really so; aka Router Reachability Probing.
543 	 *
544 	 * Router Reachability Probe MUST be rate-limited
545 	 * to no more than one per minute.
546 	 */
547 	if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
548 		return;
549 	rcu_read_lock_bh();
550 	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
551 	if (neigh) {
552 		if (neigh->nud_state & NUD_VALID)
553 			goto out;
554 
555 		work = NULL;
556 		write_lock(&neigh->lock);
557 		if (!(neigh->nud_state & NUD_VALID) &&
558 		    time_after(jiffies,
559 			       neigh->updated +
560 			       rt->rt6i_idev->cnf.rtr_probe_interval)) {
561 			work = kmalloc(sizeof(*work), GFP_ATOMIC);
562 			if (work)
563 				__neigh_set_probe_once(neigh);
564 		}
565 		write_unlock(&neigh->lock);
566 	} else {
567 		work = kmalloc(sizeof(*work), GFP_ATOMIC);
568 	}
569 
570 	if (work) {
571 		INIT_WORK(&work->work, rt6_probe_deferred);
572 		work->target = rt->rt6i_gateway;
573 		dev_hold(rt->dst.dev);
574 		work->dev = rt->dst.dev;
575 		schedule_work(&work->work);
576 	}
577 
578 out:
579 	rcu_read_unlock_bh();
580 }
581 #else
/* Router probing is compiled out without CONFIG_IPV6_ROUTER_PREF. */
static inline void rt6_probe(struct rt6_info *rt)
{
	/* nothing to do */
}
585 #endif
586 
587 /*
588  * Default Router Selection (RFC 2461 6.3.6)
589  */
590 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
591 {
592 	struct net_device *dev = rt->dst.dev;
593 	if (!oif || dev->ifindex == oif)
594 		return 2;
595 	if ((dev->flags & IFF_LOOPBACK) &&
596 	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
597 		return 1;
598 	return 0;
599 }
600 
601 static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
602 {
603 	struct neighbour *neigh;
604 	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
605 
606 	if (rt->rt6i_flags & RTF_NONEXTHOP ||
607 	    !(rt->rt6i_flags & RTF_GATEWAY))
608 		return RT6_NUD_SUCCEED;
609 
610 	rcu_read_lock_bh();
611 	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
612 	if (neigh) {
613 		read_lock(&neigh->lock);
614 		if (neigh->nud_state & NUD_VALID)
615 			ret = RT6_NUD_SUCCEED;
616 #ifdef CONFIG_IPV6_ROUTER_PREF
617 		else if (!(neigh->nud_state & NUD_FAILED))
618 			ret = RT6_NUD_SUCCEED;
619 		else
620 			ret = RT6_NUD_FAIL_PROBE;
621 #endif
622 		read_unlock(&neigh->lock);
623 	} else {
624 		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
625 		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
626 	}
627 	rcu_read_unlock_bh();
628 
629 	return ret;
630 }
631 
632 static int rt6_score_route(struct rt6_info *rt, int oif,
633 			   int strict)
634 {
635 	int m;
636 
637 	m = rt6_check_dev(rt, oif);
638 	if (!m && (strict & RT6_LOOKUP_F_IFACE))
639 		return RT6_NUD_FAIL_HARD;
640 #ifdef CONFIG_IPV6_ROUTER_PREF
641 	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
642 #endif
643 	if (strict & RT6_LOOKUP_F_REACHABLE) {
644 		int n = rt6_check_neigh(rt);
645 		if (n < 0)
646 			return n;
647 	}
648 	return m;
649 }
650 
651 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
652 				   int *mpri, struct rt6_info *match,
653 				   bool *do_rr)
654 {
655 	int m;
656 	bool match_do_rr = false;
657 	struct inet6_dev *idev = rt->rt6i_idev;
658 	struct net_device *dev = rt->dst.dev;
659 
660 	if (dev && !netif_carrier_ok(dev) &&
661 	    idev->cnf.ignore_routes_with_linkdown &&
662 	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
663 		goto out;
664 
665 	if (rt6_check_expired(rt))
666 		goto out;
667 
668 	m = rt6_score_route(rt, oif, strict);
669 	if (m == RT6_NUD_FAIL_DO_RR) {
670 		match_do_rr = true;
671 		m = 0; /* lowest valid score */
672 	} else if (m == RT6_NUD_FAIL_HARD) {
673 		goto out;
674 	}
675 
676 	if (strict & RT6_LOOKUP_F_REACHABLE)
677 		rt6_probe(rt);
678 
679 	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
680 	if (m > *mpri) {
681 		*do_rr = match_do_rr;
682 		*mpri = m;
683 		match = rt;
684 	}
685 out:
686 	return match;
687 }
688 
689 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
690 				     struct rt6_info *rr_head,
691 				     u32 metric, int oif, int strict,
692 				     bool *do_rr)
693 {
694 	struct rt6_info *rt, *match, *cont;
695 	int mpri = -1;
696 
697 	match = NULL;
698 	cont = NULL;
699 	for (rt = rr_head; rt; rt = rt->dst.rt6_next) {
700 		if (rt->rt6i_metric != metric) {
701 			cont = rt;
702 			break;
703 		}
704 
705 		match = find_match(rt, oif, strict, &mpri, match, do_rr);
706 	}
707 
708 	for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) {
709 		if (rt->rt6i_metric != metric) {
710 			cont = rt;
711 			break;
712 		}
713 
714 		match = find_match(rt, oif, strict, &mpri, match, do_rr);
715 	}
716 
717 	if (match || !cont)
718 		return match;
719 
720 	for (rt = cont; rt; rt = rt->dst.rt6_next)
721 		match = find_match(rt, oif, strict, &mpri, match, do_rr);
722 
723 	return match;
724 }
725 
726 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
727 {
728 	struct rt6_info *match, *rt0;
729 	struct net *net;
730 	bool do_rr = false;
731 
732 	rt0 = fn->rr_ptr;
733 	if (!rt0)
734 		fn->rr_ptr = rt0 = fn->leaf;
735 
736 	match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict,
737 			     &do_rr);
738 
739 	if (do_rr) {
740 		struct rt6_info *next = rt0->dst.rt6_next;
741 
742 		/* no entries matched; do round-robin */
743 		if (!next || next->rt6i_metric != rt0->rt6i_metric)
744 			next = fn->leaf;
745 
746 		if (next != rt0)
747 			fn->rr_ptr = next;
748 	}
749 
750 	net = dev_net(rt0->dst.dev);
751 	return match ? match : net->ipv6.ip6_null_entry;
752 }
753 
754 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
755 {
756 	return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
757 }
758 
759 #ifdef CONFIG_IPV6_ROUTE_INFO
760 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
761 		  const struct in6_addr *gwaddr)
762 {
763 	struct net *net = dev_net(dev);
764 	struct route_info *rinfo = (struct route_info *) opt;
765 	struct in6_addr prefix_buf, *prefix;
766 	unsigned int pref;
767 	unsigned long lifetime;
768 	struct rt6_info *rt;
769 
770 	if (len < sizeof(struct route_info)) {
771 		return -EINVAL;
772 	}
773 
774 	/* Sanity check for prefix_len and length */
775 	if (rinfo->length > 3) {
776 		return -EINVAL;
777 	} else if (rinfo->prefix_len > 128) {
778 		return -EINVAL;
779 	} else if (rinfo->prefix_len > 64) {
780 		if (rinfo->length < 2) {
781 			return -EINVAL;
782 		}
783 	} else if (rinfo->prefix_len > 0) {
784 		if (rinfo->length < 1) {
785 			return -EINVAL;
786 		}
787 	}
788 
789 	pref = rinfo->route_pref;
790 	if (pref == ICMPV6_ROUTER_PREF_INVALID)
791 		return -EINVAL;
792 
793 	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
794 
795 	if (rinfo->length == 3)
796 		prefix = (struct in6_addr *)rinfo->prefix;
797 	else {
798 		/* this function is safe */
799 		ipv6_addr_prefix(&prefix_buf,
800 				 (struct in6_addr *)rinfo->prefix,
801 				 rinfo->prefix_len);
802 		prefix = &prefix_buf;
803 	}
804 
805 	if (rinfo->prefix_len == 0)
806 		rt = rt6_get_dflt_router(gwaddr, dev);
807 	else
808 		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
809 					gwaddr, dev);
810 
811 	if (rt && !lifetime) {
812 		ip6_del_rt(rt);
813 		rt = NULL;
814 	}
815 
816 	if (!rt && lifetime)
817 		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
818 					dev, pref);
819 	else if (rt)
820 		rt->rt6i_flags = RTF_ROUTEINFO |
821 				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
822 
823 	if (rt) {
824 		if (!addrconf_finite_timeout(lifetime))
825 			rt6_clean_expires(rt);
826 		else
827 			rt6_set_expires(rt, jiffies + HZ * lifetime);
828 
829 		ip6_rt_put(rt);
830 	}
831 	return 0;
832 }
833 #endif
834 
835 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
836 					struct in6_addr *saddr)
837 {
838 	struct fib6_node *pn;
839 	while (1) {
840 		if (fn->fn_flags & RTN_TL_ROOT)
841 			return NULL;
842 		pn = fn->parent;
843 		if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)
844 			fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr);
845 		else
846 			fn = pn;
847 		if (fn->fn_flags & RTN_RTINFO)
848 			return fn;
849 	}
850 }
851 
852 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
853 					     struct fib6_table *table,
854 					     struct flowi6 *fl6, int flags)
855 {
856 	struct fib6_node *fn;
857 	struct rt6_info *rt;
858 
859 	read_lock_bh(&table->tb6_lock);
860 	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
861 restart:
862 	rt = fn->leaf;
863 	rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
864 	if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
865 		rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
866 	if (rt == net->ipv6.ip6_null_entry) {
867 		fn = fib6_backtrack(fn, &fl6->saddr);
868 		if (fn)
869 			goto restart;
870 	}
871 	dst_use(&rt->dst, jiffies);
872 	read_unlock_bh(&table->tb6_lock);
873 
874 	trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
875 
876 	return rt;
877 
878 }
879 
880 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
881 				    int flags)
882 {
883 	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
884 }
885 EXPORT_SYMBOL_GPL(ip6_route_lookup);
886 
887 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
888 			    const struct in6_addr *saddr, int oif, int strict)
889 {
890 	struct flowi6 fl6 = {
891 		.flowi6_oif = oif,
892 		.daddr = *daddr,
893 	};
894 	struct dst_entry *dst;
895 	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
896 
897 	if (saddr) {
898 		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
899 		flags |= RT6_LOOKUP_F_HAS_SADDR;
900 	}
901 
902 	dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
903 	if (dst->error == 0)
904 		return (struct rt6_info *) dst;
905 
906 	dst_release(dst);
907 
908 	return NULL;
909 }
910 EXPORT_SYMBOL(rt6_lookup);
911 
912 /* ip6_ins_rt is called with FREE table->tb6_lock.
913    It takes new route entry, the addition fails by any reason the
914    route is freed. In any case, if caller does not hold it, it may
915    be destroyed.
916  */
917 
918 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
919 			struct mx6_config *mxc)
920 {
921 	int err;
922 	struct fib6_table *table;
923 
924 	table = rt->rt6i_table;
925 	write_lock_bh(&table->tb6_lock);
926 	err = fib6_add(&table->tb6_root, rt, info, mxc);
927 	write_unlock_bh(&table->tb6_lock);
928 
929 	return err;
930 }
931 
932 int ip6_ins_rt(struct rt6_info *rt)
933 {
934 	struct nl_info info = {	.nl_net = dev_net(rt->dst.dev), };
935 	struct mx6_config mxc = { .mx = NULL, };
936 
937 	return __ip6_ins_rt(rt, &info, &mxc);
938 }
939 
940 static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
941 					   const struct in6_addr *daddr,
942 					   const struct in6_addr *saddr)
943 {
944 	struct rt6_info *rt;
945 
946 	/*
947 	 *	Clone the route.
948 	 */
949 
950 	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
951 		ort = (struct rt6_info *)ort->dst.from;
952 
953 	rt = __ip6_dst_alloc(dev_net(ort->dst.dev), ort->dst.dev, 0);
954 
955 	if (!rt)
956 		return NULL;
957 
958 	ip6_rt_copy_init(rt, ort);
959 	rt->rt6i_flags |= RTF_CACHE;
960 	rt->rt6i_metric = 0;
961 	rt->dst.flags |= DST_HOST;
962 	rt->rt6i_dst.addr = *daddr;
963 	rt->rt6i_dst.plen = 128;
964 
965 	if (!rt6_is_gw_or_nonexthop(ort)) {
966 		if (ort->rt6i_dst.plen != 128 &&
967 		    ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
968 			rt->rt6i_flags |= RTF_ANYCAST;
969 #ifdef CONFIG_IPV6_SUBTREES
970 		if (rt->rt6i_src.plen && saddr) {
971 			rt->rt6i_src.addr = *saddr;
972 			rt->rt6i_src.plen = 128;
973 		}
974 #endif
975 	}
976 
977 	return rt;
978 }
979 
980 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
981 {
982 	struct rt6_info *pcpu_rt;
983 
984 	pcpu_rt = __ip6_dst_alloc(dev_net(rt->dst.dev),
985 				  rt->dst.dev, rt->dst.flags);
986 
987 	if (!pcpu_rt)
988 		return NULL;
989 	ip6_rt_copy_init(pcpu_rt, rt);
990 	pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
991 	pcpu_rt->rt6i_flags |= RTF_PCPU;
992 	return pcpu_rt;
993 }
994 
995 /* It should be called with read_lock_bh(&tb6_lock) acquired */
996 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
997 {
998 	struct rt6_info *pcpu_rt, **p;
999 
1000 	p = this_cpu_ptr(rt->rt6i_pcpu);
1001 	pcpu_rt = *p;
1002 
1003 	if (pcpu_rt) {
1004 		dst_hold(&pcpu_rt->dst);
1005 		rt6_dst_from_metrics_check(pcpu_rt);
1006 	}
1007 	return pcpu_rt;
1008 }
1009 
1010 static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
1011 {
1012 	struct fib6_table *table = rt->rt6i_table;
1013 	struct rt6_info *pcpu_rt, *prev, **p;
1014 
1015 	pcpu_rt = ip6_rt_pcpu_alloc(rt);
1016 	if (!pcpu_rt) {
1017 		struct net *net = dev_net(rt->dst.dev);
1018 
1019 		dst_hold(&net->ipv6.ip6_null_entry->dst);
1020 		return net->ipv6.ip6_null_entry;
1021 	}
1022 
1023 	read_lock_bh(&table->tb6_lock);
1024 	if (rt->rt6i_pcpu) {
1025 		p = this_cpu_ptr(rt->rt6i_pcpu);
1026 		prev = cmpxchg(p, NULL, pcpu_rt);
1027 		if (prev) {
1028 			/* If someone did it before us, return prev instead */
1029 			dst_destroy(&pcpu_rt->dst);
1030 			pcpu_rt = prev;
1031 		}
1032 	} else {
1033 		/* rt has been removed from the fib6 tree
1034 		 * before we have a chance to acquire the read_lock.
1035 		 * In this case, don't brother to create a pcpu rt
1036 		 * since rt is going away anyway.  The next
1037 		 * dst_check() will trigger a re-lookup.
1038 		 */
1039 		dst_destroy(&pcpu_rt->dst);
1040 		pcpu_rt = rt;
1041 	}
1042 	dst_hold(&pcpu_rt->dst);
1043 	rt6_dst_from_metrics_check(pcpu_rt);
1044 	read_unlock_bh(&table->tb6_lock);
1045 	return pcpu_rt;
1046 }
1047 
1048 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1049 			       int oif, struct flowi6 *fl6, int flags)
1050 {
1051 	struct fib6_node *fn, *saved_fn;
1052 	struct rt6_info *rt;
1053 	int strict = 0;
1054 
1055 	strict |= flags & RT6_LOOKUP_F_IFACE;
1056 	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1057 	if (net->ipv6.devconf_all->forwarding == 0)
1058 		strict |= RT6_LOOKUP_F_REACHABLE;
1059 
1060 	read_lock_bh(&table->tb6_lock);
1061 
1062 	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1063 	saved_fn = fn;
1064 
1065 	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1066 		oif = 0;
1067 
1068 redo_rt6_select:
1069 	rt = rt6_select(fn, oif, strict);
1070 	if (rt->rt6i_nsiblings)
1071 		rt = rt6_multipath_select(rt, fl6, oif, strict);
1072 	if (rt == net->ipv6.ip6_null_entry) {
1073 		fn = fib6_backtrack(fn, &fl6->saddr);
1074 		if (fn)
1075 			goto redo_rt6_select;
1076 		else if (strict & RT6_LOOKUP_F_REACHABLE) {
1077 			/* also consider unreachable route */
1078 			strict &= ~RT6_LOOKUP_F_REACHABLE;
1079 			fn = saved_fn;
1080 			goto redo_rt6_select;
1081 		}
1082 	}
1083 
1084 
1085 	if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
1086 		dst_use(&rt->dst, jiffies);
1087 		read_unlock_bh(&table->tb6_lock);
1088 
1089 		rt6_dst_from_metrics_check(rt);
1090 
1091 		trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
1092 		return rt;
1093 	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1094 			    !(rt->rt6i_flags & RTF_GATEWAY))) {
1095 		/* Create a RTF_CACHE clone which will not be
1096 		 * owned by the fib6 tree.  It is for the special case where
1097 		 * the daddr in the skb during the neighbor look-up is different
1098 		 * from the fl6->daddr used to look-up route here.
1099 		 */
1100 
1101 		struct rt6_info *uncached_rt;
1102 
1103 		dst_use(&rt->dst, jiffies);
1104 		read_unlock_bh(&table->tb6_lock);
1105 
1106 		uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
1107 		dst_release(&rt->dst);
1108 
1109 		if (uncached_rt)
1110 			rt6_uncached_list_add(uncached_rt);
1111 		else
1112 			uncached_rt = net->ipv6.ip6_null_entry;
1113 
1114 		dst_hold(&uncached_rt->dst);
1115 
1116 		trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6);
1117 		return uncached_rt;
1118 
1119 	} else {
1120 		/* Get a percpu copy */
1121 
1122 		struct rt6_info *pcpu_rt;
1123 
1124 		rt->dst.lastuse = jiffies;
1125 		rt->dst.__use++;
1126 		pcpu_rt = rt6_get_pcpu_route(rt);
1127 
1128 		if (pcpu_rt) {
1129 			read_unlock_bh(&table->tb6_lock);
1130 		} else {
1131 			/* We have to do the read_unlock first
1132 			 * because rt6_make_pcpu_route() may trigger
1133 			 * ip6_dst_gc() which will take the write_lock.
1134 			 */
1135 			dst_hold(&rt->dst);
1136 			read_unlock_bh(&table->tb6_lock);
1137 			pcpu_rt = rt6_make_pcpu_route(rt);
1138 			dst_release(&rt->dst);
1139 		}
1140 
1141 		trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6);
1142 		return pcpu_rt;
1143 
1144 	}
1145 }
1146 EXPORT_SYMBOL_GPL(ip6_pol_route);
1147 
1148 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
1149 					    struct flowi6 *fl6, int flags)
1150 {
1151 	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
1152 }
1153 
1154 struct dst_entry *ip6_route_input_lookup(struct net *net,
1155 					 struct net_device *dev,
1156 					 struct flowi6 *fl6, int flags)
1157 {
1158 	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1159 		flags |= RT6_LOOKUP_F_IFACE;
1160 
1161 	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
1162 }
1163 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1164 
1165 void ip6_route_input(struct sk_buff *skb)
1166 {
1167 	const struct ipv6hdr *iph = ipv6_hdr(skb);
1168 	struct net *net = dev_net(skb->dev);
1169 	int flags = RT6_LOOKUP_F_HAS_SADDR;
1170 	struct ip_tunnel_info *tun_info;
1171 	struct flowi6 fl6 = {
1172 		.flowi6_iif = skb->dev->ifindex,
1173 		.daddr = iph->daddr,
1174 		.saddr = iph->saddr,
1175 		.flowlabel = ip6_flowinfo(iph),
1176 		.flowi6_mark = skb->mark,
1177 		.flowi6_proto = iph->nexthdr,
1178 	};
1179 
1180 	tun_info = skb_tunnel_info(skb);
1181 	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1182 		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
1183 	skb_dst_drop(skb);
1184 	skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
1185 }
1186 
1187 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
1188 					     struct flowi6 *fl6, int flags)
1189 {
1190 	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
1191 }
1192 
1193 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
1194 					 struct flowi6 *fl6, int flags)
1195 {
1196 	bool any_src;
1197 
1198 	if (rt6_need_strict(&fl6->daddr)) {
1199 		struct dst_entry *dst;
1200 
1201 		dst = l3mdev_link_scope_lookup(net, fl6);
1202 		if (dst)
1203 			return dst;
1204 	}
1205 
1206 	fl6->flowi6_iif = LOOPBACK_IFINDEX;
1207 
1208 	any_src = ipv6_addr_any(&fl6->saddr);
1209 	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
1210 	    (fl6->flowi6_oif && any_src))
1211 		flags |= RT6_LOOKUP_F_IFACE;
1212 
1213 	if (!any_src)
1214 		flags |= RT6_LOOKUP_F_HAS_SADDR;
1215 	else if (sk)
1216 		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1217 
1218 	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
1219 }
1220 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
1221 
1222 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
1223 {
1224 	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1225 	struct dst_entry *new = NULL;
1226 
1227 	rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, DST_OBSOLETE_NONE, 0);
1228 	if (rt) {
1229 		rt6_info_init(rt);
1230 
1231 		new = &rt->dst;
1232 		new->__use = 1;
1233 		new->input = dst_discard;
1234 		new->output = dst_discard_out;
1235 
1236 		dst_copy_metrics(new, &ort->dst);
1237 		rt->rt6i_idev = ort->rt6i_idev;
1238 		if (rt->rt6i_idev)
1239 			in6_dev_hold(rt->rt6i_idev);
1240 
1241 		rt->rt6i_gateway = ort->rt6i_gateway;
1242 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
1243 		rt->rt6i_metric = 0;
1244 
1245 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1246 #ifdef CONFIG_IPV6_SUBTREES
1247 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1248 #endif
1249 
1250 		dst_free(new);
1251 	}
1252 
1253 	dst_release(dst_orig);
1254 	return new ? new : ERR_PTR(-ENOMEM);
1255 }
1256 
1257 /*
1258  *	Destination cache support functions
1259  */
1260 
1261 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1262 {
1263 	if (rt->dst.from &&
1264 	    dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
1265 		dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
1266 }
1267 
1268 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1269 {
1270 	if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie))
1271 		return NULL;
1272 
1273 	if (rt6_check_expired(rt))
1274 		return NULL;
1275 
1276 	return &rt->dst;
1277 }
1278 
1279 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1280 {
1281 	if (!__rt6_check_expired(rt) &&
1282 	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1283 	    rt6_check((struct rt6_info *)(rt->dst.from), cookie))
1284 		return &rt->dst;
1285 	else
1286 		return NULL;
1287 }
1288 
1289 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1290 {
1291 	struct rt6_info *rt;
1292 
1293 	rt = (struct rt6_info *) dst;
1294 
1295 	/* All IPV6 dsts are created with ->obsolete set to the value
1296 	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1297 	 * into this function always.
1298 	 */
1299 
1300 	rt6_dst_from_metrics_check(rt);
1301 
1302 	if (rt->rt6i_flags & RTF_PCPU ||
1303 	    (unlikely(dst->flags & DST_NOCACHE) && rt->dst.from))
1304 		return rt6_dst_from_check(rt, cookie);
1305 	else
1306 		return rt6_check(rt, cookie);
1307 }
1308 
1309 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1310 {
1311 	struct rt6_info *rt = (struct rt6_info *) dst;
1312 
1313 	if (rt) {
1314 		if (rt->rt6i_flags & RTF_CACHE) {
1315 			if (rt6_check_expired(rt)) {
1316 				ip6_del_rt(rt);
1317 				dst = NULL;
1318 			}
1319 		} else {
1320 			dst_release(dst);
1321 			dst = NULL;
1322 		}
1323 	}
1324 	return dst;
1325 }
1326 
1327 static void ip6_link_failure(struct sk_buff *skb)
1328 {
1329 	struct rt6_info *rt;
1330 
1331 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1332 
1333 	rt = (struct rt6_info *) skb_dst(skb);
1334 	if (rt) {
1335 		if (rt->rt6i_flags & RTF_CACHE) {
1336 			dst_hold(&rt->dst);
1337 			ip6_del_rt(rt);
1338 		} else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) {
1339 			rt->rt6i_node->fn_sernum = -1;
1340 		}
1341 	}
1342 }
1343 
1344 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
1345 {
1346 	struct net *net = dev_net(rt->dst.dev);
1347 
1348 	rt->rt6i_flags |= RTF_MODIFIED;
1349 	rt->rt6i_pmtu = mtu;
1350 	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
1351 }
1352 
1353 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
1354 {
1355 	return !(rt->rt6i_flags & RTF_CACHE) &&
1356 		(rt->rt6i_flags & RTF_PCPU || rt->rt6i_node);
1357 }
1358 
1359 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
1360 				 const struct ipv6hdr *iph, u32 mtu)
1361 {
1362 	struct rt6_info *rt6 = (struct rt6_info *)dst;
1363 
1364 	if (rt6->rt6i_flags & RTF_LOCAL)
1365 		return;
1366 
1367 	dst_confirm(dst);
1368 	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
1369 	if (mtu >= dst_mtu(dst))
1370 		return;
1371 
1372 	if (!rt6_cache_allowed_for_pmtu(rt6)) {
1373 		rt6_do_update_pmtu(rt6, mtu);
1374 	} else {
1375 		const struct in6_addr *daddr, *saddr;
1376 		struct rt6_info *nrt6;
1377 
1378 		if (iph) {
1379 			daddr = &iph->daddr;
1380 			saddr = &iph->saddr;
1381 		} else if (sk) {
1382 			daddr = &sk->sk_v6_daddr;
1383 			saddr = &inet6_sk(sk)->saddr;
1384 		} else {
1385 			return;
1386 		}
1387 		nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
1388 		if (nrt6) {
1389 			rt6_do_update_pmtu(nrt6, mtu);
1390 
1391 			/* ip6_ins_rt(nrt6) will bump the
1392 			 * rt6->rt6i_node->fn_sernum
1393 			 * which will fail the next rt6_check() and
1394 			 * invalidate the sk->sk_dst_cache.
1395 			 */
1396 			ip6_ins_rt(nrt6);
1397 		}
1398 	}
1399 }
1400 
1401 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1402 			       struct sk_buff *skb, u32 mtu)
1403 {
1404 	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
1405 }
1406 
1407 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
1408 		     int oif, u32 mark)
1409 {
1410 	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1411 	struct dst_entry *dst;
1412 	struct flowi6 fl6;
1413 
1414 	memset(&fl6, 0, sizeof(fl6));
1415 	fl6.flowi6_oif = oif;
1416 	fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
1417 	fl6.daddr = iph->daddr;
1418 	fl6.saddr = iph->saddr;
1419 	fl6.flowlabel = ip6_flowinfo(iph);
1420 
1421 	dst = ip6_route_output(net, NULL, &fl6);
1422 	if (!dst->error)
1423 		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
1424 	dst_release(dst);
1425 }
1426 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
1427 
1428 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
1429 {
1430 	struct dst_entry *dst;
1431 
1432 	ip6_update_pmtu(skb, sock_net(sk), mtu,
1433 			sk->sk_bound_dev_if, sk->sk_mark);
1434 
1435 	dst = __sk_dst_get(sk);
1436 	if (!dst || !dst->obsolete ||
1437 	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
1438 		return;
1439 
1440 	bh_lock_sock(sk);
1441 	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
1442 		ip6_datagram_dst_update(sk, false);
1443 	bh_unlock_sock(sk);
1444 }
1445 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
1446 
1447 /* Handle redirects */
/* Flow key extended with the address of the router that sent a redirect.
 * ip6_route_redirect() fills it in and passes &fl6 to the rule lookup;
 * __ip6_route_redirect() casts that flowi6 pointer back to this struct,
 * so fl6 has to remain the first member.
 */
1448 struct ip6rd_flowi {
	/* the flow being looked up */
1449 	struct flowi6 fl6;
	/* compared against each candidate route's rt6i_gateway */
1450 	struct in6_addr gateway;
1451 };
1452 
1453 static struct rt6_info *__ip6_route_redirect(struct net *net,
1454 					     struct fib6_table *table,
1455 					     struct flowi6 *fl6,
1456 					     int flags)
1457 {
1458 	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1459 	struct rt6_info *rt;
1460 	struct fib6_node *fn;
1461 
1462 	/* Get the "current" route for this destination and
1463 	 * check if the redirect has come from approriate router.
1464 	 *
1465 	 * RFC 4861 specifies that redirects should only be
1466 	 * accepted if they come from the nexthop to the target.
1467 	 * Due to the way the routes are chosen, this notion
1468 	 * is a bit fuzzy and one might need to check all possible
1469 	 * routes.
1470 	 */
1471 
1472 	read_lock_bh(&table->tb6_lock);
1473 	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1474 restart:
1475 	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1476 		if (rt6_check_expired(rt))
1477 			continue;
1478 		if (rt->dst.error)
1479 			break;
1480 		if (!(rt->rt6i_flags & RTF_GATEWAY))
1481 			continue;
1482 		if (fl6->flowi6_oif != rt->dst.dev->ifindex)
1483 			continue;
1484 		if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1485 			continue;
1486 		break;
1487 	}
1488 
1489 	if (!rt)
1490 		rt = net->ipv6.ip6_null_entry;
1491 	else if (rt->dst.error) {
1492 		rt = net->ipv6.ip6_null_entry;
1493 		goto out;
1494 	}
1495 
1496 	if (rt == net->ipv6.ip6_null_entry) {
1497 		fn = fib6_backtrack(fn, &fl6->saddr);
1498 		if (fn)
1499 			goto restart;
1500 	}
1501 
1502 out:
1503 	dst_hold(&rt->dst);
1504 
1505 	read_unlock_bh(&table->tb6_lock);
1506 
1507 	trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
1508 	return rt;
1509 };
1510 
1511 static struct dst_entry *ip6_route_redirect(struct net *net,
1512 					const struct flowi6 *fl6,
1513 					const struct in6_addr *gateway)
1514 {
1515 	int flags = RT6_LOOKUP_F_HAS_SADDR;
1516 	struct ip6rd_flowi rdfl;
1517 
1518 	rdfl.fl6 = *fl6;
1519 	rdfl.gateway = *gateway;
1520 
1521 	return fib6_rule_lookup(net, &rdfl.fl6,
1522 				flags, __ip6_route_redirect);
1523 }
1524 
1525 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark)
1526 {
1527 	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1528 	struct dst_entry *dst;
1529 	struct flowi6 fl6;
1530 
1531 	memset(&fl6, 0, sizeof(fl6));
1532 	fl6.flowi6_iif = LOOPBACK_IFINDEX;
1533 	fl6.flowi6_oif = oif;
1534 	fl6.flowi6_mark = mark;
1535 	fl6.daddr = iph->daddr;
1536 	fl6.saddr = iph->saddr;
1537 	fl6.flowlabel = ip6_flowinfo(iph);
1538 
1539 	dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
1540 	rt6_do_redirect(dst, NULL, skb);
1541 	dst_release(dst);
1542 }
1543 EXPORT_SYMBOL_GPL(ip6_redirect);
1544 
1545 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
1546 			    u32 mark)
1547 {
1548 	const struct ipv6hdr *iph = ipv6_hdr(skb);
1549 	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
1550 	struct dst_entry *dst;
1551 	struct flowi6 fl6;
1552 
1553 	memset(&fl6, 0, sizeof(fl6));
1554 	fl6.flowi6_iif = LOOPBACK_IFINDEX;
1555 	fl6.flowi6_oif = oif;
1556 	fl6.flowi6_mark = mark;
1557 	fl6.daddr = msg->dest;
1558 	fl6.saddr = iph->daddr;
1559 
1560 	dst = ip6_route_redirect(net, &fl6, &iph->saddr);
1561 	rt6_do_redirect(dst, NULL, skb);
1562 	dst_release(dst);
1563 }
1564 
1565 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
1566 {
1567 	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark);
1568 }
1569 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
1570 
1571 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1572 {
1573 	struct net_device *dev = dst->dev;
1574 	unsigned int mtu = dst_mtu(dst);
1575 	struct net *net = dev_net(dev);
1576 
1577 	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1578 
1579 	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1580 		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1581 
1582 	/*
1583 	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1584 	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1585 	 * IPV6_MAXPLEN is also valid and means: "any MSS,
1586 	 * rely only on pmtu discovery"
1587 	 */
1588 	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1589 		mtu = IPV6_MAXPLEN;
1590 	return mtu;
1591 }
1592 
1593 static unsigned int ip6_mtu(const struct dst_entry *dst)
1594 {
1595 	const struct rt6_info *rt = (const struct rt6_info *)dst;
1596 	unsigned int mtu = rt->rt6i_pmtu;
1597 	struct inet6_dev *idev;
1598 
1599 	if (mtu)
1600 		goto out;
1601 
1602 	mtu = dst_metric_raw(dst, RTAX_MTU);
1603 	if (mtu)
1604 		goto out;
1605 
1606 	mtu = IPV6_MIN_MTU;
1607 
1608 	rcu_read_lock();
1609 	idev = __in6_dev_get(dst->dev);
1610 	if (idev)
1611 		mtu = idev->cnf.mtu6;
1612 	rcu_read_unlock();
1613 
1614 out:
1615 	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1616 
1617 	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1618 }
1619 
/* Singly linked list (chained through dst->next) of the dst entries created
 * by icmp6_dst_alloc().  icmp6_dst_gc() and icmp6_clean_all() unlink and
 * free entries from it.
 */
1620 static struct dst_entry *icmp6_dst_gc_list;
/* Taken with spin_lock_bh() around every access to icmp6_dst_gc_list. */
1621 static DEFINE_SPINLOCK(icmp6_dst_lock);
1622 
1623 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1624 				  struct flowi6 *fl6)
1625 {
1626 	struct dst_entry *dst;
1627 	struct rt6_info *rt;
1628 	struct inet6_dev *idev = in6_dev_get(dev);
1629 	struct net *net = dev_net(dev);
1630 
1631 	if (unlikely(!idev))
1632 		return ERR_PTR(-ENODEV);
1633 
1634 	rt = ip6_dst_alloc(net, dev, 0);
1635 	if (unlikely(!rt)) {
1636 		in6_dev_put(idev);
1637 		dst = ERR_PTR(-ENOMEM);
1638 		goto out;
1639 	}
1640 
1641 	rt->dst.flags |= DST_HOST;
1642 	rt->dst.output  = ip6_output;
1643 	atomic_set(&rt->dst.__refcnt, 1);
1644 	rt->rt6i_gateway  = fl6->daddr;
1645 	rt->rt6i_dst.addr = fl6->daddr;
1646 	rt->rt6i_dst.plen = 128;
1647 	rt->rt6i_idev     = idev;
1648 	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
1649 
1650 	spin_lock_bh(&icmp6_dst_lock);
1651 	rt->dst.next = icmp6_dst_gc_list;
1652 	icmp6_dst_gc_list = &rt->dst;
1653 	spin_unlock_bh(&icmp6_dst_lock);
1654 
1655 	fib6_force_start_gc(net);
1656 
1657 	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
1658 
1659 out:
1660 	return dst;
1661 }
1662 
1663 int icmp6_dst_gc(void)
1664 {
1665 	struct dst_entry *dst, **pprev;
1666 	int more = 0;
1667 
1668 	spin_lock_bh(&icmp6_dst_lock);
1669 	pprev = &icmp6_dst_gc_list;
1670 
1671 	while ((dst = *pprev) != NULL) {
1672 		if (!atomic_read(&dst->__refcnt)) {
1673 			*pprev = dst->next;
1674 			dst_free(dst);
1675 		} else {
1676 			pprev = &dst->next;
1677 			++more;
1678 		}
1679 	}
1680 
1681 	spin_unlock_bh(&icmp6_dst_lock);
1682 
1683 	return more;
1684 }
1685 
1686 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1687 			    void *arg)
1688 {
1689 	struct dst_entry *dst, **pprev;
1690 
1691 	spin_lock_bh(&icmp6_dst_lock);
1692 	pprev = &icmp6_dst_gc_list;
1693 	while ((dst = *pprev) != NULL) {
1694 		struct rt6_info *rt = (struct rt6_info *) dst;
1695 		if (func(rt, arg)) {
1696 			*pprev = dst->next;
1697 			dst_free(dst);
1698 		} else {
1699 			pprev = &dst->next;
1700 		}
1701 	}
1702 	spin_unlock_bh(&icmp6_dst_lock);
1703 }
1704 
1705 static int ip6_dst_gc(struct dst_ops *ops)
1706 {
1707 	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1708 	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1709 	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1710 	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1711 	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1712 	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1713 	int entries;
1714 
1715 	entries = dst_entries_get_fast(ops);
1716 	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
1717 	    entries <= rt_max_size)
1718 		goto out;
1719 
1720 	net->ipv6.ip6_rt_gc_expire++;
1721 	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
1722 	entries = dst_entries_get_slow(ops);
1723 	if (entries < ops->gc_thresh)
1724 		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1725 out:
1726 	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1727 	return entries > rt_max_size;
1728 }
1729 
1730 static int ip6_convert_metrics(struct mx6_config *mxc,
1731 			       const struct fib6_config *cfg)
1732 {
1733 	bool ecn_ca = false;
1734 	struct nlattr *nla;
1735 	int remaining;
1736 	u32 *mp;
1737 
1738 	if (!cfg->fc_mx)
1739 		return 0;
1740 
1741 	mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1742 	if (unlikely(!mp))
1743 		return -ENOMEM;
1744 
1745 	nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1746 		int type = nla_type(nla);
1747 		u32 val;
1748 
1749 		if (!type)
1750 			continue;
1751 		if (unlikely(type > RTAX_MAX))
1752 			goto err;
1753 
1754 		if (type == RTAX_CC_ALGO) {
1755 			char tmp[TCP_CA_NAME_MAX];
1756 
1757 			nla_strlcpy(tmp, nla, sizeof(tmp));
1758 			val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
1759 			if (val == TCP_CA_UNSPEC)
1760 				goto err;
1761 		} else {
1762 			val = nla_get_u32(nla);
1763 		}
1764 		if (type == RTAX_HOPLIMIT && val > 255)
1765 			val = 255;
1766 		if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
1767 			goto err;
1768 
1769 		mp[type - 1] = val;
1770 		__set_bit(type - 1, mxc->mx_valid);
1771 	}
1772 
1773 	if (ecn_ca) {
1774 		__set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
1775 		mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
1776 	}
1777 
1778 	mxc->mx = mp;
1779 	return 0;
1780  err:
1781 	kfree(mp);
1782 	return -EINVAL;
1783 }
1784 
1785 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
1786 					    struct fib6_config *cfg,
1787 					    const struct in6_addr *gw_addr)
1788 {
1789 	struct flowi6 fl6 = {
1790 		.flowi6_oif = cfg->fc_ifindex,
1791 		.daddr = *gw_addr,
1792 		.saddr = cfg->fc_prefsrc,
1793 	};
1794 	struct fib6_table *table;
1795 	struct rt6_info *rt;
1796 	int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_IGNORE_LINKSTATE;
1797 
1798 	table = fib6_get_table(net, cfg->fc_table);
1799 	if (!table)
1800 		return NULL;
1801 
1802 	if (!ipv6_addr_any(&cfg->fc_prefsrc))
1803 		flags |= RT6_LOOKUP_F_HAS_SADDR;
1804 
1805 	rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags);
1806 
1807 	/* if table lookup failed, fall back to full lookup */
1808 	if (rt == net->ipv6.ip6_null_entry) {
1809 		ip6_rt_put(rt);
1810 		rt = NULL;
1811 	}
1812 
1813 	return rt;
1814 }
1815 
/* Build, but do not insert, a route entry from the configuration in @cfg.
 *
 * On success the new rt6_info is returned with its device, inet6_dev and
 * table filled in; the references taken here on the device and inet6_dev
 * are kept.  On failure every reference taken here is dropped, a
 * partially built route is freed, and ERR_PTR(err) is returned.
 *
 * @cfg is modified: fc_metric defaults to IP6_RT_PRIO_USER when 0,
 * fc_protocol defaults to RTPROT_BOOT when RTPROT_UNSPEC, and
 * fc_nlinfo.nl_net is overwritten with the namespace of the chosen device.
 */
1816 static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg)
1817 {
1818 	struct net *net = cfg->fc_nlinfo.nl_net;
1819 	struct rt6_info *rt = NULL;
1820 	struct net_device *dev = NULL;
1821 	struct inet6_dev *idev = NULL;
1822 	struct fib6_table *table;
1823 	int addr_type;
1824 	int err = -EINVAL;
1825 
	/* prefix lengths beyond 128 bits are invalid (-EINVAL) */
1826 	if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1827 		goto out;
1828 #ifndef CONFIG_IPV6_SUBTREES
	/* a source prefix is only accepted when subtrees are compiled in */
1829 	if (cfg->fc_src_len)
1830 		goto out;
1831 #endif
	/* An explicit interface: take references on the device and on its
	 * inet6_dev; both must exist or the request fails with -ENODEV.
	 */
1832 	if (cfg->fc_ifindex) {
1833 		err = -ENODEV;
1834 		dev = dev_get_by_index(net, cfg->fc_ifindex);
1835 		if (!dev)
1836 			goto out;
1837 		idev = in6_dev_get(dev);
1838 		if (!idev)
1839 			goto out;
1840 	}
1841 
1842 	if (cfg->fc_metric == 0)
1843 		cfg->fc_metric = IP6_RT_PRIO_USER;
1844 
	/* Pick the table.  A netlink request without NLM_F_CREATE first tries
	 * an existing table; if there is none a warning is logged and the
	 * table is created anyway.  Failure to get a table is -ENOBUFS.
	 */
1845 	err = -ENOBUFS;
1846 	if (cfg->fc_nlinfo.nlh &&
1847 	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1848 		table = fib6_get_table(net, cfg->fc_table);
1849 		if (!table) {
1850 			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
1851 			table = fib6_new_table(net, cfg->fc_table);
1852 		}
1853 	} else {
1854 		table = fib6_new_table(net, cfg->fc_table);
1855 	}
1856 
1857 	if (!table)
1858 		goto out;
1859 
	/* DST_NOCOUNT is passed for everything except RTF_ADDRCONF routes */
1860 	rt = ip6_dst_alloc(net, NULL,
1861 			   (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
1862 
1863 	if (!rt) {
1864 		err = -ENOMEM;
1865 		goto out;
1866 	}
1867 
	/* fc_expires is in clock_t units and relative to now */
1868 	if (cfg->fc_flags & RTF_EXPIRES)
1869 		rt6_set_expires(rt, jiffies +
1870 				clock_t_to_jiffies(cfg->fc_expires));
1871 	else
1872 		rt6_clean_expires(rt);
1873 
1874 	if (cfg->fc_protocol == RTPROT_UNSPEC)
1875 		cfg->fc_protocol = RTPROT_BOOT;
1876 	rt->rt6i_protocol = cfg->fc_protocol;
1877 
1878 	addr_type = ipv6_addr_type(&cfg->fc_dst);
1879 
	/* input handler: multicast, local delivery, or forwarding */
1880 	if (addr_type & IPV6_ADDR_MULTICAST)
1881 		rt->dst.input = ip6_mc_input;
1882 	else if (cfg->fc_flags & RTF_LOCAL)
1883 		rt->dst.input = ip6_input;
1884 	else
1885 		rt->dst.input = ip6_forward;
1886 
1887 	rt->dst.output = ip6_output;
1888 
	/* Lightweight tunnel encapsulation: build the state and, where the
	 * tunnel asks for it, interpose the lwtunnel handlers while saving
	 * the handlers chosen above in the state.
	 */
1889 	if (cfg->fc_encap) {
1890 		struct lwtunnel_state *lwtstate;
1891 
1892 		err = lwtunnel_build_state(dev, cfg->fc_encap_type,
1893 					   cfg->fc_encap, AF_INET6, cfg,
1894 					   &lwtstate);
1895 		if (err)
1896 			goto out;
1897 		rt->dst.lwtstate = lwtstate_get(lwtstate);
1898 		if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
1899 			rt->dst.lwtstate->orig_output = rt->dst.output;
1900 			rt->dst.output = lwtunnel_output;
1901 		}
1902 		if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
1903 			rt->dst.lwtstate->orig_input = rt->dst.input;
1904 			rt->dst.input = lwtunnel_input;
1905 		}
1906 	}
1907 
	/* store the destination masked to its prefix length */
1908 	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1909 	rt->rt6i_dst.plen = cfg->fc_dst_len;
1910 	if (rt->rt6i_dst.plen == 128)
1911 		rt->dst.flags |= DST_HOST;
1912 
1913 #ifdef CONFIG_IPV6_SUBTREES
1914 	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1915 	rt->rt6i_src.plen = cfg->fc_src_len;
1916 #endif
1917 
1918 	rt->rt6i_metric = cfg->fc_metric;
1919 
1920 	/* We cannot add true routes via loopback here,
1921 	   they would result in kernel looping; promote them to reject routes
1922 	 */
1923 	if ((cfg->fc_flags & RTF_REJECT) ||
1924 	    (dev && (dev->flags & IFF_LOOPBACK) &&
1925 	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
1926 	     !(cfg->fc_flags & RTF_LOCAL))) {
1927 		/* hold loopback dev/idev if we haven't done so. */
1928 		if (dev != net->loopback_dev) {
			/* swap any references held so far for the loopback ones */
1929 			if (dev) {
1930 				dev_put(dev);
1931 				in6_dev_put(idev);
1932 			}
1933 			dev = net->loopback_dev;
1934 			dev_hold(dev);
1935 			idev = in6_dev_get(dev);
1936 			if (!idev) {
1937 				err = -ENODEV;
1938 				goto out;
1939 			}
1940 		}
1941 		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
		/* the route type selects the error code and the handlers */
1942 		switch (cfg->fc_type) {
1943 		case RTN_BLACKHOLE:
1944 			rt->dst.error = -EINVAL;
1945 			rt->dst.output = dst_discard_out;
1946 			rt->dst.input = dst_discard;
1947 			break;
1948 		case RTN_PROHIBIT:
1949 			rt->dst.error = -EACCES;
1950 			rt->dst.output = ip6_pkt_prohibit_out;
1951 			rt->dst.input = ip6_pkt_prohibit;
1952 			break;
1953 		case RTN_THROW:
1954 		case RTN_UNREACHABLE:
1955 		default:
			/* -EAGAIN for throw, -EHOSTUNREACH for unreachable,
			 * -ENETUNREACH for any other type
			 */
1956 			rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
1957 					: (cfg->fc_type == RTN_UNREACHABLE)
1958 					? -EHOSTUNREACH : -ENETUNREACH;
1959 			rt->dst.output = ip6_pkt_discard_out;
1960 			rt->dst.input = ip6_pkt_discard;
1961 			break;
1962 		}
		/* reject routes skip the gateway and prefsrc handling below */
1963 		goto install_route;
1964 	}
1965 
1966 	if (cfg->fc_flags & RTF_GATEWAY) {
1967 		const struct in6_addr *gw_addr;
1968 		int gwa_type;
1969 
1970 		gw_addr = &cfg->fc_gateway;
1971 		gwa_type = ipv6_addr_type(gw_addr);
1972 
1973 		/* if gw_addr is local we will fail to detect this in case
1974 		 * address is still TENTATIVE (DAD in progress). rt6_lookup()
1975 		 * will return already-added prefix route via interface that
1976 		 * prefix route was assigned to, which might be non-loopback.
1977 		 */
1978 		err = -EINVAL;
1979 		if (ipv6_chk_addr_and_flags(net, gw_addr,
1980 					    gwa_type & IPV6_ADDR_LINKLOCAL ?
1981 					    dev : NULL, 0, 0))
1982 			goto out;
1983 
1984 		rt->rt6i_gateway = *gw_addr;
1985 
1986 		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1987 			struct rt6_info *grt = NULL;
1988 
1989 			/* IPv6 strictly inhibits using not link-local
1990 			   addresses as nexthop address.
1991 			   Otherwise, router will not able to send redirects.
1992 			   It is very good, but in some (rare!) circumstances
1993 			   (SIT, PtP, NBMA NOARP links) it is handy to allow
1994 			   some exceptions. --ANK
1995 			 */
1996 			if (!(gwa_type & IPV6_ADDR_UNICAST))
1997 				goto out;
1998 
			/* With an explicit table, try that table first.  A
			 * result that is itself a gateway route, or that is on
			 * a different device than requested, is discarded.
			 */
1999 			if (cfg->fc_table) {
2000 				grt = ip6_nh_lookup_table(net, cfg, gw_addr);
2001 
2002 				if (grt) {
2003 					if (grt->rt6i_flags & RTF_GATEWAY ||
2004 					    (dev && dev != grt->dst.dev)) {
2005 						ip6_rt_put(grt);
2006 						grt = NULL;
2007 					}
2008 				}
2009 			}
2010 
			/* otherwise fall back to the full lookup */
2011 			if (!grt)
2012 				grt = rt6_lookup(net, gw_addr, NULL,
2013 						 cfg->fc_ifindex, 1);
2014 
2015 			err = -EHOSTUNREACH;
2016 			if (!grt)
2017 				goto out;
2018 			if (dev) {
				/* the gateway must be reached via the requested device */
2019 				if (dev != grt->dst.dev) {
2020 					ip6_rt_put(grt);
2021 					goto out;
2022 				}
2023 			} else {
				/* no device given: adopt (and hold) the one the
				 * gateway route uses
				 */
2024 				dev = grt->dst.dev;
2025 				idev = grt->rt6i_idev;
2026 				dev_hold(dev);
2027 				in6_dev_hold(grt->rt6i_idev);
2028 			}
			/* the route to the gateway must not itself use a gateway */
2029 			if (!(grt->rt6i_flags & RTF_GATEWAY))
2030 				err = 0;
2031 			ip6_rt_put(grt);
2032 
2033 			if (err)
2034 				goto out;
2035 		}
		/* a gateway route needs a non-loopback device */
2036 		err = -EINVAL;
2037 		if (!dev || (dev->flags & IFF_LOOPBACK))
2038 			goto out;
2039 	}
2040 
2041 	err = -ENODEV;
2042 	if (!dev)
2043 		goto out;
2044 
	/* a preferred source address has to pass ipv6_chk_addr() on dev */
2045 	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
2046 		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
2047 			err = -EINVAL;
2048 			goto out;
2049 		}
2050 		rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
2051 		rt->rt6i_prefsrc.plen = 128;
2052 	} else
2053 		rt->rt6i_prefsrc.plen = 0;
2054 
2055 	rt->rt6i_flags = cfg->fc_flags;
2056 
2057 install_route:
	/* the dev/idev references held in the locals now belong to rt */
2058 	rt->dst.dev = dev;
2059 	rt->rt6i_idev = idev;
2060 	rt->rt6i_table = table;
2061 
2062 	cfg->fc_nlinfo.nl_net = dev_net(dev);
2063 
2064 	return rt;
2065 out:
	/* error exit: drop whatever was acquired above */
2066 	if (dev)
2067 		dev_put(dev);
2068 	if (idev)
2069 		in6_dev_put(idev);
2070 	if (rt)
2071 		dst_free(&rt->dst);
2072 
2073 	return ERR_PTR(err);
2074 }
2075 
2076 int ip6_route_add(struct fib6_config *cfg)
2077 {
2078 	struct mx6_config mxc = { .mx = NULL, };
2079 	struct rt6_info *rt;
2080 	int err;
2081 
2082 	rt = ip6_route_info_create(cfg);
2083 	if (IS_ERR(rt)) {
2084 		err = PTR_ERR(rt);
2085 		rt = NULL;
2086 		goto out;
2087 	}
2088 
2089 	err = ip6_convert_metrics(&mxc, cfg);
2090 	if (err)
2091 		goto out;
2092 
2093 	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc);
2094 
2095 	kfree(mxc.mx);
2096 
2097 	return err;
2098 out:
2099 	if (rt)
2100 		dst_free(&rt->dst);
2101 
2102 	return err;
2103 }
2104 
2105 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
2106 {
2107 	int err;
2108 	struct fib6_table *table;
2109 	struct net *net = dev_net(rt->dst.dev);
2110 
2111 	if (rt == net->ipv6.ip6_null_entry ||
2112 	    rt->dst.flags & DST_NOCACHE) {
2113 		err = -ENOENT;
2114 		goto out;
2115 	}
2116 
2117 	table = rt->rt6i_table;
2118 	write_lock_bh(&table->tb6_lock);
2119 	err = fib6_del(rt, info);
2120 	write_unlock_bh(&table->tb6_lock);
2121 
2122 out:
2123 	ip6_rt_put(rt);
2124 	return err;
2125 }
2126 
2127 int ip6_del_rt(struct rt6_info *rt)
2128 {
2129 	struct nl_info info = {
2130 		.nl_net = dev_net(rt->dst.dev),
2131 	};
2132 	return __ip6_del_rt(rt, &info);
2133 }
2134 
2135 static int ip6_route_del(struct fib6_config *cfg)
2136 {
2137 	struct fib6_table *table;
2138 	struct fib6_node *fn;
2139 	struct rt6_info *rt;
2140 	int err = -ESRCH;
2141 
2142 	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
2143 	if (!table)
2144 		return err;
2145 
2146 	read_lock_bh(&table->tb6_lock);
2147 
2148 	fn = fib6_locate(&table->tb6_root,
2149 			 &cfg->fc_dst, cfg->fc_dst_len,
2150 			 &cfg->fc_src, cfg->fc_src_len);
2151 
2152 	if (fn) {
2153 		for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2154 			if ((rt->rt6i_flags & RTF_CACHE) &&
2155 			    !(cfg->fc_flags & RTF_CACHE))
2156 				continue;
2157 			if (cfg->fc_ifindex &&
2158 			    (!rt->dst.dev ||
2159 			     rt->dst.dev->ifindex != cfg->fc_ifindex))
2160 				continue;
2161 			if (cfg->fc_flags & RTF_GATEWAY &&
2162 			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
2163 				continue;
2164 			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
2165 				continue;
2166 			dst_hold(&rt->dst);
2167 			read_unlock_bh(&table->tb6_lock);
2168 
2169 			return __ip6_del_rt(rt, &cfg->fc_nlinfo);
2170 		}
2171 	}
2172 	read_unlock_bh(&table->tb6_lock);
2173 
2174 	return err;
2175 }
2176 
/* Process the ICMPv6 Redirect message carried in @skb against the route
 * @dst that the caller looked up for it.
 *
 * After validating the message and the receiving interface's settings, the
 * neighbour entry for the redirect target is updated and an RTF_CACHE
 * clone of @dst for the redirected destination, with the target as its
 * gateway, is inserted into the fib.  Netevent listeners are notified with
 * NETEVENT_REDIRECT.  @sk is not used in this function.
 */
2177 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
2178 {
2179 	struct netevent_redirect netevent;
2180 	struct rt6_info *rt, *nrt = NULL;
2181 	struct ndisc_options ndopts;
2182 	struct inet6_dev *in6_dev;
2183 	struct neighbour *neigh;
2184 	struct rd_msg *msg;
2185 	int optlen, on_link;
2186 	u8 *lladdr;
2187 
	/* number of option bytes that follow the fixed rd_msg header */
2188 	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
2189 	optlen -= sizeof(*msg);
2190 
2191 	if (optlen < 0) {
2192 		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
2193 		return;
2194 	}
2195 
2196 	msg = (struct rd_msg *)icmp6_hdr(skb);
2197 
2198 	if (ipv6_addr_is_multicast(&msg->dest)) {
2199 		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
2200 		return;
2201 	}
2202 
	/* target == destination means the destination is on-link; any other
	 * target has to be a link-local unicast address
	 */
2203 	on_link = 0;
2204 	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
2205 		on_link = 1;
2206 	} else if (ipv6_addr_type(&msg->target) !=
2207 		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
2208 		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
2209 		return;
2210 	}
2211 
	/* redirects are ignored when the receiving interface has no inet6_dev,
	 * has forwarding enabled, or has accept_redirects disabled
	 */
2212 	in6_dev = __in6_dev_get(skb->dev);
2213 	if (!in6_dev)
2214 		return;
2215 	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
2216 		return;
2217 
2218 	/* RFC2461 8.1:
2219 	 *	The IP source address of the Redirect MUST be the same as the current
2220 	 *	first-hop router for the specified ICMP Destination Address.
2221 	 */
2222 
2223 	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
2224 		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
2225 		return;
2226 	}
2227 
	/* optional target link-layer address option */
2228 	lladdr = NULL;
2229 	if (ndopts.nd_opts_tgt_lladdr) {
2230 		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
2231 					     skb->dev);
2232 		if (!lladdr) {
2233 			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
2234 			return;
2235 		}
2236 	}
2237 
	/* a reject route here means the caller's lookup found no route for
	 * which the sender is a valid nexthop
	 */
2238 	rt = (struct rt6_info *) dst;
2239 	if (rt->rt6i_flags & RTF_REJECT) {
2240 		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
2241 		return;
2242 	}
2243 
2244 	/* Redirect received -> path was valid.
2245 	 * Look, redirects are sent only in response to data packets,
2246 	 * so that this nexthop apparently is reachable. --ANK
2247 	 */
2248 	dst_confirm(&rt->dst);
2249 
	/* NOTE(review): the final argument 1 presumably asks for the entry to
	 * be created when missing -- confirm against __neigh_lookup().
	 */
2250 	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
2251 	if (!neigh)
2252 		return;
2253 
2254 	/*
2255 	 *	We have finally decided to accept it.
2256 	 */
2257 
	/* the router flags are only passed when the target is not on-link */
2258 	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
2259 		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
2260 		     NEIGH_UPDATE_F_OVERRIDE|
2261 		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
2262 				     NEIGH_UPDATE_F_ISROUTER)),
2263 		     NDISC_REDIRECT, &ndopts);
2264 
	/* clone the current route for the redirected destination */
2265 	nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
2266 	if (!nrt)
2267 		goto out;
2268 
2269 	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
2270 	if (on_link)
2271 		nrt->rt6i_flags &= ~RTF_GATEWAY;
2272 
	/* the neighbour's key is the redirect target address */
2273 	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
2274 
2275 	if (ip6_ins_rt(nrt))
2276 		goto out;
2277 
2278 	netevent.old = &rt->dst;
2279 	netevent.new = &nrt->dst;
2280 	netevent.daddr = &msg->dest;
2281 	netevent.neigh = neigh;
2282 	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
2283 
	/* if the old route was itself a cache entry, delete it; dst_clone()
	 * supplies the reference that ip6_del_rt() drops
	 */
2284 	if (rt->rt6i_flags & RTF_CACHE) {
2285 		rt = (struct rt6_info *) dst_clone(&rt->dst);
2286 		ip6_del_rt(rt);
2287 	}
2288 
2289 out:
2290 	neigh_release(neigh);
2291 }
2292 
2293 /*
2294  *	Misc support functions
2295  */
2296 
2297 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
2298 {
2299 	BUG_ON(from->dst.from);
2300 
2301 	rt->rt6i_flags &= ~RTF_EXPIRES;
2302 	dst_hold(&from->dst);
2303 	rt->dst.from = &from->dst;
2304 	dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
2305 }
2306 
2307 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
2308 {
2309 	rt->dst.input = ort->dst.input;
2310 	rt->dst.output = ort->dst.output;
2311 	rt->rt6i_dst = ort->rt6i_dst;
2312 	rt->dst.error = ort->dst.error;
2313 	rt->rt6i_idev = ort->rt6i_idev;
2314 	if (rt->rt6i_idev)
2315 		in6_dev_hold(rt->rt6i_idev);
2316 	rt->dst.lastuse = jiffies;
2317 	rt->rt6i_gateway = ort->rt6i_gateway;
2318 	rt->rt6i_flags = ort->rt6i_flags;
2319 	rt6_set_from(rt, ort);
2320 	rt->rt6i_metric = ort->rt6i_metric;
2321 #ifdef CONFIG_IPV6_SUBTREES
2322 	rt->rt6i_src = ort->rt6i_src;
2323 #endif
2324 	rt->rt6i_prefsrc = ort->rt6i_prefsrc;
2325 	rt->rt6i_table = ort->rt6i_table;
2326 	rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
2327 }
2328 
2329 #ifdef CONFIG_IPV6_ROUTE_INFO
2330 static struct rt6_info *rt6_get_route_info(struct net *net,
2331 					   const struct in6_addr *prefix, int prefixlen,
2332 					   const struct in6_addr *gwaddr,
2333 					   struct net_device *dev)
2334 {
2335 	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
2336 	int ifindex = dev->ifindex;
2337 	struct fib6_node *fn;
2338 	struct rt6_info *rt = NULL;
2339 	struct fib6_table *table;
2340 
2341 	table = fib6_get_table(net, tb_id);
2342 	if (!table)
2343 		return NULL;
2344 
2345 	read_lock_bh(&table->tb6_lock);
2346 	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0);
2347 	if (!fn)
2348 		goto out;
2349 
2350 	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2351 		if (rt->dst.dev->ifindex != ifindex)
2352 			continue;
2353 		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
2354 			continue;
2355 		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
2356 			continue;
2357 		dst_hold(&rt->dst);
2358 		break;
2359 	}
2360 out:
2361 	read_unlock_bh(&table->tb6_lock);
2362 	return rt;
2363 }
2364 
2365 static struct rt6_info *rt6_add_route_info(struct net *net,
2366 					   const struct in6_addr *prefix, int prefixlen,
2367 					   const struct in6_addr *gwaddr,
2368 					   struct net_device *dev,
2369 					   unsigned int pref)
2370 {
2371 	struct fib6_config cfg = {
2372 		.fc_metric	= IP6_RT_PRIO_USER,
2373 		.fc_ifindex	= dev->ifindex,
2374 		.fc_dst_len	= prefixlen,
2375 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
2376 				  RTF_UP | RTF_PREF(pref),
2377 		.fc_nlinfo.portid = 0,
2378 		.fc_nlinfo.nlh = NULL,
2379 		.fc_nlinfo.nl_net = net,
2380 	};
2381 
2382 	cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
2383 	cfg.fc_dst = *prefix;
2384 	cfg.fc_gateway = *gwaddr;
2385 
2386 	/* We should treat it as a default route if prefix length is 0. */
2387 	if (!prefixlen)
2388 		cfg.fc_flags |= RTF_DEFAULT;
2389 
2390 	ip6_route_add(&cfg);
2391 
2392 	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
2393 }
2394 #endif
2395 
2396 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
2397 {
2398 	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
2399 	struct rt6_info *rt;
2400 	struct fib6_table *table;
2401 
2402 	table = fib6_get_table(dev_net(dev), tb_id);
2403 	if (!table)
2404 		return NULL;
2405 
2406 	read_lock_bh(&table->tb6_lock);
2407 	for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2408 		if (dev == rt->dst.dev &&
2409 		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
2410 		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
2411 			break;
2412 	}
2413 	if (rt)
2414 		dst_hold(&rt->dst);
2415 	read_unlock_bh(&table->tb6_lock);
2416 	return rt;
2417 }
2418 
2419 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
2420 				     struct net_device *dev,
2421 				     unsigned int pref)
2422 {
2423 	struct fib6_config cfg = {
2424 		.fc_table	= l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
2425 		.fc_metric	= IP6_RT_PRIO_USER,
2426 		.fc_ifindex	= dev->ifindex,
2427 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
2428 				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
2429 		.fc_nlinfo.portid = 0,
2430 		.fc_nlinfo.nlh = NULL,
2431 		.fc_nlinfo.nl_net = dev_net(dev),
2432 	};
2433 
2434 	cfg.fc_gateway = *gwaddr;
2435 
2436 	if (!ip6_route_add(&cfg)) {
2437 		struct fib6_table *table;
2438 
2439 		table = fib6_get_table(dev_net(dev), cfg.fc_table);
2440 		if (table)
2441 			table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
2442 	}
2443 
2444 	return rt6_get_dflt_router(gwaddr, dev);
2445 }
2446 
2447 static void __rt6_purge_dflt_routers(struct fib6_table *table)
2448 {
2449 	struct rt6_info *rt;
2450 
2451 restart:
2452 	read_lock_bh(&table->tb6_lock);
2453 	for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2454 		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
2455 		    (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
2456 			dst_hold(&rt->dst);
2457 			read_unlock_bh(&table->tb6_lock);
2458 			ip6_del_rt(rt);
2459 			goto restart;
2460 		}
2461 	}
2462 	read_unlock_bh(&table->tb6_lock);
2463 
2464 	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
2465 }
2466 
2467 void rt6_purge_dflt_routers(struct net *net)
2468 {
2469 	struct fib6_table *table;
2470 	struct hlist_head *head;
2471 	unsigned int h;
2472 
2473 	rcu_read_lock();
2474 
2475 	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
2476 		head = &net->ipv6.fib_table_hash[h];
2477 		hlist_for_each_entry_rcu(table, head, tb6_hlist) {
2478 			if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
2479 				__rt6_purge_dflt_routers(table);
2480 		}
2481 	}
2482 
2483 	rcu_read_unlock();
2484 }
2485 
2486 static void rtmsg_to_fib6_config(struct net *net,
2487 				 struct in6_rtmsg *rtmsg,
2488 				 struct fib6_config *cfg)
2489 {
2490 	memset(cfg, 0, sizeof(*cfg));
2491 
2492 	cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
2493 			 : RT6_TABLE_MAIN;
2494 	cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
2495 	cfg->fc_metric = rtmsg->rtmsg_metric;
2496 	cfg->fc_expires = rtmsg->rtmsg_info;
2497 	cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
2498 	cfg->fc_src_len = rtmsg->rtmsg_src_len;
2499 	cfg->fc_flags = rtmsg->rtmsg_flags;
2500 
2501 	cfg->fc_nlinfo.nl_net = net;
2502 
2503 	cfg->fc_dst = rtmsg->rtmsg_dst;
2504 	cfg->fc_src = rtmsg->rtmsg_src;
2505 	cfg->fc_gateway = rtmsg->rtmsg_gateway;
2506 }
2507 
2508 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
2509 {
2510 	struct fib6_config cfg;
2511 	struct in6_rtmsg rtmsg;
2512 	int err;
2513 
2514 	switch (cmd) {
2515 	case SIOCADDRT:		/* Add a route */
2516 	case SIOCDELRT:		/* Delete a route */
2517 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
2518 			return -EPERM;
2519 		err = copy_from_user(&rtmsg, arg,
2520 				     sizeof(struct in6_rtmsg));
2521 		if (err)
2522 			return -EFAULT;
2523 
2524 		rtmsg_to_fib6_config(net, &rtmsg, &cfg);
2525 
2526 		rtnl_lock();
2527 		switch (cmd) {
2528 		case SIOCADDRT:
2529 			err = ip6_route_add(&cfg);
2530 			break;
2531 		case SIOCDELRT:
2532 			err = ip6_route_del(&cfg);
2533 			break;
2534 		default:
2535 			err = -EINVAL;
2536 		}
2537 		rtnl_unlock();
2538 
2539 		return err;
2540 	}
2541 
2542 	return -EINVAL;
2543 }
2544 
2545 /*
2546  *	Drop the packet on the floor
2547  */
2548 
2549 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2550 {
2551 	int type;
2552 	struct dst_entry *dst = skb_dst(skb);
2553 	switch (ipstats_mib_noroutes) {
2554 	case IPSTATS_MIB_INNOROUTES:
2555 		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2556 		if (type == IPV6_ADDR_ANY) {
2557 			IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2558 				      IPSTATS_MIB_INADDRERRORS);
2559 			break;
2560 		}
2561 		/* FALLTHROUGH */
2562 	case IPSTATS_MIB_OUTNOROUTES:
2563 		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2564 			      ipstats_mib_noroutes);
2565 		break;
2566 	}
2567 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2568 	kfree_skb(skb);
2569 	return 0;
2570 }
2571 
2572 static int ip6_pkt_discard(struct sk_buff *skb)
2573 {
2574 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2575 }
2576 
2577 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
2578 {
2579 	skb->dev = skb_dst(skb)->dev;
2580 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2581 }
2582 
2583 static int ip6_pkt_prohibit(struct sk_buff *skb)
2584 {
2585 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2586 }
2587 
2588 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
2589 {
2590 	skb->dev = skb_dst(skb)->dev;
2591 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2592 }
2593 
2594 /*
2595  *	Allocate a dst for local (unicast / anycast) address.
2596  */
2597 
2598 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2599 				    const struct in6_addr *addr,
2600 				    bool anycast)
2601 {
2602 	u32 tb_id;
2603 	struct net *net = dev_net(idev->dev);
2604 	struct net_device *dev = net->loopback_dev;
2605 	struct rt6_info *rt;
2606 
2607 	/* use L3 Master device as loopback for host routes if device
2608 	 * is enslaved and address is not link local or multicast
2609 	 */
2610 	if (!rt6_need_strict(addr))
2611 		dev = l3mdev_master_dev_rcu(idev->dev) ? : dev;
2612 
2613 	rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
2614 	if (!rt)
2615 		return ERR_PTR(-ENOMEM);
2616 
2617 	in6_dev_hold(idev);
2618 
2619 	rt->dst.flags |= DST_HOST;
2620 	rt->dst.input = ip6_input;
2621 	rt->dst.output = ip6_output;
2622 	rt->rt6i_idev = idev;
2623 
2624 	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2625 	if (anycast)
2626 		rt->rt6i_flags |= RTF_ANYCAST;
2627 	else
2628 		rt->rt6i_flags |= RTF_LOCAL;
2629 
2630 	rt->rt6i_gateway  = *addr;
2631 	rt->rt6i_dst.addr = *addr;
2632 	rt->rt6i_dst.plen = 128;
2633 	tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
2634 	rt->rt6i_table = fib6_get_table(net, tb_id);
2635 	rt->dst.flags |= DST_NOCACHE;
2636 
2637 	atomic_set(&rt->dst.__refcnt, 1);
2638 
2639 	return rt;
2640 }
2641 
/*
 * Argument bundle handed to fib6_remove_prefsrc() when a deleted address
 * has to be dropped from the prefsrc of existing routes.
 */
struct arg_dev_net_ip {
	struct net_device *dev;		/* device to match; NULL matches any */
	struct net *net;		/* namespace owning ip6_null_entry */
	struct in6_addr *addr;		/* address compared with rt6i_prefsrc */
};
2648 
2649 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2650 {
2651 	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2652 	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2653 	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2654 
2655 	if (((void *)rt->dst.dev == dev || !dev) &&
2656 	    rt != net->ipv6.ip6_null_entry &&
2657 	    ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2658 		/* remove prefsrc entry */
2659 		rt->rt6i_prefsrc.plen = 0;
2660 	}
2661 	return 0;
2662 }
2663 
2664 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2665 {
2666 	struct net *net = dev_net(ifp->idev->dev);
2667 	struct arg_dev_net_ip adni = {
2668 		.dev = ifp->idev->dev,
2669 		.net = net,
2670 		.addr = &ifp->addr,
2671 	};
2672 	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
2673 }
2674 
/* Flag combinations tested by fib6_clean_tohost(): all bits must be set. */
#define RTF_RA_ROUTER		(RTF_GATEWAY | RTF_DEFAULT | RTF_ADDRCONF)
#define RTF_CACHE_GATEWAY	(RTF_CACHE | RTF_GATEWAY)
2677 
2678 /* Remove routers and update dst entries when gateway turn into host. */
2679 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
2680 {
2681 	struct in6_addr *gateway = (struct in6_addr *)arg;
2682 
2683 	if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) ||
2684 	     ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) &&
2685 	     ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
2686 		return -1;
2687 	}
2688 	return 0;
2689 }
2690 
2691 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
2692 {
2693 	fib6_clean_all(net, fib6_clean_tohost, gateway);
2694 }
2695 
/* Argument bundle for fib6_ifdown(). */
struct arg_dev_net {
	struct net_device *dev;		/* device to match; NULL matches any */
	struct net *net;		/* namespace owning ip6_null_entry */
};
2700 
2701 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2702 {
2703 	const struct arg_dev_net *adn = arg;
2704 	const struct net_device *dev = adn->dev;
2705 
2706 	if ((rt->dst.dev == dev || !dev) &&
2707 	    rt != adn->net->ipv6.ip6_null_entry)
2708 		return -1;
2709 
2710 	return 0;
2711 }
2712 
2713 void rt6_ifdown(struct net *net, struct net_device *dev)
2714 {
2715 	struct arg_dev_net adn = {
2716 		.dev = dev,
2717 		.net = net,
2718 	};
2719 
2720 	fib6_clean_all(net, fib6_ifdown, &adn);
2721 	icmp6_clean_all(fib6_ifdown, &adn);
2722 	if (dev)
2723 		rt6_uncached_list_flush_dev(net, dev);
2724 }
2725 
/* Argument bundle for rt6_mtu_change_route(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;		/* device whose MTU changed */
	unsigned int mtu;		/* the new MTU */
};
2730 
2731 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2732 {
2733 	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2734 	struct inet6_dev *idev;
2735 
2736 	/* In IPv6 pmtu discovery is not optional,
2737 	   so that RTAX_MTU lock cannot disable it.
2738 	   We still use this lock to block changes
2739 	   caused by addrconf/ndisc.
2740 	*/
2741 
2742 	idev = __in6_dev_get(arg->dev);
2743 	if (!idev)
2744 		return 0;
2745 
2746 	/* For administrative MTU increase, there is no way to discover
2747 	   IPv6 PMTU increase, so PMTU increase should be updated here.
2748 	   Since RFC 1981 doesn't include administrative MTU increase
2749 	   update PMTU increase is a MUST. (i.e. jumbo frame)
2750 	 */
2751 	/*
2752 	   If new MTU is less than route PMTU, this new MTU will be the
2753 	   lowest MTU in the path, update the route PMTU to reflect PMTU
2754 	   decreases; if new MTU is greater than route PMTU, and the
2755 	   old MTU is the lowest MTU in the path, update the route PMTU
2756 	   to reflect the increase. In this case if the other nodes' MTU
2757 	   also have the lowest MTU, TOO BIG MESSAGE will be lead to
2758 	   PMTU discouvery.
2759 	 */
2760 	if (rt->dst.dev == arg->dev &&
2761 	    !dst_metric_locked(&rt->dst, RTAX_MTU)) {
2762 		if (rt->rt6i_flags & RTF_CACHE) {
2763 			/* For RTF_CACHE with rt6i_pmtu == 0
2764 			 * (i.e. a redirected route),
2765 			 * the metrics of its rt->dst.from has already
2766 			 * been updated.
2767 			 */
2768 			if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu)
2769 				rt->rt6i_pmtu = arg->mtu;
2770 		} else if (dst_mtu(&rt->dst) >= arg->mtu ||
2771 			   (dst_mtu(&rt->dst) < arg->mtu &&
2772 			    dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
2773 			dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2774 		}
2775 	}
2776 	return 0;
2777 }
2778 
2779 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
2780 {
2781 	struct rt6_mtu_change_arg arg = {
2782 		.dev = dev,
2783 		.mtu = mtu,
2784 	};
2785 
2786 	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
2787 }
2788 
2789 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2790 	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2791 	[RTA_OIF]               = { .type = NLA_U32 },
2792 	[RTA_IIF]		= { .type = NLA_U32 },
2793 	[RTA_PRIORITY]          = { .type = NLA_U32 },
2794 	[RTA_METRICS]           = { .type = NLA_NESTED },
2795 	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
2796 	[RTA_PREF]              = { .type = NLA_U8 },
2797 	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
2798 	[RTA_ENCAP]		= { .type = NLA_NESTED },
2799 	[RTA_EXPIRES]		= { .type = NLA_U32 },
2800 };
2801 
2802 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2803 			      struct fib6_config *cfg)
2804 {
2805 	struct rtmsg *rtm;
2806 	struct nlattr *tb[RTA_MAX+1];
2807 	unsigned int pref;
2808 	int err;
2809 
2810 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2811 	if (err < 0)
2812 		goto errout;
2813 
2814 	err = -EINVAL;
2815 	rtm = nlmsg_data(nlh);
2816 	memset(cfg, 0, sizeof(*cfg));
2817 
2818 	cfg->fc_table = rtm->rtm_table;
2819 	cfg->fc_dst_len = rtm->rtm_dst_len;
2820 	cfg->fc_src_len = rtm->rtm_src_len;
2821 	cfg->fc_flags = RTF_UP;
2822 	cfg->fc_protocol = rtm->rtm_protocol;
2823 	cfg->fc_type = rtm->rtm_type;
2824 
2825 	if (rtm->rtm_type == RTN_UNREACHABLE ||
2826 	    rtm->rtm_type == RTN_BLACKHOLE ||
2827 	    rtm->rtm_type == RTN_PROHIBIT ||
2828 	    rtm->rtm_type == RTN_THROW)
2829 		cfg->fc_flags |= RTF_REJECT;
2830 
2831 	if (rtm->rtm_type == RTN_LOCAL)
2832 		cfg->fc_flags |= RTF_LOCAL;
2833 
2834 	if (rtm->rtm_flags & RTM_F_CLONED)
2835 		cfg->fc_flags |= RTF_CACHE;
2836 
2837 	cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
2838 	cfg->fc_nlinfo.nlh = nlh;
2839 	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2840 
2841 	if (tb[RTA_GATEWAY]) {
2842 		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
2843 		cfg->fc_flags |= RTF_GATEWAY;
2844 	}
2845 
2846 	if (tb[RTA_DST]) {
2847 		int plen = (rtm->rtm_dst_len + 7) >> 3;
2848 
2849 		if (nla_len(tb[RTA_DST]) < plen)
2850 			goto errout;
2851 
2852 		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2853 	}
2854 
2855 	if (tb[RTA_SRC]) {
2856 		int plen = (rtm->rtm_src_len + 7) >> 3;
2857 
2858 		if (nla_len(tb[RTA_SRC]) < plen)
2859 			goto errout;
2860 
2861 		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2862 	}
2863 
2864 	if (tb[RTA_PREFSRC])
2865 		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
2866 
2867 	if (tb[RTA_OIF])
2868 		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2869 
2870 	if (tb[RTA_PRIORITY])
2871 		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2872 
2873 	if (tb[RTA_METRICS]) {
2874 		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2875 		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2876 	}
2877 
2878 	if (tb[RTA_TABLE])
2879 		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2880 
2881 	if (tb[RTA_MULTIPATH]) {
2882 		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
2883 		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
2884 	}
2885 
2886 	if (tb[RTA_PREF]) {
2887 		pref = nla_get_u8(tb[RTA_PREF]);
2888 		if (pref != ICMPV6_ROUTER_PREF_LOW &&
2889 		    pref != ICMPV6_ROUTER_PREF_HIGH)
2890 			pref = ICMPV6_ROUTER_PREF_MEDIUM;
2891 		cfg->fc_flags |= RTF_PREF(pref);
2892 	}
2893 
2894 	if (tb[RTA_ENCAP])
2895 		cfg->fc_encap = tb[RTA_ENCAP];
2896 
2897 	if (tb[RTA_ENCAP_TYPE])
2898 		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
2899 
2900 	if (tb[RTA_EXPIRES]) {
2901 		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
2902 
2903 		if (addrconf_finite_timeout(timeout)) {
2904 			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
2905 			cfg->fc_flags |= RTF_EXPIRES;
2906 		}
2907 	}
2908 
2909 	err = 0;
2910 errout:
2911 	return err;
2912 }
2913 
2914 struct rt6_nh {
2915 	struct rt6_info *rt6_info;
2916 	struct fib6_config r_cfg;
2917 	struct mx6_config mxc;
2918 	struct list_head next;
2919 };
2920 
2921 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
2922 {
2923 	struct rt6_nh *nh;
2924 
2925 	list_for_each_entry(nh, rt6_nh_list, next) {
2926 		pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6 nexthop %pI6 ifi %d\n",
2927 		        &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
2928 		        nh->r_cfg.fc_ifindex);
2929 	}
2930 }
2931 
2932 static int ip6_route_info_append(struct list_head *rt6_nh_list,
2933 				 struct rt6_info *rt, struct fib6_config *r_cfg)
2934 {
2935 	struct rt6_nh *nh;
2936 	struct rt6_info *rtnh;
2937 	int err = -EEXIST;
2938 
2939 	list_for_each_entry(nh, rt6_nh_list, next) {
2940 		/* check if rt6_info already exists */
2941 		rtnh = nh->rt6_info;
2942 
2943 		if (rtnh->dst.dev == rt->dst.dev &&
2944 		    rtnh->rt6i_idev == rt->rt6i_idev &&
2945 		    ipv6_addr_equal(&rtnh->rt6i_gateway,
2946 				    &rt->rt6i_gateway))
2947 			return err;
2948 	}
2949 
2950 	nh = kzalloc(sizeof(*nh), GFP_KERNEL);
2951 	if (!nh)
2952 		return -ENOMEM;
2953 	nh->rt6_info = rt;
2954 	err = ip6_convert_metrics(&nh->mxc, r_cfg);
2955 	if (err) {
2956 		kfree(nh);
2957 		return err;
2958 	}
2959 	memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
2960 	list_add_tail(&nh->next, rt6_nh_list);
2961 
2962 	return 0;
2963 }
2964 
2965 static int ip6_route_multipath_add(struct fib6_config *cfg)
2966 {
2967 	struct fib6_config r_cfg;
2968 	struct rtnexthop *rtnh;
2969 	struct rt6_info *rt;
2970 	struct rt6_nh *err_nh;
2971 	struct rt6_nh *nh, *nh_safe;
2972 	int remaining;
2973 	int attrlen;
2974 	int err = 1;
2975 	int nhn = 0;
2976 	int replace = (cfg->fc_nlinfo.nlh &&
2977 		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
2978 	LIST_HEAD(rt6_nh_list);
2979 
2980 	remaining = cfg->fc_mp_len;
2981 	rtnh = (struct rtnexthop *)cfg->fc_mp;
2982 
2983 	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
2984 	 * rt6_info structs per nexthop
2985 	 */
2986 	while (rtnh_ok(rtnh, remaining)) {
2987 		memcpy(&r_cfg, cfg, sizeof(*cfg));
2988 		if (rtnh->rtnh_ifindex)
2989 			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
2990 
2991 		attrlen = rtnh_attrlen(rtnh);
2992 		if (attrlen > 0) {
2993 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
2994 
2995 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
2996 			if (nla) {
2997 				r_cfg.fc_gateway = nla_get_in6_addr(nla);
2998 				r_cfg.fc_flags |= RTF_GATEWAY;
2999 			}
3000 			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
3001 			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
3002 			if (nla)
3003 				r_cfg.fc_encap_type = nla_get_u16(nla);
3004 		}
3005 
3006 		rt = ip6_route_info_create(&r_cfg);
3007 		if (IS_ERR(rt)) {
3008 			err = PTR_ERR(rt);
3009 			rt = NULL;
3010 			goto cleanup;
3011 		}
3012 
3013 		err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
3014 		if (err) {
3015 			dst_free(&rt->dst);
3016 			goto cleanup;
3017 		}
3018 
3019 		rtnh = rtnh_next(rtnh, &remaining);
3020 	}
3021 
3022 	err_nh = NULL;
3023 	list_for_each_entry(nh, &rt6_nh_list, next) {
3024 		err = __ip6_ins_rt(nh->rt6_info, &cfg->fc_nlinfo, &nh->mxc);
3025 		/* nh->rt6_info is used or freed at this point, reset to NULL*/
3026 		nh->rt6_info = NULL;
3027 		if (err) {
3028 			if (replace && nhn)
3029 				ip6_print_replace_route_err(&rt6_nh_list);
3030 			err_nh = nh;
3031 			goto add_errout;
3032 		}
3033 
3034 		/* Because each route is added like a single route we remove
3035 		 * these flags after the first nexthop: if there is a collision,
3036 		 * we have already failed to add the first nexthop:
3037 		 * fib6_add_rt2node() has rejected it; when replacing, old
3038 		 * nexthops have been replaced by first new, the rest should
3039 		 * be added to it.
3040 		 */
3041 		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
3042 						     NLM_F_REPLACE);
3043 		nhn++;
3044 	}
3045 
3046 	goto cleanup;
3047 
3048 add_errout:
3049 	/* Delete routes that were already added */
3050 	list_for_each_entry(nh, &rt6_nh_list, next) {
3051 		if (err_nh == nh)
3052 			break;
3053 		ip6_route_del(&nh->r_cfg);
3054 	}
3055 
3056 cleanup:
3057 	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
3058 		if (nh->rt6_info)
3059 			dst_free(&nh->rt6_info->dst);
3060 		kfree(nh->mxc.mx);
3061 		list_del(&nh->next);
3062 		kfree(nh);
3063 	}
3064 
3065 	return err;
3066 }
3067 
3068 static int ip6_route_multipath_del(struct fib6_config *cfg)
3069 {
3070 	struct fib6_config r_cfg;
3071 	struct rtnexthop *rtnh;
3072 	int remaining;
3073 	int attrlen;
3074 	int err = 1, last_err = 0;
3075 
3076 	remaining = cfg->fc_mp_len;
3077 	rtnh = (struct rtnexthop *)cfg->fc_mp;
3078 
3079 	/* Parse a Multipath Entry */
3080 	while (rtnh_ok(rtnh, remaining)) {
3081 		memcpy(&r_cfg, cfg, sizeof(*cfg));
3082 		if (rtnh->rtnh_ifindex)
3083 			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
3084 
3085 		attrlen = rtnh_attrlen(rtnh);
3086 		if (attrlen > 0) {
3087 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
3088 
3089 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
3090 			if (nla) {
3091 				nla_memcpy(&r_cfg.fc_gateway, nla, 16);
3092 				r_cfg.fc_flags |= RTF_GATEWAY;
3093 			}
3094 		}
3095 		err = ip6_route_del(&r_cfg);
3096 		if (err)
3097 			last_err = err;
3098 
3099 		rtnh = rtnh_next(rtnh, &remaining);
3100 	}
3101 
3102 	return last_err;
3103 }
3104 
3105 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh)
3106 {
3107 	struct fib6_config cfg;
3108 	int err;
3109 
3110 	err = rtm_to_fib6_config(skb, nlh, &cfg);
3111 	if (err < 0)
3112 		return err;
3113 
3114 	if (cfg.fc_mp)
3115 		return ip6_route_multipath_del(&cfg);
3116 	else
3117 		return ip6_route_del(&cfg);
3118 }
3119 
3120 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
3121 {
3122 	struct fib6_config cfg;
3123 	int err;
3124 
3125 	err = rtm_to_fib6_config(skb, nlh, &cfg);
3126 	if (err < 0)
3127 		return err;
3128 
3129 	if (cfg.fc_mp)
3130 		return ip6_route_multipath_add(&cfg);
3131 	else
3132 		return ip6_route_add(&cfg);
3133 }
3134 
3135 static inline size_t rt6_nlmsg_size(struct rt6_info *rt)
3136 {
3137 	return NLMSG_ALIGN(sizeof(struct rtmsg))
3138 	       + nla_total_size(16) /* RTA_SRC */
3139 	       + nla_total_size(16) /* RTA_DST */
3140 	       + nla_total_size(16) /* RTA_GATEWAY */
3141 	       + nla_total_size(16) /* RTA_PREFSRC */
3142 	       + nla_total_size(4) /* RTA_TABLE */
3143 	       + nla_total_size(4) /* RTA_IIF */
3144 	       + nla_total_size(4) /* RTA_OIF */
3145 	       + nla_total_size(4) /* RTA_PRIORITY */
3146 	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
3147 	       + nla_total_size(sizeof(struct rta_cacheinfo))
3148 	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
3149 	       + nla_total_size(1) /* RTA_PREF */
3150 	       + lwtunnel_get_encap_size(rt->dst.lwtstate);
3151 }
3152 
3153 static int rt6_fill_node(struct net *net,
3154 			 struct sk_buff *skb, struct rt6_info *rt,
3155 			 struct in6_addr *dst, struct in6_addr *src,
3156 			 int iif, int type, u32 portid, u32 seq,
3157 			 int prefix, int nowait, unsigned int flags)
3158 {
3159 	u32 metrics[RTAX_MAX];
3160 	struct rtmsg *rtm;
3161 	struct nlmsghdr *nlh;
3162 	long expires;
3163 	u32 table;
3164 
3165 	if (prefix) {	/* user wants prefix routes only */
3166 		if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
3167 			/* success since this is not a prefix route */
3168 			return 1;
3169 		}
3170 	}
3171 
3172 	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
3173 	if (!nlh)
3174 		return -EMSGSIZE;
3175 
3176 	rtm = nlmsg_data(nlh);
3177 	rtm->rtm_family = AF_INET6;
3178 	rtm->rtm_dst_len = rt->rt6i_dst.plen;
3179 	rtm->rtm_src_len = rt->rt6i_src.plen;
3180 	rtm->rtm_tos = 0;
3181 	if (rt->rt6i_table)
3182 		table = rt->rt6i_table->tb6_id;
3183 	else
3184 		table = RT6_TABLE_UNSPEC;
3185 	rtm->rtm_table = table;
3186 	if (nla_put_u32(skb, RTA_TABLE, table))
3187 		goto nla_put_failure;
3188 	if (rt->rt6i_flags & RTF_REJECT) {
3189 		switch (rt->dst.error) {
3190 		case -EINVAL:
3191 			rtm->rtm_type = RTN_BLACKHOLE;
3192 			break;
3193 		case -EACCES:
3194 			rtm->rtm_type = RTN_PROHIBIT;
3195 			break;
3196 		case -EAGAIN:
3197 			rtm->rtm_type = RTN_THROW;
3198 			break;
3199 		default:
3200 			rtm->rtm_type = RTN_UNREACHABLE;
3201 			break;
3202 		}
3203 	}
3204 	else if (rt->rt6i_flags & RTF_LOCAL)
3205 		rtm->rtm_type = RTN_LOCAL;
3206 	else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
3207 		rtm->rtm_type = RTN_LOCAL;
3208 	else
3209 		rtm->rtm_type = RTN_UNICAST;
3210 	rtm->rtm_flags = 0;
3211 	if (!netif_carrier_ok(rt->dst.dev)) {
3212 		rtm->rtm_flags |= RTNH_F_LINKDOWN;
3213 		if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
3214 			rtm->rtm_flags |= RTNH_F_DEAD;
3215 	}
3216 	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
3217 	rtm->rtm_protocol = rt->rt6i_protocol;
3218 	if (rt->rt6i_flags & RTF_DYNAMIC)
3219 		rtm->rtm_protocol = RTPROT_REDIRECT;
3220 	else if (rt->rt6i_flags & RTF_ADDRCONF) {
3221 		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ROUTEINFO))
3222 			rtm->rtm_protocol = RTPROT_RA;
3223 		else
3224 			rtm->rtm_protocol = RTPROT_KERNEL;
3225 	}
3226 
3227 	if (rt->rt6i_flags & RTF_CACHE)
3228 		rtm->rtm_flags |= RTM_F_CLONED;
3229 
3230 	if (dst) {
3231 		if (nla_put_in6_addr(skb, RTA_DST, dst))
3232 			goto nla_put_failure;
3233 		rtm->rtm_dst_len = 128;
3234 	} else if (rtm->rtm_dst_len)
3235 		if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
3236 			goto nla_put_failure;
3237 #ifdef CONFIG_IPV6_SUBTREES
3238 	if (src) {
3239 		if (nla_put_in6_addr(skb, RTA_SRC, src))
3240 			goto nla_put_failure;
3241 		rtm->rtm_src_len = 128;
3242 	} else if (rtm->rtm_src_len &&
3243 		   nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
3244 		goto nla_put_failure;
3245 #endif
3246 	if (iif) {
3247 #ifdef CONFIG_IPV6_MROUTE
3248 		if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
3249 			int err = ip6mr_get_route(net, skb, rtm, nowait,
3250 						  portid);
3251 
3252 			if (err <= 0) {
3253 				if (!nowait) {
3254 					if (err == 0)
3255 						return 0;
3256 					goto nla_put_failure;
3257 				} else {
3258 					if (err == -EMSGSIZE)
3259 						goto nla_put_failure;
3260 				}
3261 			}
3262 		} else
3263 #endif
3264 			if (nla_put_u32(skb, RTA_IIF, iif))
3265 				goto nla_put_failure;
3266 	} else if (dst) {
3267 		struct in6_addr saddr_buf;
3268 		if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
3269 		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
3270 			goto nla_put_failure;
3271 	}
3272 
3273 	if (rt->rt6i_prefsrc.plen) {
3274 		struct in6_addr saddr_buf;
3275 		saddr_buf = rt->rt6i_prefsrc.addr;
3276 		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
3277 			goto nla_put_failure;
3278 	}
3279 
3280 	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
3281 	if (rt->rt6i_pmtu)
3282 		metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
3283 	if (rtnetlink_put_metrics(skb, metrics) < 0)
3284 		goto nla_put_failure;
3285 
3286 	if (rt->rt6i_flags & RTF_GATEWAY) {
3287 		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
3288 			goto nla_put_failure;
3289 	}
3290 
3291 	if (rt->dst.dev &&
3292 	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
3293 		goto nla_put_failure;
3294 	if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
3295 		goto nla_put_failure;
3296 
3297 	expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
3298 
3299 	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
3300 		goto nla_put_failure;
3301 
3302 	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
3303 		goto nla_put_failure;
3304 
3305 	lwtunnel_fill_encap(skb, rt->dst.lwtstate);
3306 
3307 	nlmsg_end(skb, nlh);
3308 	return 0;
3309 
3310 nla_put_failure:
3311 	nlmsg_cancel(skb, nlh);
3312 	return -EMSGSIZE;
3313 }
3314 
3315 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
3316 {
3317 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
3318 	int prefix;
3319 
3320 	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
3321 		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
3322 		prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
3323 	} else
3324 		prefix = 0;
3325 
3326 	return rt6_fill_node(arg->net,
3327 		     arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
3328 		     NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
3329 		     prefix, 0, NLM_F_MULTI);
3330 }
3331 
3332 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
3333 {
3334 	struct net *net = sock_net(in_skb->sk);
3335 	struct nlattr *tb[RTA_MAX+1];
3336 	struct rt6_info *rt;
3337 	struct sk_buff *skb;
3338 	struct rtmsg *rtm;
3339 	struct flowi6 fl6;
3340 	int err, iif = 0, oif = 0;
3341 
3342 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
3343 	if (err < 0)
3344 		goto errout;
3345 
3346 	err = -EINVAL;
3347 	memset(&fl6, 0, sizeof(fl6));
3348 	rtm = nlmsg_data(nlh);
3349 	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
3350 
3351 	if (tb[RTA_SRC]) {
3352 		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
3353 			goto errout;
3354 
3355 		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
3356 	}
3357 
3358 	if (tb[RTA_DST]) {
3359 		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
3360 			goto errout;
3361 
3362 		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
3363 	}
3364 
3365 	if (tb[RTA_IIF])
3366 		iif = nla_get_u32(tb[RTA_IIF]);
3367 
3368 	if (tb[RTA_OIF])
3369 		oif = nla_get_u32(tb[RTA_OIF]);
3370 
3371 	if (tb[RTA_MARK])
3372 		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
3373 
3374 	if (iif) {
3375 		struct net_device *dev;
3376 		int flags = 0;
3377 
3378 		dev = __dev_get_by_index(net, iif);
3379 		if (!dev) {
3380 			err = -ENODEV;
3381 			goto errout;
3382 		}
3383 
3384 		fl6.flowi6_iif = iif;
3385 
3386 		if (!ipv6_addr_any(&fl6.saddr))
3387 			flags |= RT6_LOOKUP_F_HAS_SADDR;
3388 
3389 		rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6,
3390 							       flags);
3391 	} else {
3392 		fl6.flowi6_oif = oif;
3393 
3394 		rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
3395 	}
3396 
3397 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3398 	if (!skb) {
3399 		ip6_rt_put(rt);
3400 		err = -ENOBUFS;
3401 		goto errout;
3402 	}
3403 
3404 	/* Reserve room for dummy headers, this skb can pass
3405 	   through good chunk of routing engine.
3406 	 */
3407 	skb_reset_mac_header(skb);
3408 	skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
3409 
3410 	skb_dst_set(skb, &rt->dst);
3411 
3412 	err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
3413 			    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
3414 			    nlh->nlmsg_seq, 0, 0, 0);
3415 	if (err < 0) {
3416 		kfree_skb(skb);
3417 		goto errout;
3418 	}
3419 
3420 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3421 errout:
3422 	return err;
3423 }
3424 
3425 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
3426 		     unsigned int nlm_flags)
3427 {
3428 	struct sk_buff *skb;
3429 	struct net *net = info->nl_net;
3430 	u32 seq;
3431 	int err;
3432 
3433 	err = -ENOBUFS;
3434 	seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3435 
3436 	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3437 	if (!skb)
3438 		goto errout;
3439 
3440 	err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
3441 				event, info->portid, seq, 0, 0, nlm_flags);
3442 	if (err < 0) {
3443 		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
3444 		WARN_ON(err == -EMSGSIZE);
3445 		kfree_skb(skb);
3446 		goto errout;
3447 	}
3448 	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3449 		    info->nlh, gfp_any());
3450 	return;
3451 errout:
3452 	if (err < 0)
3453 		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
3454 }
3455 
3456 static int ip6_route_dev_notify(struct notifier_block *this,
3457 				unsigned long event, void *ptr)
3458 {
3459 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
3460 	struct net *net = dev_net(dev);
3461 
3462 	if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
3463 		net->ipv6.ip6_null_entry->dst.dev = dev;
3464 		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
3465 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3466 		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
3467 		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
3468 		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
3469 		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
3470 #endif
3471 	}
3472 
3473 	return NOTIFY_OK;
3474 }
3475 
3476 /*
3477  *	/proc
3478  */
3479 
3480 #ifdef CONFIG_PROC_FS
3481 
3482 static const struct file_operations ipv6_route_proc_fops = {
3483 	.owner		= THIS_MODULE,
3484 	.open		= ipv6_route_open,
3485 	.read		= seq_read,
3486 	.llseek		= seq_lseek,
3487 	.release	= seq_release_net,
3488 };
3489 
3490 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
3491 {
3492 	struct net *net = (struct net *)seq->private;
3493 	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
3494 		   net->ipv6.rt6_stats->fib_nodes,
3495 		   net->ipv6.rt6_stats->fib_route_nodes,
3496 		   net->ipv6.rt6_stats->fib_rt_alloc,
3497 		   net->ipv6.rt6_stats->fib_rt_entries,
3498 		   net->ipv6.rt6_stats->fib_rt_cache,
3499 		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
3500 		   net->ipv6.rt6_stats->fib_discarded_routes);
3501 
3502 	return 0;
3503 }
3504 
3505 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
3506 {
3507 	return single_open_net(inode, file, rt6_stats_seq_show);
3508 }
3509 
3510 static const struct file_operations rt6_stats_seq_fops = {
3511 	.owner	 = THIS_MODULE,
3512 	.open	 = rt6_stats_seq_open,
3513 	.read	 = seq_read,
3514 	.llseek	 = seq_lseek,
3515 	.release = single_release_net,
3516 };
3517 #endif	/* CONFIG_PROC_FS */
3518 
3519 #ifdef CONFIG_SYSCTL
3520 
3521 static
3522 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
3523 			      void __user *buffer, size_t *lenp, loff_t *ppos)
3524 {
3525 	struct net *net;
3526 	int delay;
3527 	if (!write)
3528 		return -EINVAL;
3529 
3530 	net = (struct net *)ctl->extra1;
3531 	delay = net->ipv6.sysctl.flush_delay;
3532 	proc_dointvec(ctl, write, buffer, lenp, ppos);
3533 	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
3534 	return 0;
3535 }
3536 
3537 struct ctl_table ipv6_route_table_template[] = {
3538 	{
3539 		.procname	=	"flush",
3540 		.data		=	&init_net.ipv6.sysctl.flush_delay,
3541 		.maxlen		=	sizeof(int),
3542 		.mode		=	0200,
3543 		.proc_handler	=	ipv6_sysctl_rtcache_flush
3544 	},
3545 	{
3546 		.procname	=	"gc_thresh",
3547 		.data		=	&ip6_dst_ops_template.gc_thresh,
3548 		.maxlen		=	sizeof(int),
3549 		.mode		=	0644,
3550 		.proc_handler	=	proc_dointvec,
3551 	},
3552 	{
3553 		.procname	=	"max_size",
3554 		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
3555 		.maxlen		=	sizeof(int),
3556 		.mode		=	0644,
3557 		.proc_handler	=	proc_dointvec,
3558 	},
3559 	{
3560 		.procname	=	"gc_min_interval",
3561 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3562 		.maxlen		=	sizeof(int),
3563 		.mode		=	0644,
3564 		.proc_handler	=	proc_dointvec_jiffies,
3565 	},
3566 	{
3567 		.procname	=	"gc_timeout",
3568 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
3569 		.maxlen		=	sizeof(int),
3570 		.mode		=	0644,
3571 		.proc_handler	=	proc_dointvec_jiffies,
3572 	},
3573 	{
3574 		.procname	=	"gc_interval",
3575 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
3576 		.maxlen		=	sizeof(int),
3577 		.mode		=	0644,
3578 		.proc_handler	=	proc_dointvec_jiffies,
3579 	},
3580 	{
3581 		.procname	=	"gc_elasticity",
3582 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
3583 		.maxlen		=	sizeof(int),
3584 		.mode		=	0644,
3585 		.proc_handler	=	proc_dointvec,
3586 	},
3587 	{
3588 		.procname	=	"mtu_expires",
3589 		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
3590 		.maxlen		=	sizeof(int),
3591 		.mode		=	0644,
3592 		.proc_handler	=	proc_dointvec_jiffies,
3593 	},
3594 	{
3595 		.procname	=	"min_adv_mss",
3596 		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
3597 		.maxlen		=	sizeof(int),
3598 		.mode		=	0644,
3599 		.proc_handler	=	proc_dointvec,
3600 	},
3601 	{
3602 		.procname	=	"gc_min_interval_ms",
3603 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3604 		.maxlen		=	sizeof(int),
3605 		.mode		=	0644,
3606 		.proc_handler	=	proc_dointvec_ms_jiffies,
3607 	},
3608 	{ }
3609 };
3610 
3611 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
3612 {
3613 	struct ctl_table *table;
3614 
3615 	table = kmemdup(ipv6_route_table_template,
3616 			sizeof(ipv6_route_table_template),
3617 			GFP_KERNEL);
3618 
3619 	if (table) {
3620 		table[0].data = &net->ipv6.sysctl.flush_delay;
3621 		table[0].extra1 = net;
3622 		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
3623 		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
3624 		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3625 		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
3626 		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
3627 		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
3628 		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
3629 		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
3630 		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3631 
3632 		/* Don't export sysctls to unprivileged users */
3633 		if (net->user_ns != &init_user_ns)
3634 			table[0].procname = NULL;
3635 	}
3636 
3637 	return table;
3638 }
3639 #endif
3640 
3641 static int __net_init ip6_route_net_init(struct net *net)
3642 {
3643 	int ret = -ENOMEM;
3644 
3645 	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
3646 	       sizeof(net->ipv6.ip6_dst_ops));
3647 
3648 	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
3649 		goto out_ip6_dst_ops;
3650 
3651 	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
3652 					   sizeof(*net->ipv6.ip6_null_entry),
3653 					   GFP_KERNEL);
3654 	if (!net->ipv6.ip6_null_entry)
3655 		goto out_ip6_dst_entries;
3656 	net->ipv6.ip6_null_entry->dst.path =
3657 		(struct dst_entry *)net->ipv6.ip6_null_entry;
3658 	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3659 	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
3660 			 ip6_template_metrics, true);
3661 
3662 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3663 	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
3664 					       sizeof(*net->ipv6.ip6_prohibit_entry),
3665 					       GFP_KERNEL);
3666 	if (!net->ipv6.ip6_prohibit_entry)
3667 		goto out_ip6_null_entry;
3668 	net->ipv6.ip6_prohibit_entry->dst.path =
3669 		(struct dst_entry *)net->ipv6.ip6_prohibit_entry;
3670 	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3671 	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
3672 			 ip6_template_metrics, true);
3673 
3674 	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
3675 					       sizeof(*net->ipv6.ip6_blk_hole_entry),
3676 					       GFP_KERNEL);
3677 	if (!net->ipv6.ip6_blk_hole_entry)
3678 		goto out_ip6_prohibit_entry;
3679 	net->ipv6.ip6_blk_hole_entry->dst.path =
3680 		(struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
3681 	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3682 	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
3683 			 ip6_template_metrics, true);
3684 #endif
3685 
3686 	net->ipv6.sysctl.flush_delay = 0;
3687 	net->ipv6.sysctl.ip6_rt_max_size = 4096;
3688 	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
3689 	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
3690 	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
3691 	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
3692 	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
3693 	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
3694 
3695 	net->ipv6.ip6_rt_gc_expire = 30*HZ;
3696 
3697 	ret = 0;
3698 out:
3699 	return ret;
3700 
3701 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3702 out_ip6_prohibit_entry:
3703 	kfree(net->ipv6.ip6_prohibit_entry);
3704 out_ip6_null_entry:
3705 	kfree(net->ipv6.ip6_null_entry);
3706 #endif
3707 out_ip6_dst_entries:
3708 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
3709 out_ip6_dst_ops:
3710 	goto out;
3711 }
3712 
3713 static void __net_exit ip6_route_net_exit(struct net *net)
3714 {
3715 	kfree(net->ipv6.ip6_null_entry);
3716 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3717 	kfree(net->ipv6.ip6_prohibit_entry);
3718 	kfree(net->ipv6.ip6_blk_hole_entry);
3719 #endif
3720 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
3721 }
3722 
3723 static int __net_init ip6_route_net_init_late(struct net *net)
3724 {
3725 #ifdef CONFIG_PROC_FS
3726 	proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
3727 	proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
3728 #endif
3729 	return 0;
3730 }
3731 
3732 static void __net_exit ip6_route_net_exit_late(struct net *net)
3733 {
3734 #ifdef CONFIG_PROC_FS
3735 	remove_proc_entry("ipv6_route", net->proc_net);
3736 	remove_proc_entry("rt6_stats", net->proc_net);
3737 #endif
3738 }
3739 
3740 static struct pernet_operations ip6_route_net_ops = {
3741 	.init = ip6_route_net_init,
3742 	.exit = ip6_route_net_exit,
3743 };
3744 
3745 static int __net_init ipv6_inetpeer_init(struct net *net)
3746 {
3747 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3748 
3749 	if (!bp)
3750 		return -ENOMEM;
3751 	inet_peer_base_init(bp);
3752 	net->ipv6.peers = bp;
3753 	return 0;
3754 }
3755 
3756 static void __net_exit ipv6_inetpeer_exit(struct net *net)
3757 {
3758 	struct inet_peer_base *bp = net->ipv6.peers;
3759 
3760 	net->ipv6.peers = NULL;
3761 	inetpeer_invalidate_tree(bp);
3762 	kfree(bp);
3763 }
3764 
3765 static struct pernet_operations ipv6_inetpeer_ops = {
3766 	.init	=	ipv6_inetpeer_init,
3767 	.exit	=	ipv6_inetpeer_exit,
3768 };
3769 
3770 static struct pernet_operations ip6_route_net_late_ops = {
3771 	.init = ip6_route_net_init_late,
3772 	.exit = ip6_route_net_exit_late,
3773 };
3774 
3775 static struct notifier_block ip6_route_dev_notifier = {
3776 	.notifier_call = ip6_route_dev_notify,
3777 	.priority = 0,
3778 };
3779 
3780 int __init ip6_route_init(void)
3781 {
3782 	int ret;
3783 	int cpu;
3784 
3785 	ret = -ENOMEM;
3786 	ip6_dst_ops_template.kmem_cachep =
3787 		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
3788 				  SLAB_HWCACHE_ALIGN, NULL);
3789 	if (!ip6_dst_ops_template.kmem_cachep)
3790 		goto out;
3791 
3792 	ret = dst_entries_init(&ip6_dst_blackhole_ops);
3793 	if (ret)
3794 		goto out_kmem_cache;
3795 
3796 	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
3797 	if (ret)
3798 		goto out_dst_entries;
3799 
3800 	ret = register_pernet_subsys(&ip6_route_net_ops);
3801 	if (ret)
3802 		goto out_register_inetpeer;
3803 
3804 	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
3805 
3806 	/* Registering of the loopback is done before this portion of code,
3807 	 * the loopback reference in rt6_info will not be taken, do it
3808 	 * manually for init_net */
3809 	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
3810 	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3811   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3812 	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
3813 	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3814 	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
3815 	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3816   #endif
3817 	ret = fib6_init();
3818 	if (ret)
3819 		goto out_register_subsys;
3820 
3821 	ret = xfrm6_init();
3822 	if (ret)
3823 		goto out_fib6_init;
3824 
3825 	ret = fib6_rules_init();
3826 	if (ret)
3827 		goto xfrm6_init;
3828 
3829 	ret = register_pernet_subsys(&ip6_route_net_late_ops);
3830 	if (ret)
3831 		goto fib6_rules_init;
3832 
3833 	ret = -ENOBUFS;
3834 	if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
3835 	    __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
3836 	    __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
3837 		goto out_register_late_subsys;
3838 
3839 	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
3840 	if (ret)
3841 		goto out_register_late_subsys;
3842 
3843 	for_each_possible_cpu(cpu) {
3844 		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
3845 
3846 		INIT_LIST_HEAD(&ul->head);
3847 		spin_lock_init(&ul->lock);
3848 	}
3849 
3850 out:
3851 	return ret;
3852 
3853 out_register_late_subsys:
3854 	unregister_pernet_subsys(&ip6_route_net_late_ops);
3855 fib6_rules_init:
3856 	fib6_rules_cleanup();
3857 xfrm6_init:
3858 	xfrm6_fini();
3859 out_fib6_init:
3860 	fib6_gc_cleanup();
3861 out_register_subsys:
3862 	unregister_pernet_subsys(&ip6_route_net_ops);
3863 out_register_inetpeer:
3864 	unregister_pernet_subsys(&ipv6_inetpeer_ops);
3865 out_dst_entries:
3866 	dst_entries_destroy(&ip6_dst_blackhole_ops);
3867 out_kmem_cache:
3868 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3869 	goto out;
3870 }
3871 
3872 void ip6_route_cleanup(void)
3873 {
3874 	unregister_netdevice_notifier(&ip6_route_dev_notifier);
3875 	unregister_pernet_subsys(&ip6_route_net_late_ops);
3876 	fib6_rules_cleanup();
3877 	xfrm6_fini();
3878 	fib6_gc_cleanup();
3879 	unregister_pernet_subsys(&ipv6_inetpeer_ops);
3880 	unregister_pernet_subsys(&ip6_route_net_ops);
3881 	dst_entries_destroy(&ip6_dst_blackhole_ops);
3882 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3883 }
3884