xref: /openbmc/linux/net/ipv6/route.c (revision c0e297dc)
1 /*
2  *	Linux INET6 implementation
3  *	FIB front-end.
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13 
14 /*	Changes:
15  *
16  *	YOSHIFUJI Hideaki @USAGI
17  *		reworked default router selection.
18  *		- respect outgoing interface
19  *		- select from (probably) reachable routers (i.e.
20  *		routers in REACHABLE, STALE, DELAY or PROBE states).
21  *		- always select the same router if it is (probably)
22  *		reachable.  otherwise, round-robin the list.
23  *	Ville Nuorvala
24  *		Fixed routing subtrees.
25  */
26 
27 #define pr_fmt(fmt) "IPv6: " fmt
28 
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <net/net_namespace.h>
48 #include <net/snmp.h>
49 #include <net/ipv6.h>
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
52 #include <net/ndisc.h>
53 #include <net/addrconf.h>
54 #include <net/tcp.h>
55 #include <linux/rtnetlink.h>
56 #include <net/dst.h>
57 #include <net/xfrm.h>
58 #include <net/netevent.h>
59 #include <net/netlink.h>
60 #include <net/nexthop.h>
61 
62 #include <asm/uaccess.h>
63 
64 #ifdef CONFIG_SYSCTL
65 #include <linux/sysctl.h>
66 #endif
67 
/* Result of the next-hop reachability check in rt6_check_neigh().
 * Negative values are failures of increasing recoverability; positive
 * means the gateway neighbour is (probably) usable.
 */
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,		/* route unusable for this lookup */
	RT6_NUD_FAIL_PROBE = -2,	/* neighbour in NUD_FAILED; worth probing */
	RT6_NUD_FAIL_DO_RR = -1,	/* no neighbour entry; try round-robin */
	RT6_NUD_SUCCEED = 1		/* neighbour valid (or no gateway needed) */
};
74 
/* Forward declarations for the dst_ops callbacks and local helpers
 * defined later in this file.
 */
static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
static unsigned int	 ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void		ip6_dst_destroy(struct dst_entry *);
static void		ip6_dst_ifdown(struct dst_entry *,
				       struct net_device *dev, int how);
static int		 ip6_dst_gc(struct dst_ops *ops);

static int		ip6_pkt_discard(struct sk_buff *skb);
static int		ip6_pkt_discard_out(struct sock *sk, struct sk_buff *skb);
static int		ip6_pkt_prohibit(struct sk_buff *skb);
static int		ip6_pkt_prohibit_out(struct sock *sk, struct sk_buff *skb);
static void		ip6_link_failure(struct sk_buff *skb);
static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static void		rt6_dst_from_metrics_check(struct rt6_info *rt);
static int rt6_score_route(struct rt6_info *rt, int oif, int strict);

#ifdef CONFIG_IPV6_ROUTE_INFO
/* RFC 4191 Route Information Option handling */
static struct rt6_info *rt6_add_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr, int ifindex,
					   unsigned int pref);
static struct rt6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr, int ifindex);
#endif
106 
/* Per-cpu list of "uncached" routes (RTF_CACHE clones not owned by the
 * fib6 tree).  The lists are walked on device teardown so dangling
 * dev/idev references can be redirected to the loopback device.
 */
struct uncached_list {
	spinlock_t		lock;	/* protects head */
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
113 
/* Link @rt on the current cpu's uncached list and mark it DST_NOCACHE
 * so the dst layer will free it directly on last release.
 */
static void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->dst.flags |= DST_NOCACHE;
	/* remember which cpu's list holds us, for rt6_uncached_list_del() */
	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}
125 
/* Unlink @rt from its uncached list, if it was ever added.  An empty
 * rt6i_uncached list head means rt6_uncached_list_add() never ran.
 */
static void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		spin_unlock_bh(&ul->lock);
	}
}
136 
/* Detach uncached routes from a device that is going away.  For every
 * uncached route still pointing at @dev (or at any device when @dev is
 * NULL), move its idev and dst.dev references over to the loopback
 * device of @net so the original device can be unregistered.
 */
static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			/* swap the idev reference to loopback's idev */
			if (rt_idev && (rt_idev->dev == dev || !dev) &&
			    rt_idev->dev != loopback_dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			/* swap the netdevice reference to loopback */
			if (rt_dev && (rt_dev == dev || !dev) &&
			    rt_dev != loopback_dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}
167 
/* Per-cpu routes never own metrics: write through to the route they
 * were cloned from (dst.from, set at pcpu-copy creation time).
 */
static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
{
	return dst_metrics_write_ptr(rt->dst.from);
}
172 
173 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
174 {
175 	struct rt6_info *rt = (struct rt6_info *)dst;
176 
177 	if (rt->rt6i_flags & RTF_PCPU)
178 		return rt6_pcpu_cow_metrics(rt);
179 	else if (rt->rt6i_flags & RTF_CACHE)
180 		return NULL;
181 	else
182 		return dst_cow_metrics_generic(dst, old);
183 }
184 
185 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
186 					     struct sk_buff *skb,
187 					     const void *daddr)
188 {
189 	struct in6_addr *p = &rt->rt6i_gateway;
190 
191 	if (!ipv6_addr_any(p))
192 		return (const void *) p;
193 	else if (skb)
194 		return &ipv6_hdr(skb)->daddr;
195 	return daddr;
196 }
197 
198 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
199 					  struct sk_buff *skb,
200 					  const void *daddr)
201 {
202 	struct rt6_info *rt = (struct rt6_info *) dst;
203 	struct neighbour *n;
204 
205 	daddr = choose_neigh_daddr(rt, skb, daddr);
206 	n = __ipv6_neigh_lookup(dst->dev, daddr);
207 	if (n)
208 		return n;
209 	return neigh_create(&nd_tbl, daddr, dst->dev);
210 }
211 
/* Template for the per-netns IPv6 dst_ops; copied into
 * net->ipv6.ip6_dst_ops at namespace init.
 */
static struct dst_ops ip6_dst_ops_template = {
	.family			=	AF_INET6,
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
	.default_advmss		=	ip6_default_advmss,
	.mtu			=	ip6_mtu,
	.cow_metrics		=	ipv6_cow_metrics,
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
	.redirect		=	rt6_do_redirect,
	.local_out		=	__ip6_local_out,
	.neigh_lookup		=	ip6_neigh_lookup,
};
229 
230 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
231 {
232 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
233 
234 	return mtu ? : dst->dev->mtu;
235 }
236 
/* Blackhole routes deliberately ignore PMTU updates. */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}
241 
/* Blackhole routes deliberately ignore ICMPv6 redirects. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}
246 
/* Blackhole routes never get writable metrics. */
static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
					 unsigned long old)
{
	return NULL;
}
252 
/* dst_ops for blackhole copies made by ip6_blackhole_route(); most
 * mutating callbacks are no-ops.
 */
static struct dst_ops ip6_dst_blackhole_ops = {
	.family			=	AF_INET6,
	.destroy		=	ip6_dst_destroy,
	.check			=	ip6_dst_check,
	.mtu			=	ip6_blackhole_mtu,
	.default_advmss		=	ip6_default_advmss,
	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
	.redirect		=	ip6_rt_blackhole_redirect,
	.cow_metrics		=	ip6_rt_blackhole_cow_metrics,
	.neigh_lookup		=	ip6_neigh_lookup,
};
264 
/* Metrics shared by the special route templates below; hop limit 0
 * means "use the per-device default".
 */
static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};
268 
/* Template for the per-netns "null" route: returned when no route
 * matches; drops packets with -ENETUNREACH.
 */
static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol  = RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,	/* worst possible metric */
	.rt6i_ref	= ATOMIC_INIT(1),
};
283 
284 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
285 
/* Template for the "prohibit" route (policy routing): rejects packets
 * with -EACCES.
 */
static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol  = RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,	/* worst possible metric */
	.rt6i_ref	= ATOMIC_INIT(1),
};
300 
/* Template for the "blackhole" route (policy routing): silently
 * discards packets, reporting -EINVAL internally.
 */
static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_sk,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol  = RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,	/* worst possible metric */
	.rt6i_ref	= ATOMIC_INIT(1),
};
315 
316 #endif
317 
/* allocate dst with ip6_dst_ops */
static struct rt6_info *__ip6_dst_alloc(struct net *net,
					struct net_device *dev,
					int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					0, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		struct dst_entry *dst = &rt->dst;

		/* dst_alloc() initialised the embedded dst_entry; zero
		 * only the rt6_info-specific tail that follows it.
		 */
		memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
		INIT_LIST_HEAD(&rt->rt6i_siblings);
		INIT_LIST_HEAD(&rt->rt6i_uncached);
	}
	return rt;
}
335 
336 static struct rt6_info *ip6_dst_alloc(struct net *net,
337 				      struct net_device *dev,
338 				      int flags)
339 {
340 	struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
341 
342 	if (rt) {
343 		rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
344 		if (rt->rt6i_pcpu) {
345 			int cpu;
346 
347 			for_each_possible_cpu(cpu) {
348 				struct rt6_info **p;
349 
350 				p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
351 				/* no one shares rt */
352 				*p =  NULL;
353 			}
354 		} else {
355 			dst_destroy((struct dst_entry *)rt);
356 			return NULL;
357 		}
358 	}
359 
360 	return rt;
361 }
362 
/* dst_ops->destroy: release everything a rt6_info owns - metrics,
 * the per-cpu clone array, its uncached-list linkage, the idev
 * reference and, last, the parent route it was cloned from.
 */
static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct dst_entry *from = dst->from;
	struct inet6_dev *idev;

	dst_destroy_metrics_generic(dst);
	free_percpu(rt->rt6i_pcpu);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	/* clear the back pointer before dropping the parent reference */
	dst->from = NULL;
	dst_release(from);
}
382 
383 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
384 			   int how)
385 {
386 	struct rt6_info *rt = (struct rt6_info *)dst;
387 	struct inet6_dev *idev = rt->rt6i_idev;
388 	struct net_device *loopback_dev =
389 		dev_net(dev)->loopback_dev;
390 
391 	if (dev != loopback_dev) {
392 		if (idev && idev->dev == dev) {
393 			struct inet6_dev *loopback_idev =
394 				in6_dev_get(loopback_dev);
395 			if (loopback_idev) {
396 				rt->rt6i_idev = loopback_idev;
397 				in6_dev_put(idev);
398 			}
399 		}
400 	}
401 }
402 
403 static bool rt6_check_expired(const struct rt6_info *rt)
404 {
405 	if (rt->rt6i_flags & RTF_EXPIRES) {
406 		if (time_after(jiffies, rt->dst.expires))
407 			return true;
408 	} else if (rt->dst.from) {
409 		return rt6_check_expired((struct rt6_info *) rt->dst.from);
410 	}
411 	return false;
412 }
413 
414 /* Multipath route selection:
415  *   Hash based function using packet header and flowlabel.
416  * Adapted from fib_info_hashfn()
417  */
418 static int rt6_info_hash_nhsfn(unsigned int candidate_count,
419 			       const struct flowi6 *fl6)
420 {
421 	unsigned int val = fl6->flowi6_proto;
422 
423 	val ^= ipv6_addr_hash(&fl6->daddr);
424 	val ^= ipv6_addr_hash(&fl6->saddr);
425 
426 	/* Work only if this not encapsulated */
427 	switch (fl6->flowi6_proto) {
428 	case IPPROTO_UDP:
429 	case IPPROTO_TCP:
430 	case IPPROTO_SCTP:
431 		val ^= (__force u16)fl6->fl6_sport;
432 		val ^= (__force u16)fl6->fl6_dport;
433 		break;
434 
435 	case IPPROTO_ICMPV6:
436 		val ^= (__force u16)fl6->fl6_icmp_type;
437 		val ^= (__force u16)fl6->fl6_icmp_code;
438 		break;
439 	}
440 	/* RFC6438 recommands to use flowlabel */
441 	val ^= (__force u32)fl6->flowlabel;
442 
443 	/* Perhaps, we need to tune, this function? */
444 	val = val ^ (val >> 7) ^ (val >> 12);
445 	return val % candidate_count;
446 }
447 
448 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
449 					     struct flowi6 *fl6, int oif,
450 					     int strict)
451 {
452 	struct rt6_info *sibling, *next_sibling;
453 	int route_choosen;
454 
455 	route_choosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6);
456 	/* Don't change the route, if route_choosen == 0
457 	 * (siblings does not include ourself)
458 	 */
459 	if (route_choosen)
460 		list_for_each_entry_safe(sibling, next_sibling,
461 				&match->rt6i_siblings, rt6i_siblings) {
462 			route_choosen--;
463 			if (route_choosen == 0) {
464 				if (rt6_score_route(sibling, oif, strict) < 0)
465 					break;
466 				match = sibling;
467 				break;
468 			}
469 		}
470 	return match;
471 }
472 
/*
 *	Route lookup. Any table->tb6_lock is implied.
 */

/* Walk the leaf chain starting at @rt and return the entry that best
 * matches the output interface @oif and/or source address @saddr.
 * With no constraints, the head of the chain is returned unchanged.
 */
static inline struct rt6_info *rt6_device_match(struct net *net,
						    struct rt6_info *rt,
						    const struct in6_addr *saddr,
						    int oif,
						    int flags)
{
	struct rt6_info *local = NULL;
	struct rt6_info *sprt;

	if (!oif && ipv6_addr_any(saddr))
		goto out;

	for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
		struct net_device *dev = sprt->dst.dev;

		if (oif) {
			/* exact output-device match wins immediately */
			if (dev->ifindex == oif)
				return sprt;
			if (dev->flags & IFF_LOOPBACK) {
				/* keep a loopback route as fallback,
				 * preferring one whose idev matches oif
				 */
				if (!sprt->rt6i_idev ||
				    sprt->rt6i_idev->dev->ifindex != oif) {
					if (flags & RT6_LOOKUP_F_IFACE && oif)
						continue;
					if (local && (!oif ||
						      local->rt6i_idev->dev->ifindex == oif))
						continue;
				}
				local = sprt;
			}
		} else {
			/* no oif: match on saddr being configured on
			 * the route's device
			 */
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif) {
		if (local)
			return local;

		/* strict interface match requested but nothing fit */
		if (flags & RT6_LOOKUP_F_IFACE)
			return net->ipv6.ip6_null_entry;
	}
out:
	return rt;
}
523 
524 #ifdef CONFIG_IPV6_ROUTER_PREF
/* Deferred-probe context: carries the address to solicit and a held
 * reference on the output device until the work item runs.
 */
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};
530 
/* Workqueue half of rt6_probe(): send a Neighbor Solicitation for the
 * router to its solicited-node multicast address, then drop the device
 * reference taken when the work was queued and free the context.
 */
static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, NULL, &work->target, &mcaddr, NULL);
	dev_put(work->dev);
	kfree(work);
}
542 
/* Router Reachability Probing: if the gateway's neighbour entry is not
 * known-valid and has not been probed within rtr_probe_interval, queue
 * a deferred Neighbor Solicitation for it.
 *
 * NOTE(review): the "out:" label lives inside the else branch so the
 * neigh-valid early exit above can share its write_unlock - the
 * control flow is deliberate, if unusual.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh;
	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
		return;
	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		write_lock(&neigh->lock);
		/* already reachable: nothing to probe */
		if (neigh->nud_state & NUD_VALID)
			goto out;
	}

	if (!neigh ||
	    time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
		struct __rt6_probe_work *work;

		work = kmalloc(sizeof(*work), GFP_ATOMIC);

		/* mark the probe on the neigh entry for rate limiting */
		if (neigh && work)
			__neigh_set_probe_once(neigh);

		if (neigh)
			write_unlock(&neigh->lock);

		if (work) {
			INIT_WORK(&work->work, rt6_probe_deferred);
			work->target = rt->rt6i_gateway;
			dev_hold(rt->dst.dev);	/* dropped in rt6_probe_deferred() */
			work->dev = rt->dst.dev;
			schedule_work(&work->work);
		}
	} else {
out:
		write_unlock(&neigh->lock);
	}
	rcu_read_unlock_bh();
}
589 #else
/* Without CONFIG_IPV6_ROUTER_PREF, router probing is a no-op. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
593 #endif
594 
595 /*
596  * Default Router Selection (RFC 2461 6.3.6)
597  */
598 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
599 {
600 	struct net_device *dev = rt->dst.dev;
601 	if (!oif || dev->ifindex == oif)
602 		return 2;
603 	if ((dev->flags & IFF_LOOPBACK) &&
604 	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
605 		return 1;
606 	return 0;
607 }
608 
/* Classify the reachability of @rt's gateway neighbour (see
 * enum rt6_nud_state).  Routes without a gateway always succeed.
 */
static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
{
	struct neighbour *neigh;
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;

	if (rt->rt6i_flags & RTF_NONEXTHOP ||
	    !(rt->rt6i_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		/* with router preferences, anything short of outright
		 * failure still counts as usable; failed entries are
		 * candidates for probing
		 */
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		/* no neighbour entry yet: optimistic with router
		 * preferences, otherwise fall back to round-robin
		 */
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}
639 
640 static int rt6_score_route(struct rt6_info *rt, int oif,
641 			   int strict)
642 {
643 	int m;
644 
645 	m = rt6_check_dev(rt, oif);
646 	if (!m && (strict & RT6_LOOKUP_F_IFACE))
647 		return RT6_NUD_FAIL_HARD;
648 #ifdef CONFIG_IPV6_ROUTER_PREF
649 	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
650 #endif
651 	if (strict & RT6_LOOKUP_F_REACHABLE) {
652 		int n = rt6_check_neigh(rt);
653 		if (n < 0)
654 			return n;
655 	}
656 	return m;
657 }
658 
659 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
660 				   int *mpri, struct rt6_info *match,
661 				   bool *do_rr)
662 {
663 	int m;
664 	bool match_do_rr = false;
665 
666 	if (rt6_check_expired(rt))
667 		goto out;
668 
669 	m = rt6_score_route(rt, oif, strict);
670 	if (m == RT6_NUD_FAIL_DO_RR) {
671 		match_do_rr = true;
672 		m = 0; /* lowest valid score */
673 	} else if (m == RT6_NUD_FAIL_HARD) {
674 		goto out;
675 	}
676 
677 	if (strict & RT6_LOOKUP_F_REACHABLE)
678 		rt6_probe(rt);
679 
680 	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
681 	if (m > *mpri) {
682 		*do_rr = match_do_rr;
683 		*mpri = m;
684 		match = rt;
685 	}
686 out:
687 	return match;
688 }
689 
/* Scan a fib6 leaf chain for the best route with the given @metric,
 * starting the round-robin walk at @rr_head and wrapping to the head
 * of the leaf.  If nothing at @metric is usable, fall through to the
 * remaining (higher-metric) entries saved in @cont.
 */
static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
				     struct rt6_info *rr_head,
				     u32 metric, int oif, int strict,
				     bool *do_rr)
{
	struct rt6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	/* pass 1: from the round-robin pointer to the end */
	for (rt = rr_head; rt; rt = rt->dst.rt6_next) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	/* pass 2: wrap around from the leaf head up to rr_head */
	for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	/* pass 3: nothing at @metric matched; consider the rest */
	for (rt = cont; rt; rt = rt->dst.rt6_next)
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}
726 
/* Select the best route in node @fn for @oif, rotating the node's
 * round-robin pointer when the chosen route requested it.  Falls back
 * to the null entry when nothing matches.
 */
static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
{
	struct rt6_info *match, *rt0;
	struct net *net;
	bool do_rr = false;

	rt0 = fn->rr_ptr;
	if (!rt0)
		fn->rr_ptr = rt0 = fn->leaf;

	match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct rt6_info *next = rt0->dst.rt6_next;

		/* no entries matched; do round-robin */
		if (!next || next->rt6i_metric != rt0->rt6i_metric)
			next = fn->leaf;

		if (next != rt0)
			fn->rr_ptr = next;
	}

	net = dev_net(rt0->dst.dev);
	return match ? match : net->ipv6.ip6_null_entry;
}
754 
755 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
756 {
757 	return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
758 }
759 
760 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Process an RFC 4191 Route Information Option received in a Router
 * Advertisement on @dev from @gwaddr: validate it, then add, refresh
 * or delete the corresponding RTF_ROUTEINFO route.  Returns 0 on
 * success or -EINVAL for a malformed option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length: the option length (in
	 * units of 8 octets) must be large enough to hold the prefix.
	 */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	/* prefix length 0 advertises a default route */
	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev->ifindex);

	/* zero lifetime withdraws an existing route */
	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
					pref);
	else if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			rt6_clean_expires(rt);
		else
			rt6_set_expires(rt, jiffies + HZ * lifetime);

		ip6_rt_put(rt);
	}
	return 0;
}
834 #endif
835 
836 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
837 					struct in6_addr *saddr)
838 {
839 	struct fib6_node *pn;
840 	while (1) {
841 		if (fn->fn_flags & RTN_TL_ROOT)
842 			return NULL;
843 		pn = fn->parent;
844 		if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)
845 			fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr);
846 		else
847 			fn = pn;
848 		if (fn->fn_flags & RTN_RTINFO)
849 			return fn;
850 	}
851 }
852 
/* Simple (non-caching) policy lookup: find the best matching route in
 * @table, backtracking toward the root while only the null entry
 * matches.  The returned route's use count is bumped via dst_use().
 */
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6, int flags)
{
	struct fib6_node *fn;
	struct rt6_info *rt;

	read_lock_bh(&table->tb6_lock);
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	rt = fn->leaf;
	rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
	/* spread flows over ECMP siblings only when oif is unconstrained */
	if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
		rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}
	dst_use(&rt->dst, jiffies);
	read_unlock_bh(&table->tb6_lock);
	return rt;

}
877 
/* Public entry point for a policy-rule based route lookup using the
 * simple (non-caching) lookup above.
 */
struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				    int flags)
{
	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);
884 
885 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
886 			    const struct in6_addr *saddr, int oif, int strict)
887 {
888 	struct flowi6 fl6 = {
889 		.flowi6_oif = oif,
890 		.daddr = *daddr,
891 	};
892 	struct dst_entry *dst;
893 	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
894 
895 	if (saddr) {
896 		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
897 		flags |= RT6_LOOKUP_F_HAS_SADDR;
898 	}
899 
900 	dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
901 	if (dst->error == 0)
902 		return (struct rt6_info *) dst;
903 
904 	dst_release(dst);
905 
906 	return NULL;
907 }
908 EXPORT_SYMBOL(rt6_lookup);
909 
/* ip6_ins_rt is called with table->tb6_lock NOT held.
 * It takes a new route entry; if the addition fails for any reason the
 * route is freed.  In any case, if the caller does not hold its own
 * reference, the route may be destroyed on return.
 */

static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
			struct mx6_config *mxc)
{
	int err;
	struct fib6_table *table;

	table = rt->rt6i_table;
	write_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, mxc);
	write_unlock_bh(&table->tb6_lock);

	return err;
}
929 
/* Insert @rt into its table with default netlink info and no extra
 * metrics.  See the locking/ownership note on __ip6_ins_rt().
 */
int ip6_ins_rt(struct rt6_info *rt)
{
	struct nl_info info = {	.nl_net = dev_net(rt->dst.dev), };
	struct mx6_config mxc = { .mx = NULL, };

	return __ip6_ins_rt(rt, &info, &mxc);
}
937 
/* Create an RTF_CACHE host-route clone of @ort for destination @daddr
 * (and, with subtrees, source @saddr).  Clones of clones are based on
 * the original route (dst.from).  Returns NULL on allocation failure.
 */
static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
		ort = (struct rt6_info *)ort->dst.from;

	rt = __ip6_dst_alloc(dev_net(ort->dst.dev), ort->dst.dev, 0);

	if (!rt)
		return NULL;

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->rt6i_metric = 0;
	rt->dst.flags |= DST_HOST;
	/* the clone is always a /128 host route for @daddr */
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		/* on-link non-host route whose own address is the
		 * destination: treat as anycast
		 */
		if (ort->rt6i_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}
977 
978 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
979 {
980 	struct rt6_info *pcpu_rt;
981 
982 	pcpu_rt = __ip6_dst_alloc(dev_net(rt->dst.dev),
983 				  rt->dst.dev, rt->dst.flags);
984 
985 	if (!pcpu_rt)
986 		return NULL;
987 	ip6_rt_copy_init(pcpu_rt, rt);
988 	pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
989 	pcpu_rt->rt6i_flags |= RTF_PCPU;
990 	return pcpu_rt;
991 }
992 
/* It should be called with read_lock_bh(&tb6_lock) acquired */
/* Return this cpu's cached clone of @rt with a reference taken, or
 * NULL if no clone has been created yet.
 */
static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

	if (pcpu_rt) {
		dst_hold(&pcpu_rt->dst);
		rt6_dst_from_metrics_check(pcpu_rt);
	}
	return pcpu_rt;
}
1007 
/* Create and publish a per-cpu clone of @rt, racing against other
 * contexts via cmpxchg.  Called WITHOUT tb6_lock (it takes the read
 * lock itself).  Always returns a route with a reference held; falls
 * back to the null entry on allocation failure, or to @rt itself if
 * @rt was removed from the tree in the meantime.
 */
static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
{
	struct fib6_table *table = rt->rt6i_table;
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		struct net *net = dev_net(rt->dst.dev);

		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	read_lock_bh(&table->tb6_lock);
	if (rt->rt6i_pcpu) {
		p = this_cpu_ptr(rt->rt6i_pcpu);
		prev = cmpxchg(p, NULL, pcpu_rt);
		if (prev) {
			/* If someone did it before us, return prev instead */
			dst_destroy(&pcpu_rt->dst);
			pcpu_rt = prev;
		}
	} else {
		/* rt has been removed from the fib6 tree
		 * before we have a chance to acquire the read_lock.
		 * In this case, don't bother to create a pcpu rt
		 * since rt is going away anyway.  The next
		 * dst_check() will trigger a re-lookup.
		 */
		dst_destroy(&pcpu_rt->dst);
		pcpu_rt = rt;
	}
	dst_hold(&pcpu_rt->dst);
	rt6_dst_from_metrics_check(pcpu_rt);
	read_unlock_bh(&table->tb6_lock);
	return pcpu_rt;
}
1045 
/* Full policy route lookup for input/output paths.  Selects the best
 * route (preferring reachable routers when forwarding is off), then
 * returns one of:
 *  - the fib entry itself (null entry or an RTF_CACHE clone),
 *  - a new uncached RTF_CACHE clone (FLOWI_FLAG_KNOWN_NH, no gateway),
 *  - this cpu's RTF_PCPU clone (common case).
 * Always returns a route with a reference held.
 */
static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
				      struct flowi6 *fl6, int flags)
{
	struct fib6_node *fn, *saved_fn;
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	/* hosts (not routers) prefer reachable default routers */
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	read_lock_bh(&table->tb6_lock);

	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

redo_rt6_select:
	rt = rt6_select(fn, oif, strict);
	if (rt->rt6i_nsiblings)
		rt = rt6_multipath_select(rt, fl6, oif, strict);
	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}


	if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
		dst_use(&rt->dst, jiffies);
		read_unlock_bh(&table->tb6_lock);

		rt6_dst_from_metrics_check(rt);
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(rt->rt6i_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */

		struct rt6_info *uncached_rt;

		dst_use(&rt->dst, jiffies);
		read_unlock_bh(&table->tb6_lock);

		uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
		dst_release(&rt->dst);

		if (uncached_rt)
			rt6_uncached_list_add(uncached_rt);
		else
			uncached_rt = net->ipv6.ip6_null_entry;

		dst_hold(&uncached_rt->dst);
		return uncached_rt;

	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		rt->dst.lastuse = jiffies;
		rt->dst.__use++;
		pcpu_rt = rt6_get_pcpu_route(rt);

		if (pcpu_rt) {
			read_unlock_bh(&table->tb6_lock);
		} else {
			/* We have to do the read_unlock first
			 * because rt6_make_pcpu_route() may trigger
			 * ip6_dst_gc() which will take the write_lock.
			 */
			dst_hold(&rt->dst);
			read_unlock_bh(&table->tb6_lock);
			pcpu_rt = rt6_make_pcpu_route(rt);
			dst_release(&rt->dst);
		}

		return pcpu_rt;

	}
}
1135 
/* Input-path wrapper: constrain the lookup by the incoming interface. */
static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
					    struct flowi6 *fl6, int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
}
1141 
1142 static struct dst_entry *ip6_route_input_lookup(struct net *net,
1143 						struct net_device *dev,
1144 						struct flowi6 *fl6, int flags)
1145 {
1146 	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1147 		flags |= RT6_LOOKUP_F_IFACE;
1148 
1149 	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
1150 }
1151 
1152 void ip6_route_input(struct sk_buff *skb)
1153 {
1154 	const struct ipv6hdr *iph = ipv6_hdr(skb);
1155 	struct net *net = dev_net(skb->dev);
1156 	int flags = RT6_LOOKUP_F_HAS_SADDR;
1157 	struct flowi6 fl6 = {
1158 		.flowi6_iif = skb->dev->ifindex,
1159 		.daddr = iph->daddr,
1160 		.saddr = iph->saddr,
1161 		.flowlabel = ip6_flowinfo(iph),
1162 		.flowi6_mark = skb->mark,
1163 		.flowi6_proto = iph->nexthdr,
1164 	};
1165 
1166 	skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
1167 }
1168 
1169 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
1170 					     struct flowi6 *fl6, int flags)
1171 {
1172 	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
1173 }
1174 
1175 struct dst_entry *ip6_route_output(struct net *net, const struct sock *sk,
1176 				    struct flowi6 *fl6)
1177 {
1178 	int flags = 0;
1179 
1180 	fl6->flowi6_iif = LOOPBACK_IFINDEX;
1181 
1182 	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
1183 		flags |= RT6_LOOKUP_F_IFACE;
1184 
1185 	if (!ipv6_addr_any(&fl6->saddr))
1186 		flags |= RT6_LOOKUP_F_HAS_SADDR;
1187 	else if (sk)
1188 		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1189 
1190 	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
1191 }
1192 EXPORT_SYMBOL(ip6_route_output);
1193 
/* Clone @dst_orig into a "blackhole" route that silently discards all
 * traffic (used e.g. while an xfrm bundle is being resolved).  Consumes
 * the caller's reference on @dst_orig and returns a new referenced dst,
 * or ERR_PTR(-ENOMEM).
 */
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, DST_OBSOLETE_NONE, 0);
	if (rt) {
		new = &rt->dst;

		/* Zero the rt6_info tail beyond the embedded dst_entry
		 * before selectively copying fields from the original.
		 */
		memset(new + 1, 0, sizeof(*rt) - sizeof(*new));

		new->__use = 1;
		/* Blackhole: both directions drop packets. */
		new->input = dst_discard;
		new->output = dst_discard_sk;

		/* Share read-only metrics by pointer; otherwise copy. */
		if (dst_metrics_read_only(&ort->dst))
			new->_metrics = ort->dst._metrics;
		else
			dst_copy_metrics(new, &ort->dst);
		rt->rt6i_idev = ort->rt6i_idev;
		if (rt->rt6i_idev)
			in6_dev_hold(rt->rt6i_idev);

		rt->rt6i_gateway = ort->rt6i_gateway;
		rt->rt6i_flags = ort->rt6i_flags;
		rt->rt6i_metric = 0;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif

		/* NOTE(review): dst_free() on an entry allocated with an
		 * initial reference appears to schedule destruction once
		 * the caller drops that reference — confirm against
		 * dst_alloc()/dst_free() semantics.
		 */
		dst_free(new);
	}

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}
1232 
1233 /*
1234  *	Destination cache support functions
1235  */
1236 
1237 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1238 {
1239 	if (rt->dst.from &&
1240 	    dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
1241 		dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
1242 }
1243 
1244 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1245 {
1246 	if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie))
1247 		return NULL;
1248 
1249 	if (rt6_check_expired(rt))
1250 		return NULL;
1251 
1252 	return &rt->dst;
1253 }
1254 
1255 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1256 {
1257 	if (rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1258 	    rt6_check((struct rt6_info *)(rt->dst.from), cookie))
1259 		return &rt->dst;
1260 	else
1261 		return NULL;
1262 }
1263 
1264 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1265 {
1266 	struct rt6_info *rt;
1267 
1268 	rt = (struct rt6_info *) dst;
1269 
1270 	/* All IPV6 dsts are created with ->obsolete set to the value
1271 	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1272 	 * into this function always.
1273 	 */
1274 
1275 	rt6_dst_from_metrics_check(rt);
1276 
1277 	if ((rt->rt6i_flags & RTF_PCPU) || unlikely(dst->flags & DST_NOCACHE))
1278 		return rt6_dst_from_check(rt, cookie);
1279 	else
1280 		return rt6_check(rt, cookie);
1281 }
1282 
/* dst_ops->negative_advice: a socket reports that its cached route seems
 * bad.  Returning NULL makes the caller drop the cached entry and do a
 * fresh lookup.
 */
static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *) dst;

	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			/* Cached clone: remove it from the tree only once
			 * it has actually expired (ip6_del_rt also drops
			 * the caller's reference via ip6_rt_put).
			 */
			if (rt6_check_expired(rt)) {
				ip6_del_rt(rt);
				dst = NULL;
			}
		} else {
			/* Non-cached route: just drop the caller's
			 * reference so the next lookup starts fresh.
			 */
			dst_release(dst);
			dst = NULL;
		}
	}
	return dst;
}
1300 
/* dst_ops->link_failure: output on the skb's route failed.  Report the
 * failure to the sender and invalidate the route so it is not reused.
 */
static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			/* Take a reference for the deletion; if ip6_del_rt()
			 * fails (route was not in the tree) release the
			 * entry with dst_free() instead.
			 */
			dst_hold(&rt->dst);
			if (ip6_del_rt(rt))
				dst_free(&rt->dst);
		} else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) {
			/* Bump the subtree out of date so sockets caching a
			 * default route revalidate and pick another router.
			 */
			rt->rt6i_node->fn_sernum = -1;
		}
	}
}
1318 
1319 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
1320 {
1321 	struct net *net = dev_net(rt->dst.dev);
1322 
1323 	rt->rt6i_flags |= RTF_MODIFIED;
1324 	rt->rt6i_pmtu = mtu;
1325 	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
1326 }
1327 
/* Core PMTU update.  @iph is the inner header of the packet that
 * triggered the update (may be NULL, in which case addresses are taken
 * from @sk).  Only applies reductions below the current path MTU.
 */
static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu)
{
	struct rt6_info *rt6 = (struct rt6_info *)dst;

	/* Local routes never need PMTU state. */
	if (rt6->rt6i_flags & RTF_LOCAL)
		return;

	dst_confirm(dst);
	/* Clamp to the protocol minimum; ignore non-shrinking updates. */
	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
	if (mtu >= dst_mtu(dst))
		return;

	if (rt6->rt6i_flags & RTF_CACHE) {
		/* Already a private clone: update it in place. */
		rt6_do_update_pmtu(rt6, mtu);
	} else {
		const struct in6_addr *daddr, *saddr;
		struct rt6_info *nrt6;

		/* Need addresses to key a new cached clone. */
		if (iph) {
			daddr = &iph->daddr;
			saddr = &iph->saddr;
		} else if (sk) {
			daddr = &sk->sk_v6_daddr;
			saddr = &inet6_sk(sk)->saddr;
		} else {
			return;
		}
		nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
		if (nrt6) {
			rt6_do_update_pmtu(nrt6, mtu);

			/* ip6_ins_rt(nrt6) will bump the
			 * rt6->rt6i_node->fn_sernum
			 * which will fail the next rt6_check() and
			 * invalidate the sk->sk_dst_cache.
			 */
			ip6_ins_rt(nrt6);
		}
	}
}
1369 
1370 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1371 			       struct sk_buff *skb, u32 mtu)
1372 {
1373 	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
1374 }
1375 
1376 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
1377 		     int oif, u32 mark)
1378 {
1379 	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1380 	struct dst_entry *dst;
1381 	struct flowi6 fl6;
1382 
1383 	memset(&fl6, 0, sizeof(fl6));
1384 	fl6.flowi6_oif = oif;
1385 	fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
1386 	fl6.daddr = iph->daddr;
1387 	fl6.saddr = iph->saddr;
1388 	fl6.flowlabel = ip6_flowinfo(iph);
1389 
1390 	dst = ip6_route_output(net, NULL, &fl6);
1391 	if (!dst->error)
1392 		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
1393 	dst_release(dst);
1394 }
1395 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
1396 
1397 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
1398 {
1399 	ip6_update_pmtu(skb, sock_net(sk), mtu,
1400 			sk->sk_bound_dev_if, sk->sk_mark);
1401 }
1402 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
1403 
/* Handle redirects */

/* Carrier for passing the redirecting router's address through
 * fib6_rule_lookup(): the flowi6 pointer handed to the lookup callback
 * is cast back to this wrapper to recover ->gateway.
 */
struct ip6rd_flowi {
	struct flowi6 fl6;
	struct in6_addr gateway;
};
1409 
/* Find the route a received redirect applies to.  Returns a held route,
 * or the (held) null entry when no route matches the redirecting
 * gateway.
 */
static struct rt6_info *__ip6_route_redirect(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     int flags)
{
	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
	struct rt6_info *rt;
	struct fib6_node *fn;

	/* Get the "current" route for this destination and
	 * check if the redirect has come from approriate router.
	 *
	 * RFC 4861 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routes.
	 */

	read_lock_bh(&table->tb6_lock);
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
		/* Only live gateway routes out of the receiving interface
		 * whose gateway is the redirect's source qualify.
		 */
		if (rt6_check_expired(rt))
			continue;
		if (rt->dst.error)
			break;
		if (!(rt->rt6i_flags & RTF_GATEWAY))
			continue;
		if (fl6->flowi6_oif != rt->dst.dev->ifindex)
			continue;
		if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
			continue;
		break;
	}

	if (!rt)
		rt = net->ipv6.ip6_null_entry;
	else if (rt->dst.error) {
		rt = net->ipv6.ip6_null_entry;
		goto out;
	}

	/* Nothing matched here: retry in less-specific fib nodes. */
	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

out:
	/* Hold the result before dropping the table lock. */
	dst_hold(&rt->dst);

	read_unlock_bh(&table->tb6_lock);

	return rt;
};
1466 
1467 static struct dst_entry *ip6_route_redirect(struct net *net,
1468 					const struct flowi6 *fl6,
1469 					const struct in6_addr *gateway)
1470 {
1471 	int flags = RT6_LOOKUP_F_HAS_SADDR;
1472 	struct ip6rd_flowi rdfl;
1473 
1474 	rdfl.fl6 = *fl6;
1475 	rdfl.gateway = *gateway;
1476 
1477 	return fib6_rule_lookup(net, &rdfl.fl6,
1478 				flags, __ip6_route_redirect);
1479 }
1480 
1481 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark)
1482 {
1483 	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1484 	struct dst_entry *dst;
1485 	struct flowi6 fl6;
1486 
1487 	memset(&fl6, 0, sizeof(fl6));
1488 	fl6.flowi6_iif = LOOPBACK_IFINDEX;
1489 	fl6.flowi6_oif = oif;
1490 	fl6.flowi6_mark = mark;
1491 	fl6.daddr = iph->daddr;
1492 	fl6.saddr = iph->saddr;
1493 	fl6.flowlabel = ip6_flowinfo(iph);
1494 
1495 	dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
1496 	rt6_do_redirect(dst, NULL, skb);
1497 	dst_release(dst);
1498 }
1499 EXPORT_SYMBOL_GPL(ip6_redirect);
1500 
1501 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
1502 			    u32 mark)
1503 {
1504 	const struct ipv6hdr *iph = ipv6_hdr(skb);
1505 	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
1506 	struct dst_entry *dst;
1507 	struct flowi6 fl6;
1508 
1509 	memset(&fl6, 0, sizeof(fl6));
1510 	fl6.flowi6_iif = LOOPBACK_IFINDEX;
1511 	fl6.flowi6_oif = oif;
1512 	fl6.flowi6_mark = mark;
1513 	fl6.daddr = msg->dest;
1514 	fl6.saddr = iph->daddr;
1515 
1516 	dst = ip6_route_redirect(net, &fl6, &iph->saddr);
1517 	rt6_do_redirect(dst, NULL, skb);
1518 	dst_release(dst);
1519 }
1520 
1521 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
1522 {
1523 	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark);
1524 }
1525 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
1526 
1527 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1528 {
1529 	struct net_device *dev = dst->dev;
1530 	unsigned int mtu = dst_mtu(dst);
1531 	struct net *net = dev_net(dev);
1532 
1533 	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1534 
1535 	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1536 		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1537 
1538 	/*
1539 	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1540 	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1541 	 * IPV6_MAXPLEN is also valid and means: "any MSS,
1542 	 * rely only on pmtu discovery"
1543 	 */
1544 	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1545 		mtu = IPV6_MAXPLEN;
1546 	return mtu;
1547 }
1548 
1549 static unsigned int ip6_mtu(const struct dst_entry *dst)
1550 {
1551 	const struct rt6_info *rt = (const struct rt6_info *)dst;
1552 	unsigned int mtu = rt->rt6i_pmtu;
1553 	struct inet6_dev *idev;
1554 
1555 	if (mtu)
1556 		goto out;
1557 
1558 	mtu = dst_metric_raw(dst, RTAX_MTU);
1559 	if (mtu)
1560 		goto out;
1561 
1562 	mtu = IPV6_MIN_MTU;
1563 
1564 	rcu_read_lock();
1565 	idev = __in6_dev_get(dst->dev);
1566 	if (idev)
1567 		mtu = idev->cnf.mtu6;
1568 	rcu_read_unlock();
1569 
1570 out:
1571 	return min_t(unsigned int, mtu, IP6_MAX_MTU);
1572 }
1573 
/* Singly linked list of dst entries handed out by icmp6_dst_alloc(),
 * reaped by icmp6_dst_gc()/icmp6_clean_all().  Guarded by
 * icmp6_dst_lock.
 */
static struct dst_entry *icmp6_dst_gc_list;
static DEFINE_SPINLOCK(icmp6_dst_lock);
1576 
/* Allocate a host route for a locally generated ICMPv6 packet.  The
 * route is not inserted into the fib; instead it is chained onto
 * icmp6_dst_gc_list so it can be reclaimed once its reference count
 * drops.  Returns the (possibly xfrm-wrapped) dst or an ERR_PTR.
 */
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
				  struct flowi6 *fl6)
{
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct inet6_dev *idev = in6_dev_get(dev);
	struct net *net = dev_net(dev);

	if (unlikely(!idev))
		return ERR_PTR(-ENODEV);

	rt = ip6_dst_alloc(net, dev, 0);
	if (unlikely(!rt)) {
		in6_dev_put(idev);
		dst = ERR_PTR(-ENOMEM);
		goto out;
	}

	rt->dst.flags |= DST_HOST;
	rt->dst.output  = ip6_output;
	/* Start with the caller's reference. */
	atomic_set(&rt->dst.__refcnt, 1);
	rt->rt6i_gateway  = fl6->daddr;
	rt->rt6i_dst.addr = fl6->daddr;
	rt->rt6i_dst.plen = 128;
	rt->rt6i_idev     = idev;
	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);

	/* Chain onto the gc list instead of inserting into the fib. */
	spin_lock_bh(&icmp6_dst_lock);
	rt->dst.next = icmp6_dst_gc_list;
	icmp6_dst_gc_list = &rt->dst;
	spin_unlock_bh(&icmp6_dst_lock);

	/* Make sure the fib gc timer runs so the entry is reclaimed. */
	fib6_force_start_gc(net);

	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);

out:
	return dst;
}
1616 
/* Reap unused entries from icmp6_dst_gc_list.  Returns non-zero when
 * entries remain, i.e. another gc pass will be needed.
 */
int icmp6_dst_gc(void)
{
	struct dst_entry *dst, **pprev;
	int more = 0;

	spin_lock_bh(&icmp6_dst_lock);
	pprev = &icmp6_dst_gc_list;

	/* Walk the singly linked list via a pointer-to-pointer so nodes
	 * can be unlinked in place; free unreferenced entries, count the
	 * survivors.
	 */
	while ((dst = *pprev) != NULL) {
		if (!atomic_read(&dst->__refcnt)) {
			*pprev = dst->next;
			dst_free(dst);
		} else {
			pprev = &dst->next;
			++more;
		}
	}

	spin_unlock_bh(&icmp6_dst_lock);

	return more;
}
1639 
/* Remove every entry on icmp6_dst_gc_list for which @func returns
 * non-zero (e.g. entries referencing a device going away).
 */
static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
			    void *arg)
{
	struct dst_entry *dst, **pprev;

	spin_lock_bh(&icmp6_dst_lock);
	pprev = &icmp6_dst_gc_list;
	while ((dst = *pprev) != NULL) {
		struct rt6_info *rt = (struct rt6_info *) dst;
		if (func(rt, arg)) {
			/* Unlink in place and free. */
			*pprev = dst->next;
			dst_free(dst);
		} else {
			pprev = &dst->next;
		}
	}
	spin_unlock_bh(&icmp6_dst_lock);
}
1658 
/* dst_ops->gc: called when the dst entry count crosses gc_thresh.
 * Returns non-zero (allocation should fail) while the table still holds
 * more than rt_max_size entries.
 */
static int ip6_dst_gc(struct dst_ops *ops)
{
	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
	int entries;

	entries = dst_entries_get_fast(ops);
	/* Rate-limit gc unless we are over the hard size limit. */
	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
	    entries <= rt_max_size)
		goto out;

	/* Each consecutive pass becomes more aggressive (larger expire
	 * argument); success below gc_thresh resets the aggressiveness.
	 */
	net->ipv6.ip6_rt_gc_expire++;
	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
	entries = dst_entries_get_slow(ops);
	if (entries < ops->gc_thresh)
		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
out:
	/* Exponentially decay the aggressiveness between invocations. */
	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
	return entries > rt_max_size;
}
1683 
1684 static int ip6_convert_metrics(struct mx6_config *mxc,
1685 			       const struct fib6_config *cfg)
1686 {
1687 	struct nlattr *nla;
1688 	int remaining;
1689 	u32 *mp;
1690 
1691 	if (!cfg->fc_mx)
1692 		return 0;
1693 
1694 	mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1695 	if (unlikely(!mp))
1696 		return -ENOMEM;
1697 
1698 	nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1699 		int type = nla_type(nla);
1700 
1701 		if (type) {
1702 			u32 val;
1703 
1704 			if (unlikely(type > RTAX_MAX))
1705 				goto err;
1706 			if (type == RTAX_CC_ALGO) {
1707 				char tmp[TCP_CA_NAME_MAX];
1708 
1709 				nla_strlcpy(tmp, nla, sizeof(tmp));
1710 				val = tcp_ca_get_key_by_name(tmp);
1711 				if (val == TCP_CA_UNSPEC)
1712 					goto err;
1713 			} else {
1714 				val = nla_get_u32(nla);
1715 			}
1716 
1717 			mp[type - 1] = val;
1718 			__set_bit(type - 1, mxc->mx_valid);
1719 		}
1720 	}
1721 
1722 	mxc->mx = mp;
1723 
1724 	return 0;
1725  err:
1726 	kfree(mp);
1727 	return -EINVAL;
1728 }
1729 
/* Create and insert a route described by @cfg (from netlink or ioctl).
 * Returns 0 on success or a negative errno; on failure all acquired
 * device/idev references and the allocated route are released.
 */
int ip6_route_add(struct fib6_config *cfg)
{
	int err;
	struct net *net = cfg->fc_nlinfo.nl_net;
	struct rt6_info *rt = NULL;
	struct net_device *dev = NULL;
	struct inet6_dev *idev = NULL;
	struct fib6_table *table;
	struct mx6_config mxc = { .mx = NULL, };
	int addr_type;

	/* Prefix lengths are at most 128 bits. */
	if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
		return -EINVAL;
#ifndef CONFIG_IPV6_SUBTREES
	if (cfg->fc_src_len)
		return -EINVAL;
#endif
	/* Resolve the nexthop device, taking dev and idev references
	 * that are either installed into the route or dropped at out:.
	 */
	if (cfg->fc_ifindex) {
		err = -ENODEV;
		dev = dev_get_by_index(net, cfg->fc_ifindex);
		if (!dev)
			goto out;
		idev = in6_dev_get(dev);
		if (!idev)
			goto out;
	}

	if (cfg->fc_metric == 0)
		cfg->fc_metric = IP6_RT_PRIO_USER;

	err = -ENOBUFS;
	/* Without NLM_F_CREATE only an existing table should be used,
	 * but fall back to creating one for compatibility.
	 */
	if (cfg->fc_nlinfo.nlh &&
	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
		table = fib6_get_table(net, cfg->fc_table);
		if (!table) {
			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
			table = fib6_new_table(net, cfg->fc_table);
		}
	} else {
		table = fib6_new_table(net, cfg->fc_table);
	}

	if (!table)
		goto out;

	rt = ip6_dst_alloc(net, NULL,
			   (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);

	if (!rt) {
		err = -ENOMEM;
		goto out;
	}

	if (cfg->fc_flags & RTF_EXPIRES)
		rt6_set_expires(rt, jiffies +
				clock_t_to_jiffies(cfg->fc_expires));
	else
		rt6_clean_expires(rt);

	if (cfg->fc_protocol == RTPROT_UNSPEC)
		cfg->fc_protocol = RTPROT_BOOT;
	rt->rt6i_protocol = cfg->fc_protocol;

	addr_type = ipv6_addr_type(&cfg->fc_dst);

	/* Choose the input handler by destination class. */
	if (addr_type & IPV6_ADDR_MULTICAST)
		rt->dst.input = ip6_mc_input;
	else if (cfg->fc_flags & RTF_LOCAL)
		rt->dst.input = ip6_input;
	else
		rt->dst.input = ip6_forward;

	rt->dst.output = ip6_output;

	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
	rt->rt6i_dst.plen = cfg->fc_dst_len;
	if (rt->rt6i_dst.plen == 128)
		rt->dst.flags |= DST_HOST;

#ifdef CONFIG_IPV6_SUBTREES
	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
	rt->rt6i_src.plen = cfg->fc_src_len;
#endif

	rt->rt6i_metric = cfg->fc_metric;

	/* We cannot add true routes via loopback here,
	   they would result in kernel looping; promote them to reject routes
	 */
	if ((cfg->fc_flags & RTF_REJECT) ||
	    (dev && (dev->flags & IFF_LOOPBACK) &&
	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
	     !(cfg->fc_flags & RTF_LOCAL))) {
		/* hold loopback dev/idev if we haven't done so. */
		if (dev != net->loopback_dev) {
			if (dev) {
				dev_put(dev);
				in6_dev_put(idev);
			}
			dev = net->loopback_dev;
			dev_hold(dev);
			idev = in6_dev_get(dev);
			if (!idev) {
				err = -ENODEV;
				goto out;
			}
		}
		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
		/* Pick the error code and packet handlers matching the
		 * requested reject semantics.
		 */
		switch (cfg->fc_type) {
		case RTN_BLACKHOLE:
			rt->dst.error = -EINVAL;
			rt->dst.output = dst_discard_sk;
			rt->dst.input = dst_discard;
			break;
		case RTN_PROHIBIT:
			rt->dst.error = -EACCES;
			rt->dst.output = ip6_pkt_prohibit_out;
			rt->dst.input = ip6_pkt_prohibit;
			break;
		case RTN_THROW:
		default:
			rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
					: -ENETUNREACH;
			rt->dst.output = ip6_pkt_discard_out;
			rt->dst.input = ip6_pkt_discard;
			break;
		}
		goto install_route;
	}

	if (cfg->fc_flags & RTF_GATEWAY) {
		const struct in6_addr *gw_addr;
		int gwa_type;

		gw_addr = &cfg->fc_gateway;
		gwa_type = ipv6_addr_type(gw_addr);

		/* if gw_addr is local we will fail to detect this in case
		 * address is still TENTATIVE (DAD in progress). rt6_lookup()
		 * will return already-added prefix route via interface that
		 * prefix route was assigned to, which might be non-loopback.
		 */
		err = -EINVAL;
		if (ipv6_chk_addr_and_flags(net, gw_addr,
					    gwa_type & IPV6_ADDR_LINKLOCAL ?
					    dev : NULL, 0, 0))
			goto out;

		rt->rt6i_gateway = *gw_addr;

		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
			struct rt6_info *grt;

			/* IPv6 strictly inhibits using not link-local
			   addresses as nexthop address.
			   Otherwise, router will not able to send redirects.
			   It is very good, but in some (rare!) circumstances
			   (SIT, PtP, NBMA NOARP links) it is handy to allow
			   some exceptions. --ANK
			 */
			if (!(gwa_type & IPV6_ADDR_UNICAST))
				goto out;

			/* The gateway must itself be reachable via a
			 * non-gateway route (directly connected).
			 */
			grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);

			err = -EHOSTUNREACH;
			if (!grt)
				goto out;
			if (dev) {
				if (dev != grt->dst.dev) {
					ip6_rt_put(grt);
					goto out;
				}
			} else {
				/* Adopt the device from the gateway route. */
				dev = grt->dst.dev;
				idev = grt->rt6i_idev;
				dev_hold(dev);
				in6_dev_hold(grt->rt6i_idev);
			}
			if (!(grt->rt6i_flags & RTF_GATEWAY))
				err = 0;
			ip6_rt_put(grt);

			if (err)
				goto out;
		}
		err = -EINVAL;
		if (!dev || (dev->flags & IFF_LOOPBACK))
			goto out;
	}

	err = -ENODEV;
	if (!dev)
		goto out;

	/* A preferred source address must be configured on the device. */
	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
			err = -EINVAL;
			goto out;
		}
		rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
		rt->rt6i_prefsrc.plen = 128;
	} else
		rt->rt6i_prefsrc.plen = 0;

	rt->rt6i_flags = cfg->fc_flags;

install_route:
	/* The dev/idev references taken above now belong to the route. */
	rt->dst.dev = dev;
	rt->rt6i_idev = idev;
	rt->rt6i_table = table;

	cfg->fc_nlinfo.nl_net = dev_net(dev);

	err = ip6_convert_metrics(&mxc, cfg);
	if (err)
		goto out;

	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc);

	kfree(mxc.mx);
	return err;
out:
	if (dev)
		dev_put(dev);
	if (idev)
		in6_dev_put(idev);
	if (rt)
		dst_free(&rt->dst);
	return err;
}
1961 
/* Unlink @rt from its fib table, notifying via @info.  Consumes the
 * caller's reference on @rt (dropped via ip6_rt_put on every path).
 */
static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
{
	int err;
	struct fib6_table *table;
	struct net *net = dev_net(rt->dst.dev);

	/* The shared null entry is never deleted. */
	if (rt == net->ipv6.ip6_null_entry) {
		err = -ENOENT;
		goto out;
	}

	table = rt->rt6i_table;
	write_lock_bh(&table->tb6_lock);
	err = fib6_del(rt, info);
	write_unlock_bh(&table->tb6_lock);

out:
	ip6_rt_put(rt);
	return err;
}
1982 
1983 int ip6_del_rt(struct rt6_info *rt)
1984 {
1985 	struct nl_info info = {
1986 		.nl_net = dev_net(rt->dst.dev),
1987 	};
1988 	return __ip6_del_rt(rt, &info);
1989 }
1990 
/* Delete the first route matching @cfg (destination/source prefix,
 * optional device, gateway and metric).  Returns 0 on success or
 * -ESRCH when nothing matched.
 */
static int ip6_route_del(struct fib6_config *cfg)
{
	struct fib6_table *table;
	struct fib6_node *fn;
	struct rt6_info *rt;
	int err = -ESRCH;

	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
	if (!table)
		return err;

	read_lock_bh(&table->tb6_lock);

	fn = fib6_locate(&table->tb6_root,
			 &cfg->fc_dst, cfg->fc_dst_len,
			 &cfg->fc_src, cfg->fc_src_len);

	if (fn) {
		for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
			/* Cached clones are only deleted when explicitly
			 * requested via RTF_CACHE.
			 */
			if ((rt->rt6i_flags & RTF_CACHE) &&
			    !(cfg->fc_flags & RTF_CACHE))
				continue;
			if (cfg->fc_ifindex &&
			    (!rt->dst.dev ||
			     rt->dst.dev->ifindex != cfg->fc_ifindex))
				continue;
			if (cfg->fc_flags & RTF_GATEWAY &&
			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
				continue;
			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
				continue;
			/* Hold the route across the unlock; __ip6_del_rt
			 * consumes this reference.
			 */
			dst_hold(&rt->dst);
			read_unlock_bh(&table->tb6_lock);

			return __ip6_del_rt(rt, &cfg->fc_nlinfo);
		}
	}
	read_unlock_bh(&table->tb6_lock);

	return err;
}
2032 
/* Process a validated-by-route ICMPv6 redirect carried in @skb against
 * the route @dst: update the neighbour cache with the new first hop and
 * install an RTF_CACHE clone pointing at it (RFC 4861, section 8).
 */
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct net *net = dev_net(skb->dev);
	struct netevent_redirect netevent;
	struct rt6_info *rt, *nrt = NULL;
	struct ndisc_options ndopts;
	struct inet6_dev *in6_dev;
	struct neighbour *neigh;
	struct rd_msg *msg;
	int optlen, on_link;
	u8 *lladdr;

	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
	optlen -= sizeof(*msg);

	if (optlen < 0) {
		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
		return;
	}

	msg = (struct rd_msg *)icmp6_hdr(skb);

	if (ipv6_addr_is_multicast(&msg->dest)) {
		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
		return;
	}

	/* Target == destination means the destination is on-link;
	 * otherwise the target must be a link-local router address.
	 */
	on_link = 0;
	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
		on_link = 1;
	} else if (ipv6_addr_type(&msg->target) !=
		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
		return;
	}

	in6_dev = __in6_dev_get(skb->dev);
	if (!in6_dev)
		return;
	/* Routers ignore redirects; hosts may be configured to. */
	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
		return;

	/* RFC2461 8.1:
	 *	The IP source address of the Redirect MUST be the same as the current
	 *	first-hop router for the specified ICMP Destination Address.
	 */

	if (!ndisc_parse_options(msg->opt, optlen, &ndopts)) {
		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
		return;
	}

	lladdr = NULL;
	if (ndopts.nd_opts_tgt_lladdr) {
		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
					     skb->dev);
		if (!lladdr) {
			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
			return;
		}
	}

	rt = (struct rt6_info *) dst;
	if (rt == net->ipv6.ip6_null_entry) {
		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
		return;
	}

	/* Redirect received -> path was valid.
	 * Look, redirects are sent only in response to data packets,
	 * so that this nexthop apparently is reachable. --ANK
	 */
	dst_confirm(&rt->dst);

	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
	if (!neigh)
		return;

	/*
	 *	We have finally decided to accept it.
	 */

	neigh_update(neigh, lladdr, NUD_STALE,
		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
		     NEIGH_UPDATE_F_OVERRIDE|
		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
				     NEIGH_UPDATE_F_ISROUTER))
		     );

	/* Install a cached clone whose gateway is the redirect target. */
	nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
	if (!nrt)
		goto out;

	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
	if (on_link)
		nrt->rt6i_flags &= ~RTF_GATEWAY;

	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;

	if (ip6_ins_rt(nrt))
		goto out;

	netevent.old = &rt->dst;
	netevent.new = &nrt->dst;
	netevent.daddr = &msg->dest;
	netevent.neigh = neigh;
	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);

	/* The superseded cached route is removed; dst_clone provides the
	 * reference that ip6_del_rt consumes.
	 */
	if (rt->rt6i_flags & RTF_CACHE) {
		rt = (struct rt6_info *) dst_clone(&rt->dst);
		ip6_del_rt(rt);
	}

out:
	neigh_release(neigh);
}
2149 
2150 /*
2151  *	Misc support functions
2152  */
2153 
/* Link @rt to its parent route @from: hold a reference on the parent
 * and share its metrics.  @from must not itself be derived from another
 * route.
 */
static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
{
	BUG_ON(from->dst.from);

	/* Expiry is inherited through the parent, not kept locally. */
	rt->rt6i_flags &= ~RTF_EXPIRES;
	dst_hold(&from->dst);
	rt->dst.from = &from->dst;
	dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
}
2163 
/* Initialise the freshly allocated @rt as a copy of @ort, taking the
 * references the copy needs (idev, and the parent link via
 * rt6_set_from).
 */
static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
{
	rt->dst.input = ort->dst.input;
	rt->dst.output = ort->dst.output;
	rt->rt6i_dst = ort->rt6i_dst;
	rt->dst.error = ort->dst.error;
	rt->rt6i_idev = ort->rt6i_idev;
	if (rt->rt6i_idev)
		in6_dev_hold(rt->rt6i_idev);
	rt->dst.lastuse = jiffies;
	rt->rt6i_gateway = ort->rt6i_gateway;
	rt->rt6i_flags = ort->rt6i_flags;
	/* Ties the copy to @ort and shares its metrics. */
	rt6_set_from(rt, ort);
	rt->rt6i_metric = ort->rt6i_metric;
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->rt6i_src;
#endif
	rt->rt6i_prefsrc = ort->rt6i_prefsrc;
	rt->rt6i_table = ort->rt6i_table;
}
2184 
2185 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Look up an RA Route Information route for @prefix/@prefixlen via
 * @gwaddr on @ifindex.  Returns a held route or NULL.
 */
static struct rt6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr, int ifindex)
{
	struct fib6_node *fn;
	struct rt6_info *rt = NULL;
	struct fib6_table *table;

	table = fib6_get_table(net, RT6_TABLE_INFO);
	if (!table)
		return NULL;

	read_lock_bh(&table->tb6_lock);
	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0);
	if (!fn)
		goto out;

	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
		/* Match device, RTF_ROUTEINFO origin and gateway. */
		if (rt->dst.dev->ifindex != ifindex)
			continue;
		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
			continue;
		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
			continue;
		/* Take the reference before dropping the table lock. */
		dst_hold(&rt->dst);
		break;
	}
out:
	read_unlock_bh(&table->tb6_lock);
	return rt;
}
2217 
2218 static struct rt6_info *rt6_add_route_info(struct net *net,
2219 					   const struct in6_addr *prefix, int prefixlen,
2220 					   const struct in6_addr *gwaddr, int ifindex,
2221 					   unsigned int pref)
2222 {
2223 	struct fib6_config cfg = {
2224 		.fc_table	= RT6_TABLE_INFO,
2225 		.fc_metric	= IP6_RT_PRIO_USER,
2226 		.fc_ifindex	= ifindex,
2227 		.fc_dst_len	= prefixlen,
2228 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
2229 				  RTF_UP | RTF_PREF(pref),
2230 		.fc_nlinfo.portid = 0,
2231 		.fc_nlinfo.nlh = NULL,
2232 		.fc_nlinfo.nl_net = net,
2233 	};
2234 
2235 	cfg.fc_dst = *prefix;
2236 	cfg.fc_gateway = *gwaddr;
2237 
2238 	/* We should treat it as a default route if prefix length is 0. */
2239 	if (!prefixlen)
2240 		cfg.fc_flags |= RTF_DEFAULT;
2241 
2242 	ip6_route_add(&cfg);
2243 
2244 	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
2245 }
2246 #endif
2247 
/* Look up the RA-learned default route via router @addr on @dev.
 * Returns a held route or NULL.
 */
struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
{
	struct rt6_info *rt;
	struct fib6_table *table;

	table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
	if (!table)
		return NULL;

	read_lock_bh(&table->tb6_lock);
	/* Default routes all live under the root node's leaf chain. */
	for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
		if (dev == rt->dst.dev &&
		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
			break;
	}
	/* Take the reference before dropping the table lock. */
	if (rt)
		dst_hold(&rt->dst);
	read_unlock_bh(&table->tb6_lock);
	return rt;
}
2269 
2270 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
2271 				     struct net_device *dev,
2272 				     unsigned int pref)
2273 {
2274 	struct fib6_config cfg = {
2275 		.fc_table	= RT6_TABLE_DFLT,
2276 		.fc_metric	= IP6_RT_PRIO_USER,
2277 		.fc_ifindex	= dev->ifindex,
2278 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
2279 				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
2280 		.fc_nlinfo.portid = 0,
2281 		.fc_nlinfo.nlh = NULL,
2282 		.fc_nlinfo.nl_net = dev_net(dev),
2283 	};
2284 
2285 	cfg.fc_gateway = *gwaddr;
2286 
2287 	ip6_route_add(&cfg);
2288 
2289 	return rt6_get_dflt_router(gwaddr, dev);
2290 }
2291 
/* Delete all RA-learned default routes in @net, except on interfaces whose
 * accept_ra == 2 (accept RAs even when forwarding).
 */
void rt6_purge_dflt_routers(struct net *net)
{
	struct rt6_info *rt;
	struct fib6_table *table;

	/* NOTE: Keep consistent with rt6_get_dflt_router */
	table = fib6_get_table(net, RT6_TABLE_DFLT);
	if (!table)
		return;

restart:
	read_lock_bh(&table->tb6_lock);
	for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
		    (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
			/* Take a reference and drop the read lock before
			 * deleting (ip6_del_rt() presumably takes the table
			 * lock itself — see its definition).  The list may
			 * have changed meanwhile, so the walk restarts.
			 */
			dst_hold(&rt->dst);
			read_unlock_bh(&table->tb6_lock);
			ip6_del_rt(rt);
			goto restart;
		}
	}
	read_unlock_bh(&table->tb6_lock);
}
2315 
2316 static void rtmsg_to_fib6_config(struct net *net,
2317 				 struct in6_rtmsg *rtmsg,
2318 				 struct fib6_config *cfg)
2319 {
2320 	memset(cfg, 0, sizeof(*cfg));
2321 
2322 	cfg->fc_table = RT6_TABLE_MAIN;
2323 	cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
2324 	cfg->fc_metric = rtmsg->rtmsg_metric;
2325 	cfg->fc_expires = rtmsg->rtmsg_info;
2326 	cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
2327 	cfg->fc_src_len = rtmsg->rtmsg_src_len;
2328 	cfg->fc_flags = rtmsg->rtmsg_flags;
2329 
2330 	cfg->fc_nlinfo.nl_net = net;
2331 
2332 	cfg->fc_dst = rtmsg->rtmsg_dst;
2333 	cfg->fc_src = rtmsg->rtmsg_src;
2334 	cfg->fc_gateway = rtmsg->rtmsg_gateway;
2335 }
2336 
2337 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
2338 {
2339 	struct fib6_config cfg;
2340 	struct in6_rtmsg rtmsg;
2341 	int err;
2342 
2343 	switch (cmd) {
2344 	case SIOCADDRT:		/* Add a route */
2345 	case SIOCDELRT:		/* Delete a route */
2346 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
2347 			return -EPERM;
2348 		err = copy_from_user(&rtmsg, arg,
2349 				     sizeof(struct in6_rtmsg));
2350 		if (err)
2351 			return -EFAULT;
2352 
2353 		rtmsg_to_fib6_config(net, &rtmsg, &cfg);
2354 
2355 		rtnl_lock();
2356 		switch (cmd) {
2357 		case SIOCADDRT:
2358 			err = ip6_route_add(&cfg);
2359 			break;
2360 		case SIOCDELRT:
2361 			err = ip6_route_del(&cfg);
2362 			break;
2363 		default:
2364 			err = -EINVAL;
2365 		}
2366 		rtnl_unlock();
2367 
2368 		return err;
2369 	}
2370 
2371 	return -EINVAL;
2372 }
2373 
2374 /*
2375  *	Drop the packet on the floor
2376  */
2377 
/* Common drop handler for reject-type routes: bump the relevant SNMP
 * counter, send an ICMPv6 destination-unreachable with @code, free the skb.
 */
static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
{
	int type;
	struct dst_entry *dst = skb_dst(skb);
	switch (ipstats_mib_noroutes) {
	case IPSTATS_MIB_INNOROUTES:
		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
		if (type == IPV6_ADDR_ANY) {
			/* An unspecified destination is an address error,
			 * not a routing failure.
			 */
			IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
				      IPSTATS_MIB_INADDRERRORS);
			break;
		}
		/* FALLTHROUGH */
	case IPSTATS_MIB_OUTNOROUTES:
		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
			      ipstats_mib_noroutes);
		break;
	}
	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
	kfree_skb(skb);
	return 0;
}
2400 
/* dst input handler for blackhole/unreachable routes on the input path. */
static int ip6_pkt_discard(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}
2405 
/* dst output handler for blackhole/unreachable routes; sets skb->dev so
 * the drop accounting is attributed to the output device.
 */
static int ip6_pkt_discard_out(struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
}
2411 
/* dst input handler for prohibit routes: administratively prohibited. */
static int ip6_pkt_prohibit(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}
2416 
/* dst output handler for prohibit routes; skb->dev is set for accounting. */
static int ip6_pkt_prohibit_out(struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
}
2422 
2423 /*
2424  *	Allocate a dst for local (unicast / anycast) address.
2425  */
2426 
2427 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2428 				    const struct in6_addr *addr,
2429 				    bool anycast)
2430 {
2431 	struct net *net = dev_net(idev->dev);
2432 	struct rt6_info *rt = ip6_dst_alloc(net, net->loopback_dev,
2433 					    DST_NOCOUNT);
2434 	if (!rt)
2435 		return ERR_PTR(-ENOMEM);
2436 
2437 	in6_dev_hold(idev);
2438 
2439 	rt->dst.flags |= DST_HOST;
2440 	rt->dst.input = ip6_input;
2441 	rt->dst.output = ip6_output;
2442 	rt->rt6i_idev = idev;
2443 
2444 	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2445 	if (anycast)
2446 		rt->rt6i_flags |= RTF_ANYCAST;
2447 	else
2448 		rt->rt6i_flags |= RTF_LOCAL;
2449 
2450 	rt->rt6i_gateway  = *addr;
2451 	rt->rt6i_dst.addr = *addr;
2452 	rt->rt6i_dst.plen = 128;
2453 	rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2454 
2455 	atomic_set(&rt->dst.__refcnt, 1);
2456 
2457 	return rt;
2458 }
2459 
2460 int ip6_route_get_saddr(struct net *net,
2461 			struct rt6_info *rt,
2462 			const struct in6_addr *daddr,
2463 			unsigned int prefs,
2464 			struct in6_addr *saddr)
2465 {
2466 	struct inet6_dev *idev =
2467 		rt ? ip6_dst_idev((struct dst_entry *)rt) : NULL;
2468 	int err = 0;
2469 	if (rt && rt->rt6i_prefsrc.plen)
2470 		*saddr = rt->rt6i_prefsrc.addr;
2471 	else
2472 		err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2473 					 daddr, prefs, saddr);
2474 	return err;
2475 }
2476 
/* remove deleted ip from prefsrc entries */
struct arg_dev_net_ip {
	struct net_device *dev;	/* restrict to this device; NULL = any */
	struct net *net;	/* namespace being cleaned */
	struct in6_addr *addr;	/* the address going away */
};
2483 
2484 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2485 {
2486 	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2487 	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2488 	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2489 
2490 	if (((void *)rt->dst.dev == dev || !dev) &&
2491 	    rt != net->ipv6.ip6_null_entry &&
2492 	    ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2493 		/* remove prefsrc entry */
2494 		rt->rt6i_prefsrc.plen = 0;
2495 	}
2496 	return 0;
2497 }
2498 
2499 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2500 {
2501 	struct net *net = dev_net(ifp->idev->dev);
2502 	struct arg_dev_net_ip adni = {
2503 		.dev = ifp->idev->dev,
2504 		.net = net,
2505 		.addr = &ifp->addr,
2506 	};
2507 	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
2508 }
2509 
/* Flag combinations identifying RA-learned router routes and cached
 * gateway routes, respectively.
 */
#define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

/* Remove routers and update dst entries when gateway turn into host. */
static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
{
	struct in6_addr *gateway = (struct in6_addr *)arg;

	/* Match either an RA router route or a cached gateway route whose
	 * next hop is the address that just became a plain host.  The -1
	 * return presumably asks the fib walker to delete the entry (same
	 * convention as fib6_ifdown() below) — confirm in fib6_clean_all().
	 */
	if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) ||
	     ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) &&
	     ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
		return -1;
	}
	return 0;
}
2525 
/* A neighbour changed from router to host: purge routes using it as a
 * gateway from all FIB tables.
 */
void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
{
	fib6_clean_all(net, fib6_clean_tohost, gateway);
}
2530 
/* Walker argument for fib6_ifdown(). */
struct arg_dev_net {
	struct net_device *dev;	/* device going down; NULL = flush all */
	struct net *net;	/* namespace being cleaned */
};
2535 
2536 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2537 {
2538 	const struct arg_dev_net *adn = arg;
2539 	const struct net_device *dev = adn->dev;
2540 
2541 	if ((rt->dst.dev == dev || !dev) &&
2542 	    rt != adn->net->ipv6.ip6_null_entry)
2543 		return -1;
2544 
2545 	return 0;
2546 }
2547 
/* Device-down handler: purge routes via @dev from all FIB tables and the
 * ICMP rate-limit cache, then flush the uncached-route list for the device.
 */
void rt6_ifdown(struct net *net, struct net_device *dev)
{
	struct arg_dev_net adn = {
		.dev = dev,
		.net = net,
	};

	fib6_clean_all(net, fib6_ifdown, &adn);
	icmp6_clean_all(fib6_ifdown, &adn);
	rt6_uncached_list_flush_dev(net, dev);
}
2559 
/* Walker argument for rt6_mtu_change_route(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* the new MTU */
};
2564 
/* fib6_clean_all() callback: propagate a device MTU change into the
 * matching routes' MTU metrics.  Always returns 0 (routes are updated in
 * place, never deleted).
 */
static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
{
	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
	struct inet6_dev *idev;

	/* In IPv6 pmtu discovery is not optional,
	   so that RTAX_MTU lock cannot disable it.
	   We still use this lock to block changes
	   caused by addrconf/ndisc.
	*/

	idev = __in6_dev_get(arg->dev);
	if (!idev)
		return 0;

	/* For administrative MTU increase, there is no way to discover
	   IPv6 PMTU increase, so PMTU increase should be updated here.
	   Since RFC 1981 doesn't include administrative MTU increase
	   update PMTU increase is a MUST. (i.e. jumbo frame)
	 */
	/*
	   If new MTU is less than route PMTU, this new MTU will be the
	   lowest MTU in the path, update the route PMTU to reflect PMTU
	   decreases; if new MTU is greater than route PMTU, and the
	   old MTU is the lowest MTU in the path, update the route PMTU
	   to reflect the increase. In this case if the other nodes' MTU
	   also have the lowest MTU, TOO BIG MESSAGE will lead to
	   PMTU discovery.
	 */
	if (rt->dst.dev == arg->dev &&
	    !dst_metric_locked(&rt->dst, RTAX_MTU)) {
		if (rt->rt6i_flags & RTF_CACHE) {
			/* For RTF_CACHE with rt6i_pmtu == 0
			 * (i.e. a redirected route),
			 * the metrics of its rt->dst.from has already
			 * been updated.
			 */
			if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu)
				rt->rt6i_pmtu = arg->mtu;
		} else if (dst_mtu(&rt->dst) >= arg->mtu ||
			   (dst_mtu(&rt->dst) < arg->mtu &&
			    dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
			dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
		}
	}
	return 0;
}
2612 
/* Entry point for device MTU changes: walk all FIB tables and update the
 * MTU metric of every route using @dev.
 */
void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
{
	struct rt6_mtu_change_arg arg = {
		.dev = dev,
		.mtu = mtu,
	};

	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
}
2622 
/* Netlink attribute validation policy for RTM_*ROUTE requests;
 * attributes not listed are accepted without length/type checks.
 */
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
	[RTA_OIF]               = { .type = NLA_U32 },
	[RTA_IIF]		= { .type = NLA_U32 },
	[RTA_PRIORITY]          = { .type = NLA_U32 },
	[RTA_METRICS]           = { .type = NLA_NESTED },
	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
	[RTA_PREF]              = { .type = NLA_U8 },
};
2632 
/* Parse an RTM_NEWROUTE/RTM_DELROUTE netlink message into a fib6_config.
 * Returns 0 on success, a negative errno on malformed input.
 * NOTE(review): cfg->fc_mx/fc_mp point into the skb's attribute data, so
 * cfg must not outlive the message — verify at the call sites.
 */
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct fib6_config *cfg)
{
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	unsigned int pref;
	int err;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	rtm = nlmsg_data(nlh);
	memset(cfg, 0, sizeof(*cfg));

	cfg->fc_table = rtm->rtm_table;
	cfg->fc_dst_len = rtm->rtm_dst_len;
	cfg->fc_src_len = rtm->rtm_src_len;
	cfg->fc_flags = RTF_UP;
	cfg->fc_protocol = rtm->rtm_protocol;
	cfg->fc_type = rtm->rtm_type;

	/* Reject-type routes all map to RTF_REJECT; the specific type is
	 * kept in fc_type to pick the dst error code later.
	 */
	if (rtm->rtm_type == RTN_UNREACHABLE ||
	    rtm->rtm_type == RTN_BLACKHOLE ||
	    rtm->rtm_type == RTN_PROHIBIT ||
	    rtm->rtm_type == RTN_THROW)
		cfg->fc_flags |= RTF_REJECT;

	if (rtm->rtm_type == RTN_LOCAL)
		cfg->fc_flags |= RTF_LOCAL;

	if (rtm->rtm_flags & RTM_F_CLONED)
		cfg->fc_flags |= RTF_CACHE;

	cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
	cfg->fc_nlinfo.nlh = nlh;
	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);

	if (tb[RTA_GATEWAY]) {
		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
		cfg->fc_flags |= RTF_GATEWAY;
	}

	if (tb[RTA_DST]) {
		/* The attribute only needs to carry the prefix bytes. */
		int plen = (rtm->rtm_dst_len + 7) >> 3;

		if (nla_len(tb[RTA_DST]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
	}

	if (tb[RTA_SRC]) {
		int plen = (rtm->rtm_src_len + 7) >> 3;

		if (nla_len(tb[RTA_SRC]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
	}

	if (tb[RTA_PREFSRC])
		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);

	if (tb[RTA_OIF])
		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_PRIORITY])
		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

	if (tb[RTA_METRICS]) {
		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
	}

	if (tb[RTA_TABLE])
		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

	if (tb[RTA_MULTIPATH]) {
		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
	}

	if (tb[RTA_PREF]) {
		/* Unknown preference values degrade to medium (RFC 4191). */
		pref = nla_get_u8(tb[RTA_PREF]);
		if (pref != ICMPV6_ROUTER_PREF_LOW &&
		    pref != ICMPV6_ROUTER_PREF_HIGH)
			pref = ICMPV6_ROUTER_PREF_MEDIUM;
		cfg->fc_flags |= RTF_PREF(pref);
	}

	err = 0;
errout:
	return err;
}
2729 
/* Apply an RTA_MULTIPATH request: add (@add != 0) or delete each listed
 * nexthop as an individual route.  Returns 0 or the last error seen.
 * NOTE(review): dereferences cfg->fc_nlinfo.nlh unconditionally below, so
 * this must only be called from the rtnetlink paths where nlh is set —
 * confirm no other caller passes a NULL nlh.
 */
static int ip6_route_multipath(struct fib6_config *cfg, int add)
{
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	int remaining;
	int attrlen;
	int err = 0, last_err = 0;

	remaining = cfg->fc_mp_len;
beginning:
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry */
	while (rtnh_ok(rtnh, remaining)) {
		/* Each nexthop starts from the shared config, then applies
		 * its own ifindex/gateway overrides.
		 */
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				r_cfg.fc_gateway = nla_get_in6_addr(nla);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
		}
		err = add ? ip6_route_add(&r_cfg) : ip6_route_del(&r_cfg);
		if (err) {
			last_err = err;
			/* If we are trying to remove a route, do not stop the
			 * loop when ip6_route_del() fails (because next hop is
			 * already gone), we should try to remove all next hops.
			 */
			if (add) {
				/* If add fails, we should try to delete all
				 * next hops that have been already added.
				 * Restart in delete mode over only the
				 * nexthops processed so far.
				 */
				add = 0;
				remaining = cfg->fc_mp_len - remaining;
				goto beginning;
			}
		}
		/* Because each route is added like a single route we remove
		 * these flags after the first nexthop: if there is a collision,
		 * we have already failed to add the first nexthop:
		 * fib6_add_rt2node() has rejected it; when replacing, old
		 * nexthops have been replaced by first new, the rest should
		 * be added to it.
		 */
		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
						     NLM_F_REPLACE);
		rtnh = rtnh_next(rtnh, &remaining);
	}

	return last_err;
}
2788 
2789 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh)
2790 {
2791 	struct fib6_config cfg;
2792 	int err;
2793 
2794 	err = rtm_to_fib6_config(skb, nlh, &cfg);
2795 	if (err < 0)
2796 		return err;
2797 
2798 	if (cfg.fc_mp)
2799 		return ip6_route_multipath(&cfg, 0);
2800 	else
2801 		return ip6_route_del(&cfg);
2802 }
2803 
2804 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
2805 {
2806 	struct fib6_config cfg;
2807 	int err;
2808 
2809 	err = rtm_to_fib6_config(skb, nlh, &cfg);
2810 	if (err < 0)
2811 		return err;
2812 
2813 	if (cfg.fc_mp)
2814 		return ip6_route_multipath(&cfg, 1);
2815 	else
2816 		return ip6_route_add(&cfg);
2817 }
2818 
/* Worst-case netlink message size for one route, used to size the
 * notification skb in inet6_rt_notify().  Must cover every attribute
 * rt6_fill_node() may emit, or fill will fail with -EMSGSIZE.
 */
static inline size_t rt6_nlmsg_size(void)
{
	return NLMSG_ALIGN(sizeof(struct rtmsg))
	       + nla_total_size(16) /* RTA_SRC */
	       + nla_total_size(16) /* RTA_DST */
	       + nla_total_size(16) /* RTA_GATEWAY */
	       + nla_total_size(16) /* RTA_PREFSRC */
	       + nla_total_size(4) /* RTA_TABLE */
	       + nla_total_size(4) /* RTA_IIF */
	       + nla_total_size(4) /* RTA_OIF */
	       + nla_total_size(4) /* RTA_PRIORITY */
	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
	       + nla_total_size(sizeof(struct rta_cacheinfo))
	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
	       + nla_total_size(1); /* RTA_PREF */
}
2835 
/* Serialize one route into a netlink message on @skb.
 * Returns 0 on success, 1 when @prefix filtering skipped the route, or
 * -EMSGSIZE when the skb ran out of room (the partial message is
 * cancelled).  @dst/@src, when non-NULL, override the route's own
 * addresses (used by RTM_GETROUTE replies for the queried flow).
 */
static int rt6_fill_node(struct net *net,
			 struct sk_buff *skb, struct rt6_info *rt,
			 struct in6_addr *dst, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 int prefix, int nowait, unsigned int flags)
{
	u32 metrics[RTAX_MAX];
	struct rtmsg *rtm;
	struct nlmsghdr *nlh;
	long expires;
	u32 table;

	if (prefix) {	/* user wants prefix routes only */
		if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
			/* success since this is not a prefix route */
			return 1;
		}
	}

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt->rt6i_dst.plen;
	rtm->rtm_src_len = rt->rt6i_src.plen;
	rtm->rtm_tos = 0;
	if (rt->rt6i_table)
		table = rt->rt6i_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	rtm->rtm_table = table;
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;
	if (rt->rt6i_flags & RTF_REJECT) {
		/* Map the dst error back to the route type the user set. */
		switch (rt->dst.error) {
		case -EINVAL:
			rtm->rtm_type = RTN_BLACKHOLE;
			break;
		case -EACCES:
			rtm->rtm_type = RTN_PROHIBIT;
			break;
		case -EAGAIN:
			rtm->rtm_type = RTN_THROW;
			break;
		default:
			rtm->rtm_type = RTN_UNREACHABLE;
			break;
		}
	}
	else if (rt->rt6i_flags & RTF_LOCAL)
		rtm->rtm_type = RTN_LOCAL;
	else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
		rtm->rtm_type = RTN_LOCAL;
	else
		rtm->rtm_type = RTN_UNICAST;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->rt6i_protocol;
	if (rt->rt6i_flags & RTF_DYNAMIC)
		rtm->rtm_protocol = RTPROT_REDIRECT;
	else if (rt->rt6i_flags & RTF_ADDRCONF) {
		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ROUTEINFO))
			rtm->rtm_protocol = RTPROT_RA;
		else
			rtm->rtm_protocol = RTPROT_KERNEL;
	}

	if (rt->rt6i_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	if (dst) {
		if (nla_put_in6_addr(skb, RTA_DST, dst))
			goto nla_put_failure;
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
			goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		if (nla_put_in6_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
		goto nla_put_failure;
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
			/* Multicast routes are resolved by ip6mr; in the
			 * !nowait case err == 0 means the reply will be
			 * sent asynchronously, so stop filling here.
			 */
			int err = ip6mr_get_route(net, skb, rtm, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
				}
			}
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
	} else if (dst) {
		struct in6_addr saddr_buf;
		if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	if (rt->rt6i_prefsrc.plen) {
		struct in6_addr saddr_buf;
		saddr_buf = rt->rt6i_prefsrc.addr;
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	/* A cached PMTU overrides the route's MTU metric in the dump. */
	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
	if (rt->rt6i_pmtu)
		metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
	if (rtnetlink_put_metrics(skb, metrics) < 0)
		goto nla_put_failure;

	if (rt->rt6i_flags & RTF_GATEWAY) {
		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
			goto nla_put_failure;
	}

	if (rt->dst.dev &&
	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
		goto nla_put_failure;
	if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
		goto nla_put_failure;

	expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;

	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
		goto nla_put_failure;

	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
		goto nla_put_failure;

	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	/* Roll back the partially built message before reporting. */
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
2988 
2989 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2990 {
2991 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2992 	int prefix;
2993 
2994 	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2995 		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2996 		prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2997 	} else
2998 		prefix = 0;
2999 
3000 	return rt6_fill_node(arg->net,
3001 		     arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
3002 		     NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
3003 		     prefix, 0, NLM_F_MULTI);
3004 }
3005 
/* RTM_GETROUTE handler: perform a route lookup for the described flow and
 * unicast the resulting route back to the requester.
 * NOTE(review): the looked-up rt is used without checking rt->dst.error;
 * reject routes are reported as-is rather than as an error — confirm this
 * is the intended behavior.
 */
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	struct rt6_info *rt;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi6 fl6;
	int err, iif = 0, oif = 0;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	memset(&fl6, 0, sizeof(fl6));

	if (tb[RTA_SRC]) {
		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
			goto errout;

		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
	}

	if (tb[RTA_DST]) {
		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
			goto errout;

		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
	}

	if (tb[RTA_IIF])
		iif = nla_get_u32(tb[RTA_IIF]);

	if (tb[RTA_OIF])
		oif = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_MARK])
		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);

	if (iif) {
		/* An input interface was given: simulate the forwarding
		 * path lookup for a packet arriving on that device.
		 */
		struct net_device *dev;
		int flags = 0;

		dev = __dev_get_by_index(net, iif);
		if (!dev) {
			err = -ENODEV;
			goto errout;
		}

		fl6.flowi6_iif = iif;

		if (!ipv6_addr_any(&fl6.saddr))
			flags |= RT6_LOOKUP_F_HAS_SADDR;

		rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6,
							       flags);
	} else {
		/* Otherwise do a locally-originated output lookup. */
		fl6.flowi6_oif = oif;

		rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
	}

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb) {
		ip6_rt_put(rt);
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers, this skb can pass
	   through good chunk of routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));

	/* The skb takes over the route reference. */
	skb_dst_set(skb, &rt->dst);

	err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
			    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
			    nlh->nlmsg_seq, 0, 0, 0);
	if (err < 0) {
		kfree_skb(skb);
		goto errout;
	}

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;
}
3096 
/* Broadcast a route change (@event is RTM_NEWROUTE/RTM_DELROUTE) to the
 * RTNLGRP_IPV6_ROUTE multicast group; on failure the error is recorded on
 * the group so listeners see ENOBUFS.  gfp_any() adapts the allocation to
 * the calling context (softirq vs process).
 */
void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
{
	struct sk_buff *skb;
	struct net *net = info->nl_net;
	u32 seq;
	int err;

	err = -ENOBUFS;
	seq = info->nlh ? info->nlh->nlmsg_seq : 0;

	skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
	if (!skb)
		goto errout;

	err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
				event, info->portid, seq, 0, 0, 0);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
		    info->nlh, gfp_any());
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}
3126 
/* Netdevice notifier: once the namespace's loopback device registers,
 * attach it (and its inet6_dev) to the special null/prohibit/blackhole
 * routes, which need some device to hang off.
 */
static int ip6_route_dev_notify(struct notifier_block *this,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
		net->ipv6.ip6_null_entry->dst.dev = dev;
		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
	}

	return NOTIFY_OK;
}
3146 
3147 /*
3148  *	/proc
3149  */
3150 
3151 #ifdef CONFIG_PROC_FS
3152 
/* /proc/net/ipv6_route: seq_file dump of the routing table
 * (ipv6_route_open is defined elsewhere in this file).
 */
static const struct file_operations ipv6_route_proc_fops = {
	.owner		= THIS_MODULE,
	.open		= ipv6_route_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};
3160 
/* /proc/net/rt6_stats: one line of hex FIB counters for this namespace. */
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = (struct net *)seq->private;
	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
		   net->ipv6.rt6_stats->fib_nodes,
		   net->ipv6.rt6_stats->fib_route_nodes,
		   net->ipv6.rt6_stats->fib_rt_alloc,
		   net->ipv6.rt6_stats->fib_rt_entries,
		   net->ipv6.rt6_stats->fib_rt_cache,
		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
		   net->ipv6.rt6_stats->fib_discarded_routes);

	return 0;
}
3175 
/* open() hook for /proc/net/rt6_stats (single-record, per-netns). */
static int rt6_stats_seq_open(struct inode *inode, struct file *file)
{
	return single_open_net(inode, file, rt6_stats_seq_show);
}
3180 
/* File operations for /proc/net/rt6_stats. */
static const struct file_operations rt6_stats_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt6_stats_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = single_release_net,
};
3188 #endif	/* CONFIG_PROC_FS */
3189 
3190 #ifdef CONFIG_SYSCTL
3191 
3192 static
3193 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
3194 			      void __user *buffer, size_t *lenp, loff_t *ppos)
3195 {
3196 	struct net *net;
3197 	int delay;
3198 	if (!write)
3199 		return -EINVAL;
3200 
3201 	net = (struct net *)ctl->extra1;
3202 	delay = net->ipv6.sysctl.flush_delay;
3203 	proc_dointvec(ctl, write, buffer, lenp, ppos);
3204 	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
3205 	return 0;
3206 }
3207 
/* Template for the per-namespace net.ipv6.route.* sysctl table; the .data
 * pointers below reference init_net and are redirected to each namespace's
 * own fields in ipv6_route_sysctl_init().  Entry order must stay in sync
 * with the table[N] indexing there.
 */
struct ctl_table ipv6_route_table_template[] = {
	{
		.procname	=	"flush",
		.data		=	&init_net.ipv6.sysctl.flush_delay,
		.maxlen		=	sizeof(int),
		.mode		=	0200,
		.proc_handler	=	ipv6_sysctl_rtcache_flush
	},
	{
		.procname	=	"gc_thresh",
		.data		=	&ip6_dst_ops_template.gc_thresh,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"max_size",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_min_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_timeout",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_elasticity",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"mtu_expires",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"min_adv_mss",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_min_interval_ms",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_ms_jiffies,
	},
	{ }
};
3281 
3282 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
3283 {
3284 	struct ctl_table *table;
3285 
3286 	table = kmemdup(ipv6_route_table_template,
3287 			sizeof(ipv6_route_table_template),
3288 			GFP_KERNEL);
3289 
3290 	if (table) {
3291 		table[0].data = &net->ipv6.sysctl.flush_delay;
3292 		table[0].extra1 = net;
3293 		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
3294 		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
3295 		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3296 		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
3297 		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
3298 		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
3299 		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
3300 		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
3301 		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3302 
3303 		/* Don't export sysctls to unprivileged users */
3304 		if (net->user_ns != &init_user_ns)
3305 			table[0].procname = NULL;
3306 	}
3307 
3308 	return table;
3309 }
3310 #endif
3311 
/* Per-netns setup for the IPv6 routing engine: give the namespace its
 * own dst_ops (so dst entry accounting is per-netns), allocate its
 * copies of the special route templates (null, and with multiple
 * tables also prohibit and blackhole), and seed the routing sysctl
 * defaults.
 *
 * Returns 0 on success or -ENOMEM; on failure everything acquired so
 * far is released via the goto ladder, in reverse order.
 */
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	/* Namespace-private copy of the "null" route template. */
	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_ip6_dst_entries;
	/* The template copy is its own dst.path and uses this netns' ops. */
	net->ipv6.ip6_null_entry->dst.path =
		(struct dst_entry *)net->ipv6.ip6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	/* Same setup for the policy-routing "prohibit" ... */
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.path =
		(struct dst_entry *)net->ipv6.ip6_prohibit_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	/* ... and "blackhole" template entries. */
	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.path =
		(struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	/* Default sysctl values; tunable via /proc/sys/net/ipv6/route/. */
	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

	/* Error unwind: free in reverse order of allocation. */
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}
3383 
/* Per-netns teardown: free the special route entries allocated by
 * ip6_route_net_init() and release the dst entry accounting.
 */
static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}
3393 
3394 static int __net_init ip6_route_net_init_late(struct net *net)
3395 {
3396 #ifdef CONFIG_PROC_FS
3397 	proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
3398 	proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
3399 #endif
3400 	return 0;
3401 }
3402 
/* Per-netns late teardown: remove the /proc/net files created by
 * ip6_route_net_init_late().
 */
static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}
3410 
/* Core per-namespace setup/teardown of the IPv6 routing engine. */
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};
3415 
3416 static int __net_init ipv6_inetpeer_init(struct net *net)
3417 {
3418 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3419 
3420 	if (!bp)
3421 		return -ENOMEM;
3422 	inet_peer_base_init(bp);
3423 	net->ipv6.peers = bp;
3424 	return 0;
3425 }
3426 
3427 static void __net_exit ipv6_inetpeer_exit(struct net *net)
3428 {
3429 	struct inet_peer_base *bp = net->ipv6.peers;
3430 
3431 	net->ipv6.peers = NULL;
3432 	inetpeer_invalidate_tree(bp);
3433 	kfree(bp);
3434 }
3435 
/* Per-namespace lifetime of the IPv6 inetpeer cache. */
static struct pernet_operations ipv6_inetpeer_ops = {
	.init	=	ipv6_inetpeer_init,
	.exit	=	ipv6_inetpeer_exit,
};
3440 
/* Late per-namespace hooks (registered after fib6/xfrm6/rules init). */
static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};
3445 
/* Netdevice event notifier, registered last in ip6_route_init(). */
static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = 0,	/* default priority */
};
3450 
/* Boot-time initialization of the IPv6 routing subsystem: the rt6_info
 * slab cache, dst accounting for the blackhole ops, the pernet
 * subsystems, the FIB, xfrm and policy-rule engines, the rtnetlink
 * route handlers, the netdevice notifier and the per-cpu uncached
 * route lists.
 *
 * Returns 0 on success.  On failure, everything registered so far is
 * unwound via the goto ladder at the bottom (reverse order of the
 * registrations above) and an errno is returned.
 */
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	/* The blackhole ops share the rt6_info slab cache. */
	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	/* Registering of the loopback is done before this portion of code,
	 * the loopback reference in rt6_info will not be taken, do it
	 * manually for init_net */
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #endif
	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	/* rtnetlink handlers for RTM_{NEW,DEL,GET}ROUTE on PF_INET6. */
	ret = -ENOBUFS;
	if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
	    __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
	    __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	/* Initialize the per-cpu lists that hold uncached routes. */
	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

	/* Error unwind: undo the registrations above in reverse order. */
out_register_late_subsys:
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}
3542 
/* Teardown of the IPv6 routing subsystem: undo the registrations made
 * by ip6_route_init(), in (mostly) reverse order.
 */
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}
3555