xref: /openbmc/linux/net/ipv6/route.c (revision 179dd8c0)
1 /*
2  *	Linux INET6 implementation
3  *	FIB front-end.
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13 
14 /*	Changes:
15  *
16  *	YOSHIFUJI Hideaki @USAGI
17  *		reworked default router selection.
18  *		- respect outgoing interface
19  *		- select from (probably) reachable routers (i.e.
20  *		routers in REACHABLE, STALE, DELAY or PROBE states).
21  *		- always select the same router if it is (probably)
22  *		reachable.  otherwise, round-robin the list.
23  *	Ville Nuorvala
24  *		Fixed routing subtrees.
25  */
26 
27 #define pr_fmt(fmt) "IPv6: " fmt
28 
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <net/net_namespace.h>
48 #include <net/snmp.h>
49 #include <net/ipv6.h>
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
52 #include <net/ndisc.h>
53 #include <net/addrconf.h>
54 #include <net/tcp.h>
55 #include <linux/rtnetlink.h>
56 #include <net/dst.h>
57 #include <net/xfrm.h>
58 #include <net/netevent.h>
59 #include <net/netlink.h>
60 #include <net/nexthop.h>
61 
62 #include <asm/uaccess.h>
63 
64 #ifdef CONFIG_SYSCTL
65 #include <linux/sysctl.h>
66 #endif
67 
/* Scores returned by rt6_check_neigh() and consumed by rt6_score_route()/
 * find_match().  Negative values request fallback behaviour.
 */
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,		/* route must not be used */
	RT6_NUD_FAIL_PROBE = -2,	/* low score; neighbour needs probing */
	RT6_NUD_FAIL_DO_RR = -1,	/* trigger round-robin to next router */
	RT6_NUD_SUCCEED = 1		/* neighbour considered reachable */
};
74 
/* Forward declarations for the dst_ops callbacks and local helpers that
 * are referenced before their definitions below.
 */
static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
static unsigned int	 ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void		ip6_dst_destroy(struct dst_entry *);
static void		ip6_dst_ifdown(struct dst_entry *,
				       struct net_device *dev, int how);
static int		 ip6_dst_gc(struct dst_ops *ops);

static int		ip6_pkt_discard(struct sk_buff *skb);
static int		ip6_pkt_discard_out(struct sock *sk, struct sk_buff *skb);
static int		ip6_pkt_prohibit(struct sk_buff *skb);
static int		ip6_pkt_prohibit_out(struct sock *sk, struct sk_buff *skb);
static void		ip6_link_failure(struct sk_buff *skb);
static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static void		rt6_dst_from_metrics_check(struct rt6_info *rt);
static int rt6_score_route(struct rt6_info *rt, int oif, int strict);

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct rt6_info *rt6_add_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr, int ifindex,
					   unsigned int pref);
static struct rt6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr, int ifindex);
#endif
106 
/* Per-CPU list of "uncached" routes (clones not owned by the fib6 tree).
 * Tracked so rt6_uncached_list_flush_dev() can fix up their device
 * references when a netdevice goes away.
 */
struct uncached_list {
	spinlock_t		lock;	/* protects head */
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
113 
114 static void rt6_uncached_list_add(struct rt6_info *rt)
115 {
116 	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
117 
118 	rt->dst.flags |= DST_NOCACHE;
119 	rt->rt6i_uncached_list = ul;
120 
121 	spin_lock_bh(&ul->lock);
122 	list_add_tail(&rt->rt6i_uncached, &ul->head);
123 	spin_unlock_bh(&ul->lock);
124 }
125 
126 static void rt6_uncached_list_del(struct rt6_info *rt)
127 {
128 	if (!list_empty(&rt->rt6i_uncached)) {
129 		struct uncached_list *ul = rt->rt6i_uncached_list;
130 
131 		spin_lock_bh(&ul->lock);
132 		list_del(&rt->rt6i_uncached);
133 		spin_unlock_bh(&ul->lock);
134 	}
135 }
136 
/* Walk every CPU's uncached list and re-parent any route referencing
 * @dev (or any non-loopback device when @dev is NULL) onto the
 * namespace's loopback device, transferring the idev and netdev
 * reference counts accordingly.
 */
static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			/* swap the inet6_dev reference over to loopback */
			if (rt_idev && (rt_idev->dev == dev || !dev) &&
			    rt_idev->dev != loopback_dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			/* swap the netdev reference over to loopback */
			if (rt_dev && (rt_dev == dev || !dev) &&
			    rt_dev != loopback_dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}
167 
168 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
169 {
170 	return dst_metrics_write_ptr(rt->dst.from);
171 }
172 
173 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
174 {
175 	struct rt6_info *rt = (struct rt6_info *)dst;
176 
177 	if (rt->rt6i_flags & RTF_PCPU)
178 		return rt6_pcpu_cow_metrics(rt);
179 	else if (rt->rt6i_flags & RTF_CACHE)
180 		return NULL;
181 	else
182 		return dst_cow_metrics_generic(dst, old);
183 }
184 
185 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
186 					     struct sk_buff *skb,
187 					     const void *daddr)
188 {
189 	struct in6_addr *p = &rt->rt6i_gateway;
190 
191 	if (!ipv6_addr_any(p))
192 		return (const void *) p;
193 	else if (skb)
194 		return &ipv6_hdr(skb)->daddr;
195 	return daddr;
196 }
197 
198 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
199 					  struct sk_buff *skb,
200 					  const void *daddr)
201 {
202 	struct rt6_info *rt = (struct rt6_info *) dst;
203 	struct neighbour *n;
204 
205 	daddr = choose_neigh_daddr(rt, skb, daddr);
206 	n = __ipv6_neigh_lookup(dst->dev, daddr);
207 	if (n)
208 		return n;
209 	return neigh_create(&nd_tbl, daddr, dst->dev);
210 }
211 
/* Template for the per-namespace IPv6 dst_ops; wires the generic dst
 * layer callbacks to the IPv6 implementations defined in this file.
 */
static struct dst_ops ip6_dst_ops_template = {
	.family			=	AF_INET6,
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
	.default_advmss		=	ip6_default_advmss,
	.mtu			=	ip6_mtu,
	.cow_metrics		=	ipv6_cow_metrics,
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
	.redirect		=	rt6_do_redirect,
	.local_out		=	__ip6_local_out,
	.neigh_lookup		=	ip6_neigh_lookup,
};
229 
230 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
231 {
232 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
233 
234 	return mtu ? : dst->dev->mtu;
235 }
236 
/* Blackhole routes ignore PMTU updates. */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}
241 
/* Blackhole routes ignore redirects. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}
246 
/* Blackhole routes never provide writable metrics. */
static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
					 unsigned long old)
{
	return NULL;
}
252 
/* dst_ops used by ip6_blackhole_route(): mostly no-op callbacks so the
 * resulting dst silently absorbs PMTU/redirect/metric updates.
 */
static struct dst_ops ip6_dst_blackhole_ops = {
	.family			=	AF_INET6,
	.destroy		=	ip6_dst_destroy,
	.check			=	ip6_dst_check,
	.mtu			=	ip6_blackhole_mtu,
	.default_advmss		=	ip6_default_advmss,
	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
	.redirect		=	ip6_rt_blackhole_redirect,
	.cow_metrics		=	ip6_rt_blackhole_cow_metrics,
	.neigh_lookup		=	ip6_neigh_lookup,
};
264 
/* Default metrics for the reject-route templates below; hop limit 0
 * here means "unset" (everything else is implicitly zero too).
 */
static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};
268 
/* Template for the per-namespace null route: discards traffic and
 * reports -ENETUNREACH.  Used as the "no route found" sentinel.
 */
static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol  = RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,	/* worst possible metric */
	.rt6i_ref	= ATOMIC_INIT(1),
};
283 
284 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
285 
/* Template for the prohibit route (policy routing): drops traffic with
 * -EACCES, i.e. administratively prohibited.
 */
static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol  = RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,	/* worst possible metric */
	.rt6i_ref	= ATOMIC_INIT(1),
};
300 
/* Template for the blackhole route (policy routing): silently discards
 * traffic in both directions with -EINVAL.
 */
static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_sk,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol  = RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,	/* worst possible metric */
	.rt6i_ref	= ATOMIC_INIT(1),
};
315 
316 #endif
317 
/* allocate dst with ip6_dst_ops
 *
 * Allocates a rt6_info via the generic dst layer and zeroes the
 * rt6-specific tail of the structure (dst_alloc only initializes the
 * embedded dst_entry).  Returns NULL on allocation failure.
 * NOTE(review): @table is currently unused here — presumably kept for
 * signature symmetry with callers; confirm before removing.
 */
static struct rt6_info *__ip6_dst_alloc(struct net *net,
					struct net_device *dev,
					int flags,
					struct fib6_table *table)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					0, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		struct dst_entry *dst = &rt->dst;

		/* zero everything after the embedded dst_entry */
		memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
		INIT_LIST_HEAD(&rt->rt6i_siblings);
		INIT_LIST_HEAD(&rt->rt6i_uncached);
	}
	return rt;
}
336 
337 static struct rt6_info *ip6_dst_alloc(struct net *net,
338 				      struct net_device *dev,
339 				      int flags,
340 				      struct fib6_table *table)
341 {
342 	struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags, table);
343 
344 	if (rt) {
345 		rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
346 		if (rt->rt6i_pcpu) {
347 			int cpu;
348 
349 			for_each_possible_cpu(cpu) {
350 				struct rt6_info **p;
351 
352 				p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
353 				/* no one shares rt */
354 				*p =  NULL;
355 			}
356 		} else {
357 			dst_destroy((struct dst_entry *)rt);
358 			return NULL;
359 		}
360 	}
361 
362 	return rt;
363 }
364 
/* dst_ops->destroy: release everything a rt6_info owns — metrics,
 * percpu clone array, uncached-list membership, inet6_dev reference,
 * and the reference on the route it was cloned from.
 */
static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct dst_entry *from = dst->from;
	struct inet6_dev *idev;

	dst_destroy_metrics_generic(dst);

	if (rt->rt6i_pcpu)
		free_percpu(rt->rt6i_pcpu);

	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	/* drop the reference on our origin route last */
	dst->from = NULL;
	dst_release(from);
}
387 
388 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
389 			   int how)
390 {
391 	struct rt6_info *rt = (struct rt6_info *)dst;
392 	struct inet6_dev *idev = rt->rt6i_idev;
393 	struct net_device *loopback_dev =
394 		dev_net(dev)->loopback_dev;
395 
396 	if (dev != loopback_dev) {
397 		if (idev && idev->dev == dev) {
398 			struct inet6_dev *loopback_idev =
399 				in6_dev_get(loopback_dev);
400 			if (loopback_idev) {
401 				rt->rt6i_idev = loopback_idev;
402 				in6_dev_put(idev);
403 			}
404 		}
405 	}
406 }
407 
408 static bool rt6_check_expired(const struct rt6_info *rt)
409 {
410 	if (rt->rt6i_flags & RTF_EXPIRES) {
411 		if (time_after(jiffies, rt->dst.expires))
412 			return true;
413 	} else if (rt->dst.from) {
414 		return rt6_check_expired((struct rt6_info *) rt->dst.from);
415 	}
416 	return false;
417 }
418 
/* Multipath route selection:
 *   Hash based function using packet header and flowlabel.
 * Adapted from fib_info_hashfn()
 *
 * Returns an index in [0, candidate_count) choosing which sibling
 * route this flow maps to.
 */
static int rt6_info_hash_nhsfn(unsigned int candidate_count,
			       const struct flowi6 *fl6)
{
	unsigned int val = fl6->flowi6_proto;

	val ^= ipv6_addr_hash(&fl6->daddr);
	val ^= ipv6_addr_hash(&fl6->saddr);

	/* This only works if the flow is not encapsulated: mix in the
	 * transport ports (or ICMPv6 type/code) when available.
	 */
	switch (fl6->flowi6_proto) {
	case IPPROTO_UDP:
	case IPPROTO_TCP:
	case IPPROTO_SCTP:
		val ^= (__force u16)fl6->fl6_sport;
		val ^= (__force u16)fl6->fl6_dport;
		break;

	case IPPROTO_ICMPV6:
		val ^= (__force u16)fl6->fl6_icmp_type;
		val ^= (__force u16)fl6->fl6_icmp_code;
		break;
	}
	/* RFC6438 recommends using the flow label */
	val ^= (__force u32)fl6->flowlabel;

	/* Perhaps, we need to tune, this function? */
	val = val ^ (val >> 7) ^ (val >> 12);
	return val % candidate_count;
}
452 
453 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
454 					     struct flowi6 *fl6, int oif,
455 					     int strict)
456 {
457 	struct rt6_info *sibling, *next_sibling;
458 	int route_choosen;
459 
460 	route_choosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6);
461 	/* Don't change the route, if route_choosen == 0
462 	 * (siblings does not include ourself)
463 	 */
464 	if (route_choosen)
465 		list_for_each_entry_safe(sibling, next_sibling,
466 				&match->rt6i_siblings, rt6i_siblings) {
467 			route_choosen--;
468 			if (route_choosen == 0) {
469 				if (rt6_score_route(sibling, oif, strict) < 0)
470 					break;
471 				match = sibling;
472 				break;
473 			}
474 		}
475 	return match;
476 }
477 
478 /*
479  *	Route lookup. Any table->tb6_lock is implied.
480  */
481 
/* Pick the route in the leaf chain whose device matches the lookup:
 * either the requested @oif, or (without an oif) a device owning @saddr.
 * Loopback routes are remembered as a fallback for oif lookups.  If a
 * strict interface match (RT6_LOOKUP_F_IFACE) is required and nothing
 * matched, the null entry is returned; otherwise the original @rt.
 */
static inline struct rt6_info *rt6_device_match(struct net *net,
						    struct rt6_info *rt,
						    const struct in6_addr *saddr,
						    int oif,
						    int flags)
{
	struct rt6_info *local = NULL;
	struct rt6_info *sprt;

	/* nothing to match against: keep the first route */
	if (!oif && ipv6_addr_any(saddr))
		goto out;

	for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
		struct net_device *dev = sprt->dst.dev;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
			if (dev->flags & IFF_LOOPBACK) {
				if (!sprt->rt6i_idev ||
				    sprt->rt6i_idev->dev->ifindex != oif) {
					if (flags & RT6_LOOKUP_F_IFACE && oif)
						continue;
					if (local && (!oif ||
						      local->rt6i_idev->dev->ifindex == oif))
						continue;
				}
				/* keep as fallback if no exact oif match */
				local = sprt;
			}
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif) {
		if (local)
			return local;

		if (flags & RT6_LOOKUP_F_IFACE)
			return net->ipv6.ip6_null_entry;
	}
out:
	return rt;
}
528 
529 #ifdef CONFIG_IPV6_ROUTER_PREF
/* Deferred-work payload for router reachability probing: the NS target
 * address and the device (held) to send it on.
 */
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;	/* reference held until the work runs */
};
535 
536 static void rt6_probe_deferred(struct work_struct *w)
537 {
538 	struct in6_addr mcaddr;
539 	struct __rt6_probe_work *work =
540 		container_of(w, struct __rt6_probe_work, work);
541 
542 	addrconf_addr_solict_mult(&work->target, &mcaddr);
543 	ndisc_send_ns(work->dev, NULL, &work->target, &mcaddr, NULL);
544 	dev_put(work->dev);
545 	kfree(work);
546 }
547 
/* Router Reachability Probing: if the gateway's neighbour entry is
 * missing or stale, schedule a deferred Neighbor Solicitation for it.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh;
	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
		return;
	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		write_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			goto out;	/* already reachable, nothing to do */
	}

	/* Probe when there is no neighbour entry at all, or the existing
	 * one has not been updated within rtr_probe_interval.
	 */
	if (!neigh ||
	    time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
		struct __rt6_probe_work *work;

		work = kmalloc(sizeof(*work), GFP_ATOMIC);

		if (neigh && work)
			__neigh_set_probe_once(neigh);

		if (neigh)
			write_unlock(&neigh->lock);

		/* defer the actual NS transmission to process context;
		 * the work item takes its own device reference
		 */
		if (work) {
			INIT_WORK(&work->work, rt6_probe_deferred);
			work->target = rt->rt6i_gateway;
			dev_hold(rt->dst.dev);
			work->dev = rt->dst.dev;
			schedule_work(&work->work);
		}
	} else {
out:
		/* reached only via the goto above, with neigh->lock held */
		write_unlock(&neigh->lock);
	}
	rcu_read_unlock_bh();
}
#else
/* Router Reachability Probing is compiled out without CONFIG_IPV6_ROUTER_PREF */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
599 
600 /*
601  * Default Router Selection (RFC 2461 6.3.6)
602  */
603 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
604 {
605 	struct net_device *dev = rt->dst.dev;
606 	if (!oif || dev->ifindex == oif)
607 		return 2;
608 	if ((dev->flags & IFF_LOOPBACK) &&
609 	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
610 		return 1;
611 	return 0;
612 }
613 
/* Neighbour component of the route score: map the gateway neighbour's
 * NUD state onto an rt6_nud_state value.  Routes without a gateway
 * next hop trivially succeed.
 */
static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
{
	struct neighbour *neigh;
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;

	if (rt->rt6i_flags & RTF_NONEXTHOP ||
	    !(rt->rt6i_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		/* with router preferences, tolerate anything short of
		 * an outright FAILED neighbour, which requests a probe
		 */
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		/* no neighbour entry: succeed under router preferences,
		 * otherwise ask for round-robin to the next router
		 */
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}
644 
/* Score a candidate route for selection: combines the device match,
 * the router-preference bits, and (when RT6_LOOKUP_F_REACHABLE is
 * requested) the gateway neighbour state.  Negative return values are
 * rt6_nud_state failure codes.
 */
static int rt6_score_route(struct rt6_info *rt, int oif,
			   int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	/* router preference occupies bits above the device score */
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
	return m;
}
663 
/* Compare @rt against the best candidate so far (@match / *mpri).
 * Returns the new best match; *do_rr is set when the winning route
 * asked for round-robin (RT6_NUD_FAIL_DO_RR).  Expired routes are
 * skipped, hard failures are rejected.
 */
static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
				   int *mpri, struct rt6_info *match,
				   bool *do_rr)
{
	int m;
	bool match_do_rr = false;

	if (rt6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}
694 
/* Scan the leaf chain for the best route at @metric, starting from the
 * round-robin head @rr_head and wrapping around to the chain start.
 * If nothing at @metric matched, fall back to scanning the remainder
 * of the chain (routes at other metrics, remembered in @cont).
 */
static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
				     struct rt6_info *rr_head,
				     u32 metric, int oif, int strict,
				     bool *do_rr)
{
	struct rt6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	/* first pass: from the rr head to the end of the metric group */
	for (rt = rr_head; rt; rt = rt->dst.rt6_next) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	/* second pass: from the chain start up to the rr head (wrap) */
	for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	/* fallback: consider routes beyond the primary metric group */
	for (rt = cont; rt; rt = rt->dst.rt6_next)
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}
731 
/* Default router selection for a fib6 node: score the routes in the
 * leaf chain and, if the winner asked for round-robin, advance the
 * node's rr_ptr to the next route of equal metric.
 */
static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
{
	struct rt6_info *match, *rt0;
	struct net *net;
	bool do_rr = false;

	rt0 = fn->rr_ptr;
	if (!rt0)
		fn->rr_ptr = rt0 = fn->leaf;

	match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct rt6_info *next = rt0->dst.rt6_next;

		/* no entries matched; do round-robin */
		if (!next || next->rt6i_metric != rt0->rt6i_metric)
			next = fn->leaf;

		if (next != rt0)
			fn->rr_ptr = next;
	}

	net = dev_net(rt0->dst.dev);
	return match ? match : net->ipv6.ip6_null_entry;
}
759 
760 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
761 {
762 	return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
763 }
764 
765 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Process a Route Information option received in a Router Advertisement
 * (@opt/@len, announced by @gwaddr on @dev): validate it, then add,
 * update, or delete the corresponding RTF_ROUTEINFO route.  Returns 0
 * on success or -EINVAL for malformed options.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length: the option must be
	 * long enough to carry the announced number of prefix bits.
	 */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	/* a zero-length prefix designates a default router */
	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev->ifindex);

	/* zero lifetime withdraws an existing route */
	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
					pref);
	else if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			rt6_clean_expires(rt);
		else
			rt6_set_expires(rt, jiffies + HZ * lifetime);

		ip6_rt_put(rt);
	}
	return 0;
}
839 #endif
840 
841 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
842 					struct in6_addr *saddr)
843 {
844 	struct fib6_node *pn;
845 	while (1) {
846 		if (fn->fn_flags & RTN_TL_ROOT)
847 			return NULL;
848 		pn = fn->parent;
849 		if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)
850 			fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr);
851 		else
852 			fn = pn;
853 		if (fn->fn_flags & RTN_RTINFO)
854 			return fn;
855 	}
856 }
857 
/* Simple (non-caching) policy lookup in one fib6 table: find the node
 * for the flow's addresses, match the device and multipath siblings,
 * and backtrack toward the root if only the null entry was found.
 * Returns the route with its use count bumped.
 */
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6, int flags)
{
	struct fib6_node *fn;
	struct rt6_info *rt;

	read_lock_bh(&table->tb6_lock);
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	rt = fn->leaf;
	rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
	if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
		rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}
	dst_use(&rt->dst, jiffies);
	read_unlock_bh(&table->tb6_lock);
	return rt;

}
882 
/* Public wrapper: run the simple lookup through the policy-routing
 * rule engine.
 */
struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				    int flags)
{
	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);
889 
890 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
891 			    const struct in6_addr *saddr, int oif, int strict)
892 {
893 	struct flowi6 fl6 = {
894 		.flowi6_oif = oif,
895 		.daddr = *daddr,
896 	};
897 	struct dst_entry *dst;
898 	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
899 
900 	if (saddr) {
901 		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
902 		flags |= RT6_LOOKUP_F_HAS_SADDR;
903 	}
904 
905 	dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
906 	if (dst->error == 0)
907 		return (struct rt6_info *) dst;
908 
909 	dst_release(dst);
910 
911 	return NULL;
912 }
913 EXPORT_SYMBOL(rt6_lookup);
914 
/* ip6_ins_rt is called with FREE table->tb6_lock.
   It takes a new route entry; if the addition fails for any reason,
   the route is freed.  In any case, if the caller does not hold a
   reference, the route may be destroyed.
 */
920 
921 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
922 			struct mx6_config *mxc)
923 {
924 	int err;
925 	struct fib6_table *table;
926 
927 	table = rt->rt6i_table;
928 	write_lock_bh(&table->tb6_lock);
929 	err = fib6_add(&table->tb6_root, rt, info, mxc);
930 	write_unlock_bh(&table->tb6_lock);
931 
932 	return err;
933 }
934 
935 int ip6_ins_rt(struct rt6_info *rt)
936 {
937 	struct nl_info info = {	.nl_net = dev_net(rt->dst.dev), };
938 	struct mx6_config mxc = { .mx = NULL, };
939 
940 	return __ip6_ins_rt(rt, &info, &mxc);
941 }
942 
/* Create an RTF_CACHE clone of @ort keyed to the exact destination
 * @daddr (and, with subtrees, source @saddr).  If @ort is itself a
 * clone, its origin route is cloned instead.  Returns NULL on
 * allocation failure.
 */
static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	/* clones are always made from the origin route */
	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
		ort = (struct rt6_info *)ort->dst.from;

	rt = __ip6_dst_alloc(dev_net(ort->dst.dev), ort->dst.dev,
			     0, ort->rt6i_table);

	if (!rt)
		return NULL;

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->rt6i_metric = 0;
	rt->dst.flags |= DST_HOST;
	/* the clone is a /128 host route for the exact destination */
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		if (ort->rt6i_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}
983 
984 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
985 {
986 	struct rt6_info *pcpu_rt;
987 
988 	pcpu_rt = __ip6_dst_alloc(dev_net(rt->dst.dev),
989 				  rt->dst.dev, rt->dst.flags,
990 				  rt->rt6i_table);
991 
992 	if (!pcpu_rt)
993 		return NULL;
994 	ip6_rt_copy_init(pcpu_rt, rt);
995 	pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
996 	pcpu_rt->rt6i_flags |= RTF_PCPU;
997 	return pcpu_rt;
998 }
999 
/* It should be called with read_lock_bh(&tb6_lock) acquired
 *
 * Return this CPU's cached clone of @rt, creating it on first use.
 * On allocation failure the null entry is returned instead.  The
 * returned route is held.
 */
static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

	if (pcpu_rt)
		goto done;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		struct net *net = dev_net(rt->dst.dev);

		pcpu_rt = net->ipv6.ip6_null_entry;
		goto done;
	}

	/* publish the clone; lose gracefully if another context raced us */
	prev = cmpxchg(p, NULL, pcpu_rt);
	if (prev) {
		/* If someone did it before us, return prev instead */
		dst_destroy(&pcpu_rt->dst);
		pcpu_rt = prev;
	}

done:
	dst_hold(&pcpu_rt->dst);
	rt6_dst_from_metrics_check(pcpu_rt);
	return pcpu_rt;
}
1031 
/* Full policy route lookup in one table: select the best route (with
 * backtracking and a reachability-relaxed retry), then hand back either
 * the route itself, an uncached RTF_CACHE clone (for FLOWI_FLAG_KNOWN_NH
 * without a gateway), or this CPU's percpu clone.  Always returns a
 * held dst.
 */
static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
				      struct flowi6 *fl6, int flags)
{
	struct fib6_node *fn, *saved_fn;
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	/* non-forwarding hosts prefer reachable routers */
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	read_lock_bh(&table->tb6_lock);

	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

redo_rt6_select:
	rt = rt6_select(fn, oif, strict);
	if (rt->rt6i_nsiblings)
		rt = rt6_multipath_select(rt, fl6, oif, strict);
	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}


	if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
		dst_use(&rt->dst, jiffies);
		read_unlock_bh(&table->tb6_lock);

		rt6_dst_from_metrics_check(rt);
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(rt->rt6i_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */

		struct rt6_info *uncached_rt;

		/* hold rt across the unlock while we clone it */
		dst_use(&rt->dst, jiffies);
		read_unlock_bh(&table->tb6_lock);

		uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
		dst_release(&rt->dst);

		if (uncached_rt)
			rt6_uncached_list_add(uncached_rt);
		else
			uncached_rt = net->ipv6.ip6_null_entry;

		dst_hold(&uncached_rt->dst);
		return uncached_rt;

	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		rt->dst.lastuse = jiffies;
		rt->dst.__use++;
		pcpu_rt = rt6_get_pcpu_route(rt);
		read_unlock_bh(&table->tb6_lock);

		return pcpu_rt;
	}
}
1108 
/* Input-path policy lookup: route by the flow's input interface. */
static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
					    struct flowi6 *fl6, int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
}
1114 
1115 static struct dst_entry *ip6_route_input_lookup(struct net *net,
1116 						struct net_device *dev,
1117 						struct flowi6 *fl6, int flags)
1118 {
1119 	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1120 		flags |= RT6_LOOKUP_F_IFACE;
1121 
1122 	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
1123 }
1124 
/* Route an incoming skb: build a flowi6 from its IPv6 header and attach
 * the looked-up dst to the skb.
 */
void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};

	skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
}
1141 
1142 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
1143 					     struct flowi6 *fl6, int flags)
1144 {
1145 	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
1146 }
1147 
1148 struct dst_entry *ip6_route_output(struct net *net, const struct sock *sk,
1149 				    struct flowi6 *fl6)
1150 {
1151 	int flags = 0;
1152 
1153 	fl6->flowi6_iif = LOOPBACK_IFINDEX;
1154 
1155 	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
1156 		flags |= RT6_LOOKUP_F_IFACE;
1157 
1158 	if (!ipv6_addr_any(&fl6->saddr))
1159 		flags |= RT6_LOOKUP_F_HAS_SADDR;
1160 	else if (sk)
1161 		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1162 
1163 	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
1164 }
1165 EXPORT_SYMBOL(ip6_route_output);
1166 
/* Build a "blackhole" copy of @dst_orig that silently discards all traffic
 * while keeping the original's metrics, addresses and flags.  Consumes the
 * caller's reference on @dst_orig; returns the new dst or ERR_PTR(-ENOMEM).
 */
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, DST_OBSOLETE_NONE, 0);
	if (rt) {
		new = &rt->dst;

		/* Zero the rt6_info tail that follows the embedded dst_entry. */
		memset(new + 1, 0, sizeof(*rt) - sizeof(*new));

		new->__use = 1;
		/* Both directions drop packets on the floor. */
		new->input = dst_discard;
		new->output = dst_discard_sk;

		if (dst_metrics_read_only(&ort->dst))
			new->_metrics = ort->dst._metrics;
		else
			dst_copy_metrics(new, &ort->dst);
		rt->rt6i_idev = ort->rt6i_idev;
		if (rt->rt6i_idev)
			in6_dev_hold(rt->rt6i_idev);

		rt->rt6i_gateway = ort->rt6i_gateway;
		rt->rt6i_flags = ort->rt6i_flags;
		rt->rt6i_metric = 0;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif

		/* NOTE(review): dst_free() here marks this non-cached entry
		 * for destruction once the initial reference taken by
		 * dst_alloc(..., 1, ...) is released — confirm against dst
		 * gc semantics.
		 */
		dst_free(new);
	}

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}
1205 
1206 /*
1207  *	Destination cache support functions
1208  */
1209 
1210 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1211 {
1212 	if (rt->dst.from &&
1213 	    dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
1214 		dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
1215 }
1216 
1217 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1218 {
1219 	if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie))
1220 		return NULL;
1221 
1222 	if (rt6_check_expired(rt))
1223 		return NULL;
1224 
1225 	return &rt->dst;
1226 }
1227 
1228 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1229 {
1230 	if (rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1231 	    rt6_check((struct rt6_info *)(rt->dst.from), cookie))
1232 		return &rt->dst;
1233 	else
1234 		return NULL;
1235 }
1236 
1237 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1238 {
1239 	struct rt6_info *rt;
1240 
1241 	rt = (struct rt6_info *) dst;
1242 
1243 	/* All IPV6 dsts are created with ->obsolete set to the value
1244 	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1245 	 * into this function always.
1246 	 */
1247 
1248 	rt6_dst_from_metrics_check(rt);
1249 
1250 	if ((rt->rt6i_flags & RTF_PCPU) || unlikely(dst->flags & DST_NOCACHE))
1251 		return rt6_dst_from_check(rt, cookie);
1252 	else
1253 		return rt6_check(rt, cookie);
1254 }
1255 
1256 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1257 {
1258 	struct rt6_info *rt = (struct rt6_info *) dst;
1259 
1260 	if (rt) {
1261 		if (rt->rt6i_flags & RTF_CACHE) {
1262 			if (rt6_check_expired(rt)) {
1263 				ip6_del_rt(rt);
1264 				dst = NULL;
1265 			}
1266 		} else {
1267 			dst_release(dst);
1268 			dst = NULL;
1269 		}
1270 	}
1271 	return dst;
1272 }
1273 
/* dst_ops->link_failure hook: report unreachability to the sender and
 * invalidate the route the failed skb was using.
 */
static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			/* Hold a reference across the delete; if ip6_del_rt()
			 * fails (route not in the tree) free the entry
			 * directly.
			 */
			dst_hold(&rt->dst);
			if (ip6_del_rt(rt))
				dst_free(&rt->dst);
		} else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) {
			/* Poison the node's sernum so the next rt6_check()
			 * fails and cached sockets revalidate their route.
			 */
			rt->rt6i_node->fn_sernum = -1;
		}
	}
}
1291 
1292 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
1293 {
1294 	struct net *net = dev_net(rt->dst.dev);
1295 
1296 	rt->rt6i_flags |= RTF_MODIFIED;
1297 	rt->rt6i_pmtu = mtu;
1298 	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
1299 }
1300 
/* Core PMTU update for @dst.  Cached (RTF_CACHE) routes are updated in
 * place; otherwise an RTF_CACHE clone keyed by the flow's addresses is
 * created and inserted so the smaller MTU only affects that destination.
 */
static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu)
{
	struct rt6_info *rt6 = (struct rt6_info *)dst;

	/* Local routes carry no path MTU state. */
	if (rt6->rt6i_flags & RTF_LOCAL)
		return;

	dst_confirm(dst);
	/* Clamp to the IPv6 minimum MTU and ignore non-shrinking updates. */
	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
	if (mtu >= dst_mtu(dst))
		return;

	if (rt6->rt6i_flags & RTF_CACHE) {
		rt6_do_update_pmtu(rt6, mtu);
	} else {
		const struct in6_addr *daddr, *saddr;
		struct rt6_info *nrt6;

		/* Key the clone by the offending packet's addresses when
		 * available, else by the socket's; bail if neither exists.
		 */
		if (iph) {
			daddr = &iph->daddr;
			saddr = &iph->saddr;
		} else if (sk) {
			daddr = &sk->sk_v6_daddr;
			saddr = &inet6_sk(sk)->saddr;
		} else {
			return;
		}
		nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
		if (nrt6) {
			rt6_do_update_pmtu(nrt6, mtu);

			/* ip6_ins_rt(nrt6) will bump the
			 * rt6->rt6i_node->fn_sernum
			 * which will fail the next rt6_check() and
			 * invalidate the sk->sk_dst_cache.
			 */
			ip6_ins_rt(nrt6);
		}
	}
}
1342 
1343 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1344 			       struct sk_buff *skb, u32 mtu)
1345 {
1346 	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
1347 }
1348 
1349 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
1350 		     int oif, u32 mark)
1351 {
1352 	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1353 	struct dst_entry *dst;
1354 	struct flowi6 fl6;
1355 
1356 	memset(&fl6, 0, sizeof(fl6));
1357 	fl6.flowi6_oif = oif;
1358 	fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
1359 	fl6.daddr = iph->daddr;
1360 	fl6.saddr = iph->saddr;
1361 	fl6.flowlabel = ip6_flowinfo(iph);
1362 
1363 	dst = ip6_route_output(net, NULL, &fl6);
1364 	if (!dst->error)
1365 		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
1366 	dst_release(dst);
1367 }
1368 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
1369 
1370 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
1371 {
1372 	ip6_update_pmtu(skb, sock_net(sk), mtu,
1373 			sk->sk_bound_dev_if, sk->sk_mark);
1374 }
1375 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
1376 
/* Handle redirects */

/* Flow key for redirect lookups: the flowi6 member must stay first so
 * __ip6_route_redirect() can cast the flowi6 * back to an ip6rd_flowi *.
 */
struct ip6rd_flowi {
	struct flowi6 fl6;
	struct in6_addr gateway;	/* router that sent the Redirect */
};
1382 
/* Policy-lookup callback used while processing a Redirect: find the route
 * currently used for rdfl->fl6.daddr whose nexthop is the router that sent
 * the redirect (rdfl->gateway) on the receiving interface.  Returns a held
 * route — the null entry when no acceptable route exists.
 */
static struct rt6_info *__ip6_route_redirect(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     int flags)
{
	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
	struct rt6_info *rt;
	struct fib6_node *fn;

	/* Get the "current" route for this destination and
	 * check if the redirect has come from approriate router.
	 *
	 * RFC 4861 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routes.
	 */

	read_lock_bh(&table->tb6_lock);
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
		if (rt6_check_expired(rt))
			continue;
		if (rt->dst.error)
			break;
		/* Only gateway routes on the redirect's interface whose
		 * nexthop is the announcing router can match.
		 */
		if (!(rt->rt6i_flags & RTF_GATEWAY))
			continue;
		if (fl6->flowi6_oif != rt->dst.dev->ifindex)
			continue;
		if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
			continue;
		break;
	}

	if (!rt)
		rt = net->ipv6.ip6_null_entry;
	else if (rt->dst.error) {
		rt = net->ipv6.ip6_null_entry;
		goto out;
	}

	/* Nothing matched under this node: back up to less specific
	 * prefixes and retry.
	 */
	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

out:
	/* Reference taken under the table lock; caller releases it. */
	dst_hold(&rt->dst);

	read_unlock_bh(&table->tb6_lock);

	return rt;
};
1439 
1440 static struct dst_entry *ip6_route_redirect(struct net *net,
1441 					const struct flowi6 *fl6,
1442 					const struct in6_addr *gateway)
1443 {
1444 	int flags = RT6_LOOKUP_F_HAS_SADDR;
1445 	struct ip6rd_flowi rdfl;
1446 
1447 	rdfl.fl6 = *fl6;
1448 	rdfl.gateway = *gateway;
1449 
1450 	return fib6_rule_lookup(net, &rdfl.fl6,
1451 				flags, __ip6_route_redirect);
1452 }
1453 
1454 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark)
1455 {
1456 	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1457 	struct dst_entry *dst;
1458 	struct flowi6 fl6;
1459 
1460 	memset(&fl6, 0, sizeof(fl6));
1461 	fl6.flowi6_iif = LOOPBACK_IFINDEX;
1462 	fl6.flowi6_oif = oif;
1463 	fl6.flowi6_mark = mark;
1464 	fl6.daddr = iph->daddr;
1465 	fl6.saddr = iph->saddr;
1466 	fl6.flowlabel = ip6_flowinfo(iph);
1467 
1468 	dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
1469 	rt6_do_redirect(dst, NULL, skb);
1470 	dst_release(dst);
1471 }
1472 EXPORT_SYMBOL_GPL(ip6_redirect);
1473 
1474 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
1475 			    u32 mark)
1476 {
1477 	const struct ipv6hdr *iph = ipv6_hdr(skb);
1478 	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
1479 	struct dst_entry *dst;
1480 	struct flowi6 fl6;
1481 
1482 	memset(&fl6, 0, sizeof(fl6));
1483 	fl6.flowi6_iif = LOOPBACK_IFINDEX;
1484 	fl6.flowi6_oif = oif;
1485 	fl6.flowi6_mark = mark;
1486 	fl6.daddr = msg->dest;
1487 	fl6.saddr = iph->daddr;
1488 
1489 	dst = ip6_route_redirect(net, &fl6, &iph->saddr);
1490 	rt6_do_redirect(dst, NULL, skb);
1491 	dst_release(dst);
1492 }
1493 
1494 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
1495 {
1496 	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark);
1497 }
1498 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
1499 
1500 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1501 {
1502 	struct net_device *dev = dst->dev;
1503 	unsigned int mtu = dst_mtu(dst);
1504 	struct net *net = dev_net(dev);
1505 
1506 	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1507 
1508 	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1509 		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1510 
1511 	/*
1512 	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1513 	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1514 	 * IPV6_MAXPLEN is also valid and means: "any MSS,
1515 	 * rely only on pmtu discovery"
1516 	 */
1517 	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1518 		mtu = IPV6_MAXPLEN;
1519 	return mtu;
1520 }
1521 
1522 static unsigned int ip6_mtu(const struct dst_entry *dst)
1523 {
1524 	const struct rt6_info *rt = (const struct rt6_info *)dst;
1525 	unsigned int mtu = rt->rt6i_pmtu;
1526 	struct inet6_dev *idev;
1527 
1528 	if (mtu)
1529 		goto out;
1530 
1531 	mtu = dst_metric_raw(dst, RTAX_MTU);
1532 	if (mtu)
1533 		goto out;
1534 
1535 	mtu = IPV6_MIN_MTU;
1536 
1537 	rcu_read_lock();
1538 	idev = __in6_dev_get(dst->dev);
1539 	if (idev)
1540 		mtu = idev->cnf.mtu6;
1541 	rcu_read_unlock();
1542 
1543 out:
1544 	return min_t(unsigned int, mtu, IP6_MAX_MTU);
1545 }
1546 
/* Singly-linked list of dsts handed out by icmp6_dst_alloc(); protected by
 * icmp6_dst_lock and reaped by icmp6_dst_gc()/icmp6_clean_all().
 */
static struct dst_entry *icmp6_dst_gc_list;
static DEFINE_SPINLOCK(icmp6_dst_lock);
1549 
/* Allocate a standalone (not fib-linked) route for sending an ICMPv6
 * packet toward fl6->daddr on @dev.  The entry is chained onto
 * icmp6_dst_gc_list for later garbage collection.  Returns the
 * xfrm-resolved dst or an ERR_PTR.
 */
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
				  struct flowi6 *fl6)
{
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct inet6_dev *idev = in6_dev_get(dev);
	struct net *net = dev_net(dev);

	if (unlikely(!idev))
		return ERR_PTR(-ENODEV);

	rt = ip6_dst_alloc(net, dev, 0, NULL);
	if (unlikely(!rt)) {
		in6_dev_put(idev);
		dst = ERR_PTR(-ENOMEM);
		goto out;
	}

	rt->dst.flags |= DST_HOST;
	rt->dst.output  = ip6_output;
	/* Caller's reference; icmp6_dst_gc() frees once it drops to zero. */
	atomic_set(&rt->dst.__refcnt, 1);
	rt->rt6i_gateway  = fl6->daddr;
	rt->rt6i_dst.addr = fl6->daddr;
	rt->rt6i_dst.plen = 128;
	rt->rt6i_idev     = idev;
	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);

	/* Link onto the global ICMPv6 dst list under its lock. */
	spin_lock_bh(&icmp6_dst_lock);
	rt->dst.next = icmp6_dst_gc_list;
	icmp6_dst_gc_list = &rt->dst;
	spin_unlock_bh(&icmp6_dst_lock);

	/* Ensure the fib gc timer runs so these entries get reaped. */
	fib6_force_start_gc(net);

	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);

out:
	return dst;
}
1589 
1590 int icmp6_dst_gc(void)
1591 {
1592 	struct dst_entry *dst, **pprev;
1593 	int more = 0;
1594 
1595 	spin_lock_bh(&icmp6_dst_lock);
1596 	pprev = &icmp6_dst_gc_list;
1597 
1598 	while ((dst = *pprev) != NULL) {
1599 		if (!atomic_read(&dst->__refcnt)) {
1600 			*pprev = dst->next;
1601 			dst_free(dst);
1602 		} else {
1603 			pprev = &dst->next;
1604 			++more;
1605 		}
1606 	}
1607 
1608 	spin_unlock_bh(&icmp6_dst_lock);
1609 
1610 	return more;
1611 }
1612 
1613 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1614 			    void *arg)
1615 {
1616 	struct dst_entry *dst, **pprev;
1617 
1618 	spin_lock_bh(&icmp6_dst_lock);
1619 	pprev = &icmp6_dst_gc_list;
1620 	while ((dst = *pprev) != NULL) {
1621 		struct rt6_info *rt = (struct rt6_info *) dst;
1622 		if (func(rt, arg)) {
1623 			*pprev = dst->next;
1624 			dst_free(dst);
1625 		} else {
1626 			pprev = &dst->next;
1627 		}
1628 	}
1629 	spin_unlock_bh(&icmp6_dst_lock);
1630 }
1631 
/* dst_ops->gc hook: run fib6 garbage collection when the dst count exceeds
 * ip6_rt_max_size or the minimum GC interval has elapsed.  ip6_rt_gc_expire
 * grows under pressure (shortening effective route lifetimes) and decays by
 * 1/2^elasticity on every call.  Returns nonzero while still over the size
 * limit.
 */
static int ip6_dst_gc(struct dst_ops *ops)
{
	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
	int entries;

	entries = dst_entries_get_fast(ops);
	/* Too soon since the last GC and not over the limit: do nothing. */
	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
	    entries <= rt_max_size)
		goto out;

	net->ipv6.ip6_rt_gc_expire++;
	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
	entries = dst_entries_get_slow(ops);
	/* Back under the threshold: reset the pressure value. */
	if (entries < ops->gc_thresh)
		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
out:
	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
	return entries > rt_max_size;
}
1656 
1657 static int ip6_convert_metrics(struct mx6_config *mxc,
1658 			       const struct fib6_config *cfg)
1659 {
1660 	struct nlattr *nla;
1661 	int remaining;
1662 	u32 *mp;
1663 
1664 	if (!cfg->fc_mx)
1665 		return 0;
1666 
1667 	mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1668 	if (unlikely(!mp))
1669 		return -ENOMEM;
1670 
1671 	nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1672 		int type = nla_type(nla);
1673 
1674 		if (type) {
1675 			u32 val;
1676 
1677 			if (unlikely(type > RTAX_MAX))
1678 				goto err;
1679 			if (type == RTAX_CC_ALGO) {
1680 				char tmp[TCP_CA_NAME_MAX];
1681 
1682 				nla_strlcpy(tmp, nla, sizeof(tmp));
1683 				val = tcp_ca_get_key_by_name(tmp);
1684 				if (val == TCP_CA_UNSPEC)
1685 					goto err;
1686 			} else {
1687 				val = nla_get_u32(nla);
1688 			}
1689 
1690 			mp[type - 1] = val;
1691 			__set_bit(type - 1, mxc->mx_valid);
1692 		}
1693 	}
1694 
1695 	mxc->mx = mp;
1696 
1697 	return 0;
1698  err:
1699 	kfree(mp);
1700 	return -EINVAL;
1701 }
1702 
/* Create and insert a route described by @cfg into its fib6 table.
 * Resolves the output device (directly, via the gateway, or promoted to
 * loopback for reject-type routes), fills in the rt6_info and inserts it
 * with any supplied metrics.  Returns 0 or a negative errno; on failure
 * all acquired references and the allocated route are released.
 */
int ip6_route_add(struct fib6_config *cfg)
{
	int err;
	struct net *net = cfg->fc_nlinfo.nl_net;
	struct rt6_info *rt = NULL;
	struct net_device *dev = NULL;
	struct inet6_dev *idev = NULL;
	struct fib6_table *table;
	struct mx6_config mxc = { .mx = NULL, };
	int addr_type;

	if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
		return -EINVAL;
#ifndef CONFIG_IPV6_SUBTREES
	if (cfg->fc_src_len)
		return -EINVAL;
#endif
	if (cfg->fc_ifindex) {
		err = -ENODEV;
		dev = dev_get_by_index(net, cfg->fc_ifindex);
		if (!dev)
			goto out;
		idev = in6_dev_get(dev);
		if (!idev)
			goto out;
	}

	if (cfg->fc_metric == 0)
		cfg->fc_metric = IP6_RT_PRIO_USER;

	err = -ENOBUFS;
	/* Without NLM_F_CREATE the target table should already exist, but a
	 * missing table is tolerated (with a warning) for compatibility.
	 */
	if (cfg->fc_nlinfo.nlh &&
	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
		table = fib6_get_table(net, cfg->fc_table);
		if (!table) {
			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
			table = fib6_new_table(net, cfg->fc_table);
		}
	} else {
		table = fib6_new_table(net, cfg->fc_table);
	}

	if (!table)
		goto out;

	rt = ip6_dst_alloc(net, NULL, (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT, table);

	if (!rt) {
		err = -ENOMEM;
		goto out;
	}

	if (cfg->fc_flags & RTF_EXPIRES)
		rt6_set_expires(rt, jiffies +
				clock_t_to_jiffies(cfg->fc_expires));
	else
		rt6_clean_expires(rt);

	if (cfg->fc_protocol == RTPROT_UNSPEC)
		cfg->fc_protocol = RTPROT_BOOT;
	rt->rt6i_protocol = cfg->fc_protocol;

	addr_type = ipv6_addr_type(&cfg->fc_dst);

	/* Select the input handler by destination class. */
	if (addr_type & IPV6_ADDR_MULTICAST)
		rt->dst.input = ip6_mc_input;
	else if (cfg->fc_flags & RTF_LOCAL)
		rt->dst.input = ip6_input;
	else
		rt->dst.input = ip6_forward;

	rt->dst.output = ip6_output;

	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
	rt->rt6i_dst.plen = cfg->fc_dst_len;
	if (rt->rt6i_dst.plen == 128)
		rt->dst.flags |= DST_HOST;

#ifdef CONFIG_IPV6_SUBTREES
	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
	rt->rt6i_src.plen = cfg->fc_src_len;
#endif

	rt->rt6i_metric = cfg->fc_metric;

	/* We cannot add true routes via loopback here,
	   they would result in kernel looping; promote them to reject routes
	 */
	if ((cfg->fc_flags & RTF_REJECT) ||
	    (dev && (dev->flags & IFF_LOOPBACK) &&
	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
	     !(cfg->fc_flags & RTF_LOCAL))) {
		/* hold loopback dev/idev if we haven't done so. */
		if (dev != net->loopback_dev) {
			if (dev) {
				dev_put(dev);
				in6_dev_put(idev);
			}
			dev = net->loopback_dev;
			dev_hold(dev);
			idev = in6_dev_get(dev);
			if (!idev) {
				err = -ENODEV;
				goto out;
			}
		}
		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
		/* Pick the discard handler and error code by reject type. */
		switch (cfg->fc_type) {
		case RTN_BLACKHOLE:
			rt->dst.error = -EINVAL;
			rt->dst.output = dst_discard_sk;
			rt->dst.input = dst_discard;
			break;
		case RTN_PROHIBIT:
			rt->dst.error = -EACCES;
			rt->dst.output = ip6_pkt_prohibit_out;
			rt->dst.input = ip6_pkt_prohibit;
			break;
		case RTN_THROW:
		default:
			rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
					: -ENETUNREACH;
			rt->dst.output = ip6_pkt_discard_out;
			rt->dst.input = ip6_pkt_discard;
			break;
		}
		goto install_route;
	}

	if (cfg->fc_flags & RTF_GATEWAY) {
		const struct in6_addr *gw_addr;
		int gwa_type;

		gw_addr = &cfg->fc_gateway;

		/* if gw_addr is local we will fail to detect this in case
		 * address is still TENTATIVE (DAD in progress). rt6_lookup()
		 * will return already-added prefix route via interface that
		 * prefix route was assigned to, which might be non-loopback.
		 */
		err = -EINVAL;
		if (ipv6_chk_addr_and_flags(net, gw_addr, NULL, 0, 0))
			goto out;

		rt->rt6i_gateway = *gw_addr;
		gwa_type = ipv6_addr_type(gw_addr);

		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
			struct rt6_info *grt;

			/* IPv6 strictly inhibits using not link-local
			   addresses as nexthop address.
			   Otherwise, router will not able to send redirects.
			   It is very good, but in some (rare!) circumstances
			   (SIT, PtP, NBMA NOARP links) it is handy to allow
			   some exceptions. --ANK
			 */
			if (!(gwa_type & IPV6_ADDR_UNICAST))
				goto out;

			/* The gateway itself must be reachable via a
			 * non-gateway (directly connected) route.
			 */
			grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);

			err = -EHOSTUNREACH;
			if (!grt)
				goto out;
			if (dev) {
				if (dev != grt->dst.dev) {
					ip6_rt_put(grt);
					goto out;
				}
			} else {
				/* Adopt device/idev from the gateway route. */
				dev = grt->dst.dev;
				idev = grt->rt6i_idev;
				dev_hold(dev);
				in6_dev_hold(grt->rt6i_idev);
			}
			if (!(grt->rt6i_flags & RTF_GATEWAY))
				err = 0;
			ip6_rt_put(grt);

			if (err)
				goto out;
		}
		err = -EINVAL;
		if (!dev || (dev->flags & IFF_LOOPBACK))
			goto out;
	}

	err = -ENODEV;
	if (!dev)
		goto out;

	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
		/* A requested preferred source must be configured on @dev. */
		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
			err = -EINVAL;
			goto out;
		}
		rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
		rt->rt6i_prefsrc.plen = 128;
	} else
		rt->rt6i_prefsrc.plen = 0;

	rt->rt6i_flags = cfg->fc_flags;

install_route:
	rt->dst.dev = dev;
	rt->rt6i_idev = idev;
	rt->rt6i_table = table;

	cfg->fc_nlinfo.nl_net = dev_net(dev);

	/* NOTE(review): if ip6_convert_metrics() fails here, the cleanup
	 * below does dev_put()/in6_dev_put() AND dst_free() after dev/idev
	 * were already assigned into rt — verify the dst destructor does
	 * not drop the same references again (possible double-put).
	 */
	err = ip6_convert_metrics(&mxc, cfg);
	if (err)
		goto out;

	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc);

	kfree(mxc.mx);
	return err;
out:
	if (dev)
		dev_put(dev);
	if (idev)
		in6_dev_put(idev);
	if (rt)
		dst_free(&rt->dst);
	return err;
}
1931 
1932 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1933 {
1934 	int err;
1935 	struct fib6_table *table;
1936 	struct net *net = dev_net(rt->dst.dev);
1937 
1938 	if (rt == net->ipv6.ip6_null_entry) {
1939 		err = -ENOENT;
1940 		goto out;
1941 	}
1942 
1943 	table = rt->rt6i_table;
1944 	write_lock_bh(&table->tb6_lock);
1945 	err = fib6_del(rt, info);
1946 	write_unlock_bh(&table->tb6_lock);
1947 
1948 out:
1949 	ip6_rt_put(rt);
1950 	return err;
1951 }
1952 
1953 int ip6_del_rt(struct rt6_info *rt)
1954 {
1955 	struct nl_info info = {
1956 		.nl_net = dev_net(rt->dst.dev),
1957 	};
1958 	return __ip6_del_rt(rt, &info);
1959 }
1960 
/* Delete the first route matching @cfg (destination/source prefix plus
 * optional ifindex, gateway and metric filters) from its table.
 * Returns -ESRCH when nothing matches.
 */
static int ip6_route_del(struct fib6_config *cfg)
{
	struct fib6_table *table;
	struct fib6_node *fn;
	struct rt6_info *rt;
	int err = -ESRCH;

	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
	if (!table)
		return err;

	read_lock_bh(&table->tb6_lock);

	fn = fib6_locate(&table->tb6_root,
			 &cfg->fc_dst, cfg->fc_dst_len,
			 &cfg->fc_src, cfg->fc_src_len);

	if (fn) {
		for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
			/* Cached clones are only removable when explicitly
			 * requested via RTF_CACHE.
			 */
			if ((rt->rt6i_flags & RTF_CACHE) &&
			    !(cfg->fc_flags & RTF_CACHE))
				continue;
			if (cfg->fc_ifindex &&
			    (!rt->dst.dev ||
			     rt->dst.dev->ifindex != cfg->fc_ifindex))
				continue;
			if (cfg->fc_flags & RTF_GATEWAY &&
			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
				continue;
			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
				continue;
			/* Hold the route across the unlock; __ip6_del_rt()
			 * re-takes the table lock for write and consumes
			 * this reference.
			 */
			dst_hold(&rt->dst);
			read_unlock_bh(&table->tb6_lock);

			return __ip6_del_rt(rt, &cfg->fc_nlinfo);
		}
	}
	read_unlock_bh(&table->tb6_lock);

	return err;
}
2002 
/* Process a validated ICMPv6 Redirect for the route attached to @dst:
 * update the neighbour cache for the new first hop and install an
 * RTF_CACHE clone pointing at it, replacing any previously cached route.
 */
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct net *net = dev_net(skb->dev);
	struct netevent_redirect netevent;
	struct rt6_info *rt, *nrt = NULL;
	struct ndisc_options ndopts;
	struct inet6_dev *in6_dev;
	struct neighbour *neigh;
	struct rd_msg *msg;
	int optlen, on_link;
	u8 *lladdr;

	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
	optlen -= sizeof(*msg);

	if (optlen < 0) {
		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
		return;
	}

	msg = (struct rd_msg *)icmp6_hdr(skb);

	if (ipv6_addr_is_multicast(&msg->dest)) {
		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
		return;
	}

	/* Target == destination means the destination is on-link; otherwise
	 * the target must be a link-local unicast router address.
	 */
	on_link = 0;
	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
		on_link = 1;
	} else if (ipv6_addr_type(&msg->target) !=
		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
		return;
	}

	in6_dev = __in6_dev_get(skb->dev);
	if (!in6_dev)
		return;
	/* Routers and interfaces configured to ignore redirects drop them. */
	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
		return;

	/* RFC2461 8.1:
	 *	The IP source address of the Redirect MUST be the same as the current
	 *	first-hop router for the specified ICMP Destination Address.
	 */

	if (!ndisc_parse_options(msg->opt, optlen, &ndopts)) {
		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
		return;
	}

	lladdr = NULL;
	if (ndopts.nd_opts_tgt_lladdr) {
		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
					     skb->dev);
		if (!lladdr) {
			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
			return;
		}
	}

	rt = (struct rt6_info *) dst;
	if (rt == net->ipv6.ip6_null_entry) {
		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
		return;
	}

	/* Redirect received -> path was valid.
	 * Look, redirects are sent only in response to data packets,
	 * so that this nexthop apparently is reachable. --ANK
	 */
	dst_confirm(&rt->dst);

	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
	if (!neigh)
		return;

	/*
	 *	We have finally decided to accept it.
	 */

	neigh_update(neigh, lladdr, NUD_STALE,
		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
		     NEIGH_UPDATE_F_OVERRIDE|
		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
				     NEIGH_UPDATE_F_ISROUTER))
		     );

	nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
	if (!nrt)
		goto out;

	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
	if (on_link)
		nrt->rt6i_flags &= ~RTF_GATEWAY;

	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;

	/* NOTE(review): on ip6_ins_rt() failure the clone appears to be
	 * dropped by the insert path itself — confirm no leak here.
	 */
	if (ip6_ins_rt(nrt))
		goto out;

	netevent.old = &rt->dst;
	netevent.new = &nrt->dst;
	netevent.daddr = &msg->dest;
	netevent.neigh = neigh;
	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);

	/* The previous cached entry for this destination is now stale:
	 * take a reference (ip6_del_rt consumes it) and remove it.
	 */
	if (rt->rt6i_flags & RTF_CACHE) {
		rt = (struct rt6_info *) dst_clone(&rt->dst);
		ip6_del_rt(rt);
	}

out:
	neigh_release(neigh);
}
2119 
2120 /*
2121  *	Misc support functions
2122  */
2123 
/* Link clone @rt to its parent @from: pin @from with a dst reference,
 * share its metrics read-only, and clear RTF_EXPIRES so the clone's
 * lifetime follows the parent.  @from must not itself be derived from
 * another route.
 */
static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
{
	BUG_ON(from->dst.from);

	rt->rt6i_flags &= ~RTF_EXPIRES;
	dst_hold(&from->dst);
	rt->dst.from = &from->dst;
	dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
}
2133 
/* Initialize clone @rt from parent route @ort: copy handlers, addresses,
 * flags and device info, and attach the clone to its parent via
 * rt6_set_from() so it shares the parent's metrics and lifetime.
 */
static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
{
	rt->dst.input = ort->dst.input;
	rt->dst.output = ort->dst.output;
	rt->rt6i_dst = ort->rt6i_dst;
	rt->dst.error = ort->dst.error;
	rt->rt6i_idev = ort->rt6i_idev;
	if (rt->rt6i_idev)
		in6_dev_hold(rt->rt6i_idev);
	rt->dst.lastuse = jiffies;
	rt->rt6i_gateway = ort->rt6i_gateway;
	rt->rt6i_flags = ort->rt6i_flags;
	/* Order matters: rt6_set_from() clears RTF_EXPIRES from the flags
	 * copied just above.
	 */
	rt6_set_from(rt, ort);
	rt->rt6i_metric = ort->rt6i_metric;
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->rt6i_src;
#endif
	rt->rt6i_prefsrc = ort->rt6i_prefsrc;
	rt->rt6i_table = ort->rt6i_table;
}
2154 
2155 #ifdef CONFIG_IPV6_ROUTE_INFO
2156 static struct rt6_info *rt6_get_route_info(struct net *net,
2157 					   const struct in6_addr *prefix, int prefixlen,
2158 					   const struct in6_addr *gwaddr, int ifindex)
2159 {
2160 	struct fib6_node *fn;
2161 	struct rt6_info *rt = NULL;
2162 	struct fib6_table *table;
2163 
2164 	table = fib6_get_table(net, RT6_TABLE_INFO);
2165 	if (!table)
2166 		return NULL;
2167 
2168 	read_lock_bh(&table->tb6_lock);
2169 	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0);
2170 	if (!fn)
2171 		goto out;
2172 
2173 	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2174 		if (rt->dst.dev->ifindex != ifindex)
2175 			continue;
2176 		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
2177 			continue;
2178 		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
2179 			continue;
2180 		dst_hold(&rt->dst);
2181 		break;
2182 	}
2183 out:
2184 	read_unlock_bh(&table->tb6_lock);
2185 	return rt;
2186 }
2187 
2188 static struct rt6_info *rt6_add_route_info(struct net *net,
2189 					   const struct in6_addr *prefix, int prefixlen,
2190 					   const struct in6_addr *gwaddr, int ifindex,
2191 					   unsigned int pref)
2192 {
2193 	struct fib6_config cfg = {
2194 		.fc_table	= RT6_TABLE_INFO,
2195 		.fc_metric	= IP6_RT_PRIO_USER,
2196 		.fc_ifindex	= ifindex,
2197 		.fc_dst_len	= prefixlen,
2198 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
2199 				  RTF_UP | RTF_PREF(pref),
2200 		.fc_nlinfo.portid = 0,
2201 		.fc_nlinfo.nlh = NULL,
2202 		.fc_nlinfo.nl_net = net,
2203 	};
2204 
2205 	cfg.fc_dst = *prefix;
2206 	cfg.fc_gateway = *gwaddr;
2207 
2208 	/* We should treat it as a default route if prefix length is 0. */
2209 	if (!prefixlen)
2210 		cfg.fc_flags |= RTF_DEFAULT;
2211 
2212 	ip6_route_add(&cfg);
2213 
2214 	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
2215 }
2216 #endif
2217 
2218 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
2219 {
2220 	struct rt6_info *rt;
2221 	struct fib6_table *table;
2222 
2223 	table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
2224 	if (!table)
2225 		return NULL;
2226 
2227 	read_lock_bh(&table->tb6_lock);
2228 	for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2229 		if (dev == rt->dst.dev &&
2230 		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
2231 		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
2232 			break;
2233 	}
2234 	if (rt)
2235 		dst_hold(&rt->dst);
2236 	read_unlock_bh(&table->tb6_lock);
2237 	return rt;
2238 }
2239 
/* Install a default route via @gwaddr on @dev as learnt from a router
 * advertisement (RTF_ADDRCONF | RTF_DEFAULT), with router preference
 * @pref and an expiry (RTF_EXPIRES).
 *
 * Returns the freshly looked-up route (reference held) so the caller
 * can update it, or NULL if the add failed.
 */
struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
				     struct net_device *dev,
				     unsigned int pref)
{
	struct fib6_config cfg = {
		.fc_table	= RT6_TABLE_DFLT,
		.fc_metric	= IP6_RT_PRIO_USER,
		.fc_ifindex	= dev->ifindex,
		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
		.fc_nlinfo.portid = 0,
		.fc_nlinfo.nlh = NULL,
		.fc_nlinfo.nl_net = dev_net(dev),
	};

	cfg.fc_gateway = *gwaddr;

	ip6_route_add(&cfg);

	/* Re-lookup rather than using ip6_route_add()'s result so the
	 * returned entry carries a proper reference.
	 */
	return rt6_get_dflt_router(gwaddr, dev);
}
2261 
/* Remove all RA-learnt default routes in @net, except on interfaces
 * with accept_ra == 2 (which accept RAs even when forwarding).
 *
 * ip6_del_rt() cannot be called under the table read lock, so each
 * deletion drops the lock and the scan restarts from the beginning.
 */
void rt6_purge_dflt_routers(struct net *net)
{
	struct rt6_info *rt;
	struct fib6_table *table;

	/* NOTE: Keep consistent with rt6_get_dflt_router */
	table = fib6_get_table(net, RT6_TABLE_DFLT);
	if (!table)
		return;

restart:
	read_lock_bh(&table->tb6_lock);
	for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
		    (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
			/* Hold the route across the unlock so ip6_del_rt()
			 * operates on a live entry.
			 */
			dst_hold(&rt->dst);
			read_unlock_bh(&table->tb6_lock);
			ip6_del_rt(rt);
			goto restart;
		}
	}
	read_unlock_bh(&table->tb6_lock);
}
2285 
2286 static void rtmsg_to_fib6_config(struct net *net,
2287 				 struct in6_rtmsg *rtmsg,
2288 				 struct fib6_config *cfg)
2289 {
2290 	memset(cfg, 0, sizeof(*cfg));
2291 
2292 	cfg->fc_table = RT6_TABLE_MAIN;
2293 	cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
2294 	cfg->fc_metric = rtmsg->rtmsg_metric;
2295 	cfg->fc_expires = rtmsg->rtmsg_info;
2296 	cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
2297 	cfg->fc_src_len = rtmsg->rtmsg_src_len;
2298 	cfg->fc_flags = rtmsg->rtmsg_flags;
2299 
2300 	cfg->fc_nlinfo.nl_net = net;
2301 
2302 	cfg->fc_dst = rtmsg->rtmsg_dst;
2303 	cfg->fc_src = rtmsg->rtmsg_src;
2304 	cfg->fc_gateway = rtmsg->rtmsg_gateway;
2305 }
2306 
2307 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
2308 {
2309 	struct fib6_config cfg;
2310 	struct in6_rtmsg rtmsg;
2311 	int err;
2312 
2313 	switch (cmd) {
2314 	case SIOCADDRT:		/* Add a route */
2315 	case SIOCDELRT:		/* Delete a route */
2316 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
2317 			return -EPERM;
2318 		err = copy_from_user(&rtmsg, arg,
2319 				     sizeof(struct in6_rtmsg));
2320 		if (err)
2321 			return -EFAULT;
2322 
2323 		rtmsg_to_fib6_config(net, &rtmsg, &cfg);
2324 
2325 		rtnl_lock();
2326 		switch (cmd) {
2327 		case SIOCADDRT:
2328 			err = ip6_route_add(&cfg);
2329 			break;
2330 		case SIOCDELRT:
2331 			err = ip6_route_del(&cfg);
2332 			break;
2333 		default:
2334 			err = -EINVAL;
2335 		}
2336 		rtnl_unlock();
2337 
2338 		return err;
2339 	}
2340 
2341 	return -EINVAL;
2342 }
2343 
2344 /*
2345  *	Drop the packet on the floor
2346  */
2347 
/* Common helper for the reject dst entries: bump the relevant SNMP
 * counter, send an ICMPv6 destination-unreachable with @code, and drop
 * the packet.  @ipstats_mib_noroutes selects the in/out no-route MIB.
 */
static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
{
	int type;
	struct dst_entry *dst = skb_dst(skb);
	switch (ipstats_mib_noroutes) {
	case IPSTATS_MIB_INNOROUTES:
		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
		if (type == IPV6_ADDR_ANY) {
			/* An unspecified destination counts as an address
			 * error, not a routing failure.
			 */
			IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
				      IPSTATS_MIB_INADDRERRORS);
			break;
		}
		/* FALLTHROUGH */
	case IPSTATS_MIB_OUTNOROUTES:
		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
			      ipstats_mib_noroutes);
		break;
	}
	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
	kfree_skb(skb);
	return 0;
}
2370 
/* dst input handler for the blackhole/null entry: drop with "no route". */
static int ip6_pkt_discard(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}
2375 
/* dst output handler for the blackhole/null entry: drop with "no route". */
static int ip6_pkt_discard_out(struct sock *sk, struct sk_buff *skb)
{
	/* Point skb->dev at the output device before generating the ICMP. */
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
}
2381 
/* dst input handler for prohibit routes: drop, "administratively prohibited". */
static int ip6_pkt_prohibit(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}
2386 
/* dst output handler for prohibit routes: drop, "administratively prohibited". */
static int ip6_pkt_prohibit_out(struct sock *sk, struct sk_buff *skb)
{
	/* Point skb->dev at the output device before generating the ICMP. */
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
}
2392 
2393 /*
2394  *	Allocate a dst for local (unicast / anycast) address.
2395  */
2396 
/* Allocate a host route (plen 128) for a local unicast or anycast
 * address on @idev.  The route is bound to the loopback device and
 * flagged RTF_ANYCAST or RTF_LOCAL according to @anycast.
 *
 * Takes a reference on @idev and returns the new route with refcount 1,
 * or ERR_PTR(-ENOMEM) on allocation failure.
 */
struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
				    const struct in6_addr *addr,
				    bool anycast)
{
	struct net *net = dev_net(idev->dev);
	struct rt6_info *rt = ip6_dst_alloc(net, net->loopback_dev,
					    DST_NOCOUNT, NULL);
	if (!rt)
		return ERR_PTR(-ENOMEM);

	in6_dev_hold(idev);

	rt->dst.flags |= DST_HOST;
	rt->dst.input = ip6_input;
	rt->dst.output = ip6_output;
	rt->rt6i_idev = idev;

	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
	if (anycast)
		rt->rt6i_flags |= RTF_ANYCAST;
	else
		rt->rt6i_flags |= RTF_LOCAL;

	/* Local routes have no nexthop: gateway == destination. */
	rt->rt6i_gateway  = *addr;
	rt->rt6i_dst.addr = *addr;
	rt->rt6i_dst.plen = 128;
	rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);

	atomic_set(&rt->dst.__refcnt, 1);

	return rt;
}
2429 
2430 int ip6_route_get_saddr(struct net *net,
2431 			struct rt6_info *rt,
2432 			const struct in6_addr *daddr,
2433 			unsigned int prefs,
2434 			struct in6_addr *saddr)
2435 {
2436 	struct inet6_dev *idev =
2437 		rt ? ip6_dst_idev((struct dst_entry *)rt) : NULL;
2438 	int err = 0;
2439 	if (rt && rt->rt6i_prefsrc.plen)
2440 		*saddr = rt->rt6i_prefsrc.addr;
2441 	else
2442 		err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2443 					 daddr, prefs, saddr);
2444 	return err;
2445 }
2446 
2447 /* remove deleted ip from prefsrc entries */
/* remove deleted ip from prefsrc entries */
struct arg_dev_net_ip {
	struct net_device *dev;	/* restrict to this device; NULL matches all */
	struct net *net;	/* namespace being cleaned */
	struct in6_addr *addr;	/* the address being removed */
};
2453 
2454 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2455 {
2456 	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2457 	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2458 	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2459 
2460 	if (((void *)rt->dst.dev == dev || !dev) &&
2461 	    rt != net->ipv6.ip6_null_entry &&
2462 	    ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2463 		/* remove prefsrc entry */
2464 		rt->rt6i_prefsrc.plen = 0;
2465 	}
2466 	return 0;
2467 }
2468 
2469 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2470 {
2471 	struct net *net = dev_net(ifp->idev->dev);
2472 	struct arg_dev_net_ip adni = {
2473 		.dev = ifp->idev->dev,
2474 		.net = net,
2475 		.addr = &ifp->addr,
2476 	};
2477 	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
2478 }
2479 
/* Flag combinations matched by fib6_clean_tohost(): a default router
 * learnt via RA, and a cached (cloned) route through a gateway.
 */
#define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)
2482 
2483 /* Remove routers and update dst entries when gateway turn into host. */
2484 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
2485 {
2486 	struct in6_addr *gateway = (struct in6_addr *)arg;
2487 
2488 	if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) ||
2489 	     ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) &&
2490 	     ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
2491 		return -1;
2492 	}
2493 	return 0;
2494 }
2495 
/* Drop all routes that treat @gateway as a router; called when the node
 * at that address stops acting as one.
 */
void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
{
	fib6_clean_all(net, fib6_clean_tohost, gateway);
}
2500 
/* Argument bundle for fib6_ifdown(). */
struct arg_dev_net {
	struct net_device *dev;	/* device going down; NULL matches all */
	struct net *net;	/* namespace being cleaned */
};
2505 
2506 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2507 {
2508 	const struct arg_dev_net *adn = arg;
2509 	const struct net_device *dev = adn->dev;
2510 
2511 	if ((rt->dst.dev == dev || !dev) &&
2512 	    rt != adn->net->ipv6.ip6_null_entry)
2513 		return -1;
2514 
2515 	return 0;
2516 }
2517 
/* A device is going down (or the namespace is exiting when @dev is
 * NULL): purge its routes from the FIB, from the ICMP rate-limit dsts,
 * and from the uncached-route list.
 */
void rt6_ifdown(struct net *net, struct net_device *dev)
{
	struct arg_dev_net adn = {
		.dev = dev,
		.net = net,
	};

	fib6_clean_all(net, fib6_ifdown, &adn);
	icmp6_clean_all(fib6_ifdown, &adn);
	rt6_uncached_list_flush_dev(net, dev);
}
2529 
/* Argument bundle for rt6_mtu_change_route(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* the new MTU */
};
2534 
/* fib6_clean_all() callback for rt6_mtu_change(): propagate a device
 * MTU change into the routes that use the device.  Always returns 0 —
 * routes are updated in place, never deleted.
 */
static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
{
	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
	struct inet6_dev *idev;

	/* In IPv6 pmtu discovery is not optional,
	   so that RTAX_MTU lock cannot disable it.
	   We still use this lock to block changes
	   caused by addrconf/ndisc.
	*/

	idev = __in6_dev_get(arg->dev);
	if (!idev)
		return 0;

	/* For administrative MTU increase, there is no way to discover
	   IPv6 PMTU increase, so PMTU increase should be updated here.
	   Since RFC 1981 doesn't include administrative MTU increase
	   update PMTU increase is a MUST. (i.e. jumbo frame)
	 */
	/*
	   If new MTU is less than route PMTU, this new MTU will be the
	   lowest MTU in the path, update the route PMTU to reflect PMTU
	   decreases; if new MTU is greater than route PMTU, and the
	   old MTU is the lowest MTU in the path, update the route PMTU
	   to reflect the increase. In this case if the other nodes' MTU
	   also have the lowest MTU, TOO BIG MESSAGE will be lead to
	   PMTU discouvery.
	 */
	if (rt->dst.dev == arg->dev &&
	    !dst_metric_locked(&rt->dst, RTAX_MTU)) {
		if (rt->rt6i_flags & RTF_CACHE) {
			/* For RTF_CACHE with rt6i_pmtu == 0
			 * (i.e. a redirected route),
			 * the metrics of its rt->dst.from has already
			 * been updated.
			 */
			if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu)
				rt->rt6i_pmtu = arg->mtu;
		} else if (dst_mtu(&rt->dst) >= arg->mtu ||
			   (dst_mtu(&rt->dst) < arg->mtu &&
			    dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
			dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
		}
	}
	return 0;
}
2582 
/* Device MTU changed: walk the whole FIB and update affected routes. */
void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
{
	struct rt6_mtu_change_arg arg = {
		.dev = dev,
		.mtu = mtu,
	};

	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
}
2592 
/* Netlink attribute validation policy for RTM_{NEW,DEL,GET}ROUTE.
 * Attributes not listed here are accepted with only generic checks.
 */
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
	[RTA_OIF]               = { .type = NLA_U32 },
	[RTA_IIF]		= { .type = NLA_U32 },
	[RTA_PRIORITY]          = { .type = NLA_U32 },
	[RTA_METRICS]           = { .type = NLA_NESTED },
	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
	[RTA_PREF]              = { .type = NLA_U8 },
};
2602 
/* Parse an RTM_NEWROUTE/RTM_DELROUTE netlink message into @cfg.
 *
 * Validates attributes against rtm_ipv6_policy, translates the rtmsg
 * header fields and the per-attribute payloads, and records the netlink
 * request info for later notifications.  Note that fc_mx/fc_mp point
 * into the skb — they are only valid while the message is.
 *
 * Returns 0 on success or a negative errno (attribute too short,
 * parse failure).
 */
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct fib6_config *cfg)
{
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	unsigned int pref;
	int err;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	rtm = nlmsg_data(nlh);
	memset(cfg, 0, sizeof(*cfg));

	cfg->fc_table = rtm->rtm_table;
	cfg->fc_dst_len = rtm->rtm_dst_len;
	cfg->fc_src_len = rtm->rtm_src_len;
	cfg->fc_flags = RTF_UP;
	cfg->fc_protocol = rtm->rtm_protocol;
	cfg->fc_type = rtm->rtm_type;

	/* All reject-style route types map onto RTF_REJECT; the exact
	 * type is preserved in fc_type.
	 */
	if (rtm->rtm_type == RTN_UNREACHABLE ||
	    rtm->rtm_type == RTN_BLACKHOLE ||
	    rtm->rtm_type == RTN_PROHIBIT ||
	    rtm->rtm_type == RTN_THROW)
		cfg->fc_flags |= RTF_REJECT;

	if (rtm->rtm_type == RTN_LOCAL)
		cfg->fc_flags |= RTF_LOCAL;

	if (rtm->rtm_flags & RTM_F_CLONED)
		cfg->fc_flags |= RTF_CACHE;

	cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
	cfg->fc_nlinfo.nlh = nlh;
	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);

	if (tb[RTA_GATEWAY]) {
		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
		cfg->fc_flags |= RTF_GATEWAY;
	}

	if (tb[RTA_DST]) {
		/* The attribute only needs to carry the significant
		 * prefix bytes, not a full in6_addr.
		 */
		int plen = (rtm->rtm_dst_len + 7) >> 3;

		if (nla_len(tb[RTA_DST]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
	}

	if (tb[RTA_SRC]) {
		int plen = (rtm->rtm_src_len + 7) >> 3;

		if (nla_len(tb[RTA_SRC]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
	}

	if (tb[RTA_PREFSRC])
		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);

	if (tb[RTA_OIF])
		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_PRIORITY])
		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

	if (tb[RTA_METRICS]) {
		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
	}

	if (tb[RTA_TABLE])
		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

	if (tb[RTA_MULTIPATH]) {
		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
	}

	if (tb[RTA_PREF]) {
		/* Unknown preference values fall back to medium (RFC 4191). */
		pref = nla_get_u8(tb[RTA_PREF]);
		if (pref != ICMPV6_ROUTER_PREF_LOW &&
		    pref != ICMPV6_ROUTER_PREF_HIGH)
			pref = ICMPV6_ROUTER_PREF_MEDIUM;
		cfg->fc_flags |= RTF_PREF(pref);
	}

	err = 0;
errout:
	return err;
}
2699 
/* Add (@add != 0) or delete each nexthop of an RTA_MULTIPATH request.
 *
 * On a failed add, already-added nexthops are rolled back: @add is
 * flipped to 0 and @remaining is rewound so that only the entries
 * processed so far (cfg->fc_mp_len - remaining bytes) are walked again,
 * this time deleting.  Returns the last error seen, or 0.
 */
static int ip6_route_multipath(struct fib6_config *cfg, int add)
{
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	int remaining;
	int attrlen;
	int err = 0, last_err = 0;

	remaining = cfg->fc_mp_len;
beginning:
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry */
	while (rtnh_ok(rtnh, remaining)) {
		/* Each nexthop inherits the base config, overridden by
		 * its own ifindex/gateway.
		 */
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				r_cfg.fc_gateway = nla_get_in6_addr(nla);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
		}
		err = add ? ip6_route_add(&r_cfg) : ip6_route_del(&r_cfg);
		if (err) {
			last_err = err;
			/* If we are trying to remove a route, do not stop the
			 * loop when ip6_route_del() fails (because next hop is
			 * already gone), we should try to remove all next hops.
			 */
			if (add) {
				/* If add fails, we should try to delete all
				 * next hops that have been already added.
				 */
				add = 0;
				remaining = cfg->fc_mp_len - remaining;
				goto beginning;
			}
		}
		/* Because each route is added like a single route we remove
		 * these flags after the first nexthop: if there is a collision,
		 * we have already failed to add the first nexthop:
		 * fib6_add_rt2node() has rejected it; when replacing, old
		 * nexthops have been replaced by first new, the rest should
		 * be added to it.
		 */
		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
						     NLM_F_REPLACE);
		rtnh = rtnh_next(rtnh, &remaining);
	}

	return last_err;
}
2758 
2759 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh)
2760 {
2761 	struct fib6_config cfg;
2762 	int err;
2763 
2764 	err = rtm_to_fib6_config(skb, nlh, &cfg);
2765 	if (err < 0)
2766 		return err;
2767 
2768 	if (cfg.fc_mp)
2769 		return ip6_route_multipath(&cfg, 0);
2770 	else
2771 		return ip6_route_del(&cfg);
2772 }
2773 
2774 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
2775 {
2776 	struct fib6_config cfg;
2777 	int err;
2778 
2779 	err = rtm_to_fib6_config(skb, nlh, &cfg);
2780 	if (err < 0)
2781 		return err;
2782 
2783 	if (cfg.fc_mp)
2784 		return ip6_route_multipath(&cfg, 1);
2785 	else
2786 		return ip6_route_add(&cfg);
2787 }
2788 
/* Worst-case netlink message size for one route, used to size the
 * notification skb in inet6_rt_notify().  Must cover every attribute
 * rt6_fill_node() can emit, or the fill will fail with -EMSGSIZE.
 */
static inline size_t rt6_nlmsg_size(void)
{
	return NLMSG_ALIGN(sizeof(struct rtmsg))
	       + nla_total_size(16) /* RTA_SRC */
	       + nla_total_size(16) /* RTA_DST */
	       + nla_total_size(16) /* RTA_GATEWAY */
	       + nla_total_size(16) /* RTA_PREFSRC */
	       + nla_total_size(4) /* RTA_TABLE */
	       + nla_total_size(4) /* RTA_IIF */
	       + nla_total_size(4) /* RTA_OIF */
	       + nla_total_size(4) /* RTA_PRIORITY */
	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
	       + nla_total_size(sizeof(struct rta_cacheinfo))
	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
	       + nla_total_size(1); /* RTA_PREF */
}
2805 
/* Serialize route @rt into an RTM message on @skb.
 *
 * @dst/@src: resolved addresses for a getroute reply (NULL when dumping).
 * @iif: input ifindex for a route-by-input lookup, 0 otherwise.
 * @prefix: skip non-RTF_PREFIX_RT routes (returns 1 = "success, skipped").
 * @nowait: passed to ip6mr_get_route() for multicast resolution.
 *
 * Returns 0 on success, 1 when the route was filtered out, or
 * -EMSGSIZE when the skb has no room (the partial message is cancelled).
 */
static int rt6_fill_node(struct net *net,
			 struct sk_buff *skb, struct rt6_info *rt,
			 struct in6_addr *dst, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 int prefix, int nowait, unsigned int flags)
{
	u32 metrics[RTAX_MAX];
	struct rtmsg *rtm;
	struct nlmsghdr *nlh;
	long expires;
	u32 table;

	if (prefix) {	/* user wants prefix routes only */
		if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
			/* success since this is not a prefix route */
			return 1;
		}
	}

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt->rt6i_dst.plen;
	rtm->rtm_src_len = rt->rt6i_src.plen;
	rtm->rtm_tos = 0;
	if (rt->rt6i_table)
		table = rt->rt6i_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	rtm->rtm_table = table;
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;
	/* Map the reject route's stored error back to the route type
	 * that created it.
	 */
	if (rt->rt6i_flags & RTF_REJECT) {
		switch (rt->dst.error) {
		case -EINVAL:
			rtm->rtm_type = RTN_BLACKHOLE;
			break;
		case -EACCES:
			rtm->rtm_type = RTN_PROHIBIT;
			break;
		case -EAGAIN:
			rtm->rtm_type = RTN_THROW;
			break;
		default:
			rtm->rtm_type = RTN_UNREACHABLE;
			break;
		}
	}
	else if (rt->rt6i_flags & RTF_LOCAL)
		rtm->rtm_type = RTN_LOCAL;
	else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
		rtm->rtm_type = RTN_LOCAL;
	else
		rtm->rtm_type = RTN_UNICAST;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->rt6i_protocol;
	if (rt->rt6i_flags & RTF_DYNAMIC)
		rtm->rtm_protocol = RTPROT_REDIRECT;
	else if (rt->rt6i_flags & RTF_ADDRCONF) {
		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ROUTEINFO))
			rtm->rtm_protocol = RTPROT_RA;
		else
			rtm->rtm_protocol = RTPROT_KERNEL;
	}

	if (rt->rt6i_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	/* For a getroute reply report the resolved destination (full
	 * /128); otherwise report the route's own prefix.
	 */
	if (dst) {
		if (nla_put_in6_addr(skb, RTA_DST, dst))
			goto nla_put_failure;
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
			goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		if (nla_put_in6_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
		goto nla_put_failure;
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
			int err = ip6mr_get_route(net, skb, rtm, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
				}
			}
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
	} else if (dst) {
		struct in6_addr saddr_buf;
		if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	if (rt->rt6i_prefsrc.plen) {
		struct in6_addr saddr_buf;
		saddr_buf = rt->rt6i_prefsrc.addr;
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	/* Report the per-route PMTU override in place of the generic
	 * MTU metric when one is set.
	 */
	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
	if (rt->rt6i_pmtu)
		metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
	if (rtnetlink_put_metrics(skb, metrics) < 0)
		goto nla_put_failure;

	if (rt->rt6i_flags & RTF_GATEWAY) {
		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
			goto nla_put_failure;
	}

	if (rt->dst.dev &&
	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
		goto nla_put_failure;
	if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
		goto nla_put_failure;

	expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;

	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
		goto nla_put_failure;

	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
		goto nla_put_failure;

	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
2958 
2959 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2960 {
2961 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2962 	int prefix;
2963 
2964 	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2965 		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2966 		prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2967 	} else
2968 		prefix = 0;
2969 
2970 	return rt6_fill_node(arg->net,
2971 		     arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2972 		     NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
2973 		     prefix, 0, NLM_F_MULTI);
2974 }
2975 
/* RTM_GETROUTE handler: resolve a route for the addresses supplied in
 * the request and unicast the result back to the caller.
 *
 * With RTA_IIF the lookup is done as if the packet had arrived on that
 * device (input path); otherwise as locally-originated output,
 * optionally bound to RTA_OIF.
 */
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	struct rt6_info *rt;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi6 fl6;
	int err, iif = 0, oif = 0;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	memset(&fl6, 0, sizeof(fl6));

	if (tb[RTA_SRC]) {
		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
			goto errout;

		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
	}

	if (tb[RTA_DST]) {
		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
			goto errout;

		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
	}

	if (tb[RTA_IIF])
		iif = nla_get_u32(tb[RTA_IIF]);

	if (tb[RTA_OIF])
		oif = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_MARK])
		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);

	if (iif) {
		struct net_device *dev;
		int flags = 0;

		dev = __dev_get_by_index(net, iif);
		if (!dev) {
			err = -ENODEV;
			goto errout;
		}

		fl6.flowi6_iif = iif;

		if (!ipv6_addr_any(&fl6.saddr))
			flags |= RT6_LOOKUP_F_HAS_SADDR;

		rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6,
							       flags);
	} else {
		fl6.flowi6_oif = oif;

		rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
	}

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb) {
		ip6_rt_put(rt);
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers, this skb can pass
	   through good chunk of routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));

	/* The skb takes over the route reference. */
	skb_dst_set(skb, &rt->dst);

	err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
			    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
			    nlh->nlmsg_seq, 0, 0, 0);
	if (err < 0) {
		kfree_skb(skb);
		goto errout;
	}

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;
}
3066 
/* Broadcast a route change (@event is RTM_NEWROUTE/RTM_DELROUTE) to
 * RTNLGRP_IPV6_ROUTE listeners.  On failure the error is recorded on
 * the group so interested sockets learn they missed a notification.
 */
void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
{
	struct sk_buff *skb;
	struct net *net = info->nl_net;
	u32 seq;
	int err;

	err = -ENOBUFS;
	seq = info->nlh ? info->nlh->nlmsg_seq : 0;

	skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
	if (!skb)
		goto errout;

	err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
				event, info->portid, seq, 0, 0, 0);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
		    info->nlh, gfp_any());
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}
3096 
/* Netdevice notifier: when the loopback device registers in a
 * namespace, attach it (and its inet6_dev) to the special null /
 * prohibit / blackhole route entries, which were allocated before any
 * device existed.
 */
static int ip6_route_dev_notify(struct notifier_block *this,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
		net->ipv6.ip6_null_entry->dst.dev = dev;
		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
	}

	return NOTIFY_OK;
}
3116 
3117 /*
3118  *	/proc
3119  */
3120 
3121 #ifdef CONFIG_PROC_FS
3122 
/* File operations for /proc/net/ipv6_route (ipv6_route_open is defined
 * elsewhere in this file).
 */
static const struct file_operations ipv6_route_proc_fops = {
	.owner		= THIS_MODULE,
	.open		= ipv6_route_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};
3130 
/* Emit /proc/net/rt6_stats: seven hex-formatted FIB counters for the
 * namespace, including the slow-path dst entry count.
 */
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = (struct net *)seq->private;
	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
		   net->ipv6.rt6_stats->fib_nodes,
		   net->ipv6.rt6_stats->fib_route_nodes,
		   net->ipv6.rt6_stats->fib_rt_alloc,
		   net->ipv6.rt6_stats->fib_rt_entries,
		   net->ipv6.rt6_stats->fib_rt_cache,
		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
		   net->ipv6.rt6_stats->fib_discarded_routes);

	return 0;
}
3145 
/* open() for /proc/net/rt6_stats — single-shot, namespace-aware. */
static int rt6_stats_seq_open(struct inode *inode, struct file *file)
{
	return single_open_net(inode, file, rt6_stats_seq_show);
}
3150 
/* File operations for /proc/net/rt6_stats. */
static const struct file_operations rt6_stats_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt6_stats_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = single_release_net,
};
3158 #endif	/* CONFIG_PROC_FS */
3159 
3160 #ifdef CONFIG_SYSCTL
3161 
3162 static
3163 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
3164 			      void __user *buffer, size_t *lenp, loff_t *ppos)
3165 {
3166 	struct net *net;
3167 	int delay;
3168 	if (!write)
3169 		return -EINVAL;
3170 
3171 	net = (struct net *)ctl->extra1;
3172 	delay = net->ipv6.sysctl.flush_delay;
3173 	proc_dointvec(ctl, write, buffer, lenp, ppos);
3174 	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
3175 	return 0;
3176 }
3177 
/* Template for the per-namespace net.ipv6.route.* sysctl table.  The
 * .data pointers reference init_net and are rewritten per namespace in
 * ipv6_route_sysctl_init(); entry ORDER must stay in sync with the
 * index-based fixups there.
 */
struct ctl_table ipv6_route_table_template[] = {
	{
		.procname	=	"flush",
		.data		=	&init_net.ipv6.sysctl.flush_delay,
		.maxlen		=	sizeof(int),
		.mode		=	0200,
		.proc_handler	=	ipv6_sysctl_rtcache_flush
	},
	{
		.procname	=	"gc_thresh",
		.data		=	&ip6_dst_ops_template.gc_thresh,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"max_size",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_min_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_timeout",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_elasticity",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"mtu_expires",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"min_adv_mss",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_min_interval_ms",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_ms_jiffies,
	},
	{ }
};
3251 
/* Clone the sysctl template for namespace @net and point each entry's
 * .data at the namespace's own variables.  The numeric indices must
 * match the order of ipv6_route_table_template above.
 *
 * Returns the table (caller frees) or NULL on allocation failure.
 */
struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);

	if (table) {
		table[0].data = &net->ipv6.sysctl.flush_delay;
		table[0].extra1 = net;
		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			table[0].procname = NULL;
	}

	return table;
}
3281 
/* Per-namespace setup: dst ops, the special null / prohibit / blackhole
 * route entries, and the routing sysctl defaults.  Uses the classic
 * goto-unwind pattern: each failure label frees everything allocated
 * before it.  Returns 0 or -ENOMEM.
 */
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	/* The special entries are cloned from static templates; their
	 * device/idev links are filled in later by ip6_route_dev_notify().
	 */
	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_ip6_dst_entries;
	net->ipv6.ip6_null_entry->dst.path =
		(struct dst_entry *)net->ipv6.ip6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.path =
		(struct dst_entry *)net->ipv6.ip6_prohibit_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.path =
		(struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	/* Routing/GC sysctl defaults for this namespace. */
	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}
3353 
3354 static void __net_exit ip6_route_net_exit(struct net *net)
3355 {
3356 	kfree(net->ipv6.ip6_null_entry);
3357 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3358 	kfree(net->ipv6.ip6_prohibit_entry);
3359 	kfree(net->ipv6.ip6_blk_hole_entry);
3360 #endif
3361 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
3362 }
3363 
3364 static int __net_init ip6_route_net_init_late(struct net *net)
3365 {
3366 #ifdef CONFIG_PROC_FS
3367 	proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
3368 	proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
3369 #endif
3370 	return 0;
3371 }
3372 
3373 static void __net_exit ip6_route_net_exit_late(struct net *net)
3374 {
3375 #ifdef CONFIG_PROC_FS
3376 	remove_proc_entry("ipv6_route", net->proc_net);
3377 	remove_proc_entry("rt6_stats", net->proc_net);
3378 #endif
3379 }
3380 
/* Core per-namespace constructor/destructor for IPv6 routing state. */
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};
3385 
3386 static int __net_init ipv6_inetpeer_init(struct net *net)
3387 {
3388 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3389 
3390 	if (!bp)
3391 		return -ENOMEM;
3392 	inet_peer_base_init(bp);
3393 	net->ipv6.peers = bp;
3394 	return 0;
3395 }
3396 
3397 static void __net_exit ipv6_inetpeer_exit(struct net *net)
3398 {
3399 	struct inet_peer_base *bp = net->ipv6.peers;
3400 
3401 	net->ipv6.peers = NULL;
3402 	inetpeer_invalidate_tree(bp);
3403 	kfree(bp);
3404 }
3405 
/* Per-namespace lifetime of the IPv6 inetpeer storage. */
static struct pernet_operations ipv6_inetpeer_ops = {
	.init	=	ipv6_inetpeer_init,
	.exit	=	ipv6_inetpeer_exit,
};
3410 
/* Late-stage per-namespace hooks (procfs entries only). */
static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};
3415 
/* Netdevice event notifier; handler is defined elsewhere in this file. */
static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = 0,
};
3420 
/* Boot-time initialization of the IPv6 routing layer: dst slab cache,
 * pernet subsystems, FIB, xfrm and policy-rule hooks, rtnetlink route
 * handlers, the netdevice notifier and the per-cpu uncached route
 * lists.  On failure, everything registered so far is unwound in
 * reverse order through the goto chain at the bottom.
 */
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	/* Blackhole dsts are carved from the same slab as regular ones. */
	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	/* Registering of the loopback is done before this portion of code,
	 * the loopback reference in rt6_info will not be taken, do it
	 * manually for init_net */
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #endif
	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	/* Hook the RTM_{NEW,DEL,GET}ROUTE netlink messages; any failure
	 * here means the handler slots were already taken. */
	ret = -ENOBUFS;
	if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
	    __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
	    __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	/* Initialize the per-cpu lists that track uncached routes.
	 * NOTE(review): this runs after the pernet subsystems, netlink
	 * handlers and notifier are registered; presumably nothing can
	 * touch these lists before this point -- worth confirming. */
	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

	/* Unwind chain: each label undoes the step registered just before
	 * the one that failed, falling through in reverse order. */
out_register_late_subsys:
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}
3512 
/* Teardown counterpart of ip6_route_init(): unregister and free
 * everything in reverse order of registration.
 */
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}
3525