xref: /openbmc/linux/net/ipv6/route.c (revision 93df8a1e)
1 /*
2  *	Linux INET6 implementation
3  *	FIB front-end.
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13 
14 /*	Changes:
15  *
16  *	YOSHIFUJI Hideaki @USAGI
17  *		reworked default router selection.
18  *		- respect outgoing interface
19  *		- select from (probably) reachable routers (i.e.
20  *		routers in REACHABLE, STALE, DELAY or PROBE states).
21  *		- always select the same router if it is (probably)
22  *		reachable.  otherwise, round-robin the list.
23  *	Ville Nuorvala
24  *		Fixed routing subtrees.
25  */
26 
27 #define pr_fmt(fmt) "IPv6: " fmt
28 
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <net/net_namespace.h>
48 #include <net/snmp.h>
49 #include <net/ipv6.h>
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
52 #include <net/ndisc.h>
53 #include <net/addrconf.h>
54 #include <net/tcp.h>
55 #include <linux/rtnetlink.h>
56 #include <net/dst.h>
57 #include <net/xfrm.h>
58 #include <net/netevent.h>
59 #include <net/netlink.h>
60 #include <net/nexthop.h>
61 
62 #include <asm/uaccess.h>
63 
64 #ifdef CONFIG_SYSCTL
65 #include <linux/sysctl.h>
66 #endif
67 
/* Result of the neighbour (NUD) reachability check done while scoring a
 * route (see rt6_check_neigh()).  Negative values are distinct failure
 * modes consumed by rt6_score_route()/find_match(): hard failure, failure
 * that should trigger a probe, and failure that should trigger round-robin.
 */
68 enum rt6_nud_state {
69 	RT6_NUD_FAIL_HARD = -3,
70 	RT6_NUD_FAIL_PROBE = -2,
71 	RT6_NUD_FAIL_DO_RR = -1,
72 	RT6_NUD_SUCCEED = 1
73 };
74 
75 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
76 static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
77 static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
78 static unsigned int	 ip6_mtu(const struct dst_entry *dst);
79 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
80 static void		ip6_dst_destroy(struct dst_entry *);
81 static void		ip6_dst_ifdown(struct dst_entry *,
82 				       struct net_device *dev, int how);
83 static int		 ip6_dst_gc(struct dst_ops *ops);
84 
85 static int		ip6_pkt_discard(struct sk_buff *skb);
86 static int		ip6_pkt_discard_out(struct sock *sk, struct sk_buff *skb);
87 static int		ip6_pkt_prohibit(struct sk_buff *skb);
88 static int		ip6_pkt_prohibit_out(struct sock *sk, struct sk_buff *skb);
89 static void		ip6_link_failure(struct sk_buff *skb);
90 static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
91 					   struct sk_buff *skb, u32 mtu);
92 static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
93 					struct sk_buff *skb);
94 static void		rt6_dst_from_metrics_check(struct rt6_info *rt);
95 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
96 
97 #ifdef CONFIG_IPV6_ROUTE_INFO
98 static struct rt6_info *rt6_add_route_info(struct net *net,
99 					   const struct in6_addr *prefix, int prefixlen,
100 					   const struct in6_addr *gwaddr, int ifindex,
101 					   unsigned int pref);
102 static struct rt6_info *rt6_get_route_info(struct net *net,
103 					   const struct in6_addr *prefix, int prefixlen,
104 					   const struct in6_addr *gwaddr, int ifindex);
105 #endif
106 
/* Per-cpu list of "uncached" (DST_NOCACHE) rt6_info entries.  Entries are
 * linked/unlinked under @lock by rt6_uncached_list_add()/_del() below and
 * walked by rt6_uncached_list_flush_dev() on device teardown.
 */
107 struct uncached_list {
108 	spinlock_t		lock;
109 	struct list_head	head;
110 };
111 
112 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
113 
/* Mark @rt as uncached and link it on the current CPU's uncached list so
 * rt6_uncached_list_flush_dev() can later find and re-target it.
 */
114 static void rt6_uncached_list_add(struct rt6_info *rt)
115 {
116 	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
117 
118 	rt->dst.flags |= DST_NOCACHE;
119 	rt->rt6i_uncached_list = ul;
120 
121 	spin_lock_bh(&ul->lock);
122 	list_add_tail(&rt->rt6i_uncached, &ul->head);
123 	spin_unlock_bh(&ul->lock);
124 }
125 
/* Unlink @rt from the uncached list it was added on, if any.  The
 * list_empty() check makes this safe for routes that were never added:
 * __ip6_dst_alloc() only INIT_LIST_HEAD()s rt6i_uncached.
 */
126 static void rt6_uncached_list_del(struct rt6_info *rt)
127 {
128 	if (!list_empty(&rt->rt6i_uncached)) {
129 		struct uncached_list *ul = rt->rt6i_uncached_list;
130 
131 		spin_lock_bh(&ul->lock);
132 		list_del(&rt->rt6i_uncached);
133 		spin_unlock_bh(&ul->lock);
134 	}
135 }
136 
/* Device @dev is going away (or @dev == NULL: flush all devices): walk every
 * CPU's uncached list and re-point each matching route's inet6_dev reference
 * and dst.dev at the namespace loopback device, so uncached routes do not
 * keep the outgoing device pinned.  Routes already on loopback are skipped.
 */
137 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
138 {
139 	struct net_device *loopback_dev = net->loopback_dev;
140 	int cpu;
141 
142 	for_each_possible_cpu(cpu) {
143 		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
144 		struct rt6_info *rt;
145 
146 		spin_lock_bh(&ul->lock);
147 		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
148 			struct inet6_dev *rt_idev = rt->rt6i_idev;
149 			struct net_device *rt_dev = rt->dst.dev;
150 
			/* swap the inet6_dev reference over to loopback */
151 			if (rt_idev && (rt_idev->dev == dev || !dev) &&
152 			    rt_idev->dev != loopback_dev) {
153 				rt->rt6i_idev = in6_dev_get(loopback_dev);
154 				in6_dev_put(rt_idev);
155 			}
156 
			/* swap the dst device reference over to loopback */
157 			if (rt_dev && (rt_dev == dev || !dev) &&
158 			    rt_dev != loopback_dev) {
159 				rt->dst.dev = loopback_dev;
160 				dev_hold(rt->dst.dev);
161 				dev_put(rt_dev);
162 			}
163 		}
164 		spin_unlock_bh(&ul->lock);
165 	}
166 }
167 
/* Per-cpu clones (RTF_PCPU) share metrics with the route they were cloned
 * from, so copy-on-write goes through dst.from's metrics, not our own.
 */
168 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
169 {
170 	return dst_metrics_write_ptr(rt->dst.from);
171 }
172 
/* dst_ops->cow_metrics for IPv6 routes.  Three cases:
 *  - RTF_PCPU clones write through to their parent's metrics,
 *  - RTF_CACHE clones never COW (return NULL),
 *  - everything else uses the generic copy-on-write path.
 */
173 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
174 {
175 	struct rt6_info *rt = (struct rt6_info *)dst;
176 
177 	if (rt->rt6i_flags & RTF_PCPU)
178 		return rt6_pcpu_cow_metrics(rt);
179 	else if (rt->rt6i_flags & RTF_CACHE)
180 		return NULL;
181 	else
182 		return dst_cow_metrics_generic(dst, old);
183 }
184 
/* Pick the address to resolve for the next hop: the route's gateway when one
 * is set, otherwise the packet's destination (if a skb is available),
 * otherwise fall back to the caller-supplied @daddr.
 */
185 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
186 					     struct sk_buff *skb,
187 					     const void *daddr)
188 {
189 	struct in6_addr *p = &rt->rt6i_gateway;
190 
191 	if (!ipv6_addr_any(p))
192 		return (const void *) p;
193 	else if (skb)
194 		return &ipv6_hdr(skb)->daddr;
195 	return daddr;
196 }
197 
/* dst_ops->neigh_lookup: find the ND neighbour entry for the route's next
 * hop, creating one in nd_tbl if it does not exist yet.
 */
198 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
199 					  struct sk_buff *skb,
200 					  const void *daddr)
201 {
202 	struct rt6_info *rt = (struct rt6_info *) dst;
203 	struct neighbour *n;
204 
205 	daddr = choose_neigh_daddr(rt, skb, daddr);
206 	n = __ipv6_neigh_lookup(dst->dev, daddr);
207 	if (n)
208 		return n;
209 	return neigh_create(&nd_tbl, daddr, dst->dev);
210 }
211 
/* dst_ops template for real IPv6 routes; used as net->ipv6.ip6_dst_ops by
 * __ip6_dst_alloc() (presumably copied per network namespace at init — the
 * copy itself is outside this file view).
 */
212 static struct dst_ops ip6_dst_ops_template = {
213 	.family			=	AF_INET6,
214 	.gc			=	ip6_dst_gc,
215 	.gc_thresh		=	1024,
216 	.check			=	ip6_dst_check,
217 	.default_advmss		=	ip6_default_advmss,
218 	.mtu			=	ip6_mtu,
219 	.cow_metrics		=	ipv6_cow_metrics,
220 	.destroy		=	ip6_dst_destroy,
221 	.ifdown			=	ip6_dst_ifdown,
222 	.negative_advice	=	ip6_negative_advice,
223 	.link_failure		=	ip6_link_failure,
224 	.update_pmtu		=	ip6_rt_update_pmtu,
225 	.redirect		=	rt6_do_redirect,
226 	.local_out		=	__ip6_local_out,
227 	.neigh_lookup		=	ip6_neigh_lookup,
228 };
229 
/* MTU for a blackhole dst: the raw RTAX_MTU metric if set, otherwise the
 * device MTU.
 */
230 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
231 {
232 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
233 
234 	return mtu ? : dst->dev->mtu;
235 }
236 
/* Blackhole dsts deliberately ignore PMTU updates. */
237 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
238 					 struct sk_buff *skb, u32 mtu)
239 {
240 }
241 
/* Blackhole dsts deliberately ignore redirects. */
242 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
243 				      struct sk_buff *skb)
244 {
245 }
246 
/* Blackhole dst metrics are never copied-on-write. */
247 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
248 					 unsigned long old)
249 {
250 	return NULL;
251 }
252 
/* dst_ops for blackhole routes created by ip6_blackhole_route(): inert
 * stand-ins that discard traffic and ignore PMTU/redirect feedback.
 */
253 static struct dst_ops ip6_dst_blackhole_ops = {
254 	.family			=	AF_INET6,
255 	.destroy		=	ip6_dst_destroy,
256 	.check			=	ip6_dst_check,
257 	.mtu			=	ip6_blackhole_mtu,
258 	.default_advmss		=	ip6_default_advmss,
259 	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
260 	.redirect		=	ip6_rt_blackhole_redirect,
261 	.cow_metrics		=	ip6_rt_blackhole_cow_metrics,
262 	.neigh_lookup		=	ip6_neigh_lookup,
263 };
264 
/* Metrics template for the built-in reject entries below; only the hop-limit
 * slot is explicitly initialized (to 0 — presumably meaning "unset/default";
 * confirm against how RTAX_HOPLIMIT is consumed).
 */
265 static const u32 ip6_template_metrics[RTAX_MAX] = {
266 	[RTAX_HOPLIMIT - 1] = 0,
267 };
268 
/* Built-in "no route" entry: matched packets are discarded and the lookup
 * error is -ENETUNREACH.  Used as the fallback result of route lookups.
 */
269 static const struct rt6_info ip6_null_entry_template = {
270 	.dst = {
271 		.__refcnt	= ATOMIC_INIT(1),
272 		.__use		= 1,
273 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
274 		.error		= -ENETUNREACH,
275 		.input		= ip6_pkt_discard,
276 		.output		= ip6_pkt_discard_out,
277 	},
278 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
279 	.rt6i_protocol  = RTPROT_KERNEL,
280 	.rt6i_metric	= ~(u32) 0,
281 	.rt6i_ref	= ATOMIC_INIT(1),
282 };
283 
284 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
285 
/* Built-in "prohibit" entry (multiple-tables only): packets are rejected
 * with -EACCES via the ip6_pkt_prohibit* handlers.
 */
286 static const struct rt6_info ip6_prohibit_entry_template = {
287 	.dst = {
288 		.__refcnt	= ATOMIC_INIT(1),
289 		.__use		= 1,
290 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
291 		.error		= -EACCES,
292 		.input		= ip6_pkt_prohibit,
293 		.output		= ip6_pkt_prohibit_out,
294 	},
295 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
296 	.rt6i_protocol  = RTPROT_KERNEL,
297 	.rt6i_metric	= ~(u32) 0,
298 	.rt6i_ref	= ATOMIC_INIT(1),
299 };
300 
/* Built-in "blackhole" entry (multiple-tables only): packets are silently
 * dropped (dst_discard) and the lookup error is -EINVAL.
 */
301 static const struct rt6_info ip6_blk_hole_entry_template = {
302 	.dst = {
303 		.__refcnt	= ATOMIC_INIT(1),
304 		.__use		= 1,
305 		.obsolete	= DST_OBSOLETE_FORCE_CHK,
306 		.error		= -EINVAL,
307 		.input		= dst_discard,
308 		.output		= dst_discard_sk,
309 	},
310 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
311 	.rt6i_protocol  = RTPROT_KERNEL,
312 	.rt6i_metric	= ~(u32) 0,
313 	.rt6i_ref	= ATOMIC_INIT(1),
314 };
315 
316 #endif
317 
318 /* allocate dst with ip6_dst_ops */
318 /* allocate dst with ip6_dst_ops */
/* Low-level allocator: get a dst from the namespace's ip6_dst_ops, zero the
 * rt6_info-specific tail beyond the embedded dst_entry, and initialize the
 * sibling/uncached list heads.  Returns NULL on allocation failure.
 * NOTE(review): @table is currently unused here — presumably kept for
 * callers' symmetry; confirm before removing.
 */
319 static struct rt6_info *__ip6_dst_alloc(struct net *net,
320 					struct net_device *dev,
321 					int flags,
322 					struct fib6_table *table)
323 {
324 	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
325 					0, DST_OBSOLETE_FORCE_CHK, flags);
326 
327 	if (rt) {
328 		struct dst_entry *dst = &rt->dst;
329 
		/* zero everything after the dst_entry header */
330 		memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
331 		INIT_LIST_HEAD(&rt->rt6i_siblings);
332 		INIT_LIST_HEAD(&rt->rt6i_uncached);
333 	}
334 	return rt;
335 }
336 
/* Allocate a route plus its per-cpu clone pointer array.  Each per-cpu slot
 * starts NULL (filled lazily by rt6_get_pcpu_route()).  If the per-cpu
 * allocation fails the route itself is destroyed and NULL is returned.
 */
337 static struct rt6_info *ip6_dst_alloc(struct net *net,
338 				      struct net_device *dev,
339 				      int flags,
340 				      struct fib6_table *table)
341 {
342 	struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags, table);
343 
344 	if (rt) {
345 		rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
346 		if (rt->rt6i_pcpu) {
347 			int cpu;
348 
349 			for_each_possible_cpu(cpu) {
350 				struct rt6_info **p;
351 
352 				p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
353 				/* no one shares rt */
354 				*p =  NULL;
355 			}
356 		} else {
357 			dst_destroy((struct dst_entry *)rt);
358 			return NULL;
359 		}
360 	}
361 
362 	return rt;
363 }
364 
/* dst_ops->destroy: release everything a route holds — generic metrics, the
 * per-cpu clone array, the uncached-list linkage, the inet6_dev reference,
 * and finally the dst.from reference (dropped last, after clearing the
 * pointer).
 */
365 static void ip6_dst_destroy(struct dst_entry *dst)
366 {
367 	struct rt6_info *rt = (struct rt6_info *)dst;
368 	struct dst_entry *from = dst->from;
369 	struct inet6_dev *idev;
370 
371 	dst_destroy_metrics_generic(dst);
372 	free_percpu(rt->rt6i_pcpu);
373 	rt6_uncached_list_del(rt);
374 
375 	idev = rt->rt6i_idev;
376 	if (idev) {
377 		rt->rt6i_idev = NULL;
378 		in6_dev_put(idev);
379 	}
380 
381 	dst->from = NULL;
382 	dst_release(from);
383 }
384 
/* dst_ops->ifdown: when the route's device goes down, migrate its inet6_dev
 * reference to the namespace loopback device so the original device can be
 * released.  No-op if @dev is already the loopback device or doesn't match.
 */
385 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
386 			   int how)
387 {
388 	struct rt6_info *rt = (struct rt6_info *)dst;
389 	struct inet6_dev *idev = rt->rt6i_idev;
390 	struct net_device *loopback_dev =
391 		dev_net(dev)->loopback_dev;
392 
393 	if (dev != loopback_dev) {
394 		if (idev && idev->dev == dev) {
395 			struct inet6_dev *loopback_idev =
396 				in6_dev_get(loopback_dev);
397 			if (loopback_idev) {
398 				rt->rt6i_idev = loopback_idev;
399 				in6_dev_put(idev);
400 			}
401 		}
402 	}
403 }
404 
/* True if @rt has expired.  Routes with RTF_EXPIRES check their own expiry
 * time; clones without it inherit expiry from the route they were copied
 * from (recursing through dst.from).
 */
405 static bool rt6_check_expired(const struct rt6_info *rt)
406 {
407 	if (rt->rt6i_flags & RTF_EXPIRES) {
408 		if (time_after(jiffies, rt->dst.expires))
409 			return true;
410 	} else if (rt->dst.from) {
411 		return rt6_check_expired((struct rt6_info *) rt->dst.from);
412 	}
413 	return false;
414 }
415 
416 /* Multipath route selection:
417  *   Hash based function using packet header and flowlabel.
418  * Adapted from fib_info_hashfn()
 *
 * Returns an index in [0, candidate_count) used to pick one of the
 * equal-cost sibling routes.  Transport ports / ICMPv6 type+code are only
 * mixed in when the flow is not encapsulated.
419  */
420 static int rt6_info_hash_nhsfn(unsigned int candidate_count,
421 			       const struct flowi6 *fl6)
422 {
423 	unsigned int val = fl6->flowi6_proto;
424 
425 	val ^= ipv6_addr_hash(&fl6->daddr);
426 	val ^= ipv6_addr_hash(&fl6->saddr);
427 
428 	/* Work only if this not encapsulated */
429 	switch (fl6->flowi6_proto) {
430 	case IPPROTO_UDP:
431 	case IPPROTO_TCP:
432 	case IPPROTO_SCTP:
433 		val ^= (__force u16)fl6->fl6_sport;
434 		val ^= (__force u16)fl6->fl6_dport;
435 		break;
436 
437 	case IPPROTO_ICMPV6:
438 		val ^= (__force u16)fl6->fl6_icmp_type;
439 		val ^= (__force u16)fl6->fl6_icmp_code;
440 		break;
441 	}
442 	/* RFC6438 recommends to use flowlabel */
443 	val ^= (__force u32)fl6->flowlabel;
444 
445 	/* Perhaps, we need to tune, this function? */
446 	val = val ^ (val >> 7) ^ (val >> 12);
447 	return val % candidate_count;
448 }
449 
/* Pick one of @match's equal-cost siblings based on the flow hash.  If the
 * hashed sibling fails rt6_score_route() we fall back to @match rather than
 * trying further siblings.
 */
450 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
451 					     struct flowi6 *fl6, int oif,
452 					     int strict)
453 {
454 	struct rt6_info *sibling, *next_sibling;
455 	int route_choosen;
456 
457 	route_choosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6);
458 	/* Don't change the route, if route_choosen == 0
459 	 * (the siblings list does not include ourself)
460 	 */
461 	if (route_choosen)
462 		list_for_each_entry_safe(sibling, next_sibling,
463 				&match->rt6i_siblings, rt6i_siblings) {
464 			route_choosen--;
465 			if (route_choosen == 0) {
466 				if (rt6_score_route(sibling, oif, strict) < 0)
467 					break;
468 				match = sibling;
469 				break;
470 			}
471 		}
472 	return match;
473 }
474 
475 /*
476  *	Route lookup. Any table->tb6_lock is implied.
477  */
478 
/* Walk the fib6 leaf chain starting at @rt and pick the first route whose
 * device matches the lookup constraints:
 *  - with @oif set: an exact ifindex match wins; loopback routes are kept
 *    as a fallback (@local) subject to RT6_LOOKUP_F_IFACE strictness;
 *  - without @oif: match on the source address owner via ipv6_chk_addr().
 * Returns @rt unchanged when there is nothing to constrain on, or the
 * namespace null entry when strict interface matching fails.
 */
479 static inline struct rt6_info *rt6_device_match(struct net *net,
480 						    struct rt6_info *rt,
481 						    const struct in6_addr *saddr,
482 						    int oif,
483 						    int flags)
484 {
485 	struct rt6_info *local = NULL;
486 	struct rt6_info *sprt;
487 
488 	if (!oif && ipv6_addr_any(saddr))
489 		goto out;
490 
491 	for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
492 		struct net_device *dev = sprt->dst.dev;
493 
494 		if (oif) {
495 			if (dev->ifindex == oif)
496 				return sprt;
497 			if (dev->flags & IFF_LOOPBACK) {
498 				if (!sprt->rt6i_idev ||
499 				    sprt->rt6i_idev->dev->ifindex != oif) {
500 					if (flags & RT6_LOOKUP_F_IFACE && oif)
501 						continue;
					/* keep the best loopback candidate only */
501 					if (local && (!oif ||
502 						      local->rt6i_idev->dev->ifindex == oif))
503 						continue;
504 				}
505 				local = sprt;
506 			}
507 		} else {
508 			if (ipv6_chk_addr(net, saddr, dev,
509 					  flags & RT6_LOOKUP_F_IFACE))
510 				return sprt;
511 			}
512 	}
513 
514 	if (oif) {
515 		if (local)
516 			return local;
517 
518 		if (flags & RT6_LOOKUP_F_IFACE)
519 			return net->ipv6.ip6_null_entry;
520 	}
521 out:
522 	return rt;
523 }
525 
526 #ifdef CONFIG_IPV6_ROUTER_PREF
/* Deferred-probe context: carries the gateway address and a held device
 * reference from rt6_probe() (atomic context) to rt6_probe_deferred()
 * (workqueue/process context).
 */
527 struct __rt6_probe_work {
528 	struct work_struct work;
529 	struct in6_addr target;
530 	struct net_device *dev;
531 };
532 
/* Workqueue handler: send a Neighbour Solicitation to the target gateway's
 * solicited-node multicast address, then drop the device reference taken by
 * rt6_probe() and free the work item.
 */
533 static void rt6_probe_deferred(struct work_struct *w)
534 {
535 	struct in6_addr mcaddr;
536 	struct __rt6_probe_work *work =
537 		container_of(w, struct __rt6_probe_work, work);
538 
539 	addrconf_addr_solict_mult(&work->target, &mcaddr);
540 	ndisc_send_ns(work->dev, NULL, &work->target, &mcaddr, NULL);
541 	dev_put(work->dev);
542 	kfree(work);
543 }
544 
/* Router Reachability Probing (RFC 4191-style): if the gateway's neighbour
 * entry is missing or stale beyond rtr_probe_interval, schedule a deferred
 * NS probe.  Probes are rate-limited via __neigh_set_probe_once().  The
 * neigh write-lock / rcu_read_lock_bh pairing across the goto is deliberate;
 * "out:" releases the lock taken on the early NUD_VALID path.
 */
545 static void rt6_probe(struct rt6_info *rt)
546 {
547 	struct neighbour *neigh;
548 	/*
549 	 * Okay, this does not seem to be appropriate
550 	 * for now, however, we need to check if it
551 	 * is really so; aka Router Reachability Probing.
552 	 *
553 	 * Router Reachability Probe MUST be rate-limited
554 	 * to no more than one per minute.
555 	 */
556 	if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
557 		return;
558 	rcu_read_lock_bh();
559 	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
560 	if (neigh) {
561 		write_lock(&neigh->lock);
562 		if (neigh->nud_state & NUD_VALID)
563 			goto out;
564 	}
565 
566 	if (!neigh ||
567 	    time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
568 		struct __rt6_probe_work *work;
569 
570 		work = kmalloc(sizeof(*work), GFP_ATOMIC);
571 
		/* mark the neigh so only one probe is outstanding */
572 		if (neigh && work)
573 			__neigh_set_probe_once(neigh);
574 
575 		if (neigh)
576 			write_unlock(&neigh->lock);
577 
578 		if (work) {
579 			INIT_WORK(&work->work, rt6_probe_deferred);
580 			work->target = rt->rt6i_gateway;
			/* reference dropped in rt6_probe_deferred() */
581 			dev_hold(rt->dst.dev);
582 			work->dev = rt->dst.dev;
583 			schedule_work(&work->work);
584 		}
585 	} else {
586 out:
587 		write_unlock(&neigh->lock);
588 	}
589 	rcu_read_unlock_bh();
590 }
591 #else
/* No router-preference support: probing is compiled out. */
592 static inline void rt6_probe(struct rt6_info *rt)
593 {
594 }
595 #endif
596 
597 /*
598  * Default Router Selection (RFC 2461 6.3.6)
599  */
/* Device-match score for route selection: 2 = exact (or no oif constraint),
 * 1 = loopback route whose idev matches oif, 0 = no match.
 */
600 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
601 {
602 	struct net_device *dev = rt->dst.dev;
603 	if (!oif || dev->ifindex == oif)
604 		return 2;
605 	if ((dev->flags & IFF_LOOPBACK) &&
606 	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
607 		return 1;
608 	return 0;
609 }
610 
/* Neighbour-reachability component of the route score.  Routes without a
 * next hop always succeed.  With CONFIG_IPV6_ROUTER_PREF, any non-FAILED
 * state succeeds and FAILED asks for a probe; without it, only NUD_VALID
 * succeeds and a missing neighbour requests round-robin fallback.
 */
611 static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
612 {
613 	struct neighbour *neigh;
614 	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
615 
616 	if (rt->rt6i_flags & RTF_NONEXTHOP ||
617 	    !(rt->rt6i_flags & RTF_GATEWAY))
618 		return RT6_NUD_SUCCEED;
619 
620 	rcu_read_lock_bh();
621 	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
622 	if (neigh) {
623 		read_lock(&neigh->lock);
624 		if (neigh->nud_state & NUD_VALID)
625 			ret = RT6_NUD_SUCCEED;
626 #ifdef CONFIG_IPV6_ROUTER_PREF
627 		else if (!(neigh->nud_state & NUD_FAILED))
628 			ret = RT6_NUD_SUCCEED;
629 		else
630 			ret = RT6_NUD_FAIL_PROBE;
631 #endif
632 		read_unlock(&neigh->lock);
633 	} else {
634 		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
635 		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
636 	}
637 	rcu_read_unlock_bh();
638 
639 	return ret;
640 }
641 
/* Combined route score: device match in the low bits, decoded router
 * preference (if configured) shifted above it.  Returns a negative
 * rt6_nud_state when the route must be rejected (bad device under strict
 * iface matching, or failed neighbour check under F_REACHABLE).
 */
642 static int rt6_score_route(struct rt6_info *rt, int oif,
643 			   int strict)
644 {
645 	int m;
646 
647 	m = rt6_check_dev(rt, oif);
648 	if (!m && (strict & RT6_LOOKUP_F_IFACE))
649 		return RT6_NUD_FAIL_HARD;
650 #ifdef CONFIG_IPV6_ROUTER_PREF
651 	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
652 #endif
653 	if (strict & RT6_LOOKUP_F_REACHABLE) {
654 		int n = rt6_check_neigh(rt);
655 		if (n < 0)
656 			return n;
657 	}
658 	return m;
659 }
660 
/* Compare @rt against the current best @match.  Expired routes are skipped.
 * A FAIL_DO_RR score participates with the lowest valid score but flags
 * *do_rr so rt6_select() advances the round-robin pointer; FAIL_HARD drops
 * the route.  Under F_REACHABLE a probe may be kicked off as a side effect.
 * Returns the better of @rt and @match, updating *mpri/*do_rr accordingly.
 */
661 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
662 				   int *mpri, struct rt6_info *match,
663 				   bool *do_rr)
664 {
665 	int m;
666 	bool match_do_rr = false;
667 
668 	if (rt6_check_expired(rt))
669 		goto out;
670 
671 	m = rt6_score_route(rt, oif, strict);
672 	if (m == RT6_NUD_FAIL_DO_RR) {
673 		match_do_rr = true;
674 		m = 0; /* lowest valid score */
675 	} else if (m == RT6_NUD_FAIL_HARD) {
676 		goto out;
677 	}
678 
679 	if (strict & RT6_LOOKUP_F_REACHABLE)
680 		rt6_probe(rt);
681 
682 	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
683 	if (m > *mpri) {
684 		*do_rr = match_do_rr;
685 		*mpri = m;
686 		match = rt;
687 	}
688 out:
689 	return match;
690 }
691 
/* Score all routes of the given @metric in the leaf chain, starting at the
 * round-robin head @rr_head and wrapping to fn->leaf.  @cont remembers the
 * first route with a different metric; if nothing at @metric matched, those
 * continuation routes are scored as a last resort.
 */
692 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
693 				     struct rt6_info *rr_head,
694 				     u32 metric, int oif, int strict,
695 				     bool *do_rr)
696 {
697 	struct rt6_info *rt, *match, *cont;
698 	int mpri = -1;
699 
700 	match = NULL;
701 	cont = NULL;
	/* first pass: from the round-robin head to the end of the chain */
702 	for (rt = rr_head; rt; rt = rt->dst.rt6_next) {
703 		if (rt->rt6i_metric != metric) {
704 			cont = rt;
705 			break;
706 		}
707 
708 		match = find_match(rt, oif, strict, &mpri, match, do_rr);
709 	}
710 
	/* second pass: wrap around from the leaf head up to rr_head */
711 	for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) {
712 		if (rt->rt6i_metric != metric) {
713 			cont = rt;
714 			break;
715 		}
716 
717 		match = find_match(rt, oif, strict, &mpri, match, do_rr);
718 	}
719 
720 	if (match || !cont)
721 		return match;
722 
	/* nothing at this metric: consider the higher-metric continuation */
723 	for (rt = cont; rt; rt = rt->dst.rt6_next)
724 		match = find_match(rt, oif, strict, &mpri, match, do_rr);
725 
726 	return match;
727 }
728 
/* Default router selection for a fib6 node (RFC 4861 6.3.6): score the
 * leaf's routes starting at the node's round-robin pointer; when the winner
 * asked for round-robin, advance fn->rr_ptr to the next same-metric route.
 * Falls back to the namespace null entry when nothing matched.
 */
729 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
730 {
731 	struct rt6_info *match, *rt0;
732 	struct net *net;
733 	bool do_rr = false;
734 
735 	rt0 = fn->rr_ptr;
736 	if (!rt0)
737 		fn->rr_ptr = rt0 = fn->leaf;
738 
739 	match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict,
740 			     &do_rr);
741 
742 	if (do_rr) {
743 		struct rt6_info *next = rt0->dst.rt6_next;
744 
745 		/* no entries matched; do round-robin */
746 		if (!next || next->rt6i_metric != rt0->rt6i_metric)
747 			next = fn->leaf;
748 
749 		if (next != rt0)
750 			fn->rr_ptr = next;
751 	}
752 
753 	net = dev_net(rt0->dst.dev);
754 	return match ? match : net->ipv6.ip6_null_entry;
755 }
756 
/* True when the route has a gateway or no next hop at all. */
757 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
758 {
759 	return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
760 }
761 
762 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Process a Route Information option from a Router Advertisement (RFC 4191).
 * Validates the option length against prefix_len, then adds, refreshes, or
 * (lifetime == 0) deletes the corresponding RTF_ROUTEINFO route and updates
 * its preference and expiry.  Returns 0 on success or -EINVAL on a
 * malformed option.
 */
763 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
764 		  const struct in6_addr *gwaddr)
765 {
766 	struct net *net = dev_net(dev);
767 	struct route_info *rinfo = (struct route_info *) opt;
768 	struct in6_addr prefix_buf, *prefix;
769 	unsigned int pref;
770 	unsigned long lifetime;
771 	struct rt6_info *rt;
772 
773 	if (len < sizeof(struct route_info)) {
774 		return -EINVAL;
775 	}
776 
777 	/* Sanity check for prefix_len and length */
778 	if (rinfo->length > 3) {
779 		return -EINVAL;
780 	} else if (rinfo->prefix_len > 128) {
781 		return -EINVAL;
782 	} else if (rinfo->prefix_len > 64) {
783 		if (rinfo->length < 2) {
784 			return -EINVAL;
785 		}
786 	} else if (rinfo->prefix_len > 0) {
787 		if (rinfo->length < 1) {
788 			return -EINVAL;
789 		}
790 	}
791 
792 	pref = rinfo->route_pref;
793 	if (pref == ICMPV6_ROUTER_PREF_INVALID)
794 		return -EINVAL;
795 
796 	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
797 
	/* length == 3 carries a full 128-bit prefix; otherwise mask it down */
798 	if (rinfo->length == 3)
799 		prefix = (struct in6_addr *)rinfo->prefix;
800 	else {
801 		/* this function is safe */
802 		ipv6_addr_prefix(&prefix_buf,
803 				 (struct in6_addr *)rinfo->prefix,
804 				 rinfo->prefix_len);
805 		prefix = &prefix_buf;
806 	}
807 
	/* prefix_len 0 means a default-router entry */
808 	if (rinfo->prefix_len == 0)
809 		rt = rt6_get_dflt_router(gwaddr, dev);
810 	else
811 		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
812 					gwaddr, dev->ifindex);
813 
	/* zero lifetime withdraws an existing route */
814 	if (rt && !lifetime) {
815 		ip6_del_rt(rt);
816 		rt = NULL;
817 	}
818 
819 	if (!rt && lifetime)
820 		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
821 					pref);
822 	else if (rt)
823 		rt->rt6i_flags = RTF_ROUTEINFO |
824 				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
825 
826 	if (rt) {
827 		if (!addrconf_finite_timeout(lifetime))
828 			rt6_clean_expires(rt);
829 		else
830 			rt6_set_expires(rt, jiffies + HZ * lifetime);
831 
832 		ip6_rt_put(rt);
833 	}
834 	return 0;
835 }
836 #endif
837 
/* Walk back up the fib6 tree from @fn looking for the next node that holds
 * routes (RTN_RTINFO), descending into a parent's source-address subtree
 * when one exists.  Returns NULL once the tree root is reached.
 */
838 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
839 					struct in6_addr *saddr)
840 {
841 	struct fib6_node *pn;
842 	while (1) {
843 		if (fn->fn_flags & RTN_TL_ROOT)
844 			return NULL;
845 		pn = fn->parent;
846 		if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)
847 			fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr);
848 		else
849 			fn = pn;
850 		if (fn->fn_flags & RTN_RTINFO)
851 			return fn;
852 	}
853 }
854 
/* Simple (non-caching) policy lookup in one table: find the fib6 node,
 * apply device and multipath selection, and backtrack up the tree on a
 * null-entry result.  Takes a use-reference on the returned route before
 * dropping tb6_lock.
 */
855 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
856 					     struct fib6_table *table,
857 					     struct flowi6 *fl6, int flags)
858 {
859 	struct fib6_node *fn;
860 	struct rt6_info *rt;
861 
862 	read_lock_bh(&table->tb6_lock);
863 	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
864 restart:
865 	rt = fn->leaf;
866 	rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
867 	if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
868 		rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
869 	if (rt == net->ipv6.ip6_null_entry) {
870 		fn = fib6_backtrack(fn, &fl6->saddr);
871 		if (fn)
872 			goto restart;
873 	}
874 	dst_use(&rt->dst, jiffies);
875 	read_unlock_bh(&table->tb6_lock);
876 	return rt;
877 
878 }
879 
/* Public entry point for a plain route lookup through the policy-rule
 * machinery, using ip6_pol_route_lookup() per table.
 */
880 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
881 				    int flags)
882 {
883 	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
884 }
885 EXPORT_SYMBOL_GPL(ip6_route_lookup);
886 
/* Convenience lookup by address pair: builds a flowi6 and returns the
 * resulting route on success, or NULL (releasing the error dst) when the
 * lookup resolved to an error entry.  @strict maps to RT6_LOOKUP_F_IFACE.
 */
887 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
888 			    const struct in6_addr *saddr, int oif, int strict)
889 {
890 	struct flowi6 fl6 = {
891 		.flowi6_oif = oif,
892 		.daddr = *daddr,
893 	};
894 	struct dst_entry *dst;
895 	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
896 
897 	if (saddr) {
898 		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
899 		flags |= RT6_LOOKUP_F_HAS_SADDR;
900 	}
901 
902 	dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
903 	if (dst->error == 0)
904 		return (struct rt6_info *) dst;
905 
906 	dst_release(dst);
907 
908 	return NULL;
909 }
910 EXPORT_SYMBOL(rt6_lookup);
911 
912 /* ip6_ins_rt is called with FREE table->tb6_lock.
913    It takes new route entry, the addition fails by any reason the
914    route is freed. In any case, if caller does not hold it, it may
915    be destroyed.
916  */
917 
/* Insert @rt into its fib6 table under the table write lock.  Per the
 * comment above: the route is consumed — it is freed if the add fails.
 */
918 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
919 			struct mx6_config *mxc)
920 {
921 	int err;
922 	struct fib6_table *table;
923 
924 	table = rt->rt6i_table;
925 	write_lock_bh(&table->tb6_lock);
926 	err = fib6_add(&table->tb6_root, rt, info, mxc);
927 	write_unlock_bh(&table->tb6_lock);
928 
929 	return err;
930 }
931 
/* Insert @rt with default netlink info and no extra metrics. */
932 int ip6_ins_rt(struct rt6_info *rt)
933 {
934 	struct nl_info info = {	.nl_net = dev_net(rt->dst.dev), };
935 	struct mx6_config mxc = { .mx = NULL, };
936 
937 	return __ip6_ins_rt(rt, &info, &mxc);
938 }
939 
/* Create an RTF_CACHE clone of @ort for destination @daddr (/128).  Clones
 * of clones are avoided by first unwinding @ort to its dst.from origin.
 * For non-gateway routes the clone may be marked anycast, and under
 * CONFIG_IPV6_SUBTREES the source key is narrowed to @saddr/128.
 * Returns NULL on allocation failure.
 */
940 static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
941 					   const struct in6_addr *daddr,
942 					   const struct in6_addr *saddr)
943 {
944 	struct rt6_info *rt;
945 
946 	/*
947 	 *	Clone the route.
948 	 */
949 
	/* never clone a clone: go back to the original route */
950 	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
951 		ort = (struct rt6_info *)ort->dst.from;
952 
953 	rt = __ip6_dst_alloc(dev_net(ort->dst.dev), ort->dst.dev,
954 			     0, ort->rt6i_table);
955 
956 	if (!rt)
957 		return NULL;
958 
959 	ip6_rt_copy_init(rt, ort);
960 	rt->rt6i_flags |= RTF_CACHE;
961 	rt->rt6i_metric = 0;
962 	rt->dst.flags |= DST_HOST;
963 	rt->rt6i_dst.addr = *daddr;
964 	rt->rt6i_dst.plen = 128;
965 
966 	if (!rt6_is_gw_or_nonexthop(ort)) {
967 		if (ort->rt6i_dst.plen != 128 &&
968 		    ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
969 			rt->rt6i_flags |= RTF_ANYCAST;
970 #ifdef CONFIG_IPV6_SUBTREES
971 		if (rt->rt6i_src.plen && saddr) {
972 			rt->rt6i_src.addr = *saddr;
973 			rt->rt6i_src.plen = 128;
974 		}
975 #endif
976 	}
977 
978 	return rt;
979 }
980 
/* Create an RTF_PCPU clone of @rt (same device/flags/table) for use as a
 * per-cpu cached copy.  Returns NULL on allocation failure.
 */
981 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
982 {
983 	struct rt6_info *pcpu_rt;
984 
985 	pcpu_rt = __ip6_dst_alloc(dev_net(rt->dst.dev),
986 				  rt->dst.dev, rt->dst.flags,
987 				  rt->rt6i_table);
988 
989 	if (!pcpu_rt)
990 		return NULL;
991 	ip6_rt_copy_init(pcpu_rt, rt);
992 	pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
993 	pcpu_rt->rt6i_flags |= RTF_PCPU;
994 	return pcpu_rt;
995 }
996 
997 /* It should be called with read_lock_bh(&tb6_lock) acquired */
/* Return this CPU's cached clone of @rt, creating it lazily.  cmpxchg()
 * guards against a concurrent creator on the same slot; the loser's clone
 * is destroyed and the winner's is used.  On allocation failure the
 * namespace null entry is returned.  Always returns with a dst reference
 * held.
 */
998 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
999 {
1000 	struct rt6_info *pcpu_rt, *prev, **p;
1001 
1002 	p = this_cpu_ptr(rt->rt6i_pcpu);
1003 	pcpu_rt = *p;
1004 
1005 	if (pcpu_rt)
1006 		goto done;
1007 
1008 	pcpu_rt = ip6_rt_pcpu_alloc(rt);
1009 	if (!pcpu_rt) {
1010 		struct net *net = dev_net(rt->dst.dev);
1011 
1012 		pcpu_rt = net->ipv6.ip6_null_entry;
1013 		goto done;
1014 	}
1015 
1016 	prev = cmpxchg(p, NULL, pcpu_rt);
1017 	if (prev) {
1018 		/* If someone did it before us, return prev instead */
1019 		dst_destroy(&pcpu_rt->dst);
1020 		pcpu_rt = prev;
1021 	}
1022 
1023 done:
1024 	dst_hold(&pcpu_rt->dst);
1025 	rt6_dst_from_metrics_check(pcpu_rt);
1026 	return pcpu_rt;
1027 }
1028 
/* Core policy-routing lookup for one table.  Selects the best route
 * (with reachability strictness when forwarding is off), backtracking up
 * the tree and, failing that, retrying without F_REACHABLE.  The result is
 * returned in one of three forms, each with a reference held:
 *  - the route itself (null entry or an RTF_CACHE clone),
 *  - a fresh uncached RTF_CACHE clone for FLOWI_FLAG_KNOWN_NH flows whose
 *    route has no gateway (the skb daddr may differ from fl6->daddr),
 *  - this CPU's per-cpu clone for everything else.
 */
1029 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
1030 				      struct flowi6 *fl6, int flags)
1031 {
1032 	struct fib6_node *fn, *saved_fn;
1033 	struct rt6_info *rt;
1034 	int strict = 0;
1035 
1036 	strict |= flags & RT6_LOOKUP_F_IFACE;
	/* hosts (forwarding off) prefer reachable routers; see RFC 4861 */
1037 	if (net->ipv6.devconf_all->forwarding == 0)
1038 		strict |= RT6_LOOKUP_F_REACHABLE;
1039 
1040 	read_lock_bh(&table->tb6_lock);
1041 
1042 	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1043 	saved_fn = fn;
1044 
1045 redo_rt6_select:
1046 	rt = rt6_select(fn, oif, strict);
1047 	if (rt->rt6i_nsiblings)
1048 		rt = rt6_multipath_select(rt, fl6, oif, strict);
1049 	if (rt == net->ipv6.ip6_null_entry) {
1050 		fn = fib6_backtrack(fn, &fl6->saddr);
1051 		if (fn)
1052 			goto redo_rt6_select;
1053 		else if (strict & RT6_LOOKUP_F_REACHABLE) {
1054 			/* also consider unreachable route */
1055 			strict &= ~RT6_LOOKUP_F_REACHABLE;
1056 			fn = saved_fn;
1057 			goto redo_rt6_select;
1058 		}
1059 	}
1060 
1061 
1062 	if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
1063 		dst_use(&rt->dst, jiffies);
1064 		read_unlock_bh(&table->tb6_lock);
1065 
1066 		rt6_dst_from_metrics_check(rt);
1067 		return rt;
1068 	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1069 			    !(rt->rt6i_flags & RTF_GATEWAY))) {
1070 		/* Create a RTF_CACHE clone which will not be
1071 		 * owned by the fib6 tree.  It is for the special case where
1072 		 * the daddr in the skb during the neighbor look-up is different
1073 		 * from the fl6->daddr used to look-up route here.
1074 		 */
1075 
1076 		struct rt6_info *uncached_rt;
1077 
		/* hold rt across the unlock while we clone it */
1078 		dst_use(&rt->dst, jiffies);
1079 		read_unlock_bh(&table->tb6_lock);
1080 
1081 		uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
1082 		dst_release(&rt->dst);
1083 
1084 		if (uncached_rt)
1085 			rt6_uncached_list_add(uncached_rt);
1086 		else
1087 			uncached_rt = net->ipv6.ip6_null_entry;
1088 
1089 		dst_hold(&uncached_rt->dst);
1090 		return uncached_rt;
1091 
1092 	} else {
1093 		/* Get a percpu copy */
1094 
1095 		struct rt6_info *pcpu_rt;
1096 
1097 		rt->dst.lastuse = jiffies;
1098 		rt->dst.__use++;
1099 		pcpu_rt = rt6_get_pcpu_route(rt);
1100 		read_unlock_bh(&table->tb6_lock);
1101 
1102 		return pcpu_rt;
1103 	}
1104 }
1105 
/* Input-path wrapper: route using the incoming interface as oif. */
1106 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
1107 					    struct flowi6 *fl6, int flags)
1108 {
1109 	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
1110 }
1111 
/* Input lookup through the policy rules; link-local/multicast destinations
 * force strict interface matching (except on PIM register devices).
 */
1112 static struct dst_entry *ip6_route_input_lookup(struct net *net,
1113 						struct net_device *dev,
1114 						struct flowi6 *fl6, int flags)
1115 {
1116 	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1117 		flags |= RT6_LOOKUP_F_IFACE;
1118 
1119 	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
1120 }
1121 
/* Receive-path entry point: build a flowi6 from the packet's IPv6 header
 * and attach the looked-up dst to the skb.
 */
1122 void ip6_route_input(struct sk_buff *skb)
1123 {
1124 	const struct ipv6hdr *iph = ipv6_hdr(skb);
1125 	struct net *net = dev_net(skb->dev);
1126 	int flags = RT6_LOOKUP_F_HAS_SADDR;
1127 	struct flowi6 fl6 = {
1128 		.flowi6_iif = skb->dev->ifindex,
1129 		.daddr = iph->daddr,
1130 		.saddr = iph->saddr,
1131 		.flowlabel = ip6_flowinfo(iph),
1132 		.flowi6_mark = skb->mark,
1133 		.flowi6_proto = iph->nexthdr,
1134 	};
1135 
1136 	skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
1137 }
1138 
1139 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
1140 					     struct flowi6 *fl6, int flags)
1141 {
1142 	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
1143 }
1144 
1145 struct dst_entry *ip6_route_output(struct net *net, const struct sock *sk,
1146 				    struct flowi6 *fl6)
1147 {
1148 	int flags = 0;
1149 
1150 	fl6->flowi6_iif = LOOPBACK_IFINDEX;
1151 
1152 	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
1153 		flags |= RT6_LOOKUP_F_IFACE;
1154 
1155 	if (!ipv6_addr_any(&fl6->saddr))
1156 		flags |= RT6_LOOKUP_F_HAS_SADDR;
1157 	else if (sk)
1158 		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1159 
1160 	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
1161 }
1162 EXPORT_SYMBOL(ip6_route_output);
1163 
/* Clone @dst_orig into a "blackhole" route: same addressing and metrics,
 * but both input and output handlers silently discard packets.
 *
 * Consumes the caller's reference on @dst_orig.  Returns the new dst with
 * one reference held, or ERR_PTR(-ENOMEM) on allocation failure.
 */
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, DST_OBSOLETE_NONE, 0);
	if (rt) {
		new = &rt->dst;

		/* Zero only the rt6_info tail that follows the embedded
		 * dst_entry; the dst_entry itself was set up by dst_alloc().
		 */
		memset(new + 1, 0, sizeof(*rt) - sizeof(*new));

		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard_sk;

		if (dst_metrics_read_only(&ort->dst))
			new->_metrics = ort->dst._metrics;
		else
			dst_copy_metrics(new, &ort->dst);
		rt->rt6i_idev = ort->rt6i_idev;
		if (rt->rt6i_idev)
			in6_dev_hold(rt->rt6i_idev);

		rt->rt6i_gateway = ort->rt6i_gateway;
		rt->rt6i_flags = ort->rt6i_flags;
		rt->rt6i_metric = 0;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif

		/* Mark the entry for deferred freeing; the reference taken
		 * by dst_alloc() above keeps it usable for the caller.
		 */
		dst_free(new);
	}

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}
1202 
1203 /*
1204  *	Destination cache support functions
1205  */
1206 
1207 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1208 {
1209 	if (rt->dst.from &&
1210 	    dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
1211 		dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
1212 }
1213 
1214 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1215 {
1216 	if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie))
1217 		return NULL;
1218 
1219 	if (rt6_check_expired(rt))
1220 		return NULL;
1221 
1222 	return &rt->dst;
1223 }
1224 
1225 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1226 {
1227 	if (rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1228 	    rt6_check((struct rt6_info *)(rt->dst.from), cookie))
1229 		return &rt->dst;
1230 	else
1231 		return NULL;
1232 }
1233 
1234 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1235 {
1236 	struct rt6_info *rt;
1237 
1238 	rt = (struct rt6_info *) dst;
1239 
1240 	/* All IPV6 dsts are created with ->obsolete set to the value
1241 	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1242 	 * into this function always.
1243 	 */
1244 
1245 	rt6_dst_from_metrics_check(rt);
1246 
1247 	if ((rt->rt6i_flags & RTF_PCPU) || unlikely(dst->flags & DST_NOCACHE))
1248 		return rt6_dst_from_check(rt, cookie);
1249 	else
1250 		return rt6_check(rt, cookie);
1251 }
1252 
1253 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1254 {
1255 	struct rt6_info *rt = (struct rt6_info *) dst;
1256 
1257 	if (rt) {
1258 		if (rt->rt6i_flags & RTF_CACHE) {
1259 			if (rt6_check_expired(rt)) {
1260 				ip6_del_rt(rt);
1261 				dst = NULL;
1262 			}
1263 		} else {
1264 			dst_release(dst);
1265 			dst = NULL;
1266 		}
1267 	}
1268 	return dst;
1269 }
1270 
1271 static void ip6_link_failure(struct sk_buff *skb)
1272 {
1273 	struct rt6_info *rt;
1274 
1275 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1276 
1277 	rt = (struct rt6_info *) skb_dst(skb);
1278 	if (rt) {
1279 		if (rt->rt6i_flags & RTF_CACHE) {
1280 			dst_hold(&rt->dst);
1281 			if (ip6_del_rt(rt))
1282 				dst_free(&rt->dst);
1283 		} else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) {
1284 			rt->rt6i_node->fn_sernum = -1;
1285 		}
1286 	}
1287 }
1288 
1289 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
1290 {
1291 	struct net *net = dev_net(rt->dst.dev);
1292 
1293 	rt->rt6i_flags |= RTF_MODIFIED;
1294 	rt->rt6i_pmtu = mtu;
1295 	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
1296 }
1297 
/* Core PMTU update.  Addresses for the clone come from @iph when present,
 * else from @sk; with neither there is nothing to key a cache entry on.
 * Only reductions below the current dst MTU (clamped to IPV6_MIN_MTU)
 * are applied.
 */
static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu)
{
	struct rt6_info *rt6 = (struct rt6_info *)dst;

	/* Routes to our own addresses never need PMTU state. */
	if (rt6->rt6i_flags & RTF_LOCAL)
		return;

	dst_confirm(dst);
	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
	if (mtu >= dst_mtu(dst))
		return;

	if (rt6->rt6i_flags & RTF_CACHE) {
		/* Already a cache entry: update it in place. */
		rt6_do_update_pmtu(rt6, mtu);
	} else {
		const struct in6_addr *daddr, *saddr;
		struct rt6_info *nrt6;

		if (iph) {
			daddr = &iph->daddr;
			saddr = &iph->saddr;
		} else if (sk) {
			daddr = &sk->sk_v6_daddr;
			saddr = &inet6_sk(sk)->saddr;
		} else {
			return;
		}
		/* Clone a host route to carry the PMTU for this flow. */
		nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
		if (nrt6) {
			rt6_do_update_pmtu(nrt6, mtu);

			/* ip6_ins_rt(nrt6) will bump the
			 * rt6->rt6i_node->fn_sernum
			 * which will fail the next rt6_check() and
			 * invalidate the sk->sk_dst_cache.
			 */
			ip6_ins_rt(nrt6);
		}
	}
}
1339 
1340 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1341 			       struct sk_buff *skb, u32 mtu)
1342 {
1343 	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
1344 }
1345 
1346 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
1347 		     int oif, u32 mark)
1348 {
1349 	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1350 	struct dst_entry *dst;
1351 	struct flowi6 fl6;
1352 
1353 	memset(&fl6, 0, sizeof(fl6));
1354 	fl6.flowi6_oif = oif;
1355 	fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
1356 	fl6.daddr = iph->daddr;
1357 	fl6.saddr = iph->saddr;
1358 	fl6.flowlabel = ip6_flowinfo(iph);
1359 
1360 	dst = ip6_route_output(net, NULL, &fl6);
1361 	if (!dst->error)
1362 		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
1363 	dst_release(dst);
1364 }
1365 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
1366 
1367 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
1368 {
1369 	ip6_update_pmtu(skb, sock_net(sk), mtu,
1370 			sk->sk_bound_dev_if, sk->sk_mark);
1371 }
1372 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
1373 
/* Handle redirects */
struct ip6rd_flowi {
	struct flowi6 fl6;	/* must stay first: __ip6_route_redirect() casts flowi6* back */
	struct in6_addr gateway;	/* router the Redirect came from */
};
1379 
/* Policy-lookup callback used when validating an ICMPv6 Redirect: find the
 * route currently used for @fl6->daddr whose gateway matches the sender of
 * the Redirect (rdfl->gateway) on the receiving interface.  Falls back to
 * ip6_null_entry when no acceptable route exists.  Always returns a held
 * dst.
 */
static struct rt6_info *__ip6_route_redirect(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     int flags)
{
	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
	struct rt6_info *rt;
	struct fib6_node *fn;

	/* Get the "current" route for this destination and
	 * check if the redirect has come from appropriate router.
	 *
	 * RFC 4861 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routes.
	 */

	read_lock_bh(&table->tb6_lock);
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
		if (rt6_check_expired(rt))
			continue;
		if (rt->dst.error)
			break;
		if (!(rt->rt6i_flags & RTF_GATEWAY))
			continue;
		if (fl6->flowi6_oif != rt->dst.dev->ifindex)
			continue;
		if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
			continue;
		break;
	}

	if (!rt)
		rt = net->ipv6.ip6_null_entry;
	else if (rt->dst.error) {
		rt = net->ipv6.ip6_null_entry;
		goto out;
	}

	/* Nothing matched at this node: back up the tree and retry. */
	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

out:
	dst_hold(&rt->dst);

	read_unlock_bh(&table->tb6_lock);

	return rt;
};
1436 
1437 static struct dst_entry *ip6_route_redirect(struct net *net,
1438 					const struct flowi6 *fl6,
1439 					const struct in6_addr *gateway)
1440 {
1441 	int flags = RT6_LOOKUP_F_HAS_SADDR;
1442 	struct ip6rd_flowi rdfl;
1443 
1444 	rdfl.fl6 = *fl6;
1445 	rdfl.gateway = *gateway;
1446 
1447 	return fib6_rule_lookup(net, &rdfl.fl6,
1448 				flags, __ip6_route_redirect);
1449 }
1450 
1451 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark)
1452 {
1453 	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1454 	struct dst_entry *dst;
1455 	struct flowi6 fl6;
1456 
1457 	memset(&fl6, 0, sizeof(fl6));
1458 	fl6.flowi6_iif = LOOPBACK_IFINDEX;
1459 	fl6.flowi6_oif = oif;
1460 	fl6.flowi6_mark = mark;
1461 	fl6.daddr = iph->daddr;
1462 	fl6.saddr = iph->saddr;
1463 	fl6.flowlabel = ip6_flowinfo(iph);
1464 
1465 	dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
1466 	rt6_do_redirect(dst, NULL, skb);
1467 	dst_release(dst);
1468 }
1469 EXPORT_SYMBOL_GPL(ip6_redirect);
1470 
1471 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
1472 			    u32 mark)
1473 {
1474 	const struct ipv6hdr *iph = ipv6_hdr(skb);
1475 	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
1476 	struct dst_entry *dst;
1477 	struct flowi6 fl6;
1478 
1479 	memset(&fl6, 0, sizeof(fl6));
1480 	fl6.flowi6_iif = LOOPBACK_IFINDEX;
1481 	fl6.flowi6_oif = oif;
1482 	fl6.flowi6_mark = mark;
1483 	fl6.daddr = msg->dest;
1484 	fl6.saddr = iph->daddr;
1485 
1486 	dst = ip6_route_redirect(net, &fl6, &iph->saddr);
1487 	rt6_do_redirect(dst, NULL, skb);
1488 	dst_release(dst);
1489 }
1490 
1491 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
1492 {
1493 	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark);
1494 }
1495 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
1496 
1497 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1498 {
1499 	struct net_device *dev = dst->dev;
1500 	unsigned int mtu = dst_mtu(dst);
1501 	struct net *net = dev_net(dev);
1502 
1503 	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1504 
1505 	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1506 		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1507 
1508 	/*
1509 	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1510 	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1511 	 * IPV6_MAXPLEN is also valid and means: "any MSS,
1512 	 * rely only on pmtu discovery"
1513 	 */
1514 	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1515 		mtu = IPV6_MAXPLEN;
1516 	return mtu;
1517 }
1518 
1519 static unsigned int ip6_mtu(const struct dst_entry *dst)
1520 {
1521 	const struct rt6_info *rt = (const struct rt6_info *)dst;
1522 	unsigned int mtu = rt->rt6i_pmtu;
1523 	struct inet6_dev *idev;
1524 
1525 	if (mtu)
1526 		goto out;
1527 
1528 	mtu = dst_metric_raw(dst, RTAX_MTU);
1529 	if (mtu)
1530 		goto out;
1531 
1532 	mtu = IPV6_MIN_MTU;
1533 
1534 	rcu_read_lock();
1535 	idev = __in6_dev_get(dst->dev);
1536 	if (idev)
1537 		mtu = idev->cnf.mtu6;
1538 	rcu_read_unlock();
1539 
1540 out:
1541 	return min_t(unsigned int, mtu, IP6_MAX_MTU);
1542 }
1543 
/* Singly-linked list (chained through dst.next) of dsts handed out by
 * icmp6_dst_alloc(); reaped by icmp6_dst_gc()/icmp6_clean_all().  All
 * access is serialized by icmp6_dst_lock.
 */
static struct dst_entry *icmp6_dst_gc_list;
static DEFINE_SPINLOCK(icmp6_dst_lock);
1546 
/* Allocate a standalone host route for sending an ICMPv6 packet via @dev.
 * The entry is not inserted in the FIB; instead it is chained onto
 * icmp6_dst_gc_list and reclaimed by icmp6_dst_gc() once its refcount
 * drops.  Returns the (possibly xfrm-wrapped) dst or an ERR_PTR.
 */
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
				  struct flowi6 *fl6)
{
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct inet6_dev *idev = in6_dev_get(dev);
	struct net *net = dev_net(dev);

	if (unlikely(!idev))
		return ERR_PTR(-ENODEV);

	rt = ip6_dst_alloc(net, dev, 0, NULL);
	if (unlikely(!rt)) {
		in6_dev_put(idev);	/* drop the ref taken above */
		dst = ERR_PTR(-ENOMEM);
		goto out;
	}

	rt->dst.flags |= DST_HOST;
	rt->dst.output  = ip6_output;
	atomic_set(&rt->dst.__refcnt, 1);	/* caller's reference */
	rt->rt6i_gateway  = fl6->daddr;
	rt->rt6i_dst.addr = fl6->daddr;
	rt->rt6i_dst.plen = 128;
	rt->rt6i_idev     = idev;	/* ownership of idev ref moves to rt */
	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);

	/* Chain onto the ICMP dst list so gc can find it later. */
	spin_lock_bh(&icmp6_dst_lock);
	rt->dst.next = icmp6_dst_gc_list;
	icmp6_dst_gc_list = &rt->dst;
	spin_unlock_bh(&icmp6_dst_lock);

	fib6_force_start_gc(net);

	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);

out:
	return dst;
}
1586 
1587 int icmp6_dst_gc(void)
1588 {
1589 	struct dst_entry *dst, **pprev;
1590 	int more = 0;
1591 
1592 	spin_lock_bh(&icmp6_dst_lock);
1593 	pprev = &icmp6_dst_gc_list;
1594 
1595 	while ((dst = *pprev) != NULL) {
1596 		if (!atomic_read(&dst->__refcnt)) {
1597 			*pprev = dst->next;
1598 			dst_free(dst);
1599 		} else {
1600 			pprev = &dst->next;
1601 			++more;
1602 		}
1603 	}
1604 
1605 	spin_unlock_bh(&icmp6_dst_lock);
1606 
1607 	return more;
1608 }
1609 
1610 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1611 			    void *arg)
1612 {
1613 	struct dst_entry *dst, **pprev;
1614 
1615 	spin_lock_bh(&icmp6_dst_lock);
1616 	pprev = &icmp6_dst_gc_list;
1617 	while ((dst = *pprev) != NULL) {
1618 		struct rt6_info *rt = (struct rt6_info *) dst;
1619 		if (func(rt, arg)) {
1620 			*pprev = dst->next;
1621 			dst_free(dst);
1622 		} else {
1623 			pprev = &dst->next;
1624 		}
1625 	}
1626 	spin_unlock_bh(&icmp6_dst_lock);
1627 }
1628 
/* dst_ops->gc hook for the IPv6 routing cache.  Skips the expensive FIB
 * walk when the last gc ran recently and the entry count is within
 * bounds; otherwise runs fib6 gc with an adaptively growing expire value.
 * Returns non-zero when the table is still over rt_max_size (allocation
 * should fail).
 */
static int ip6_dst_gc(struct dst_ops *ops)
{
	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
	int entries;

	entries = dst_entries_get_fast(ops);
	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
	    entries <= rt_max_size)
		goto out;

	/* Each pass gets more aggressive until the count drops. */
	net->ipv6.ip6_rt_gc_expire++;
	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
	entries = dst_entries_get_slow(ops);
	if (entries < ops->gc_thresh)
		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
out:
	/* Decay the aggressiveness by 1/2^elasticity every call. */
	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
	return entries > rt_max_size;
}
1653 
/* Convert the netlink RTAX_* metrics attached to @cfg into the array +
 * validity bitmap form used by fib insertion (@mxc).
 *
 * On success mxc->mx owns a kzalloc'd array the caller must kfree; on
 * failure the array is freed here.  Returns 0, -ENOMEM or -EINVAL.
 */
static int ip6_convert_metrics(struct mx6_config *mxc,
			       const struct fib6_config *cfg)
{
	struct nlattr *nla;
	int remaining;
	u32 *mp;

	/* No metrics supplied: nothing to do, mxc->mx stays NULL. */
	if (!cfg->fc_mx)
		return 0;

	mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
	if (unlikely(!mp))
		return -ENOMEM;

	nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
		int type = nla_type(nla);

		if (type) {
			u32 val;

			if (unlikely(type > RTAX_MAX))
				goto err;
			if (type == RTAX_CC_ALGO) {
				/* Congestion-control algo arrives as a name;
				 * store its numeric key instead.
				 */
				char tmp[TCP_CA_NAME_MAX];

				nla_strlcpy(tmp, nla, sizeof(tmp));
				val = tcp_ca_get_key_by_name(tmp);
				if (val == TCP_CA_UNSPEC)
					goto err;
			} else {
				val = nla_get_u32(nla);
			}

			/* RTAX types are 1-based; the array is 0-based. */
			mp[type - 1] = val;
			__set_bit(type - 1, mxc->mx_valid);
		}
	}

	mxc->mx = mp;

	return 0;
 err:
	kfree(mp);
	return -EINVAL;
}
1699 
/* Create and insert the IPv6 route described by @cfg.
 *
 * Returns 0 on success or a negative errno.  On any failure path every
 * resource acquired here (dev and idev references, the rt allocation) is
 * released before returning.
 */
int ip6_route_add(struct fib6_config *cfg)
{
	int err;
	struct net *net = cfg->fc_nlinfo.nl_net;
	struct rt6_info *rt = NULL;
	struct net_device *dev = NULL;
	struct inet6_dev *idev = NULL;
	struct fib6_table *table;
	struct mx6_config mxc = { .mx = NULL, };
	int addr_type;

	/* IPv6 prefixes cannot exceed 128 bits. */
	if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
		return -EINVAL;
#ifndef CONFIG_IPV6_SUBTREES
	/* Source-specific routes require subtree support. */
	if (cfg->fc_src_len)
		return -EINVAL;
#endif
	if (cfg->fc_ifindex) {
		err = -ENODEV;
		dev = dev_get_by_index(net, cfg->fc_ifindex);
		if (!dev)
			goto out;
		idev = in6_dev_get(dev);
		if (!idev)
			goto out;
	}

	if (cfg->fc_metric == 0)
		cfg->fc_metric = IP6_RT_PRIO_USER;

	err = -ENOBUFS;
	if (cfg->fc_nlinfo.nlh &&
	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
		/* Without NLM_F_CREATE only an existing table should be
		 * used, but fall back to creating one for compatibility.
		 */
		table = fib6_get_table(net, cfg->fc_table);
		if (!table) {
			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
			table = fib6_new_table(net, cfg->fc_table);
		}
	} else {
		table = fib6_new_table(net, cfg->fc_table);
	}

	if (!table)
		goto out;

	rt = ip6_dst_alloc(net, NULL, (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT, table);

	if (!rt) {
		err = -ENOMEM;
		goto out;
	}

	if (cfg->fc_flags & RTF_EXPIRES)
		rt6_set_expires(rt, jiffies +
				clock_t_to_jiffies(cfg->fc_expires));
	else
		rt6_clean_expires(rt);

	if (cfg->fc_protocol == RTPROT_UNSPEC)
		cfg->fc_protocol = RTPROT_BOOT;
	rt->rt6i_protocol = cfg->fc_protocol;

	addr_type = ipv6_addr_type(&cfg->fc_dst);

	/* Pick the input handler from the destination's address type. */
	if (addr_type & IPV6_ADDR_MULTICAST)
		rt->dst.input = ip6_mc_input;
	else if (cfg->fc_flags & RTF_LOCAL)
		rt->dst.input = ip6_input;
	else
		rt->dst.input = ip6_forward;

	rt->dst.output = ip6_output;

	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
	rt->rt6i_dst.plen = cfg->fc_dst_len;
	if (rt->rt6i_dst.plen == 128)
		rt->dst.flags |= DST_HOST;

#ifdef CONFIG_IPV6_SUBTREES
	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
	rt->rt6i_src.plen = cfg->fc_src_len;
#endif

	rt->rt6i_metric = cfg->fc_metric;

	/* We cannot add true routes via loopback here,
	   they would result in kernel looping; promote them to reject routes
	 */
	if ((cfg->fc_flags & RTF_REJECT) ||
	    (dev && (dev->flags & IFF_LOOPBACK) &&
	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
	     !(cfg->fc_flags & RTF_LOCAL))) {
		/* hold loopback dev/idev if we haven't done so. */
		if (dev != net->loopback_dev) {
			if (dev) {
				dev_put(dev);
				in6_dev_put(idev);
			}
			dev = net->loopback_dev;
			dev_hold(dev);
			idev = in6_dev_get(dev);
			if (!idev) {
				err = -ENODEV;
				goto out;
			}
		}
		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
		/* Each reject flavour gets its own error code and
		 * discard handlers.
		 */
		switch (cfg->fc_type) {
		case RTN_BLACKHOLE:
			rt->dst.error = -EINVAL;
			rt->dst.output = dst_discard_sk;
			rt->dst.input = dst_discard;
			break;
		case RTN_PROHIBIT:
			rt->dst.error = -EACCES;
			rt->dst.output = ip6_pkt_prohibit_out;
			rt->dst.input = ip6_pkt_prohibit;
			break;
		case RTN_THROW:
		default:
			rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
					: -ENETUNREACH;
			rt->dst.output = ip6_pkt_discard_out;
			rt->dst.input = ip6_pkt_discard;
			break;
		}
		goto install_route;
	}

	if (cfg->fc_flags & RTF_GATEWAY) {
		const struct in6_addr *gw_addr;
		int gwa_type;

		gw_addr = &cfg->fc_gateway;

		/* if gw_addr is local we will fail to detect this in case
		 * address is still TENTATIVE (DAD in progress). rt6_lookup()
		 * will return already-added prefix route via interface that
		 * prefix route was assigned to, which might be non-loopback.
		 */
		err = -EINVAL;
		if (ipv6_chk_addr_and_flags(net, gw_addr, NULL, 0, 0))
			goto out;

		rt->rt6i_gateway = *gw_addr;
		gwa_type = ipv6_addr_type(gw_addr);

		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
			struct rt6_info *grt;

			/* IPv6 strictly inhibits using not link-local
			   addresses as nexthop address.
			   Otherwise, router will not able to send redirects.
			   It is very good, but in some (rare!) circumstances
			   (SIT, PtP, NBMA NOARP links) it is handy to allow
			   some exceptions. --ANK
			 */
			if (!(gwa_type & IPV6_ADDR_UNICAST))
				goto out;

			/* The gateway itself must be reachable by a
			 * non-gateway route.
			 */
			grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);

			err = -EHOSTUNREACH;
			if (!grt)
				goto out;
			if (dev) {
				if (dev != grt->dst.dev) {
					ip6_rt_put(grt);
					goto out;
				}
			} else {
				/* No device given: adopt the one the
				 * gateway route uses.
				 */
				dev = grt->dst.dev;
				idev = grt->rt6i_idev;
				dev_hold(dev);
				in6_dev_hold(grt->rt6i_idev);
			}
			if (!(grt->rt6i_flags & RTF_GATEWAY))
				err = 0;
			ip6_rt_put(grt);

			if (err)
				goto out;
		}
		err = -EINVAL;
		if (!dev || (dev->flags & IFF_LOOPBACK))
			goto out;
	}

	err = -ENODEV;
	if (!dev)
		goto out;

	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
		/* The preferred source must be an address on @dev. */
		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
			err = -EINVAL;
			goto out;
		}
		rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
		rt->rt6i_prefsrc.plen = 128;
	} else
		rt->rt6i_prefsrc.plen = 0;

	rt->rt6i_flags = cfg->fc_flags;

install_route:
	rt->dst.dev = dev;
	rt->rt6i_idev = idev;
	rt->rt6i_table = table;

	cfg->fc_nlinfo.nl_net = dev_net(dev);

	err = ip6_convert_metrics(&mxc, cfg);
	if (err)
		goto out;

	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc);

	kfree(mxc.mx);
	return err;
out:
	if (dev)
		dev_put(dev);
	if (idev)
		in6_dev_put(idev);
	if (rt)
		dst_free(&rt->dst);
	return err;
}
1928 
1929 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1930 {
1931 	int err;
1932 	struct fib6_table *table;
1933 	struct net *net = dev_net(rt->dst.dev);
1934 
1935 	if (rt == net->ipv6.ip6_null_entry) {
1936 		err = -ENOENT;
1937 		goto out;
1938 	}
1939 
1940 	table = rt->rt6i_table;
1941 	write_lock_bh(&table->tb6_lock);
1942 	err = fib6_del(rt, info);
1943 	write_unlock_bh(&table->tb6_lock);
1944 
1945 out:
1946 	ip6_rt_put(rt);
1947 	return err;
1948 }
1949 
1950 int ip6_del_rt(struct rt6_info *rt)
1951 {
1952 	struct nl_info info = {
1953 		.nl_net = dev_net(rt->dst.dev),
1954 	};
1955 	return __ip6_del_rt(rt, &info);
1956 }
1957 
/* Delete the first route in @cfg's table matching the prefix, interface,
 * gateway and metric given in @cfg.  Cache clones are only considered
 * when RTF_CACHE is requested.  Returns 0 or -ESRCH when nothing matched.
 */
static int ip6_route_del(struct fib6_config *cfg)
{
	struct fib6_table *table;
	struct fib6_node *fn;
	struct rt6_info *rt;
	int err = -ESRCH;

	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
	if (!table)
		return err;

	read_lock_bh(&table->tb6_lock);

	fn = fib6_locate(&table->tb6_root,
			 &cfg->fc_dst, cfg->fc_dst_len,
			 &cfg->fc_src, cfg->fc_src_len);

	if (fn) {
		for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
			if ((rt->rt6i_flags & RTF_CACHE) &&
			    !(cfg->fc_flags & RTF_CACHE))
				continue;
			if (cfg->fc_ifindex &&
			    (!rt->dst.dev ||
			     rt->dst.dev->ifindex != cfg->fc_ifindex))
				continue;
			if (cfg->fc_flags & RTF_GATEWAY &&
			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
				continue;
			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
				continue;
			/* Hold the entry, drop the read lock, then delete
			 * under the write lock; __ip6_del_rt consumes the
			 * reference taken here.
			 */
			dst_hold(&rt->dst);
			read_unlock_bh(&table->tb6_lock);

			return __ip6_del_rt(rt, &cfg->fc_nlinfo);
		}
	}
	read_unlock_bh(&table->tb6_lock);

	return err;
}
1999 
/* Process a validated-by-routing ICMPv6 Redirect carried in @skb against
 * the route @dst that was used to reach the sender.  Performs the ND
 * option / address-type validation from RFC 4861, updates the neighbour
 * cache for the new first hop, installs a cache route via the new
 * gateway, and removes the superseded cache entry.
 */
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct net *net = dev_net(skb->dev);
	struct netevent_redirect netevent;
	struct rt6_info *rt, *nrt = NULL;
	struct ndisc_options ndopts;
	struct inet6_dev *in6_dev;
	struct neighbour *neigh;
	struct rd_msg *msg;
	int optlen, on_link;
	u8 *lladdr;

	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
	optlen -= sizeof(*msg);

	if (optlen < 0) {
		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
		return;
	}

	msg = (struct rd_msg *)icmp6_hdr(skb);

	if (ipv6_addr_is_multicast(&msg->dest)) {
		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
		return;
	}

	/* Target == destination means the destination is on-link;
	 * otherwise the target must be a link-local router address.
	 */
	on_link = 0;
	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
		on_link = 1;
	} else if (ipv6_addr_type(&msg->target) !=
		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
		return;
	}

	in6_dev = __in6_dev_get(skb->dev);
	if (!in6_dev)
		return;
	/* Routers (forwarding enabled) must ignore redirects. */
	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
		return;

	/* RFC2461 8.1:
	 *	The IP source address of the Redirect MUST be the same as the current
	 *	first-hop router for the specified ICMP Destination Address.
	 */

	if (!ndisc_parse_options(msg->opt, optlen, &ndopts)) {
		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
		return;
	}

	lladdr = NULL;
	if (ndopts.nd_opts_tgt_lladdr) {
		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
					     skb->dev);
		if (!lladdr) {
			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
			return;
		}
	}

	rt = (struct rt6_info *) dst;
	if (rt == net->ipv6.ip6_null_entry) {
		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
		return;
	}

	/* Redirect received -> path was valid.
	 * Look, redirects are sent only in response to data packets,
	 * so that this nexthop apparently is reachable. --ANK
	 */
	dst_confirm(&rt->dst);

	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
	if (!neigh)
		return;

	/*
	 *	We have finally decided to accept it.
	 */

	neigh_update(neigh, lladdr, NUD_STALE,
		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
		     NEIGH_UPDATE_F_OVERRIDE|
		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
				     NEIGH_UPDATE_F_ISROUTER))
		     );

	/* Install a cache route for the destination via the new hop. */
	nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
	if (!nrt)
		goto out;

	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
	if (on_link)
		nrt->rt6i_flags &= ~RTF_GATEWAY;

	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;

	if (ip6_ins_rt(nrt))
		goto out;

	netevent.old = &rt->dst;
	netevent.new = &nrt->dst;
	netevent.daddr = &msg->dest;
	netevent.neigh = neigh;
	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);

	/* The old cache entry is now superseded; ip6_del_rt consumes the
	 * reference taken by dst_clone.
	 */
	if (rt->rt6i_flags & RTF_CACHE) {
		rt = (struct rt6_info *) dst_clone(&rt->dst);
		ip6_del_rt(rt);
	}

out:
	neigh_release(neigh);
}
2116 
2117 /*
2118  *	Misc support functions
2119  */
2120 
2121 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
2122 {
2123 	BUG_ON(from->dst.from);
2124 
2125 	rt->rt6i_flags &= ~RTF_EXPIRES;
2126 	dst_hold(&from->dst);
2127 	rt->dst.from = &from->dst;
2128 	dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
2129 }
2130 
/* Initialize @rt as a copy of @ort, linking it to @ort via rt6_set_from.
 * Note the ordering: rt6i_flags is copied first, then rt6_set_from()
 * clears RTF_EXPIRES on the copy.
 */
static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
{
	rt->dst.input = ort->dst.input;
	rt->dst.output = ort->dst.output;
	rt->rt6i_dst = ort->rt6i_dst;
	rt->dst.error = ort->dst.error;
	rt->rt6i_idev = ort->rt6i_idev;
	if (rt->rt6i_idev)
		in6_dev_hold(rt->rt6i_idev);	/* copy owns its own idev ref */
	rt->dst.lastuse = jiffies;
	rt->rt6i_gateway = ort->rt6i_gateway;
	rt->rt6i_flags = ort->rt6i_flags;
	rt6_set_from(rt, ort);
	rt->rt6i_metric = ort->rt6i_metric;
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->rt6i_src;
#endif
	rt->rt6i_prefsrc = ort->rt6i_prefsrc;
	rt->rt6i_table = ort->rt6i_table;
}
2151 
2152 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Find an RA-learned (RTF_ROUTEINFO) route for @prefix/@prefixlen via
 * @gwaddr on interface @ifindex in the INFO table.  Returns the route
 * with a reference held, or NULL.
 */
static struct rt6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr, int ifindex)
{
	struct fib6_node *fn;
	struct rt6_info *rt = NULL;
	struct fib6_table *table;

	table = fib6_get_table(net, RT6_TABLE_INFO);
	if (!table)
		return NULL;

	read_lock_bh(&table->tb6_lock);
	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0);
	if (!fn)
		goto out;

	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
		if (rt->dst.dev->ifindex != ifindex)
			continue;
		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
			continue;
		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
			continue;
		/* Matched: return it held.  rt is NULL if the loop ends. */
		dst_hold(&rt->dst);
		break;
	}
out:
	read_unlock_bh(&table->tb6_lock);
	return rt;
}
2184 
2185 static struct rt6_info *rt6_add_route_info(struct net *net,
2186 					   const struct in6_addr *prefix, int prefixlen,
2187 					   const struct in6_addr *gwaddr, int ifindex,
2188 					   unsigned int pref)
2189 {
2190 	struct fib6_config cfg = {
2191 		.fc_table	= RT6_TABLE_INFO,
2192 		.fc_metric	= IP6_RT_PRIO_USER,
2193 		.fc_ifindex	= ifindex,
2194 		.fc_dst_len	= prefixlen,
2195 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
2196 				  RTF_UP | RTF_PREF(pref),
2197 		.fc_nlinfo.portid = 0,
2198 		.fc_nlinfo.nlh = NULL,
2199 		.fc_nlinfo.nl_net = net,
2200 	};
2201 
2202 	cfg.fc_dst = *prefix;
2203 	cfg.fc_gateway = *gwaddr;
2204 
2205 	/* We should treat it as a default route if prefix length is 0. */
2206 	if (!prefixlen)
2207 		cfg.fc_flags |= RTF_DEFAULT;
2208 
2209 	ip6_route_add(&cfg);
2210 
2211 	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
2212 }
2213 #endif
2214 
2215 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
2216 {
2217 	struct rt6_info *rt;
2218 	struct fib6_table *table;
2219 
2220 	table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
2221 	if (!table)
2222 		return NULL;
2223 
2224 	read_lock_bh(&table->tb6_lock);
2225 	for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2226 		if (dev == rt->dst.dev &&
2227 		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
2228 		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
2229 			break;
2230 	}
2231 	if (rt)
2232 		dst_hold(&rt->dst);
2233 	read_unlock_bh(&table->tb6_lock);
2234 	return rt;
2235 }
2236 
2237 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
2238 				     struct net_device *dev,
2239 				     unsigned int pref)
2240 {
2241 	struct fib6_config cfg = {
2242 		.fc_table	= RT6_TABLE_DFLT,
2243 		.fc_metric	= IP6_RT_PRIO_USER,
2244 		.fc_ifindex	= dev->ifindex,
2245 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
2246 				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
2247 		.fc_nlinfo.portid = 0,
2248 		.fc_nlinfo.nlh = NULL,
2249 		.fc_nlinfo.nl_net = dev_net(dev),
2250 	};
2251 
2252 	cfg.fc_gateway = *gwaddr;
2253 
2254 	ip6_route_add(&cfg);
2255 
2256 	return rt6_get_dflt_router(gwaddr, dev);
2257 }
2258 
/* Delete every RA-learned router entry (RTF_DEFAULT or RTF_ADDRCONF) in
 * the default table, except on interfaces whose accept_ra sysctl is 2.
 */
void rt6_purge_dflt_routers(struct net *net)
{
	struct rt6_info *rt;
	struct fib6_table *table;

	/* NOTE: Keep consistent with rt6_get_dflt_router */
	table = fib6_get_table(net, RT6_TABLE_DFLT);
	if (!table)
		return;

restart:
	read_lock_bh(&table->tb6_lock);
	for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
		    (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
			/* Hold a reference and drop the read lock before
			 * deleting, then rescan from the top since the
			 * list may have changed while unlocked.
			 */
			dst_hold(&rt->dst);
			read_unlock_bh(&table->tb6_lock);
			ip6_del_rt(rt);
			goto restart;
		}
	}
	read_unlock_bh(&table->tb6_lock);
}
2282 
2283 static void rtmsg_to_fib6_config(struct net *net,
2284 				 struct in6_rtmsg *rtmsg,
2285 				 struct fib6_config *cfg)
2286 {
2287 	memset(cfg, 0, sizeof(*cfg));
2288 
2289 	cfg->fc_table = RT6_TABLE_MAIN;
2290 	cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
2291 	cfg->fc_metric = rtmsg->rtmsg_metric;
2292 	cfg->fc_expires = rtmsg->rtmsg_info;
2293 	cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
2294 	cfg->fc_src_len = rtmsg->rtmsg_src_len;
2295 	cfg->fc_flags = rtmsg->rtmsg_flags;
2296 
2297 	cfg->fc_nlinfo.nl_net = net;
2298 
2299 	cfg->fc_dst = rtmsg->rtmsg_dst;
2300 	cfg->fc_src = rtmsg->rtmsg_src;
2301 	cfg->fc_gateway = rtmsg->rtmsg_gateway;
2302 }
2303 
2304 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
2305 {
2306 	struct fib6_config cfg;
2307 	struct in6_rtmsg rtmsg;
2308 	int err;
2309 
2310 	switch (cmd) {
2311 	case SIOCADDRT:		/* Add a route */
2312 	case SIOCDELRT:		/* Delete a route */
2313 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
2314 			return -EPERM;
2315 		err = copy_from_user(&rtmsg, arg,
2316 				     sizeof(struct in6_rtmsg));
2317 		if (err)
2318 			return -EFAULT;
2319 
2320 		rtmsg_to_fib6_config(net, &rtmsg, &cfg);
2321 
2322 		rtnl_lock();
2323 		switch (cmd) {
2324 		case SIOCADDRT:
2325 			err = ip6_route_add(&cfg);
2326 			break;
2327 		case SIOCDELRT:
2328 			err = ip6_route_del(&cfg);
2329 			break;
2330 		default:
2331 			err = -EINVAL;
2332 		}
2333 		rtnl_unlock();
2334 
2335 		return err;
2336 	}
2337 
2338 	return -EINVAL;
2339 }
2340 
2341 /*
2342  *	Drop the packet on the floor
2343  */
2344 
2345 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2346 {
2347 	int type;
2348 	struct dst_entry *dst = skb_dst(skb);
2349 	switch (ipstats_mib_noroutes) {
2350 	case IPSTATS_MIB_INNOROUTES:
2351 		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2352 		if (type == IPV6_ADDR_ANY) {
2353 			IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2354 				      IPSTATS_MIB_INADDRERRORS);
2355 			break;
2356 		}
2357 		/* FALLTHROUGH */
2358 	case IPSTATS_MIB_OUTNOROUTES:
2359 		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2360 			      ipstats_mib_noroutes);
2361 		break;
2362 	}
2363 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2364 	kfree_skb(skb);
2365 	return 0;
2366 }
2367 
/* dst input handler for blackhole-type routes: drop with "no route". */
static int ip6_pkt_discard(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}
2372 
/* dst output handler for blackhole-type routes: drop with "no route". */
static int ip6_pkt_discard_out(struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
}
2378 
/* dst input handler for prohibit-type routes: drop, admin prohibited. */
static int ip6_pkt_prohibit(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}
2383 
/* dst output handler for prohibit-type routes: drop, admin prohibited. */
static int ip6_pkt_prohibit_out(struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
}
2389 
2390 /*
2391  *	Allocate a dst for local (unicast / anycast) address.
2392  */
2393 
2394 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2395 				    const struct in6_addr *addr,
2396 				    bool anycast)
2397 {
2398 	struct net *net = dev_net(idev->dev);
2399 	struct rt6_info *rt = ip6_dst_alloc(net, net->loopback_dev,
2400 					    DST_NOCOUNT, NULL);
2401 	if (!rt)
2402 		return ERR_PTR(-ENOMEM);
2403 
2404 	in6_dev_hold(idev);
2405 
2406 	rt->dst.flags |= DST_HOST;
2407 	rt->dst.input = ip6_input;
2408 	rt->dst.output = ip6_output;
2409 	rt->rt6i_idev = idev;
2410 
2411 	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2412 	if (anycast)
2413 		rt->rt6i_flags |= RTF_ANYCAST;
2414 	else
2415 		rt->rt6i_flags |= RTF_LOCAL;
2416 
2417 	rt->rt6i_gateway  = *addr;
2418 	rt->rt6i_dst.addr = *addr;
2419 	rt->rt6i_dst.plen = 128;
2420 	rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2421 
2422 	atomic_set(&rt->dst.__refcnt, 1);
2423 
2424 	return rt;
2425 }
2426 
2427 int ip6_route_get_saddr(struct net *net,
2428 			struct rt6_info *rt,
2429 			const struct in6_addr *daddr,
2430 			unsigned int prefs,
2431 			struct in6_addr *saddr)
2432 {
2433 	struct inet6_dev *idev =
2434 		rt ? ip6_dst_idev((struct dst_entry *)rt) : NULL;
2435 	int err = 0;
2436 	if (rt && rt->rt6i_prefsrc.plen)
2437 		*saddr = rt->rt6i_prefsrc.addr;
2438 	else
2439 		err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2440 					 daddr, prefs, saddr);
2441 	return err;
2442 }
2443 
/* remove deleted ip from prefsrc entries */
struct arg_dev_net_ip {
	struct net_device *dev;	/* match routes on this device; NULL = any */
	struct net *net;	/* netns whose null entry must be skipped */
	struct in6_addr *addr;	/* the prefsrc address being removed */
};
2450 
2451 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2452 {
2453 	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2454 	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2455 	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2456 
2457 	if (((void *)rt->dst.dev == dev || !dev) &&
2458 	    rt != net->ipv6.ip6_null_entry &&
2459 	    ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2460 		/* remove prefsrc entry */
2461 		rt->rt6i_prefsrc.plen = 0;
2462 	}
2463 	return 0;
2464 }
2465 
/* Walk the whole fib and clear rt6i_prefsrc on every route that used
 * the (about to be removed) address @ifp as its preferred source.
 */
void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
{
	struct net *net = dev_net(ifp->idev->dev);
	struct arg_dev_net_ip adni = {
		.dev = ifp->idev->dev,
		.net = net,
		.addr = &ifp->addr,
	};
	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
}
2476 
2477 #define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
2478 #define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)
2479 
2480 /* Remove routers and update dst entries when gateway turn into host. */
2481 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
2482 {
2483 	struct in6_addr *gateway = (struct in6_addr *)arg;
2484 
2485 	if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) ||
2486 	     ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) &&
2487 	     ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
2488 		return -1;
2489 	}
2490 	return 0;
2491 }
2492 
/* Drop routes that still use @gateway as a router after it has become a
 * plain host; see fib6_clean_tohost() for the match criteria.
 */
void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
{
	fib6_clean_all(net, fib6_clean_tohost, gateway);
}
2497 
/* Argument bundle for fib6_ifdown(). */
struct arg_dev_net {
	struct net_device *dev;	/* device going away; NULL matches all */
	struct net *net;	/* netns whose null entry is preserved */
};
2502 
2503 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2504 {
2505 	const struct arg_dev_net *adn = arg;
2506 	const struct net_device *dev = adn->dev;
2507 
2508 	if ((rt->dst.dev == dev || !dev) &&
2509 	    rt != adn->net->ipv6.ip6_null_entry)
2510 		return -1;
2511 
2512 	return 0;
2513 }
2514 
/* Remove all routes referencing @dev from the fib and the icmp6 dst
 * cache, and flush its uncached-route list.  @dev == NULL removes
 * routes on every device except the netns null entry.
 */
void rt6_ifdown(struct net *net, struct net_device *dev)
{
	struct arg_dev_net adn = {
		.dev = dev,
		.net = net,
	};

	fib6_clean_all(net, fib6_ifdown, &adn);
	icmp6_clean_all(fib6_ifdown, &adn);
	rt6_uncached_list_flush_dev(net, dev);
}
2526 
/* Argument bundle for rt6_mtu_change_route(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* the new link MTU */
};
2531 
/* fib6_clean_all() callback: fold a device MTU change into each route
 * over that device.  Always returns 0 (never deletes routes).
 */
static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
{
	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
	struct inet6_dev *idev;

	/* In IPv6 pmtu discovery is not optional,
	   so that RTAX_MTU lock cannot disable it.
	   We still use this lock to block changes
	   caused by addrconf/ndisc.
	*/

	idev = __in6_dev_get(arg->dev);
	if (!idev)
		return 0;

	/* For administrative MTU increase, there is no way to discover
	   IPv6 PMTU increase, so PMTU increase should be updated here.
	   Since RFC 1981 doesn't include administrative MTU increase
	   update PMTU increase is a MUST. (i.e. jumbo frame)
	 */
	/*
	   If new MTU is less than route PMTU, this new MTU will be the
	   lowest MTU in the path, update the route PMTU to reflect PMTU
	   decreases; if new MTU is greater than route PMTU, and the
	   old MTU is the lowest MTU in the path, update the route PMTU
	   to reflect the increase. In this case if the other nodes' MTU
	   also have the lowest MTU, TOO BIG MESSAGE will be sent to
	   trigger PMTU discovery.
	 */
	if (rt->dst.dev == arg->dev &&
	    !dst_metric_locked(&rt->dst, RTAX_MTU)) {
		if (rt->rt6i_flags & RTF_CACHE) {
			/* For RTF_CACHE with rt6i_pmtu == 0
			 * (i.e. a redirected route),
			 * the metrics of its rt->dst.from has already
			 * been updated.
			 */
			if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu)
				rt->rt6i_pmtu = arg->mtu;
		} else if (dst_mtu(&rt->dst) >= arg->mtu ||
			   (dst_mtu(&rt->dst) < arg->mtu &&
			    dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
			dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
		}
	}
	return 0;
}
2579 
/* Propagate a device MTU change to all routes over that device. */
void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
{
	struct rt6_mtu_change_arg arg = {
		.dev = dev,
		.mtu = mtu,
	};

	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
}
2589 
2590 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2591 	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2592 	[RTA_OIF]               = { .type = NLA_U32 },
2593 	[RTA_IIF]		= { .type = NLA_U32 },
2594 	[RTA_PRIORITY]          = { .type = NLA_U32 },
2595 	[RTA_METRICS]           = { .type = NLA_NESTED },
2596 	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
2597 	[RTA_PREF]              = { .type = NLA_U8 },
2598 };
2599 
/* Parse an RTM_NEWROUTE/RTM_DELROUTE netlink request into a
 * struct fib6_config.  Returns 0 on success or a negative errno
 * (-EINVAL for malformed/truncated address attributes).
 */
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct fib6_config *cfg)
{
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	unsigned int pref;
	int err;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	rtm = nlmsg_data(nlh);
	memset(cfg, 0, sizeof(*cfg));

	cfg->fc_table = rtm->rtm_table;
	cfg->fc_dst_len = rtm->rtm_dst_len;
	cfg->fc_src_len = rtm->rtm_src_len;
	cfg->fc_flags = RTF_UP;
	cfg->fc_protocol = rtm->rtm_protocol;
	cfg->fc_type = rtm->rtm_type;

	/* Reject-type routes (unreachable/blackhole/prohibit/throw) are
	 * all carried by RTF_REJECT; the specific type is kept in fc_type.
	 */
	if (rtm->rtm_type == RTN_UNREACHABLE ||
	    rtm->rtm_type == RTN_BLACKHOLE ||
	    rtm->rtm_type == RTN_PROHIBIT ||
	    rtm->rtm_type == RTN_THROW)
		cfg->fc_flags |= RTF_REJECT;

	if (rtm->rtm_type == RTN_LOCAL)
		cfg->fc_flags |= RTF_LOCAL;

	if (rtm->rtm_flags & RTM_F_CLONED)
		cfg->fc_flags |= RTF_CACHE;

	cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
	cfg->fc_nlinfo.nlh = nlh;
	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);

	if (tb[RTA_GATEWAY]) {
		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
		cfg->fc_flags |= RTF_GATEWAY;
	}

	if (tb[RTA_DST]) {
		/* Only rtm_dst_len/8 (rounded up) bytes need be present. */
		int plen = (rtm->rtm_dst_len + 7) >> 3;

		if (nla_len(tb[RTA_DST]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
	}

	if (tb[RTA_SRC]) {
		int plen = (rtm->rtm_src_len + 7) >> 3;

		if (nla_len(tb[RTA_SRC]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
	}

	if (tb[RTA_PREFSRC])
		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);

	if (tb[RTA_OIF])
		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_PRIORITY])
		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

	if (tb[RTA_METRICS]) {
		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
	}

	if (tb[RTA_TABLE])
		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

	if (tb[RTA_MULTIPATH]) {
		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
	}

	if (tb[RTA_PREF]) {
		/* Unknown preference values fall back to medium. */
		pref = nla_get_u8(tb[RTA_PREF]);
		if (pref != ICMPV6_ROUTER_PREF_LOW &&
		    pref != ICMPV6_ROUTER_PREF_HIGH)
			pref = ICMPV6_ROUTER_PREF_MEDIUM;
		cfg->fc_flags |= RTF_PREF(pref);
	}

	err = 0;
errout:
	return err;
}
2696 
/* Add (@add != 0) or delete each nexthop carried in cfg->fc_mp, using
 * @cfg as the template for every one.  A failed add triggers a rollback
 * pass that deletes the nexthops already inserted; a failed delete does
 * not stop the loop.  Returns the last error seen, or 0.
 */
static int ip6_route_multipath(struct fib6_config *cfg, int add)
{
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	int remaining;
	int attrlen;
	int err = 0, last_err = 0;

	remaining = cfg->fc_mp_len;
beginning:
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry */
	while (rtnh_ok(rtnh, remaining)) {
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				r_cfg.fc_gateway = nla_get_in6_addr(nla);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
		}
		err = add ? ip6_route_add(&r_cfg) : ip6_route_del(&r_cfg);
		if (err) {
			last_err = err;
			/* If we are trying to remove a route, do not stop the
			 * loop when ip6_route_del() fails (because next hop is
			 * already gone), we should try to remove all next hops.
			 */
			if (add) {
				/* If add fails, we should try to delete all
				 * next hops that have been already added.
				 * Restart in delete mode, limited to the
				 * prefix of the list consumed so far.
				 */
				add = 0;
				remaining = cfg->fc_mp_len - remaining;
				goto beginning;
			}
		}
		/* Because each route is added like a single route we remove
		 * these flags after the first nexthop: if there is a collision,
		 * we have already failed to add the first nexthop:
		 * fib6_add_rt2node() has rejected it; when replacing, old
		 * nexthops have been replaced by first new, the rest should
		 * be added to it.
		 */
		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
						     NLM_F_REPLACE);
		rtnh = rtnh_next(rtnh, &remaining);
	}

	return last_err;
}
2755 
2756 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh)
2757 {
2758 	struct fib6_config cfg;
2759 	int err;
2760 
2761 	err = rtm_to_fib6_config(skb, nlh, &cfg);
2762 	if (err < 0)
2763 		return err;
2764 
2765 	if (cfg.fc_mp)
2766 		return ip6_route_multipath(&cfg, 0);
2767 	else
2768 		return ip6_route_del(&cfg);
2769 }
2770 
2771 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
2772 {
2773 	struct fib6_config cfg;
2774 	int err;
2775 
2776 	err = rtm_to_fib6_config(skb, nlh, &cfg);
2777 	if (err < 0)
2778 		return err;
2779 
2780 	if (cfg.fc_mp)
2781 		return ip6_route_multipath(&cfg, 1);
2782 	else
2783 		return ip6_route_add(&cfg);
2784 }
2785 
/* Worst-case netlink message size produced by rt6_fill_node(); used to
 * size the notification skb in inet6_rt_notify().
 */
static inline size_t rt6_nlmsg_size(void)
{
	return NLMSG_ALIGN(sizeof(struct rtmsg))
	       + nla_total_size(16) /* RTA_SRC */
	       + nla_total_size(16) /* RTA_DST */
	       + nla_total_size(16) /* RTA_GATEWAY */
	       + nla_total_size(16) /* RTA_PREFSRC */
	       + nla_total_size(4) /* RTA_TABLE */
	       + nla_total_size(4) /* RTA_IIF */
	       + nla_total_size(4) /* RTA_OIF */
	       + nla_total_size(4) /* RTA_PRIORITY */
	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
	       + nla_total_size(sizeof(struct rta_cacheinfo))
	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
	       + nla_total_size(1); /* RTA_PREF */
}
2802 
/* Fill one RTM_* route message describing @rt into @skb.
 *
 * @dst/@src: when non-NULL (RTM_GETROUTE replies), these addresses are
 *	reported as full /128 destination/source instead of the route's
 *	own prefix.
 * @iif: input interface index for input-path lookups, 0 otherwise.
 * @prefix: when set, only RTF_PREFIX_RT routes are emitted; others
 *	return 1 ("success, nothing written").
 * @nowait: passed through to ip6mr_get_route() for multicast dsts.
 *
 * Returns 0 on success, 1 when the route was skipped, or -EMSGSIZE when
 * @skb has no room (the partial message is cancelled).
 */
static int rt6_fill_node(struct net *net,
			 struct sk_buff *skb, struct rt6_info *rt,
			 struct in6_addr *dst, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 int prefix, int nowait, unsigned int flags)
{
	u32 metrics[RTAX_MAX];
	struct rtmsg *rtm;
	struct nlmsghdr *nlh;
	long expires;
	u32 table;

	if (prefix) {	/* user wants prefix routes only */
		if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
			/* success since this is not a prefix route */
			return 1;
		}
	}

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt->rt6i_dst.plen;
	rtm->rtm_src_len = rt->rt6i_src.plen;
	rtm->rtm_tos = 0;
	if (rt->rt6i_table)
		table = rt->rt6i_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	rtm->rtm_table = table;
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;
	if (rt->rt6i_flags & RTF_REJECT) {
		/* Map the reject route's dst.error back to the route type
		 * the user originally requested (see rtm_to_fib6_config).
		 */
		switch (rt->dst.error) {
		case -EINVAL:
			rtm->rtm_type = RTN_BLACKHOLE;
			break;
		case -EACCES:
			rtm->rtm_type = RTN_PROHIBIT;
			break;
		case -EAGAIN:
			rtm->rtm_type = RTN_THROW;
			break;
		default:
			rtm->rtm_type = RTN_UNREACHABLE;
			break;
		}
	}
	else if (rt->rt6i_flags & RTF_LOCAL)
		rtm->rtm_type = RTN_LOCAL;
	else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
		rtm->rtm_type = RTN_LOCAL;
	else
		rtm->rtm_type = RTN_UNICAST;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->rt6i_protocol;
	if (rt->rt6i_flags & RTF_DYNAMIC)
		rtm->rtm_protocol = RTPROT_REDIRECT;
	else if (rt->rt6i_flags & RTF_ADDRCONF) {
		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ROUTEINFO))
			rtm->rtm_protocol = RTPROT_RA;
		else
			rtm->rtm_protocol = RTPROT_KERNEL;
	}

	if (rt->rt6i_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	if (dst) {
		if (nla_put_in6_addr(skb, RTA_DST, dst))
			goto nla_put_failure;
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
			goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		if (nla_put_in6_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
		goto nla_put_failure;
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
			int err = ip6mr_get_route(net, skb, rtm, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
				}
			}
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
	} else if (dst) {
		struct in6_addr saddr_buf;
		if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	if (rt->rt6i_prefsrc.plen) {
		struct in6_addr saddr_buf;
		saddr_buf = rt->rt6i_prefsrc.addr;
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	/* Report the effective PMTU (rt6i_pmtu) in place of the stored
	 * RTAX_MTU metric when one is set.
	 */
	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
	if (rt->rt6i_pmtu)
		metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
	if (rtnetlink_put_metrics(skb, metrics) < 0)
		goto nla_put_failure;

	if (rt->rt6i_flags & RTF_GATEWAY) {
		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
			goto nla_put_failure;
	}

	if (rt->dst.dev &&
	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
		goto nla_put_failure;
	if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
		goto nla_put_failure;

	expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;

	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
		goto nla_put_failure;

	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
		goto nla_put_failure;

	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
2955 
2956 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2957 {
2958 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2959 	int prefix;
2960 
2961 	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2962 		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2963 		prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2964 	} else
2965 		prefix = 0;
2966 
2967 	return rt6_fill_node(arg->net,
2968 		     arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2969 		     NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
2970 		     prefix, 0, NLM_F_MULTI);
2971 }
2972 
/* RTM_GETROUTE handler: look up the route for the addresses in the
 * request (input-path lookup when RTA_IIF is given, output-path
 * otherwise) and unicast the resulting route message back to the
 * requester.
 */
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	struct rt6_info *rt;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi6 fl6;
	int err, iif = 0, oif = 0;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	memset(&fl6, 0, sizeof(fl6));

	/* RTA_SRC/RTA_DST must carry full 128-bit addresses here. */
	if (tb[RTA_SRC]) {
		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
			goto errout;

		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
	}

	if (tb[RTA_DST]) {
		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
			goto errout;

		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
	}

	if (tb[RTA_IIF])
		iif = nla_get_u32(tb[RTA_IIF]);

	if (tb[RTA_OIF])
		oif = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_MARK])
		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);

	if (iif) {
		struct net_device *dev;
		int flags = 0;

		dev = __dev_get_by_index(net, iif);
		if (!dev) {
			err = -ENODEV;
			goto errout;
		}

		fl6.flowi6_iif = iif;

		if (!ipv6_addr_any(&fl6.saddr))
			flags |= RT6_LOOKUP_F_HAS_SADDR;

		rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6,
							       flags);
	} else {
		fl6.flowi6_oif = oif;

		rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
	}

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb) {
		/* Drop the reference taken by the lookup above. */
		ip6_rt_put(rt);
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers, this skb can pass
	   through good chunk of routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));

	/* The skb takes over the route reference. */
	skb_dst_set(skb, &rt->dst);

	err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
			    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
			    nlh->nlmsg_seq, 0, 0, 0);
	if (err < 0) {
		kfree_skb(skb);
		goto errout;
	}

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;
}
3063 
/* Broadcast an RTM event for @rt to the RTNLGRP_IPV6_ROUTE multicast
 * group.  On allocation/fill failure the group is flagged with the
 * error via rtnl_set_sk_err() instead.
 */
void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
{
	struct sk_buff *skb;
	struct net *net = info->nl_net;
	u32 seq;
	int err;

	err = -ENOBUFS;
	seq = info->nlh ? info->nlh->nlmsg_seq : 0;

	skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
	if (!skb)
		goto errout;

	err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
				event, info->portid, seq, 0, 0, 0);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
		    info->nlh, gfp_any());
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}
3093 
/* Netdevice notifier: when the loopback device registers in a netns,
 * attach the netns special routes (null and, with multiple tables,
 * prohibit/blackhole) to it.
 */
static int ip6_route_dev_notify(struct notifier_block *this,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
		net->ipv6.ip6_null_entry->dst.dev = dev;
		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
	}

	return NOTIFY_OK;
}
3113 
3114 /*
3115  *	/proc
3116  */
3117 
3118 #ifdef CONFIG_PROC_FS
3119 
/* File operations for /proc/net/ipv6_route (the route table dump);
 * ipv6_route_open is defined elsewhere in this file.
 */
static const struct file_operations ipv6_route_proc_fops = {
	.owner		= THIS_MODULE,
	.open		= ipv6_route_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};
3127 
3128 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
3129 {
3130 	struct net *net = (struct net *)seq->private;
3131 	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
3132 		   net->ipv6.rt6_stats->fib_nodes,
3133 		   net->ipv6.rt6_stats->fib_route_nodes,
3134 		   net->ipv6.rt6_stats->fib_rt_alloc,
3135 		   net->ipv6.rt6_stats->fib_rt_entries,
3136 		   net->ipv6.rt6_stats->fib_rt_cache,
3137 		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
3138 		   net->ipv6.rt6_stats->fib_discarded_routes);
3139 
3140 	return 0;
3141 }
3142 
/* open handler for /proc/net/rt6_stats (per-netns single_open). */
static int rt6_stats_seq_open(struct inode *inode, struct file *file)
{
	return single_open_net(inode, file, rt6_stats_seq_show);
}
3147 
/* File operations for /proc/net/rt6_stats. */
static const struct file_operations rt6_stats_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt6_stats_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = single_release_net,
};
3155 #endif	/* CONFIG_PROC_FS */
3156 
3157 #ifdef CONFIG_SYSCTL
3158 
3159 static
3160 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
3161 			      void __user *buffer, size_t *lenp, loff_t *ppos)
3162 {
3163 	struct net *net;
3164 	int delay;
3165 	if (!write)
3166 		return -EINVAL;
3167 
3168 	net = (struct net *)ctl->extra1;
3169 	delay = net->ipv6.sysctl.flush_delay;
3170 	proc_dointvec(ctl, write, buffer, lenp, ppos);
3171 	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
3172 	return 0;
3173 }
3174 
/* Template for the per-netns /proc/sys/net/ipv6/route/ table.  The
 * .data pointers below reference init_net and are rewired to the
 * netns-private fields by ipv6_route_sysctl_init(); keep the entry
 * order in sync with the table[N] indices used there.
 */
struct ctl_table ipv6_route_table_template[] = {
	{
		.procname	=	"flush",
		.data		=	&init_net.ipv6.sysctl.flush_delay,
		.maxlen		=	sizeof(int),
		.mode		=	0200,
		.proc_handler	=	ipv6_sysctl_rtcache_flush
	},
	{
		.procname	=	"gc_thresh",
		.data		=	&ip6_dst_ops_template.gc_thresh,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"max_size",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_min_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_timeout",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_elasticity",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"mtu_expires",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"min_adv_mss",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_min_interval_ms",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_ms_jiffies,
	},
	{ }
};
3248 
/* Clone the route sysctl template for a new netns and point every entry
 * at the netns-private variable.  The table[N] indices must match the
 * entry order of ipv6_route_table_template.  Returns the table (freed
 * by the caller on teardown) or NULL on allocation failure.
 */
struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);

	if (table) {
		table[0].data = &net->ipv6.sysctl.flush_delay;
		table[0].extra1 = net;	/* consumed by ipv6_sysctl_rtcache_flush */
		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			table[0].procname = NULL;
	}

	return table;
}
3277 #endif
3278 
/* Per-netns route layer setup: clone the dst_ops template and the
 * special route entries (null and, with multiple tables, prohibit and
 * blackhole), then seed the route sysctl defaults.  Uses goto-based
 * cleanup to unwind partial allocations; returns 0 or -ENOMEM.
 */
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_ip6_dst_entries;
	net->ipv6.ip6_null_entry->dst.path =
		(struct dst_entry *)net->ipv6.ip6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.path =
		(struct dst_entry *)net->ipv6.ip6_prohibit_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.path =
		(struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	/* Route/GC sysctl defaults for this netns. */
	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}
3350 
3351 static void __net_exit ip6_route_net_exit(struct net *net)
3352 {
3353 	kfree(net->ipv6.ip6_null_entry);
3354 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3355 	kfree(net->ipv6.ip6_prohibit_entry);
3356 	kfree(net->ipv6.ip6_blk_hole_entry);
3357 #endif
3358 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
3359 }
3360 
3361 static int __net_init ip6_route_net_init_late(struct net *net)
3362 {
3363 #ifdef CONFIG_PROC_FS
3364 	proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
3365 	proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
3366 #endif
3367 	return 0;
3368 }
3369 
3370 static void __net_exit ip6_route_net_exit_late(struct net *net)
3371 {
3372 #ifdef CONFIG_PROC_FS
3373 	remove_proc_entry("ipv6_route", net->proc_net);
3374 	remove_proc_entry("rt6_stats", net->proc_net);
3375 #endif
3376 }
3377 
/* Core per-netns routing state; registered early in ip6_route_init(). */
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};
3382 
3383 static int __net_init ipv6_inetpeer_init(struct net *net)
3384 {
3385 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3386 
3387 	if (!bp)
3388 		return -ENOMEM;
3389 	inet_peer_base_init(bp);
3390 	net->ipv6.peers = bp;
3391 	return 0;
3392 }
3393 
3394 static void __net_exit ipv6_inetpeer_exit(struct net *net)
3395 {
3396 	struct inet_peer_base *bp = net->ipv6.peers;
3397 
3398 	net->ipv6.peers = NULL;
3399 	inetpeer_invalidate_tree(bp);
3400 	kfree(bp);
3401 }
3402 
/* Per-netns inetpeer storage; registered before ip6_route_net_ops. */
static struct pernet_operations ipv6_inetpeer_ops = {
	.init	=	ipv6_inetpeer_init,
	.exit	=	ipv6_inetpeer_exit,
};
3407 
/* Proc-fs entries; registered last so they only appear once the rest of
 * the per-netns routing state is ready.
 */
static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};
3412 
/* Netdevice event notifier for route maintenance on device changes. */
static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = 0,
};
3417 
/* Boot-time initialisation of the IPv6 routing subsystem.
 *
 * Registration order matters and the error labels at the bottom unwind
 * it exactly in reverse; do not reorder steps without updating the
 * unwind chain to match.
 *
 * Returns 0 on success or a negative errno.
 */
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	/* Shared slab cache for rt6_info entries; the blackhole ops reuse
	 * it below once pernet registration has succeeded.
	 */
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	/* Registering of the loopback is done before this portion of code,
	 * the loopback reference in rt6_info will not be taken, do it
	 * manually for init_net */
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #endif
	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	/* Netlink handlers for route add/del/get requests. */
	ret = -ENOBUFS;
	if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
	    __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
	    __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	/* Per-cpu lists for routes not attached to a fib node. */
	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

	/* Error unwinding, reverse order of the registrations above. */
out_register_late_subsys:
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}
3509 
/* Module unload path: undo everything ip6_route_init() set up, in
 * reverse order of registration.  The ordering mirrors the error
 * unwind chain in ip6_route_init() and must be kept in sync with it.
 */
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}
3522