xref: /openbmc/linux/net/ipv6/route.c (revision bbde9fc1824aab58bc78c084163007dd6c03fe5b)
1 /*
2  *	Linux INET6 implementation
3  *	FIB front-end.
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13 
14 /*	Changes:
15  *
16  *	YOSHIFUJI Hideaki @USAGI
17  *		reworked default router selection.
18  *		- respect outgoing interface
19  *		- select from (probably) reachable routers (i.e.
20  *		routers in REACHABLE, STALE, DELAY or PROBE states).
21  *		- always select the same router if it is (probably)
22  *		reachable.  otherwise, round-robin the list.
23  *	Ville Nuorvala
24  *		Fixed routing subtrees.
25  */
26 
27 #define pr_fmt(fmt) "IPv6: " fmt
28 
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <net/net_namespace.h>
48 #include <net/snmp.h>
49 #include <net/ipv6.h>
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
52 #include <net/ndisc.h>
53 #include <net/addrconf.h>
54 #include <net/tcp.h>
55 #include <linux/rtnetlink.h>
56 #include <net/dst.h>
57 #include <net/xfrm.h>
58 #include <net/netevent.h>
59 #include <net/netlink.h>
60 #include <net/nexthop.h>
61 #include <net/lwtunnel.h>
62 
63 #include <asm/uaccess.h>
64 
65 #ifdef CONFIG_SYSCTL
66 #include <linux/sysctl.h>
67 #endif
68 
/*
 * Result codes produced by rt6_check_neigh() and passed on by
 * rt6_score_route().  Negative values are failures: find_match() skips
 * the route on RT6_NUD_FAIL_HARD and requests round-robin on
 * RT6_NUD_FAIL_DO_RR.
 */
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_SUCCEED = 1
};
75 
/*
 * Forward declarations of static helpers that are referenced before
 * their definitions (several are wired into the dst_ops tables and the
 * route entry templates in this file).
 */
static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
static unsigned int	 ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void		ip6_dst_destroy(struct dst_entry *);
static void		ip6_dst_ifdown(struct dst_entry *,
				       struct net_device *dev, int how);
static int		 ip6_dst_gc(struct dst_ops *ops);

/* Packet handlers used as .input/.output of the reject route templates. */
static int		ip6_pkt_discard(struct sk_buff *skb);
static int		ip6_pkt_discard_out(struct sock *sk, struct sk_buff *skb);
static int		ip6_pkt_prohibit(struct sk_buff *skb);
static int		ip6_pkt_prohibit_out(struct sock *sk, struct sk_buff *skb);
static void		ip6_link_failure(struct sk_buff *skb);
static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static void		rt6_dst_from_metrics_check(struct rt6_info *rt);
static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
97 
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Helpers used by rt6_route_rcv() to create and to find the route that
 * corresponds to a (prefix, gateway, interface) triple.
 */
static struct rt6_info *rt6_add_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr, int ifindex,
					   unsigned int pref);
static struct rt6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr, int ifindex);
#endif
107 
/*
 * List of rt6_info entries linked through rt6i_uncached.  Entries are
 * added by rt6_uncached_list_add(); ip6_pol_route() uses it for
 * RTF_CACHE clones that are not owned by the fib6 tree.
 */
struct uncached_list {
	spinlock_t		lock;	/* taken (BH-safe) around every access to head */
	struct list_head	head;	/* rt6_info entries on this list */
};

/* One uncached list per possible CPU. */
static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
114 
115 static void rt6_uncached_list_add(struct rt6_info *rt)
116 {
117 	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
118 
119 	rt->dst.flags |= DST_NOCACHE;
120 	rt->rt6i_uncached_list = ul;
121 
122 	spin_lock_bh(&ul->lock);
123 	list_add_tail(&rt->rt6i_uncached, &ul->head);
124 	spin_unlock_bh(&ul->lock);
125 }
126 
127 static void rt6_uncached_list_del(struct rt6_info *rt)
128 {
129 	if (!list_empty(&rt->rt6i_uncached)) {
130 		struct uncached_list *ul = rt->rt6i_uncached_list;
131 
132 		spin_lock_bh(&ul->lock);
133 		list_del(&rt->rt6i_uncached);
134 		spin_unlock_bh(&ul->lock);
135 	}
136 }
137 
138 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
139 {
140 	struct net_device *loopback_dev = net->loopback_dev;
141 	int cpu;
142 
143 	for_each_possible_cpu(cpu) {
144 		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
145 		struct rt6_info *rt;
146 
147 		spin_lock_bh(&ul->lock);
148 		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
149 			struct inet6_dev *rt_idev = rt->rt6i_idev;
150 			struct net_device *rt_dev = rt->dst.dev;
151 
152 			if (rt_idev && (rt_idev->dev == dev || !dev) &&
153 			    rt_idev->dev != loopback_dev) {
154 				rt->rt6i_idev = in6_dev_get(loopback_dev);
155 				in6_dev_put(rt_idev);
156 			}
157 
158 			if (rt_dev && (rt_dev == dev || !dev) &&
159 			    rt_dev != loopback_dev) {
160 				rt->dst.dev = loopback_dev;
161 				dev_hold(rt->dst.dev);
162 				dev_put(rt_dev);
163 			}
164 		}
165 		spin_unlock_bh(&ul->lock);
166 	}
167 }
168 
169 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
170 {
171 	return dst_metrics_write_ptr(rt->dst.from);
172 }
173 
174 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
175 {
176 	struct rt6_info *rt = (struct rt6_info *)dst;
177 
178 	if (rt->rt6i_flags & RTF_PCPU)
179 		return rt6_pcpu_cow_metrics(rt);
180 	else if (rt->rt6i_flags & RTF_CACHE)
181 		return NULL;
182 	else
183 		return dst_cow_metrics_generic(dst, old);
184 }
185 
186 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
187 					     struct sk_buff *skb,
188 					     const void *daddr)
189 {
190 	struct in6_addr *p = &rt->rt6i_gateway;
191 
192 	if (!ipv6_addr_any(p))
193 		return (const void *) p;
194 	else if (skb)
195 		return &ipv6_hdr(skb)->daddr;
196 	return daddr;
197 }
198 
199 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
200 					  struct sk_buff *skb,
201 					  const void *daddr)
202 {
203 	struct rt6_info *rt = (struct rt6_info *) dst;
204 	struct neighbour *n;
205 
206 	daddr = choose_neigh_daddr(rt, skb, daddr);
207 	n = __ipv6_neigh_lookup(dst->dev, daddr);
208 	if (n)
209 		return n;
210 	return neigh_create(&nd_tbl, daddr, dst->dev);
211 }
212 
/*
 * dst_ops for regular IPv6 routes.
 * NOTE(review): named a template; routes are allocated from
 * net->ipv6.ip6_dst_ops, presumably a per-namespace copy of this table
 * made outside this section - confirm.
 */
static struct dst_ops ip6_dst_ops_template = {
	.family			=	AF_INET6,
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
	.default_advmss		=	ip6_default_advmss,
	.mtu			=	ip6_mtu,
	.cow_metrics		=	ipv6_cow_metrics,
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
	.redirect		=	rt6_do_redirect,
	.local_out		=	__ip6_local_out,
	.neigh_lookup		=	ip6_neigh_lookup,
};
230 
231 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
232 {
233 	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
234 
235 	return mtu ? : dst->dev->mtu;
236 }
237 
/* No-op .update_pmtu handler installed in ip6_dst_blackhole_ops. */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}
242 
/* No-op .redirect handler installed in ip6_dst_blackhole_ops. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}
247 
/*
 * .cow_metrics handler installed in ip6_dst_blackhole_ops: always
 * returns NULL, i.e. never hands out a writable metrics array.
 */
static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
					 unsigned long old)
{
	return NULL;
}
253 
/*
 * dst_ops for the routes allocated by ip6_blackhole_route().  It shares
 * destroy/check/advmss/neigh_lookup with the regular ops but uses the
 * blackhole variants for mtu, update_pmtu, redirect and cow_metrics.
 */
static struct dst_ops ip6_dst_blackhole_ops = {
	.family			=	AF_INET6,
	.destroy		=	ip6_dst_destroy,
	.check			=	ip6_dst_check,
	.mtu			=	ip6_blackhole_mtu,
	.default_advmss		=	ip6_default_advmss,
	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
	.redirect		=	ip6_rt_blackhole_redirect,
	.cow_metrics		=	ip6_rt_blackhole_cow_metrics,
	.neigh_lookup		=	ip6_neigh_lookup,
};
265 
/*
 * Read-only metrics array of RTAX_MAX entries; only the hop limit slot
 * is spelled out (as 0), the remaining slots are implicitly zero.
 */
static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};
269 
/*
 * Template for the "no route" entry: a reject route without next hop
 * that reports -ENETUNREACH and discards packets in both directions.
 * The lookup code in this file uses net->ipv6.ip6_null_entry as its
 * "nothing matched" result.
 * NOTE(review): that per-namespace entry is presumably built from this
 * template outside this section - confirm.
 */
static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol  = RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,	/* largest possible metric */
	.rt6i_ref	= ATOMIC_INIT(1),
};
284 
285 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
286 
/*
 * Template for the "prohibit" entry (CONFIG_IPV6_MULTIPLE_TABLES only):
 * a reject route without next hop that reports -EACCES and hands
 * packets to the ip6_pkt_prohibit*() handlers.
 */
static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol  = RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,	/* largest possible metric */
	.rt6i_ref	= ATOMIC_INIT(1),
};
301 
/*
 * Template for the "blackhole" entry (CONFIG_IPV6_MULTIPLE_TABLES
 * only): a reject route without next hop that reports -EINVAL and
 * hands packets to dst_discard() / dst_discard_sk().
 */
static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_sk,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol  = RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,	/* largest possible metric */
	.rt6i_ref	= ATOMIC_INIT(1),
};
316 
317 #endif
318 
319 /* allocate dst with ip6_dst_ops */
320 static struct rt6_info *__ip6_dst_alloc(struct net *net,
321 					struct net_device *dev,
322 					int flags,
323 					struct fib6_table *table)
324 {
325 	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
326 					0, DST_OBSOLETE_FORCE_CHK, flags);
327 
328 	if (rt) {
329 		struct dst_entry *dst = &rt->dst;
330 
331 		memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
332 		INIT_LIST_HEAD(&rt->rt6i_siblings);
333 		INIT_LIST_HEAD(&rt->rt6i_uncached);
334 	}
335 	return rt;
336 }
337 
338 static struct rt6_info *ip6_dst_alloc(struct net *net,
339 				      struct net_device *dev,
340 				      int flags,
341 				      struct fib6_table *table)
342 {
343 	struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags, table);
344 
345 	if (rt) {
346 		rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
347 		if (rt->rt6i_pcpu) {
348 			int cpu;
349 
350 			for_each_possible_cpu(cpu) {
351 				struct rt6_info **p;
352 
353 				p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
354 				/* no one shares rt */
355 				*p =  NULL;
356 			}
357 		} else {
358 			dst_destroy((struct dst_entry *)rt);
359 			return NULL;
360 		}
361 	}
362 
363 	return rt;
364 }
365 
366 static void ip6_dst_destroy(struct dst_entry *dst)
367 {
368 	struct rt6_info *rt = (struct rt6_info *)dst;
369 	struct dst_entry *from = dst->from;
370 	struct inet6_dev *idev;
371 
372 	dst_destroy_metrics_generic(dst);
373 	free_percpu(rt->rt6i_pcpu);
374 	rt6_uncached_list_del(rt);
375 
376 	idev = rt->rt6i_idev;
377 	if (idev) {
378 		rt->rt6i_idev = NULL;
379 		in6_dev_put(idev);
380 	}
381 
382 	dst->from = NULL;
383 	dst_release(from);
384 }
385 
386 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
387 			   int how)
388 {
389 	struct rt6_info *rt = (struct rt6_info *)dst;
390 	struct inet6_dev *idev = rt->rt6i_idev;
391 	struct net_device *loopback_dev =
392 		dev_net(dev)->loopback_dev;
393 
394 	if (dev != loopback_dev) {
395 		if (idev && idev->dev == dev) {
396 			struct inet6_dev *loopback_idev =
397 				in6_dev_get(loopback_dev);
398 			if (loopback_idev) {
399 				rt->rt6i_idev = loopback_idev;
400 				in6_dev_put(idev);
401 			}
402 		}
403 	}
404 }
405 
406 static bool rt6_check_expired(const struct rt6_info *rt)
407 {
408 	if (rt->rt6i_flags & RTF_EXPIRES) {
409 		if (time_after(jiffies, rt->dst.expires))
410 			return true;
411 	} else if (rt->dst.from) {
412 		return rt6_check_expired((struct rt6_info *) rt->dst.from);
413 	}
414 	return false;
415 }
416 
417 /* Multipath route selection:
418  *   Hash based function using packet header and flowlabel.
419  * Adapted from fib_info_hashfn()
420  */
421 static int rt6_info_hash_nhsfn(unsigned int candidate_count,
422 			       const struct flowi6 *fl6)
423 {
424 	unsigned int val = fl6->flowi6_proto;
425 
426 	val ^= ipv6_addr_hash(&fl6->daddr);
427 	val ^= ipv6_addr_hash(&fl6->saddr);
428 
429 	/* Work only if this not encapsulated */
430 	switch (fl6->flowi6_proto) {
431 	case IPPROTO_UDP:
432 	case IPPROTO_TCP:
433 	case IPPROTO_SCTP:
434 		val ^= (__force u16)fl6->fl6_sport;
435 		val ^= (__force u16)fl6->fl6_dport;
436 		break;
437 
438 	case IPPROTO_ICMPV6:
439 		val ^= (__force u16)fl6->fl6_icmp_type;
440 		val ^= (__force u16)fl6->fl6_icmp_code;
441 		break;
442 	}
443 	/* RFC6438 recommands to use flowlabel */
444 	val ^= (__force u32)fl6->flowlabel;
445 
446 	/* Perhaps, we need to tune, this function? */
447 	val = val ^ (val >> 7) ^ (val >> 12);
448 	return val % candidate_count;
449 }
450 
451 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
452 					     struct flowi6 *fl6, int oif,
453 					     int strict)
454 {
455 	struct rt6_info *sibling, *next_sibling;
456 	int route_choosen;
457 
458 	route_choosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6);
459 	/* Don't change the route, if route_choosen == 0
460 	 * (siblings does not include ourself)
461 	 */
462 	if (route_choosen)
463 		list_for_each_entry_safe(sibling, next_sibling,
464 				&match->rt6i_siblings, rt6i_siblings) {
465 			route_choosen--;
466 			if (route_choosen == 0) {
467 				if (rt6_score_route(sibling, oif, strict) < 0)
468 					break;
469 				match = sibling;
470 				break;
471 			}
472 		}
473 	return match;
474 }
475 
476 /*
477  *	Route lookup. Any table->tb6_lock is implied.
478  */
479 
480 static inline struct rt6_info *rt6_device_match(struct net *net,
481 						    struct rt6_info *rt,
482 						    const struct in6_addr *saddr,
483 						    int oif,
484 						    int flags)
485 {
486 	struct rt6_info *local = NULL;
487 	struct rt6_info *sprt;
488 
489 	if (!oif && ipv6_addr_any(saddr))
490 		goto out;
491 
492 	for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
493 		struct net_device *dev = sprt->dst.dev;
494 
495 		if (oif) {
496 			if (dev->ifindex == oif)
497 				return sprt;
498 			if (dev->flags & IFF_LOOPBACK) {
499 				if (!sprt->rt6i_idev ||
500 				    sprt->rt6i_idev->dev->ifindex != oif) {
501 					if (flags & RT6_LOOKUP_F_IFACE && oif)
502 						continue;
503 					if (local && (!oif ||
504 						      local->rt6i_idev->dev->ifindex == oif))
505 						continue;
506 				}
507 				local = sprt;
508 			}
509 		} else {
510 			if (ipv6_chk_addr(net, saddr, dev,
511 					  flags & RT6_LOOKUP_F_IFACE))
512 				return sprt;
513 		}
514 	}
515 
516 	if (oif) {
517 		if (local)
518 			return local;
519 
520 		if (flags & RT6_LOOKUP_F_IFACE)
521 			return net->ipv6.ip6_null_entry;
522 	}
523 out:
524 	return rt;
525 }
526 
527 #ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Deferred router probe request: queued by rt6_probe() and consumed
 * (and freed) by rt6_probe_deferred().
 */
struct __rt6_probe_work {
	struct work_struct work;	/* work item run via schedule_work() */
	struct in6_addr target;		/* gateway address to solicit */
	struct net_device *dev;		/* device to send on; a reference is held */
};
533 
534 static void rt6_probe_deferred(struct work_struct *w)
535 {
536 	struct in6_addr mcaddr;
537 	struct __rt6_probe_work *work =
538 		container_of(w, struct __rt6_probe_work, work);
539 
540 	addrconf_addr_solict_mult(&work->target, &mcaddr);
541 	ndisc_send_ns(work->dev, NULL, &work->target, &mcaddr, NULL);
542 	dev_put(work->dev);
543 	kfree(work);
544 }
545 
546 static void rt6_probe(struct rt6_info *rt)
547 {
548 	struct __rt6_probe_work *work;
549 	struct neighbour *neigh;
550 	/*
551 	 * Okay, this does not seem to be appropriate
552 	 * for now, however, we need to check if it
553 	 * is really so; aka Router Reachability Probing.
554 	 *
555 	 * Router Reachability Probe MUST be rate-limited
556 	 * to no more than one per minute.
557 	 */
558 	if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
559 		return;
560 	rcu_read_lock_bh();
561 	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
562 	if (neigh) {
563 		if (neigh->nud_state & NUD_VALID)
564 			goto out;
565 
566 		work = NULL;
567 		write_lock(&neigh->lock);
568 		if (!(neigh->nud_state & NUD_VALID) &&
569 		    time_after(jiffies,
570 			       neigh->updated +
571 			       rt->rt6i_idev->cnf.rtr_probe_interval)) {
572 			work = kmalloc(sizeof(*work), GFP_ATOMIC);
573 			if (work)
574 				__neigh_set_probe_once(neigh);
575 		}
576 		write_unlock(&neigh->lock);
577 	} else {
578 		work = kmalloc(sizeof(*work), GFP_ATOMIC);
579 	}
580 
581 	if (work) {
582 		INIT_WORK(&work->work, rt6_probe_deferred);
583 		work->target = rt->rt6i_gateway;
584 		dev_hold(rt->dst.dev);
585 		work->dev = rt->dst.dev;
586 		schedule_work(&work->work);
587 	}
588 
589 out:
590 	rcu_read_unlock_bh();
591 }
592 #else
/* Without CONFIG_IPV6_ROUTER_PREF router probing is compiled out. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
596 #endif
597 
598 /*
599  * Default Router Selection (RFC 2461 6.3.6)
600  */
601 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
602 {
603 	struct net_device *dev = rt->dst.dev;
604 	if (!oif || dev->ifindex == oif)
605 		return 2;
606 	if ((dev->flags & IFF_LOOPBACK) &&
607 	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
608 		return 1;
609 	return 0;
610 }
611 
612 static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
613 {
614 	struct neighbour *neigh;
615 	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
616 
617 	if (rt->rt6i_flags & RTF_NONEXTHOP ||
618 	    !(rt->rt6i_flags & RTF_GATEWAY))
619 		return RT6_NUD_SUCCEED;
620 
621 	rcu_read_lock_bh();
622 	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
623 	if (neigh) {
624 		read_lock(&neigh->lock);
625 		if (neigh->nud_state & NUD_VALID)
626 			ret = RT6_NUD_SUCCEED;
627 #ifdef CONFIG_IPV6_ROUTER_PREF
628 		else if (!(neigh->nud_state & NUD_FAILED))
629 			ret = RT6_NUD_SUCCEED;
630 		else
631 			ret = RT6_NUD_FAIL_PROBE;
632 #endif
633 		read_unlock(&neigh->lock);
634 	} else {
635 		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
636 		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
637 	}
638 	rcu_read_unlock_bh();
639 
640 	return ret;
641 }
642 
643 static int rt6_score_route(struct rt6_info *rt, int oif,
644 			   int strict)
645 {
646 	int m;
647 
648 	m = rt6_check_dev(rt, oif);
649 	if (!m && (strict & RT6_LOOKUP_F_IFACE))
650 		return RT6_NUD_FAIL_HARD;
651 #ifdef CONFIG_IPV6_ROUTER_PREF
652 	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
653 #endif
654 	if (strict & RT6_LOOKUP_F_REACHABLE) {
655 		int n = rt6_check_neigh(rt);
656 		if (n < 0)
657 			return n;
658 	}
659 	return m;
660 }
661 
662 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
663 				   int *mpri, struct rt6_info *match,
664 				   bool *do_rr)
665 {
666 	int m;
667 	bool match_do_rr = false;
668 
669 	if (rt6_check_expired(rt))
670 		goto out;
671 
672 	m = rt6_score_route(rt, oif, strict);
673 	if (m == RT6_NUD_FAIL_DO_RR) {
674 		match_do_rr = true;
675 		m = 0; /* lowest valid score */
676 	} else if (m == RT6_NUD_FAIL_HARD) {
677 		goto out;
678 	}
679 
680 	if (strict & RT6_LOOKUP_F_REACHABLE)
681 		rt6_probe(rt);
682 
683 	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
684 	if (m > *mpri) {
685 		*do_rr = match_do_rr;
686 		*mpri = m;
687 		match = rt;
688 	}
689 out:
690 	return match;
691 }
692 
693 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
694 				     struct rt6_info *rr_head,
695 				     u32 metric, int oif, int strict,
696 				     bool *do_rr)
697 {
698 	struct rt6_info *rt, *match, *cont;
699 	int mpri = -1;
700 
701 	match = NULL;
702 	cont = NULL;
703 	for (rt = rr_head; rt; rt = rt->dst.rt6_next) {
704 		if (rt->rt6i_metric != metric) {
705 			cont = rt;
706 			break;
707 		}
708 
709 		match = find_match(rt, oif, strict, &mpri, match, do_rr);
710 	}
711 
712 	for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) {
713 		if (rt->rt6i_metric != metric) {
714 			cont = rt;
715 			break;
716 		}
717 
718 		match = find_match(rt, oif, strict, &mpri, match, do_rr);
719 	}
720 
721 	if (match || !cont)
722 		return match;
723 
724 	for (rt = cont; rt; rt = rt->dst.rt6_next)
725 		match = find_match(rt, oif, strict, &mpri, match, do_rr);
726 
727 	return match;
728 }
729 
730 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
731 {
732 	struct rt6_info *match, *rt0;
733 	struct net *net;
734 	bool do_rr = false;
735 
736 	rt0 = fn->rr_ptr;
737 	if (!rt0)
738 		fn->rr_ptr = rt0 = fn->leaf;
739 
740 	match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict,
741 			     &do_rr);
742 
743 	if (do_rr) {
744 		struct rt6_info *next = rt0->dst.rt6_next;
745 
746 		/* no entries matched; do round-robin */
747 		if (!next || next->rt6i_metric != rt0->rt6i_metric)
748 			next = fn->leaf;
749 
750 		if (next != rt0)
751 			fn->rr_ptr = next;
752 	}
753 
754 	net = dev_net(rt0->dst.dev);
755 	return match ? match : net->ipv6.ip6_null_entry;
756 }
757 
758 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
759 {
760 	return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
761 }
762 
763 #ifdef CONFIG_IPV6_ROUTE_INFO
764 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
765 		  const struct in6_addr *gwaddr)
766 {
767 	struct net *net = dev_net(dev);
768 	struct route_info *rinfo = (struct route_info *) opt;
769 	struct in6_addr prefix_buf, *prefix;
770 	unsigned int pref;
771 	unsigned long lifetime;
772 	struct rt6_info *rt;
773 
774 	if (len < sizeof(struct route_info)) {
775 		return -EINVAL;
776 	}
777 
778 	/* Sanity check for prefix_len and length */
779 	if (rinfo->length > 3) {
780 		return -EINVAL;
781 	} else if (rinfo->prefix_len > 128) {
782 		return -EINVAL;
783 	} else if (rinfo->prefix_len > 64) {
784 		if (rinfo->length < 2) {
785 			return -EINVAL;
786 		}
787 	} else if (rinfo->prefix_len > 0) {
788 		if (rinfo->length < 1) {
789 			return -EINVAL;
790 		}
791 	}
792 
793 	pref = rinfo->route_pref;
794 	if (pref == ICMPV6_ROUTER_PREF_INVALID)
795 		return -EINVAL;
796 
797 	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
798 
799 	if (rinfo->length == 3)
800 		prefix = (struct in6_addr *)rinfo->prefix;
801 	else {
802 		/* this function is safe */
803 		ipv6_addr_prefix(&prefix_buf,
804 				 (struct in6_addr *)rinfo->prefix,
805 				 rinfo->prefix_len);
806 		prefix = &prefix_buf;
807 	}
808 
809 	if (rinfo->prefix_len == 0)
810 		rt = rt6_get_dflt_router(gwaddr, dev);
811 	else
812 		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
813 					gwaddr, dev->ifindex);
814 
815 	if (rt && !lifetime) {
816 		ip6_del_rt(rt);
817 		rt = NULL;
818 	}
819 
820 	if (!rt && lifetime)
821 		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
822 					pref);
823 	else if (rt)
824 		rt->rt6i_flags = RTF_ROUTEINFO |
825 				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
826 
827 	if (rt) {
828 		if (!addrconf_finite_timeout(lifetime))
829 			rt6_clean_expires(rt);
830 		else
831 			rt6_set_expires(rt, jiffies + HZ * lifetime);
832 
833 		ip6_rt_put(rt);
834 	}
835 	return 0;
836 }
837 #endif
838 
839 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
840 					struct in6_addr *saddr)
841 {
842 	struct fib6_node *pn;
843 	while (1) {
844 		if (fn->fn_flags & RTN_TL_ROOT)
845 			return NULL;
846 		pn = fn->parent;
847 		if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)
848 			fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr);
849 		else
850 			fn = pn;
851 		if (fn->fn_flags & RTN_RTINFO)
852 			return fn;
853 	}
854 }
855 
856 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
857 					     struct fib6_table *table,
858 					     struct flowi6 *fl6, int flags)
859 {
860 	struct fib6_node *fn;
861 	struct rt6_info *rt;
862 
863 	read_lock_bh(&table->tb6_lock);
864 	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
865 restart:
866 	rt = fn->leaf;
867 	rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
868 	if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
869 		rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
870 	if (rt == net->ipv6.ip6_null_entry) {
871 		fn = fib6_backtrack(fn, &fl6->saddr);
872 		if (fn)
873 			goto restart;
874 	}
875 	dst_use(&rt->dst, jiffies);
876 	read_unlock_bh(&table->tb6_lock);
877 	return rt;
878 
879 }
880 
/*
 * Look up a route for @fl6 in @net.  Delegates to fib6_rule_lookup(),
 * passing ip6_pol_route_lookup() as the lookup callback, and returns
 * its result unchanged.
 */
struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				    int flags)
{
	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);
887 
888 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
889 			    const struct in6_addr *saddr, int oif, int strict)
890 {
891 	struct flowi6 fl6 = {
892 		.flowi6_oif = oif,
893 		.daddr = *daddr,
894 	};
895 	struct dst_entry *dst;
896 	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
897 
898 	if (saddr) {
899 		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
900 		flags |= RT6_LOOKUP_F_HAS_SADDR;
901 	}
902 
903 	dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
904 	if (dst->error == 0)
905 		return (struct rt6_info *) dst;
906 
907 	dst_release(dst);
908 
909 	return NULL;
910 }
911 EXPORT_SYMBOL(rt6_lookup);
912 
913 /* ip6_ins_rt is called with FREE table->tb6_lock.
914    It takes new route entry, the addition fails by any reason the
915    route is freed. In any case, if caller does not hold it, it may
916    be destroyed.
917  */
918 
919 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
920 			struct mx6_config *mxc)
921 {
922 	int err;
923 	struct fib6_table *table;
924 
925 	table = rt->rt6i_table;
926 	write_lock_bh(&table->tb6_lock);
927 	err = fib6_add(&table->tb6_root, rt, info, mxc);
928 	write_unlock_bh(&table->tb6_lock);
929 
930 	return err;
931 }
932 
/*
 * Insert @rt into its table using default insertion parameters: an
 * nl_info that only names the route's network namespace and an
 * mx6_config without metrics.  Returns the result of __ip6_ins_rt().
 */
int ip6_ins_rt(struct rt6_info *rt)
{
	struct nl_info info = {	.nl_net = dev_net(rt->dst.dev), };
	struct mx6_config mxc = { .mx = NULL, };

	return __ip6_ins_rt(rt, &info, &mxc);
}
940 
941 static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
942 					   const struct in6_addr *daddr,
943 					   const struct in6_addr *saddr)
944 {
945 	struct rt6_info *rt;
946 
947 	/*
948 	 *	Clone the route.
949 	 */
950 
951 	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
952 		ort = (struct rt6_info *)ort->dst.from;
953 
954 	rt = __ip6_dst_alloc(dev_net(ort->dst.dev), ort->dst.dev,
955 			     0, ort->rt6i_table);
956 
957 	if (!rt)
958 		return NULL;
959 
960 	ip6_rt_copy_init(rt, ort);
961 	rt->rt6i_flags |= RTF_CACHE;
962 	rt->rt6i_metric = 0;
963 	rt->dst.flags |= DST_HOST;
964 	rt->rt6i_dst.addr = *daddr;
965 	rt->rt6i_dst.plen = 128;
966 
967 	if (!rt6_is_gw_or_nonexthop(ort)) {
968 		if (ort->rt6i_dst.plen != 128 &&
969 		    ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
970 			rt->rt6i_flags |= RTF_ANYCAST;
971 #ifdef CONFIG_IPV6_SUBTREES
972 		if (rt->rt6i_src.plen && saddr) {
973 			rt->rt6i_src.addr = *saddr;
974 			rt->rt6i_src.plen = 128;
975 		}
976 #endif
977 	}
978 
979 	return rt;
980 }
981 
982 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
983 {
984 	struct rt6_info *pcpu_rt;
985 
986 	pcpu_rt = __ip6_dst_alloc(dev_net(rt->dst.dev),
987 				  rt->dst.dev, rt->dst.flags,
988 				  rt->rt6i_table);
989 
990 	if (!pcpu_rt)
991 		return NULL;
992 	ip6_rt_copy_init(pcpu_rt, rt);
993 	pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
994 	pcpu_rt->rt6i_flags |= RTF_PCPU;
995 	return pcpu_rt;
996 }
997 
998 /* It should be called with read_lock_bh(&tb6_lock) acquired */
999 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
1000 {
1001 	struct rt6_info *pcpu_rt, *prev, **p;
1002 
1003 	p = this_cpu_ptr(rt->rt6i_pcpu);
1004 	pcpu_rt = *p;
1005 
1006 	if (pcpu_rt)
1007 		goto done;
1008 
1009 	pcpu_rt = ip6_rt_pcpu_alloc(rt);
1010 	if (!pcpu_rt) {
1011 		struct net *net = dev_net(rt->dst.dev);
1012 
1013 		pcpu_rt = net->ipv6.ip6_null_entry;
1014 		goto done;
1015 	}
1016 
1017 	prev = cmpxchg(p, NULL, pcpu_rt);
1018 	if (prev) {
1019 		/* If someone did it before us, return prev instead */
1020 		dst_destroy(&pcpu_rt->dst);
1021 		pcpu_rt = prev;
1022 	}
1023 
1024 done:
1025 	dst_hold(&pcpu_rt->dst);
1026 	rt6_dst_from_metrics_check(pcpu_rt);
1027 	return pcpu_rt;
1028 }
1029 
/*
 * ip6_pol_route - per-table route resolution shared by the input and
 * output paths.
 * @net:   network namespace
 * @table: table to search
 * @oif:   interface index the route should match (the callers in this
 *         file pass fl6->flowi6_iif or fl6->flowi6_oif)
 * @fl6:   flow being routed
 * @flags: RT6_LOOKUP_F_* flags; only RT6_LOOKUP_F_IFACE is carried into
 *         the selection
 *
 * Returns a route with a reference held.  Depending on what was
 * selected this is the null entry or an RTF_CACHE route itself, a fresh
 * RTF_CACHE clone kept on the uncached list, or this CPU's copy of the
 * selected route.
 */
static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
				      struct flowi6 *fl6, int flags)
{
	struct fib6_node *fn, *saved_fn;
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	/* With forwarding disabled, prefer routers that look reachable. */
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	read_lock_bh(&table->tb6_lock);

	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	/* remembered so the search can restart without the reachability filter */
	saved_fn = fn;

redo_rt6_select:
	rt = rt6_select(fn, oif, strict);
	if (rt->rt6i_nsiblings)
		rt = rt6_multipath_select(rt, fl6, oif, strict);
	if (rt == net->ipv6.ip6_null_entry) {
		/* nothing usable at this node: try less specific nodes */
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}


	if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
		/* The null entry and RTF_CACHE routes are returned as they are. */
		dst_use(&rt->dst, jiffies);
		read_unlock_bh(&table->tb6_lock);

		rt6_dst_from_metrics_check(rt);
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(rt->rt6i_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */

		struct rt6_info *uncached_rt;

		/* dst_use() is paired with the dst_release() below so rt stays usable after unlocking */
		dst_use(&rt->dst, jiffies);
		read_unlock_bh(&table->tb6_lock);

		uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
		dst_release(&rt->dst);

		if (uncached_rt)
			rt6_uncached_list_add(uncached_rt);
		else
			/* allocation failed: fall back to the null entry */
			uncached_rt = net->ipv6.ip6_null_entry;

		dst_hold(&uncached_rt->dst);
		return uncached_rt;

	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		/* usage accounting stays on the route in the tree */
		rt->dst.lastuse = jiffies;
		rt->dst.__use++;
		pcpu_rt = rt6_get_pcpu_route(rt);
		read_unlock_bh(&table->tb6_lock);

		return pcpu_rt;
	}
}
1106 
/*
 * Lookup callback for the input path: ip6_pol_route() with the flow's
 * incoming interface (flowi6_iif) as the interface to match.
 */
static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
					    struct flowi6 *fl6, int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
}
1112 
1113 static struct dst_entry *ip6_route_input_lookup(struct net *net,
1114 						struct net_device *dev,
1115 						struct flowi6 *fl6, int flags)
1116 {
1117 	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1118 		flags |= RT6_LOOKUP_F_IFACE;
1119 
1120 	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
1121 }
1122 
1123 void ip6_route_input(struct sk_buff *skb)
1124 {
1125 	const struct ipv6hdr *iph = ipv6_hdr(skb);
1126 	struct net *net = dev_net(skb->dev);
1127 	int flags = RT6_LOOKUP_F_HAS_SADDR;
1128 	struct flowi6 fl6 = {
1129 		.flowi6_iif = skb->dev->ifindex,
1130 		.daddr = iph->daddr,
1131 		.saddr = iph->saddr,
1132 		.flowlabel = ip6_flowinfo(iph),
1133 		.flowi6_mark = skb->mark,
1134 		.flowi6_proto = iph->nexthdr,
1135 	};
1136 
1137 	skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
1138 }
1139 
1140 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
1141 					     struct flowi6 *fl6, int flags)
1142 {
1143 	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
1144 }
1145 
1146 struct dst_entry *ip6_route_output(struct net *net, const struct sock *sk,
1147 				    struct flowi6 *fl6)
1148 {
1149 	int flags = 0;
1150 
1151 	fl6->flowi6_iif = LOOPBACK_IFINDEX;
1152 
1153 	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
1154 		flags |= RT6_LOOKUP_F_IFACE;
1155 
1156 	if (!ipv6_addr_any(&fl6->saddr))
1157 		flags |= RT6_LOOKUP_F_HAS_SADDR;
1158 	else if (sk)
1159 		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1160 
1161 	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
1162 }
1163 EXPORT_SYMBOL(ip6_route_output);
1164 
1165 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
1166 {
1167 	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1168 	struct dst_entry *new = NULL;
1169 
1170 	rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, DST_OBSOLETE_NONE, 0);
1171 	if (rt) {
1172 		new = &rt->dst;
1173 
1174 		memset(new + 1, 0, sizeof(*rt) - sizeof(*new));
1175 
1176 		new->__use = 1;
1177 		new->input = dst_discard;
1178 		new->output = dst_discard_sk;
1179 
1180 		if (dst_metrics_read_only(&ort->dst))
1181 			new->_metrics = ort->dst._metrics;
1182 		else
1183 			dst_copy_metrics(new, &ort->dst);
1184 		rt->rt6i_idev = ort->rt6i_idev;
1185 		if (rt->rt6i_idev)
1186 			in6_dev_hold(rt->rt6i_idev);
1187 
1188 		rt->rt6i_gateway = ort->rt6i_gateway;
1189 		rt->rt6i_flags = ort->rt6i_flags;
1190 		rt->rt6i_metric = 0;
1191 
1192 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1193 #ifdef CONFIG_IPV6_SUBTREES
1194 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1195 #endif
1196 
1197 		dst_free(new);
1198 	}
1199 
1200 	dst_release(dst_orig);
1201 	return new ? new : ERR_PTR(-ENOMEM);
1202 }
1203 
1204 /*
1205  *	Destination cache support functions
1206  */
1207 
1208 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1209 {
1210 	if (rt->dst.from &&
1211 	    dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
1212 		dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
1213 }
1214 
1215 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1216 {
1217 	if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie))
1218 		return NULL;
1219 
1220 	if (rt6_check_expired(rt))
1221 		return NULL;
1222 
1223 	return &rt->dst;
1224 }
1225 
1226 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1227 {
1228 	if (rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1229 	    rt6_check((struct rt6_info *)(rt->dst.from), cookie))
1230 		return &rt->dst;
1231 	else
1232 		return NULL;
1233 }
1234 
1235 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1236 {
1237 	struct rt6_info *rt;
1238 
1239 	rt = (struct rt6_info *) dst;
1240 
1241 	/* All IPV6 dsts are created with ->obsolete set to the value
1242 	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1243 	 * into this function always.
1244 	 */
1245 
1246 	rt6_dst_from_metrics_check(rt);
1247 
1248 	if ((rt->rt6i_flags & RTF_PCPU) || unlikely(dst->flags & DST_NOCACHE))
1249 		return rt6_dst_from_check(rt, cookie);
1250 	else
1251 		return rt6_check(rt, cookie);
1252 }
1253 
1254 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1255 {
1256 	struct rt6_info *rt = (struct rt6_info *) dst;
1257 
1258 	if (rt) {
1259 		if (rt->rt6i_flags & RTF_CACHE) {
1260 			if (rt6_check_expired(rt)) {
1261 				ip6_del_rt(rt);
1262 				dst = NULL;
1263 			}
1264 		} else {
1265 			dst_release(dst);
1266 			dst = NULL;
1267 		}
1268 	}
1269 	return dst;
1270 }
1271 
1272 static void ip6_link_failure(struct sk_buff *skb)
1273 {
1274 	struct rt6_info *rt;
1275 
1276 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1277 
1278 	rt = (struct rt6_info *) skb_dst(skb);
1279 	if (rt) {
1280 		if (rt->rt6i_flags & RTF_CACHE) {
1281 			dst_hold(&rt->dst);
1282 			if (ip6_del_rt(rt))
1283 				dst_free(&rt->dst);
1284 		} else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) {
1285 			rt->rt6i_node->fn_sernum = -1;
1286 		}
1287 	}
1288 }
1289 
1290 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
1291 {
1292 	struct net *net = dev_net(rt->dst.dev);
1293 
1294 	rt->rt6i_flags |= RTF_MODIFIED;
1295 	rt->rt6i_pmtu = mtu;
1296 	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
1297 }
1298 
1299 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
1300 				 const struct ipv6hdr *iph, u32 mtu)
1301 {
1302 	struct rt6_info *rt6 = (struct rt6_info *)dst;
1303 
1304 	if (rt6->rt6i_flags & RTF_LOCAL)
1305 		return;
1306 
1307 	dst_confirm(dst);
1308 	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
1309 	if (mtu >= dst_mtu(dst))
1310 		return;
1311 
1312 	if (rt6->rt6i_flags & RTF_CACHE) {
1313 		rt6_do_update_pmtu(rt6, mtu);
1314 	} else {
1315 		const struct in6_addr *daddr, *saddr;
1316 		struct rt6_info *nrt6;
1317 
1318 		if (iph) {
1319 			daddr = &iph->daddr;
1320 			saddr = &iph->saddr;
1321 		} else if (sk) {
1322 			daddr = &sk->sk_v6_daddr;
1323 			saddr = &inet6_sk(sk)->saddr;
1324 		} else {
1325 			return;
1326 		}
1327 		nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
1328 		if (nrt6) {
1329 			rt6_do_update_pmtu(nrt6, mtu);
1330 
1331 			/* ip6_ins_rt(nrt6) will bump the
1332 			 * rt6->rt6i_node->fn_sernum
1333 			 * which will fail the next rt6_check() and
1334 			 * invalidate the sk->sk_dst_cache.
1335 			 */
1336 			ip6_ins_rt(nrt6);
1337 		}
1338 	}
1339 }
1340 
1341 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1342 			       struct sk_buff *skb, u32 mtu)
1343 {
1344 	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
1345 }
1346 
1347 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
1348 		     int oif, u32 mark)
1349 {
1350 	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1351 	struct dst_entry *dst;
1352 	struct flowi6 fl6;
1353 
1354 	memset(&fl6, 0, sizeof(fl6));
1355 	fl6.flowi6_oif = oif;
1356 	fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
1357 	fl6.daddr = iph->daddr;
1358 	fl6.saddr = iph->saddr;
1359 	fl6.flowlabel = ip6_flowinfo(iph);
1360 
1361 	dst = ip6_route_output(net, NULL, &fl6);
1362 	if (!dst->error)
1363 		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
1364 	dst_release(dst);
1365 }
1366 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
1367 
1368 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
1369 {
1370 	ip6_update_pmtu(skb, sock_net(sk), mtu,
1371 			sk->sk_bound_dev_if, sk->sk_mark);
1372 }
1373 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
1374 
1375 /* Handle redirects */
1376 struct ip6rd_flowi {
1377 	struct flowi6 fl6;
1378 	struct in6_addr gateway;
1379 };
1380 
1381 static struct rt6_info *__ip6_route_redirect(struct net *net,
1382 					     struct fib6_table *table,
1383 					     struct flowi6 *fl6,
1384 					     int flags)
1385 {
1386 	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1387 	struct rt6_info *rt;
1388 	struct fib6_node *fn;
1389 
1390 	/* Get the "current" route for this destination and
1391 	 * check if the redirect has come from approriate router.
1392 	 *
1393 	 * RFC 4861 specifies that redirects should only be
1394 	 * accepted if they come from the nexthop to the target.
1395 	 * Due to the way the routes are chosen, this notion
1396 	 * is a bit fuzzy and one might need to check all possible
1397 	 * routes.
1398 	 */
1399 
1400 	read_lock_bh(&table->tb6_lock);
1401 	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1402 restart:
1403 	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1404 		if (rt6_check_expired(rt))
1405 			continue;
1406 		if (rt->dst.error)
1407 			break;
1408 		if (!(rt->rt6i_flags & RTF_GATEWAY))
1409 			continue;
1410 		if (fl6->flowi6_oif != rt->dst.dev->ifindex)
1411 			continue;
1412 		if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1413 			continue;
1414 		break;
1415 	}
1416 
1417 	if (!rt)
1418 		rt = net->ipv6.ip6_null_entry;
1419 	else if (rt->dst.error) {
1420 		rt = net->ipv6.ip6_null_entry;
1421 		goto out;
1422 	}
1423 
1424 	if (rt == net->ipv6.ip6_null_entry) {
1425 		fn = fib6_backtrack(fn, &fl6->saddr);
1426 		if (fn)
1427 			goto restart;
1428 	}
1429 
1430 out:
1431 	dst_hold(&rt->dst);
1432 
1433 	read_unlock_bh(&table->tb6_lock);
1434 
1435 	return rt;
1436 };
1437 
1438 static struct dst_entry *ip6_route_redirect(struct net *net,
1439 					const struct flowi6 *fl6,
1440 					const struct in6_addr *gateway)
1441 {
1442 	int flags = RT6_LOOKUP_F_HAS_SADDR;
1443 	struct ip6rd_flowi rdfl;
1444 
1445 	rdfl.fl6 = *fl6;
1446 	rdfl.gateway = *gateway;
1447 
1448 	return fib6_rule_lookup(net, &rdfl.fl6,
1449 				flags, __ip6_route_redirect);
1450 }
1451 
1452 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark)
1453 {
1454 	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1455 	struct dst_entry *dst;
1456 	struct flowi6 fl6;
1457 
1458 	memset(&fl6, 0, sizeof(fl6));
1459 	fl6.flowi6_iif = LOOPBACK_IFINDEX;
1460 	fl6.flowi6_oif = oif;
1461 	fl6.flowi6_mark = mark;
1462 	fl6.daddr = iph->daddr;
1463 	fl6.saddr = iph->saddr;
1464 	fl6.flowlabel = ip6_flowinfo(iph);
1465 
1466 	dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
1467 	rt6_do_redirect(dst, NULL, skb);
1468 	dst_release(dst);
1469 }
1470 EXPORT_SYMBOL_GPL(ip6_redirect);
1471 
1472 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
1473 			    u32 mark)
1474 {
1475 	const struct ipv6hdr *iph = ipv6_hdr(skb);
1476 	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
1477 	struct dst_entry *dst;
1478 	struct flowi6 fl6;
1479 
1480 	memset(&fl6, 0, sizeof(fl6));
1481 	fl6.flowi6_iif = LOOPBACK_IFINDEX;
1482 	fl6.flowi6_oif = oif;
1483 	fl6.flowi6_mark = mark;
1484 	fl6.daddr = msg->dest;
1485 	fl6.saddr = iph->daddr;
1486 
1487 	dst = ip6_route_redirect(net, &fl6, &iph->saddr);
1488 	rt6_do_redirect(dst, NULL, skb);
1489 	dst_release(dst);
1490 }
1491 
1492 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
1493 {
1494 	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark);
1495 }
1496 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
1497 
1498 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1499 {
1500 	struct net_device *dev = dst->dev;
1501 	unsigned int mtu = dst_mtu(dst);
1502 	struct net *net = dev_net(dev);
1503 
1504 	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1505 
1506 	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1507 		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1508 
1509 	/*
1510 	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1511 	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1512 	 * IPV6_MAXPLEN is also valid and means: "any MSS,
1513 	 * rely only on pmtu discovery"
1514 	 */
1515 	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1516 		mtu = IPV6_MAXPLEN;
1517 	return mtu;
1518 }
1519 
1520 static unsigned int ip6_mtu(const struct dst_entry *dst)
1521 {
1522 	const struct rt6_info *rt = (const struct rt6_info *)dst;
1523 	unsigned int mtu = rt->rt6i_pmtu;
1524 	struct inet6_dev *idev;
1525 
1526 	if (mtu)
1527 		goto out;
1528 
1529 	mtu = dst_metric_raw(dst, RTAX_MTU);
1530 	if (mtu)
1531 		goto out;
1532 
1533 	mtu = IPV6_MIN_MTU;
1534 
1535 	rcu_read_lock();
1536 	idev = __in6_dev_get(dst->dev);
1537 	if (idev)
1538 		mtu = idev->cnf.mtu6;
1539 	rcu_read_unlock();
1540 
1541 out:
1542 	return min_t(unsigned int, mtu, IP6_MAX_MTU);
1543 }
1544 
1545 static struct dst_entry *icmp6_dst_gc_list;
1546 static DEFINE_SPINLOCK(icmp6_dst_lock);
1547 
1548 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1549 				  struct flowi6 *fl6)
1550 {
1551 	struct dst_entry *dst;
1552 	struct rt6_info *rt;
1553 	struct inet6_dev *idev = in6_dev_get(dev);
1554 	struct net *net = dev_net(dev);
1555 
1556 	if (unlikely(!idev))
1557 		return ERR_PTR(-ENODEV);
1558 
1559 	rt = ip6_dst_alloc(net, dev, 0, NULL);
1560 	if (unlikely(!rt)) {
1561 		in6_dev_put(idev);
1562 		dst = ERR_PTR(-ENOMEM);
1563 		goto out;
1564 	}
1565 
1566 	rt->dst.flags |= DST_HOST;
1567 	rt->dst.output  = ip6_output;
1568 	atomic_set(&rt->dst.__refcnt, 1);
1569 	rt->rt6i_gateway  = fl6->daddr;
1570 	rt->rt6i_dst.addr = fl6->daddr;
1571 	rt->rt6i_dst.plen = 128;
1572 	rt->rt6i_idev     = idev;
1573 	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
1574 
1575 	spin_lock_bh(&icmp6_dst_lock);
1576 	rt->dst.next = icmp6_dst_gc_list;
1577 	icmp6_dst_gc_list = &rt->dst;
1578 	spin_unlock_bh(&icmp6_dst_lock);
1579 
1580 	fib6_force_start_gc(net);
1581 
1582 	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
1583 
1584 out:
1585 	return dst;
1586 }
1587 
1588 int icmp6_dst_gc(void)
1589 {
1590 	struct dst_entry *dst, **pprev;
1591 	int more = 0;
1592 
1593 	spin_lock_bh(&icmp6_dst_lock);
1594 	pprev = &icmp6_dst_gc_list;
1595 
1596 	while ((dst = *pprev) != NULL) {
1597 		if (!atomic_read(&dst->__refcnt)) {
1598 			*pprev = dst->next;
1599 			dst_free(dst);
1600 		} else {
1601 			pprev = &dst->next;
1602 			++more;
1603 		}
1604 	}
1605 
1606 	spin_unlock_bh(&icmp6_dst_lock);
1607 
1608 	return more;
1609 }
1610 
1611 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1612 			    void *arg)
1613 {
1614 	struct dst_entry *dst, **pprev;
1615 
1616 	spin_lock_bh(&icmp6_dst_lock);
1617 	pprev = &icmp6_dst_gc_list;
1618 	while ((dst = *pprev) != NULL) {
1619 		struct rt6_info *rt = (struct rt6_info *) dst;
1620 		if (func(rt, arg)) {
1621 			*pprev = dst->next;
1622 			dst_free(dst);
1623 		} else {
1624 			pprev = &dst->next;
1625 		}
1626 	}
1627 	spin_unlock_bh(&icmp6_dst_lock);
1628 }
1629 
1630 static int ip6_dst_gc(struct dst_ops *ops)
1631 {
1632 	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1633 	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1634 	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1635 	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1636 	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1637 	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1638 	int entries;
1639 
1640 	entries = dst_entries_get_fast(ops);
1641 	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
1642 	    entries <= rt_max_size)
1643 		goto out;
1644 
1645 	net->ipv6.ip6_rt_gc_expire++;
1646 	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
1647 	entries = dst_entries_get_slow(ops);
1648 	if (entries < ops->gc_thresh)
1649 		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1650 out:
1651 	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1652 	return entries > rt_max_size;
1653 }
1654 
1655 static int ip6_convert_metrics(struct mx6_config *mxc,
1656 			       const struct fib6_config *cfg)
1657 {
1658 	struct nlattr *nla;
1659 	int remaining;
1660 	u32 *mp;
1661 
1662 	if (!cfg->fc_mx)
1663 		return 0;
1664 
1665 	mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1666 	if (unlikely(!mp))
1667 		return -ENOMEM;
1668 
1669 	nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1670 		int type = nla_type(nla);
1671 
1672 		if (type) {
1673 			u32 val;
1674 
1675 			if (unlikely(type > RTAX_MAX))
1676 				goto err;
1677 			if (type == RTAX_CC_ALGO) {
1678 				char tmp[TCP_CA_NAME_MAX];
1679 
1680 				nla_strlcpy(tmp, nla, sizeof(tmp));
1681 				val = tcp_ca_get_key_by_name(tmp);
1682 				if (val == TCP_CA_UNSPEC)
1683 					goto err;
1684 			} else {
1685 				val = nla_get_u32(nla);
1686 			}
1687 
1688 			mp[type - 1] = val;
1689 			__set_bit(type - 1, mxc->mx_valid);
1690 		}
1691 	}
1692 
1693 	mxc->mx = mp;
1694 
1695 	return 0;
1696  err:
1697 	kfree(mp);
1698 	return -EINVAL;
1699 }
1700 
1701 int ip6_route_add(struct fib6_config *cfg)
1702 {
1703 	int err;
1704 	struct net *net = cfg->fc_nlinfo.nl_net;
1705 	struct rt6_info *rt = NULL;
1706 	struct net_device *dev = NULL;
1707 	struct inet6_dev *idev = NULL;
1708 	struct fib6_table *table;
1709 	struct mx6_config mxc = { .mx = NULL, };
1710 	int addr_type;
1711 
1712 	if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1713 		return -EINVAL;
1714 #ifndef CONFIG_IPV6_SUBTREES
1715 	if (cfg->fc_src_len)
1716 		return -EINVAL;
1717 #endif
1718 	if (cfg->fc_ifindex) {
1719 		err = -ENODEV;
1720 		dev = dev_get_by_index(net, cfg->fc_ifindex);
1721 		if (!dev)
1722 			goto out;
1723 		idev = in6_dev_get(dev);
1724 		if (!idev)
1725 			goto out;
1726 	}
1727 
1728 	if (cfg->fc_metric == 0)
1729 		cfg->fc_metric = IP6_RT_PRIO_USER;
1730 
1731 	err = -ENOBUFS;
1732 	if (cfg->fc_nlinfo.nlh &&
1733 	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1734 		table = fib6_get_table(net, cfg->fc_table);
1735 		if (!table) {
1736 			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
1737 			table = fib6_new_table(net, cfg->fc_table);
1738 		}
1739 	} else {
1740 		table = fib6_new_table(net, cfg->fc_table);
1741 	}
1742 
1743 	if (!table)
1744 		goto out;
1745 
1746 	rt = ip6_dst_alloc(net, NULL, (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT, table);
1747 
1748 	if (!rt) {
1749 		err = -ENOMEM;
1750 		goto out;
1751 	}
1752 
1753 	if (cfg->fc_flags & RTF_EXPIRES)
1754 		rt6_set_expires(rt, jiffies +
1755 				clock_t_to_jiffies(cfg->fc_expires));
1756 	else
1757 		rt6_clean_expires(rt);
1758 
1759 	if (cfg->fc_protocol == RTPROT_UNSPEC)
1760 		cfg->fc_protocol = RTPROT_BOOT;
1761 	rt->rt6i_protocol = cfg->fc_protocol;
1762 
1763 	addr_type = ipv6_addr_type(&cfg->fc_dst);
1764 
1765 	if (addr_type & IPV6_ADDR_MULTICAST)
1766 		rt->dst.input = ip6_mc_input;
1767 	else if (cfg->fc_flags & RTF_LOCAL)
1768 		rt->dst.input = ip6_input;
1769 	else
1770 		rt->dst.input = ip6_forward;
1771 
1772 	rt->dst.output = ip6_output;
1773 
1774 	if (cfg->fc_encap) {
1775 		struct lwtunnel_state *lwtstate;
1776 
1777 		err = lwtunnel_build_state(dev, cfg->fc_encap_type,
1778 					   cfg->fc_encap, &lwtstate);
1779 		if (err)
1780 			goto out;
1781 		rt->rt6i_lwtstate = lwtstate_get(lwtstate);
1782 		if (lwtunnel_output_redirect(rt->rt6i_lwtstate))
1783 			rt->dst.output = lwtunnel_output6;
1784 	}
1785 
1786 	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1787 	rt->rt6i_dst.plen = cfg->fc_dst_len;
1788 	if (rt->rt6i_dst.plen == 128)
1789 		rt->dst.flags |= DST_HOST;
1790 
1791 #ifdef CONFIG_IPV6_SUBTREES
1792 	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1793 	rt->rt6i_src.plen = cfg->fc_src_len;
1794 #endif
1795 
1796 	rt->rt6i_metric = cfg->fc_metric;
1797 
1798 	/* We cannot add true routes via loopback here,
1799 	   they would result in kernel looping; promote them to reject routes
1800 	 */
1801 	if ((cfg->fc_flags & RTF_REJECT) ||
1802 	    (dev && (dev->flags & IFF_LOOPBACK) &&
1803 	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
1804 	     !(cfg->fc_flags & RTF_LOCAL))) {
1805 		/* hold loopback dev/idev if we haven't done so. */
1806 		if (dev != net->loopback_dev) {
1807 			if (dev) {
1808 				dev_put(dev);
1809 				in6_dev_put(idev);
1810 			}
1811 			dev = net->loopback_dev;
1812 			dev_hold(dev);
1813 			idev = in6_dev_get(dev);
1814 			if (!idev) {
1815 				err = -ENODEV;
1816 				goto out;
1817 			}
1818 		}
1819 		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1820 		switch (cfg->fc_type) {
1821 		case RTN_BLACKHOLE:
1822 			rt->dst.error = -EINVAL;
1823 			rt->dst.output = dst_discard_sk;
1824 			rt->dst.input = dst_discard;
1825 			break;
1826 		case RTN_PROHIBIT:
1827 			rt->dst.error = -EACCES;
1828 			rt->dst.output = ip6_pkt_prohibit_out;
1829 			rt->dst.input = ip6_pkt_prohibit;
1830 			break;
1831 		case RTN_THROW:
1832 		default:
1833 			rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
1834 					: -ENETUNREACH;
1835 			rt->dst.output = ip6_pkt_discard_out;
1836 			rt->dst.input = ip6_pkt_discard;
1837 			break;
1838 		}
1839 		goto install_route;
1840 	}
1841 
1842 	if (cfg->fc_flags & RTF_GATEWAY) {
1843 		const struct in6_addr *gw_addr;
1844 		int gwa_type;
1845 
1846 		gw_addr = &cfg->fc_gateway;
1847 
1848 		/* if gw_addr is local we will fail to detect this in case
1849 		 * address is still TENTATIVE (DAD in progress). rt6_lookup()
1850 		 * will return already-added prefix route via interface that
1851 		 * prefix route was assigned to, which might be non-loopback.
1852 		 */
1853 		err = -EINVAL;
1854 		if (ipv6_chk_addr_and_flags(net, gw_addr, NULL, 0, 0))
1855 			goto out;
1856 
1857 		rt->rt6i_gateway = *gw_addr;
1858 		gwa_type = ipv6_addr_type(gw_addr);
1859 
1860 		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1861 			struct rt6_info *grt;
1862 
1863 			/* IPv6 strictly inhibits using not link-local
1864 			   addresses as nexthop address.
1865 			   Otherwise, router will not able to send redirects.
1866 			   It is very good, but in some (rare!) circumstances
1867 			   (SIT, PtP, NBMA NOARP links) it is handy to allow
1868 			   some exceptions. --ANK
1869 			 */
1870 			if (!(gwa_type & IPV6_ADDR_UNICAST))
1871 				goto out;
1872 
1873 			grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1874 
1875 			err = -EHOSTUNREACH;
1876 			if (!grt)
1877 				goto out;
1878 			if (dev) {
1879 				if (dev != grt->dst.dev) {
1880 					ip6_rt_put(grt);
1881 					goto out;
1882 				}
1883 			} else {
1884 				dev = grt->dst.dev;
1885 				idev = grt->rt6i_idev;
1886 				dev_hold(dev);
1887 				in6_dev_hold(grt->rt6i_idev);
1888 			}
1889 			if (!(grt->rt6i_flags & RTF_GATEWAY))
1890 				err = 0;
1891 			ip6_rt_put(grt);
1892 
1893 			if (err)
1894 				goto out;
1895 		}
1896 		err = -EINVAL;
1897 		if (!dev || (dev->flags & IFF_LOOPBACK))
1898 			goto out;
1899 	}
1900 
1901 	err = -ENODEV;
1902 	if (!dev)
1903 		goto out;
1904 
1905 	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1906 		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1907 			err = -EINVAL;
1908 			goto out;
1909 		}
1910 		rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
1911 		rt->rt6i_prefsrc.plen = 128;
1912 	} else
1913 		rt->rt6i_prefsrc.plen = 0;
1914 
1915 	rt->rt6i_flags = cfg->fc_flags;
1916 
1917 install_route:
1918 	rt->dst.dev = dev;
1919 	rt->rt6i_idev = idev;
1920 	rt->rt6i_table = table;
1921 
1922 	cfg->fc_nlinfo.nl_net = dev_net(dev);
1923 
1924 	err = ip6_convert_metrics(&mxc, cfg);
1925 	if (err)
1926 		goto out;
1927 
1928 	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc);
1929 
1930 	kfree(mxc.mx);
1931 	return err;
1932 out:
1933 	if (dev)
1934 		dev_put(dev);
1935 	if (idev)
1936 		in6_dev_put(idev);
1937 	if (rt)
1938 		dst_free(&rt->dst);
1939 	return err;
1940 }
1941 
1942 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1943 {
1944 	int err;
1945 	struct fib6_table *table;
1946 	struct net *net = dev_net(rt->dst.dev);
1947 
1948 	if (rt == net->ipv6.ip6_null_entry) {
1949 		err = -ENOENT;
1950 		goto out;
1951 	}
1952 
1953 	table = rt->rt6i_table;
1954 	write_lock_bh(&table->tb6_lock);
1955 	err = fib6_del(rt, info);
1956 	write_unlock_bh(&table->tb6_lock);
1957 
1958 out:
1959 	ip6_rt_put(rt);
1960 	return err;
1961 }
1962 
1963 int ip6_del_rt(struct rt6_info *rt)
1964 {
1965 	struct nl_info info = {
1966 		.nl_net = dev_net(rt->dst.dev),
1967 	};
1968 	return __ip6_del_rt(rt, &info);
1969 }
1970 
1971 static int ip6_route_del(struct fib6_config *cfg)
1972 {
1973 	struct fib6_table *table;
1974 	struct fib6_node *fn;
1975 	struct rt6_info *rt;
1976 	int err = -ESRCH;
1977 
1978 	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1979 	if (!table)
1980 		return err;
1981 
1982 	read_lock_bh(&table->tb6_lock);
1983 
1984 	fn = fib6_locate(&table->tb6_root,
1985 			 &cfg->fc_dst, cfg->fc_dst_len,
1986 			 &cfg->fc_src, cfg->fc_src_len);
1987 
1988 	if (fn) {
1989 		for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1990 			if ((rt->rt6i_flags & RTF_CACHE) &&
1991 			    !(cfg->fc_flags & RTF_CACHE))
1992 				continue;
1993 			if (cfg->fc_ifindex &&
1994 			    (!rt->dst.dev ||
1995 			     rt->dst.dev->ifindex != cfg->fc_ifindex))
1996 				continue;
1997 			if (cfg->fc_flags & RTF_GATEWAY &&
1998 			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1999 				continue;
2000 			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
2001 				continue;
2002 			dst_hold(&rt->dst);
2003 			read_unlock_bh(&table->tb6_lock);
2004 
2005 			return __ip6_del_rt(rt, &cfg->fc_nlinfo);
2006 		}
2007 	}
2008 	read_unlock_bh(&table->tb6_lock);
2009 
2010 	return err;
2011 }
2012 
2013 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
2014 {
2015 	struct net *net = dev_net(skb->dev);
2016 	struct netevent_redirect netevent;
2017 	struct rt6_info *rt, *nrt = NULL;
2018 	struct ndisc_options ndopts;
2019 	struct inet6_dev *in6_dev;
2020 	struct neighbour *neigh;
2021 	struct rd_msg *msg;
2022 	int optlen, on_link;
2023 	u8 *lladdr;
2024 
2025 	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
2026 	optlen -= sizeof(*msg);
2027 
2028 	if (optlen < 0) {
2029 		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
2030 		return;
2031 	}
2032 
2033 	msg = (struct rd_msg *)icmp6_hdr(skb);
2034 
2035 	if (ipv6_addr_is_multicast(&msg->dest)) {
2036 		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
2037 		return;
2038 	}
2039 
2040 	on_link = 0;
2041 	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
2042 		on_link = 1;
2043 	} else if (ipv6_addr_type(&msg->target) !=
2044 		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
2045 		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
2046 		return;
2047 	}
2048 
2049 	in6_dev = __in6_dev_get(skb->dev);
2050 	if (!in6_dev)
2051 		return;
2052 	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
2053 		return;
2054 
2055 	/* RFC2461 8.1:
2056 	 *	The IP source address of the Redirect MUST be the same as the current
2057 	 *	first-hop router for the specified ICMP Destination Address.
2058 	 */
2059 
2060 	if (!ndisc_parse_options(msg->opt, optlen, &ndopts)) {
2061 		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
2062 		return;
2063 	}
2064 
2065 	lladdr = NULL;
2066 	if (ndopts.nd_opts_tgt_lladdr) {
2067 		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
2068 					     skb->dev);
2069 		if (!lladdr) {
2070 			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
2071 			return;
2072 		}
2073 	}
2074 
2075 	rt = (struct rt6_info *) dst;
2076 	if (rt == net->ipv6.ip6_null_entry) {
2077 		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
2078 		return;
2079 	}
2080 
2081 	/* Redirect received -> path was valid.
2082 	 * Look, redirects are sent only in response to data packets,
2083 	 * so that this nexthop apparently is reachable. --ANK
2084 	 */
2085 	dst_confirm(&rt->dst);
2086 
2087 	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
2088 	if (!neigh)
2089 		return;
2090 
2091 	/*
2092 	 *	We have finally decided to accept it.
2093 	 */
2094 
2095 	neigh_update(neigh, lladdr, NUD_STALE,
2096 		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
2097 		     NEIGH_UPDATE_F_OVERRIDE|
2098 		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
2099 				     NEIGH_UPDATE_F_ISROUTER))
2100 		     );
2101 
2102 	nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
2103 	if (!nrt)
2104 		goto out;
2105 
2106 	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
2107 	if (on_link)
2108 		nrt->rt6i_flags &= ~RTF_GATEWAY;
2109 
2110 	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
2111 
2112 	if (ip6_ins_rt(nrt))
2113 		goto out;
2114 
2115 	netevent.old = &rt->dst;
2116 	netevent.new = &nrt->dst;
2117 	netevent.daddr = &msg->dest;
2118 	netevent.neigh = neigh;
2119 	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
2120 
2121 	if (rt->rt6i_flags & RTF_CACHE) {
2122 		rt = (struct rt6_info *) dst_clone(&rt->dst);
2123 		ip6_del_rt(rt);
2124 	}
2125 
2126 out:
2127 	neigh_release(neigh);
2128 }
2129 
2130 /*
2131  *	Misc support functions
2132  */
2133 
2134 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
2135 {
2136 	BUG_ON(from->dst.from);
2137 
2138 	rt->rt6i_flags &= ~RTF_EXPIRES;
2139 	dst_hold(&from->dst);
2140 	rt->dst.from = &from->dst;
2141 	dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
2142 }
2143 
2144 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
2145 {
2146 	rt->dst.input = ort->dst.input;
2147 	rt->dst.output = ort->dst.output;
2148 	rt->rt6i_dst = ort->rt6i_dst;
2149 	rt->dst.error = ort->dst.error;
2150 	rt->rt6i_idev = ort->rt6i_idev;
2151 	if (rt->rt6i_idev)
2152 		in6_dev_hold(rt->rt6i_idev);
2153 	rt->dst.lastuse = jiffies;
2154 	rt->rt6i_gateway = ort->rt6i_gateway;
2155 	rt->rt6i_flags = ort->rt6i_flags;
2156 	rt6_set_from(rt, ort);
2157 	rt->rt6i_metric = ort->rt6i_metric;
2158 #ifdef CONFIG_IPV6_SUBTREES
2159 	rt->rt6i_src = ort->rt6i_src;
2160 #endif
2161 	rt->rt6i_prefsrc = ort->rt6i_prefsrc;
2162 	rt->rt6i_table = ort->rt6i_table;
2163 	rt->rt6i_lwtstate = lwtstate_get(ort->rt6i_lwtstate);
2164 }
2165 
2166 #ifdef CONFIG_IPV6_ROUTE_INFO
2167 static struct rt6_info *rt6_get_route_info(struct net *net,
2168 					   const struct in6_addr *prefix, int prefixlen,
2169 					   const struct in6_addr *gwaddr, int ifindex)
2170 {
2171 	struct fib6_node *fn;
2172 	struct rt6_info *rt = NULL;
2173 	struct fib6_table *table;
2174 
2175 	table = fib6_get_table(net, RT6_TABLE_INFO);
2176 	if (!table)
2177 		return NULL;
2178 
2179 	read_lock_bh(&table->tb6_lock);
2180 	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0);
2181 	if (!fn)
2182 		goto out;
2183 
2184 	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2185 		if (rt->dst.dev->ifindex != ifindex)
2186 			continue;
2187 		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
2188 			continue;
2189 		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
2190 			continue;
2191 		dst_hold(&rt->dst);
2192 		break;
2193 	}
2194 out:
2195 	read_unlock_bh(&table->tb6_lock);
2196 	return rt;
2197 }
2198 
2199 static struct rt6_info *rt6_add_route_info(struct net *net,
2200 					   const struct in6_addr *prefix, int prefixlen,
2201 					   const struct in6_addr *gwaddr, int ifindex,
2202 					   unsigned int pref)
2203 {
2204 	struct fib6_config cfg = {
2205 		.fc_table	= RT6_TABLE_INFO,
2206 		.fc_metric	= IP6_RT_PRIO_USER,
2207 		.fc_ifindex	= ifindex,
2208 		.fc_dst_len	= prefixlen,
2209 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
2210 				  RTF_UP | RTF_PREF(pref),
2211 		.fc_nlinfo.portid = 0,
2212 		.fc_nlinfo.nlh = NULL,
2213 		.fc_nlinfo.nl_net = net,
2214 	};
2215 
2216 	cfg.fc_dst = *prefix;
2217 	cfg.fc_gateway = *gwaddr;
2218 
2219 	/* We should treat it as a default route if prefix length is 0. */
2220 	if (!prefixlen)
2221 		cfg.fc_flags |= RTF_DEFAULT;
2222 
2223 	ip6_route_add(&cfg);
2224 
2225 	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
2226 }
2227 #endif
2228 
2229 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
2230 {
2231 	struct rt6_info *rt;
2232 	struct fib6_table *table;
2233 
2234 	table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
2235 	if (!table)
2236 		return NULL;
2237 
2238 	read_lock_bh(&table->tb6_lock);
2239 	for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2240 		if (dev == rt->dst.dev &&
2241 		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
2242 		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
2243 			break;
2244 	}
2245 	if (rt)
2246 		dst_hold(&rt->dst);
2247 	read_unlock_bh(&table->tb6_lock);
2248 	return rt;
2249 }
2250 
2251 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
2252 				     struct net_device *dev,
2253 				     unsigned int pref)
2254 {
2255 	struct fib6_config cfg = {
2256 		.fc_table	= RT6_TABLE_DFLT,
2257 		.fc_metric	= IP6_RT_PRIO_USER,
2258 		.fc_ifindex	= dev->ifindex,
2259 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
2260 				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
2261 		.fc_nlinfo.portid = 0,
2262 		.fc_nlinfo.nlh = NULL,
2263 		.fc_nlinfo.nl_net = dev_net(dev),
2264 	};
2265 
2266 	cfg.fc_gateway = *gwaddr;
2267 
2268 	ip6_route_add(&cfg);
2269 
2270 	return rt6_get_dflt_router(gwaddr, dev);
2271 }
2272 
2273 void rt6_purge_dflt_routers(struct net *net)
2274 {
2275 	struct rt6_info *rt;
2276 	struct fib6_table *table;
2277 
2278 	/* NOTE: Keep consistent with rt6_get_dflt_router */
2279 	table = fib6_get_table(net, RT6_TABLE_DFLT);
2280 	if (!table)
2281 		return;
2282 
2283 restart:
2284 	read_lock_bh(&table->tb6_lock);
2285 	for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2286 		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
2287 		    (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
2288 			dst_hold(&rt->dst);
2289 			read_unlock_bh(&table->tb6_lock);
2290 			ip6_del_rt(rt);
2291 			goto restart;
2292 		}
2293 	}
2294 	read_unlock_bh(&table->tb6_lock);
2295 }
2296 
2297 static void rtmsg_to_fib6_config(struct net *net,
2298 				 struct in6_rtmsg *rtmsg,
2299 				 struct fib6_config *cfg)
2300 {
2301 	memset(cfg, 0, sizeof(*cfg));
2302 
2303 	cfg->fc_table = RT6_TABLE_MAIN;
2304 	cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
2305 	cfg->fc_metric = rtmsg->rtmsg_metric;
2306 	cfg->fc_expires = rtmsg->rtmsg_info;
2307 	cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
2308 	cfg->fc_src_len = rtmsg->rtmsg_src_len;
2309 	cfg->fc_flags = rtmsg->rtmsg_flags;
2310 
2311 	cfg->fc_nlinfo.nl_net = net;
2312 
2313 	cfg->fc_dst = rtmsg->rtmsg_dst;
2314 	cfg->fc_src = rtmsg->rtmsg_src;
2315 	cfg->fc_gateway = rtmsg->rtmsg_gateway;
2316 }
2317 
2318 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
2319 {
2320 	struct fib6_config cfg;
2321 	struct in6_rtmsg rtmsg;
2322 	int err;
2323 
2324 	switch (cmd) {
2325 	case SIOCADDRT:		/* Add a route */
2326 	case SIOCDELRT:		/* Delete a route */
2327 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
2328 			return -EPERM;
2329 		err = copy_from_user(&rtmsg, arg,
2330 				     sizeof(struct in6_rtmsg));
2331 		if (err)
2332 			return -EFAULT;
2333 
2334 		rtmsg_to_fib6_config(net, &rtmsg, &cfg);
2335 
2336 		rtnl_lock();
2337 		switch (cmd) {
2338 		case SIOCADDRT:
2339 			err = ip6_route_add(&cfg);
2340 			break;
2341 		case SIOCDELRT:
2342 			err = ip6_route_del(&cfg);
2343 			break;
2344 		default:
2345 			err = -EINVAL;
2346 		}
2347 		rtnl_unlock();
2348 
2349 		return err;
2350 	}
2351 
2352 	return -EINVAL;
2353 }
2354 
2355 /*
2356  *	Drop the packet on the floor
2357  */
2358 
2359 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2360 {
2361 	int type;
2362 	struct dst_entry *dst = skb_dst(skb);
2363 	switch (ipstats_mib_noroutes) {
2364 	case IPSTATS_MIB_INNOROUTES:
2365 		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2366 		if (type == IPV6_ADDR_ANY) {
2367 			IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2368 				      IPSTATS_MIB_INADDRERRORS);
2369 			break;
2370 		}
2371 		/* FALLTHROUGH */
2372 	case IPSTATS_MIB_OUTNOROUTES:
2373 		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2374 			      ipstats_mib_noroutes);
2375 		break;
2376 	}
2377 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2378 	kfree_skb(skb);
2379 	return 0;
2380 }
2381 
2382 static int ip6_pkt_discard(struct sk_buff *skb)
2383 {
2384 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2385 }
2386 
2387 static int ip6_pkt_discard_out(struct sock *sk, struct sk_buff *skb)
2388 {
2389 	skb->dev = skb_dst(skb)->dev;
2390 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2391 }
2392 
2393 static int ip6_pkt_prohibit(struct sk_buff *skb)
2394 {
2395 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2396 }
2397 
2398 static int ip6_pkt_prohibit_out(struct sock *sk, struct sk_buff *skb)
2399 {
2400 	skb->dev = skb_dst(skb)->dev;
2401 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2402 }
2403 
2404 /*
2405  *	Allocate a dst for local (unicast / anycast) address.
2406  */
2407 
2408 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2409 				    const struct in6_addr *addr,
2410 				    bool anycast)
2411 {
2412 	struct net *net = dev_net(idev->dev);
2413 	struct rt6_info *rt = ip6_dst_alloc(net, net->loopback_dev,
2414 					    DST_NOCOUNT, NULL);
2415 	if (!rt)
2416 		return ERR_PTR(-ENOMEM);
2417 
2418 	in6_dev_hold(idev);
2419 
2420 	rt->dst.flags |= DST_HOST;
2421 	rt->dst.input = ip6_input;
2422 	rt->dst.output = ip6_output;
2423 	rt->rt6i_idev = idev;
2424 
2425 	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2426 	if (anycast)
2427 		rt->rt6i_flags |= RTF_ANYCAST;
2428 	else
2429 		rt->rt6i_flags |= RTF_LOCAL;
2430 
2431 	rt->rt6i_gateway  = *addr;
2432 	rt->rt6i_dst.addr = *addr;
2433 	rt->rt6i_dst.plen = 128;
2434 	rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2435 
2436 	atomic_set(&rt->dst.__refcnt, 1);
2437 
2438 	return rt;
2439 }
2440 
2441 int ip6_route_get_saddr(struct net *net,
2442 			struct rt6_info *rt,
2443 			const struct in6_addr *daddr,
2444 			unsigned int prefs,
2445 			struct in6_addr *saddr)
2446 {
2447 	struct inet6_dev *idev =
2448 		rt ? ip6_dst_idev((struct dst_entry *)rt) : NULL;
2449 	int err = 0;
2450 	if (rt && rt->rt6i_prefsrc.plen)
2451 		*saddr = rt->rt6i_prefsrc.addr;
2452 	else
2453 		err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2454 					 daddr, prefs, saddr);
2455 	return err;
2456 }
2457 
/*
 * Remove a deleted IP from prefsrc entries.
 *
 * Argument bundle handed to fib6_remove_prefsrc() through fib6_clean_all().
 */
struct arg_dev_net_ip {
	struct net_device *dev;		/* device to restrict to; NULL matches any */
	struct net *net;		/* namespace whose ip6_null_entry is skipped */
	struct in6_addr *addr;		/* address compared against rt6i_prefsrc */
};
2464 
2465 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2466 {
2467 	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2468 	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2469 	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2470 
2471 	if (((void *)rt->dst.dev == dev || !dev) &&
2472 	    rt != net->ipv6.ip6_null_entry &&
2473 	    ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2474 		/* remove prefsrc entry */
2475 		rt->rt6i_prefsrc.plen = 0;
2476 	}
2477 	return 0;
2478 }
2479 
2480 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2481 {
2482 	struct net *net = dev_net(ifp->idev->dev);
2483 	struct arg_dev_net_ip adni = {
2484 		.dev = ifp->idev->dev,
2485 		.net = net,
2486 		.addr = &ifp->addr,
2487 	};
2488 	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
2489 }
2490 
2491 #define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
2492 #define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)
2493 
2494 /* Remove routers and update dst entries when gateway turn into host. */
2495 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
2496 {
2497 	struct in6_addr *gateway = (struct in6_addr *)arg;
2498 
2499 	if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) ||
2500 	     ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) &&
2501 	     ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
2502 		return -1;
2503 	}
2504 	return 0;
2505 }
2506 
2507 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
2508 {
2509 	fib6_clean_all(net, fib6_clean_tohost, gateway);
2510 }
2511 
/* Argument bundle handed to fib6_ifdown(). */
struct arg_dev_net {
	struct net_device *dev;		/* device to match; NULL matches any */
	struct net *net;		/* namespace whose ip6_null_entry is skipped */
};
2516 
2517 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2518 {
2519 	const struct arg_dev_net *adn = arg;
2520 	const struct net_device *dev = adn->dev;
2521 
2522 	if ((rt->dst.dev == dev || !dev) &&
2523 	    rt != adn->net->ipv6.ip6_null_entry)
2524 		return -1;
2525 
2526 	return 0;
2527 }
2528 
2529 void rt6_ifdown(struct net *net, struct net_device *dev)
2530 {
2531 	struct arg_dev_net adn = {
2532 		.dev = dev,
2533 		.net = net,
2534 	};
2535 
2536 	fib6_clean_all(net, fib6_ifdown, &adn);
2537 	icmp6_clean_all(fib6_ifdown, &adn);
2538 	rt6_uncached_list_flush_dev(net, dev);
2539 }
2540 
/* Argument bundle handed to rt6_mtu_change_route(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;		/* device whose MTU changed */
	unsigned int mtu;		/* the new MTU */
};
2545 
2546 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2547 {
2548 	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2549 	struct inet6_dev *idev;
2550 
2551 	/* In IPv6 pmtu discovery is not optional,
2552 	   so that RTAX_MTU lock cannot disable it.
2553 	   We still use this lock to block changes
2554 	   caused by addrconf/ndisc.
2555 	*/
2556 
2557 	idev = __in6_dev_get(arg->dev);
2558 	if (!idev)
2559 		return 0;
2560 
2561 	/* For administrative MTU increase, there is no way to discover
2562 	   IPv6 PMTU increase, so PMTU increase should be updated here.
2563 	   Since RFC 1981 doesn't include administrative MTU increase
2564 	   update PMTU increase is a MUST. (i.e. jumbo frame)
2565 	 */
2566 	/*
2567 	   If new MTU is less than route PMTU, this new MTU will be the
2568 	   lowest MTU in the path, update the route PMTU to reflect PMTU
2569 	   decreases; if new MTU is greater than route PMTU, and the
2570 	   old MTU is the lowest MTU in the path, update the route PMTU
2571 	   to reflect the increase. In this case if the other nodes' MTU
2572 	   also have the lowest MTU, TOO BIG MESSAGE will be lead to
2573 	   PMTU discouvery.
2574 	 */
2575 	if (rt->dst.dev == arg->dev &&
2576 	    !dst_metric_locked(&rt->dst, RTAX_MTU)) {
2577 		if (rt->rt6i_flags & RTF_CACHE) {
2578 			/* For RTF_CACHE with rt6i_pmtu == 0
2579 			 * (i.e. a redirected route),
2580 			 * the metrics of its rt->dst.from has already
2581 			 * been updated.
2582 			 */
2583 			if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu)
2584 				rt->rt6i_pmtu = arg->mtu;
2585 		} else if (dst_mtu(&rt->dst) >= arg->mtu ||
2586 			   (dst_mtu(&rt->dst) < arg->mtu &&
2587 			    dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
2588 			dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2589 		}
2590 	}
2591 	return 0;
2592 }
2593 
2594 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
2595 {
2596 	struct rt6_mtu_change_arg arg = {
2597 		.dev = dev,
2598 		.mtu = mtu,
2599 	};
2600 
2601 	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
2602 }
2603 
2604 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2605 	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2606 	[RTA_OIF]               = { .type = NLA_U32 },
2607 	[RTA_IIF]		= { .type = NLA_U32 },
2608 	[RTA_PRIORITY]          = { .type = NLA_U32 },
2609 	[RTA_METRICS]           = { .type = NLA_NESTED },
2610 	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
2611 	[RTA_PREF]              = { .type = NLA_U8 },
2612 	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
2613 	[RTA_ENCAP]		= { .type = NLA_NESTED },
2614 };
2615 
2616 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2617 			      struct fib6_config *cfg)
2618 {
2619 	struct rtmsg *rtm;
2620 	struct nlattr *tb[RTA_MAX+1];
2621 	unsigned int pref;
2622 	int err;
2623 
2624 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2625 	if (err < 0)
2626 		goto errout;
2627 
2628 	err = -EINVAL;
2629 	rtm = nlmsg_data(nlh);
2630 	memset(cfg, 0, sizeof(*cfg));
2631 
2632 	cfg->fc_table = rtm->rtm_table;
2633 	cfg->fc_dst_len = rtm->rtm_dst_len;
2634 	cfg->fc_src_len = rtm->rtm_src_len;
2635 	cfg->fc_flags = RTF_UP;
2636 	cfg->fc_protocol = rtm->rtm_protocol;
2637 	cfg->fc_type = rtm->rtm_type;
2638 
2639 	if (rtm->rtm_type == RTN_UNREACHABLE ||
2640 	    rtm->rtm_type == RTN_BLACKHOLE ||
2641 	    rtm->rtm_type == RTN_PROHIBIT ||
2642 	    rtm->rtm_type == RTN_THROW)
2643 		cfg->fc_flags |= RTF_REJECT;
2644 
2645 	if (rtm->rtm_type == RTN_LOCAL)
2646 		cfg->fc_flags |= RTF_LOCAL;
2647 
2648 	if (rtm->rtm_flags & RTM_F_CLONED)
2649 		cfg->fc_flags |= RTF_CACHE;
2650 
2651 	cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
2652 	cfg->fc_nlinfo.nlh = nlh;
2653 	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2654 
2655 	if (tb[RTA_GATEWAY]) {
2656 		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
2657 		cfg->fc_flags |= RTF_GATEWAY;
2658 	}
2659 
2660 	if (tb[RTA_DST]) {
2661 		int plen = (rtm->rtm_dst_len + 7) >> 3;
2662 
2663 		if (nla_len(tb[RTA_DST]) < plen)
2664 			goto errout;
2665 
2666 		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2667 	}
2668 
2669 	if (tb[RTA_SRC]) {
2670 		int plen = (rtm->rtm_src_len + 7) >> 3;
2671 
2672 		if (nla_len(tb[RTA_SRC]) < plen)
2673 			goto errout;
2674 
2675 		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2676 	}
2677 
2678 	if (tb[RTA_PREFSRC])
2679 		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
2680 
2681 	if (tb[RTA_OIF])
2682 		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2683 
2684 	if (tb[RTA_PRIORITY])
2685 		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2686 
2687 	if (tb[RTA_METRICS]) {
2688 		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2689 		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2690 	}
2691 
2692 	if (tb[RTA_TABLE])
2693 		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2694 
2695 	if (tb[RTA_MULTIPATH]) {
2696 		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
2697 		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
2698 	}
2699 
2700 	if (tb[RTA_PREF]) {
2701 		pref = nla_get_u8(tb[RTA_PREF]);
2702 		if (pref != ICMPV6_ROUTER_PREF_LOW &&
2703 		    pref != ICMPV6_ROUTER_PREF_HIGH)
2704 			pref = ICMPV6_ROUTER_PREF_MEDIUM;
2705 		cfg->fc_flags |= RTF_PREF(pref);
2706 	}
2707 
2708 	if (tb[RTA_ENCAP])
2709 		cfg->fc_encap = tb[RTA_ENCAP];
2710 
2711 	if (tb[RTA_ENCAP_TYPE])
2712 		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
2713 
2714 	err = 0;
2715 errout:
2716 	return err;
2717 }
2718 
2719 static int ip6_route_multipath(struct fib6_config *cfg, int add)
2720 {
2721 	struct fib6_config r_cfg;
2722 	struct rtnexthop *rtnh;
2723 	int remaining;
2724 	int attrlen;
2725 	int err = 0, last_err = 0;
2726 
2727 	remaining = cfg->fc_mp_len;
2728 beginning:
2729 	rtnh = (struct rtnexthop *)cfg->fc_mp;
2730 
2731 	/* Parse a Multipath Entry */
2732 	while (rtnh_ok(rtnh, remaining)) {
2733 		memcpy(&r_cfg, cfg, sizeof(*cfg));
2734 		if (rtnh->rtnh_ifindex)
2735 			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
2736 
2737 		attrlen = rtnh_attrlen(rtnh);
2738 		if (attrlen > 0) {
2739 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
2740 
2741 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
2742 			if (nla) {
2743 				r_cfg.fc_gateway = nla_get_in6_addr(nla);
2744 				r_cfg.fc_flags |= RTF_GATEWAY;
2745 			}
2746 			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
2747 			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
2748 			if (nla)
2749 				r_cfg.fc_encap_type = nla_get_u16(nla);
2750 		}
2751 		err = add ? ip6_route_add(&r_cfg) : ip6_route_del(&r_cfg);
2752 		if (err) {
2753 			last_err = err;
2754 			/* If we are trying to remove a route, do not stop the
2755 			 * loop when ip6_route_del() fails (because next hop is
2756 			 * already gone), we should try to remove all next hops.
2757 			 */
2758 			if (add) {
2759 				/* If add fails, we should try to delete all
2760 				 * next hops that have been already added.
2761 				 */
2762 				add = 0;
2763 				remaining = cfg->fc_mp_len - remaining;
2764 				goto beginning;
2765 			}
2766 		}
2767 		/* Because each route is added like a single route we remove
2768 		 * these flags after the first nexthop: if there is a collision,
2769 		 * we have already failed to add the first nexthop:
2770 		 * fib6_add_rt2node() has rejected it; when replacing, old
2771 		 * nexthops have been replaced by first new, the rest should
2772 		 * be added to it.
2773 		 */
2774 		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
2775 						     NLM_F_REPLACE);
2776 		rtnh = rtnh_next(rtnh, &remaining);
2777 	}
2778 
2779 	return last_err;
2780 }
2781 
2782 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh)
2783 {
2784 	struct fib6_config cfg;
2785 	int err;
2786 
2787 	err = rtm_to_fib6_config(skb, nlh, &cfg);
2788 	if (err < 0)
2789 		return err;
2790 
2791 	if (cfg.fc_mp)
2792 		return ip6_route_multipath(&cfg, 0);
2793 	else
2794 		return ip6_route_del(&cfg);
2795 }
2796 
2797 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
2798 {
2799 	struct fib6_config cfg;
2800 	int err;
2801 
2802 	err = rtm_to_fib6_config(skb, nlh, &cfg);
2803 	if (err < 0)
2804 		return err;
2805 
2806 	if (cfg.fc_mp)
2807 		return ip6_route_multipath(&cfg, 1);
2808 	else
2809 		return ip6_route_add(&cfg);
2810 }
2811 
2812 static inline size_t rt6_nlmsg_size(struct rt6_info *rt)
2813 {
2814 	return NLMSG_ALIGN(sizeof(struct rtmsg))
2815 	       + nla_total_size(16) /* RTA_SRC */
2816 	       + nla_total_size(16) /* RTA_DST */
2817 	       + nla_total_size(16) /* RTA_GATEWAY */
2818 	       + nla_total_size(16) /* RTA_PREFSRC */
2819 	       + nla_total_size(4) /* RTA_TABLE */
2820 	       + nla_total_size(4) /* RTA_IIF */
2821 	       + nla_total_size(4) /* RTA_OIF */
2822 	       + nla_total_size(4) /* RTA_PRIORITY */
2823 	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2824 	       + nla_total_size(sizeof(struct rta_cacheinfo))
2825 	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
2826 	       + nla_total_size(1) /* RTA_PREF */
2827 	       + lwtunnel_get_encap_size(rt->rt6i_lwtstate);
2828 }
2829 
2830 static int rt6_fill_node(struct net *net,
2831 			 struct sk_buff *skb, struct rt6_info *rt,
2832 			 struct in6_addr *dst, struct in6_addr *src,
2833 			 int iif, int type, u32 portid, u32 seq,
2834 			 int prefix, int nowait, unsigned int flags)
2835 {
2836 	u32 metrics[RTAX_MAX];
2837 	struct rtmsg *rtm;
2838 	struct nlmsghdr *nlh;
2839 	long expires;
2840 	u32 table;
2841 
2842 	if (prefix) {	/* user wants prefix routes only */
2843 		if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2844 			/* success since this is not a prefix route */
2845 			return 1;
2846 		}
2847 	}
2848 
2849 	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
2850 	if (!nlh)
2851 		return -EMSGSIZE;
2852 
2853 	rtm = nlmsg_data(nlh);
2854 	rtm->rtm_family = AF_INET6;
2855 	rtm->rtm_dst_len = rt->rt6i_dst.plen;
2856 	rtm->rtm_src_len = rt->rt6i_src.plen;
2857 	rtm->rtm_tos = 0;
2858 	if (rt->rt6i_table)
2859 		table = rt->rt6i_table->tb6_id;
2860 	else
2861 		table = RT6_TABLE_UNSPEC;
2862 	rtm->rtm_table = table;
2863 	if (nla_put_u32(skb, RTA_TABLE, table))
2864 		goto nla_put_failure;
2865 	if (rt->rt6i_flags & RTF_REJECT) {
2866 		switch (rt->dst.error) {
2867 		case -EINVAL:
2868 			rtm->rtm_type = RTN_BLACKHOLE;
2869 			break;
2870 		case -EACCES:
2871 			rtm->rtm_type = RTN_PROHIBIT;
2872 			break;
2873 		case -EAGAIN:
2874 			rtm->rtm_type = RTN_THROW;
2875 			break;
2876 		default:
2877 			rtm->rtm_type = RTN_UNREACHABLE;
2878 			break;
2879 		}
2880 	}
2881 	else if (rt->rt6i_flags & RTF_LOCAL)
2882 		rtm->rtm_type = RTN_LOCAL;
2883 	else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
2884 		rtm->rtm_type = RTN_LOCAL;
2885 	else
2886 		rtm->rtm_type = RTN_UNICAST;
2887 	rtm->rtm_flags = 0;
2888 	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2889 	rtm->rtm_protocol = rt->rt6i_protocol;
2890 	if (rt->rt6i_flags & RTF_DYNAMIC)
2891 		rtm->rtm_protocol = RTPROT_REDIRECT;
2892 	else if (rt->rt6i_flags & RTF_ADDRCONF) {
2893 		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ROUTEINFO))
2894 			rtm->rtm_protocol = RTPROT_RA;
2895 		else
2896 			rtm->rtm_protocol = RTPROT_KERNEL;
2897 	}
2898 
2899 	if (rt->rt6i_flags & RTF_CACHE)
2900 		rtm->rtm_flags |= RTM_F_CLONED;
2901 
2902 	if (dst) {
2903 		if (nla_put_in6_addr(skb, RTA_DST, dst))
2904 			goto nla_put_failure;
2905 		rtm->rtm_dst_len = 128;
2906 	} else if (rtm->rtm_dst_len)
2907 		if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
2908 			goto nla_put_failure;
2909 #ifdef CONFIG_IPV6_SUBTREES
2910 	if (src) {
2911 		if (nla_put_in6_addr(skb, RTA_SRC, src))
2912 			goto nla_put_failure;
2913 		rtm->rtm_src_len = 128;
2914 	} else if (rtm->rtm_src_len &&
2915 		   nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
2916 		goto nla_put_failure;
2917 #endif
2918 	if (iif) {
2919 #ifdef CONFIG_IPV6_MROUTE
2920 		if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2921 			int err = ip6mr_get_route(net, skb, rtm, nowait);
2922 			if (err <= 0) {
2923 				if (!nowait) {
2924 					if (err == 0)
2925 						return 0;
2926 					goto nla_put_failure;
2927 				} else {
2928 					if (err == -EMSGSIZE)
2929 						goto nla_put_failure;
2930 				}
2931 			}
2932 		} else
2933 #endif
2934 			if (nla_put_u32(skb, RTA_IIF, iif))
2935 				goto nla_put_failure;
2936 	} else if (dst) {
2937 		struct in6_addr saddr_buf;
2938 		if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
2939 		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
2940 			goto nla_put_failure;
2941 	}
2942 
2943 	if (rt->rt6i_prefsrc.plen) {
2944 		struct in6_addr saddr_buf;
2945 		saddr_buf = rt->rt6i_prefsrc.addr;
2946 		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
2947 			goto nla_put_failure;
2948 	}
2949 
2950 	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2951 	if (rt->rt6i_pmtu)
2952 		metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
2953 	if (rtnetlink_put_metrics(skb, metrics) < 0)
2954 		goto nla_put_failure;
2955 
2956 	if (rt->rt6i_flags & RTF_GATEWAY) {
2957 		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
2958 			goto nla_put_failure;
2959 	}
2960 
2961 	if (rt->dst.dev &&
2962 	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2963 		goto nla_put_failure;
2964 	if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
2965 		goto nla_put_failure;
2966 
2967 	expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
2968 
2969 	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
2970 		goto nla_put_failure;
2971 
2972 	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
2973 		goto nla_put_failure;
2974 
2975 	lwtunnel_fill_encap(skb, rt->rt6i_lwtstate);
2976 
2977 	nlmsg_end(skb, nlh);
2978 	return 0;
2979 
2980 nla_put_failure:
2981 	nlmsg_cancel(skb, nlh);
2982 	return -EMSGSIZE;
2983 }
2984 
2985 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2986 {
2987 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2988 	int prefix;
2989 
2990 	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2991 		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2992 		prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2993 	} else
2994 		prefix = 0;
2995 
2996 	return rt6_fill_node(arg->net,
2997 		     arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2998 		     NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
2999 		     prefix, 0, NLM_F_MULTI);
3000 }
3001 
3002 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
3003 {
3004 	struct net *net = sock_net(in_skb->sk);
3005 	struct nlattr *tb[RTA_MAX+1];
3006 	struct rt6_info *rt;
3007 	struct sk_buff *skb;
3008 	struct rtmsg *rtm;
3009 	struct flowi6 fl6;
3010 	int err, iif = 0, oif = 0;
3011 
3012 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
3013 	if (err < 0)
3014 		goto errout;
3015 
3016 	err = -EINVAL;
3017 	memset(&fl6, 0, sizeof(fl6));
3018 
3019 	if (tb[RTA_SRC]) {
3020 		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
3021 			goto errout;
3022 
3023 		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
3024 	}
3025 
3026 	if (tb[RTA_DST]) {
3027 		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
3028 			goto errout;
3029 
3030 		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
3031 	}
3032 
3033 	if (tb[RTA_IIF])
3034 		iif = nla_get_u32(tb[RTA_IIF]);
3035 
3036 	if (tb[RTA_OIF])
3037 		oif = nla_get_u32(tb[RTA_OIF]);
3038 
3039 	if (tb[RTA_MARK])
3040 		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
3041 
3042 	if (iif) {
3043 		struct net_device *dev;
3044 		int flags = 0;
3045 
3046 		dev = __dev_get_by_index(net, iif);
3047 		if (!dev) {
3048 			err = -ENODEV;
3049 			goto errout;
3050 		}
3051 
3052 		fl6.flowi6_iif = iif;
3053 
3054 		if (!ipv6_addr_any(&fl6.saddr))
3055 			flags |= RT6_LOOKUP_F_HAS_SADDR;
3056 
3057 		rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6,
3058 							       flags);
3059 	} else {
3060 		fl6.flowi6_oif = oif;
3061 
3062 		rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
3063 	}
3064 
3065 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3066 	if (!skb) {
3067 		ip6_rt_put(rt);
3068 		err = -ENOBUFS;
3069 		goto errout;
3070 	}
3071 
3072 	/* Reserve room for dummy headers, this skb can pass
3073 	   through good chunk of routing engine.
3074 	 */
3075 	skb_reset_mac_header(skb);
3076 	skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
3077 
3078 	skb_dst_set(skb, &rt->dst);
3079 
3080 	err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
3081 			    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
3082 			    nlh->nlmsg_seq, 0, 0, 0);
3083 	if (err < 0) {
3084 		kfree_skb(skb);
3085 		goto errout;
3086 	}
3087 
3088 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3089 errout:
3090 	return err;
3091 }
3092 
3093 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
3094 {
3095 	struct sk_buff *skb;
3096 	struct net *net = info->nl_net;
3097 	u32 seq;
3098 	int err;
3099 
3100 	err = -ENOBUFS;
3101 	seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3102 
3103 	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3104 	if (!skb)
3105 		goto errout;
3106 
3107 	err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
3108 				event, info->portid, seq, 0, 0, 0);
3109 	if (err < 0) {
3110 		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
3111 		WARN_ON(err == -EMSGSIZE);
3112 		kfree_skb(skb);
3113 		goto errout;
3114 	}
3115 	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3116 		    info->nlh, gfp_any());
3117 	return;
3118 errout:
3119 	if (err < 0)
3120 		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
3121 }
3122 
3123 static int ip6_route_dev_notify(struct notifier_block *this,
3124 				unsigned long event, void *ptr)
3125 {
3126 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
3127 	struct net *net = dev_net(dev);
3128 
3129 	if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
3130 		net->ipv6.ip6_null_entry->dst.dev = dev;
3131 		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
3132 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3133 		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
3134 		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
3135 		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
3136 		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
3137 #endif
3138 	}
3139 
3140 	return NOTIFY_OK;
3141 }
3142 
3143 /*
3144  *	/proc
3145  */
3146 
3147 #ifdef CONFIG_PROC_FS
3148 
3149 static const struct file_operations ipv6_route_proc_fops = {
3150 	.owner		= THIS_MODULE,
3151 	.open		= ipv6_route_open,
3152 	.read		= seq_read,
3153 	.llseek		= seq_lseek,
3154 	.release	= seq_release_net,
3155 };
3156 
3157 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
3158 {
3159 	struct net *net = (struct net *)seq->private;
3160 	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
3161 		   net->ipv6.rt6_stats->fib_nodes,
3162 		   net->ipv6.rt6_stats->fib_route_nodes,
3163 		   net->ipv6.rt6_stats->fib_rt_alloc,
3164 		   net->ipv6.rt6_stats->fib_rt_entries,
3165 		   net->ipv6.rt6_stats->fib_rt_cache,
3166 		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
3167 		   net->ipv6.rt6_stats->fib_discarded_routes);
3168 
3169 	return 0;
3170 }
3171 
3172 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
3173 {
3174 	return single_open_net(inode, file, rt6_stats_seq_show);
3175 }
3176 
3177 static const struct file_operations rt6_stats_seq_fops = {
3178 	.owner	 = THIS_MODULE,
3179 	.open	 = rt6_stats_seq_open,
3180 	.read	 = seq_read,
3181 	.llseek	 = seq_lseek,
3182 	.release = single_release_net,
3183 };
3184 #endif	/* CONFIG_PROC_FS */
3185 
3186 #ifdef CONFIG_SYSCTL
3187 
3188 static
3189 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
3190 			      void __user *buffer, size_t *lenp, loff_t *ppos)
3191 {
3192 	struct net *net;
3193 	int delay;
3194 	if (!write)
3195 		return -EINVAL;
3196 
3197 	net = (struct net *)ctl->extra1;
3198 	delay = net->ipv6.sysctl.flush_delay;
3199 	proc_dointvec(ctl, write, buffer, lenp, ppos);
3200 	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
3201 	return 0;
3202 }
3203 
3204 struct ctl_table ipv6_route_table_template[] = {
3205 	{
3206 		.procname	=	"flush",
3207 		.data		=	&init_net.ipv6.sysctl.flush_delay,
3208 		.maxlen		=	sizeof(int),
3209 		.mode		=	0200,
3210 		.proc_handler	=	ipv6_sysctl_rtcache_flush
3211 	},
3212 	{
3213 		.procname	=	"gc_thresh",
3214 		.data		=	&ip6_dst_ops_template.gc_thresh,
3215 		.maxlen		=	sizeof(int),
3216 		.mode		=	0644,
3217 		.proc_handler	=	proc_dointvec,
3218 	},
3219 	{
3220 		.procname	=	"max_size",
3221 		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
3222 		.maxlen		=	sizeof(int),
3223 		.mode		=	0644,
3224 		.proc_handler	=	proc_dointvec,
3225 	},
3226 	{
3227 		.procname	=	"gc_min_interval",
3228 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3229 		.maxlen		=	sizeof(int),
3230 		.mode		=	0644,
3231 		.proc_handler	=	proc_dointvec_jiffies,
3232 	},
3233 	{
3234 		.procname	=	"gc_timeout",
3235 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
3236 		.maxlen		=	sizeof(int),
3237 		.mode		=	0644,
3238 		.proc_handler	=	proc_dointvec_jiffies,
3239 	},
3240 	{
3241 		.procname	=	"gc_interval",
3242 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
3243 		.maxlen		=	sizeof(int),
3244 		.mode		=	0644,
3245 		.proc_handler	=	proc_dointvec_jiffies,
3246 	},
3247 	{
3248 		.procname	=	"gc_elasticity",
3249 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
3250 		.maxlen		=	sizeof(int),
3251 		.mode		=	0644,
3252 		.proc_handler	=	proc_dointvec,
3253 	},
3254 	{
3255 		.procname	=	"mtu_expires",
3256 		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
3257 		.maxlen		=	sizeof(int),
3258 		.mode		=	0644,
3259 		.proc_handler	=	proc_dointvec_jiffies,
3260 	},
3261 	{
3262 		.procname	=	"min_adv_mss",
3263 		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
3264 		.maxlen		=	sizeof(int),
3265 		.mode		=	0644,
3266 		.proc_handler	=	proc_dointvec,
3267 	},
3268 	{
3269 		.procname	=	"gc_min_interval_ms",
3270 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3271 		.maxlen		=	sizeof(int),
3272 		.mode		=	0644,
3273 		.proc_handler	=	proc_dointvec_ms_jiffies,
3274 	},
3275 	{ }
3276 };
3277 
3278 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
3279 {
3280 	struct ctl_table *table;
3281 
3282 	table = kmemdup(ipv6_route_table_template,
3283 			sizeof(ipv6_route_table_template),
3284 			GFP_KERNEL);
3285 
3286 	if (table) {
3287 		table[0].data = &net->ipv6.sysctl.flush_delay;
3288 		table[0].extra1 = net;
3289 		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
3290 		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
3291 		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3292 		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
3293 		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
3294 		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
3295 		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
3296 		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
3297 		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3298 
3299 		/* Don't export sysctls to unprivileged users */
3300 		if (net->user_ns != &init_user_ns)
3301 			table[0].procname = NULL;
3302 	}
3303 
3304 	return table;
3305 }
3306 #endif
3307 
3308 static int __net_init ip6_route_net_init(struct net *net)
3309 {
3310 	int ret = -ENOMEM;
3311 
3312 	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
3313 	       sizeof(net->ipv6.ip6_dst_ops));
3314 
3315 	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
3316 		goto out_ip6_dst_ops;
3317 
3318 	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
3319 					   sizeof(*net->ipv6.ip6_null_entry),
3320 					   GFP_KERNEL);
3321 	if (!net->ipv6.ip6_null_entry)
3322 		goto out_ip6_dst_entries;
3323 	net->ipv6.ip6_null_entry->dst.path =
3324 		(struct dst_entry *)net->ipv6.ip6_null_entry;
3325 	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3326 	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
3327 			 ip6_template_metrics, true);
3328 
3329 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3330 	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
3331 					       sizeof(*net->ipv6.ip6_prohibit_entry),
3332 					       GFP_KERNEL);
3333 	if (!net->ipv6.ip6_prohibit_entry)
3334 		goto out_ip6_null_entry;
3335 	net->ipv6.ip6_prohibit_entry->dst.path =
3336 		(struct dst_entry *)net->ipv6.ip6_prohibit_entry;
3337 	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3338 	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
3339 			 ip6_template_metrics, true);
3340 
3341 	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
3342 					       sizeof(*net->ipv6.ip6_blk_hole_entry),
3343 					       GFP_KERNEL);
3344 	if (!net->ipv6.ip6_blk_hole_entry)
3345 		goto out_ip6_prohibit_entry;
3346 	net->ipv6.ip6_blk_hole_entry->dst.path =
3347 		(struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
3348 	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3349 	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
3350 			 ip6_template_metrics, true);
3351 #endif
3352 
3353 	net->ipv6.sysctl.flush_delay = 0;
3354 	net->ipv6.sysctl.ip6_rt_max_size = 4096;
3355 	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
3356 	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
3357 	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
3358 	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
3359 	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
3360 	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
3361 
3362 	net->ipv6.ip6_rt_gc_expire = 30*HZ;
3363 
3364 	ret = 0;
3365 out:
3366 	return ret;
3367 
3368 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3369 out_ip6_prohibit_entry:
3370 	kfree(net->ipv6.ip6_prohibit_entry);
3371 out_ip6_null_entry:
3372 	kfree(net->ipv6.ip6_null_entry);
3373 #endif
3374 out_ip6_dst_entries:
3375 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
3376 out_ip6_dst_ops:
3377 	goto out;
3378 }
3379 
3380 static void __net_exit ip6_route_net_exit(struct net *net)
3381 {
3382 	kfree(net->ipv6.ip6_null_entry);
3383 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3384 	kfree(net->ipv6.ip6_prohibit_entry);
3385 	kfree(net->ipv6.ip6_blk_hole_entry);
3386 #endif
3387 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
3388 }
3389 
3390 static int __net_init ip6_route_net_init_late(struct net *net)
3391 {
3392 #ifdef CONFIG_PROC_FS
3393 	proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
3394 	proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
3395 #endif
3396 	return 0;
3397 }
3398 
3399 static void __net_exit ip6_route_net_exit_late(struct net *net)
3400 {
3401 #ifdef CONFIG_PROC_FS
3402 	remove_proc_entry("ipv6_route", net->proc_net);
3403 	remove_proc_entry("rt6_stats", net->proc_net);
3404 #endif
3405 }
3406 
3407 static struct pernet_operations ip6_route_net_ops = {
3408 	.init = ip6_route_net_init,
3409 	.exit = ip6_route_net_exit,
3410 };
3411 
3412 static int __net_init ipv6_inetpeer_init(struct net *net)
3413 {
3414 	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3415 
3416 	if (!bp)
3417 		return -ENOMEM;
3418 	inet_peer_base_init(bp);
3419 	net->ipv6.peers = bp;
3420 	return 0;
3421 }
3422 
3423 static void __net_exit ipv6_inetpeer_exit(struct net *net)
3424 {
3425 	struct inet_peer_base *bp = net->ipv6.peers;
3426 
3427 	net->ipv6.peers = NULL;
3428 	inetpeer_invalidate_tree(bp);
3429 	kfree(bp);
3430 }
3431 
3432 static struct pernet_operations ipv6_inetpeer_ops = {
3433 	.init	=	ipv6_inetpeer_init,
3434 	.exit	=	ipv6_inetpeer_exit,
3435 };
3436 
3437 static struct pernet_operations ip6_route_net_late_ops = {
3438 	.init = ip6_route_net_init_late,
3439 	.exit = ip6_route_net_exit_late,
3440 };
3441 
3442 static struct notifier_block ip6_route_dev_notifier = {
3443 	.notifier_call = ip6_route_dev_notify,
3444 	.priority = 0,
3445 };
3446 
3447 int __init ip6_route_init(void)
3448 {
3449 	int ret;
3450 	int cpu;
3451 
3452 	ret = -ENOMEM;
3453 	ip6_dst_ops_template.kmem_cachep =
3454 		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
3455 				  SLAB_HWCACHE_ALIGN, NULL);
3456 	if (!ip6_dst_ops_template.kmem_cachep)
3457 		goto out;
3458 
3459 	ret = dst_entries_init(&ip6_dst_blackhole_ops);
3460 	if (ret)
3461 		goto out_kmem_cache;
3462 
3463 	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
3464 	if (ret)
3465 		goto out_dst_entries;
3466 
3467 	ret = register_pernet_subsys(&ip6_route_net_ops);
3468 	if (ret)
3469 		goto out_register_inetpeer;
3470 
3471 	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
3472 
3473 	/* Registering of the loopback is done before this portion of code,
3474 	 * the loopback reference in rt6_info will not be taken, do it
3475 	 * manually for init_net */
3476 	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
3477 	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3478   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3479 	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
3480 	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3481 	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
3482 	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3483   #endif
3484 	ret = fib6_init();
3485 	if (ret)
3486 		goto out_register_subsys;
3487 
3488 	ret = xfrm6_init();
3489 	if (ret)
3490 		goto out_fib6_init;
3491 
3492 	ret = fib6_rules_init();
3493 	if (ret)
3494 		goto xfrm6_init;
3495 
3496 	ret = register_pernet_subsys(&ip6_route_net_late_ops);
3497 	if (ret)
3498 		goto fib6_rules_init;
3499 
3500 	ret = -ENOBUFS;
3501 	if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
3502 	    __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
3503 	    __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
3504 		goto out_register_late_subsys;
3505 
3506 	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
3507 	if (ret)
3508 		goto out_register_late_subsys;
3509 
3510 	for_each_possible_cpu(cpu) {
3511 		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
3512 
3513 		INIT_LIST_HEAD(&ul->head);
3514 		spin_lock_init(&ul->lock);
3515 	}
3516 
3517 out:
3518 	return ret;
3519 
3520 out_register_late_subsys:
3521 	unregister_pernet_subsys(&ip6_route_net_late_ops);
3522 fib6_rules_init:
3523 	fib6_rules_cleanup();
3524 xfrm6_init:
3525 	xfrm6_fini();
3526 out_fib6_init:
3527 	fib6_gc_cleanup();
3528 out_register_subsys:
3529 	unregister_pernet_subsys(&ip6_route_net_ops);
3530 out_register_inetpeer:
3531 	unregister_pernet_subsys(&ipv6_inetpeer_ops);
3532 out_dst_entries:
3533 	dst_entries_destroy(&ip6_dst_blackhole_ops);
3534 out_kmem_cache:
3535 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3536 	goto out;
3537 }
3538 
3539 void ip6_route_cleanup(void)
3540 {
3541 	unregister_netdevice_notifier(&ip6_route_dev_notifier);
3542 	unregister_pernet_subsys(&ip6_route_net_late_ops);
3543 	fib6_rules_cleanup();
3544 	xfrm6_fini();
3545 	fib6_gc_cleanup();
3546 	unregister_pernet_subsys(&ipv6_inetpeer_ops);
3547 	unregister_pernet_subsys(&ip6_route_net_ops);
3548 	dst_entries_destroy(&ip6_dst_blackhole_ops);
3549 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3550 }
3551