xref: /openbmc/linux/net/ipv6/route.c (revision 81d67439)
1 /*
2  *	Linux INET6 implementation
3  *	FIB front-end.
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13 
14 /*	Changes:
15  *
16  *	YOSHIFUJI Hideaki @USAGI
17  *		reworked default router selection.
18  *		- respect outgoing interface
19  *		- select from (probably) reachable routers (i.e.
20  *		routers in REACHABLE, STALE, DELAY or PROBE states).
21  *		- always select the same router if it is (probably)
22  *		reachable.  otherwise, round-robin the list.
23  *	Ville Nuorvala
24  *		Fixed routing subtrees.
25  */
26 
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/mroute6.h>
38 #include <linux/init.h>
39 #include <linux/if_arp.h>
40 #include <linux/proc_fs.h>
41 #include <linux/seq_file.h>
42 #include <linux/nsproxy.h>
43 #include <linux/slab.h>
44 #include <net/net_namespace.h>
45 #include <net/snmp.h>
46 #include <net/ipv6.h>
47 #include <net/ip6_fib.h>
48 #include <net/ip6_route.h>
49 #include <net/ndisc.h>
50 #include <net/addrconf.h>
51 #include <net/tcp.h>
52 #include <linux/rtnetlink.h>
53 #include <net/dst.h>
54 #include <net/xfrm.h>
55 #include <net/netevent.h>
56 #include <net/netlink.h>
57 
58 #include <asm/uaccess.h>
59 
60 #ifdef CONFIG_SYSCTL
61 #include <linux/sysctl.h>
62 #endif
63 
/* Set to 3 to get tracing. */
#define RT6_DEBUG 2

/*
 * Debug helpers.  When RT6_DEBUG >= 3, RDBG(x) expands to "printk x" (so x
 * must be a parenthesised argument list) and RT6_TRACE() expands to a
 * KERN_DEBUG printk.  Otherwise both expand to nothing.
 */
#if RT6_DEBUG >= 3
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#else
#define RDBG(x)
#define RT6_TRACE(x...) do { ; } while (0)
#endif
74 
/*
 * Forward declarations.  Most of these are referenced by the dst_ops tables
 * and the static route templates below before their definitions appear.
 */
static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
				    const struct in6_addr *dest);
static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
static unsigned int	 ip6_default_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void		ip6_dst_destroy(struct dst_entry *);
static void		ip6_dst_ifdown(struct dst_entry *,
				       struct net_device *dev, int how);
static int		 ip6_dst_gc(struct dst_ops *ops);

static int		ip6_pkt_discard(struct sk_buff *skb);
static int		ip6_pkt_discard_out(struct sk_buff *skb);
static void		ip6_link_failure(struct sk_buff *skb);
static void		ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);

/* Route Information option helpers, used by rt6_route_rcv(). */
#ifdef CONFIG_IPV6_ROUTE_INFO
static struct rt6_info *rt6_add_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr, int ifindex,
					   unsigned pref);
static struct rt6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr, int ifindex);
#endif
100 
101 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
102 {
103 	struct rt6_info *rt = (struct rt6_info *) dst;
104 	struct inet_peer *peer;
105 	u32 *p = NULL;
106 
107 	if (!rt->rt6i_peer)
108 		rt6_bind_peer(rt, 1);
109 
110 	peer = rt->rt6i_peer;
111 	if (peer) {
112 		u32 *old_p = __DST_METRICS_PTR(old);
113 		unsigned long prev, new;
114 
115 		p = peer->metrics;
116 		if (inet_metrics_new(peer))
117 			memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
118 
119 		new = (unsigned long) p;
120 		prev = cmpxchg(&dst->_metrics, old, new);
121 
122 		if (prev != old) {
123 			p = __DST_METRICS_PTR(prev);
124 			if (prev & DST_METRICS_READ_ONLY)
125 				p = NULL;
126 		}
127 	}
128 	return p;
129 }
130 
131 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, const void *daddr)
132 {
133 	return __neigh_lookup_errno(&nd_tbl, daddr, dst->dev);
134 }
135 
136 static struct dst_ops ip6_dst_ops_template = {
137 	.family			=	AF_INET6,
138 	.protocol		=	cpu_to_be16(ETH_P_IPV6),
139 	.gc			=	ip6_dst_gc,
140 	.gc_thresh		=	1024,
141 	.check			=	ip6_dst_check,
142 	.default_advmss		=	ip6_default_advmss,
143 	.default_mtu		=	ip6_default_mtu,
144 	.cow_metrics		=	ipv6_cow_metrics,
145 	.destroy		=	ip6_dst_destroy,
146 	.ifdown			=	ip6_dst_ifdown,
147 	.negative_advice	=	ip6_negative_advice,
148 	.link_failure		=	ip6_link_failure,
149 	.update_pmtu		=	ip6_rt_update_pmtu,
150 	.local_out		=	__ip6_local_out,
151 	.neigh_lookup		=	ip6_neigh_lookup,
152 };
153 
/* default_mtu hook for blackhole dsts: always reports 0; @dst is not examined. */
static unsigned int ip6_blackhole_default_mtu(const struct dst_entry *dst)
{
	const unsigned int blackhole_mtu = 0;

	return blackhole_mtu;
}
158 
159 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
160 {
161 }
162 
163 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
164 					 unsigned long old)
165 {
166 	return NULL;
167 }
168 
169 static struct dst_ops ip6_dst_blackhole_ops = {
170 	.family			=	AF_INET6,
171 	.protocol		=	cpu_to_be16(ETH_P_IPV6),
172 	.destroy		=	ip6_dst_destroy,
173 	.check			=	ip6_dst_check,
174 	.default_mtu		=	ip6_blackhole_default_mtu,
175 	.default_advmss		=	ip6_default_advmss,
176 	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
177 	.cow_metrics		=	ip6_rt_blackhole_cow_metrics,
178 	.neigh_lookup		=	ip6_neigh_lookup,
179 };
180 
181 static const u32 ip6_template_metrics[RTAX_MAX] = {
182 	[RTAX_HOPLIMIT - 1] = 255,
183 };
184 
185 static struct rt6_info ip6_null_entry_template = {
186 	.dst = {
187 		.__refcnt	= ATOMIC_INIT(1),
188 		.__use		= 1,
189 		.obsolete	= -1,
190 		.error		= -ENETUNREACH,
191 		.input		= ip6_pkt_discard,
192 		.output		= ip6_pkt_discard_out,
193 	},
194 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
195 	.rt6i_protocol  = RTPROT_KERNEL,
196 	.rt6i_metric	= ~(u32) 0,
197 	.rt6i_ref	= ATOMIC_INIT(1),
198 };
199 
200 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
201 
202 static int ip6_pkt_prohibit(struct sk_buff *skb);
203 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
204 
205 static struct rt6_info ip6_prohibit_entry_template = {
206 	.dst = {
207 		.__refcnt	= ATOMIC_INIT(1),
208 		.__use		= 1,
209 		.obsolete	= -1,
210 		.error		= -EACCES,
211 		.input		= ip6_pkt_prohibit,
212 		.output		= ip6_pkt_prohibit_out,
213 	},
214 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
215 	.rt6i_protocol  = RTPROT_KERNEL,
216 	.rt6i_metric	= ~(u32) 0,
217 	.rt6i_ref	= ATOMIC_INIT(1),
218 };
219 
220 static struct rt6_info ip6_blk_hole_entry_template = {
221 	.dst = {
222 		.__refcnt	= ATOMIC_INIT(1),
223 		.__use		= 1,
224 		.obsolete	= -1,
225 		.error		= -EINVAL,
226 		.input		= dst_discard,
227 		.output		= dst_discard,
228 	},
229 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
230 	.rt6i_protocol  = RTPROT_KERNEL,
231 	.rt6i_metric	= ~(u32) 0,
232 	.rt6i_ref	= ATOMIC_INIT(1),
233 };
234 
235 #endif
236 
237 /* allocate dst with ip6_dst_ops */
238 static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops,
239 					     struct net_device *dev,
240 					     int flags)
241 {
242 	struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, flags);
243 
244 	memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
245 
246 	return rt;
247 }
248 
249 static void ip6_dst_destroy(struct dst_entry *dst)
250 {
251 	struct rt6_info *rt = (struct rt6_info *)dst;
252 	struct inet6_dev *idev = rt->rt6i_idev;
253 	struct inet_peer *peer = rt->rt6i_peer;
254 
255 	if (idev != NULL) {
256 		rt->rt6i_idev = NULL;
257 		in6_dev_put(idev);
258 	}
259 	if (peer) {
260 		rt->rt6i_peer = NULL;
261 		inet_putpeer(peer);
262 	}
263 }
264 
265 static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
266 
267 static u32 rt6_peer_genid(void)
268 {
269 	return atomic_read(&__rt6_peer_genid);
270 }
271 
272 void rt6_bind_peer(struct rt6_info *rt, int create)
273 {
274 	struct inet_peer *peer;
275 
276 	peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
277 	if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
278 		inet_putpeer(peer);
279 	else
280 		rt->rt6i_peer_genid = rt6_peer_genid();
281 }
282 
283 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
284 			   int how)
285 {
286 	struct rt6_info *rt = (struct rt6_info *)dst;
287 	struct inet6_dev *idev = rt->rt6i_idev;
288 	struct net_device *loopback_dev =
289 		dev_net(dev)->loopback_dev;
290 
291 	if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
292 		struct inet6_dev *loopback_idev =
293 			in6_dev_get(loopback_dev);
294 		if (loopback_idev != NULL) {
295 			rt->rt6i_idev = loopback_idev;
296 			in6_dev_put(idev);
297 		}
298 	}
299 }
300 
301 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
302 {
303 	return (rt->rt6i_flags & RTF_EXPIRES) &&
304 		time_after(jiffies, rt->rt6i_expires);
305 }
306 
307 static inline int rt6_need_strict(const struct in6_addr *daddr)
308 {
309 	return ipv6_addr_type(daddr) &
310 		(IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
311 }
312 
313 /*
314  *	Route lookup. Any table->tb6_lock is implied.
315  */
316 
317 static inline struct rt6_info *rt6_device_match(struct net *net,
318 						    struct rt6_info *rt,
319 						    const struct in6_addr *saddr,
320 						    int oif,
321 						    int flags)
322 {
323 	struct rt6_info *local = NULL;
324 	struct rt6_info *sprt;
325 
326 	if (!oif && ipv6_addr_any(saddr))
327 		goto out;
328 
329 	for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
330 		struct net_device *dev = sprt->rt6i_dev;
331 
332 		if (oif) {
333 			if (dev->ifindex == oif)
334 				return sprt;
335 			if (dev->flags & IFF_LOOPBACK) {
336 				if (sprt->rt6i_idev == NULL ||
337 				    sprt->rt6i_idev->dev->ifindex != oif) {
338 					if (flags & RT6_LOOKUP_F_IFACE && oif)
339 						continue;
340 					if (local && (!oif ||
341 						      local->rt6i_idev->dev->ifindex == oif))
342 						continue;
343 				}
344 				local = sprt;
345 			}
346 		} else {
347 			if (ipv6_chk_addr(net, saddr, dev,
348 					  flags & RT6_LOOKUP_F_IFACE))
349 				return sprt;
350 		}
351 	}
352 
353 	if (oif) {
354 		if (local)
355 			return local;
356 
357 		if (flags & RT6_LOOKUP_F_IFACE)
358 			return net->ipv6.ip6_null_entry;
359 	}
360 out:
361 	return rt;
362 }
363 
364 #ifdef CONFIG_IPV6_ROUTER_PREF
365 static void rt6_probe(struct rt6_info *rt)
366 {
367 	struct neighbour *neigh = rt ? dst_get_neighbour(&rt->dst) : NULL;
368 	/*
369 	 * Okay, this does not seem to be appropriate
370 	 * for now, however, we need to check if it
371 	 * is really so; aka Router Reachability Probing.
372 	 *
373 	 * Router Reachability Probe MUST be rate-limited
374 	 * to no more than one per minute.
375 	 */
376 	if (!neigh || (neigh->nud_state & NUD_VALID))
377 		return;
378 	read_lock_bh(&neigh->lock);
379 	if (!(neigh->nud_state & NUD_VALID) &&
380 	    time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
381 		struct in6_addr mcaddr;
382 		struct in6_addr *target;
383 
384 		neigh->updated = jiffies;
385 		read_unlock_bh(&neigh->lock);
386 
387 		target = (struct in6_addr *)&neigh->primary_key;
388 		addrconf_addr_solict_mult(target, &mcaddr);
389 		ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
390 	} else
391 		read_unlock_bh(&neigh->lock);
392 }
393 #else
/* Without CONFIG_IPV6_ROUTER_PREF router probing is compiled out. */
static inline void rt6_probe(struct rt6_info *rt)
{
	return;
}
397 #endif
398 
399 /*
400  * Default Router Selection (RFC 2461 6.3.6)
401  */
402 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
403 {
404 	struct net_device *dev = rt->rt6i_dev;
405 	if (!oif || dev->ifindex == oif)
406 		return 2;
407 	if ((dev->flags & IFF_LOOPBACK) &&
408 	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
409 		return 1;
410 	return 0;
411 }
412 
413 static inline int rt6_check_neigh(struct rt6_info *rt)
414 {
415 	struct neighbour *neigh = dst_get_neighbour(&rt->dst);
416 	int m;
417 	if (rt->rt6i_flags & RTF_NONEXTHOP ||
418 	    !(rt->rt6i_flags & RTF_GATEWAY))
419 		m = 1;
420 	else if (neigh) {
421 		read_lock_bh(&neigh->lock);
422 		if (neigh->nud_state & NUD_VALID)
423 			m = 2;
424 #ifdef CONFIG_IPV6_ROUTER_PREF
425 		else if (neigh->nud_state & NUD_FAILED)
426 			m = 0;
427 #endif
428 		else
429 			m = 1;
430 		read_unlock_bh(&neigh->lock);
431 	} else
432 		m = 0;
433 	return m;
434 }
435 
436 static int rt6_score_route(struct rt6_info *rt, int oif,
437 			   int strict)
438 {
439 	int m, n;
440 
441 	m = rt6_check_dev(rt, oif);
442 	if (!m && (strict & RT6_LOOKUP_F_IFACE))
443 		return -1;
444 #ifdef CONFIG_IPV6_ROUTER_PREF
445 	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
446 #endif
447 	n = rt6_check_neigh(rt);
448 	if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
449 		return -1;
450 	return m;
451 }
452 
453 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
454 				   int *mpri, struct rt6_info *match)
455 {
456 	int m;
457 
458 	if (rt6_check_expired(rt))
459 		goto out;
460 
461 	m = rt6_score_route(rt, oif, strict);
462 	if (m < 0)
463 		goto out;
464 
465 	if (m > *mpri) {
466 		if (strict & RT6_LOOKUP_F_REACHABLE)
467 			rt6_probe(match);
468 		*mpri = m;
469 		match = rt;
470 	} else if (strict & RT6_LOOKUP_F_REACHABLE) {
471 		rt6_probe(rt);
472 	}
473 
474 out:
475 	return match;
476 }
477 
478 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
479 				     struct rt6_info *rr_head,
480 				     u32 metric, int oif, int strict)
481 {
482 	struct rt6_info *rt, *match;
483 	int mpri = -1;
484 
485 	match = NULL;
486 	for (rt = rr_head; rt && rt->rt6i_metric == metric;
487 	     rt = rt->dst.rt6_next)
488 		match = find_match(rt, oif, strict, &mpri, match);
489 	for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
490 	     rt = rt->dst.rt6_next)
491 		match = find_match(rt, oif, strict, &mpri, match);
492 
493 	return match;
494 }
495 
496 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
497 {
498 	struct rt6_info *match, *rt0;
499 	struct net *net;
500 
501 	RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
502 		  __func__, fn->leaf, oif);
503 
504 	rt0 = fn->rr_ptr;
505 	if (!rt0)
506 		fn->rr_ptr = rt0 = fn->leaf;
507 
508 	match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
509 
510 	if (!match &&
511 	    (strict & RT6_LOOKUP_F_REACHABLE)) {
512 		struct rt6_info *next = rt0->dst.rt6_next;
513 
514 		/* no entries matched; do round-robin */
515 		if (!next || next->rt6i_metric != rt0->rt6i_metric)
516 			next = fn->leaf;
517 
518 		if (next != rt0)
519 			fn->rr_ptr = next;
520 	}
521 
522 	RT6_TRACE("%s() => %p\n",
523 		  __func__, match);
524 
525 	net = dev_net(rt0->rt6i_dev);
526 	return match ? match : net->ipv6.ip6_null_entry;
527 }
528 
529 #ifdef CONFIG_IPV6_ROUTE_INFO
530 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
531 		  const struct in6_addr *gwaddr)
532 {
533 	struct net *net = dev_net(dev);
534 	struct route_info *rinfo = (struct route_info *) opt;
535 	struct in6_addr prefix_buf, *prefix;
536 	unsigned int pref;
537 	unsigned long lifetime;
538 	struct rt6_info *rt;
539 
540 	if (len < sizeof(struct route_info)) {
541 		return -EINVAL;
542 	}
543 
544 	/* Sanity check for prefix_len and length */
545 	if (rinfo->length > 3) {
546 		return -EINVAL;
547 	} else if (rinfo->prefix_len > 128) {
548 		return -EINVAL;
549 	} else if (rinfo->prefix_len > 64) {
550 		if (rinfo->length < 2) {
551 			return -EINVAL;
552 		}
553 	} else if (rinfo->prefix_len > 0) {
554 		if (rinfo->length < 1) {
555 			return -EINVAL;
556 		}
557 	}
558 
559 	pref = rinfo->route_pref;
560 	if (pref == ICMPV6_ROUTER_PREF_INVALID)
561 		return -EINVAL;
562 
563 	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
564 
565 	if (rinfo->length == 3)
566 		prefix = (struct in6_addr *)rinfo->prefix;
567 	else {
568 		/* this function is safe */
569 		ipv6_addr_prefix(&prefix_buf,
570 				 (struct in6_addr *)rinfo->prefix,
571 				 rinfo->prefix_len);
572 		prefix = &prefix_buf;
573 	}
574 
575 	rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
576 				dev->ifindex);
577 
578 	if (rt && !lifetime) {
579 		ip6_del_rt(rt);
580 		rt = NULL;
581 	}
582 
583 	if (!rt && lifetime)
584 		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
585 					pref);
586 	else if (rt)
587 		rt->rt6i_flags = RTF_ROUTEINFO |
588 				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
589 
590 	if (rt) {
591 		if (!addrconf_finite_timeout(lifetime)) {
592 			rt->rt6i_flags &= ~RTF_EXPIRES;
593 		} else {
594 			rt->rt6i_expires = jiffies + HZ * lifetime;
595 			rt->rt6i_flags |= RTF_EXPIRES;
596 		}
597 		dst_release(&rt->dst);
598 	}
599 	return 0;
600 }
601 #endif
602 
/*
 * BACKTRACK(__net, saddr): when the current result `rt` is the namespace
 * null entry, climb from `fn` towards the root looking for another node
 * that carries route info (RTN_RTINFO).
 *
 * This macro is not hygienic.  The expansion site must provide:
 *   - local variables `rt` (struct rt6_info *) and `fn` (struct fib6_node *)
 *   - a label `restart`, jumped to once a node with RTN_RTINFO is found
 *   - a label `out`, jumped to when the top-level root (RTN_TL_ROOT) is hit
 *
 * When the parent has a subtree that is not the node we came from, the
 * lookup continues inside that subtree keyed by @saddr; otherwise it simply
 * moves to the parent.
 */
#define BACKTRACK(__net, saddr)			\
do { \
	if (rt == __net->ipv6.ip6_null_entry) {	\
		struct fib6_node *pn; \
		while (1) { \
			if (fn->fn_flags & RTN_TL_ROOT) \
				goto out; \
			pn = fn->parent; \
			if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
				fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
			else \
				fn = pn; \
			if (fn->fn_flags & RTN_RTINFO) \
				goto restart; \
		} \
	} \
} while(0)
620 
/*
 * Lookup callback passed to fib6_rule_lookup() by rt6_lookup().
 *
 * Under table->tb6_lock (read side) find the node for the flow, pick an
 * entry with rt6_device_match(), and backtrack towards the root while the
 * result is the null entry.  The returned route has been passed through
 * dst_use() before the lock is dropped.
 */
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6, int flags)
{
	struct fib6_node *fn;
	struct rt6_info *rt;

	read_lock_bh(&table->tb6_lock);
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:	/* target of BACKTRACK() when a less specific node is tried */
	rt = fn->leaf;
	rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
	BACKTRACK(net, &fl6->saddr);
out:		/* also reached from BACKTRACK() at the top-level root */
	dst_use(&rt->dst, jiffies);
	read_unlock_bh(&table->tb6_lock);
	return rt;

}
640 
641 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
642 			    const struct in6_addr *saddr, int oif, int strict)
643 {
644 	struct flowi6 fl6 = {
645 		.flowi6_oif = oif,
646 		.daddr = *daddr,
647 	};
648 	struct dst_entry *dst;
649 	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
650 
651 	if (saddr) {
652 		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
653 		flags |= RT6_LOOKUP_F_HAS_SADDR;
654 	}
655 
656 	dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
657 	if (dst->error == 0)
658 		return (struct rt6_info *) dst;
659 
660 	dst_release(dst);
661 
662 	return NULL;
663 }
664 
665 EXPORT_SYMBOL(rt6_lookup);
666 
667 /* ip6_ins_rt is called with FREE table->tb6_lock.
668    It takes new route entry, the addition fails by any reason the
669    route is freed. In any case, if caller does not hold it, it may
670    be destroyed.
671  */
672 
673 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
674 {
675 	int err;
676 	struct fib6_table *table;
677 
678 	table = rt->rt6i_table;
679 	write_lock_bh(&table->tb6_lock);
680 	err = fib6_add(&table->tb6_root, rt, info);
681 	write_unlock_bh(&table->tb6_lock);
682 
683 	return err;
684 }
685 
686 int ip6_ins_rt(struct rt6_info *rt)
687 {
688 	struct nl_info info = {
689 		.nl_net = dev_net(rt->rt6i_dev),
690 	};
691 	return __ip6_ins_rt(rt, &info);
692 }
693 
694 static struct rt6_info *rt6_alloc_cow(const struct rt6_info *ort,
695 				      const struct in6_addr *daddr,
696 				      const struct in6_addr *saddr)
697 {
698 	struct rt6_info *rt;
699 
700 	/*
701 	 *	Clone the route.
702 	 */
703 
704 	rt = ip6_rt_copy(ort, daddr);
705 
706 	if (rt) {
707 		struct neighbour *neigh;
708 		int attempts = !in_softirq();
709 
710 		if (!(rt->rt6i_flags&RTF_GATEWAY)) {
711 			if (rt->rt6i_dst.plen != 128 &&
712 			    ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
713 				rt->rt6i_flags |= RTF_ANYCAST;
714 			ipv6_addr_copy(&rt->rt6i_gateway, daddr);
715 		}
716 
717 		rt->rt6i_dst.plen = 128;
718 		rt->rt6i_flags |= RTF_CACHE;
719 		rt->dst.flags |= DST_HOST;
720 
721 #ifdef CONFIG_IPV6_SUBTREES
722 		if (rt->rt6i_src.plen && saddr) {
723 			ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
724 			rt->rt6i_src.plen = 128;
725 		}
726 #endif
727 
728 	retry:
729 		neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
730 		if (IS_ERR(neigh)) {
731 			struct net *net = dev_net(rt->rt6i_dev);
732 			int saved_rt_min_interval =
733 				net->ipv6.sysctl.ip6_rt_gc_min_interval;
734 			int saved_rt_elasticity =
735 				net->ipv6.sysctl.ip6_rt_gc_elasticity;
736 
737 			if (attempts-- > 0) {
738 				net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
739 				net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
740 
741 				ip6_dst_gc(&net->ipv6.ip6_dst_ops);
742 
743 				net->ipv6.sysctl.ip6_rt_gc_elasticity =
744 					saved_rt_elasticity;
745 				net->ipv6.sysctl.ip6_rt_gc_min_interval =
746 					saved_rt_min_interval;
747 				goto retry;
748 			}
749 
750 			if (net_ratelimit())
751 				printk(KERN_WARNING
752 				       "ipv6: Neighbour table overflow.\n");
753 			dst_free(&rt->dst);
754 			return NULL;
755 		}
756 		dst_set_neighbour(&rt->dst, neigh);
757 
758 	}
759 
760 	return rt;
761 }
762 
763 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
764 					const struct in6_addr *daddr)
765 {
766 	struct rt6_info *rt = ip6_rt_copy(ort, daddr);
767 
768 	if (rt) {
769 		rt->rt6i_dst.plen = 128;
770 		rt->rt6i_flags |= RTF_CACHE;
771 		rt->dst.flags |= DST_HOST;
772 		dst_set_neighbour(&rt->dst, neigh_clone(dst_get_neighbour(&ort->dst)));
773 	}
774 	return rt;
775 }
776 
/*
 * Core policy-routing lookup shared by the input and output paths.
 *
 * @oif is the interface index to match (callers pass the flow's iif or oif).
 * Selects a route for the flow and, unless the result is the null entry or
 * already an RTF_CACHE entry, creates a per-destination clone and inserts it
 * into the table.  The returned route carries a reference taken with
 * dst_hold() and has its lastuse/__use statistics updated.
 */
static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
				      struct flowi6 *fl6, int flags)
{
	struct fib6_node *fn;
	struct rt6_info *rt, *nrt;
	int strict = 0;
	int attempts = 3;	/* bound on clone/insert retries */
	int err;
	/* Require reachable routers on the first pass unless forwarding is on. */
	int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;

	strict |= flags & RT6_LOOKUP_F_IFACE;

relookup:
	read_lock_bh(&table->tb6_lock);

restart_2:
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);

restart:	/* target of BACKTRACK() when a less specific node is tried */
	rt = rt6_select(fn, oif, strict | reachable);

	BACKTRACK(net, &fl6->saddr);
	if (rt == net->ipv6.ip6_null_entry ||
	    rt->rt6i_flags & RTF_CACHE)
		goto out;

	/* Pin the selected route; the clone is built without the table lock. */
	dst_hold(&rt->dst);
	read_unlock_bh(&table->tb6_lock);

	if (!dst_get_neighbour(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
		nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
	else if (!(rt->dst.flags & DST_HOST))
		nrt = rt6_alloc_clone(rt, &fl6->daddr);
	else
		goto out2;	/* host route with a neighbour: use as is */

	dst_release(&rt->dst);
	/* Fall back to the null entry if the clone could not be allocated. */
	rt = nrt ? : net->ipv6.ip6_null_entry;

	dst_hold(&rt->dst);
	if (nrt) {
		err = ip6_ins_rt(nrt);
		if (!err)
			goto out2;
	}

	if (--attempts <= 0)
		goto out2;

	/*
	 * Race condition! In the gap, when table->tb6_lock was
	 * released someone could insert this route.  Relookup.
	 */
	dst_release(&rt->dst);
	goto relookup;

out:
	/*
	 * Reached with the table lock held (directly or from BACKTRACK()).
	 * If the reachability requirement is still set, drop it and run the
	 * lookup once more before accepting the result.
	 */
	if (reachable) {
		reachable = 0;
		goto restart_2;
	}
	dst_hold(&rt->dst);
	read_unlock_bh(&table->tb6_lock);
out2:
	rt->dst.lastuse = jiffies;
	rt->dst.__use++;

	return rt;
}
846 
847 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
848 					    struct flowi6 *fl6, int flags)
849 {
850 	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
851 }
852 
853 void ip6_route_input(struct sk_buff *skb)
854 {
855 	const struct ipv6hdr *iph = ipv6_hdr(skb);
856 	struct net *net = dev_net(skb->dev);
857 	int flags = RT6_LOOKUP_F_HAS_SADDR;
858 	struct flowi6 fl6 = {
859 		.flowi6_iif = skb->dev->ifindex,
860 		.daddr = iph->daddr,
861 		.saddr = iph->saddr,
862 		.flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
863 		.flowi6_mark = skb->mark,
864 		.flowi6_proto = iph->nexthdr,
865 	};
866 
867 	if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
868 		flags |= RT6_LOOKUP_F_IFACE;
869 
870 	skb_dst_set(skb, fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_input));
871 }
872 
873 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
874 					     struct flowi6 *fl6, int flags)
875 {
876 	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
877 }
878 
879 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
880 				    struct flowi6 *fl6)
881 {
882 	int flags = 0;
883 
884 	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
885 		flags |= RT6_LOOKUP_F_IFACE;
886 
887 	if (!ipv6_addr_any(&fl6->saddr))
888 		flags |= RT6_LOOKUP_F_HAS_SADDR;
889 	else if (sk)
890 		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
891 
892 	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
893 }
894 
895 EXPORT_SYMBOL(ip6_route_output);
896 
897 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
898 {
899 	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
900 	struct dst_entry *new = NULL;
901 
902 	rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
903 	if (rt) {
904 		memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
905 
906 		new = &rt->dst;
907 
908 		new->__use = 1;
909 		new->input = dst_discard;
910 		new->output = dst_discard;
911 
912 		if (dst_metrics_read_only(&ort->dst))
913 			new->_metrics = ort->dst._metrics;
914 		else
915 			dst_copy_metrics(new, &ort->dst);
916 		rt->rt6i_idev = ort->rt6i_idev;
917 		if (rt->rt6i_idev)
918 			in6_dev_hold(rt->rt6i_idev);
919 		rt->rt6i_expires = 0;
920 
921 		ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
922 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
923 		rt->rt6i_metric = 0;
924 
925 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
926 #ifdef CONFIG_IPV6_SUBTREES
927 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
928 #endif
929 
930 		dst_free(new);
931 	}
932 
933 	dst_release(dst_orig);
934 	return new ? new : ERR_PTR(-ENOMEM);
935 }
936 
937 /*
938  *	Destination cache support functions
939  */
940 
941 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
942 {
943 	struct rt6_info *rt;
944 
945 	rt = (struct rt6_info *) dst;
946 
947 	if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
948 		if (rt->rt6i_peer_genid != rt6_peer_genid()) {
949 			if (!rt->rt6i_peer)
950 				rt6_bind_peer(rt, 0);
951 			rt->rt6i_peer_genid = rt6_peer_genid();
952 		}
953 		return dst;
954 	}
955 	return NULL;
956 }
957 
958 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
959 {
960 	struct rt6_info *rt = (struct rt6_info *) dst;
961 
962 	if (rt) {
963 		if (rt->rt6i_flags & RTF_CACHE) {
964 			if (rt6_check_expired(rt)) {
965 				ip6_del_rt(rt);
966 				dst = NULL;
967 			}
968 		} else {
969 			dst_release(dst);
970 			dst = NULL;
971 		}
972 	}
973 	return dst;
974 }
975 
976 static void ip6_link_failure(struct sk_buff *skb)
977 {
978 	struct rt6_info *rt;
979 
980 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
981 
982 	rt = (struct rt6_info *) skb_dst(skb);
983 	if (rt) {
984 		if (rt->rt6i_flags&RTF_CACHE) {
985 			dst_set_expires(&rt->dst, 0);
986 			rt->rt6i_flags |= RTF_EXPIRES;
987 		} else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
988 			rt->rt6i_node->fn_sernum = -1;
989 	}
990 }
991 
992 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
993 {
994 	struct rt6_info *rt6 = (struct rt6_info*)dst;
995 
996 	if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
997 		rt6->rt6i_flags |= RTF_MODIFIED;
998 		if (mtu < IPV6_MIN_MTU) {
999 			u32 features = dst_metric(dst, RTAX_FEATURES);
1000 			mtu = IPV6_MIN_MTU;
1001 			features |= RTAX_FEATURE_ALLFRAG;
1002 			dst_metric_set(dst, RTAX_FEATURES, features);
1003 		}
1004 		dst_metric_set(dst, RTAX_MTU, mtu);
1005 	}
1006 }
1007 
1008 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1009 {
1010 	struct net_device *dev = dst->dev;
1011 	unsigned int mtu = dst_mtu(dst);
1012 	struct net *net = dev_net(dev);
1013 
1014 	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1015 
1016 	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1017 		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1018 
1019 	/*
1020 	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1021 	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1022 	 * IPV6_MAXPLEN is also valid and means: "any MSS,
1023 	 * rely only on pmtu discovery"
1024 	 */
1025 	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1026 		mtu = IPV6_MAXPLEN;
1027 	return mtu;
1028 }
1029 
1030 static unsigned int ip6_default_mtu(const struct dst_entry *dst)
1031 {
1032 	unsigned int mtu = IPV6_MIN_MTU;
1033 	struct inet6_dev *idev;
1034 
1035 	rcu_read_lock();
1036 	idev = __in6_dev_get(dst->dev);
1037 	if (idev)
1038 		mtu = idev->cnf.mtu6;
1039 	rcu_read_unlock();
1040 
1041 	return mtu;
1042 }
1043 
1044 static struct dst_entry *icmp6_dst_gc_list;
1045 static DEFINE_SPINLOCK(icmp6_dst_lock);
1046 
1047 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1048 				  struct neighbour *neigh,
1049 				  const struct in6_addr *addr)
1050 {
1051 	struct rt6_info *rt;
1052 	struct inet6_dev *idev = in6_dev_get(dev);
1053 	struct net *net = dev_net(dev);
1054 
1055 	if (unlikely(idev == NULL))
1056 		return NULL;
1057 
1058 	rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0);
1059 	if (unlikely(rt == NULL)) {
1060 		in6_dev_put(idev);
1061 		goto out;
1062 	}
1063 
1064 	if (neigh)
1065 		neigh_hold(neigh);
1066 	else {
1067 		neigh = ndisc_get_neigh(dev, addr);
1068 		if (IS_ERR(neigh))
1069 			neigh = NULL;
1070 	}
1071 
1072 	rt->rt6i_idev     = idev;
1073 	dst_set_neighbour(&rt->dst, neigh);
1074 	atomic_set(&rt->dst.__refcnt, 1);
1075 	ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1076 	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
1077 	rt->dst.output  = ip6_output;
1078 
1079 	spin_lock_bh(&icmp6_dst_lock);
1080 	rt->dst.next = icmp6_dst_gc_list;
1081 	icmp6_dst_gc_list = &rt->dst;
1082 	spin_unlock_bh(&icmp6_dst_lock);
1083 
1084 	fib6_force_start_gc(net);
1085 
1086 out:
1087 	return &rt->dst;
1088 }
1089 
1090 int icmp6_dst_gc(void)
1091 {
1092 	struct dst_entry *dst, **pprev;
1093 	int more = 0;
1094 
1095 	spin_lock_bh(&icmp6_dst_lock);
1096 	pprev = &icmp6_dst_gc_list;
1097 
1098 	while ((dst = *pprev) != NULL) {
1099 		if (!atomic_read(&dst->__refcnt)) {
1100 			*pprev = dst->next;
1101 			dst_free(dst);
1102 		} else {
1103 			pprev = &dst->next;
1104 			++more;
1105 		}
1106 	}
1107 
1108 	spin_unlock_bh(&icmp6_dst_lock);
1109 
1110 	return more;
1111 }
1112 
1113 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1114 			    void *arg)
1115 {
1116 	struct dst_entry *dst, **pprev;
1117 
1118 	spin_lock_bh(&icmp6_dst_lock);
1119 	pprev = &icmp6_dst_gc_list;
1120 	while ((dst = *pprev) != NULL) {
1121 		struct rt6_info *rt = (struct rt6_info *) dst;
1122 		if (func(rt, arg)) {
1123 			*pprev = dst->next;
1124 			dst_free(dst);
1125 		} else {
1126 			pprev = &dst->next;
1127 		}
1128 	}
1129 	spin_unlock_bh(&icmp6_dst_lock);
1130 }
1131 
/*
 * Garbage-collection hook for the per-namespace IPv6 dst_ops.
 *
 * The FIB GC pass is skipped when the last pass ran less than
 * ip6_rt_gc_min_interval ago and the count reported by
 * dst_entries_get_fast() does not exceed ip6_rt_max_size.
 *
 * ip6_rt_gc_expire is the value handed to fib6_run_gc(): it is bumped by
 * one before each pass, reset to half of ip6_rt_gc_timeout once the entry
 * count is below ops->gc_thresh, and reduced by
 * (ip6_rt_gc_expire >> ip6_rt_gc_elasticity) on every call, including the
 * calls that skip the pass.
 *
 * Returns non-zero when the entry count is still above ip6_rt_max_size.
 */
static int ip6_dst_gc(struct dst_ops *ops)
{
	unsigned long now = jiffies;
	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
	int entries;

	entries = dst_entries_get_fast(ops);
	/* Too soon since the last pass and not over the limit: skip the pass. */
	if (time_after(rt_last_gc + rt_min_interval, now) &&
	    entries <= rt_max_size)
		goto out;

	net->ipv6.ip6_rt_gc_expire++;
	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
	net->ipv6.ip6_rt_last_gc = now;
	/* Re-read the count after the pass for the threshold test and result. */
	entries = dst_entries_get_slow(ops);
	if (entries < ops->gc_thresh)
		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
out:
	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
	return entries > rt_max_size;
}
1158 
1159 /* Clean host part of a prefix. Not necessary in radix tree,
1160    but results in cleaner routing tables.
1161 
1162    Remove it only when all the things will work!
1163  */
1164 
1165 int ip6_dst_hoplimit(struct dst_entry *dst)
1166 {
1167 	int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1168 	if (hoplimit == 0) {
1169 		struct net_device *dev = dst->dev;
1170 		struct inet6_dev *idev;
1171 
1172 		rcu_read_lock();
1173 		idev = __in6_dev_get(dev);
1174 		if (idev)
1175 			hoplimit = idev->cnf.hop_limit;
1176 		else
1177 			hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1178 		rcu_read_unlock();
1179 	}
1180 	return hoplimit;
1181 }
1182 EXPORT_SYMBOL(ip6_dst_hoplimit);
1183 
/*
 *	ip6_route_add - build a route from @cfg and insert it into the FIB.
 *
 *	Validates the prefix lengths, resolves the output device (from
 *	cfg->fc_ifindex or, for a gateway that is not link-local unicast,
 *	from the existing route towards that gateway), fills in a freshly
 *	allocated rt6_info and passes it to __ip6_ins_rt().
 *
 *	@cfg is modified: a zero fc_metric becomes IP6_RT_PRIO_USER, an
 *	RTPROT_UNSPEC fc_protocol becomes RTPROT_BOOT, and fc_nlinfo.nl_net
 *	is overwritten with the namespace of the chosen device.
 *
 *	Returns the result of __ip6_ins_rt() when the route is handed over,
 *	or a negative errno.  On the error path the dev/idev references held
 *	at that point and the half-built route are released.
 */

int ip6_route_add(struct fib6_config *cfg)
{
	int err;
	struct net *net = cfg->fc_nlinfo.nl_net;
	struct rt6_info *rt = NULL;
	struct net_device *dev = NULL;
	struct inet6_dev *idev = NULL;
	struct fib6_table *table;
	int addr_type;

	if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
		return -EINVAL;
#ifndef CONFIG_IPV6_SUBTREES
	/* Source-specific routes need subtree support. */
	if (cfg->fc_src_len)
		return -EINVAL;
#endif
	if (cfg->fc_ifindex) {
		/* Takes a reference on both dev and idev; 'out' drops them. */
		err = -ENODEV;
		dev = dev_get_by_index(net, cfg->fc_ifindex);
		if (!dev)
			goto out;
		idev = in6_dev_get(dev);
		if (!idev)
			goto out;
	}

	if (cfg->fc_metric == 0)
		cfg->fc_metric = IP6_RT_PRIO_USER;

	table = fib6_new_table(net, cfg->fc_table);
	if (table == NULL) {
		err = -ENOBUFS;
		goto out;
	}

	rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, NULL, DST_NOCOUNT);

	if (rt == NULL) {
		err = -ENOMEM;
		goto out;
	}

	rt->dst.obsolete = -1;
	/* fc_expires is in clock_t units; 0 means the route never expires. */
	rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
				jiffies + clock_t_to_jiffies(cfg->fc_expires) :
				0;

	if (cfg->fc_protocol == RTPROT_UNSPEC)
		cfg->fc_protocol = RTPROT_BOOT;
	rt->rt6i_protocol = cfg->fc_protocol;

	addr_type = ipv6_addr_type(&cfg->fc_dst);

	/* Pick the input handler from the destination type / RTF_LOCAL. */
	if (addr_type & IPV6_ADDR_MULTICAST)
		rt->dst.input = ip6_mc_input;
	else if (cfg->fc_flags & RTF_LOCAL)
		rt->dst.input = ip6_input;
	else
		rt->dst.input = ip6_forward;

	rt->dst.output = ip6_output;

	/* Store only the prefix bits of the destination. */
	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
	rt->rt6i_dst.plen = cfg->fc_dst_len;
	if (rt->rt6i_dst.plen == 128)
	       rt->dst.flags |= DST_HOST;

#ifdef CONFIG_IPV6_SUBTREES
	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
	rt->rt6i_src.plen = cfg->fc_src_len;
#endif

	rt->rt6i_metric = cfg->fc_metric;

	/* We cannot add true routes via loopback here,
	   they would result in kernel looping; promote them to reject routes
	 */
	if ((cfg->fc_flags & RTF_REJECT) ||
	    (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK)
					      && !(cfg->fc_flags&RTF_LOCAL))) {
		/* hold loopback dev/idev if we haven't done so. */
		if (dev != net->loopback_dev) {
			if (dev) {
				dev_put(dev);
				in6_dev_put(idev);
			}
			dev = net->loopback_dev;
			dev_hold(dev);
			idev = in6_dev_get(dev);
			if (!idev) {
				err = -ENODEV;
				goto out;
			}
		}
		rt->dst.output = ip6_pkt_discard_out;
		rt->dst.input = ip6_pkt_discard;
		rt->dst.error = -ENETUNREACH;
		/* Reject routes get a fixed flag set; cfg->fc_flags is not copied. */
		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
		goto install_route;
	}

	if (cfg->fc_flags & RTF_GATEWAY) {
		const struct in6_addr *gw_addr;
		int gwa_type;

		gw_addr = &cfg->fc_gateway;
		ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
		gwa_type = ipv6_addr_type(gw_addr);

		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
			struct rt6_info *grt;

			/* IPv6 strictly inhibits using not link-local
			   addresses as nexthop address.
			   Otherwise, router will not able to send redirects.
			   It is very good, but in some (rare!) circumstances
			   (SIT, PtP, NBMA NOARP links) it is handy to allow
			   some exceptions. --ANK
			 */
			err = -EINVAL;
			if (!(gwa_type&IPV6_ADDR_UNICAST))
				goto out;

			/* Find the existing route that reaches the gateway. */
			grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);

			err = -EHOSTUNREACH;
			if (grt == NULL)
				goto out;
			if (dev) {
				/* Requested device must match the one the gateway is on. */
				if (dev != grt->rt6i_dev) {
					dst_release(&grt->dst);
					goto out;
				}
			} else {
				/* No device given: inherit it (with references) from grt. */
				dev = grt->rt6i_dev;
				idev = grt->rt6i_idev;
				dev_hold(dev);
				in6_dev_hold(grt->rt6i_idev);
			}
			/*
			 * err stays -EHOSTUNREACH unless the route to the gateway
			 * is itself not a gateway route.
			 */
			if (!(grt->rt6i_flags&RTF_GATEWAY))
				err = 0;
			dst_release(&grt->dst);

			if (err)
				goto out;
		}
		err = -EINVAL;
		if (dev == NULL || (dev->flags&IFF_LOOPBACK))
			goto out;
	}

	err = -ENODEV;
	if (dev == NULL)
		goto out;

	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
		/* A preferred source must pass ipv6_chk_addr() for this device. */
		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
			err = -EINVAL;
			goto out;
		}
		ipv6_addr_copy(&rt->rt6i_prefsrc.addr, &cfg->fc_prefsrc);
		rt->rt6i_prefsrc.plen = 128;
	} else
		rt->rt6i_prefsrc.plen = 0;

	if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
		/* Bind the neighbour entry for the gateway address now. */
		struct neighbour *n = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
		if (IS_ERR(n)) {
			err = PTR_ERR(n);
			goto out;
		}
		dst_set_neighbour(&rt->dst, n);
	}

	rt->rt6i_flags = cfg->fc_flags;

install_route:
	if (cfg->fc_mx) {
		struct nlattr *nla;
		int remaining;

		/* Copy user-supplied metrics; type 0 is skipped, > RTAX_MAX rejected. */
		nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
			int type = nla_type(nla);

			if (type) {
				if (type > RTAX_MAX) {
					err = -EINVAL;
					goto out;
				}

				dst_metric_set(&rt->dst, type, nla_get_u32(nla));
			}
		}
	}

	/* The dev/idev pointers (and the references held on them) move into rt. */
	rt->dst.dev = dev;
	rt->rt6i_idev = idev;
	rt->rt6i_table = table;

	cfg->fc_nlinfo.nl_net = dev_net(dev);

	return __ip6_ins_rt(rt, &cfg->fc_nlinfo);

out:
	if (dev)
		dev_put(dev);
	if (idev)
		in6_dev_put(idev);
	if (rt)
		dst_free(&rt->dst);
	return err;
}
1400 
1401 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1402 {
1403 	int err;
1404 	struct fib6_table *table;
1405 	struct net *net = dev_net(rt->rt6i_dev);
1406 
1407 	if (rt == net->ipv6.ip6_null_entry)
1408 		return -ENOENT;
1409 
1410 	table = rt->rt6i_table;
1411 	write_lock_bh(&table->tb6_lock);
1412 
1413 	err = fib6_del(rt, info);
1414 	dst_release(&rt->dst);
1415 
1416 	write_unlock_bh(&table->tb6_lock);
1417 
1418 	return err;
1419 }
1420 
1421 int ip6_del_rt(struct rt6_info *rt)
1422 {
1423 	struct nl_info info = {
1424 		.nl_net = dev_net(rt->rt6i_dev),
1425 	};
1426 	return __ip6_del_rt(rt, &info);
1427 }
1428 
1429 static int ip6_route_del(struct fib6_config *cfg)
1430 {
1431 	struct fib6_table *table;
1432 	struct fib6_node *fn;
1433 	struct rt6_info *rt;
1434 	int err = -ESRCH;
1435 
1436 	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1437 	if (table == NULL)
1438 		return err;
1439 
1440 	read_lock_bh(&table->tb6_lock);
1441 
1442 	fn = fib6_locate(&table->tb6_root,
1443 			 &cfg->fc_dst, cfg->fc_dst_len,
1444 			 &cfg->fc_src, cfg->fc_src_len);
1445 
1446 	if (fn) {
1447 		for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1448 			if (cfg->fc_ifindex &&
1449 			    (rt->rt6i_dev == NULL ||
1450 			     rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1451 				continue;
1452 			if (cfg->fc_flags & RTF_GATEWAY &&
1453 			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1454 				continue;
1455 			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1456 				continue;
1457 			dst_hold(&rt->dst);
1458 			read_unlock_bh(&table->tb6_lock);
1459 
1460 			return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1461 		}
1462 	}
1463 	read_unlock_bh(&table->tb6_lock);
1464 
1465 	return err;
1466 }
1467 
1468 /*
1469  *	Handle redirects
1470  */
/*
 * Lookup key for redirect validation: a flowi6 with the address of the
 * router that sent the redirect appended.  fl6 must stay the first member
 * because __ip6_route_redirect() casts its flowi6 pointer back to this type.
 */
struct ip6rd_flowi {
	struct flowi6 fl6;
	struct in6_addr gateway;	/* router the redirect came from */
};
1475 
/*
 * Per-table lookup callback used for redirect validation.
 *
 * @fl6 is really the fl6 member of a struct ip6rd_flowi; the extra
 * gateway field holds the address of the router that sent the redirect.
 * Returns, with a reference held, a non-expired gateway route on the
 * receiving interface whose gateway equals that router, or the
 * namespace's ip6_null_entry.
 *
 * NOTE(review): BACKTRACK is a macro defined elsewhere in this file; the
 * otherwise unused 'restart' and 'out' labels are presumably its jump
 * targets -- confirm against the macro before touching them.
 */
static struct rt6_info *__ip6_route_redirect(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     int flags)
{
	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
	struct rt6_info *rt;
	struct fib6_node *fn;

	/*
	 * Get the "current" route for this destination and
	 * check if the redirect has come from appropriate router.
	 *
	 * RFC 2461 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routes.
	 */

	read_lock_bh(&table->tb6_lock);
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
		/*
		 * Current route is on-link; redirect is always invalid.
		 *
		 * Seems, previous statement is not true. It could
		 * be node, which looks for us as on-link (f.e. proxy ndisc)
		 * But then router serving it might decide, that we should
		 * know truth 8)8) --ANK (980726).
		 */
		if (rt6_check_expired(rt))
			continue;
		if (!(rt->rt6i_flags & RTF_GATEWAY))
			continue;
		if (fl6->flowi6_oif != rt->rt6i_dev->ifindex)
			continue;
		if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
			continue;
		break;
	}

	/* No candidate in this node: fall back to the null entry. */
	if (!rt)
		rt = net->ipv6.ip6_null_entry;
	BACKTRACK(net, &fl6->saddr);
out:
	/* The caller receives a reference on whatever is returned. */
	dst_hold(&rt->dst);

	read_unlock_bh(&table->tb6_lock);

	return rt;
};
1529 
1530 static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
1531 					   const struct in6_addr *src,
1532 					   const struct in6_addr *gateway,
1533 					   struct net_device *dev)
1534 {
1535 	int flags = RT6_LOOKUP_F_HAS_SADDR;
1536 	struct net *net = dev_net(dev);
1537 	struct ip6rd_flowi rdfl = {
1538 		.fl6 = {
1539 			.flowi6_oif = dev->ifindex,
1540 			.daddr = *dest,
1541 			.saddr = *src,
1542 		},
1543 	};
1544 
1545 	ipv6_addr_copy(&rdfl.gateway, gateway);
1546 
1547 	if (rt6_need_strict(dest))
1548 		flags |= RT6_LOOKUP_F_IFACE;
1549 
1550 	return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1551 						   flags, __ip6_route_redirect);
1552 }
1553 
/*
 * Process a redirect for @dest.
 *
 * @src and @saddr are passed to ip6_route_redirect() as the flow source
 * and as the address of the router that sent the redirect, respectively.
 * @neigh is the neighbour entry of the new next hop, @lladdr its
 * link-layer address, and @on_link is non-zero when the target is
 * on-link (the new route then carries no RTF_GATEWAY).
 *
 * If a valid current route is found, the neighbour is updated and a
 * /128 RTF_CACHE|RTF_DYNAMIC clone pointing at @neigh is inserted; a
 * previous cache entry for the destination is deleted.
 */
void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
		  const struct in6_addr *saddr,
		  struct neighbour *neigh, u8 *lladdr, int on_link)
{
	struct rt6_info *rt, *nrt = NULL;
	struct netevent_redirect netevent;
	struct net *net = dev_net(neigh->dev);

	/* Returns a held route; every exit below must account for it. */
	rt = ip6_route_redirect(dest, src, saddr, neigh->dev);

	if (rt == net->ipv6.ip6_null_entry) {
		if (net_ratelimit())
			printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
			       "for redirect target\n");
		goto out;
	}

	/*
	 *	We have finally decided to accept it.
	 */

	neigh_update(neigh, lladdr, NUD_STALE,
		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
		     NEIGH_UPDATE_F_OVERRIDE|
		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
				     NEIGH_UPDATE_F_ISROUTER))
		     );

	/*
	 * Redirect received -> path was valid.
	 * Look, redirects are sent only in response to data packets,
	 * so that this nexthop apparently is reachable. --ANK
	 */
	dst_confirm(&rt->dst);

	/* Duplicate redirect: silently ignore. */
	if (neigh == dst_get_neighbour(&rt->dst))
		goto out;

	nrt = ip6_rt_copy(rt, dest);
	if (nrt == NULL)
		goto out;

	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
	if (on_link)
		nrt->rt6i_flags &= ~RTF_GATEWAY;

	/* The clone is always a host route for exactly @dest. */
	nrt->rt6i_dst.plen = 128;
	nrt->dst.flags |= DST_HOST;

	ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
	dst_set_neighbour(&nrt->dst, neigh_clone(neigh));

	if (ip6_ins_rt(nrt))
		goto out;

	netevent.old = &rt->dst;
	netevent.new = &nrt->dst;
	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);

	if (rt->rt6i_flags&RTF_CACHE) {
		/*
		 * The reference on rt is handed to ip6_del_rt(), so the
		 * dst_release() at 'out' must be skipped.
		 */
		ip6_del_rt(rt);
		return;
	}

out:
	dst_release(&rt->dst);
}
1622 
1623 /*
1624  *	Handle ICMP "packet too big" messages
1625  *	i.e. Path MTU discovery
1626  */
1627 
/*
 * Apply a "Packet Too Big" report of @pmtu to the route for
 * @daddr/@saddr looked up with output interface @ifindex (0 = any).
 *
 * Expired routes found by the lookup are deleted and the lookup is
 * retried.  Reports that do not lower the route's current MTU are
 * ignored.  An existing RTF_CACHE route is updated in place; for any
 * other route a host clone carrying the new MTU is created and inserted.
 * Both kinds of entry expire after the ip6_rt_mtu_expires sysctl.
 */
static void rt6_do_pmtu_disc(const struct in6_addr *daddr, const struct in6_addr *saddr,
			     struct net *net, u32 pmtu, int ifindex)
{
	struct rt6_info *rt, *nrt;
	int allfrag = 0;
again:
	rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
	if (rt == NULL)
		return;

	if (rt6_check_expired(rt)) {
		/* ip6_del_rt() takes over the reference; look up again. */
		ip6_del_rt(rt);
		goto again;
	}

	/* Only ever lower the MTU. */
	if (pmtu >= dst_mtu(&rt->dst))
		goto out;

	if (pmtu < IPV6_MIN_MTU) {
		/*
		 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
		 * MTU (1280) and a fragment header should always be included
		 * after a node receiving Too Big message reporting PMTU is
		 * less than the IPv6 Minimum Link MTU.
		 */
		pmtu = IPV6_MIN_MTU;
		allfrag = 1;
	}

	/* New mtu received -> path was valid.
	   They are sent only in response to data packets,
	   so that this nexthop apparently is reachable. --ANK
	 */
	dst_confirm(&rt->dst);

	/* Host route. If it is static, it would be better
	   not to override it, but add new one, so that
	   when cache entry will expire old pmtu
	   would return automatically.
	 */
	if (rt->rt6i_flags & RTF_CACHE) {
		dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
		if (allfrag) {
			u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
			features |= RTAX_FEATURE_ALLFRAG;
			dst_metric_set(&rt->dst, RTAX_FEATURES, features);
		}
		dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
		rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
		goto out;
	}

	/* Network route.
	   Two cases are possible:
	   1. It is connected route. Action: COW
	   2. It is gatewayed route or NONEXTHOP route. Action: clone it.
	 */
	if (!dst_get_neighbour(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
		nrt = rt6_alloc_cow(rt, daddr, saddr);
	else
		nrt = rt6_alloc_clone(rt, daddr);

	/* Allocation failure is silently ignored: the report is just dropped. */
	if (nrt) {
		dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
		if (allfrag) {
			u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
			features |= RTAX_FEATURE_ALLFRAG;
			dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
		}

		/* According to RFC 1981, detecting PMTU increase shouldn't be
		 * happened within 5 mins, the recommended timer is 10 mins.
		 * Here this route expiration time is set to ip6_rt_mtu_expires
		 * which is 10 mins. After 10 mins the decreased pmtu is expired
		 * and detecting PMTU increase will be automatically happened.
		 */
		dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
		nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;

		ip6_ins_rt(nrt);
	}
out:
	dst_release(&rt->dst);
}
1712 
1713 void rt6_pmtu_discovery(const struct in6_addr *daddr, const struct in6_addr *saddr,
1714 			struct net_device *dev, u32 pmtu)
1715 {
1716 	struct net *net = dev_net(dev);
1717 
1718 	/*
1719 	 * RFC 1981 states that a node "MUST reduce the size of the packets it
1720 	 * is sending along the path" that caused the Packet Too Big message.
1721 	 * Since it's not possible in the general case to determine which
1722 	 * interface was used to send the original packet, we update the MTU
1723 	 * on the interface that will be used to send future packets. We also
1724 	 * update the MTU on the interface that received the Packet Too Big in
1725 	 * case the original packet was forced out that interface with
1726 	 * SO_BINDTODEVICE or similar. This is the next best thing to the
1727 	 * correct behaviour, which would be to update the MTU on all
1728 	 * interfaces.
1729 	 */
1730 	rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1731 	rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1732 }
1733 
1734 /*
1735  *	Misc support functions
1736  */
1737 
1738 static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
1739 				    const struct in6_addr *dest)
1740 {
1741 	struct net *net = dev_net(ort->rt6i_dev);
1742 	struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
1743 					    ort->dst.dev, 0);
1744 
1745 	if (rt) {
1746 		rt->dst.input = ort->dst.input;
1747 		rt->dst.output = ort->dst.output;
1748 
1749 		ipv6_addr_copy(&rt->rt6i_dst.addr, dest);
1750 		rt->rt6i_dst.plen = ort->rt6i_dst.plen;
1751 		dst_copy_metrics(&rt->dst, &ort->dst);
1752 		rt->dst.error = ort->dst.error;
1753 		rt->rt6i_idev = ort->rt6i_idev;
1754 		if (rt->rt6i_idev)
1755 			in6_dev_hold(rt->rt6i_idev);
1756 		rt->dst.lastuse = jiffies;
1757 		rt->rt6i_expires = 0;
1758 
1759 		ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1760 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1761 		rt->rt6i_metric = 0;
1762 
1763 #ifdef CONFIG_IPV6_SUBTREES
1764 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1765 #endif
1766 		memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1767 		rt->rt6i_table = ort->rt6i_table;
1768 	}
1769 	return rt;
1770 }
1771 
1772 #ifdef CONFIG_IPV6_ROUTE_INFO
1773 static struct rt6_info *rt6_get_route_info(struct net *net,
1774 					   const struct in6_addr *prefix, int prefixlen,
1775 					   const struct in6_addr *gwaddr, int ifindex)
1776 {
1777 	struct fib6_node *fn;
1778 	struct rt6_info *rt = NULL;
1779 	struct fib6_table *table;
1780 
1781 	table = fib6_get_table(net, RT6_TABLE_INFO);
1782 	if (table == NULL)
1783 		return NULL;
1784 
1785 	write_lock_bh(&table->tb6_lock);
1786 	fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1787 	if (!fn)
1788 		goto out;
1789 
1790 	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1791 		if (rt->rt6i_dev->ifindex != ifindex)
1792 			continue;
1793 		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1794 			continue;
1795 		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1796 			continue;
1797 		dst_hold(&rt->dst);
1798 		break;
1799 	}
1800 out:
1801 	write_unlock_bh(&table->tb6_lock);
1802 	return rt;
1803 }
1804 
1805 static struct rt6_info *rt6_add_route_info(struct net *net,
1806 					   const struct in6_addr *prefix, int prefixlen,
1807 					   const struct in6_addr *gwaddr, int ifindex,
1808 					   unsigned pref)
1809 {
1810 	struct fib6_config cfg = {
1811 		.fc_table	= RT6_TABLE_INFO,
1812 		.fc_metric	= IP6_RT_PRIO_USER,
1813 		.fc_ifindex	= ifindex,
1814 		.fc_dst_len	= prefixlen,
1815 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1816 				  RTF_UP | RTF_PREF(pref),
1817 		.fc_nlinfo.pid = 0,
1818 		.fc_nlinfo.nlh = NULL,
1819 		.fc_nlinfo.nl_net = net,
1820 	};
1821 
1822 	ipv6_addr_copy(&cfg.fc_dst, prefix);
1823 	ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1824 
1825 	/* We should treat it as a default route if prefix length is 0. */
1826 	if (!prefixlen)
1827 		cfg.fc_flags |= RTF_DEFAULT;
1828 
1829 	ip6_route_add(&cfg);
1830 
1831 	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1832 }
1833 #endif
1834 
1835 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1836 {
1837 	struct rt6_info *rt;
1838 	struct fib6_table *table;
1839 
1840 	table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1841 	if (table == NULL)
1842 		return NULL;
1843 
1844 	write_lock_bh(&table->tb6_lock);
1845 	for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1846 		if (dev == rt->rt6i_dev &&
1847 		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1848 		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
1849 			break;
1850 	}
1851 	if (rt)
1852 		dst_hold(&rt->dst);
1853 	write_unlock_bh(&table->tb6_lock);
1854 	return rt;
1855 }
1856 
1857 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1858 				     struct net_device *dev,
1859 				     unsigned int pref)
1860 {
1861 	struct fib6_config cfg = {
1862 		.fc_table	= RT6_TABLE_DFLT,
1863 		.fc_metric	= IP6_RT_PRIO_USER,
1864 		.fc_ifindex	= dev->ifindex,
1865 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1866 				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1867 		.fc_nlinfo.pid = 0,
1868 		.fc_nlinfo.nlh = NULL,
1869 		.fc_nlinfo.nl_net = dev_net(dev),
1870 	};
1871 
1872 	ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1873 
1874 	ip6_route_add(&cfg);
1875 
1876 	return rt6_get_dflt_router(gwaddr, dev);
1877 }
1878 
1879 void rt6_purge_dflt_routers(struct net *net)
1880 {
1881 	struct rt6_info *rt;
1882 	struct fib6_table *table;
1883 
1884 	/* NOTE: Keep consistent with rt6_get_dflt_router */
1885 	table = fib6_get_table(net, RT6_TABLE_DFLT);
1886 	if (table == NULL)
1887 		return;
1888 
1889 restart:
1890 	read_lock_bh(&table->tb6_lock);
1891 	for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1892 		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1893 			dst_hold(&rt->dst);
1894 			read_unlock_bh(&table->tb6_lock);
1895 			ip6_del_rt(rt);
1896 			goto restart;
1897 		}
1898 	}
1899 	read_unlock_bh(&table->tb6_lock);
1900 }
1901 
1902 static void rtmsg_to_fib6_config(struct net *net,
1903 				 struct in6_rtmsg *rtmsg,
1904 				 struct fib6_config *cfg)
1905 {
1906 	memset(cfg, 0, sizeof(*cfg));
1907 
1908 	cfg->fc_table = RT6_TABLE_MAIN;
1909 	cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1910 	cfg->fc_metric = rtmsg->rtmsg_metric;
1911 	cfg->fc_expires = rtmsg->rtmsg_info;
1912 	cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1913 	cfg->fc_src_len = rtmsg->rtmsg_src_len;
1914 	cfg->fc_flags = rtmsg->rtmsg_flags;
1915 
1916 	cfg->fc_nlinfo.nl_net = net;
1917 
1918 	ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1919 	ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1920 	ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1921 }
1922 
1923 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1924 {
1925 	struct fib6_config cfg;
1926 	struct in6_rtmsg rtmsg;
1927 	int err;
1928 
1929 	switch(cmd) {
1930 	case SIOCADDRT:		/* Add a route */
1931 	case SIOCDELRT:		/* Delete a route */
1932 		if (!capable(CAP_NET_ADMIN))
1933 			return -EPERM;
1934 		err = copy_from_user(&rtmsg, arg,
1935 				     sizeof(struct in6_rtmsg));
1936 		if (err)
1937 			return -EFAULT;
1938 
1939 		rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1940 
1941 		rtnl_lock();
1942 		switch (cmd) {
1943 		case SIOCADDRT:
1944 			err = ip6_route_add(&cfg);
1945 			break;
1946 		case SIOCDELRT:
1947 			err = ip6_route_del(&cfg);
1948 			break;
1949 		default:
1950 			err = -EINVAL;
1951 		}
1952 		rtnl_unlock();
1953 
1954 		return err;
1955 	}
1956 
1957 	return -EINVAL;
1958 }
1959 
1960 /*
1961  *	Drop the packet on the floor
1962  */
1963 
1964 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
1965 {
1966 	int type;
1967 	struct dst_entry *dst = skb_dst(skb);
1968 	switch (ipstats_mib_noroutes) {
1969 	case IPSTATS_MIB_INNOROUTES:
1970 		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1971 		if (type == IPV6_ADDR_ANY) {
1972 			IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1973 				      IPSTATS_MIB_INADDRERRORS);
1974 			break;
1975 		}
1976 		/* FALLTHROUGH */
1977 	case IPSTATS_MIB_OUTNOROUTES:
1978 		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1979 			      ipstats_mib_noroutes);
1980 		break;
1981 	}
1982 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
1983 	kfree_skb(skb);
1984 	return 0;
1985 }
1986 
1987 static int ip6_pkt_discard(struct sk_buff *skb)
1988 {
1989 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
1990 }
1991 
1992 static int ip6_pkt_discard_out(struct sk_buff *skb)
1993 {
1994 	skb->dev = skb_dst(skb)->dev;
1995 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
1996 }
1997 
1998 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1999 
2000 static int ip6_pkt_prohibit(struct sk_buff *skb)
2001 {
2002 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2003 }
2004 
2005 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
2006 {
2007 	skb->dev = skb_dst(skb)->dev;
2008 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2009 }
2010 
2011 #endif
2012 
2013 /*
2014  *	Allocate a dst for local (unicast / anycast) address.
2015  */
2016 
/*
 * Build (but do not insert) the /128 host route for local address @addr
 * owned by @idev.  The route is flagged RTF_ANYCAST when @anycast is
 * non-zero and RTF_LOCAL otherwise, and is assigned to RT6_TABLE_LOCAL.
 *
 * Returns the route with its refcount set to 1, or an ERR_PTR():
 * -ENOMEM when no dst could be allocated, or the error from
 * ndisc_get_neigh().
 */
struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
				    const struct in6_addr *addr,
				    int anycast)
{
	struct net *net = dev_net(idev->dev);
	/* The dst is allocated against the namespace's loopback device. */
	struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
					    net->loopback_dev, 0);
	struct neighbour *neigh;

	if (rt == NULL) {
		if (net_ratelimit())
			pr_warning("IPv6:  Maximum number of routes reached,"
				   " consider increasing route/max_size.\n");
		return ERR_PTR(-ENOMEM);
	}

	/* Reference for rt->rt6i_idev, assigned below. */
	in6_dev_hold(idev);

	rt->dst.flags |= DST_HOST;
	rt->dst.input = ip6_input;
	rt->dst.output = ip6_output;
	rt->rt6i_idev = idev;
	rt->dst.obsolete = -1;

	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
	if (anycast)
		rt->rt6i_flags |= RTF_ANYCAST;
	else
		rt->rt6i_flags |= RTF_LOCAL;
	/*
	 * NOTE(review): rt6i_gateway has not been written by this function
	 * at this point, so the lookup key is whatever ip6_dst_alloc() left
	 * there (presumably all zeroes) -- confirm.
	 */
	neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
	if (IS_ERR(neigh)) {
		dst_free(&rt->dst);

		return ERR_CAST(neigh);
	}
	dst_set_neighbour(&rt->dst, neigh);

	ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
	rt->rt6i_dst.plen = 128;
	rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);

	atomic_set(&rt->dst.__refcnt, 1);

	return rt;
}
2062 
2063 int ip6_route_get_saddr(struct net *net,
2064 			struct rt6_info *rt,
2065 			const struct in6_addr *daddr,
2066 			unsigned int prefs,
2067 			struct in6_addr *saddr)
2068 {
2069 	struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2070 	int err = 0;
2071 	if (rt->rt6i_prefsrc.plen)
2072 		ipv6_addr_copy(saddr, &rt->rt6i_prefsrc.addr);
2073 	else
2074 		err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2075 					 daddr, prefs, saddr);
2076 	return err;
2077 }
2078 
/*
 * Remove a deleted IP address from the prefsrc entries of routes.
 *
 * Argument bundle passed through fib6_clean_all() to
 * fib6_remove_prefsrc().
 */
struct arg_dev_net_ip {
	struct net_device *dev;		/* restrict to this device; NULL matches any */
	struct net *net;		/* namespace whose null entry is skipped */
	struct in6_addr *addr;		/* address being removed */
};
2085 
2086 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2087 {
2088 	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2089 	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2090 	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2091 
2092 	if (((void *)rt->rt6i_dev == dev || dev == NULL) &&
2093 	    rt != net->ipv6.ip6_null_entry &&
2094 	    ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2095 		/* remove prefsrc entry */
2096 		rt->rt6i_prefsrc.plen = 0;
2097 	}
2098 	return 0;
2099 }
2100 
2101 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2102 {
2103 	struct net *net = dev_net(ifp->idev->dev);
2104 	struct arg_dev_net_ip adni = {
2105 		.dev = ifp->idev->dev,
2106 		.net = net,
2107 		.addr = &ifp->addr,
2108 	};
2109 	fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
2110 }
2111 
/* Argument bundle handed to fib6_ifdown() by rt6_ifdown(). */
struct arg_dev_net {
	struct net_device *dev;		/* device going down; NULL matches any */
	struct net *net;		/* namespace whose null entry is kept */
};
2116 
2117 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2118 {
2119 	const struct arg_dev_net *adn = arg;
2120 	const struct net_device *dev = adn->dev;
2121 
2122 	if ((rt->rt6i_dev == dev || dev == NULL) &&
2123 	    rt != adn->net->ipv6.ip6_null_entry) {
2124 		RT6_TRACE("deleted by ifdown %p\n", rt);
2125 		return -1;
2126 	}
2127 	return 0;
2128 }
2129 
2130 void rt6_ifdown(struct net *net, struct net_device *dev)
2131 {
2132 	struct arg_dev_net adn = {
2133 		.dev = dev,
2134 		.net = net,
2135 	};
2136 
2137 	fib6_clean_all(net, fib6_ifdown, 0, &adn);
2138 	icmp6_clean_all(fib6_ifdown, &adn);
2139 }
2140 
/* Argument bundle handed to rt6_mtu_change_route() by rt6_mtu_change(). */
struct rt6_mtu_change_arg
{
	struct net_device *dev;		/* device whose MTU changed */
	unsigned mtu;			/* the new MTU */
};
2146 
2147 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2148 {
2149 	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2150 	struct inet6_dev *idev;
2151 
2152 	/* In IPv6 pmtu discovery is not optional,
2153 	   so that RTAX_MTU lock cannot disable it.
2154 	   We still use this lock to block changes
2155 	   caused by addrconf/ndisc.
2156 	*/
2157 
2158 	idev = __in6_dev_get(arg->dev);
2159 	if (idev == NULL)
2160 		return 0;
2161 
2162 	/* For administrative MTU increase, there is no way to discover
2163 	   IPv6 PMTU increase, so PMTU increase should be updated here.
2164 	   Since RFC 1981 doesn't include administrative MTU increase
2165 	   update PMTU increase is a MUST. (i.e. jumbo frame)
2166 	 */
2167 	/*
2168 	   If new MTU is less than route PMTU, this new MTU will be the
2169 	   lowest MTU in the path, update the route PMTU to reflect PMTU
2170 	   decreases; if new MTU is greater than route PMTU, and the
2171 	   old MTU is the lowest MTU in the path, update the route PMTU
2172 	   to reflect the increase. In this case if the other nodes' MTU
2173 	   also have the lowest MTU, TOO BIG MESSAGE will be lead to
2174 	   PMTU discouvery.
2175 	 */
2176 	if (rt->rt6i_dev == arg->dev &&
2177 	    !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2178 	    (dst_mtu(&rt->dst) >= arg->mtu ||
2179 	     (dst_mtu(&rt->dst) < arg->mtu &&
2180 	      dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2181 		dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2182 	}
2183 	return 0;
2184 }
2185 
2186 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2187 {
2188 	struct rt6_mtu_change_arg arg = {
2189 		.dev = dev,
2190 		.mtu = mtu,
2191 	};
2192 
2193 	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2194 }
2195 
2196 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2197 	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2198 	[RTA_OIF]               = { .type = NLA_U32 },
2199 	[RTA_IIF]		= { .type = NLA_U32 },
2200 	[RTA_PRIORITY]          = { .type = NLA_U32 },
2201 	[RTA_METRICS]           = { .type = NLA_NESTED },
2202 };
2203 
/*
 * Parse an RTM_NEWROUTE/RTM_DELROUTE netlink request into @cfg.
 *
 * @cfg is zeroed first, so attributes absent from the message leave
 * their fields at zero.  Returns 0 on success, the nlmsg_parse() error,
 * or -EINVAL when RTA_DST/RTA_SRC is shorter than its prefix length
 * requires.
 */
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct fib6_config *cfg)
{
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	int err;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
	if (err < 0)
		goto errout;

	/* Every 'goto errout' below reports -EINVAL. */
	err = -EINVAL;
	rtm = nlmsg_data(nlh);
	memset(cfg, 0, sizeof(*cfg));

	cfg->fc_table = rtm->rtm_table;
	cfg->fc_dst_len = rtm->rtm_dst_len;
	cfg->fc_src_len = rtm->rtm_src_len;
	cfg->fc_flags = RTF_UP;
	cfg->fc_protocol = rtm->rtm_protocol;

	if (rtm->rtm_type == RTN_UNREACHABLE)
		cfg->fc_flags |= RTF_REJECT;

	if (rtm->rtm_type == RTN_LOCAL)
		cfg->fc_flags |= RTF_LOCAL;

	cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
	cfg->fc_nlinfo.nlh = nlh;
	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);

	if (tb[RTA_GATEWAY]) {
		nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
		cfg->fc_flags |= RTF_GATEWAY;
	}

	if (tb[RTA_DST]) {
		/* Number of bytes needed to hold rtm_dst_len prefix bits. */
		int plen = (rtm->rtm_dst_len + 7) >> 3;

		if (nla_len(tb[RTA_DST]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
	}

	if (tb[RTA_SRC]) {
		/* Number of bytes needed to hold rtm_src_len prefix bits. */
		int plen = (rtm->rtm_src_len + 7) >> 3;

		if (nla_len(tb[RTA_SRC]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
	}

	if (tb[RTA_PREFSRC])
		nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);

	if (tb[RTA_OIF])
		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_PRIORITY])
		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

	if (tb[RTA_METRICS]) {
		/* Only a pointer into the message is kept; nothing is copied. */
		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
	}

	/* RTA_TABLE, when present, overrides the rtm_table value set above. */
	if (tb[RTA_TABLE])
		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

	err = 0;
errout:
	return err;
}
2279 
2280 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2281 {
2282 	struct fib6_config cfg;
2283 	int err;
2284 
2285 	err = rtm_to_fib6_config(skb, nlh, &cfg);
2286 	if (err < 0)
2287 		return err;
2288 
2289 	return ip6_route_del(&cfg);
2290 }
2291 
2292 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2293 {
2294 	struct fib6_config cfg;
2295 	int err;
2296 
2297 	err = rtm_to_fib6_config(skb, nlh, &cfg);
2298 	if (err < 0)
2299 		return err;
2300 
2301 	return ip6_route_add(&cfg);
2302 }
2303 
2304 static inline size_t rt6_nlmsg_size(void)
2305 {
2306 	return NLMSG_ALIGN(sizeof(struct rtmsg))
2307 	       + nla_total_size(16) /* RTA_SRC */
2308 	       + nla_total_size(16) /* RTA_DST */
2309 	       + nla_total_size(16) /* RTA_GATEWAY */
2310 	       + nla_total_size(16) /* RTA_PREFSRC */
2311 	       + nla_total_size(4) /* RTA_TABLE */
2312 	       + nla_total_size(4) /* RTA_IIF */
2313 	       + nla_total_size(4) /* RTA_OIF */
2314 	       + nla_total_size(4) /* RTA_PRIORITY */
2315 	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2316 	       + nla_total_size(sizeof(struct rta_cacheinfo));
2317 }
2318 
2319 static int rt6_fill_node(struct net *net,
2320 			 struct sk_buff *skb, struct rt6_info *rt,
2321 			 struct in6_addr *dst, struct in6_addr *src,
2322 			 int iif, int type, u32 pid, u32 seq,
2323 			 int prefix, int nowait, unsigned int flags)
2324 {
2325 	struct rtmsg *rtm;
2326 	struct nlmsghdr *nlh;
2327 	long expires;
2328 	u32 table;
2329 
2330 	if (prefix) {	/* user wants prefix routes only */
2331 		if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2332 			/* success since this is not a prefix route */
2333 			return 1;
2334 		}
2335 	}
2336 
2337 	nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2338 	if (nlh == NULL)
2339 		return -EMSGSIZE;
2340 
2341 	rtm = nlmsg_data(nlh);
2342 	rtm->rtm_family = AF_INET6;
2343 	rtm->rtm_dst_len = rt->rt6i_dst.plen;
2344 	rtm->rtm_src_len = rt->rt6i_src.plen;
2345 	rtm->rtm_tos = 0;
2346 	if (rt->rt6i_table)
2347 		table = rt->rt6i_table->tb6_id;
2348 	else
2349 		table = RT6_TABLE_UNSPEC;
2350 	rtm->rtm_table = table;
2351 	NLA_PUT_U32(skb, RTA_TABLE, table);
2352 	if (rt->rt6i_flags&RTF_REJECT)
2353 		rtm->rtm_type = RTN_UNREACHABLE;
2354 	else if (rt->rt6i_flags&RTF_LOCAL)
2355 		rtm->rtm_type = RTN_LOCAL;
2356 	else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2357 		rtm->rtm_type = RTN_LOCAL;
2358 	else
2359 		rtm->rtm_type = RTN_UNICAST;
2360 	rtm->rtm_flags = 0;
2361 	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2362 	rtm->rtm_protocol = rt->rt6i_protocol;
2363 	if (rt->rt6i_flags&RTF_DYNAMIC)
2364 		rtm->rtm_protocol = RTPROT_REDIRECT;
2365 	else if (rt->rt6i_flags & RTF_ADDRCONF)
2366 		rtm->rtm_protocol = RTPROT_KERNEL;
2367 	else if (rt->rt6i_flags&RTF_DEFAULT)
2368 		rtm->rtm_protocol = RTPROT_RA;
2369 
2370 	if (rt->rt6i_flags&RTF_CACHE)
2371 		rtm->rtm_flags |= RTM_F_CLONED;
2372 
2373 	if (dst) {
2374 		NLA_PUT(skb, RTA_DST, 16, dst);
2375 		rtm->rtm_dst_len = 128;
2376 	} else if (rtm->rtm_dst_len)
2377 		NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2378 #ifdef CONFIG_IPV6_SUBTREES
2379 	if (src) {
2380 		NLA_PUT(skb, RTA_SRC, 16, src);
2381 		rtm->rtm_src_len = 128;
2382 	} else if (rtm->rtm_src_len)
2383 		NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2384 #endif
2385 	if (iif) {
2386 #ifdef CONFIG_IPV6_MROUTE
2387 		if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2388 			int err = ip6mr_get_route(net, skb, rtm, nowait);
2389 			if (err <= 0) {
2390 				if (!nowait) {
2391 					if (err == 0)
2392 						return 0;
2393 					goto nla_put_failure;
2394 				} else {
2395 					if (err == -EMSGSIZE)
2396 						goto nla_put_failure;
2397 				}
2398 			}
2399 		} else
2400 #endif
2401 			NLA_PUT_U32(skb, RTA_IIF, iif);
2402 	} else if (dst) {
2403 		struct in6_addr saddr_buf;
2404 		if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0)
2405 			NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2406 	}
2407 
2408 	if (rt->rt6i_prefsrc.plen) {
2409 		struct in6_addr saddr_buf;
2410 		ipv6_addr_copy(&saddr_buf, &rt->rt6i_prefsrc.addr);
2411 		NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2412 	}
2413 
2414 	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2415 		goto nla_put_failure;
2416 
2417 	if (dst_get_neighbour(&rt->dst))
2418 		NLA_PUT(skb, RTA_GATEWAY, 16, &dst_get_neighbour(&rt->dst)->primary_key);
2419 
2420 	if (rt->dst.dev)
2421 		NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2422 
2423 	NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2424 
2425 	if (!(rt->rt6i_flags & RTF_EXPIRES))
2426 		expires = 0;
2427 	else if (rt->rt6i_expires - jiffies < INT_MAX)
2428 		expires = rt->rt6i_expires - jiffies;
2429 	else
2430 		expires = INT_MAX;
2431 
2432 	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
2433 			       expires, rt->dst.error) < 0)
2434 		goto nla_put_failure;
2435 
2436 	return nlmsg_end(skb, nlh);
2437 
2438 nla_put_failure:
2439 	nlmsg_cancel(skb, nlh);
2440 	return -EMSGSIZE;
2441 }
2442 
2443 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2444 {
2445 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2446 	int prefix;
2447 
2448 	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2449 		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2450 		prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2451 	} else
2452 		prefix = 0;
2453 
2454 	return rt6_fill_node(arg->net,
2455 		     arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2456 		     NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2457 		     prefix, 0, NLM_F_MULTI);
2458 }
2459 
2460 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2461 {
2462 	struct net *net = sock_net(in_skb->sk);
2463 	struct nlattr *tb[RTA_MAX+1];
2464 	struct rt6_info *rt;
2465 	struct sk_buff *skb;
2466 	struct rtmsg *rtm;
2467 	struct flowi6 fl6;
2468 	int err, iif = 0;
2469 
2470 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2471 	if (err < 0)
2472 		goto errout;
2473 
2474 	err = -EINVAL;
2475 	memset(&fl6, 0, sizeof(fl6));
2476 
2477 	if (tb[RTA_SRC]) {
2478 		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2479 			goto errout;
2480 
2481 		ipv6_addr_copy(&fl6.saddr, nla_data(tb[RTA_SRC]));
2482 	}
2483 
2484 	if (tb[RTA_DST]) {
2485 		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2486 			goto errout;
2487 
2488 		ipv6_addr_copy(&fl6.daddr, nla_data(tb[RTA_DST]));
2489 	}
2490 
2491 	if (tb[RTA_IIF])
2492 		iif = nla_get_u32(tb[RTA_IIF]);
2493 
2494 	if (tb[RTA_OIF])
2495 		fl6.flowi6_oif = nla_get_u32(tb[RTA_OIF]);
2496 
2497 	if (iif) {
2498 		struct net_device *dev;
2499 		dev = __dev_get_by_index(net, iif);
2500 		if (!dev) {
2501 			err = -ENODEV;
2502 			goto errout;
2503 		}
2504 	}
2505 
2506 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2507 	if (skb == NULL) {
2508 		err = -ENOBUFS;
2509 		goto errout;
2510 	}
2511 
2512 	/* Reserve room for dummy headers, this skb can pass
2513 	   through good chunk of routing engine.
2514 	 */
2515 	skb_reset_mac_header(skb);
2516 	skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2517 
2518 	rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl6);
2519 	skb_dst_set(skb, &rt->dst);
2520 
2521 	err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2522 			    RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2523 			    nlh->nlmsg_seq, 0, 0, 0);
2524 	if (err < 0) {
2525 		kfree_skb(skb);
2526 		goto errout;
2527 	}
2528 
2529 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2530 errout:
2531 	return err;
2532 }
2533 
2534 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2535 {
2536 	struct sk_buff *skb;
2537 	struct net *net = info->nl_net;
2538 	u32 seq;
2539 	int err;
2540 
2541 	err = -ENOBUFS;
2542 	seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2543 
2544 	skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2545 	if (skb == NULL)
2546 		goto errout;
2547 
2548 	err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2549 				event, info->pid, seq, 0, 0, 0);
2550 	if (err < 0) {
2551 		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2552 		WARN_ON(err == -EMSGSIZE);
2553 		kfree_skb(skb);
2554 		goto errout;
2555 	}
2556 	rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2557 		    info->nlh, gfp_any());
2558 	return;
2559 errout:
2560 	if (err < 0)
2561 		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2562 }
2563 
2564 static int ip6_route_dev_notify(struct notifier_block *this,
2565 				unsigned long event, void *data)
2566 {
2567 	struct net_device *dev = (struct net_device *)data;
2568 	struct net *net = dev_net(dev);
2569 
2570 	if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2571 		net->ipv6.ip6_null_entry->dst.dev = dev;
2572 		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2573 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2574 		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2575 		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2576 		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2577 		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2578 #endif
2579 	}
2580 
2581 	return NOTIFY_OK;
2582 }
2583 
2584 /*
2585  *	/proc
2586  */
2587 
2588 #ifdef CONFIG_PROC_FS
2589 
/*
 * Bookkeeping for a buffer-based /proc read.
 * NOTE(review): nothing in the visible part of this file references this
 * structure; it looks like a leftover from before the seq_file conversion.
 */
struct rt6_proc_arg {
	char	*buffer;
	int	offset;
	int	length;
	int	skip;
	int	len;
};
2598 
2599 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2600 {
2601 	struct seq_file *m = p_arg;
2602 	struct neighbour *n;
2603 
2604 	seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2605 
2606 #ifdef CONFIG_IPV6_SUBTREES
2607 	seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2608 #else
2609 	seq_puts(m, "00000000000000000000000000000000 00 ");
2610 #endif
2611 	n = dst_get_neighbour(&rt->dst);
2612 	if (n) {
2613 		seq_printf(m, "%pi6", n->primary_key);
2614 	} else {
2615 		seq_puts(m, "00000000000000000000000000000000");
2616 	}
2617 	seq_printf(m, " %08x %08x %08x %08x %8s\n",
2618 		   rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2619 		   rt->dst.__use, rt->rt6i_flags,
2620 		   rt->rt6i_dev ? rt->rt6i_dev->name : "");
2621 	return 0;
2622 }
2623 
2624 static int ipv6_route_show(struct seq_file *m, void *v)
2625 {
2626 	struct net *net = (struct net *)m->private;
2627 	fib6_clean_all(net, rt6_info_route, 0, m);
2628 	return 0;
2629 }
2630 
/*
 * open() for the "ipv6_route" proc entry: binds ipv6_route_show() to the
 * file through single_open_net() and returns its result.
 */
static int ipv6_route_open(struct inode *inode, struct file *file)
{
	return single_open_net(inode, file, ipv6_route_show);
}
2635 
2636 static const struct file_operations ipv6_route_proc_fops = {
2637 	.owner		= THIS_MODULE,
2638 	.open		= ipv6_route_open,
2639 	.read		= seq_read,
2640 	.llseek		= seq_lseek,
2641 	.release	= single_release_net,
2642 };
2643 
2644 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2645 {
2646 	struct net *net = (struct net *)seq->private;
2647 	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2648 		   net->ipv6.rt6_stats->fib_nodes,
2649 		   net->ipv6.rt6_stats->fib_route_nodes,
2650 		   net->ipv6.rt6_stats->fib_rt_alloc,
2651 		   net->ipv6.rt6_stats->fib_rt_entries,
2652 		   net->ipv6.rt6_stats->fib_rt_cache,
2653 		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2654 		   net->ipv6.rt6_stats->fib_discarded_routes);
2655 
2656 	return 0;
2657 }
2658 
/*
 * open() for the "rt6_stats" proc entry: binds rt6_stats_seq_show() to the
 * file through single_open_net() and returns its result.
 */
static int rt6_stats_seq_open(struct inode *inode, struct file *file)
{
	return single_open_net(inode, file, rt6_stats_seq_show);
}
2663 
2664 static const struct file_operations rt6_stats_seq_fops = {
2665 	.owner	 = THIS_MODULE,
2666 	.open	 = rt6_stats_seq_open,
2667 	.read	 = seq_read,
2668 	.llseek	 = seq_lseek,
2669 	.release = single_release_net,
2670 };
2671 #endif	/* CONFIG_PROC_FS */
2672 
2673 #ifdef CONFIG_SYSCTL
2674 
2675 static
2676 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2677 			      void __user *buffer, size_t *lenp, loff_t *ppos)
2678 {
2679 	struct net *net;
2680 	int delay;
2681 	if (!write)
2682 		return -EINVAL;
2683 
2684 	net = (struct net *)ctl->extra1;
2685 	delay = net->ipv6.sysctl.flush_delay;
2686 	proc_dointvec(ctl, write, buffer, lenp, ppos);
2687 	fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2688 	return 0;
2689 }
2690 
2691 ctl_table ipv6_route_table_template[] = {
2692 	{
2693 		.procname	=	"flush",
2694 		.data		=	&init_net.ipv6.sysctl.flush_delay,
2695 		.maxlen		=	sizeof(int),
2696 		.mode		=	0200,
2697 		.proc_handler	=	ipv6_sysctl_rtcache_flush
2698 	},
2699 	{
2700 		.procname	=	"gc_thresh",
2701 		.data		=	&ip6_dst_ops_template.gc_thresh,
2702 		.maxlen		=	sizeof(int),
2703 		.mode		=	0644,
2704 		.proc_handler	=	proc_dointvec,
2705 	},
2706 	{
2707 		.procname	=	"max_size",
2708 		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
2709 		.maxlen		=	sizeof(int),
2710 		.mode		=	0644,
2711 		.proc_handler	=	proc_dointvec,
2712 	},
2713 	{
2714 		.procname	=	"gc_min_interval",
2715 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2716 		.maxlen		=	sizeof(int),
2717 		.mode		=	0644,
2718 		.proc_handler	=	proc_dointvec_jiffies,
2719 	},
2720 	{
2721 		.procname	=	"gc_timeout",
2722 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2723 		.maxlen		=	sizeof(int),
2724 		.mode		=	0644,
2725 		.proc_handler	=	proc_dointvec_jiffies,
2726 	},
2727 	{
2728 		.procname	=	"gc_interval",
2729 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
2730 		.maxlen		=	sizeof(int),
2731 		.mode		=	0644,
2732 		.proc_handler	=	proc_dointvec_jiffies,
2733 	},
2734 	{
2735 		.procname	=	"gc_elasticity",
2736 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2737 		.maxlen		=	sizeof(int),
2738 		.mode		=	0644,
2739 		.proc_handler	=	proc_dointvec,
2740 	},
2741 	{
2742 		.procname	=	"mtu_expires",
2743 		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2744 		.maxlen		=	sizeof(int),
2745 		.mode		=	0644,
2746 		.proc_handler	=	proc_dointvec_jiffies,
2747 	},
2748 	{
2749 		.procname	=	"min_adv_mss",
2750 		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
2751 		.maxlen		=	sizeof(int),
2752 		.mode		=	0644,
2753 		.proc_handler	=	proc_dointvec,
2754 	},
2755 	{
2756 		.procname	=	"gc_min_interval_ms",
2757 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2758 		.maxlen		=	sizeof(int),
2759 		.mode		=	0644,
2760 		.proc_handler	=	proc_dointvec_ms_jiffies,
2761 	},
2762 	{ }
2763 };
2764 
2765 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2766 {
2767 	struct ctl_table *table;
2768 
2769 	table = kmemdup(ipv6_route_table_template,
2770 			sizeof(ipv6_route_table_template),
2771 			GFP_KERNEL);
2772 
2773 	if (table) {
2774 		table[0].data = &net->ipv6.sysctl.flush_delay;
2775 		table[0].extra1 = net;
2776 		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2777 		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2778 		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2779 		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2780 		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2781 		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2782 		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2783 		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2784 		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2785 	}
2786 
2787 	return table;
2788 }
2789 #endif
2790 
2791 static int __net_init ip6_route_net_init(struct net *net)
2792 {
2793 	int ret = -ENOMEM;
2794 
2795 	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2796 	       sizeof(net->ipv6.ip6_dst_ops));
2797 
2798 	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2799 		goto out_ip6_dst_ops;
2800 
2801 	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2802 					   sizeof(*net->ipv6.ip6_null_entry),
2803 					   GFP_KERNEL);
2804 	if (!net->ipv6.ip6_null_entry)
2805 		goto out_ip6_dst_entries;
2806 	net->ipv6.ip6_null_entry->dst.path =
2807 		(struct dst_entry *)net->ipv6.ip6_null_entry;
2808 	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2809 	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2810 			 ip6_template_metrics, true);
2811 
2812 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2813 	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2814 					       sizeof(*net->ipv6.ip6_prohibit_entry),
2815 					       GFP_KERNEL);
2816 	if (!net->ipv6.ip6_prohibit_entry)
2817 		goto out_ip6_null_entry;
2818 	net->ipv6.ip6_prohibit_entry->dst.path =
2819 		(struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2820 	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2821 	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2822 			 ip6_template_metrics, true);
2823 
2824 	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2825 					       sizeof(*net->ipv6.ip6_blk_hole_entry),
2826 					       GFP_KERNEL);
2827 	if (!net->ipv6.ip6_blk_hole_entry)
2828 		goto out_ip6_prohibit_entry;
2829 	net->ipv6.ip6_blk_hole_entry->dst.path =
2830 		(struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2831 	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2832 	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2833 			 ip6_template_metrics, true);
2834 #endif
2835 
2836 	net->ipv6.sysctl.flush_delay = 0;
2837 	net->ipv6.sysctl.ip6_rt_max_size = 4096;
2838 	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2839 	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2840 	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2841 	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2842 	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2843 	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2844 
2845 #ifdef CONFIG_PROC_FS
2846 	proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2847 	proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2848 #endif
2849 	net->ipv6.ip6_rt_gc_expire = 30*HZ;
2850 
2851 	ret = 0;
2852 out:
2853 	return ret;
2854 
2855 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2856 out_ip6_prohibit_entry:
2857 	kfree(net->ipv6.ip6_prohibit_entry);
2858 out_ip6_null_entry:
2859 	kfree(net->ipv6.ip6_null_entry);
2860 #endif
2861 out_ip6_dst_entries:
2862 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2863 out_ip6_dst_ops:
2864 	goto out;
2865 }
2866 
2867 static void __net_exit ip6_route_net_exit(struct net *net)
2868 {
2869 #ifdef CONFIG_PROC_FS
2870 	proc_net_remove(net, "ipv6_route");
2871 	proc_net_remove(net, "rt6_stats");
2872 #endif
2873 	kfree(net->ipv6.ip6_null_entry);
2874 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2875 	kfree(net->ipv6.ip6_prohibit_entry);
2876 	kfree(net->ipv6.ip6_blk_hole_entry);
2877 #endif
2878 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2879 }
2880 
2881 static struct pernet_operations ip6_route_net_ops = {
2882 	.init = ip6_route_net_init,
2883 	.exit = ip6_route_net_exit,
2884 };
2885 
2886 static struct notifier_block ip6_route_dev_notifier = {
2887 	.notifier_call = ip6_route_dev_notify,
2888 	.priority = 0,
2889 };
2890 
2891 int __init ip6_route_init(void)
2892 {
2893 	int ret;
2894 
2895 	ret = -ENOMEM;
2896 	ip6_dst_ops_template.kmem_cachep =
2897 		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2898 				  SLAB_HWCACHE_ALIGN, NULL);
2899 	if (!ip6_dst_ops_template.kmem_cachep)
2900 		goto out;
2901 
2902 	ret = dst_entries_init(&ip6_dst_blackhole_ops);
2903 	if (ret)
2904 		goto out_kmem_cache;
2905 
2906 	ret = register_pernet_subsys(&ip6_route_net_ops);
2907 	if (ret)
2908 		goto out_dst_entries;
2909 
2910 	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2911 
2912 	/* Registering of the loopback is done before this portion of code,
2913 	 * the loopback reference in rt6_info will not be taken, do it
2914 	 * manually for init_net */
2915 	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
2916 	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2917   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2918 	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
2919 	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2920 	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
2921 	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2922   #endif
2923 	ret = fib6_init();
2924 	if (ret)
2925 		goto out_register_subsys;
2926 
2927 	ret = xfrm6_init();
2928 	if (ret)
2929 		goto out_fib6_init;
2930 
2931 	ret = fib6_rules_init();
2932 	if (ret)
2933 		goto xfrm6_init;
2934 
2935 	ret = -ENOBUFS;
2936 	if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
2937 	    __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
2938 	    __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
2939 		goto fib6_rules_init;
2940 
2941 	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
2942 	if (ret)
2943 		goto fib6_rules_init;
2944 
2945 out:
2946 	return ret;
2947 
2948 fib6_rules_init:
2949 	fib6_rules_cleanup();
2950 xfrm6_init:
2951 	xfrm6_fini();
2952 out_fib6_init:
2953 	fib6_gc_cleanup();
2954 out_register_subsys:
2955 	unregister_pernet_subsys(&ip6_route_net_ops);
2956 out_dst_entries:
2957 	dst_entries_destroy(&ip6_dst_blackhole_ops);
2958 out_kmem_cache:
2959 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2960 	goto out;
2961 }
2962 
/*
 * Tear down what ip6_route_init() set up.  The calls mirror the error
 * unwinding at the end of ip6_route_init(), preceded by the removal of the
 * netdevice notifier that is registered last there.
 */
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	/* The slab cache goes last, after all of its users are shut down. */
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}
2973