xref: /openbmc/linux/net/ipv6/route.c (revision 6ee73861)
1 /*
2  *	Linux INET6 implementation
3  *	FIB front-end.
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13 
14 /*	Changes:
15  *
16  *	YOSHIFUJI Hideaki @USAGI
17  *		reworked default router selection.
18  *		- respect outgoing interface
19  *		- select from (probably) reachable routers (i.e.
20  *		routers in REACHABLE, STALE, DELAY or PROBE states).
21  *		- always select the same router if it is (probably)
22  *		reachable.  otherwise, round-robin the list.
23  *	Ville Nuorvala
24  *		Fixed routing subtrees.
25  */
26 
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/mroute6.h>
38 #include <linux/init.h>
39 #include <linux/if_arp.h>
40 #include <linux/proc_fs.h>
41 #include <linux/seq_file.h>
42 #include <linux/nsproxy.h>
43 #include <net/net_namespace.h>
44 #include <net/snmp.h>
45 #include <net/ipv6.h>
46 #include <net/ip6_fib.h>
47 #include <net/ip6_route.h>
48 #include <net/ndisc.h>
49 #include <net/addrconf.h>
50 #include <net/tcp.h>
51 #include <linux/rtnetlink.h>
52 #include <net/dst.h>
53 #include <net/xfrm.h>
54 #include <net/netevent.h>
55 #include <net/netlink.h>
56 
57 #include <asm/uaccess.h>
58 
59 #ifdef CONFIG_SYSCTL
60 #include <linux/sysctl.h>
61 #endif
62 
63 /* Set to 3 to get tracing. */
64 #define RT6_DEBUG 2
65 
66 #if RT6_DEBUG >= 3
67 #define RDBG(x) printk x
68 #define RT6_TRACE(x...) printk(KERN_DEBUG x)
69 #else
70 #define RDBG(x)
71 #define RT6_TRACE(x...) do { ; } while (0)
72 #endif
73 
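/*
 * CLONE_OFFLINK_ROUTE controls whether ip6_pol_route() makes a
 * per-destination clone (rt6_alloc_clone) of routes that already have a
 * nexthop neighbour bound or are marked RTF_NONEXTHOP.  It is compiled
 * out (0) here, so such routes are used directly without cloning.
 */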
74 #define CLONE_OFFLINK_ROUTE 0
75 
76 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
77 static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
78 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
79 static void		ip6_dst_destroy(struct dst_entry *);
80 static void		ip6_dst_ifdown(struct dst_entry *,
81 				       struct net_device *dev, int how);
82 static int		 ip6_dst_gc(struct dst_ops *ops);
83 
84 static int		ip6_pkt_discard(struct sk_buff *skb);
85 static int		ip6_pkt_discard_out(struct sk_buff *skb);
86 static void		ip6_link_failure(struct sk_buff *skb);
87 static void		ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
88 
89 #ifdef CONFIG_IPV6_ROUTE_INFO
90 static struct rt6_info *rt6_add_route_info(struct net *net,
91 					   struct in6_addr *prefix, int prefixlen,
92 					   struct in6_addr *gwaddr, int ifindex,
93 					   unsigned pref);
94 static struct rt6_info *rt6_get_route_info(struct net *net,
95 					   struct in6_addr *prefix, int prefixlen,
96 					   struct in6_addr *gwaddr, int ifindex);
97 #endif
98 
99 static struct dst_ops ip6_dst_ops_template = {
100 	.family			=	AF_INET6,
101 	.protocol		=	cpu_to_be16(ETH_P_IPV6),
102 	.gc			=	ip6_dst_gc,
103 	.gc_thresh		=	1024,
104 	.check			=	ip6_dst_check,
105 	.destroy		=	ip6_dst_destroy,
106 	.ifdown			=	ip6_dst_ifdown,
107 	.negative_advice	=	ip6_negative_advice,
108 	.link_failure		=	ip6_link_failure,
109 	.update_pmtu		=	ip6_rt_update_pmtu,
110 	.local_out		=	__ip6_local_out,
111 	.entries		=	ATOMIC_INIT(0),
112 };
113 
114 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
115 {
116 }
117 
118 static struct dst_ops ip6_dst_blackhole_ops = {
119 	.family			=	AF_INET6,
120 	.protocol		=	cpu_to_be16(ETH_P_IPV6),
121 	.destroy		=	ip6_dst_destroy,
122 	.check			=	ip6_dst_check,
123 	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
124 	.entries		=	ATOMIC_INIT(0),
125 };
126 
127 static struct rt6_info ip6_null_entry_template = {
128 	.u = {
129 		.dst = {
130 			.__refcnt	= ATOMIC_INIT(1),
131 			.__use		= 1,
132 			.obsolete	= -1,
133 			.error		= -ENETUNREACH,
134 			.metrics	= { [RTAX_HOPLIMIT - 1] = 255, },
135 			.input		= ip6_pkt_discard,
136 			.output		= ip6_pkt_discard_out,
137 		}
138 	},
139 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
140 	.rt6i_protocol  = RTPROT_KERNEL,
141 	.rt6i_metric	= ~(u32) 0,
142 	.rt6i_ref	= ATOMIC_INIT(1),
143 };
144 
145 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
146 
147 static int ip6_pkt_prohibit(struct sk_buff *skb);
148 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
149 
150 static struct rt6_info ip6_prohibit_entry_template = {
151 	.u = {
152 		.dst = {
153 			.__refcnt	= ATOMIC_INIT(1),
154 			.__use		= 1,
155 			.obsolete	= -1,
156 			.error		= -EACCES,
157 			.metrics	= { [RTAX_HOPLIMIT - 1] = 255, },
158 			.input		= ip6_pkt_prohibit,
159 			.output		= ip6_pkt_prohibit_out,
160 		}
161 	},
162 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
163 	.rt6i_protocol  = RTPROT_KERNEL,
164 	.rt6i_metric	= ~(u32) 0,
165 	.rt6i_ref	= ATOMIC_INIT(1),
166 };
167 
168 static struct rt6_info ip6_blk_hole_entry_template = {
169 	.u = {
170 		.dst = {
171 			.__refcnt	= ATOMIC_INIT(1),
172 			.__use		= 1,
173 			.obsolete	= -1,
174 			.error		= -EINVAL,
175 			.metrics	= { [RTAX_HOPLIMIT - 1] = 255, },
176 			.input		= dst_discard,
177 			.output		= dst_discard,
178 		}
179 	},
180 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
181 	.rt6i_protocol  = RTPROT_KERNEL,
182 	.rt6i_metric	= ~(u32) 0,
183 	.rt6i_ref	= ATOMIC_INIT(1),
184 };
185 
186 #endif
187 
188 /* allocate dst with ip6_dst_ops */
189 static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops)
190 {
191 	return (struct rt6_info *)dst_alloc(ops);
192 }
193 
194 static void ip6_dst_destroy(struct dst_entry *dst)
195 {
196 	struct rt6_info *rt = (struct rt6_info *)dst;
197 	struct inet6_dev *idev = rt->rt6i_idev;
198 
199 	if (idev != NULL) {
200 		rt->rt6i_idev = NULL;
201 		in6_dev_put(idev);
202 	}
203 }
204 
205 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
206 			   int how)
207 {
208 	struct rt6_info *rt = (struct rt6_info *)dst;
209 	struct inet6_dev *idev = rt->rt6i_idev;
210 	struct net_device *loopback_dev =
211 		dev_net(dev)->loopback_dev;
212 
213 	if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
214 		struct inet6_dev *loopback_idev =
215 			in6_dev_get(loopback_dev);
216 		if (loopback_idev != NULL) {
217 			rt->rt6i_idev = loopback_idev;
218 			in6_dev_put(idev);
219 		}
220 	}
221 }
222 
223 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
224 {
225 	return (rt->rt6i_flags & RTF_EXPIRES &&
226 		time_after(jiffies, rt->rt6i_expires));
227 }
228 
229 static inline int rt6_need_strict(struct in6_addr *daddr)
230 {
231 	return (ipv6_addr_type(daddr) &
232 		(IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK));
233 }
234 
235 /*
236  *	Route lookup. The relevant table->tb6_lock is assumed to be held.
237  */
238 
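/*
 * Walk the chain of routes sharing this prefix and pick the one that
 * matches the requested output interface (oif), preferring an exact
 * device match over a loopback entry.  Without an oif, pick the route
 * whose device owns the given source address.  If nothing matches,
 * fall back to the original head, or to the null entry when
 * RT6_LOOKUP_F_IFACE demands a strict interface match.
 */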
239 static inline struct rt6_info *rt6_device_match(struct net *net,
240 						    struct rt6_info *rt,
241 						    struct in6_addr *saddr,
242 						    int oif,
243 						    int flags)
244 {
245 	struct rt6_info *local = NULL;
246 	struct rt6_info *sprt;
247 
248 	if (!oif && ipv6_addr_any(saddr))
249 		goto out;
250 
251 	for (sprt = rt; sprt; sprt = sprt->u.dst.rt6_next) {
252 		struct net_device *dev = sprt->rt6i_dev;
253 
254 		if (oif) {
255 			if (dev->ifindex == oif)
256 				return sprt;
257 			if (dev->flags & IFF_LOOPBACK) {
258 				if (sprt->rt6i_idev == NULL ||
259 				    sprt->rt6i_idev->dev->ifindex != oif) {
260 					if (flags & RT6_LOOKUP_F_IFACE && oif)
261 						continue;
262 					if (local && (!oif ||
263 						      local->rt6i_idev->dev->ifindex == oif))
264 						continue;
265 				}
266 				local = sprt;
267 			}
268 		} else {
269 			if (ipv6_chk_addr(net, saddr, dev,
270 					  flags & RT6_LOOKUP_F_IFACE))
271 				return sprt;
272 		}
273 	}
274 
275 	if (oif) {
276 		if (local)
277 			return local;
278 
279 		if (flags & RT6_LOOKUP_F_IFACE)
280 			return net->ipv6.ip6_null_entry;
281 	}
282 out:
283 	return rt;
284 }
285 
286 #ifdef CONFIG_IPV6_ROUTER_PREF
287 static void rt6_probe(struct rt6_info *rt)
288 {
289 	struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
290 	/*
291 	 * Okay, this does not seem to be appropriate
292 	 * for now, however, we need to check if it
293 	 * is really so; aka Router Reachability Probing.
294 	 *
295 	 * Router Reachability Probe MUST be rate-limited
296 	 * to no more than one per minute.
297 	 */
298 	if (!neigh || (neigh->nud_state & NUD_VALID))
299 		return;
300 	read_lock_bh(&neigh->lock);
301 	if (!(neigh->nud_state & NUD_VALID) &&
302 	    time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
303 		struct in6_addr mcaddr;
304 		struct in6_addr *target;
305 
306 		neigh->updated = jiffies;
307 		read_unlock_bh(&neigh->lock);
308 
309 		target = (struct in6_addr *)&neigh->primary_key;
310 		addrconf_addr_solict_mult(target, &mcaddr);
311 		ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
312 	} else
313 		read_unlock_bh(&neigh->lock);
314 }
315 #else
316 static inline void rt6_probe(struct rt6_info *rt)
317 {
318 	return;
319 }
320 #endif
321 
322 /*
323  * Default Router Selection (RFC 2461 6.3.6)
324  */
325 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
326 {
327 	struct net_device *dev = rt->rt6i_dev;
328 	if (!oif || dev->ifindex == oif)
329 		return 2;
330 	if ((dev->flags & IFF_LOOPBACK) &&
331 	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
332 		return 1;
333 	return 0;
334 }
335 
336 static inline int rt6_check_neigh(struct rt6_info *rt)
337 {
338 	struct neighbour *neigh = rt->rt6i_nexthop;
339 	int m;
340 	if (rt->rt6i_flags & RTF_NONEXTHOP ||
341 	    !(rt->rt6i_flags & RTF_GATEWAY))
342 		m = 1;
343 	else if (neigh) {
344 		read_lock_bh(&neigh->lock);
345 		if (neigh->nud_state & NUD_VALID)
346 			m = 2;
347 #ifdef CONFIG_IPV6_ROUTER_PREF
348 		else if (neigh->nud_state & NUD_FAILED)
349 			m = 0;
350 #endif
351 		else
352 			m = 1;
353 		read_unlock_bh(&neigh->lock);
354 	} else
355 		m = 0;
356 	return m;
357 }
358 
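/*
 * Combine the per-route criteria into a single score: the device match
 * from rt6_check_dev(), the RA-advertised router preference (when
 * CONFIG_IPV6_ROUTER_PREF is enabled), and the nexthop reachability
 * from rt6_check_neigh().  Returns -1 when a criterion required by the
 * strict flags (RT6_LOOKUP_F_IFACE, RT6_LOOKUP_F_REACHABLE) is not met.
 */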
359 static int rt6_score_route(struct rt6_info *rt, int oif,
360 			   int strict)
361 {
362 	int m, n;
363 
364 	m = rt6_check_dev(rt, oif);
365 	if (!m && (strict & RT6_LOOKUP_F_IFACE))
366 		return -1;
367 #ifdef CONFIG_IPV6_ROUTER_PREF
368 	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
369 #endif
370 	n = rt6_check_neigh(rt);
371 	if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
372 		return -1;
373 	return m;
374 }
375 
376 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
377 				   int *mpri, struct rt6_info *match)
378 {
379 	int m;
380 
381 	if (rt6_check_expired(rt))
382 		goto out;
383 
384 	m = rt6_score_route(rt, oif, strict);
385 	if (m < 0)
386 		goto out;
387 
388 	if (m > *mpri) {
389 		if (strict & RT6_LOOKUP_F_REACHABLE)
390 			rt6_probe(match);
391 		*mpri = m;
392 		match = rt;
393 	} else if (strict & RT6_LOOKUP_F_REACHABLE) {
394 		rt6_probe(rt);
395 	}
396 
397 out:
398 	return match;
399 }
400 
401 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
402 				     struct rt6_info *rr_head,
403 				     u32 metric, int oif, int strict)
404 {
405 	struct rt6_info *rt, *match;
406 	int mpri = -1;
407 
408 	match = NULL;
409 	for (rt = rr_head; rt && rt->rt6i_metric == metric;
410 	     rt = rt->u.dst.rt6_next)
411 		match = find_match(rt, oif, strict, &mpri, match);
412 	for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
413 	     rt = rt->u.dst.rt6_next)
414 		match = find_match(rt, oif, strict, &mpri, match);
415 
416 	return match;
417 }
418 
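/*
 * Default router selection: score the routes that share the metric of
 * the current round-robin head (fn->rr_ptr) and return the best one.
 * When nothing is reachable, advance rr_ptr so that later lookups
 * round-robin over the equal-metric candidates.
 */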
419 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
420 {
421 	struct rt6_info *match, *rt0;
422 	struct net *net;
423 
424 	RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
425 		  __func__, fn->leaf, oif);
426 
427 	rt0 = fn->rr_ptr;
428 	if (!rt0)
429 		fn->rr_ptr = rt0 = fn->leaf;
430 
431 	match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
432 
433 	if (!match &&
434 	    (strict & RT6_LOOKUP_F_REACHABLE)) {
435 		struct rt6_info *next = rt0->u.dst.rt6_next;
436 
437 		/* no entries matched; do round-robin */
438 		if (!next || next->rt6i_metric != rt0->rt6i_metric)
439 			next = fn->leaf;
440 
441 		if (next != rt0)
442 			fn->rr_ptr = next;
443 	}
444 
445 	RT6_TRACE("%s() => %p\n",
446 		  __func__, match);
447 
448 	net = dev_net(rt0->rt6i_dev);
449 	return (match ? match : net->ipv6.ip6_null_entry);
450 }
451 
452 #ifdef CONFIG_IPV6_ROUTE_INFO
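/*
 * Process a Route Information option received in a Router Advertisement
 * (RFC 4191): validate the option, then add, refresh or delete the
 * corresponding RTF_ROUTEINFO route depending on the advertised
 * lifetime and preference.
 */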
453 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
454 		  struct in6_addr *gwaddr)
455 {
456 	struct net *net = dev_net(dev);
457 	struct route_info *rinfo = (struct route_info *) opt;
458 	struct in6_addr prefix_buf, *prefix;
459 	unsigned int pref;
460 	unsigned long lifetime;
461 	struct rt6_info *rt;
462 
463 	if (len < sizeof(struct route_info)) {
464 		return -EINVAL;
465 	}
466 
467 	/* Sanity check for prefix_len and length */
468 	if (rinfo->length > 3) {
469 		return -EINVAL;
470 	} else if (rinfo->prefix_len > 128) {
471 		return -EINVAL;
472 	} else if (rinfo->prefix_len > 64) {
473 		if (rinfo->length < 2) {
474 			return -EINVAL;
475 		}
476 	} else if (rinfo->prefix_len > 0) {
477 		if (rinfo->length < 1) {
478 			return -EINVAL;
479 		}
480 	}
481 
482 	pref = rinfo->route_pref;
483 	if (pref == ICMPV6_ROUTER_PREF_INVALID)
484 		return -EINVAL;
485 
486 	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
487 
488 	if (rinfo->length == 3)
489 		prefix = (struct in6_addr *)rinfo->prefix;
490 	else {
491 		/* this function is safe */
492 		ipv6_addr_prefix(&prefix_buf,
493 				 (struct in6_addr *)rinfo->prefix,
494 				 rinfo->prefix_len);
495 		prefix = &prefix_buf;
496 	}
497 
498 	rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
499 				dev->ifindex);
500 
501 	if (rt && !lifetime) {
502 		ip6_del_rt(rt);
503 		rt = NULL;
504 	}
505 
506 	if (!rt && lifetime)
507 		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
508 					pref);
509 	else if (rt)
510 		rt->rt6i_flags = RTF_ROUTEINFO |
511 				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
512 
513 	if (rt) {
514 		if (!addrconf_finite_timeout(lifetime)) {
515 			rt->rt6i_flags &= ~RTF_EXPIRES;
516 		} else {
517 			rt->rt6i_expires = jiffies + HZ * lifetime;
518 			rt->rt6i_flags |= RTF_EXPIRES;
519 		}
520 		dst_release(&rt->u.dst);
521 	}
522 	return 0;
523 }
524 #endif
525 
526 #define BACKTRACK(__net, saddr)			\
527 do { \
528 	if (rt == __net->ipv6.ip6_null_entry) {	\
529 		struct fib6_node *pn; \
530 		while (1) { \
531 			if (fn->fn_flags & RTN_TL_ROOT) \
532 				goto out; \
533 			pn = fn->parent; \
534 			if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
535 				fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
536 			else \
537 				fn = pn; \
538 			if (fn->fn_flags & RTN_RTINFO) \
539 				goto restart; \
540 		} \
541 	} \
542 } while(0)
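/*
 * BACKTRACK is shared by the lookup functions below: when the current
 * fib6 node yielded only the null entry, walk back up the tree
 * (descending into source-routed subtrees where present) until a node
 * carrying route information is found, then jump to the caller's
 * "restart" label; reaching the tree root jumps to "out" instead.
 */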
543 
544 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
545 					     struct fib6_table *table,
546 					     struct flowi *fl, int flags)
547 {
548 	struct fib6_node *fn;
549 	struct rt6_info *rt;
550 
551 	read_lock_bh(&table->tb6_lock);
552 	fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
553 restart:
554 	rt = fn->leaf;
555 	rt = rt6_device_match(net, rt, &fl->fl6_src, fl->oif, flags);
556 	BACKTRACK(net, &fl->fl6_src);
557 out:
558 	dst_use(&rt->u.dst, jiffies);
559 	read_unlock_bh(&table->tb6_lock);
560 	return rt;
561 
562 }
563 
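/*
 * rt6_lookup() is the simple, policy-aware lookup used elsewhere in the
 * stack.  The result is reference-counted (dst_use() above), so a
 * typical caller does something like:
 *
 *	rt = rt6_lookup(net, &daddr, NULL, dev->ifindex, 0);
 *	if (rt) {
 *		... examine rt ...
 *		dst_release(&rt->u.dst);
 *	}
 */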
564 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
565 			    const struct in6_addr *saddr, int oif, int strict)
566 {
567 	struct flowi fl = {
568 		.oif = oif,
569 		.nl_u = {
570 			.ip6_u = {
571 				.daddr = *daddr,
572 			},
573 		},
574 	};
575 	struct dst_entry *dst;
576 	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
577 
578 	if (saddr) {
579 		memcpy(&fl.fl6_src, saddr, sizeof(*saddr));
580 		flags |= RT6_LOOKUP_F_HAS_SADDR;
581 	}
582 
583 	dst = fib6_rule_lookup(net, &fl, flags, ip6_pol_route_lookup);
584 	if (dst->error == 0)
585 		return (struct rt6_info *) dst;
586 
587 	dst_release(dst);
588 
589 	return NULL;
590 }
591 
592 EXPORT_SYMBOL(rt6_lookup);
593 
594 /* ip6_ins_rt is called with table->tb6_lock NOT held.
595    It takes a new route entry; if the addition fails for any reason,
596    the route is freed. In any case, if the caller does not hold a
597    reference, it may be destroyed.
598  */
599 
600 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
601 {
602 	int err;
603 	struct fib6_table *table;
604 
605 	table = rt->rt6i_table;
606 	write_lock_bh(&table->tb6_lock);
607 	err = fib6_add(&table->tb6_root, rt, info);
608 	write_unlock_bh(&table->tb6_lock);
609 
610 	return err;
611 }
612 
613 int ip6_ins_rt(struct rt6_info *rt)
614 {
615 	struct nl_info info = {
616 		.nl_net = dev_net(rt->rt6i_dev),
617 	};
618 	return __ip6_ins_rt(rt, &info);
619 }
620 
621 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
622 				      struct in6_addr *saddr)
623 {
624 	struct rt6_info *rt;
625 
626 	/*
627 	 *	Clone the route.
628 	 */
629 
630 	rt = ip6_rt_copy(ort);
631 
632 	if (rt) {
633 		struct neighbour *neigh;
634 		int attempts = !in_softirq();
635 
636 		if (!(rt->rt6i_flags&RTF_GATEWAY)) {
637 			if (rt->rt6i_dst.plen != 128 &&
638 			    ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
639 				rt->rt6i_flags |= RTF_ANYCAST;
640 			ipv6_addr_copy(&rt->rt6i_gateway, daddr);
641 		}
642 
643 		ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
644 		rt->rt6i_dst.plen = 128;
645 		rt->rt6i_flags |= RTF_CACHE;
646 		rt->u.dst.flags |= DST_HOST;
647 
648 #ifdef CONFIG_IPV6_SUBTREES
649 		if (rt->rt6i_src.plen && saddr) {
650 			ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
651 			rt->rt6i_src.plen = 128;
652 		}
653 #endif
654 
655 	retry:
656 		neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
657 		if (IS_ERR(neigh)) {
658 			struct net *net = dev_net(rt->rt6i_dev);
659 			int saved_rt_min_interval =
660 				net->ipv6.sysctl.ip6_rt_gc_min_interval;
661 			int saved_rt_elasticity =
662 				net->ipv6.sysctl.ip6_rt_gc_elasticity;
663 
664 			if (attempts-- > 0) {
665 				net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
666 				net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
667 
668 				ip6_dst_gc(&net->ipv6.ip6_dst_ops);
669 
670 				net->ipv6.sysctl.ip6_rt_gc_elasticity =
671 					saved_rt_elasticity;
672 				net->ipv6.sysctl.ip6_rt_gc_min_interval =
673 					saved_rt_min_interval;
674 				goto retry;
675 			}
676 
677 			if (net_ratelimit())
678 				printk(KERN_WARNING
679 				       "Neighbour table overflow.\n");
680 			dst_free(&rt->u.dst);
681 			return NULL;
682 		}
683 		rt->rt6i_nexthop = neigh;
684 
685 	}
686 
687 	return rt;
688 }
689 
690 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
691 {
692 	struct rt6_info *rt = ip6_rt_copy(ort);
693 	if (rt) {
694 		ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
695 		rt->rt6i_dst.plen = 128;
696 		rt->rt6i_flags |= RTF_CACHE;
697 		rt->u.dst.flags |= DST_HOST;
698 		rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
699 	}
700 	return rt;
701 }
702 
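/*
 * Core of the routing decision for both the input and output paths:
 * look up the FIB, pick a route with rt6_select(), and, when the route
 * has no nexthop neighbour bound yet (typically an on-link prefix
 * route), insert a per-destination RTF_CACHE clone via rt6_alloc_cow().
 * If the insertion races with another CPU, the lookup is retried, up to
 * three attempts.
 */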
703 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
704 				      struct flowi *fl, int flags)
705 {
706 	struct fib6_node *fn;
707 	struct rt6_info *rt, *nrt;
708 	int strict = 0;
709 	int attempts = 3;
710 	int err;
711 	int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
712 
713 	strict |= flags & RT6_LOOKUP_F_IFACE;
714 
715 relookup:
716 	read_lock_bh(&table->tb6_lock);
717 
718 restart_2:
719 	fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
720 
721 restart:
722 	rt = rt6_select(fn, oif, strict | reachable);
723 
724 	BACKTRACK(net, &fl->fl6_src);
725 	if (rt == net->ipv6.ip6_null_entry ||
726 	    rt->rt6i_flags & RTF_CACHE)
727 		goto out;
728 
729 	dst_hold(&rt->u.dst);
730 	read_unlock_bh(&table->tb6_lock);
731 
732 	if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
733 		nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
734 	else {
735 #if CLONE_OFFLINK_ROUTE
736 		nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
737 #else
738 		goto out2;
739 #endif
740 	}
741 
742 	dst_release(&rt->u.dst);
743 	rt = nrt ? : net->ipv6.ip6_null_entry;
744 
745 	dst_hold(&rt->u.dst);
746 	if (nrt) {
747 		err = ip6_ins_rt(nrt);
748 		if (!err)
749 			goto out2;
750 	}
751 
752 	if (--attempts <= 0)
753 		goto out2;
754 
755 	/*
756 	 * Race condition! In the gap, when table->tb6_lock was
757 	 * released someone could insert this route.  Relookup.
758 	 */
759 	dst_release(&rt->u.dst);
760 	goto relookup;
761 
762 out:
763 	if (reachable) {
764 		reachable = 0;
765 		goto restart_2;
766 	}
767 	dst_hold(&rt->u.dst);
768 	read_unlock_bh(&table->tb6_lock);
769 out2:
770 	rt->u.dst.lastuse = jiffies;
771 	rt->u.dst.__use++;
772 
773 	return rt;
774 }
775 
776 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
777 					    struct flowi *fl, int flags)
778 {
779 	return ip6_pol_route(net, table, fl->iif, fl, flags);
780 }
781 
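/*
 * Attach a route to an incoming packet: build a flow from the IPv6
 * header (addresses, flow label and next header) and store the result
 * of the policy lookup with skb_dst_set().
 */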
782 void ip6_route_input(struct sk_buff *skb)
783 {
784 	struct ipv6hdr *iph = ipv6_hdr(skb);
785 	struct net *net = dev_net(skb->dev);
786 	int flags = RT6_LOOKUP_F_HAS_SADDR;
787 	struct flowi fl = {
788 		.iif = skb->dev->ifindex,
789 		.nl_u = {
790 			.ip6_u = {
791 				.daddr = iph->daddr,
792 				.saddr = iph->saddr,
793 				.flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
794 			},
795 		},
796 		.mark = skb->mark,
797 		.proto = iph->nexthdr,
798 	};
799 
800 	if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
801 		flags |= RT6_LOOKUP_F_IFACE;
802 
803 	skb_dst_set(skb, fib6_rule_lookup(net, &fl, flags, ip6_pol_route_input));
804 }
805 
806 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
807 					     struct flowi *fl, int flags)
808 {
809 	return ip6_pol_route(net, table, fl->oif, fl, flags);
810 }
811 
812 struct dst_entry * ip6_route_output(struct net *net, struct sock *sk,
813 				    struct flowi *fl)
814 {
815 	int flags = 0;
816 
817 	if (rt6_need_strict(&fl->fl6_dst))
818 		flags |= RT6_LOOKUP_F_IFACE;
819 
820 	if (!ipv6_addr_any(&fl->fl6_src))
821 		flags |= RT6_LOOKUP_F_HAS_SADDR;
822 	else if (sk) {
823 		unsigned int prefs = inet6_sk(sk)->srcprefs;
824 		if (prefs & IPV6_PREFER_SRC_TMP)
825 			flags |= RT6_LOOKUP_F_SRCPREF_TMP;
826 		if (prefs & IPV6_PREFER_SRC_PUBLIC)
827 			flags |= RT6_LOOKUP_F_SRCPREF_PUBLIC;
828 		if (prefs & IPV6_PREFER_SRC_COA)
829 			flags |= RT6_LOOKUP_F_SRCPREF_COA;
830 	}
831 
832 	return fib6_rule_lookup(net, fl, flags, ip6_pol_route_output);
833 }
834 
835 EXPORT_SYMBOL(ip6_route_output);
836 
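/*
 * Replace *dstp with a "blackhole" copy of the route: the new entry
 * keeps the original metrics, device and gateway, but its input and
 * output handlers simply discard packets (dst_discard).
 */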
837 int ip6_dst_blackhole(struct sock *sk, struct dst_entry **dstp, struct flowi *fl)
838 {
839 	struct rt6_info *ort = (struct rt6_info *) *dstp;
840 	struct rt6_info *rt = (struct rt6_info *)
841 		dst_alloc(&ip6_dst_blackhole_ops);
842 	struct dst_entry *new = NULL;
843 
844 	if (rt) {
845 		new = &rt->u.dst;
846 
847 		atomic_set(&new->__refcnt, 1);
848 		new->__use = 1;
849 		new->input = dst_discard;
850 		new->output = dst_discard;
851 
852 		memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
853 		new->dev = ort->u.dst.dev;
854 		if (new->dev)
855 			dev_hold(new->dev);
856 		rt->rt6i_idev = ort->rt6i_idev;
857 		if (rt->rt6i_idev)
858 			in6_dev_hold(rt->rt6i_idev);
859 		rt->rt6i_expires = 0;
860 
861 		ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
862 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
863 		rt->rt6i_metric = 0;
864 
865 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
866 #ifdef CONFIG_IPV6_SUBTREES
867 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
868 #endif
869 
870 		dst_free(new);
871 	}
872 
873 	dst_release(*dstp);
874 	*dstp = new;
875 	return (new ? 0 : -ENOMEM);
876 }
877 EXPORT_SYMBOL_GPL(ip6_dst_blackhole);
878 
879 /*
880  *	Destination cache support functions
881  */
882 
883 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
884 {
885 	struct rt6_info *rt;
886 
887 	rt = (struct rt6_info *) dst;
888 
889 	if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
890 		return dst;
891 
892 	return NULL;
893 }
894 
895 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
896 {
897 	struct rt6_info *rt = (struct rt6_info *) dst;
898 
899 	if (rt) {
900 		if (rt->rt6i_flags & RTF_CACHE)
901 			ip6_del_rt(rt);
902 		else
903 			dst_release(dst);
904 	}
905 	return NULL;
906 }
907 
908 static void ip6_link_failure(struct sk_buff *skb)
909 {
910 	struct rt6_info *rt;
911 
912 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
913 
914 	rt = (struct rt6_info *) skb_dst(skb);
915 	if (rt) {
916 		if (rt->rt6i_flags&RTF_CACHE) {
917 			dst_set_expires(&rt->u.dst, 0);
918 			rt->rt6i_flags |= RTF_EXPIRES;
919 		} else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
920 			rt->rt6i_node->fn_sernum = -1;
921 	}
922 }
923 
924 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
925 {
926 	struct rt6_info *rt6 = (struct rt6_info*)dst;
927 
928 	if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
929 		rt6->rt6i_flags |= RTF_MODIFIED;
930 		if (mtu < IPV6_MIN_MTU) {
931 			mtu = IPV6_MIN_MTU;
932 			dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
933 		}
934 		dst->metrics[RTAX_MTU-1] = mtu;
935 		call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
936 	}
937 }
938 
939 static int ipv6_get_mtu(struct net_device *dev);
940 
941 static inline unsigned int ipv6_advmss(struct net *net, unsigned int mtu)
942 {
943 	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
944 
945 	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
946 		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
947 
948 	/*
949 	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
950 	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
951 	 * IPV6_MAXPLEN is also valid and means: "any MSS,
952 	 * rely only on pmtu discovery"
953 	 */
954 	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
955 		mtu = IPV6_MAXPLEN;
956 	return mtu;
957 }
958 
959 static struct dst_entry *icmp6_dst_gc_list;
960 static DEFINE_SPINLOCK(icmp6_dst_lock);
961 
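/*
 * Allocate a dst for ndisc/ICMPv6 replies.  These entries are not
 * inserted into the FIB; they are chained on icmp6_dst_gc_list and
 * reclaimed by icmp6_dst_gc() once their refcount drops to zero.
 */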
962 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
963 				  struct neighbour *neigh,
964 				  const struct in6_addr *addr)
965 {
966 	struct rt6_info *rt;
967 	struct inet6_dev *idev = in6_dev_get(dev);
968 	struct net *net = dev_net(dev);
969 
970 	if (unlikely(idev == NULL))
971 		return NULL;
972 
973 	rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
974 	if (unlikely(rt == NULL)) {
975 		in6_dev_put(idev);
976 		goto out;
977 	}
978 
979 	dev_hold(dev);
980 	if (neigh)
981 		neigh_hold(neigh);
982 	else {
983 		neigh = ndisc_get_neigh(dev, addr);
984 		if (IS_ERR(neigh))
985 			neigh = NULL;
986 	}
987 
988 	rt->rt6i_dev	  = dev;
989 	rt->rt6i_idev     = idev;
990 	rt->rt6i_nexthop  = neigh;
991 	atomic_set(&rt->u.dst.__refcnt, 1);
992 	rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
993 	rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
994 	rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->u.dst));
995 	rt->u.dst.output  = ip6_output;
996 
997 #if 0	/* there's no chance to use these for ndisc */
998 	rt->u.dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
999 				? DST_HOST
1000 				: 0;
1001 	ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1002 	rt->rt6i_dst.plen = 128;
1003 #endif
1004 
1005 	spin_lock_bh(&icmp6_dst_lock);
1006 	rt->u.dst.next = icmp6_dst_gc_list;
1007 	icmp6_dst_gc_list = &rt->u.dst;
1008 	spin_unlock_bh(&icmp6_dst_lock);
1009 
1010 	fib6_force_start_gc(net);
1011 
1012 out:
1013 	return &rt->u.dst;
1014 }
1015 
1016 int icmp6_dst_gc(void)
1017 {
1018 	struct dst_entry *dst, *next, **pprev;
1019 	int more = 0;
1020 
1021 	next = NULL;
1022 
1023 	spin_lock_bh(&icmp6_dst_lock);
1024 	pprev = &icmp6_dst_gc_list;
1025 
1026 	while ((dst = *pprev) != NULL) {
1027 		if (!atomic_read(&dst->__refcnt)) {
1028 			*pprev = dst->next;
1029 			dst_free(dst);
1030 		} else {
1031 			pprev = &dst->next;
1032 			++more;
1033 		}
1034 	}
1035 
1036 	spin_unlock_bh(&icmp6_dst_lock);
1037 
1038 	return more;
1039 }
1040 
1041 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1042 			    void *arg)
1043 {
1044 	struct dst_entry *dst, **pprev;
1045 
1046 	spin_lock_bh(&icmp6_dst_lock);
1047 	pprev = &icmp6_dst_gc_list;
1048 	while ((dst = *pprev) != NULL) {
1049 		struct rt6_info *rt = (struct rt6_info *) dst;
1050 		if (func(rt, arg)) {
1051 			*pprev = dst->next;
1052 			dst_free(dst);
1053 		} else {
1054 			pprev = &dst->next;
1055 		}
1056 	}
1057 	spin_unlock_bh(&icmp6_dst_lock);
1058 }
1059 
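/*
 * dst garbage collection callback.  The run is skipped while the last
 * collection is younger than ip6_rt_gc_min_interval and the number of
 * entries does not exceed ip6_rt_max_size; otherwise fib6_run_gc() is
 * invoked with an expiry that grows under pressure and decays according
 * to ip6_rt_gc_elasticity.
 */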
1060 static int ip6_dst_gc(struct dst_ops *ops)
1061 {
1062 	unsigned long now = jiffies;
1063 	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1064 	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1065 	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1066 	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1067 	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1068 	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1069 
1070 	if (time_after(rt_last_gc + rt_min_interval, now) &&
1071 	    atomic_read(&ops->entries) <= rt_max_size)
1072 		goto out;
1073 
1074 	net->ipv6.ip6_rt_gc_expire++;
1075 	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1076 	net->ipv6.ip6_rt_last_gc = now;
1077 	if (atomic_read(&ops->entries) < ops->gc_thresh)
1078 		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1079 out:
1080 	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1081 	return (atomic_read(&ops->entries) > rt_max_size);
1082 }
1083 
1084 /* Clean host part of a prefix. Not necessary in radix tree,
1085    but results in cleaner routing tables.
1086 
1087    Remove it only when everything works!
1088  */
1089 
1090 static int ipv6_get_mtu(struct net_device *dev)
1091 {
1092 	int mtu = IPV6_MIN_MTU;
1093 	struct inet6_dev *idev;
1094 
1095 	idev = in6_dev_get(dev);
1096 	if (idev) {
1097 		mtu = idev->cnf.mtu6;
1098 		in6_dev_put(idev);
1099 	}
1100 	return mtu;
1101 }
1102 
1103 int ip6_dst_hoplimit(struct dst_entry *dst)
1104 {
1105 	int hoplimit = dst_metric(dst, RTAX_HOPLIMIT);
1106 	if (hoplimit < 0) {
1107 		struct net_device *dev = dst->dev;
1108 		struct inet6_dev *idev = in6_dev_get(dev);
1109 		if (idev) {
1110 			hoplimit = idev->cnf.hop_limit;
1111 			in6_dev_put(idev);
1112 		} else
1113 			hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1114 	}
1115 	return hoplimit;
1116 }
1117 
1118 /*
1119  *	Add a route described by a fib6_config (from netlink or ioctl).
1120  */
1121 
1122 int ip6_route_add(struct fib6_config *cfg)
1123 {
1124 	int err;
1125 	struct net *net = cfg->fc_nlinfo.nl_net;
1126 	struct rt6_info *rt = NULL;
1127 	struct net_device *dev = NULL;
1128 	struct inet6_dev *idev = NULL;
1129 	struct fib6_table *table;
1130 	int addr_type;
1131 
1132 	if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1133 		return -EINVAL;
1134 #ifndef CONFIG_IPV6_SUBTREES
1135 	if (cfg->fc_src_len)
1136 		return -EINVAL;
1137 #endif
1138 	if (cfg->fc_ifindex) {
1139 		err = -ENODEV;
1140 		dev = dev_get_by_index(net, cfg->fc_ifindex);
1141 		if (!dev)
1142 			goto out;
1143 		idev = in6_dev_get(dev);
1144 		if (!idev)
1145 			goto out;
1146 	}
1147 
1148 	if (cfg->fc_metric == 0)
1149 		cfg->fc_metric = IP6_RT_PRIO_USER;
1150 
1151 	table = fib6_new_table(net, cfg->fc_table);
1152 	if (table == NULL) {
1153 		err = -ENOBUFS;
1154 		goto out;
1155 	}
1156 
1157 	rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1158 
1159 	if (rt == NULL) {
1160 		err = -ENOMEM;
1161 		goto out;
1162 	}
1163 
1164 	rt->u.dst.obsolete = -1;
1165 	rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
1166 				jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1167 				0;
1168 
1169 	if (cfg->fc_protocol == RTPROT_UNSPEC)
1170 		cfg->fc_protocol = RTPROT_BOOT;
1171 	rt->rt6i_protocol = cfg->fc_protocol;
1172 
1173 	addr_type = ipv6_addr_type(&cfg->fc_dst);
1174 
1175 	if (addr_type & IPV6_ADDR_MULTICAST)
1176 		rt->u.dst.input = ip6_mc_input;
1177 	else
1178 		rt->u.dst.input = ip6_forward;
1179 
1180 	rt->u.dst.output = ip6_output;
1181 
1182 	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1183 	rt->rt6i_dst.plen = cfg->fc_dst_len;
1184 	if (rt->rt6i_dst.plen == 128)
1185 	       rt->u.dst.flags = DST_HOST;
1186 
1187 #ifdef CONFIG_IPV6_SUBTREES
1188 	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1189 	rt->rt6i_src.plen = cfg->fc_src_len;
1190 #endif
1191 
1192 	rt->rt6i_metric = cfg->fc_metric;
1193 
1194 	/* We cannot add true routes via loopback here,
1195 	   they would result in kernel looping; promote them to reject routes
1196 	 */
1197 	if ((cfg->fc_flags & RTF_REJECT) ||
1198 	    (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
1199 		/* hold loopback dev/idev if we haven't done so. */
1200 		if (dev != net->loopback_dev) {
1201 			if (dev) {
1202 				dev_put(dev);
1203 				in6_dev_put(idev);
1204 			}
1205 			dev = net->loopback_dev;
1206 			dev_hold(dev);
1207 			idev = in6_dev_get(dev);
1208 			if (!idev) {
1209 				err = -ENODEV;
1210 				goto out;
1211 			}
1212 		}
1213 		rt->u.dst.output = ip6_pkt_discard_out;
1214 		rt->u.dst.input = ip6_pkt_discard;
1215 		rt->u.dst.error = -ENETUNREACH;
1216 		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1217 		goto install_route;
1218 	}
1219 
1220 	if (cfg->fc_flags & RTF_GATEWAY) {
1221 		struct in6_addr *gw_addr;
1222 		int gwa_type;
1223 
1224 		gw_addr = &cfg->fc_gateway;
1225 		ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1226 		gwa_type = ipv6_addr_type(gw_addr);
1227 
1228 		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1229 			struct rt6_info *grt;
1230 
1231 			/* IPv6 strictly inhibits using non-link-local
1232 			   addresses as nexthop addresses.
1233 			   Otherwise, a router will not be able to send redirects.
1234 			   It is very good, but in some (rare!) circumstances
1235 			   (SIT, PtP, NBMA NOARP links) it is handy to allow
1236 			   some exceptions. --ANK
1237 			 */
1238 			err = -EINVAL;
1239 			if (!(gwa_type&IPV6_ADDR_UNICAST))
1240 				goto out;
1241 
1242 			grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1243 
1244 			err = -EHOSTUNREACH;
1245 			if (grt == NULL)
1246 				goto out;
1247 			if (dev) {
1248 				if (dev != grt->rt6i_dev) {
1249 					dst_release(&grt->u.dst);
1250 					goto out;
1251 				}
1252 			} else {
1253 				dev = grt->rt6i_dev;
1254 				idev = grt->rt6i_idev;
1255 				dev_hold(dev);
1256 				in6_dev_hold(grt->rt6i_idev);
1257 			}
1258 			if (!(grt->rt6i_flags&RTF_GATEWAY))
1259 				err = 0;
1260 			dst_release(&grt->u.dst);
1261 
1262 			if (err)
1263 				goto out;
1264 		}
1265 		err = -EINVAL;
1266 		if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1267 			goto out;
1268 	}
1269 
1270 	err = -ENODEV;
1271 	if (dev == NULL)
1272 		goto out;
1273 
1274 	if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1275 		rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1276 		if (IS_ERR(rt->rt6i_nexthop)) {
1277 			err = PTR_ERR(rt->rt6i_nexthop);
1278 			rt->rt6i_nexthop = NULL;
1279 			goto out;
1280 		}
1281 	}
1282 
1283 	rt->rt6i_flags = cfg->fc_flags;
1284 
1285 install_route:
1286 	if (cfg->fc_mx) {
1287 		struct nlattr *nla;
1288 		int remaining;
1289 
1290 		nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1291 			int type = nla_type(nla);
1292 
1293 			if (type) {
1294 				if (type > RTAX_MAX) {
1295 					err = -EINVAL;
1296 					goto out;
1297 				}
1298 
1299 				rt->u.dst.metrics[type - 1] = nla_get_u32(nla);
1300 			}
1301 		}
1302 	}
1303 
1304 	if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
1305 		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1306 	if (!dst_mtu(&rt->u.dst))
1307 		rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1308 	if (!dst_metric(&rt->u.dst, RTAX_ADVMSS))
1309 		rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->u.dst));
1310 	rt->u.dst.dev = dev;
1311 	rt->rt6i_idev = idev;
1312 	rt->rt6i_table = table;
1313 
1314 	cfg->fc_nlinfo.nl_net = dev_net(dev);
1315 
1316 	return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1317 
1318 out:
1319 	if (dev)
1320 		dev_put(dev);
1321 	if (idev)
1322 		in6_dev_put(idev);
1323 	if (rt)
1324 		dst_free(&rt->u.dst);
1325 	return err;
1326 }
1327 
1328 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1329 {
1330 	int err;
1331 	struct fib6_table *table;
1332 	struct net *net = dev_net(rt->rt6i_dev);
1333 
1334 	if (rt == net->ipv6.ip6_null_entry)
1335 		return -ENOENT;
1336 
1337 	table = rt->rt6i_table;
1338 	write_lock_bh(&table->tb6_lock);
1339 
1340 	err = fib6_del(rt, info);
1341 	dst_release(&rt->u.dst);
1342 
1343 	write_unlock_bh(&table->tb6_lock);
1344 
1345 	return err;
1346 }
1347 
1348 int ip6_del_rt(struct rt6_info *rt)
1349 {
1350 	struct nl_info info = {
1351 		.nl_net = dev_net(rt->rt6i_dev),
1352 	};
1353 	return __ip6_del_rt(rt, &info);
1354 }
1355 
1356 static int ip6_route_del(struct fib6_config *cfg)
1357 {
1358 	struct fib6_table *table;
1359 	struct fib6_node *fn;
1360 	struct rt6_info *rt;
1361 	int err = -ESRCH;
1362 
1363 	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1364 	if (table == NULL)
1365 		return err;
1366 
1367 	read_lock_bh(&table->tb6_lock);
1368 
1369 	fn = fib6_locate(&table->tb6_root,
1370 			 &cfg->fc_dst, cfg->fc_dst_len,
1371 			 &cfg->fc_src, cfg->fc_src_len);
1372 
1373 	if (fn) {
1374 		for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1375 			if (cfg->fc_ifindex &&
1376 			    (rt->rt6i_dev == NULL ||
1377 			     rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1378 				continue;
1379 			if (cfg->fc_flags & RTF_GATEWAY &&
1380 			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1381 				continue;
1382 			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1383 				continue;
1384 			dst_hold(&rt->u.dst);
1385 			read_unlock_bh(&table->tb6_lock);
1386 
1387 			return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1388 		}
1389 	}
1390 	read_unlock_bh(&table->tb6_lock);
1391 
1392 	return err;
1393 }
1394 
1395 /*
1396  *	Handle redirects
1397  */
1398 struct ip6rd_flowi {
1399 	struct flowi fl;
1400 	struct in6_addr gateway;
1401 };
1402 
1403 static struct rt6_info *__ip6_route_redirect(struct net *net,
1404 					     struct fib6_table *table,
1405 					     struct flowi *fl,
1406 					     int flags)
1407 {
1408 	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl;
1409 	struct rt6_info *rt;
1410 	struct fib6_node *fn;
1411 
1412 	/*
1413 	 * Get the "current" route for this destination and
1414 	 * check if the redirect has come from the appropriate router.
1415 	 *
1416 	 * RFC 2461 specifies that redirects should only be
1417 	 * accepted if they come from the nexthop to the target.
1418 	 * Due to the way the routes are chosen, this notion
1419 	 * is a bit fuzzy and one might need to check all possible
1420 	 * routes.
1421 	 */
1422 
1423 	read_lock_bh(&table->tb6_lock);
1424 	fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
1425 restart:
1426 	for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1427 		/*
1428 		 * Current route is on-link; redirect is always invalid.
1429 		 *
1430 		 * It seems the previous statement is not true: it could
1431 		 * be a node which regards us as on-link (e.g. proxy ndisc),
1432 		 * but then the router serving it might decide that we should
1433 		 * know the truth 8)8) --ANK (980726).
1434 		 */
1435 		if (rt6_check_expired(rt))
1436 			continue;
1437 		if (!(rt->rt6i_flags & RTF_GATEWAY))
1438 			continue;
1439 		if (fl->oif != rt->rt6i_dev->ifindex)
1440 			continue;
1441 		if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1442 			continue;
1443 		break;
1444 	}
1445 
1446 	if (!rt)
1447 		rt = net->ipv6.ip6_null_entry;
1448 	BACKTRACK(net, &fl->fl6_src);
1449 out:
1450 	dst_hold(&rt->u.dst);
1451 
1452 	read_unlock_bh(&table->tb6_lock);
1453 
1454 	return rt;
1455 };
1456 
1457 static struct rt6_info *ip6_route_redirect(struct in6_addr *dest,
1458 					   struct in6_addr *src,
1459 					   struct in6_addr *gateway,
1460 					   struct net_device *dev)
1461 {
1462 	int flags = RT6_LOOKUP_F_HAS_SADDR;
1463 	struct net *net = dev_net(dev);
1464 	struct ip6rd_flowi rdfl = {
1465 		.fl = {
1466 			.oif = dev->ifindex,
1467 			.nl_u = {
1468 				.ip6_u = {
1469 					.daddr = *dest,
1470 					.saddr = *src,
1471 				},
1472 			},
1473 		},
1474 		.gateway = *gateway,
1475 	};
1476 
1477 	if (rt6_need_strict(dest))
1478 		flags |= RT6_LOOKUP_F_IFACE;
1479 
1480 	return (struct rt6_info *)fib6_rule_lookup(net, (struct flowi *)&rdfl,
1481 						   flags, __ip6_route_redirect);
1482 }
1483 
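/*
 * Handle a validated NDISC redirect: update the neighbour cache entry
 * for the new nexthop, then install an RTF_CACHE|RTF_DYNAMIC host route
 * for the destination through that nexthop (dropping RTF_GATEWAY for
 * on-link redirects) and notify listeners via NETEVENT_REDIRECT.
 */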
1484 void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
1485 		  struct in6_addr *saddr,
1486 		  struct neighbour *neigh, u8 *lladdr, int on_link)
1487 {
1488 	struct rt6_info *rt, *nrt = NULL;
1489 	struct netevent_redirect netevent;
1490 	struct net *net = dev_net(neigh->dev);
1491 
1492 	rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1493 
1494 	if (rt == net->ipv6.ip6_null_entry) {
1495 		if (net_ratelimit())
1496 			printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1497 			       "for redirect target\n");
1498 		goto out;
1499 	}
1500 
1501 	/*
1502 	 *	We have finally decided to accept it.
1503 	 */
1504 
1505 	neigh_update(neigh, lladdr, NUD_STALE,
1506 		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
1507 		     NEIGH_UPDATE_F_OVERRIDE|
1508 		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1509 				     NEIGH_UPDATE_F_ISROUTER))
1510 		     );
1511 
1512 	/*
1513 	 * Redirect received -> path was valid.
1514 	 * Look, redirects are sent only in response to data packets,
1515 	 * so that this nexthop apparently is reachable. --ANK
1516 	 */
1517 	dst_confirm(&rt->u.dst);
1518 
1519 	/* Duplicate redirect: silently ignore. */
1520 	if (neigh == rt->u.dst.neighbour)
1521 		goto out;
1522 
1523 	nrt = ip6_rt_copy(rt);
1524 	if (nrt == NULL)
1525 		goto out;
1526 
1527 	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1528 	if (on_link)
1529 		nrt->rt6i_flags &= ~RTF_GATEWAY;
1530 
1531 	ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1532 	nrt->rt6i_dst.plen = 128;
1533 	nrt->u.dst.flags |= DST_HOST;
1534 
1535 	ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1536 	nrt->rt6i_nexthop = neigh_clone(neigh);
1537 	/* Reset pmtu, it may be better */
1538 	nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1539 	nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dev_net(neigh->dev),
1540 							dst_mtu(&nrt->u.dst));
1541 
1542 	if (ip6_ins_rt(nrt))
1543 		goto out;
1544 
1545 	netevent.old = &rt->u.dst;
1546 	netevent.new = &nrt->u.dst;
1547 	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1548 
1549 	if (rt->rt6i_flags&RTF_CACHE) {
1550 		ip6_del_rt(rt);
1551 		return;
1552 	}
1553 
1554 out:
1555 	dst_release(&rt->u.dst);
1556 	return;
1557 }
1558 
1559 /*
1560  *	Handle ICMP "packet too big" messages
1561  *	i.e. Path MTU discovery
1562  */
1563 
1564 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1565 			struct net_device *dev, u32 pmtu)
1566 {
1567 	struct rt6_info *rt, *nrt;
1568 	struct net *net = dev_net(dev);
1569 	int allfrag = 0;
1570 
1571 	rt = rt6_lookup(net, daddr, saddr, dev->ifindex, 0);
1572 	if (rt == NULL)
1573 		return;
1574 
1575 	if (pmtu >= dst_mtu(&rt->u.dst))
1576 		goto out;
1577 
1578 	if (pmtu < IPV6_MIN_MTU) {
1579 		/*
1580 		 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1581 		 * MTU (1280) and a fragment header should always be included
1582 		 * after a node receives a Packet Too Big message reporting a
1583 		 * PMTU less than the IPv6 Minimum Link MTU.
1584 		 */
1585 		pmtu = IPV6_MIN_MTU;
1586 		allfrag = 1;
1587 	}
1588 
1589 	/* New mtu received -> path was valid.
1590 	   They are sent only in response to data packets,
1591 	   so that this nexthop apparently is reachable. --ANK
1592 	 */
1593 	dst_confirm(&rt->u.dst);
1594 
1595 	/* Host route. If it is static, it would be better
1596 	   not to override it but to add a new one, so that
1597 	   when the cache entry expires the old pmtu
1598 	   is restored automatically.
1599 	 */
1600 	if (rt->rt6i_flags & RTF_CACHE) {
1601 		rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1602 		if (allfrag)
1603 			rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1604 		dst_set_expires(&rt->u.dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1605 		rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1606 		goto out;
1607 	}
1608 
1609 	/* Network route.
1610 	   Two cases are possible:
1611 	   1. It is a connected route. Action: COW.
1612 	   2. It is a gatewayed or NONEXTHOP route. Action: clone it.
1613 	 */
1614 	if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1615 		nrt = rt6_alloc_cow(rt, daddr, saddr);
1616 	else
1617 		nrt = rt6_alloc_clone(rt, daddr);
1618 
1619 	if (nrt) {
1620 		nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1621 		if (allfrag)
1622 			nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1623 
1624 		/* According to RFC 1981, detection of a PMTU increase shouldn't
1625 		 * happen within 5 minutes; the recommended timer is 10 minutes.
1626 		 * Here this route's expiration time is set to ip6_rt_mtu_expires,
1627 		 * which is 10 minutes. After 10 minutes the decreased pmtu expires
1628 		 * and detection of a PMTU increase happens automatically.
1629 		 */
1630 		dst_set_expires(&nrt->u.dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1631 		nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1632 
1633 		ip6_ins_rt(nrt);
1634 	}
1635 out:
1636 	dst_release(&rt->u.dst);
1637 }
1638 
1639 /*
1640  *	Misc support functions
1641  */
1642 
1643 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1644 {
1645 	struct net *net = dev_net(ort->rt6i_dev);
1646 	struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1647 
1648 	if (rt) {
1649 		rt->u.dst.input = ort->u.dst.input;
1650 		rt->u.dst.output = ort->u.dst.output;
1651 
1652 		memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1653 		rt->u.dst.error = ort->u.dst.error;
1654 		rt->u.dst.dev = ort->u.dst.dev;
1655 		if (rt->u.dst.dev)
1656 			dev_hold(rt->u.dst.dev);
1657 		rt->rt6i_idev = ort->rt6i_idev;
1658 		if (rt->rt6i_idev)
1659 			in6_dev_hold(rt->rt6i_idev);
1660 		rt->u.dst.lastuse = jiffies;
1661 		rt->rt6i_expires = 0;
1662 
1663 		ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1664 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1665 		rt->rt6i_metric = 0;
1666 
1667 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1668 #ifdef CONFIG_IPV6_SUBTREES
1669 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1670 #endif
1671 		rt->rt6i_table = ort->rt6i_table;
1672 	}
1673 	return rt;
1674 }
1675 
1676 #ifdef CONFIG_IPV6_ROUTE_INFO
1677 static struct rt6_info *rt6_get_route_info(struct net *net,
1678 					   struct in6_addr *prefix, int prefixlen,
1679 					   struct in6_addr *gwaddr, int ifindex)
1680 {
1681 	struct fib6_node *fn;
1682 	struct rt6_info *rt = NULL;
1683 	struct fib6_table *table;
1684 
1685 	table = fib6_get_table(net, RT6_TABLE_INFO);
1686 	if (table == NULL)
1687 		return NULL;
1688 
1689 	write_lock_bh(&table->tb6_lock);
1690 	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0);
1691 	if (!fn)
1692 		goto out;
1693 
1694 	for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1695 		if (rt->rt6i_dev->ifindex != ifindex)
1696 			continue;
1697 		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1698 			continue;
1699 		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1700 			continue;
1701 		dst_hold(&rt->u.dst);
1702 		break;
1703 	}
1704 out:
1705 	write_unlock_bh(&table->tb6_lock);
1706 	return rt;
1707 }
1708 
1709 static struct rt6_info *rt6_add_route_info(struct net *net,
1710 					   struct in6_addr *prefix, int prefixlen,
1711 					   struct in6_addr *gwaddr, int ifindex,
1712 					   unsigned pref)
1713 {
1714 	struct fib6_config cfg = {
1715 		.fc_table	= RT6_TABLE_INFO,
1716 		.fc_metric	= IP6_RT_PRIO_USER,
1717 		.fc_ifindex	= ifindex,
1718 		.fc_dst_len	= prefixlen,
1719 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1720 				  RTF_UP | RTF_PREF(pref),
1721 		.fc_nlinfo.pid = 0,
1722 		.fc_nlinfo.nlh = NULL,
1723 		.fc_nlinfo.nl_net = net,
1724 	};
1725 
1726 	ipv6_addr_copy(&cfg.fc_dst, prefix);
1727 	ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1728 
1729 	/* We should treat it as a default route if prefix length is 0. */
1730 	if (!prefixlen)
1731 		cfg.fc_flags |= RTF_DEFAULT;
1732 
1733 	ip6_route_add(&cfg);
1734 
1735 	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1736 }
1737 #endif
1738 
1739 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1740 {
1741 	struct rt6_info *rt;
1742 	struct fib6_table *table;
1743 
1744 	table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1745 	if (table == NULL)
1746 		return NULL;
1747 
1748 	write_lock_bh(&table->tb6_lock);
1749 	for (rt = table->tb6_root.leaf; rt; rt=rt->u.dst.rt6_next) {
1750 		if (dev == rt->rt6i_dev &&
1751 		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1752 		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
1753 			break;
1754 	}
1755 	if (rt)
1756 		dst_hold(&rt->u.dst);
1757 	write_unlock_bh(&table->tb6_lock);
1758 	return rt;
1759 }
1760 
1761 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1762 				     struct net_device *dev,
1763 				     unsigned int pref)
1764 {
1765 	struct fib6_config cfg = {
1766 		.fc_table	= RT6_TABLE_DFLT,
1767 		.fc_metric	= IP6_RT_PRIO_USER,
1768 		.fc_ifindex	= dev->ifindex,
1769 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1770 				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1771 		.fc_nlinfo.pid = 0,
1772 		.fc_nlinfo.nlh = NULL,
1773 		.fc_nlinfo.nl_net = dev_net(dev),
1774 	};
1775 
1776 	ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1777 
1778 	ip6_route_add(&cfg);
1779 
1780 	return rt6_get_dflt_router(gwaddr, dev);
1781 }
1782 
1783 void rt6_purge_dflt_routers(struct net *net)
1784 {
1785 	struct rt6_info *rt;
1786 	struct fib6_table *table;
1787 
1788 	/* NOTE: Keep consistent with rt6_get_dflt_router */
1789 	table = fib6_get_table(net, RT6_TABLE_DFLT);
1790 	if (table == NULL)
1791 		return;
1792 
1793 restart:
1794 	read_lock_bh(&table->tb6_lock);
1795 	for (rt = table->tb6_root.leaf; rt; rt = rt->u.dst.rt6_next) {
1796 		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1797 			dst_hold(&rt->u.dst);
1798 			read_unlock_bh(&table->tb6_lock);
1799 			ip6_del_rt(rt);
1800 			goto restart;
1801 		}
1802 	}
1803 	read_unlock_bh(&table->tb6_lock);
1804 }
1805 
1806 static void rtmsg_to_fib6_config(struct net *net,
1807 				 struct in6_rtmsg *rtmsg,
1808 				 struct fib6_config *cfg)
1809 {
1810 	memset(cfg, 0, sizeof(*cfg));
1811 
1812 	cfg->fc_table = RT6_TABLE_MAIN;
1813 	cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1814 	cfg->fc_metric = rtmsg->rtmsg_metric;
1815 	cfg->fc_expires = rtmsg->rtmsg_info;
1816 	cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1817 	cfg->fc_src_len = rtmsg->rtmsg_src_len;
1818 	cfg->fc_flags = rtmsg->rtmsg_flags;
1819 
1820 	cfg->fc_nlinfo.nl_net = net;
1821 
1822 	ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1823 	ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1824 	ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1825 }
1826 
1827 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1828 {
1829 	struct fib6_config cfg;
1830 	struct in6_rtmsg rtmsg;
1831 	int err;
1832 
1833 	switch(cmd) {
1834 	case SIOCADDRT:		/* Add a route */
1835 	case SIOCDELRT:		/* Delete a route */
1836 		if (!capable(CAP_NET_ADMIN))
1837 			return -EPERM;
1838 		err = copy_from_user(&rtmsg, arg,
1839 				     sizeof(struct in6_rtmsg));
1840 		if (err)
1841 			return -EFAULT;
1842 
1843 		rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1844 
1845 		rtnl_lock();
1846 		switch (cmd) {
1847 		case SIOCADDRT:
1848 			err = ip6_route_add(&cfg);
1849 			break;
1850 		case SIOCDELRT:
1851 			err = ip6_route_del(&cfg);
1852 			break;
1853 		default:
1854 			err = -EINVAL;
1855 		}
1856 		rtnl_unlock();
1857 
1858 		return err;
1859 	}
1860 
1861 	return -EINVAL;
1862 }
1863 
1864 /*
1865  *	Drop the packet on the floor
1866  */
1867 
1868 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
1869 {
1870 	int type;
1871 	struct dst_entry *dst = skb_dst(skb);
1872 	switch (ipstats_mib_noroutes) {
1873 	case IPSTATS_MIB_INNOROUTES:
1874 		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1875 		if (type == IPV6_ADDR_ANY || type == IPV6_ADDR_RESERVED) {
1876 			IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1877 				      IPSTATS_MIB_INADDRERRORS);
1878 			break;
1879 		}
1880 		/* FALLTHROUGH */
1881 	case IPSTATS_MIB_OUTNOROUTES:
1882 		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1883 			      ipstats_mib_noroutes);
1884 		break;
1885 	}
1886 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0, skb->dev);
1887 	kfree_skb(skb);
1888 	return 0;
1889 }
1890 
1891 static int ip6_pkt_discard(struct sk_buff *skb)
1892 {
1893 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
1894 }
1895 
1896 static int ip6_pkt_discard_out(struct sk_buff *skb)
1897 {
1898 	skb->dev = skb_dst(skb)->dev;
1899 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
1900 }
1901 
1902 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1903 
1904 static int ip6_pkt_prohibit(struct sk_buff *skb)
1905 {
1906 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
1907 }
1908 
1909 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
1910 {
1911 	skb->dev = skb_dst(skb)->dev;
1912 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
1913 }
1914 
1915 #endif
1916 
1917 /*
1918  *	Allocate a dst for local (unicast / anycast) address.
1919  */
1920 
1921 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1922 				    const struct in6_addr *addr,
1923 				    int anycast)
1924 {
1925 	struct net *net = dev_net(idev->dev);
1926 	struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1927 	struct neighbour *neigh;
1928 
1929 	if (rt == NULL)
1930 		return ERR_PTR(-ENOMEM);
1931 
1932 	dev_hold(net->loopback_dev);
1933 	in6_dev_hold(idev);
1934 
1935 	rt->u.dst.flags = DST_HOST;
1936 	rt->u.dst.input = ip6_input;
1937 	rt->u.dst.output = ip6_output;
1938 	rt->rt6i_dev = net->loopback_dev;
1939 	rt->rt6i_idev = idev;
1940 	rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1941 	rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->u.dst));
1942 	rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1943 	rt->u.dst.obsolete = -1;
1944 
1945 	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1946 	if (anycast)
1947 		rt->rt6i_flags |= RTF_ANYCAST;
1948 	else
1949 		rt->rt6i_flags |= RTF_LOCAL;
1950 	neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1951 	if (IS_ERR(neigh)) {
1952 		dst_free(&rt->u.dst);
1953 
1954 		/* We are casting this because that is the return
1955 		 * value type.  But an errno encoded pointer is the
1956 		 * same regardless of the underlying pointer type,
1957 		 * and that's what we are returning.  So this is OK.
1958 		 */
1959 		return (struct rt6_info *) neigh;
1960 	}
1961 	rt->rt6i_nexthop = neigh;
1962 
1963 	ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1964 	rt->rt6i_dst.plen = 128;
1965 	rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
1966 
1967 	atomic_set(&rt->u.dst.__refcnt, 1);
1968 
1969 	return rt;
1970 }
1971 
1972 struct arg_dev_net {
1973 	struct net_device *dev;
1974 	struct net *net;
1975 };
1976 
1977 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1978 {
1979 	struct net_device *dev = ((struct arg_dev_net *)arg)->dev;
1980 	struct net *net = ((struct arg_dev_net *)arg)->net;
1981 
1982 	if (((void *)rt->rt6i_dev == dev || dev == NULL) &&
1983 	    rt != net->ipv6.ip6_null_entry) {
1984 		RT6_TRACE("deleted by ifdown %p\n", rt);
1985 		return -1;
1986 	}
1987 	return 0;
1988 }
1989 
1990 void rt6_ifdown(struct net *net, struct net_device *dev)
1991 {
1992 	struct arg_dev_net adn = {
1993 		.dev = dev,
1994 		.net = net,
1995 	};
1996 
1997 	fib6_clean_all(net, fib6_ifdown, 0, &adn);
1998 	icmp6_clean_all(fib6_ifdown, &adn);
1999 }
2000 
2001 struct rt6_mtu_change_arg
2002 {
2003 	struct net_device *dev;
2004 	unsigned mtu;
2005 };
2006 
2007 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2008 {
2009 	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2010 	struct inet6_dev *idev;
2011 	struct net *net = dev_net(arg->dev);
2012 
2013 	/* In IPv6, PMTU discovery is not optional,
2014 	   so the RTAX_MTU lock cannot disable it.
2015 	   We still use this lock to block changes
2016 	   caused by addrconf/ndisc.
2017 	*/
2018 
2019 	idev = __in6_dev_get(arg->dev);
2020 	if (idev == NULL)
2021 		return 0;
2022 
2023 	/* For an administrative MTU increase there is no way to discover
2024 	   an IPv6 PMTU increase, so the PMTU must be updated here.
2025 	   Since RFC 1981 doesn't cover administrative MTU increases,
2026 	   updating the PMTU on increase is a MUST (e.g. jumbo frames).
2027 	 */
2028 	/*
2029 	   If the new MTU is less than the route PMTU, the new MTU will be
2030 	   the lowest MTU in the path; update the route PMTU to reflect the
2031 	   decrease. If the new MTU is greater than the route PMTU and the
2032 	   old MTU was the lowest MTU in the path, update the route PMTU to
2033 	   reflect the increase. In that case, if another node in the path
2034 	   still has the lowest MTU, its Packet Too Big message will trigger
2035 	   PMTU discovery again.
2036 	 */
2037 	if (rt->rt6i_dev == arg->dev &&
2038 	    !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
2039 	    (dst_mtu(&rt->u.dst) >= arg->mtu ||
2040 	     (dst_mtu(&rt->u.dst) < arg->mtu &&
2041 	      dst_mtu(&rt->u.dst) == idev->cnf.mtu6))) {
2042 		rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
2043 		rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, arg->mtu);
2044 	}
2045 	return 0;
2046 }
2047 
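/*
 *	Propagate a device MTU change to the routes that use it by walking
 *	all FIB trees with rt6_mtu_change_route() as the per-route callback.
 */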
2048 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2049 {
2050 	struct rt6_mtu_change_arg arg = {
2051 		.dev = dev,
2052 		.mtu = mtu,
2053 	};
2054 
2055 	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2056 }
2057 
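/*
 *	Netlink attribute policy used when parsing RTM_NEWROUTE, RTM_DELROUTE
 *	and RTM_GETROUTE requests.
 */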
2058 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2059 	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2060 	[RTA_OIF]               = { .type = NLA_U32 },
2061 	[RTA_IIF]		= { .type = NLA_U32 },
2062 	[RTA_PRIORITY]          = { .type = NLA_U32 },
2063 	[RTA_METRICS]           = { .type = NLA_NESTED },
2064 };
2065 
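/*
 *	Translate an rtnetlink route message and its attributes into a
 *	struct fib6_config that ip6_route_add()/ip6_route_del() understand.
 */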
2066 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2067 			      struct fib6_config *cfg)
2068 {
2069 	struct rtmsg *rtm;
2070 	struct nlattr *tb[RTA_MAX+1];
2071 	int err;
2072 
2073 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2074 	if (err < 0)
2075 		goto errout;
2076 
2077 	err = -EINVAL;
2078 	rtm = nlmsg_data(nlh);
2079 	memset(cfg, 0, sizeof(*cfg));
2080 
2081 	cfg->fc_table = rtm->rtm_table;
2082 	cfg->fc_dst_len = rtm->rtm_dst_len;
2083 	cfg->fc_src_len = rtm->rtm_src_len;
2084 	cfg->fc_flags = RTF_UP;
2085 	cfg->fc_protocol = rtm->rtm_protocol;
2086 
2087 	if (rtm->rtm_type == RTN_UNREACHABLE)
2088 		cfg->fc_flags |= RTF_REJECT;
2089 
2090 	cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2091 	cfg->fc_nlinfo.nlh = nlh;
2092 	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2093 
2094 	if (tb[RTA_GATEWAY]) {
2095 		nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2096 		cfg->fc_flags |= RTF_GATEWAY;
2097 	}
2098 
2099 	if (tb[RTA_DST]) {
2100 		int plen = (rtm->rtm_dst_len + 7) >> 3;
2101 
2102 		if (nla_len(tb[RTA_DST]) < plen)
2103 			goto errout;
2104 
2105 		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2106 	}
2107 
2108 	if (tb[RTA_SRC]) {
2109 		int plen = (rtm->rtm_src_len + 7) >> 3;
2110 
2111 		if (nla_len(tb[RTA_SRC]) < plen)
2112 			goto errout;
2113 
2114 		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2115 	}
2116 
2117 	if (tb[RTA_OIF])
2118 		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2119 
2120 	if (tb[RTA_PRIORITY])
2121 		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2122 
2123 	if (tb[RTA_METRICS]) {
2124 		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2125 		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2126 	}
2127 
2128 	if (tb[RTA_TABLE])
2129 		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2130 
2131 	err = 0;
2132 errout:
2133 	return err;
2134 }
2135 
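/*
 *	rtnetlink handlers for RTM_DELROUTE and RTM_NEWROUTE: convert the
 *	request with rtm_to_fib6_config() and hand it to the FIB code.
 */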
2136 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
2137 {
2138 	struct fib6_config cfg;
2139 	int err;
2140 
2141 	err = rtm_to_fib6_config(skb, nlh, &cfg);
2142 	if (err < 0)
2143 		return err;
2144 
2145 	return ip6_route_del(&cfg);
2146 }
2147 
2148 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
2149 {
2150 	struct fib6_config cfg;
2151 	int err;
2152 
2153 	err = rtm_to_fib6_config(skb, nlh, &cfg);
2154 	if (err < 0)
2155 		return err;
2156 
2157 	return ip6_route_add(&cfg);
2158 }
2159 
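/*
 *	Worst-case netlink message size for a single route; used to size
 *	the notification skb allocated in inet6_rt_notify().
 */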
2160 static inline size_t rt6_nlmsg_size(void)
2161 {
2162 	return NLMSG_ALIGN(sizeof(struct rtmsg))
2163 	       + nla_total_size(16) /* RTA_SRC */
2164 	       + nla_total_size(16) /* RTA_DST */
2165 	       + nla_total_size(16) /* RTA_GATEWAY */
2166 	       + nla_total_size(16) /* RTA_PREFSRC */
2167 	       + nla_total_size(4) /* RTA_TABLE */
2168 	       + nla_total_size(4) /* RTA_IIF */
2169 	       + nla_total_size(4) /* RTA_OIF */
2170 	       + nla_total_size(4) /* RTA_PRIORITY */
2171 	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2172 	       + nla_total_size(sizeof(struct rta_cacheinfo));
2173 }
2174 
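/*
 *	Fill one route message for rt into skb.  Returns the result of
 *	nlmsg_end() on success, 1 when a prefix-only dump skips a
 *	non-prefix route, and -EMSGSIZE if the skb ran out of room.
 */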
2175 static int rt6_fill_node(struct net *net,
2176 			 struct sk_buff *skb, struct rt6_info *rt,
2177 			 struct in6_addr *dst, struct in6_addr *src,
2178 			 int iif, int type, u32 pid, u32 seq,
2179 			 int prefix, int nowait, unsigned int flags)
2180 {
2181 	struct rtmsg *rtm;
2182 	struct nlmsghdr *nlh;
2183 	long expires;
2184 	u32 table;
2185 
2186 	if (prefix) {	/* user wants prefix routes only */
2187 		if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2188 			/* success since this is not a prefix route */
2189 			return 1;
2190 		}
2191 	}
2192 
2193 	nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2194 	if (nlh == NULL)
2195 		return -EMSGSIZE;
2196 
2197 	rtm = nlmsg_data(nlh);
2198 	rtm->rtm_family = AF_INET6;
2199 	rtm->rtm_dst_len = rt->rt6i_dst.plen;
2200 	rtm->rtm_src_len = rt->rt6i_src.plen;
2201 	rtm->rtm_tos = 0;
2202 	if (rt->rt6i_table)
2203 		table = rt->rt6i_table->tb6_id;
2204 	else
2205 		table = RT6_TABLE_UNSPEC;
2206 	rtm->rtm_table = table;
2207 	NLA_PUT_U32(skb, RTA_TABLE, table);
2208 	if (rt->rt6i_flags&RTF_REJECT)
2209 		rtm->rtm_type = RTN_UNREACHABLE;
2210 	else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2211 		rtm->rtm_type = RTN_LOCAL;
2212 	else
2213 		rtm->rtm_type = RTN_UNICAST;
2214 	rtm->rtm_flags = 0;
2215 	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2216 	rtm->rtm_protocol = rt->rt6i_protocol;
2217 	if (rt->rt6i_flags&RTF_DYNAMIC)
2218 		rtm->rtm_protocol = RTPROT_REDIRECT;
2219 	else if (rt->rt6i_flags & RTF_ADDRCONF)
2220 		rtm->rtm_protocol = RTPROT_KERNEL;
2221 	else if (rt->rt6i_flags&RTF_DEFAULT)
2222 		rtm->rtm_protocol = RTPROT_RA;
2223 
2224 	if (rt->rt6i_flags&RTF_CACHE)
2225 		rtm->rtm_flags |= RTM_F_CLONED;
2226 
2227 	if (dst) {
2228 		NLA_PUT(skb, RTA_DST, 16, dst);
2229 		rtm->rtm_dst_len = 128;
2230 	} else if (rtm->rtm_dst_len)
2231 		NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2232 #ifdef CONFIG_IPV6_SUBTREES
2233 	if (src) {
2234 		NLA_PUT(skb, RTA_SRC, 16, src);
2235 		rtm->rtm_src_len = 128;
2236 	} else if (rtm->rtm_src_len)
2237 		NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2238 #endif
2239 	if (iif) {
2240 #ifdef CONFIG_IPV6_MROUTE
2241 		if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2242 			int err = ip6mr_get_route(net, skb, rtm, nowait);
2243 			if (err <= 0) {
2244 				if (!nowait) {
2245 					if (err == 0)
2246 						return 0;
2247 					goto nla_put_failure;
2248 				} else {
2249 					if (err == -EMSGSIZE)
2250 						goto nla_put_failure;
2251 				}
2252 			}
2253 		} else
2254 #endif
2255 			NLA_PUT_U32(skb, RTA_IIF, iif);
2256 	} else if (dst) {
2257 		struct inet6_dev *idev = ip6_dst_idev(&rt->u.dst);
2258 		struct in6_addr saddr_buf;
2259 		if (ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2260 				       dst, 0, &saddr_buf) == 0)
2261 			NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2262 	}
2263 
2264 	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2265 		goto nla_put_failure;
2266 
2267 	if (rt->u.dst.neighbour)
2268 		NLA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
2269 
2270 	if (rt->u.dst.dev)
2271 		NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2272 
2273 	NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2274 
2275 	if (!(rt->rt6i_flags & RTF_EXPIRES))
2276 		expires = 0;
2277 	else if (rt->rt6i_expires - jiffies < INT_MAX)
2278 		expires = rt->rt6i_expires - jiffies;
2279 	else
2280 		expires = INT_MAX;
2281 
2282 	if (rtnl_put_cacheinfo(skb, &rt->u.dst, 0, 0, 0,
2283 			       expires, rt->u.dst.error) < 0)
2284 		goto nla_put_failure;
2285 
2286 	return nlmsg_end(skb, nlh);
2287 
2288 nla_put_failure:
2289 	nlmsg_cancel(skb, nlh);
2290 	return -EMSGSIZE;
2291 }
2292 
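/*
 *	Per-route callback for RTM_GETROUTE dumps: honours the RTM_F_PREFIX
 *	filter from the request and emits the route via rt6_fill_node().
 */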
2293 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2294 {
2295 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2296 	int prefix;
2297 
2298 	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2299 		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2300 		prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2301 	} else
2302 		prefix = 0;
2303 
2304 	return rt6_fill_node(arg->net,
2305 		     arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2306 		     NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2307 		     prefix, 0, NLM_F_MULTI);
2308 }
2309 
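/*
 *	Handle a non-dump RTM_GETROUTE request: build a flow from the given
 *	attributes, resolve it with ip6_route_output() and unicast the
 *	resulting route back to the requester.
 */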
2310 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
2311 {
2312 	struct net *net = sock_net(in_skb->sk);
2313 	struct nlattr *tb[RTA_MAX+1];
2314 	struct rt6_info *rt;
2315 	struct sk_buff *skb;
2316 	struct rtmsg *rtm;
2317 	struct flowi fl;
2318 	int err, iif = 0;
2319 
2320 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2321 	if (err < 0)
2322 		goto errout;
2323 
2324 	err = -EINVAL;
2325 	memset(&fl, 0, sizeof(fl));
2326 
2327 	if (tb[RTA_SRC]) {
2328 		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2329 			goto errout;
2330 
2331 		ipv6_addr_copy(&fl.fl6_src, nla_data(tb[RTA_SRC]));
2332 	}
2333 
2334 	if (tb[RTA_DST]) {
2335 		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2336 			goto errout;
2337 
2338 		ipv6_addr_copy(&fl.fl6_dst, nla_data(tb[RTA_DST]));
2339 	}
2340 
2341 	if (tb[RTA_IIF])
2342 		iif = nla_get_u32(tb[RTA_IIF]);
2343 
2344 	if (tb[RTA_OIF])
2345 		fl.oif = nla_get_u32(tb[RTA_OIF]);
2346 
2347 	if (iif) {
2348 		struct net_device *dev;
2349 		dev = __dev_get_by_index(net, iif);
2350 		if (!dev) {
2351 			err = -ENODEV;
2352 			goto errout;
2353 		}
2354 	}
2355 
2356 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2357 	if (skb == NULL) {
2358 		err = -ENOBUFS;
2359 		goto errout;
2360 	}
2361 
2362 	/* Reserve room for dummy headers; this skb can pass
2363 	   through a good chunk of the routing engine.
2364 	 */
2365 	skb_reset_mac_header(skb);
2366 	skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2367 
2368 	rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl);
2369 	skb_dst_set(skb, &rt->u.dst);
2370 
2371 	err = rt6_fill_node(net, skb, rt, &fl.fl6_dst, &fl.fl6_src, iif,
2372 			    RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2373 			    nlh->nlmsg_seq, 0, 0, 0);
2374 	if (err < 0) {
2375 		kfree_skb(skb);
2376 		goto errout;
2377 	}
2378 
2379 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2380 errout:
2381 	return err;
2382 }
2383 
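/*
 *	Broadcast a route change (RTM_NEWROUTE/RTM_DELROUTE) to the
 *	RTNLGRP_IPV6_ROUTE netlink group.
 */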
2384 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2385 {
2386 	struct sk_buff *skb;
2387 	struct net *net = info->nl_net;
2388 	u32 seq;
2389 	int err;
2390 
2391 	err = -ENOBUFS;
2392 	seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2393 
2394 	skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2395 	if (skb == NULL)
2396 		goto errout;
2397 
2398 	err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2399 				event, info->pid, seq, 0, 0, 0);
2400 	if (err < 0) {
2401 		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2402 		WARN_ON(err == -EMSGSIZE);
2403 		kfree_skb(skb);
2404 		goto errout;
2405 	}
2406 	rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2407 		    info->nlh, gfp_any());
2408 	return;
2409 errout:
2410 	if (err < 0)
2411 		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2412 }
2413 
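/*
 *	Netdevice notifier: once the loopback device registers in a
 *	namespace, attach the special null/prohibit/blackhole routes to it.
 */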
2414 static int ip6_route_dev_notify(struct notifier_block *this,
2415 				unsigned long event, void *data)
2416 {
2417 	struct net_device *dev = (struct net_device *)data;
2418 	struct net *net = dev_net(dev);
2419 
2420 	if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2421 		net->ipv6.ip6_null_entry->u.dst.dev = dev;
2422 		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2423 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2424 		net->ipv6.ip6_prohibit_entry->u.dst.dev = dev;
2425 		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2426 		net->ipv6.ip6_blk_hole_entry->u.dst.dev = dev;
2427 		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2428 #endif
2429 	}
2430 
2431 	return NOTIFY_OK;
2432 }
2433 
2434 /*
2435  *	/proc
2436  */
2437 
2438 #ifdef CONFIG_PROC_FS
2439 
2440 #define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)
2441 
2442 struct rt6_proc_arg
2443 {
2444 	char *buffer;
2445 	int offset;
2446 	int length;
2447 	int skip;
2448 	int len;
2449 };
2450 
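/*
 *	One line of /proc/net/ipv6_route per route: destination/plen,
 *	source/plen, next hop, metric, refcount, use count, flags and device.
 */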
2451 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2452 {
2453 	struct seq_file *m = p_arg;
2454 
2455 	seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2456 
2457 #ifdef CONFIG_IPV6_SUBTREES
2458 	seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2459 #else
2460 	seq_puts(m, "00000000000000000000000000000000 00 ");
2461 #endif
2462 
2463 	if (rt->rt6i_nexthop) {
2464 		seq_printf(m, "%pi6", rt->rt6i_nexthop->primary_key);
2465 	} else {
2466 		seq_puts(m, "00000000000000000000000000000000");
2467 	}
2468 	seq_printf(m, " %08x %08x %08x %08x %8s\n",
2469 		   rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
2470 		   rt->u.dst.__use, rt->rt6i_flags,
2471 		   rt->rt6i_dev ? rt->rt6i_dev->name : "");
2472 	return 0;
2473 }
2474 
2475 static int ipv6_route_show(struct seq_file *m, void *v)
2476 {
2477 	struct net *net = (struct net *)m->private;
2478 	fib6_clean_all(net, rt6_info_route, 0, m);
2479 	return 0;
2480 }
2481 
2482 static int ipv6_route_open(struct inode *inode, struct file *file)
2483 {
2484 	return single_open_net(inode, file, ipv6_route_show);
2485 }
2486 
2487 static const struct file_operations ipv6_route_proc_fops = {
2488 	.owner		= THIS_MODULE,
2489 	.open		= ipv6_route_open,
2490 	.read		= seq_read,
2491 	.llseek		= seq_lseek,
2492 	.release	= single_release_net,
2493 };
2494 
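/*
 *	/proc/net/rt6_stats: FIB nodes, route nodes, route allocations,
 *	route entries, cached routes, dst entries in use and discarded routes.
 */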
2495 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2496 {
2497 	struct net *net = (struct net *)seq->private;
2498 	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2499 		   net->ipv6.rt6_stats->fib_nodes,
2500 		   net->ipv6.rt6_stats->fib_route_nodes,
2501 		   net->ipv6.rt6_stats->fib_rt_alloc,
2502 		   net->ipv6.rt6_stats->fib_rt_entries,
2503 		   net->ipv6.rt6_stats->fib_rt_cache,
2504 		   atomic_read(&net->ipv6.ip6_dst_ops.entries),
2505 		   net->ipv6.rt6_stats->fib_discarded_routes);
2506 
2507 	return 0;
2508 }
2509 
2510 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2511 {
2512 	return single_open_net(inode, file, rt6_stats_seq_show);
2513 }
2514 
2515 static const struct file_operations rt6_stats_seq_fops = {
2516 	.owner	 = THIS_MODULE,
2517 	.open	 = rt6_stats_seq_open,
2518 	.read	 = seq_read,
2519 	.llseek	 = seq_lseek,
2520 	.release = single_release_net,
2521 };
2522 #endif	/* CONFIG_PROC_FS */
2523 
2524 #ifdef CONFIG_SYSCTL
2525 
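/*
 *	Write-only handler for the "flush" sysctl: writing to it triggers a
 *	garbage-collection pass over the IPv6 routing tables.
 */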
2526 static
2527 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2528 			      void __user *buffer, size_t *lenp, loff_t *ppos)
2529 {
2530 	struct net *net = current->nsproxy->net_ns;
2531 	int delay = net->ipv6.sysctl.flush_delay;
2532 	if (write) {
2533 		proc_dointvec(ctl, write, buffer, lenp, ppos);
2534 		fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2535 		return 0;
2536 	} else
2537 		return -EINVAL;
2538 }
2539 
2540 ctl_table ipv6_route_table_template[] = {
2541 	{
2542 		.procname	=	"flush",
2543 		.data		=	&init_net.ipv6.sysctl.flush_delay,
2544 		.maxlen		=	sizeof(int),
2545 		.mode		=	0200,
2546 		.proc_handler	=	ipv6_sysctl_rtcache_flush
2547 	},
2548 	{
2549 		.ctl_name	=	NET_IPV6_ROUTE_GC_THRESH,
2550 		.procname	=	"gc_thresh",
2551 		.data		=	&ip6_dst_ops_template.gc_thresh,
2552 		.maxlen		=	sizeof(int),
2553 		.mode		=	0644,
2554 		.proc_handler	=	proc_dointvec,
2555 	},
2556 	{
2557 		.ctl_name	=	NET_IPV6_ROUTE_MAX_SIZE,
2558 		.procname	=	"max_size",
2559 		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
2560 		.maxlen		=	sizeof(int),
2561 		.mode		=	0644,
2562 		.proc_handler	=	proc_dointvec,
2563 	},
2564 	{
2565 		.ctl_name	=	NET_IPV6_ROUTE_GC_MIN_INTERVAL,
2566 		.procname	=	"gc_min_interval",
2567 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2568 		.maxlen		=	sizeof(int),
2569 		.mode		=	0644,
2570 		.proc_handler	=	proc_dointvec_jiffies,
2571 		.strategy	=	sysctl_jiffies,
2572 	},
2573 	{
2574 		.ctl_name	=	NET_IPV6_ROUTE_GC_TIMEOUT,
2575 		.procname	=	"gc_timeout",
2576 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2577 		.maxlen		=	sizeof(int),
2578 		.mode		=	0644,
2579 		.proc_handler	=	proc_dointvec_jiffies,
2580 		.strategy	=	sysctl_jiffies,
2581 	},
2582 	{
2583 		.ctl_name	=	NET_IPV6_ROUTE_GC_INTERVAL,
2584 		.procname	=	"gc_interval",
2585 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
2586 		.maxlen		=	sizeof(int),
2587 		.mode		=	0644,
2588 		.proc_handler	=	proc_dointvec_jiffies,
2589 		.strategy	=	sysctl_jiffies,
2590 	},
2591 	{
2592 		.ctl_name	=	NET_IPV6_ROUTE_GC_ELASTICITY,
2593 		.procname	=	"gc_elasticity",
2594 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2595 		.maxlen		=	sizeof(int),
2596 		.mode		=	0644,
2597 		.proc_handler	=	proc_dointvec_jiffies,
2598 		.strategy	=	sysctl_jiffies,
2599 	},
2600 	{
2601 		.ctl_name	=	NET_IPV6_ROUTE_MTU_EXPIRES,
2602 		.procname	=	"mtu_expires",
2603 		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2604 		.maxlen		=	sizeof(int),
2605 		.mode		=	0644,
2606 		.proc_handler	=	proc_dointvec_jiffies,
2607 		.strategy	=	sysctl_jiffies,
2608 	},
2609 	{
2610 		.ctl_name	=	NET_IPV6_ROUTE_MIN_ADVMSS,
2611 		.procname	=	"min_adv_mss",
2612 		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
2613 		.maxlen		=	sizeof(int),
2614 		.mode		=	0644,
2615 		.proc_handler	=	proc_dointvec_jiffies,
2616 		.strategy	=	sysctl_jiffies,
2617 	},
2618 	{
2619 		.ctl_name	=	NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,
2620 		.procname	=	"gc_min_interval_ms",
2621 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2622 		.maxlen		=	sizeof(int),
2623 		.mode		=	0644,
2624 		.proc_handler	=	proc_dointvec_ms_jiffies,
2625 		.strategy	=	sysctl_ms_jiffies,
2626 	},
2627 	{ .ctl_name = 0 }
2628 };
2629 
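/*
 *	Clone the sysctl template for a namespace and point each entry's
 *	.data at that namespace's variables; the indices here must stay in
 *	step with ipv6_route_table_template[] above.
 */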
2630 struct ctl_table *ipv6_route_sysctl_init(struct net *net)
2631 {
2632 	struct ctl_table *table;
2633 
2634 	table = kmemdup(ipv6_route_table_template,
2635 			sizeof(ipv6_route_table_template),
2636 			GFP_KERNEL);
2637 
2638 	if (table) {
2639 		table[0].data = &net->ipv6.sysctl.flush_delay;
2640 		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2641 		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2642 		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2643 		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2644 		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2645 		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2646 		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2647 		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2648 	}
2649 
2650 	return table;
2651 }
2652 #endif
2653 
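/*
 *	Per-namespace initialisation: copy the dst_ops template, clone the
 *	special null/prohibit/blackhole route entries, set the sysctl
 *	defaults and create the /proc/net files.
 */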
2654 static int ip6_route_net_init(struct net *net)
2655 {
2656 	int ret = -ENOMEM;
2657 
2658 	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2659 	       sizeof(net->ipv6.ip6_dst_ops));
2660 
2661 	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2662 					   sizeof(*net->ipv6.ip6_null_entry),
2663 					   GFP_KERNEL);
2664 	if (!net->ipv6.ip6_null_entry)
2665 		goto out_ip6_dst_ops;
2666 	net->ipv6.ip6_null_entry->u.dst.path =
2667 		(struct dst_entry *)net->ipv6.ip6_null_entry;
2668 	net->ipv6.ip6_null_entry->u.dst.ops = &net->ipv6.ip6_dst_ops;
2669 
2670 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2671 	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2672 					       sizeof(*net->ipv6.ip6_prohibit_entry),
2673 					       GFP_KERNEL);
2674 	if (!net->ipv6.ip6_prohibit_entry)
2675 		goto out_ip6_null_entry;
2676 	net->ipv6.ip6_prohibit_entry->u.dst.path =
2677 		(struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2678 	net->ipv6.ip6_prohibit_entry->u.dst.ops = &net->ipv6.ip6_dst_ops;
2679 
2680 	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2681 					       sizeof(*net->ipv6.ip6_blk_hole_entry),
2682 					       GFP_KERNEL);
2683 	if (!net->ipv6.ip6_blk_hole_entry)
2684 		goto out_ip6_prohibit_entry;
2685 	net->ipv6.ip6_blk_hole_entry->u.dst.path =
2686 		(struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2687 	net->ipv6.ip6_blk_hole_entry->u.dst.ops = &net->ipv6.ip6_dst_ops;
2688 #endif
2689 
2690 	net->ipv6.sysctl.flush_delay = 0;
2691 	net->ipv6.sysctl.ip6_rt_max_size = 4096;
2692 	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2693 	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2694 	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2695 	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2696 	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2697 	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2698 
2699 #ifdef CONFIG_PROC_FS
2700 	proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2701 	proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2702 #endif
2703 	net->ipv6.ip6_rt_gc_expire = 30*HZ;
2704 
2705 	ret = 0;
2706 out:
2707 	return ret;
2708 
2709 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2710 out_ip6_prohibit_entry:
2711 	kfree(net->ipv6.ip6_prohibit_entry);
2712 out_ip6_null_entry:
2713 	kfree(net->ipv6.ip6_null_entry);
2714 #endif
2715 out_ip6_dst_ops:
2716 	goto out;
2717 }
2718 
2719 static void ip6_route_net_exit(struct net *net)
2720 {
2721 #ifdef CONFIG_PROC_FS
2722 	proc_net_remove(net, "ipv6_route");
2723 	proc_net_remove(net, "rt6_stats");
2724 #endif
2725 	kfree(net->ipv6.ip6_null_entry);
2726 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2727 	kfree(net->ipv6.ip6_prohibit_entry);
2728 	kfree(net->ipv6.ip6_blk_hole_entry);
2729 #endif
2730 }
2731 
2732 static struct pernet_operations ip6_route_net_ops = {
2733 	.init = ip6_route_net_init,
2734 	.exit = ip6_route_net_exit,
2735 };
2736 
2737 static struct notifier_block ip6_route_dev_notifier = {
2738 	.notifier_call = ip6_route_dev_notify,
2739 	.priority = 0,
2740 };
2741 
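/*
 *	Subsystem init: create the rt6_info slab cache, register the pernet
 *	operations, bind init_net's special routes to its loopback device,
 *	bring up fib6, xfrm6 and fib6 rules, and register the rtnetlink
 *	handlers and the netdevice notifier.
 */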
2742 int __init ip6_route_init(void)
2743 {
2744 	int ret;
2745 
2746 	ret = -ENOMEM;
2747 	ip6_dst_ops_template.kmem_cachep =
2748 		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2749 				  SLAB_HWCACHE_ALIGN, NULL);
2750 	if (!ip6_dst_ops_template.kmem_cachep)
2751 		goto out;
2752 
2753 	ret = register_pernet_subsys(&ip6_route_net_ops);
2754 	if (ret)
2755 		goto out_kmem_cache;
2756 
2757 	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2758 
2759 	/* The loopback device is registered before this code runs, so the
2760 	 * loopback reference in rt6_info was not taken there; take it
2761 	 * manually for init_net. */
2762 	init_net.ipv6.ip6_null_entry->u.dst.dev = init_net.loopback_dev;
2763 	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2764 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2765 	init_net.ipv6.ip6_prohibit_entry->u.dst.dev = init_net.loopback_dev;
2766 	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2767 	init_net.ipv6.ip6_blk_hole_entry->u.dst.dev = init_net.loopback_dev;
2768 	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2769 #endif
2770 	ret = fib6_init();
2771 	if (ret)
2772 		goto out_register_subsys;
2773 
2774 	ret = xfrm6_init();
2775 	if (ret)
2776 		goto out_fib6_init;
2777 
2778 	ret = fib6_rules_init();
2779 	if (ret)
2780 		goto xfrm6_init;
2781 
2782 	ret = -ENOBUFS;
2783 	if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL) ||
2784 	    __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL) ||
2785 	    __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL))
2786 		goto fib6_rules_init;
2787 
2788 	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
2789 	if (ret)
2790 		goto fib6_rules_init;
2791 
2792 out:
2793 	return ret;
2794 
2795 fib6_rules_init:
2796 	fib6_rules_cleanup();
2797 xfrm6_init:
2798 	xfrm6_fini();
2799 out_fib6_init:
2800 	fib6_gc_cleanup();
2801 out_register_subsys:
2802 	unregister_pernet_subsys(&ip6_route_net_ops);
2803 out_kmem_cache:
2804 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2805 	goto out;
2806 }
2807 
2808 void ip6_route_cleanup(void)
2809 {
2810 	unregister_netdevice_notifier(&ip6_route_dev_notifier);
2811 	fib6_rules_cleanup();
2812 	xfrm6_fini();
2813 	fib6_gc_cleanup();
2814 	unregister_pernet_subsys(&ip6_route_net_ops);
2815 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2816 }
2817