xref: /openbmc/linux/net/ipv6/route.c (revision b6dcefde)
1 /*
2  *	Linux INET6 implementation
3  *	FIB front-end.
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13 
14 /*	Changes:
15  *
16  *	YOSHIFUJI Hideaki @USAGI
17  *		reworked default router selection.
18  *		- respect outgoing interface
19  *		- select from (probably) reachable routers (i.e.
20  *		routers in REACHABLE, STALE, DELAY or PROBE states).
21  *		- always select the same router if it is (probably)
22  *		reachable.  otherwise, round-robin the list.
23  *	Ville Nuorvala
24  *		Fixed routing subtrees.
25  */
26 
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/mroute6.h>
38 #include <linux/init.h>
39 #include <linux/if_arp.h>
40 #include <linux/proc_fs.h>
41 #include <linux/seq_file.h>
42 #include <linux/nsproxy.h>
43 #include <net/net_namespace.h>
44 #include <net/snmp.h>
45 #include <net/ipv6.h>
46 #include <net/ip6_fib.h>
47 #include <net/ip6_route.h>
48 #include <net/ndisc.h>
49 #include <net/addrconf.h>
50 #include <net/tcp.h>
51 #include <linux/rtnetlink.h>
52 #include <net/dst.h>
53 #include <net/xfrm.h>
54 #include <net/netevent.h>
55 #include <net/netlink.h>
56 
57 #include <asm/uaccess.h>
58 
59 #ifdef CONFIG_SYSCTL
60 #include <linux/sysctl.h>
61 #endif
62 
/*
 * Debug verbosity for this file.  With RT6_DEBUG at 3 or above, RDBG() and
 * RT6_TRACE() expand to printk() calls; below that they expand to nothing.
 */
#define RT6_DEBUG 2

#if RT6_DEBUG < 3
#define RDBG(x)
#define RT6_TRACE(x...) do { } while (0)
#else
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#endif

/*
 * Compile-time switch tested by ip6_pol_route(): when non-zero, a selected
 * route that already has a next hop is cloned with rt6_alloc_clone();
 * when zero, such a route is returned as it is.
 */
#define CLONE_OFFLINK_ROUTE 0
75 
76 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
77 static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
78 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
79 static void		ip6_dst_destroy(struct dst_entry *);
80 static void		ip6_dst_ifdown(struct dst_entry *,
81 				       struct net_device *dev, int how);
82 static int		 ip6_dst_gc(struct dst_ops *ops);
83 
84 static int		ip6_pkt_discard(struct sk_buff *skb);
85 static int		ip6_pkt_discard_out(struct sk_buff *skb);
86 static void		ip6_link_failure(struct sk_buff *skb);
87 static void		ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
88 
89 #ifdef CONFIG_IPV6_ROUTE_INFO
90 static struct rt6_info *rt6_add_route_info(struct net *net,
91 					   struct in6_addr *prefix, int prefixlen,
92 					   struct in6_addr *gwaddr, int ifindex,
93 					   unsigned pref);
94 static struct rt6_info *rt6_get_route_info(struct net *net,
95 					   struct in6_addr *prefix, int prefixlen,
96 					   struct in6_addr *gwaddr, int ifindex);
97 #endif
98 
99 static struct dst_ops ip6_dst_ops_template = {
100 	.family			=	AF_INET6,
101 	.protocol		=	cpu_to_be16(ETH_P_IPV6),
102 	.gc			=	ip6_dst_gc,
103 	.gc_thresh		=	1024,
104 	.check			=	ip6_dst_check,
105 	.destroy		=	ip6_dst_destroy,
106 	.ifdown			=	ip6_dst_ifdown,
107 	.negative_advice	=	ip6_negative_advice,
108 	.link_failure		=	ip6_link_failure,
109 	.update_pmtu		=	ip6_rt_update_pmtu,
110 	.local_out		=	__ip6_local_out,
111 	.entries		=	ATOMIC_INIT(0),
112 };
113 
114 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
115 {
116 }
117 
118 static struct dst_ops ip6_dst_blackhole_ops = {
119 	.family			=	AF_INET6,
120 	.protocol		=	cpu_to_be16(ETH_P_IPV6),
121 	.destroy		=	ip6_dst_destroy,
122 	.check			=	ip6_dst_check,
123 	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
124 	.entries		=	ATOMIC_INIT(0),
125 };
126 
127 static struct rt6_info ip6_null_entry_template = {
128 	.u = {
129 		.dst = {
130 			.__refcnt	= ATOMIC_INIT(1),
131 			.__use		= 1,
132 			.obsolete	= -1,
133 			.error		= -ENETUNREACH,
134 			.metrics	= { [RTAX_HOPLIMIT - 1] = 255, },
135 			.input		= ip6_pkt_discard,
136 			.output		= ip6_pkt_discard_out,
137 		}
138 	},
139 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
140 	.rt6i_protocol  = RTPROT_KERNEL,
141 	.rt6i_metric	= ~(u32) 0,
142 	.rt6i_ref	= ATOMIC_INIT(1),
143 };
144 
145 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
146 
147 static int ip6_pkt_prohibit(struct sk_buff *skb);
148 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
149 
150 static struct rt6_info ip6_prohibit_entry_template = {
151 	.u = {
152 		.dst = {
153 			.__refcnt	= ATOMIC_INIT(1),
154 			.__use		= 1,
155 			.obsolete	= -1,
156 			.error		= -EACCES,
157 			.metrics	= { [RTAX_HOPLIMIT - 1] = 255, },
158 			.input		= ip6_pkt_prohibit,
159 			.output		= ip6_pkt_prohibit_out,
160 		}
161 	},
162 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
163 	.rt6i_protocol  = RTPROT_KERNEL,
164 	.rt6i_metric	= ~(u32) 0,
165 	.rt6i_ref	= ATOMIC_INIT(1),
166 };
167 
168 static struct rt6_info ip6_blk_hole_entry_template = {
169 	.u = {
170 		.dst = {
171 			.__refcnt	= ATOMIC_INIT(1),
172 			.__use		= 1,
173 			.obsolete	= -1,
174 			.error		= -EINVAL,
175 			.metrics	= { [RTAX_HOPLIMIT - 1] = 255, },
176 			.input		= dst_discard,
177 			.output		= dst_discard,
178 		}
179 	},
180 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
181 	.rt6i_protocol  = RTPROT_KERNEL,
182 	.rt6i_metric	= ~(u32) 0,
183 	.rt6i_ref	= ATOMIC_INIT(1),
184 };
185 
186 #endif
187 
/*
 * Allocate a dst entry from @ops and hand it back typed as an rt6_info.
 * Returns whatever dst_alloc() returned, so callers must check for NULL.
 */
static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops)
{
	struct rt6_info *rt;

	rt = (struct rt6_info *)dst_alloc(ops);
	return rt;
}
193 
194 static void ip6_dst_destroy(struct dst_entry *dst)
195 {
196 	struct rt6_info *rt = (struct rt6_info *)dst;
197 	struct inet6_dev *idev = rt->rt6i_idev;
198 
199 	if (idev != NULL) {
200 		rt->rt6i_idev = NULL;
201 		in6_dev_put(idev);
202 	}
203 }
204 
205 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
206 			   int how)
207 {
208 	struct rt6_info *rt = (struct rt6_info *)dst;
209 	struct inet6_dev *idev = rt->rt6i_idev;
210 	struct net_device *loopback_dev =
211 		dev_net(dev)->loopback_dev;
212 
213 	if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
214 		struct inet6_dev *loopback_idev =
215 			in6_dev_get(loopback_dev);
216 		if (loopback_idev != NULL) {
217 			rt->rt6i_idev = loopback_idev;
218 			in6_dev_put(idev);
219 		}
220 	}
221 }
222 
223 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
224 {
225 	return (rt->rt6i_flags & RTF_EXPIRES &&
226 		time_after(jiffies, rt->rt6i_expires));
227 }
228 
229 static inline int rt6_need_strict(struct in6_addr *daddr)
230 {
231 	return (ipv6_addr_type(daddr) &
232 		(IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK));
233 }
234 
235 /*
236  *	Route lookup. Any table->tb6_lock is implied.
237  */
238 
239 static inline struct rt6_info *rt6_device_match(struct net *net,
240 						    struct rt6_info *rt,
241 						    struct in6_addr *saddr,
242 						    int oif,
243 						    int flags)
244 {
245 	struct rt6_info *local = NULL;
246 	struct rt6_info *sprt;
247 
248 	if (!oif && ipv6_addr_any(saddr))
249 		goto out;
250 
251 	for (sprt = rt; sprt; sprt = sprt->u.dst.rt6_next) {
252 		struct net_device *dev = sprt->rt6i_dev;
253 
254 		if (oif) {
255 			if (dev->ifindex == oif)
256 				return sprt;
257 			if (dev->flags & IFF_LOOPBACK) {
258 				if (sprt->rt6i_idev == NULL ||
259 				    sprt->rt6i_idev->dev->ifindex != oif) {
260 					if (flags & RT6_LOOKUP_F_IFACE && oif)
261 						continue;
262 					if (local && (!oif ||
263 						      local->rt6i_idev->dev->ifindex == oif))
264 						continue;
265 				}
266 				local = sprt;
267 			}
268 		} else {
269 			if (ipv6_chk_addr(net, saddr, dev,
270 					  flags & RT6_LOOKUP_F_IFACE))
271 				return sprt;
272 		}
273 	}
274 
275 	if (oif) {
276 		if (local)
277 			return local;
278 
279 		if (flags & RT6_LOOKUP_F_IFACE)
280 			return net->ipv6.ip6_null_entry;
281 	}
282 out:
283 	return rt;
284 }
285 
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing: send a Neighbour Solicitation for the next
 * hop of @rt when its neighbour entry is not in a NUD_VALID state and the
 * last update is older than the interface's rtr_probe_interval.
 *
 * @rt may be NULL, and may have no next hop; both cases are ignored.
 *
 * neigh->updated is used as the rate-limit timestamp, so the check and the
 * update are done under the neighbour's write lock.  Holding only the read
 * lock would let concurrent callers all pass the check and each send a
 * probe.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!neigh || (neigh->nud_state & NUD_VALID))
		return;
	write_lock_bh(&neigh->lock);
	if (!(neigh->nud_state & NUD_VALID) &&
	    time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
		struct in6_addr mcaddr;
		struct in6_addr *target;

		neigh->updated = jiffies;
		/* drop the lock before transmitting */
		write_unlock_bh(&neigh->lock);

		target = (struct in6_addr *)&neigh->primary_key;
		addrconf_addr_solict_mult(target, &mcaddr);
		ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
	} else
		write_unlock_bh(&neigh->lock);
}
#else
/* Router preference support is compiled out: probing is a no-op. */
static inline void rt6_probe(struct rt6_info *rt)
{
	return;
}
#endif
321 
322 /*
323  * Default Router Selection (RFC 2461 6.3.6)
324  */
325 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
326 {
327 	struct net_device *dev = rt->rt6i_dev;
328 	if (!oif || dev->ifindex == oif)
329 		return 2;
330 	if ((dev->flags & IFF_LOOPBACK) &&
331 	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
332 		return 1;
333 	return 0;
334 }
335 
336 static inline int rt6_check_neigh(struct rt6_info *rt)
337 {
338 	struct neighbour *neigh = rt->rt6i_nexthop;
339 	int m;
340 	if (rt->rt6i_flags & RTF_NONEXTHOP ||
341 	    !(rt->rt6i_flags & RTF_GATEWAY))
342 		m = 1;
343 	else if (neigh) {
344 		read_lock_bh(&neigh->lock);
345 		if (neigh->nud_state & NUD_VALID)
346 			m = 2;
347 #ifdef CONFIG_IPV6_ROUTER_PREF
348 		else if (neigh->nud_state & NUD_FAILED)
349 			m = 0;
350 #endif
351 		else
352 			m = 1;
353 		read_unlock_bh(&neigh->lock);
354 	} else
355 		m = 0;
356 	return m;
357 }
358 
359 static int rt6_score_route(struct rt6_info *rt, int oif,
360 			   int strict)
361 {
362 	int m, n;
363 
364 	m = rt6_check_dev(rt, oif);
365 	if (!m && (strict & RT6_LOOKUP_F_IFACE))
366 		return -1;
367 #ifdef CONFIG_IPV6_ROUTER_PREF
368 	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
369 #endif
370 	n = rt6_check_neigh(rt);
371 	if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
372 		return -1;
373 	return m;
374 }
375 
376 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
377 				   int *mpri, struct rt6_info *match)
378 {
379 	int m;
380 
381 	if (rt6_check_expired(rt))
382 		goto out;
383 
384 	m = rt6_score_route(rt, oif, strict);
385 	if (m < 0)
386 		goto out;
387 
388 	if (m > *mpri) {
389 		if (strict & RT6_LOOKUP_F_REACHABLE)
390 			rt6_probe(match);
391 		*mpri = m;
392 		match = rt;
393 	} else if (strict & RT6_LOOKUP_F_REACHABLE) {
394 		rt6_probe(rt);
395 	}
396 
397 out:
398 	return match;
399 }
400 
401 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
402 				     struct rt6_info *rr_head,
403 				     u32 metric, int oif, int strict)
404 {
405 	struct rt6_info *rt, *match;
406 	int mpri = -1;
407 
408 	match = NULL;
409 	for (rt = rr_head; rt && rt->rt6i_metric == metric;
410 	     rt = rt->u.dst.rt6_next)
411 		match = find_match(rt, oif, strict, &mpri, match);
412 	for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
413 	     rt = rt->u.dst.rt6_next)
414 		match = find_match(rt, oif, strict, &mpri, match);
415 
416 	return match;
417 }
418 
419 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
420 {
421 	struct rt6_info *match, *rt0;
422 	struct net *net;
423 
424 	RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
425 		  __func__, fn->leaf, oif);
426 
427 	rt0 = fn->rr_ptr;
428 	if (!rt0)
429 		fn->rr_ptr = rt0 = fn->leaf;
430 
431 	match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
432 
433 	if (!match &&
434 	    (strict & RT6_LOOKUP_F_REACHABLE)) {
435 		struct rt6_info *next = rt0->u.dst.rt6_next;
436 
437 		/* no entries matched; do round-robin */
438 		if (!next || next->rt6i_metric != rt0->rt6i_metric)
439 			next = fn->leaf;
440 
441 		if (next != rt0)
442 			fn->rr_ptr = next;
443 	}
444 
445 	RT6_TRACE("%s() => %p\n",
446 		  __func__, match);
447 
448 	net = dev_net(rt0->rt6i_dev);
449 	return (match ? match : net->ipv6.ip6_null_entry);
450 }
451 
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Handle one route information option received on @dev.
 *
 * @dev:    receiving device
 * @opt:    start of the option, interpreted as a struct route_info
 * @len:    number of bytes available at @opt
 * @gwaddr: address of the advertising router, used as the gateway
 *
 * After validating the option, the matching route-info route is looked up.
 * A zero lifetime deletes it; otherwise it is created if missing, or its
 * preference bits are refreshed, and its expiry is set from the lifetime.
 *
 * Returns 0 on success and -EINVAL for a malformed option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  struct in6_addr *gwaddr)
{
	struct route_info *rinfo = (struct route_info *) opt;
	struct net *net = dev_net(dev);
	struct in6_addr masked_prefix;
	struct in6_addr *prefix;
	struct rt6_info *rt;
	unsigned long lifetime;
	unsigned int pref;

	if (len < sizeof(struct route_info))
		return -EINVAL;

	/*
	 * Sanity check for prefix_len and length: a prefix longer than 64
	 * bits needs length >= 2, any non-empty prefix needs length >= 1.
	 */
	if (rinfo->length > 3 || rinfo->prefix_len > 128)
		return -EINVAL;
	if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2)
			return -EINVAL;
	} else if (rinfo->prefix_len > 0 && rinfo->length < 1) {
		return -EINVAL;
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3) {
		/* option carries the complete address; use it in place */
		prefix = (struct in6_addr *)rinfo->prefix;
	} else {
		/* this function is safe */
		ipv6_addr_prefix(&masked_prefix,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &masked_prefix;
	}

	rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
				dev->ifindex);

	/* zero lifetime withdraws an existing route */
	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (rt) {
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
	} else if (lifetime) {
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
					pref);
	}

	if (!rt)
		return 0;

	if (addrconf_finite_timeout(lifetime)) {
		rt->rt6i_expires = jiffies + HZ * lifetime;
		rt->rt6i_flags |= RTF_EXPIRES;
	} else {
		rt->rt6i_flags &= ~RTF_EXPIRES;
	}
	dst_release(&rt->u.dst);

	return 0;
}
#endif
525 
/*
 * BACKTRACK(net, saddr) - climb the fib tree after a failed selection.
 *
 * Must be expanded inside a function that has local variables "rt" and
 * "fn" and labels "restart" and "out".  When rt is the namespace's null
 * entry, the macro walks from fn towards the root: it jumps to "out" on
 * reaching a node flagged RTN_TL_ROOT, descends into a parent's source
 * subtree (looked up with saddr) when there is one other than fn itself,
 * and jumps to "restart" as soon as fn is a node flagged RTN_RTINFO.
 * When rt is anything else the macro does nothing.
 *
 * Both arguments are parenthesised in the expansion so that arbitrary
 * expressions can be passed safely.
 */
#define BACKTRACK(__net, saddr)			\
do { \
	if (rt == (__net)->ipv6.ip6_null_entry) {	\
		struct fib6_node *pn; \
		while (1) { \
			if (fn->fn_flags & RTN_TL_ROOT) \
				goto out; \
			pn = fn->parent; \
			if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
				fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, (saddr)); \
			else \
				fn = pn; \
			if (fn->fn_flags & RTN_RTINFO) \
				goto restart; \
		} \
	} \
} while(0)
543 
544 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
545 					     struct fib6_table *table,
546 					     struct flowi *fl, int flags)
547 {
548 	struct fib6_node *fn;
549 	struct rt6_info *rt;
550 
551 	read_lock_bh(&table->tb6_lock);
552 	fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
553 restart:
554 	rt = fn->leaf;
555 	rt = rt6_device_match(net, rt, &fl->fl6_src, fl->oif, flags);
556 	BACKTRACK(net, &fl->fl6_src);
557 out:
558 	dst_use(&rt->u.dst, jiffies);
559 	read_unlock_bh(&table->tb6_lock);
560 	return rt;
561 
562 }
563 
564 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
565 			    const struct in6_addr *saddr, int oif, int strict)
566 {
567 	struct flowi fl = {
568 		.oif = oif,
569 		.nl_u = {
570 			.ip6_u = {
571 				.daddr = *daddr,
572 			},
573 		},
574 	};
575 	struct dst_entry *dst;
576 	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
577 
578 	if (saddr) {
579 		memcpy(&fl.fl6_src, saddr, sizeof(*saddr));
580 		flags |= RT6_LOOKUP_F_HAS_SADDR;
581 	}
582 
583 	dst = fib6_rule_lookup(net, &fl, flags, ip6_pol_route_lookup);
584 	if (dst->error == 0)
585 		return (struct rt6_info *) dst;
586 
587 	dst_release(dst);
588 
589 	return NULL;
590 }
591 
592 EXPORT_SYMBOL(rt6_lookup);
593 
594 /* ip6_ins_rt is called with FREE table->tb6_lock.
595    It takes new route entry, the addition fails by any reason the
596    route is freed. In any case, if caller does not hold it, it may
597    be destroyed.
598  */
599 
600 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
601 {
602 	int err;
603 	struct fib6_table *table;
604 
605 	table = rt->rt6i_table;
606 	write_lock_bh(&table->tb6_lock);
607 	err = fib6_add(&table->tb6_root, rt, info);
608 	write_unlock_bh(&table->tb6_lock);
609 
610 	return err;
611 }
612 
613 int ip6_ins_rt(struct rt6_info *rt)
614 {
615 	struct nl_info info = {
616 		.nl_net = dev_net(rt->rt6i_dev),
617 	};
618 	return __ip6_ins_rt(rt, &info);
619 }
620 
621 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
622 				      struct in6_addr *saddr)
623 {
624 	struct rt6_info *rt;
625 
626 	/*
627 	 *	Clone the route.
628 	 */
629 
630 	rt = ip6_rt_copy(ort);
631 
632 	if (rt) {
633 		struct neighbour *neigh;
634 		int attempts = !in_softirq();
635 
636 		if (!(rt->rt6i_flags&RTF_GATEWAY)) {
637 			if (rt->rt6i_dst.plen != 128 &&
638 			    ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
639 				rt->rt6i_flags |= RTF_ANYCAST;
640 			ipv6_addr_copy(&rt->rt6i_gateway, daddr);
641 		}
642 
643 		ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
644 		rt->rt6i_dst.plen = 128;
645 		rt->rt6i_flags |= RTF_CACHE;
646 		rt->u.dst.flags |= DST_HOST;
647 
648 #ifdef CONFIG_IPV6_SUBTREES
649 		if (rt->rt6i_src.plen && saddr) {
650 			ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
651 			rt->rt6i_src.plen = 128;
652 		}
653 #endif
654 
655 	retry:
656 		neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
657 		if (IS_ERR(neigh)) {
658 			struct net *net = dev_net(rt->rt6i_dev);
659 			int saved_rt_min_interval =
660 				net->ipv6.sysctl.ip6_rt_gc_min_interval;
661 			int saved_rt_elasticity =
662 				net->ipv6.sysctl.ip6_rt_gc_elasticity;
663 
664 			if (attempts-- > 0) {
665 				net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
666 				net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
667 
668 				ip6_dst_gc(&net->ipv6.ip6_dst_ops);
669 
670 				net->ipv6.sysctl.ip6_rt_gc_elasticity =
671 					saved_rt_elasticity;
672 				net->ipv6.sysctl.ip6_rt_gc_min_interval =
673 					saved_rt_min_interval;
674 				goto retry;
675 			}
676 
677 			if (net_ratelimit())
678 				printk(KERN_WARNING
679 				       "Neighbour table overflow.\n");
680 			dst_free(&rt->u.dst);
681 			return NULL;
682 		}
683 		rt->rt6i_nexthop = neigh;
684 
685 	}
686 
687 	return rt;
688 }
689 
690 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
691 {
692 	struct rt6_info *rt = ip6_rt_copy(ort);
693 	if (rt) {
694 		ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
695 		rt->rt6i_dst.plen = 128;
696 		rt->rt6i_flags |= RTF_CACHE;
697 		rt->u.dst.flags |= DST_HOST;
698 		rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
699 	}
700 	return rt;
701 }
702 
703 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
704 				      struct flowi *fl, int flags)
705 {
706 	struct fib6_node *fn;
707 	struct rt6_info *rt, *nrt;
708 	int strict = 0;
709 	int attempts = 3;
710 	int err;
711 	int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
712 
713 	strict |= flags & RT6_LOOKUP_F_IFACE;
714 
715 relookup:
716 	read_lock_bh(&table->tb6_lock);
717 
718 restart_2:
719 	fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
720 
721 restart:
722 	rt = rt6_select(fn, oif, strict | reachable);
723 
724 	BACKTRACK(net, &fl->fl6_src);
725 	if (rt == net->ipv6.ip6_null_entry ||
726 	    rt->rt6i_flags & RTF_CACHE)
727 		goto out;
728 
729 	dst_hold(&rt->u.dst);
730 	read_unlock_bh(&table->tb6_lock);
731 
732 	if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
733 		nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
734 	else {
735 #if CLONE_OFFLINK_ROUTE
736 		nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
737 #else
738 		goto out2;
739 #endif
740 	}
741 
742 	dst_release(&rt->u.dst);
743 	rt = nrt ? : net->ipv6.ip6_null_entry;
744 
745 	dst_hold(&rt->u.dst);
746 	if (nrt) {
747 		err = ip6_ins_rt(nrt);
748 		if (!err)
749 			goto out2;
750 	}
751 
752 	if (--attempts <= 0)
753 		goto out2;
754 
755 	/*
756 	 * Race condition! In the gap, when table->tb6_lock was
757 	 * released someone could insert this route.  Relookup.
758 	 */
759 	dst_release(&rt->u.dst);
760 	goto relookup;
761 
762 out:
763 	if (reachable) {
764 		reachable = 0;
765 		goto restart_2;
766 	}
767 	dst_hold(&rt->u.dst);
768 	read_unlock_bh(&table->tb6_lock);
769 out2:
770 	rt->u.dst.lastuse = jiffies;
771 	rt->u.dst.__use++;
772 
773 	return rt;
774 }
775 
776 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
777 					    struct flowi *fl, int flags)
778 {
779 	return ip6_pol_route(net, table, fl->iif, fl, flags);
780 }
781 
782 void ip6_route_input(struct sk_buff *skb)
783 {
784 	struct ipv6hdr *iph = ipv6_hdr(skb);
785 	struct net *net = dev_net(skb->dev);
786 	int flags = RT6_LOOKUP_F_HAS_SADDR;
787 	struct flowi fl = {
788 		.iif = skb->dev->ifindex,
789 		.nl_u = {
790 			.ip6_u = {
791 				.daddr = iph->daddr,
792 				.saddr = iph->saddr,
793 				.flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
794 			},
795 		},
796 		.mark = skb->mark,
797 		.proto = iph->nexthdr,
798 	};
799 
800 	if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
801 		flags |= RT6_LOOKUP_F_IFACE;
802 
803 	skb_dst_set(skb, fib6_rule_lookup(net, &fl, flags, ip6_pol_route_input));
804 }
805 
806 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
807 					     struct flowi *fl, int flags)
808 {
809 	return ip6_pol_route(net, table, fl->oif, fl, flags);
810 }
811 
812 struct dst_entry * ip6_route_output(struct net *net, struct sock *sk,
813 				    struct flowi *fl)
814 {
815 	int flags = 0;
816 
817 	if (rt6_need_strict(&fl->fl6_dst))
818 		flags |= RT6_LOOKUP_F_IFACE;
819 
820 	if (!ipv6_addr_any(&fl->fl6_src))
821 		flags |= RT6_LOOKUP_F_HAS_SADDR;
822 	else if (sk) {
823 		unsigned int prefs = inet6_sk(sk)->srcprefs;
824 		if (prefs & IPV6_PREFER_SRC_TMP)
825 			flags |= RT6_LOOKUP_F_SRCPREF_TMP;
826 		if (prefs & IPV6_PREFER_SRC_PUBLIC)
827 			flags |= RT6_LOOKUP_F_SRCPREF_PUBLIC;
828 		if (prefs & IPV6_PREFER_SRC_COA)
829 			flags |= RT6_LOOKUP_F_SRCPREF_COA;
830 	}
831 
832 	return fib6_rule_lookup(net, fl, flags, ip6_pol_route_output);
833 }
834 
835 EXPORT_SYMBOL(ip6_route_output);
836 
837 int ip6_dst_blackhole(struct sock *sk, struct dst_entry **dstp, struct flowi *fl)
838 {
839 	struct rt6_info *ort = (struct rt6_info *) *dstp;
840 	struct rt6_info *rt = (struct rt6_info *)
841 		dst_alloc(&ip6_dst_blackhole_ops);
842 	struct dst_entry *new = NULL;
843 
844 	if (rt) {
845 		new = &rt->u.dst;
846 
847 		atomic_set(&new->__refcnt, 1);
848 		new->__use = 1;
849 		new->input = dst_discard;
850 		new->output = dst_discard;
851 
852 		memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
853 		new->dev = ort->u.dst.dev;
854 		if (new->dev)
855 			dev_hold(new->dev);
856 		rt->rt6i_idev = ort->rt6i_idev;
857 		if (rt->rt6i_idev)
858 			in6_dev_hold(rt->rt6i_idev);
859 		rt->rt6i_expires = 0;
860 
861 		ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
862 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
863 		rt->rt6i_metric = 0;
864 
865 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
866 #ifdef CONFIG_IPV6_SUBTREES
867 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
868 #endif
869 
870 		dst_free(new);
871 	}
872 
873 	dst_release(*dstp);
874 	*dstp = new;
875 	return (new ? 0 : -ENOMEM);
876 }
877 EXPORT_SYMBOL_GPL(ip6_dst_blackhole);
878 
879 /*
880  *	Destination cache support functions
881  */
882 
883 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
884 {
885 	struct rt6_info *rt;
886 
887 	rt = (struct rt6_info *) dst;
888 
889 	if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
890 		return dst;
891 
892 	return NULL;
893 }
894 
895 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
896 {
897 	struct rt6_info *rt = (struct rt6_info *) dst;
898 
899 	if (rt) {
900 		if (rt->rt6i_flags & RTF_CACHE)
901 			ip6_del_rt(rt);
902 		else
903 			dst_release(dst);
904 	}
905 	return NULL;
906 }
907 
908 static void ip6_link_failure(struct sk_buff *skb)
909 {
910 	struct rt6_info *rt;
911 
912 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
913 
914 	rt = (struct rt6_info *) skb_dst(skb);
915 	if (rt) {
916 		if (rt->rt6i_flags&RTF_CACHE) {
917 			dst_set_expires(&rt->u.dst, 0);
918 			rt->rt6i_flags |= RTF_EXPIRES;
919 		} else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
920 			rt->rt6i_node->fn_sernum = -1;
921 	}
922 }
923 
924 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
925 {
926 	struct rt6_info *rt6 = (struct rt6_info*)dst;
927 
928 	if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
929 		rt6->rt6i_flags |= RTF_MODIFIED;
930 		if (mtu < IPV6_MIN_MTU) {
931 			mtu = IPV6_MIN_MTU;
932 			dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
933 		}
934 		dst->metrics[RTAX_MTU-1] = mtu;
935 		call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
936 	}
937 }
938 
939 static int ipv6_get_mtu(struct net_device *dev);
940 
941 static inline unsigned int ipv6_advmss(struct net *net, unsigned int mtu)
942 {
943 	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
944 
945 	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
946 		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
947 
948 	/*
949 	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
950 	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
951 	 * IPV6_MAXPLEN is also valid and means: "any MSS,
952 	 * rely only on pmtu discovery"
953 	 */
954 	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
955 		mtu = IPV6_MAXPLEN;
956 	return mtu;
957 }
958 
959 static struct dst_entry *icmp6_dst_gc_list;
960 static DEFINE_SPINLOCK(icmp6_dst_lock);
961 
962 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
963 				  struct neighbour *neigh,
964 				  const struct in6_addr *addr)
965 {
966 	struct rt6_info *rt;
967 	struct inet6_dev *idev = in6_dev_get(dev);
968 	struct net *net = dev_net(dev);
969 
970 	if (unlikely(idev == NULL))
971 		return NULL;
972 
973 	rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
974 	if (unlikely(rt == NULL)) {
975 		in6_dev_put(idev);
976 		goto out;
977 	}
978 
979 	dev_hold(dev);
980 	if (neigh)
981 		neigh_hold(neigh);
982 	else {
983 		neigh = ndisc_get_neigh(dev, addr);
984 		if (IS_ERR(neigh))
985 			neigh = NULL;
986 	}
987 
988 	rt->rt6i_dev	  = dev;
989 	rt->rt6i_idev     = idev;
990 	rt->rt6i_nexthop  = neigh;
991 	atomic_set(&rt->u.dst.__refcnt, 1);
992 	rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
993 	rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
994 	rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->u.dst));
995 	rt->u.dst.output  = ip6_output;
996 
997 #if 0	/* there's no chance to use these for ndisc */
998 	rt->u.dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
999 				? DST_HOST
1000 				: 0;
1001 	ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1002 	rt->rt6i_dst.plen = 128;
1003 #endif
1004 
1005 	spin_lock_bh(&icmp6_dst_lock);
1006 	rt->u.dst.next = icmp6_dst_gc_list;
1007 	icmp6_dst_gc_list = &rt->u.dst;
1008 	spin_unlock_bh(&icmp6_dst_lock);
1009 
1010 	fib6_force_start_gc(net);
1011 
1012 out:
1013 	return &rt->u.dst;
1014 }
1015 
1016 int icmp6_dst_gc(void)
1017 {
1018 	struct dst_entry *dst, *next, **pprev;
1019 	int more = 0;
1020 
1021 	next = NULL;
1022 
1023 	spin_lock_bh(&icmp6_dst_lock);
1024 	pprev = &icmp6_dst_gc_list;
1025 
1026 	while ((dst = *pprev) != NULL) {
1027 		if (!atomic_read(&dst->__refcnt)) {
1028 			*pprev = dst->next;
1029 			dst_free(dst);
1030 		} else {
1031 			pprev = &dst->next;
1032 			++more;
1033 		}
1034 	}
1035 
1036 	spin_unlock_bh(&icmp6_dst_lock);
1037 
1038 	return more;
1039 }
1040 
1041 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1042 			    void *arg)
1043 {
1044 	struct dst_entry *dst, **pprev;
1045 
1046 	spin_lock_bh(&icmp6_dst_lock);
1047 	pprev = &icmp6_dst_gc_list;
1048 	while ((dst = *pprev) != NULL) {
1049 		struct rt6_info *rt = (struct rt6_info *) dst;
1050 		if (func(rt, arg)) {
1051 			*pprev = dst->next;
1052 			dst_free(dst);
1053 		} else {
1054 			pprev = &dst->next;
1055 		}
1056 	}
1057 	spin_unlock_bh(&icmp6_dst_lock);
1058 }
1059 
1060 static int ip6_dst_gc(struct dst_ops *ops)
1061 {
1062 	unsigned long now = jiffies;
1063 	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1064 	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1065 	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1066 	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1067 	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1068 	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1069 
1070 	if (time_after(rt_last_gc + rt_min_interval, now) &&
1071 	    atomic_read(&ops->entries) <= rt_max_size)
1072 		goto out;
1073 
1074 	net->ipv6.ip6_rt_gc_expire++;
1075 	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1076 	net->ipv6.ip6_rt_last_gc = now;
1077 	if (atomic_read(&ops->entries) < ops->gc_thresh)
1078 		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1079 out:
1080 	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1081 	return (atomic_read(&ops->entries) > rt_max_size);
1082 }
1083 
1084 /* Clean host part of a prefix. Not necessary in radix tree,
1085    but results in cleaner routing tables.
1086 
1087    Remove it only when all the things will work!
1088  */
1089 
1090 static int ipv6_get_mtu(struct net_device *dev)
1091 {
1092 	int mtu = IPV6_MIN_MTU;
1093 	struct inet6_dev *idev;
1094 
1095 	idev = in6_dev_get(dev);
1096 	if (idev) {
1097 		mtu = idev->cnf.mtu6;
1098 		in6_dev_put(idev);
1099 	}
1100 	return mtu;
1101 }
1102 
1103 int ip6_dst_hoplimit(struct dst_entry *dst)
1104 {
1105 	int hoplimit = dst_metric(dst, RTAX_HOPLIMIT);
1106 	if (hoplimit < 0) {
1107 		struct net_device *dev = dst->dev;
1108 		struct inet6_dev *idev = in6_dev_get(dev);
1109 		if (idev) {
1110 			hoplimit = idev->cnf.hop_limit;
1111 			in6_dev_put(idev);
1112 		} else
1113 			hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1114 	}
1115 	return hoplimit;
1116 }
1117 
1118 /*
1119  *
1120  */
1121 
1122 int ip6_route_add(struct fib6_config *cfg)
1123 {
1124 	int err;
1125 	struct net *net = cfg->fc_nlinfo.nl_net;
1126 	struct rt6_info *rt = NULL;
1127 	struct net_device *dev = NULL;
1128 	struct inet6_dev *idev = NULL;
1129 	struct fib6_table *table;
1130 	int addr_type;
1131 
1132 	if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1133 		return -EINVAL;
1134 #ifndef CONFIG_IPV6_SUBTREES
1135 	if (cfg->fc_src_len)
1136 		return -EINVAL;
1137 #endif
1138 	if (cfg->fc_ifindex) {
1139 		err = -ENODEV;
1140 		dev = dev_get_by_index(net, cfg->fc_ifindex);
1141 		if (!dev)
1142 			goto out;
1143 		idev = in6_dev_get(dev);
1144 		if (!idev)
1145 			goto out;
1146 	}
1147 
1148 	if (cfg->fc_metric == 0)
1149 		cfg->fc_metric = IP6_RT_PRIO_USER;
1150 
1151 	table = fib6_new_table(net, cfg->fc_table);
1152 	if (table == NULL) {
1153 		err = -ENOBUFS;
1154 		goto out;
1155 	}
1156 
1157 	rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1158 
1159 	if (rt == NULL) {
1160 		err = -ENOMEM;
1161 		goto out;
1162 	}
1163 
1164 	rt->u.dst.obsolete = -1;
1165 	rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
1166 				jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1167 				0;
1168 
1169 	if (cfg->fc_protocol == RTPROT_UNSPEC)
1170 		cfg->fc_protocol = RTPROT_BOOT;
1171 	rt->rt6i_protocol = cfg->fc_protocol;
1172 
1173 	addr_type = ipv6_addr_type(&cfg->fc_dst);
1174 
1175 	if (addr_type & IPV6_ADDR_MULTICAST)
1176 		rt->u.dst.input = ip6_mc_input;
1177 	else
1178 		rt->u.dst.input = ip6_forward;
1179 
1180 	rt->u.dst.output = ip6_output;
1181 
1182 	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1183 	rt->rt6i_dst.plen = cfg->fc_dst_len;
1184 	if (rt->rt6i_dst.plen == 128)
1185 	       rt->u.dst.flags = DST_HOST;
1186 
1187 #ifdef CONFIG_IPV6_SUBTREES
1188 	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1189 	rt->rt6i_src.plen = cfg->fc_src_len;
1190 #endif
1191 
1192 	rt->rt6i_metric = cfg->fc_metric;
1193 
1194 	/* We cannot add true routes via loopback here,
1195 	   they would result in kernel looping; promote them to reject routes
1196 	 */
1197 	if ((cfg->fc_flags & RTF_REJECT) ||
1198 	    (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
1199 		/* hold loopback dev/idev if we haven't done so. */
1200 		if (dev != net->loopback_dev) {
1201 			if (dev) {
1202 				dev_put(dev);
1203 				in6_dev_put(idev);
1204 			}
1205 			dev = net->loopback_dev;
1206 			dev_hold(dev);
1207 			idev = in6_dev_get(dev);
1208 			if (!idev) {
1209 				err = -ENODEV;
1210 				goto out;
1211 			}
1212 		}
1213 		rt->u.dst.output = ip6_pkt_discard_out;
1214 		rt->u.dst.input = ip6_pkt_discard;
1215 		rt->u.dst.error = -ENETUNREACH;
1216 		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1217 		goto install_route;
1218 	}
1219 
1220 	if (cfg->fc_flags & RTF_GATEWAY) {
1221 		struct in6_addr *gw_addr;
1222 		int gwa_type;
1223 
1224 		gw_addr = &cfg->fc_gateway;
1225 		ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1226 		gwa_type = ipv6_addr_type(gw_addr);
1227 
1228 		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1229 			struct rt6_info *grt;
1230 
1231 			/* IPv6 strictly inhibits using not link-local
1232 			   addresses as nexthop address.
1233 			   Otherwise, router will not able to send redirects.
1234 			   It is very good, but in some (rare!) circumstances
1235 			   (SIT, PtP, NBMA NOARP links) it is handy to allow
1236 			   some exceptions. --ANK
1237 			 */
1238 			err = -EINVAL;
1239 			if (!(gwa_type&IPV6_ADDR_UNICAST))
1240 				goto out;
1241 
1242 			grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1243 
1244 			err = -EHOSTUNREACH;
1245 			if (grt == NULL)
1246 				goto out;
1247 			if (dev) {
1248 				if (dev != grt->rt6i_dev) {
1249 					dst_release(&grt->u.dst);
1250 					goto out;
1251 				}
1252 			} else {
1253 				dev = grt->rt6i_dev;
1254 				idev = grt->rt6i_idev;
1255 				dev_hold(dev);
1256 				in6_dev_hold(grt->rt6i_idev);
1257 			}
1258 			if (!(grt->rt6i_flags&RTF_GATEWAY))
1259 				err = 0;
1260 			dst_release(&grt->u.dst);
1261 
1262 			if (err)
1263 				goto out;
1264 		}
1265 		err = -EINVAL;
1266 		if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1267 			goto out;
1268 	}
1269 
1270 	err = -ENODEV;
1271 	if (dev == NULL)
1272 		goto out;
1273 
1274 	if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1275 		rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1276 		if (IS_ERR(rt->rt6i_nexthop)) {
1277 			err = PTR_ERR(rt->rt6i_nexthop);
1278 			rt->rt6i_nexthop = NULL;
1279 			goto out;
1280 		}
1281 	}
1282 
1283 	rt->rt6i_flags = cfg->fc_flags;
1284 
1285 install_route:
1286 	if (cfg->fc_mx) {
1287 		struct nlattr *nla;
1288 		int remaining;
1289 
1290 		nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1291 			int type = nla_type(nla);
1292 
1293 			if (type) {
1294 				if (type > RTAX_MAX) {
1295 					err = -EINVAL;
1296 					goto out;
1297 				}
1298 
1299 				rt->u.dst.metrics[type - 1] = nla_get_u32(nla);
1300 			}
1301 		}
1302 	}
1303 
1304 	if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
1305 		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1306 	if (!dst_mtu(&rt->u.dst))
1307 		rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1308 	if (!dst_metric(&rt->u.dst, RTAX_ADVMSS))
1309 		rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->u.dst));
1310 	rt->u.dst.dev = dev;
1311 	rt->rt6i_idev = idev;
1312 	rt->rt6i_table = table;
1313 
1314 	cfg->fc_nlinfo.nl_net = dev_net(dev);
1315 
1316 	return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1317 
1318 out:
1319 	if (dev)
1320 		dev_put(dev);
1321 	if (idev)
1322 		in6_dev_put(idev);
1323 	if (rt)
1324 		dst_free(&rt->u.dst);
1325 	return err;
1326 }
1327 
1328 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1329 {
1330 	int err;
1331 	struct fib6_table *table;
1332 	struct net *net = dev_net(rt->rt6i_dev);
1333 
1334 	if (rt == net->ipv6.ip6_null_entry)
1335 		return -ENOENT;
1336 
1337 	table = rt->rt6i_table;
1338 	write_lock_bh(&table->tb6_lock);
1339 
1340 	err = fib6_del(rt, info);
1341 	dst_release(&rt->u.dst);
1342 
1343 	write_unlock_bh(&table->tb6_lock);
1344 
1345 	return err;
1346 }
1347 
1348 int ip6_del_rt(struct rt6_info *rt)
1349 {
1350 	struct nl_info info = {
1351 		.nl_net = dev_net(rt->rt6i_dev),
1352 	};
1353 	return __ip6_del_rt(rt, &info);
1354 }
1355 
1356 static int ip6_route_del(struct fib6_config *cfg)
1357 {
1358 	struct fib6_table *table;
1359 	struct fib6_node *fn;
1360 	struct rt6_info *rt;
1361 	int err = -ESRCH;
1362 
1363 	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1364 	if (table == NULL)
1365 		return err;
1366 
1367 	read_lock_bh(&table->tb6_lock);
1368 
1369 	fn = fib6_locate(&table->tb6_root,
1370 			 &cfg->fc_dst, cfg->fc_dst_len,
1371 			 &cfg->fc_src, cfg->fc_src_len);
1372 
1373 	if (fn) {
1374 		for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1375 			if (cfg->fc_ifindex &&
1376 			    (rt->rt6i_dev == NULL ||
1377 			     rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1378 				continue;
1379 			if (cfg->fc_flags & RTF_GATEWAY &&
1380 			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1381 				continue;
1382 			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1383 				continue;
1384 			dst_hold(&rt->u.dst);
1385 			read_unlock_bh(&table->tb6_lock);
1386 
1387 			return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1388 		}
1389 	}
1390 	read_unlock_bh(&table->tb6_lock);
1391 
1392 	return err;
1393 }
1394 
1395 /*
1396  *	Handle redirects
1397  */
1398 struct ip6rd_flowi {
1399 	struct flowi fl;
1400 	struct in6_addr gateway;
1401 };
1402 
1403 static struct rt6_info *__ip6_route_redirect(struct net *net,
1404 					     struct fib6_table *table,
1405 					     struct flowi *fl,
1406 					     int flags)
1407 {
1408 	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl;
1409 	struct rt6_info *rt;
1410 	struct fib6_node *fn;
1411 
1412 	/*
1413 	 * Get the "current" route for this destination and
1414 	 * check if the redirect has come from approriate router.
1415 	 *
1416 	 * RFC 2461 specifies that redirects should only be
1417 	 * accepted if they come from the nexthop to the target.
1418 	 * Due to the way the routes are chosen, this notion
1419 	 * is a bit fuzzy and one might need to check all possible
1420 	 * routes.
1421 	 */
1422 
1423 	read_lock_bh(&table->tb6_lock);
1424 	fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
1425 restart:
1426 	for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1427 		/*
1428 		 * Current route is on-link; redirect is always invalid.
1429 		 *
1430 		 * Seems, previous statement is not true. It could
1431 		 * be node, which looks for us as on-link (f.e. proxy ndisc)
1432 		 * But then router serving it might decide, that we should
1433 		 * know truth 8)8) --ANK (980726).
1434 		 */
1435 		if (rt6_check_expired(rt))
1436 			continue;
1437 		if (!(rt->rt6i_flags & RTF_GATEWAY))
1438 			continue;
1439 		if (fl->oif != rt->rt6i_dev->ifindex)
1440 			continue;
1441 		if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1442 			continue;
1443 		break;
1444 	}
1445 
1446 	if (!rt)
1447 		rt = net->ipv6.ip6_null_entry;
1448 	BACKTRACK(net, &fl->fl6_src);
1449 out:
1450 	dst_hold(&rt->u.dst);
1451 
1452 	read_unlock_bh(&table->tb6_lock);
1453 
1454 	return rt;
1455 };
1456 
1457 static struct rt6_info *ip6_route_redirect(struct in6_addr *dest,
1458 					   struct in6_addr *src,
1459 					   struct in6_addr *gateway,
1460 					   struct net_device *dev)
1461 {
1462 	int flags = RT6_LOOKUP_F_HAS_SADDR;
1463 	struct net *net = dev_net(dev);
1464 	struct ip6rd_flowi rdfl = {
1465 		.fl = {
1466 			.oif = dev->ifindex,
1467 			.nl_u = {
1468 				.ip6_u = {
1469 					.daddr = *dest,
1470 					.saddr = *src,
1471 				},
1472 			},
1473 		},
1474 	};
1475 
1476 	ipv6_addr_copy(&rdfl.gateway, gateway);
1477 
1478 	if (rt6_need_strict(dest))
1479 		flags |= RT6_LOOKUP_F_IFACE;
1480 
1481 	return (struct rt6_info *)fib6_rule_lookup(net, (struct flowi *)&rdfl,
1482 						   flags, __ip6_route_redirect);
1483 }
1484 
1485 void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
1486 		  struct in6_addr *saddr,
1487 		  struct neighbour *neigh, u8 *lladdr, int on_link)
1488 {
1489 	struct rt6_info *rt, *nrt = NULL;
1490 	struct netevent_redirect netevent;
1491 	struct net *net = dev_net(neigh->dev);
1492 
1493 	rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1494 
1495 	if (rt == net->ipv6.ip6_null_entry) {
1496 		if (net_ratelimit())
1497 			printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1498 			       "for redirect target\n");
1499 		goto out;
1500 	}
1501 
1502 	/*
1503 	 *	We have finally decided to accept it.
1504 	 */
1505 
1506 	neigh_update(neigh, lladdr, NUD_STALE,
1507 		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
1508 		     NEIGH_UPDATE_F_OVERRIDE|
1509 		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1510 				     NEIGH_UPDATE_F_ISROUTER))
1511 		     );
1512 
1513 	/*
1514 	 * Redirect received -> path was valid.
1515 	 * Look, redirects are sent only in response to data packets,
1516 	 * so that this nexthop apparently is reachable. --ANK
1517 	 */
1518 	dst_confirm(&rt->u.dst);
1519 
1520 	/* Duplicate redirect: silently ignore. */
1521 	if (neigh == rt->u.dst.neighbour)
1522 		goto out;
1523 
1524 	nrt = ip6_rt_copy(rt);
1525 	if (nrt == NULL)
1526 		goto out;
1527 
1528 	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1529 	if (on_link)
1530 		nrt->rt6i_flags &= ~RTF_GATEWAY;
1531 
1532 	ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1533 	nrt->rt6i_dst.plen = 128;
1534 	nrt->u.dst.flags |= DST_HOST;
1535 
1536 	ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1537 	nrt->rt6i_nexthop = neigh_clone(neigh);
1538 	/* Reset pmtu, it may be better */
1539 	nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1540 	nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dev_net(neigh->dev),
1541 							dst_mtu(&nrt->u.dst));
1542 
1543 	if (ip6_ins_rt(nrt))
1544 		goto out;
1545 
1546 	netevent.old = &rt->u.dst;
1547 	netevent.new = &nrt->u.dst;
1548 	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1549 
1550 	if (rt->rt6i_flags&RTF_CACHE) {
1551 		ip6_del_rt(rt);
1552 		return;
1553 	}
1554 
1555 out:
1556 	dst_release(&rt->u.dst);
1557 	return;
1558 }
1559 
1560 /*
1561  *	Handle ICMP "packet too big" messages
1562  *	i.e. Path MTU discovery
1563  */
1564 
1565 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1566 			struct net_device *dev, u32 pmtu)
1567 {
1568 	struct rt6_info *rt, *nrt;
1569 	struct net *net = dev_net(dev);
1570 	int allfrag = 0;
1571 
1572 	rt = rt6_lookup(net, daddr, saddr, dev->ifindex, 0);
1573 	if (rt == NULL)
1574 		return;
1575 
1576 	if (pmtu >= dst_mtu(&rt->u.dst))
1577 		goto out;
1578 
1579 	if (pmtu < IPV6_MIN_MTU) {
1580 		/*
1581 		 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1582 		 * MTU (1280) and a fragment header should always be included
1583 		 * after a node receiving Too Big message reporting PMTU is
1584 		 * less than the IPv6 Minimum Link MTU.
1585 		 */
1586 		pmtu = IPV6_MIN_MTU;
1587 		allfrag = 1;
1588 	}
1589 
1590 	/* New mtu received -> path was valid.
1591 	   They are sent only in response to data packets,
1592 	   so that this nexthop apparently is reachable. --ANK
1593 	 */
1594 	dst_confirm(&rt->u.dst);
1595 
1596 	/* Host route. If it is static, it would be better
1597 	   not to override it, but add new one, so that
1598 	   when cache entry will expire old pmtu
1599 	   would return automatically.
1600 	 */
1601 	if (rt->rt6i_flags & RTF_CACHE) {
1602 		rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1603 		if (allfrag)
1604 			rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1605 		dst_set_expires(&rt->u.dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1606 		rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1607 		goto out;
1608 	}
1609 
1610 	/* Network route.
1611 	   Two cases are possible:
1612 	   1. It is connected route. Action: COW
1613 	   2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1614 	 */
1615 	if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1616 		nrt = rt6_alloc_cow(rt, daddr, saddr);
1617 	else
1618 		nrt = rt6_alloc_clone(rt, daddr);
1619 
1620 	if (nrt) {
1621 		nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1622 		if (allfrag)
1623 			nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1624 
1625 		/* According to RFC 1981, detecting PMTU increase shouldn't be
1626 		 * happened within 5 mins, the recommended timer is 10 mins.
1627 		 * Here this route expiration time is set to ip6_rt_mtu_expires
1628 		 * which is 10 mins. After 10 mins the decreased pmtu is expired
1629 		 * and detecting PMTU increase will be automatically happened.
1630 		 */
1631 		dst_set_expires(&nrt->u.dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1632 		nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1633 
1634 		ip6_ins_rt(nrt);
1635 	}
1636 out:
1637 	dst_release(&rt->u.dst);
1638 }
1639 
1640 /*
1641  *	Misc support functions
1642  */
1643 
1644 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1645 {
1646 	struct net *net = dev_net(ort->rt6i_dev);
1647 	struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1648 
1649 	if (rt) {
1650 		rt->u.dst.input = ort->u.dst.input;
1651 		rt->u.dst.output = ort->u.dst.output;
1652 
1653 		memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1654 		rt->u.dst.error = ort->u.dst.error;
1655 		rt->u.dst.dev = ort->u.dst.dev;
1656 		if (rt->u.dst.dev)
1657 			dev_hold(rt->u.dst.dev);
1658 		rt->rt6i_idev = ort->rt6i_idev;
1659 		if (rt->rt6i_idev)
1660 			in6_dev_hold(rt->rt6i_idev);
1661 		rt->u.dst.lastuse = jiffies;
1662 		rt->rt6i_expires = 0;
1663 
1664 		ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1665 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1666 		rt->rt6i_metric = 0;
1667 
1668 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1669 #ifdef CONFIG_IPV6_SUBTREES
1670 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1671 #endif
1672 		rt->rt6i_table = ort->rt6i_table;
1673 	}
1674 	return rt;
1675 }
1676 
1677 #ifdef CONFIG_IPV6_ROUTE_INFO
1678 static struct rt6_info *rt6_get_route_info(struct net *net,
1679 					   struct in6_addr *prefix, int prefixlen,
1680 					   struct in6_addr *gwaddr, int ifindex)
1681 {
1682 	struct fib6_node *fn;
1683 	struct rt6_info *rt = NULL;
1684 	struct fib6_table *table;
1685 
1686 	table = fib6_get_table(net, RT6_TABLE_INFO);
1687 	if (table == NULL)
1688 		return NULL;
1689 
1690 	write_lock_bh(&table->tb6_lock);
1691 	fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1692 	if (!fn)
1693 		goto out;
1694 
1695 	for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1696 		if (rt->rt6i_dev->ifindex != ifindex)
1697 			continue;
1698 		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1699 			continue;
1700 		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1701 			continue;
1702 		dst_hold(&rt->u.dst);
1703 		break;
1704 	}
1705 out:
1706 	write_unlock_bh(&table->tb6_lock);
1707 	return rt;
1708 }
1709 
1710 static struct rt6_info *rt6_add_route_info(struct net *net,
1711 					   struct in6_addr *prefix, int prefixlen,
1712 					   struct in6_addr *gwaddr, int ifindex,
1713 					   unsigned pref)
1714 {
1715 	struct fib6_config cfg = {
1716 		.fc_table	= RT6_TABLE_INFO,
1717 		.fc_metric	= IP6_RT_PRIO_USER,
1718 		.fc_ifindex	= ifindex,
1719 		.fc_dst_len	= prefixlen,
1720 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1721 				  RTF_UP | RTF_PREF(pref),
1722 		.fc_nlinfo.pid = 0,
1723 		.fc_nlinfo.nlh = NULL,
1724 		.fc_nlinfo.nl_net = net,
1725 	};
1726 
1727 	ipv6_addr_copy(&cfg.fc_dst, prefix);
1728 	ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1729 
1730 	/* We should treat it as a default route if prefix length is 0. */
1731 	if (!prefixlen)
1732 		cfg.fc_flags |= RTF_DEFAULT;
1733 
1734 	ip6_route_add(&cfg);
1735 
1736 	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1737 }
1738 #endif
1739 
1740 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1741 {
1742 	struct rt6_info *rt;
1743 	struct fib6_table *table;
1744 
1745 	table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1746 	if (table == NULL)
1747 		return NULL;
1748 
1749 	write_lock_bh(&table->tb6_lock);
1750 	for (rt = table->tb6_root.leaf; rt; rt=rt->u.dst.rt6_next) {
1751 		if (dev == rt->rt6i_dev &&
1752 		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1753 		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
1754 			break;
1755 	}
1756 	if (rt)
1757 		dst_hold(&rt->u.dst);
1758 	write_unlock_bh(&table->tb6_lock);
1759 	return rt;
1760 }
1761 
1762 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1763 				     struct net_device *dev,
1764 				     unsigned int pref)
1765 {
1766 	struct fib6_config cfg = {
1767 		.fc_table	= RT6_TABLE_DFLT,
1768 		.fc_metric	= IP6_RT_PRIO_USER,
1769 		.fc_ifindex	= dev->ifindex,
1770 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1771 				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1772 		.fc_nlinfo.pid = 0,
1773 		.fc_nlinfo.nlh = NULL,
1774 		.fc_nlinfo.nl_net = dev_net(dev),
1775 	};
1776 
1777 	ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1778 
1779 	ip6_route_add(&cfg);
1780 
1781 	return rt6_get_dflt_router(gwaddr, dev);
1782 }
1783 
1784 void rt6_purge_dflt_routers(struct net *net)
1785 {
1786 	struct rt6_info *rt;
1787 	struct fib6_table *table;
1788 
1789 	/* NOTE: Keep consistent with rt6_get_dflt_router */
1790 	table = fib6_get_table(net, RT6_TABLE_DFLT);
1791 	if (table == NULL)
1792 		return;
1793 
1794 restart:
1795 	read_lock_bh(&table->tb6_lock);
1796 	for (rt = table->tb6_root.leaf; rt; rt = rt->u.dst.rt6_next) {
1797 		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1798 			dst_hold(&rt->u.dst);
1799 			read_unlock_bh(&table->tb6_lock);
1800 			ip6_del_rt(rt);
1801 			goto restart;
1802 		}
1803 	}
1804 	read_unlock_bh(&table->tb6_lock);
1805 }
1806 
1807 static void rtmsg_to_fib6_config(struct net *net,
1808 				 struct in6_rtmsg *rtmsg,
1809 				 struct fib6_config *cfg)
1810 {
1811 	memset(cfg, 0, sizeof(*cfg));
1812 
1813 	cfg->fc_table = RT6_TABLE_MAIN;
1814 	cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1815 	cfg->fc_metric = rtmsg->rtmsg_metric;
1816 	cfg->fc_expires = rtmsg->rtmsg_info;
1817 	cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1818 	cfg->fc_src_len = rtmsg->rtmsg_src_len;
1819 	cfg->fc_flags = rtmsg->rtmsg_flags;
1820 
1821 	cfg->fc_nlinfo.nl_net = net;
1822 
1823 	ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1824 	ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1825 	ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1826 }
1827 
1828 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1829 {
1830 	struct fib6_config cfg;
1831 	struct in6_rtmsg rtmsg;
1832 	int err;
1833 
1834 	switch(cmd) {
1835 	case SIOCADDRT:		/* Add a route */
1836 	case SIOCDELRT:		/* Delete a route */
1837 		if (!capable(CAP_NET_ADMIN))
1838 			return -EPERM;
1839 		err = copy_from_user(&rtmsg, arg,
1840 				     sizeof(struct in6_rtmsg));
1841 		if (err)
1842 			return -EFAULT;
1843 
1844 		rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1845 
1846 		rtnl_lock();
1847 		switch (cmd) {
1848 		case SIOCADDRT:
1849 			err = ip6_route_add(&cfg);
1850 			break;
1851 		case SIOCDELRT:
1852 			err = ip6_route_del(&cfg);
1853 			break;
1854 		default:
1855 			err = -EINVAL;
1856 		}
1857 		rtnl_unlock();
1858 
1859 		return err;
1860 	}
1861 
1862 	return -EINVAL;
1863 }
1864 
1865 /*
1866  *	Drop the packet on the floor
1867  */
1868 
1869 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
1870 {
1871 	int type;
1872 	struct dst_entry *dst = skb_dst(skb);
1873 	switch (ipstats_mib_noroutes) {
1874 	case IPSTATS_MIB_INNOROUTES:
1875 		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1876 		if (type == IPV6_ADDR_ANY || type == IPV6_ADDR_RESERVED) {
1877 			IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1878 				      IPSTATS_MIB_INADDRERRORS);
1879 			break;
1880 		}
1881 		/* FALLTHROUGH */
1882 	case IPSTATS_MIB_OUTNOROUTES:
1883 		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1884 			      ipstats_mib_noroutes);
1885 		break;
1886 	}
1887 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0, skb->dev);
1888 	kfree_skb(skb);
1889 	return 0;
1890 }
1891 
1892 static int ip6_pkt_discard(struct sk_buff *skb)
1893 {
1894 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
1895 }
1896 
1897 static int ip6_pkt_discard_out(struct sk_buff *skb)
1898 {
1899 	skb->dev = skb_dst(skb)->dev;
1900 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
1901 }
1902 
1903 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1904 
1905 static int ip6_pkt_prohibit(struct sk_buff *skb)
1906 {
1907 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
1908 }
1909 
1910 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
1911 {
1912 	skb->dev = skb_dst(skb)->dev;
1913 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
1914 }
1915 
1916 #endif
1917 
1918 /*
1919  *	Allocate a dst for local (unicast / anycast) address.
1920  */
1921 
1922 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1923 				    const struct in6_addr *addr,
1924 				    int anycast)
1925 {
1926 	struct net *net = dev_net(idev->dev);
1927 	struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1928 	struct neighbour *neigh;
1929 
1930 	if (rt == NULL)
1931 		return ERR_PTR(-ENOMEM);
1932 
1933 	dev_hold(net->loopback_dev);
1934 	in6_dev_hold(idev);
1935 
1936 	rt->u.dst.flags = DST_HOST;
1937 	rt->u.dst.input = ip6_input;
1938 	rt->u.dst.output = ip6_output;
1939 	rt->rt6i_dev = net->loopback_dev;
1940 	rt->rt6i_idev = idev;
1941 	rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1942 	rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->u.dst));
1943 	rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1944 	rt->u.dst.obsolete = -1;
1945 
1946 	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1947 	if (anycast)
1948 		rt->rt6i_flags |= RTF_ANYCAST;
1949 	else
1950 		rt->rt6i_flags |= RTF_LOCAL;
1951 	neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1952 	if (IS_ERR(neigh)) {
1953 		dst_free(&rt->u.dst);
1954 
1955 		/* We are casting this because that is the return
1956 		 * value type.  But an errno encoded pointer is the
1957 		 * same regardless of the underlying pointer type,
1958 		 * and that's what we are returning.  So this is OK.
1959 		 */
1960 		return (struct rt6_info *) neigh;
1961 	}
1962 	rt->rt6i_nexthop = neigh;
1963 
1964 	ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1965 	rt->rt6i_dst.plen = 128;
1966 	rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
1967 
1968 	atomic_set(&rt->u.dst.__refcnt, 1);
1969 
1970 	return rt;
1971 }
1972 
/* Argument bundle passed to fib6_ifdown() by rt6_ifdown(). */
struct arg_dev_net {
	/* Device going down; NULL matches routes on every device. */
	struct net_device *dev;
	/* Namespace whose null entry must be left alone. */
	struct net *net;
};
1977 
1978 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1979 {
1980 	struct net_device *dev = ((struct arg_dev_net *)arg)->dev;
1981 	struct net *net = ((struct arg_dev_net *)arg)->net;
1982 
1983 	if (((void *)rt->rt6i_dev == dev || dev == NULL) &&
1984 	    rt != net->ipv6.ip6_null_entry) {
1985 		RT6_TRACE("deleted by ifdown %p\n", rt);
1986 		return -1;
1987 	}
1988 	return 0;
1989 }
1990 
1991 void rt6_ifdown(struct net *net, struct net_device *dev)
1992 {
1993 	struct arg_dev_net adn = {
1994 		.dev = dev,
1995 		.net = net,
1996 	};
1997 
1998 	fib6_clean_all(net, fib6_ifdown, 0, &adn);
1999 	icmp6_clean_all(fib6_ifdown, &adn);
2000 }
2001 
/* Argument bundle passed to rt6_mtu_change_route() by rt6_mtu_change(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* its new MTU */
};
2007 
2008 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2009 {
2010 	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2011 	struct inet6_dev *idev;
2012 	struct net *net = dev_net(arg->dev);
2013 
2014 	/* In IPv6 pmtu discovery is not optional,
2015 	   so that RTAX_MTU lock cannot disable it.
2016 	   We still use this lock to block changes
2017 	   caused by addrconf/ndisc.
2018 	*/
2019 
2020 	idev = __in6_dev_get(arg->dev);
2021 	if (idev == NULL)
2022 		return 0;
2023 
2024 	/* For administrative MTU increase, there is no way to discover
2025 	   IPv6 PMTU increase, so PMTU increase should be updated here.
2026 	   Since RFC 1981 doesn't include administrative MTU increase
2027 	   update PMTU increase is a MUST. (i.e. jumbo frame)
2028 	 */
2029 	/*
2030 	   If new MTU is less than route PMTU, this new MTU will be the
2031 	   lowest MTU in the path, update the route PMTU to reflect PMTU
2032 	   decreases; if new MTU is greater than route PMTU, and the
2033 	   old MTU is the lowest MTU in the path, update the route PMTU
2034 	   to reflect the increase. In this case if the other nodes' MTU
2035 	   also have the lowest MTU, TOO BIG MESSAGE will be lead to
2036 	   PMTU discouvery.
2037 	 */
2038 	if (rt->rt6i_dev == arg->dev &&
2039 	    !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
2040 	    (dst_mtu(&rt->u.dst) >= arg->mtu ||
2041 	     (dst_mtu(&rt->u.dst) < arg->mtu &&
2042 	      dst_mtu(&rt->u.dst) == idev->cnf.mtu6))) {
2043 		rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
2044 		rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, arg->mtu);
2045 	}
2046 	return 0;
2047 }
2048 
2049 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2050 {
2051 	struct rt6_mtu_change_arg arg = {
2052 		.dev = dev,
2053 		.mtu = mtu,
2054 	};
2055 
2056 	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2057 }
2058 
2059 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2060 	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2061 	[RTA_OIF]               = { .type = NLA_U32 },
2062 	[RTA_IIF]		= { .type = NLA_U32 },
2063 	[RTA_PRIORITY]          = { .type = NLA_U32 },
2064 	[RTA_METRICS]           = { .type = NLA_NESTED },
2065 };
2066 
2067 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2068 			      struct fib6_config *cfg)
2069 {
2070 	struct rtmsg *rtm;
2071 	struct nlattr *tb[RTA_MAX+1];
2072 	int err;
2073 
2074 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2075 	if (err < 0)
2076 		goto errout;
2077 
2078 	err = -EINVAL;
2079 	rtm = nlmsg_data(nlh);
2080 	memset(cfg, 0, sizeof(*cfg));
2081 
2082 	cfg->fc_table = rtm->rtm_table;
2083 	cfg->fc_dst_len = rtm->rtm_dst_len;
2084 	cfg->fc_src_len = rtm->rtm_src_len;
2085 	cfg->fc_flags = RTF_UP;
2086 	cfg->fc_protocol = rtm->rtm_protocol;
2087 
2088 	if (rtm->rtm_type == RTN_UNREACHABLE)
2089 		cfg->fc_flags |= RTF_REJECT;
2090 
2091 	cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2092 	cfg->fc_nlinfo.nlh = nlh;
2093 	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2094 
2095 	if (tb[RTA_GATEWAY]) {
2096 		nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2097 		cfg->fc_flags |= RTF_GATEWAY;
2098 	}
2099 
2100 	if (tb[RTA_DST]) {
2101 		int plen = (rtm->rtm_dst_len + 7) >> 3;
2102 
2103 		if (nla_len(tb[RTA_DST]) < plen)
2104 			goto errout;
2105 
2106 		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2107 	}
2108 
2109 	if (tb[RTA_SRC]) {
2110 		int plen = (rtm->rtm_src_len + 7) >> 3;
2111 
2112 		if (nla_len(tb[RTA_SRC]) < plen)
2113 			goto errout;
2114 
2115 		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2116 	}
2117 
2118 	if (tb[RTA_OIF])
2119 		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2120 
2121 	if (tb[RTA_PRIORITY])
2122 		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2123 
2124 	if (tb[RTA_METRICS]) {
2125 		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2126 		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2127 	}
2128 
2129 	if (tb[RTA_TABLE])
2130 		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2131 
2132 	err = 0;
2133 errout:
2134 	return err;
2135 }
2136 
2137 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2138 {
2139 	struct fib6_config cfg;
2140 	int err;
2141 
2142 	err = rtm_to_fib6_config(skb, nlh, &cfg);
2143 	if (err < 0)
2144 		return err;
2145 
2146 	return ip6_route_del(&cfg);
2147 }
2148 
2149 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2150 {
2151 	struct fib6_config cfg;
2152 	int err;
2153 
2154 	err = rtm_to_fib6_config(skb, nlh, &cfg);
2155 	if (err < 0)
2156 		return err;
2157 
2158 	return ip6_route_add(&cfg);
2159 }
2160 
2161 static inline size_t rt6_nlmsg_size(void)
2162 {
2163 	return NLMSG_ALIGN(sizeof(struct rtmsg))
2164 	       + nla_total_size(16) /* RTA_SRC */
2165 	       + nla_total_size(16) /* RTA_DST */
2166 	       + nla_total_size(16) /* RTA_GATEWAY */
2167 	       + nla_total_size(16) /* RTA_PREFSRC */
2168 	       + nla_total_size(4) /* RTA_TABLE */
2169 	       + nla_total_size(4) /* RTA_IIF */
2170 	       + nla_total_size(4) /* RTA_OIF */
2171 	       + nla_total_size(4) /* RTA_PRIORITY */
2172 	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2173 	       + nla_total_size(sizeof(struct rta_cacheinfo));
2174 }
2175 
2176 static int rt6_fill_node(struct net *net,
2177 			 struct sk_buff *skb, struct rt6_info *rt,
2178 			 struct in6_addr *dst, struct in6_addr *src,
2179 			 int iif, int type, u32 pid, u32 seq,
2180 			 int prefix, int nowait, unsigned int flags)
2181 {
2182 	struct rtmsg *rtm;
2183 	struct nlmsghdr *nlh;
2184 	long expires;
2185 	u32 table;
2186 
2187 	if (prefix) {	/* user wants prefix routes only */
2188 		if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2189 			/* success since this is not a prefix route */
2190 			return 1;
2191 		}
2192 	}
2193 
2194 	nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2195 	if (nlh == NULL)
2196 		return -EMSGSIZE;
2197 
2198 	rtm = nlmsg_data(nlh);
2199 	rtm->rtm_family = AF_INET6;
2200 	rtm->rtm_dst_len = rt->rt6i_dst.plen;
2201 	rtm->rtm_src_len = rt->rt6i_src.plen;
2202 	rtm->rtm_tos = 0;
2203 	if (rt->rt6i_table)
2204 		table = rt->rt6i_table->tb6_id;
2205 	else
2206 		table = RT6_TABLE_UNSPEC;
2207 	rtm->rtm_table = table;
2208 	NLA_PUT_U32(skb, RTA_TABLE, table);
2209 	if (rt->rt6i_flags&RTF_REJECT)
2210 		rtm->rtm_type = RTN_UNREACHABLE;
2211 	else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2212 		rtm->rtm_type = RTN_LOCAL;
2213 	else
2214 		rtm->rtm_type = RTN_UNICAST;
2215 	rtm->rtm_flags = 0;
2216 	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2217 	rtm->rtm_protocol = rt->rt6i_protocol;
2218 	if (rt->rt6i_flags&RTF_DYNAMIC)
2219 		rtm->rtm_protocol = RTPROT_REDIRECT;
2220 	else if (rt->rt6i_flags & RTF_ADDRCONF)
2221 		rtm->rtm_protocol = RTPROT_KERNEL;
2222 	else if (rt->rt6i_flags&RTF_DEFAULT)
2223 		rtm->rtm_protocol = RTPROT_RA;
2224 
2225 	if (rt->rt6i_flags&RTF_CACHE)
2226 		rtm->rtm_flags |= RTM_F_CLONED;
2227 
2228 	if (dst) {
2229 		NLA_PUT(skb, RTA_DST, 16, dst);
2230 		rtm->rtm_dst_len = 128;
2231 	} else if (rtm->rtm_dst_len)
2232 		NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2233 #ifdef CONFIG_IPV6_SUBTREES
2234 	if (src) {
2235 		NLA_PUT(skb, RTA_SRC, 16, src);
2236 		rtm->rtm_src_len = 128;
2237 	} else if (rtm->rtm_src_len)
2238 		NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2239 #endif
2240 	if (iif) {
2241 #ifdef CONFIG_IPV6_MROUTE
2242 		if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2243 			int err = ip6mr_get_route(net, skb, rtm, nowait);
2244 			if (err <= 0) {
2245 				if (!nowait) {
2246 					if (err == 0)
2247 						return 0;
2248 					goto nla_put_failure;
2249 				} else {
2250 					if (err == -EMSGSIZE)
2251 						goto nla_put_failure;
2252 				}
2253 			}
2254 		} else
2255 #endif
2256 			NLA_PUT_U32(skb, RTA_IIF, iif);
2257 	} else if (dst) {
2258 		struct inet6_dev *idev = ip6_dst_idev(&rt->u.dst);
2259 		struct in6_addr saddr_buf;
2260 		if (ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2261 				       dst, 0, &saddr_buf) == 0)
2262 			NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2263 	}
2264 
2265 	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2266 		goto nla_put_failure;
2267 
2268 	if (rt->u.dst.neighbour)
2269 		NLA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
2270 
2271 	if (rt->u.dst.dev)
2272 		NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2273 
2274 	NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2275 
2276 	if (!(rt->rt6i_flags & RTF_EXPIRES))
2277 		expires = 0;
2278 	else if (rt->rt6i_expires - jiffies < INT_MAX)
2279 		expires = rt->rt6i_expires - jiffies;
2280 	else
2281 		expires = INT_MAX;
2282 
2283 	if (rtnl_put_cacheinfo(skb, &rt->u.dst, 0, 0, 0,
2284 			       expires, rt->u.dst.error) < 0)
2285 		goto nla_put_failure;
2286 
2287 	return nlmsg_end(skb, nlh);
2288 
2289 nla_put_failure:
2290 	nlmsg_cancel(skb, nlh);
2291 	return -EMSGSIZE;
2292 }
2293 
2294 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2295 {
2296 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2297 	int prefix;
2298 
2299 	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2300 		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2301 		prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2302 	} else
2303 		prefix = 0;
2304 
2305 	return rt6_fill_node(arg->net,
2306 		     arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2307 		     NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2308 		     prefix, 0, NLM_F_MULTI);
2309 }
2310 
2311 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2312 {
2313 	struct net *net = sock_net(in_skb->sk);
2314 	struct nlattr *tb[RTA_MAX+1];
2315 	struct rt6_info *rt;
2316 	struct sk_buff *skb;
2317 	struct rtmsg *rtm;
2318 	struct flowi fl;
2319 	int err, iif = 0;
2320 
2321 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2322 	if (err < 0)
2323 		goto errout;
2324 
2325 	err = -EINVAL;
2326 	memset(&fl, 0, sizeof(fl));
2327 
2328 	if (tb[RTA_SRC]) {
2329 		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2330 			goto errout;
2331 
2332 		ipv6_addr_copy(&fl.fl6_src, nla_data(tb[RTA_SRC]));
2333 	}
2334 
2335 	if (tb[RTA_DST]) {
2336 		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2337 			goto errout;
2338 
2339 		ipv6_addr_copy(&fl.fl6_dst, nla_data(tb[RTA_DST]));
2340 	}
2341 
2342 	if (tb[RTA_IIF])
2343 		iif = nla_get_u32(tb[RTA_IIF]);
2344 
2345 	if (tb[RTA_OIF])
2346 		fl.oif = nla_get_u32(tb[RTA_OIF]);
2347 
2348 	if (iif) {
2349 		struct net_device *dev;
2350 		dev = __dev_get_by_index(net, iif);
2351 		if (!dev) {
2352 			err = -ENODEV;
2353 			goto errout;
2354 		}
2355 	}
2356 
2357 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2358 	if (skb == NULL) {
2359 		err = -ENOBUFS;
2360 		goto errout;
2361 	}
2362 
2363 	/* Reserve room for dummy headers, this skb can pass
2364 	   through good chunk of routing engine.
2365 	 */
2366 	skb_reset_mac_header(skb);
2367 	skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2368 
2369 	rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl);
2370 	skb_dst_set(skb, &rt->u.dst);
2371 
2372 	err = rt6_fill_node(net, skb, rt, &fl.fl6_dst, &fl.fl6_src, iif,
2373 			    RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2374 			    nlh->nlmsg_seq, 0, 0, 0);
2375 	if (err < 0) {
2376 		kfree_skb(skb);
2377 		goto errout;
2378 	}
2379 
2380 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2381 errout:
2382 	return err;
2383 }
2384 
2385 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2386 {
2387 	struct sk_buff *skb;
2388 	struct net *net = info->nl_net;
2389 	u32 seq;
2390 	int err;
2391 
2392 	err = -ENOBUFS;
2393 	seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2394 
2395 	skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2396 	if (skb == NULL)
2397 		goto errout;
2398 
2399 	err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2400 				event, info->pid, seq, 0, 0, 0);
2401 	if (err < 0) {
2402 		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2403 		WARN_ON(err == -EMSGSIZE);
2404 		kfree_skb(skb);
2405 		goto errout;
2406 	}
2407 	rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2408 		    info->nlh, gfp_any());
2409 	return;
2410 errout:
2411 	if (err < 0)
2412 		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2413 }
2414 
2415 static int ip6_route_dev_notify(struct notifier_block *this,
2416 				unsigned long event, void *data)
2417 {
2418 	struct net_device *dev = (struct net_device *)data;
2419 	struct net *net = dev_net(dev);
2420 
2421 	if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2422 		net->ipv6.ip6_null_entry->u.dst.dev = dev;
2423 		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2424 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2425 		net->ipv6.ip6_prohibit_entry->u.dst.dev = dev;
2426 		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2427 		net->ipv6.ip6_blk_hole_entry->u.dst.dev = dev;
2428 		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2429 #endif
2430 	}
2431 
2432 	return NOTIFY_OK;
2433 }
2434 
2435 /*
2436  *	/proc
2437  */
2438 
2439 #ifdef CONFIG_PROC_FS
2440 
/*
 * NOTE(review): neither RT6_INFO_LEN nor struct rt6_proc_arg is referenced
 * in the part of the file visible here; they look like leftovers from a
 * buffer-based /proc reader -- confirm before removing.
 */
#define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)

struct rt6_proc_arg {
	char *buffer;
	int offset;
	int length;
	int skip;
	int len;
};
2451 
2452 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2453 {
2454 	struct seq_file *m = p_arg;
2455 
2456 	seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2457 
2458 #ifdef CONFIG_IPV6_SUBTREES
2459 	seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2460 #else
2461 	seq_puts(m, "00000000000000000000000000000000 00 ");
2462 #endif
2463 
2464 	if (rt->rt6i_nexthop) {
2465 		seq_printf(m, "%pi6", rt->rt6i_nexthop->primary_key);
2466 	} else {
2467 		seq_puts(m, "00000000000000000000000000000000");
2468 	}
2469 	seq_printf(m, " %08x %08x %08x %08x %8s\n",
2470 		   rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
2471 		   rt->u.dst.__use, rt->rt6i_flags,
2472 		   rt->rt6i_dev ? rt->rt6i_dev->name : "");
2473 	return 0;
2474 }
2475 
2476 static int ipv6_route_show(struct seq_file *m, void *v)
2477 {
2478 	struct net *net = (struct net *)m->private;
2479 	fib6_clean_all(net, rt6_info_route, 0, m);
2480 	return 0;
2481 }
2482 
2483 static int ipv6_route_open(struct inode *inode, struct file *file)
2484 {
2485 	return single_open_net(inode, file, ipv6_route_show);
2486 }
2487 
2488 static const struct file_operations ipv6_route_proc_fops = {
2489 	.owner		= THIS_MODULE,
2490 	.open		= ipv6_route_open,
2491 	.read		= seq_read,
2492 	.llseek		= seq_lseek,
2493 	.release	= single_release_net,
2494 };
2495 
2496 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2497 {
2498 	struct net *net = (struct net *)seq->private;
2499 	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2500 		   net->ipv6.rt6_stats->fib_nodes,
2501 		   net->ipv6.rt6_stats->fib_route_nodes,
2502 		   net->ipv6.rt6_stats->fib_rt_alloc,
2503 		   net->ipv6.rt6_stats->fib_rt_entries,
2504 		   net->ipv6.rt6_stats->fib_rt_cache,
2505 		   atomic_read(&net->ipv6.ip6_dst_ops.entries),
2506 		   net->ipv6.rt6_stats->fib_discarded_routes);
2507 
2508 	return 0;
2509 }
2510 
2511 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2512 {
2513 	return single_open_net(inode, file, rt6_stats_seq_show);
2514 }
2515 
2516 static const struct file_operations rt6_stats_seq_fops = {
2517 	.owner	 = THIS_MODULE,
2518 	.open	 = rt6_stats_seq_open,
2519 	.read	 = seq_read,
2520 	.llseek	 = seq_lseek,
2521 	.release = single_release_net,
2522 };
2523 #endif	/* CONFIG_PROC_FS */
2524 
2525 #ifdef CONFIG_SYSCTL
2526 
2527 static
2528 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2529 			      void __user *buffer, size_t *lenp, loff_t *ppos)
2530 {
2531 	struct net *net = current->nsproxy->net_ns;
2532 	int delay = net->ipv6.sysctl.flush_delay;
2533 	if (write) {
2534 		proc_dointvec(ctl, write, buffer, lenp, ppos);
2535 		fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2536 		return 0;
2537 	} else
2538 		return -EINVAL;
2539 }
2540 
2541 ctl_table ipv6_route_table_template[] = {
2542 	{
2543 		.procname	=	"flush",
2544 		.data		=	&init_net.ipv6.sysctl.flush_delay,
2545 		.maxlen		=	sizeof(int),
2546 		.mode		=	0200,
2547 		.proc_handler	=	ipv6_sysctl_rtcache_flush
2548 	},
2549 	{
2550 		.procname	=	"gc_thresh",
2551 		.data		=	&ip6_dst_ops_template.gc_thresh,
2552 		.maxlen		=	sizeof(int),
2553 		.mode		=	0644,
2554 		.proc_handler	=	proc_dointvec,
2555 	},
2556 	{
2557 		.procname	=	"max_size",
2558 		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
2559 		.maxlen		=	sizeof(int),
2560 		.mode		=	0644,
2561 		.proc_handler	=	proc_dointvec,
2562 	},
2563 	{
2564 		.procname	=	"gc_min_interval",
2565 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2566 		.maxlen		=	sizeof(int),
2567 		.mode		=	0644,
2568 		.proc_handler	=	proc_dointvec_jiffies,
2569 	},
2570 	{
2571 		.procname	=	"gc_timeout",
2572 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2573 		.maxlen		=	sizeof(int),
2574 		.mode		=	0644,
2575 		.proc_handler	=	proc_dointvec_jiffies,
2576 	},
2577 	{
2578 		.procname	=	"gc_interval",
2579 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
2580 		.maxlen		=	sizeof(int),
2581 		.mode		=	0644,
2582 		.proc_handler	=	proc_dointvec_jiffies,
2583 	},
2584 	{
2585 		.procname	=	"gc_elasticity",
2586 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2587 		.maxlen		=	sizeof(int),
2588 		.mode		=	0644,
2589 		.proc_handler	=	proc_dointvec_jiffies,
2590 	},
2591 	{
2592 		.procname	=	"mtu_expires",
2593 		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2594 		.maxlen		=	sizeof(int),
2595 		.mode		=	0644,
2596 		.proc_handler	=	proc_dointvec_jiffies,
2597 	},
2598 	{
2599 		.procname	=	"min_adv_mss",
2600 		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
2601 		.maxlen		=	sizeof(int),
2602 		.mode		=	0644,
2603 		.proc_handler	=	proc_dointvec_jiffies,
2604 	},
2605 	{
2606 		.procname	=	"gc_min_interval_ms",
2607 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2608 		.maxlen		=	sizeof(int),
2609 		.mode		=	0644,
2610 		.proc_handler	=	proc_dointvec_ms_jiffies,
2611 	},
2612 	{ }
2613 };
2614 
2615 struct ctl_table *ipv6_route_sysctl_init(struct net *net)
2616 {
2617 	struct ctl_table *table;
2618 
2619 	table = kmemdup(ipv6_route_table_template,
2620 			sizeof(ipv6_route_table_template),
2621 			GFP_KERNEL);
2622 
2623 	if (table) {
2624 		table[0].data = &net->ipv6.sysctl.flush_delay;
2625 		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2626 		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2627 		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2628 		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2629 		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2630 		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2631 		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2632 		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2633 		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2634 	}
2635 
2636 	return table;
2637 }
2638 #endif
2639 
2640 static int ip6_route_net_init(struct net *net)
2641 {
2642 	int ret = -ENOMEM;
2643 
2644 	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2645 	       sizeof(net->ipv6.ip6_dst_ops));
2646 
2647 	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2648 					   sizeof(*net->ipv6.ip6_null_entry),
2649 					   GFP_KERNEL);
2650 	if (!net->ipv6.ip6_null_entry)
2651 		goto out_ip6_dst_ops;
2652 	net->ipv6.ip6_null_entry->u.dst.path =
2653 		(struct dst_entry *)net->ipv6.ip6_null_entry;
2654 	net->ipv6.ip6_null_entry->u.dst.ops = &net->ipv6.ip6_dst_ops;
2655 
2656 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2657 	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2658 					       sizeof(*net->ipv6.ip6_prohibit_entry),
2659 					       GFP_KERNEL);
2660 	if (!net->ipv6.ip6_prohibit_entry)
2661 		goto out_ip6_null_entry;
2662 	net->ipv6.ip6_prohibit_entry->u.dst.path =
2663 		(struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2664 	net->ipv6.ip6_prohibit_entry->u.dst.ops = &net->ipv6.ip6_dst_ops;
2665 
2666 	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2667 					       sizeof(*net->ipv6.ip6_blk_hole_entry),
2668 					       GFP_KERNEL);
2669 	if (!net->ipv6.ip6_blk_hole_entry)
2670 		goto out_ip6_prohibit_entry;
2671 	net->ipv6.ip6_blk_hole_entry->u.dst.path =
2672 		(struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2673 	net->ipv6.ip6_blk_hole_entry->u.dst.ops = &net->ipv6.ip6_dst_ops;
2674 #endif
2675 
2676 	net->ipv6.sysctl.flush_delay = 0;
2677 	net->ipv6.sysctl.ip6_rt_max_size = 4096;
2678 	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2679 	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2680 	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2681 	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2682 	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2683 	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2684 
2685 #ifdef CONFIG_PROC_FS
2686 	proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2687 	proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2688 #endif
2689 	net->ipv6.ip6_rt_gc_expire = 30*HZ;
2690 
2691 	ret = 0;
2692 out:
2693 	return ret;
2694 
2695 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2696 out_ip6_prohibit_entry:
2697 	kfree(net->ipv6.ip6_prohibit_entry);
2698 out_ip6_null_entry:
2699 	kfree(net->ipv6.ip6_null_entry);
2700 #endif
2701 out_ip6_dst_ops:
2702 	goto out;
2703 }
2704 
2705 static void ip6_route_net_exit(struct net *net)
2706 {
2707 #ifdef CONFIG_PROC_FS
2708 	proc_net_remove(net, "ipv6_route");
2709 	proc_net_remove(net, "rt6_stats");
2710 #endif
2711 	kfree(net->ipv6.ip6_null_entry);
2712 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2713 	kfree(net->ipv6.ip6_prohibit_entry);
2714 	kfree(net->ipv6.ip6_blk_hole_entry);
2715 #endif
2716 }
2717 
2718 static struct pernet_operations ip6_route_net_ops = {
2719 	.init = ip6_route_net_init,
2720 	.exit = ip6_route_net_exit,
2721 };
2722 
2723 static struct notifier_block ip6_route_dev_notifier = {
2724 	.notifier_call = ip6_route_dev_notify,
2725 	.priority = 0,
2726 };
2727 
2728 int __init ip6_route_init(void)
2729 {
2730 	int ret;
2731 
2732 	ret = -ENOMEM;
2733 	ip6_dst_ops_template.kmem_cachep =
2734 		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2735 				  SLAB_HWCACHE_ALIGN, NULL);
2736 	if (!ip6_dst_ops_template.kmem_cachep)
2737 		goto out;
2738 
2739 	ret = register_pernet_subsys(&ip6_route_net_ops);
2740 	if (ret)
2741 		goto out_kmem_cache;
2742 
2743 	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2744 
2745 	/* Registering of the loopback is done before this portion of code,
2746 	 * the loopback reference in rt6_info will not be taken, do it
2747 	 * manually for init_net */
2748 	init_net.ipv6.ip6_null_entry->u.dst.dev = init_net.loopback_dev;
2749 	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2750   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2751 	init_net.ipv6.ip6_prohibit_entry->u.dst.dev = init_net.loopback_dev;
2752 	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2753 	init_net.ipv6.ip6_blk_hole_entry->u.dst.dev = init_net.loopback_dev;
2754 	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2755   #endif
2756 	ret = fib6_init();
2757 	if (ret)
2758 		goto out_register_subsys;
2759 
2760 	ret = xfrm6_init();
2761 	if (ret)
2762 		goto out_fib6_init;
2763 
2764 	ret = fib6_rules_init();
2765 	if (ret)
2766 		goto xfrm6_init;
2767 
2768 	ret = -ENOBUFS;
2769 	if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL) ||
2770 	    __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL) ||
2771 	    __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL))
2772 		goto fib6_rules_init;
2773 
2774 	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
2775 	if (ret)
2776 		goto fib6_rules_init;
2777 
2778 out:
2779 	return ret;
2780 
2781 fib6_rules_init:
2782 	fib6_rules_cleanup();
2783 xfrm6_init:
2784 	xfrm6_fini();
2785 out_fib6_init:
2786 	fib6_gc_cleanup();
2787 out_register_subsys:
2788 	unregister_pernet_subsys(&ip6_route_net_ops);
2789 out_kmem_cache:
2790 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2791 	goto out;
2792 }
2793 
2794 void ip6_route_cleanup(void)
2795 {
2796 	unregister_netdevice_notifier(&ip6_route_dev_notifier);
2797 	fib6_rules_cleanup();
2798 	xfrm6_fini();
2799 	fib6_gc_cleanup();
2800 	unregister_pernet_subsys(&ip6_route_net_ops);
2801 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2802 }
2803