xref: /openbmc/linux/net/ipv6/route.c (revision 732a675a)
1 /*
2  *	Linux INET6 implementation
3  *	FIB front-end.
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	$Id: route.c,v 1.56 2001/10/31 21:55:55 davem Exp $
9  *
10  *	This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  */
15 
16 /*	Changes:
17  *
18  *	YOSHIFUJI Hideaki @USAGI
19  *		reworked default router selection.
20  *		- respect outgoing interface
21  *		- select from (probably) reachable routers (i.e.
22  *		routers in REACHABLE, STALE, DELAY or PROBE states).
23  *		- always select the same router if it is (probably)
24  *		reachable.  otherwise, round-robin the list.
25  *	Ville Nuorvala
26  *		Fixed routing subtrees.
27  */
28 
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/types.h>
32 #include <linux/times.h>
33 #include <linux/socket.h>
34 #include <linux/sockios.h>
35 #include <linux/net.h>
36 #include <linux/route.h>
37 #include <linux/netdevice.h>
38 #include <linux/in6.h>
39 #include <linux/mroute6.h>
40 #include <linux/init.h>
41 #include <linux/if_arp.h>
42 #include <linux/proc_fs.h>
43 #include <linux/seq_file.h>
44 #include <linux/nsproxy.h>
45 #include <net/net_namespace.h>
46 #include <net/snmp.h>
47 #include <net/ipv6.h>
48 #include <net/ip6_fib.h>
49 #include <net/ip6_route.h>
50 #include <net/ndisc.h>
51 #include <net/addrconf.h>
52 #include <net/tcp.h>
53 #include <linux/rtnetlink.h>
54 #include <net/dst.h>
55 #include <net/xfrm.h>
56 #include <net/netevent.h>
57 #include <net/netlink.h>
58 
59 #include <asm/uaccess.h>
60 
61 #ifdef CONFIG_SYSCTL
62 #include <linux/sysctl.h>
63 #endif
64 
65 /* Set to 3 to get tracing. */
66 #define RT6_DEBUG 2
67 
68 #if RT6_DEBUG >= 3
69 #define RDBG(x) printk x
70 #define RT6_TRACE(x...) printk(KERN_DEBUG x)
71 #else
72 #define RDBG(x)
73 #define RT6_TRACE(x...) do { ; } while (0)
74 #endif
75 
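/*
 * When non-zero, ip6_pol_route() also clones gatewayed/NONEXTHOP
 * ("off-link") routes into per-destination RTF_CACHE entries; with the
 * default of 0 such routes are used directly.
 */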
76 #define CLONE_OFFLINK_ROUTE 0
77 
78 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
79 static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
80 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
81 static void		ip6_dst_destroy(struct dst_entry *);
82 static void		ip6_dst_ifdown(struct dst_entry *,
83 				       struct net_device *dev, int how);
84 static int		 ip6_dst_gc(struct dst_ops *ops);
85 
86 static int		ip6_pkt_discard(struct sk_buff *skb);
87 static int		ip6_pkt_discard_out(struct sk_buff *skb);
88 static void		ip6_link_failure(struct sk_buff *skb);
89 static void		ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
90 
91 #ifdef CONFIG_IPV6_ROUTE_INFO
92 static struct rt6_info *rt6_add_route_info(struct net *net,
93 					   struct in6_addr *prefix, int prefixlen,
94 					   struct in6_addr *gwaddr, int ifindex,
95 					   unsigned pref);
96 static struct rt6_info *rt6_get_route_info(struct net *net,
97 					   struct in6_addr *prefix, int prefixlen,
98 					   struct in6_addr *gwaddr, int ifindex);
99 #endif
100 
101 static struct dst_ops ip6_dst_ops_template = {
102 	.family			=	AF_INET6,
103 	.protocol		=	__constant_htons(ETH_P_IPV6),
104 	.gc			=	ip6_dst_gc,
105 	.gc_thresh		=	1024,
106 	.check			=	ip6_dst_check,
107 	.destroy		=	ip6_dst_destroy,
108 	.ifdown			=	ip6_dst_ifdown,
109 	.negative_advice	=	ip6_negative_advice,
110 	.link_failure		=	ip6_link_failure,
111 	.update_pmtu		=	ip6_rt_update_pmtu,
112 	.local_out		=	__ip6_local_out,
113 	.entry_size		=	sizeof(struct rt6_info),
114 	.entries		=	ATOMIC_INIT(0),
115 };
116 
117 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
118 {
119 }
120 
121 static struct dst_ops ip6_dst_blackhole_ops = {
122 	.family			=	AF_INET6,
123 	.protocol		=	__constant_htons(ETH_P_IPV6),
124 	.destroy		=	ip6_dst_destroy,
125 	.check			=	ip6_dst_check,
126 	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
127 	.entry_size		=	sizeof(struct rt6_info),
128 	.entries		=	ATOMIC_INIT(0),
129 };
130 
131 static struct rt6_info ip6_null_entry_template = {
132 	.u = {
133 		.dst = {
134 			.__refcnt	= ATOMIC_INIT(1),
135 			.__use		= 1,
136 			.obsolete	= -1,
137 			.error		= -ENETUNREACH,
138 			.metrics	= { [RTAX_HOPLIMIT - 1] = 255, },
139 			.input		= ip6_pkt_discard,
140 			.output		= ip6_pkt_discard_out,
141 		}
142 	},
143 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
144 	.rt6i_metric	= ~(u32) 0,
145 	.rt6i_ref	= ATOMIC_INIT(1),
146 };
147 
148 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
149 
150 static int ip6_pkt_prohibit(struct sk_buff *skb);
151 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
152 
153 static struct rt6_info ip6_prohibit_entry_template = {
154 	.u = {
155 		.dst = {
156 			.__refcnt	= ATOMIC_INIT(1),
157 			.__use		= 1,
158 			.obsolete	= -1,
159 			.error		= -EACCES,
160 			.metrics	= { [RTAX_HOPLIMIT - 1] = 255, },
161 			.input		= ip6_pkt_prohibit,
162 			.output		= ip6_pkt_prohibit_out,
163 		}
164 	},
165 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
166 	.rt6i_metric	= ~(u32) 0,
167 	.rt6i_ref	= ATOMIC_INIT(1),
168 };
169 
170 static struct rt6_info ip6_blk_hole_entry_template = {
171 	.u = {
172 		.dst = {
173 			.__refcnt	= ATOMIC_INIT(1),
174 			.__use		= 1,
175 			.obsolete	= -1,
176 			.error		= -EINVAL,
177 			.metrics	= { [RTAX_HOPLIMIT - 1] = 255, },
178 			.input		= dst_discard,
179 			.output		= dst_discard,
180 		}
181 	},
182 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
183 	.rt6i_metric	= ~(u32) 0,
184 	.rt6i_ref	= ATOMIC_INIT(1),
185 };
186 
187 #endif
188 
189 /* allocate dst with ip6_dst_ops */
190 static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops)
191 {
192 	return (struct rt6_info *)dst_alloc(ops);
193 }
194 
195 static void ip6_dst_destroy(struct dst_entry *dst)
196 {
197 	struct rt6_info *rt = (struct rt6_info *)dst;
198 	struct inet6_dev *idev = rt->rt6i_idev;
199 
200 	if (idev != NULL) {
201 		rt->rt6i_idev = NULL;
202 		in6_dev_put(idev);
203 	}
204 }
205 
206 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
207 			   int how)
208 {
209 	struct rt6_info *rt = (struct rt6_info *)dst;
210 	struct inet6_dev *idev = rt->rt6i_idev;
211 	struct net_device *loopback_dev =
212 		dev_net(dev)->loopback_dev;
213 
214 	if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
215 		struct inet6_dev *loopback_idev =
216 			in6_dev_get(loopback_dev);
217 		if (loopback_idev != NULL) {
218 			rt->rt6i_idev = loopback_idev;
219 			in6_dev_put(idev);
220 		}
221 	}
222 }
223 
224 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
225 {
226 	return (rt->rt6i_flags & RTF_EXPIRES &&
227 		time_after(jiffies, rt->rt6i_expires));
228 }
229 
230 static inline int rt6_need_strict(struct in6_addr *daddr)
231 {
232 	return (ipv6_addr_type(daddr) &
233 		(IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL));
234 }
235 
236 /*
237  *	Route lookup. The caller is assumed to hold the relevant table->tb6_lock.
238  */
239 
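/*
 * rt6_device_match - walk the routes sharing this fib6_node leaf and
 * return the first one whose device matches @oif.  Routes via loopback
 * are remembered as a fallback; with a strict lookup and no match the
 * per-namespace null entry is returned instead of the head route.
 */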
240 static inline struct rt6_info *rt6_device_match(struct net *net,
241 						    struct rt6_info *rt,
242 						    int oif,
243 						    int strict)
244 {
245 	struct rt6_info *local = NULL;
246 	struct rt6_info *sprt;
247 
248 	if (oif) {
249 		for (sprt = rt; sprt; sprt = sprt->u.dst.rt6_next) {
250 			struct net_device *dev = sprt->rt6i_dev;
251 			if (dev->ifindex == oif)
252 				return sprt;
253 			if (dev->flags & IFF_LOOPBACK) {
254 				if (sprt->rt6i_idev == NULL ||
255 				    sprt->rt6i_idev->dev->ifindex != oif) {
256 					if (strict && oif)
257 						continue;
258 					if (local && (!oif ||
259 						      local->rt6i_idev->dev->ifindex == oif))
260 						continue;
261 				}
262 				local = sprt;
263 			}
264 		}
265 
266 		if (local)
267 			return local;
268 
269 		if (strict)
270 			return net->ipv6.ip6_null_entry;
271 	}
272 	return rt;
273 }
274 
275 #ifdef CONFIG_IPV6_ROUTER_PREF
276 static void rt6_probe(struct rt6_info *rt)
277 {
278 	struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
279 	/*
280 	 * Okay, this does not seem to be appropriate
281 	 * for now; however, we need to check whether it
282 	 * really is, a.k.a. Router Reachability Probing.
283 	 *
284 	 * A Router Reachability Probe MUST be rate-limited
285 	 * to no more than one per minute.
286 	 */
287 	if (!neigh || (neigh->nud_state & NUD_VALID))
288 		return;
289 	read_lock_bh(&neigh->lock);
290 	if (!(neigh->nud_state & NUD_VALID) &&
291 	    time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
292 		struct in6_addr mcaddr;
293 		struct in6_addr *target;
294 
295 		neigh->updated = jiffies;
296 		read_unlock_bh(&neigh->lock);
297 
298 		target = (struct in6_addr *)&neigh->primary_key;
299 		addrconf_addr_solict_mult(target, &mcaddr);
300 		ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
301 	} else
302 		read_unlock_bh(&neigh->lock);
303 }
304 #else
305 static inline void rt6_probe(struct rt6_info *rt)
306 {
307 	return;
308 }
309 #endif
310 
311 /*
312  * Default Router Selection (RFC 2461 6.3.6)
313  */
314 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
315 {
316 	struct net_device *dev = rt->rt6i_dev;
317 	if (!oif || dev->ifindex == oif)
318 		return 2;
319 	if ((dev->flags & IFF_LOOPBACK) &&
320 	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
321 		return 1;
322 	return 0;
323 }
324 
325 static inline int rt6_check_neigh(struct rt6_info *rt)
326 {
327 	struct neighbour *neigh = rt->rt6i_nexthop;
328 	int m;
329 	if (rt->rt6i_flags & RTF_NONEXTHOP ||
330 	    !(rt->rt6i_flags & RTF_GATEWAY))
331 		m = 1;
332 	else if (neigh) {
333 		read_lock_bh(&neigh->lock);
334 		if (neigh->nud_state & NUD_VALID)
335 			m = 2;
336 #ifdef CONFIG_IPV6_ROUTER_PREF
337 		else if (neigh->nud_state & NUD_FAILED)
338 			m = 0;
339 #endif
340 		else
341 			m = 1;
342 		read_unlock_bh(&neigh->lock);
343 	} else
344 		m = 0;
345 	return m;
346 }
347 
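/*
 * rt6_score_route - combine the interface match (rt6_check_dev), the
 * decoded router preference (with CONFIG_IPV6_ROUTER_PREF) and the
 * neighbour reachability check into a single score; -1 means the route
 * is unusable under the given strictness flags.
 */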
348 static int rt6_score_route(struct rt6_info *rt, int oif,
349 			   int strict)
350 {
351 	int m, n;
352 
353 	m = rt6_check_dev(rt, oif);
354 	if (!m && (strict & RT6_LOOKUP_F_IFACE))
355 		return -1;
356 #ifdef CONFIG_IPV6_ROUTER_PREF
357 	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
358 #endif
359 	n = rt6_check_neigh(rt);
360 	if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
361 		return -1;
362 	return m;
363 }
364 
365 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
366 				   int *mpri, struct rt6_info *match)
367 {
368 	int m;
369 
370 	if (rt6_check_expired(rt))
371 		goto out;
372 
373 	m = rt6_score_route(rt, oif, strict);
374 	if (m < 0)
375 		goto out;
376 
377 	if (m > *mpri) {
378 		if (strict & RT6_LOOKUP_F_REACHABLE)
379 			rt6_probe(match);
380 		*mpri = m;
381 		match = rt;
382 	} else if (strict & RT6_LOOKUP_F_REACHABLE) {
383 		rt6_probe(rt);
384 	}
385 
386 out:
387 	return match;
388 }
389 
390 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
391 				     struct rt6_info *rr_head,
392 				     u32 metric, int oif, int strict)
393 {
394 	struct rt6_info *rt, *match;
395 	int mpri = -1;
396 
397 	match = NULL;
398 	for (rt = rr_head; rt && rt->rt6i_metric == metric;
399 	     rt = rt->u.dst.rt6_next)
400 		match = find_match(rt, oif, strict, &mpri, match);
401 	for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
402 	     rt = rt->u.dst.rt6_next)
403 		match = find_match(rt, oif, strict, &mpri, match);
404 
405 	return match;
406 }
407 
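/*
 * rt6_select - pick the best-scoring route among the entries of equal
 * metric on this fib6_node, starting at the round-robin pointer.  If
 * nothing matched under the REACHABLE constraint, advance fn->rr_ptr so
 * the next lookup starts from a different router.
 */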
408 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
409 {
410 	struct rt6_info *match, *rt0;
411 	struct net *net;
412 
413 	RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
414 		  __func__, fn->leaf, oif);
415 
416 	rt0 = fn->rr_ptr;
417 	if (!rt0)
418 		fn->rr_ptr = rt0 = fn->leaf;
419 
420 	match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
421 
422 	if (!match &&
423 	    (strict & RT6_LOOKUP_F_REACHABLE)) {
424 		struct rt6_info *next = rt0->u.dst.rt6_next;
425 
426 		/* no entries matched; do round-robin */
427 		if (!next || next->rt6i_metric != rt0->rt6i_metric)
428 			next = fn->leaf;
429 
430 		if (next != rt0)
431 			fn->rr_ptr = next;
432 	}
433 
434 	RT6_TRACE("%s() => %p\n",
435 		  __func__, match);
436 
437 	net = dev_net(rt0->rt6i_dev);
438 	return (match ? match : net->ipv6.ip6_null_entry);
439 }
440 
441 #ifdef CONFIG_IPV6_ROUTE_INFO
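/*
 * rt6_route_rcv - handle a Route Information option received in a
 * Router Advertisement (RFC 4191): validate the option, then add,
 * refresh or delete the corresponding RTF_ROUTEINFO route.
 */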
442 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
443 		  struct in6_addr *gwaddr)
444 {
445 	struct net *net = dev_net(dev);
446 	struct route_info *rinfo = (struct route_info *) opt;
447 	struct in6_addr prefix_buf, *prefix;
448 	unsigned int pref;
449 	unsigned long lifetime;
450 	struct rt6_info *rt;
451 
452 	if (len < sizeof(struct route_info)) {
453 		return -EINVAL;
454 	}
455 
456 	/* Sanity check for prefix_len and length */
457 	if (rinfo->length > 3) {
458 		return -EINVAL;
459 	} else if (rinfo->prefix_len > 128) {
460 		return -EINVAL;
461 	} else if (rinfo->prefix_len > 64) {
462 		if (rinfo->length < 2) {
463 			return -EINVAL;
464 		}
465 	} else if (rinfo->prefix_len > 0) {
466 		if (rinfo->length < 1) {
467 			return -EINVAL;
468 		}
469 	}
470 
471 	pref = rinfo->route_pref;
472 	if (pref == ICMPV6_ROUTER_PREF_INVALID)
473 		pref = ICMPV6_ROUTER_PREF_MEDIUM;
474 
475 	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
476 
477 	if (rinfo->length == 3)
478 		prefix = (struct in6_addr *)rinfo->prefix;
479 	else {
480 		/* ipv6_addr_prefix() copies only prefix_len bits, so this is safe */
481 		ipv6_addr_prefix(&prefix_buf,
482 				 (struct in6_addr *)rinfo->prefix,
483 				 rinfo->prefix_len);
484 		prefix = &prefix_buf;
485 	}
486 
487 	rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
488 				dev->ifindex);
489 
490 	if (rt && !lifetime) {
491 		ip6_del_rt(rt);
492 		rt = NULL;
493 	}
494 
495 	if (!rt && lifetime)
496 		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
497 					pref);
498 	else if (rt)
499 		rt->rt6i_flags = RTF_ROUTEINFO |
500 				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
501 
502 	if (rt) {
503 		if (!addrconf_finite_timeout(lifetime)) {
504 			rt->rt6i_flags &= ~RTF_EXPIRES;
505 		} else {
506 			rt->rt6i_expires = jiffies + HZ * lifetime;
507 			rt->rt6i_flags |= RTF_EXPIRES;
508 		}
509 		dst_release(&rt->u.dst);
510 	}
511 	return 0;
512 }
513 #endif
514 
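/*
 * BACKTRACK is expanded inside the lookup functions: when the lookup
 * fell through to the null entry, walk back up the fib tree (consulting
 * source-routing subtrees where configured) until a node carrying route
 * info is found, then jump back to the caller's "restart" label.
 */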
515 #define BACKTRACK(__net, saddr)			\
516 do { \
517 	if (rt == __net->ipv6.ip6_null_entry) {	\
518 		struct fib6_node *pn; \
519 		while (1) { \
520 			if (fn->fn_flags & RTN_TL_ROOT) \
521 				goto out; \
522 			pn = fn->parent; \
523 			if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
524 				fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
525 			else \
526 				fn = pn; \
527 			if (fn->fn_flags & RTN_RTINFO) \
528 				goto restart; \
529 		} \
530 	} \
531 } while(0)
532 
533 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
534 					     struct fib6_table *table,
535 					     struct flowi *fl, int flags)
536 {
537 	struct fib6_node *fn;
538 	struct rt6_info *rt;
539 
540 	read_lock_bh(&table->tb6_lock);
541 	fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
542 restart:
543 	rt = fn->leaf;
544 	rt = rt6_device_match(net, rt, fl->oif, flags);
545 	BACKTRACK(net, &fl->fl6_src);
546 out:
547 	dst_use(&rt->u.dst, jiffies);
548 	read_unlock_bh(&table->tb6_lock);
549 	return rt;
550 
551 }
552 
553 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
554 			    const struct in6_addr *saddr, int oif, int strict)
555 {
556 	struct flowi fl = {
557 		.oif = oif,
558 		.nl_u = {
559 			.ip6_u = {
560 				.daddr = *daddr,
561 			},
562 		},
563 	};
564 	struct dst_entry *dst;
565 	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
566 
567 	if (saddr) {
568 		memcpy(&fl.fl6_src, saddr, sizeof(*saddr));
569 		flags |= RT6_LOOKUP_F_HAS_SADDR;
570 	}
571 
572 	dst = fib6_rule_lookup(net, &fl, flags, ip6_pol_route_lookup);
573 	if (dst->error == 0)
574 		return (struct rt6_info *) dst;
575 
576 	dst_release(dst);
577 
578 	return NULL;
579 }
580 
581 EXPORT_SYMBOL(rt6_lookup);
582 
583 /* ip6_ins_rt is called with table->tb6_lock NOT held.
584    It takes a new route entry; if the addition fails for any reason
585    the route is freed. In any case, if the caller does not hold a
586    reference, the route may be destroyed.
587  */
588 
589 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
590 {
591 	int err;
592 	struct fib6_table *table;
593 
594 	table = rt->rt6i_table;
595 	write_lock_bh(&table->tb6_lock);
596 	err = fib6_add(&table->tb6_root, rt, info);
597 	write_unlock_bh(&table->tb6_lock);
598 
599 	return err;
600 }
601 
602 int ip6_ins_rt(struct rt6_info *rt)
603 {
604 	struct nl_info info = {
605 		.nl_net = dev_net(rt->rt6i_dev),
606 	};
607 	return __ip6_ins_rt(rt, &info);
608 }
609 
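/*
 * rt6_alloc_cow - copy-on-write helper: clone @ort into a /128
 * RTF_CACHE host route for @daddr and attach a neighbour entry; for
 * non-gateway originals the destination itself becomes the gateway.
 */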
610 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
611 				      struct in6_addr *saddr)
612 {
613 	struct rt6_info *rt;
614 
615 	/*
616 	 *	Clone the route.
617 	 */
618 
619 	rt = ip6_rt_copy(ort);
620 
621 	if (rt) {
622 		if (!(rt->rt6i_flags&RTF_GATEWAY)) {
623 			if (rt->rt6i_dst.plen != 128 &&
624 			    ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
625 				rt->rt6i_flags |= RTF_ANYCAST;
626 			ipv6_addr_copy(&rt->rt6i_gateway, daddr);
627 		}
628 
629 		ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
630 		rt->rt6i_dst.plen = 128;
631 		rt->rt6i_flags |= RTF_CACHE;
632 		rt->u.dst.flags |= DST_HOST;
633 
634 #ifdef CONFIG_IPV6_SUBTREES
635 		if (rt->rt6i_src.plen && saddr) {
636 			ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
637 			rt->rt6i_src.plen = 128;
638 		}
639 #endif
640 
641 		rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
642 
643 	}
644 
645 	return rt;
646 }
647 
648 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
649 {
650 	struct rt6_info *rt = ip6_rt_copy(ort);
651 	if (rt) {
652 		ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
653 		rt->rt6i_dst.plen = 128;
654 		rt->rt6i_flags |= RTF_CACHE;
655 		rt->u.dst.flags |= DST_HOST;
656 		rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
657 	}
658 	return rt;
659 }
660 
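/*
 * ip6_pol_route - core lookup shared by the input and output paths:
 * select a route preferring reachable routers first, and clone
 * connected (no-nexthop) matches into per-destination RTF_CACHE entries
 * before returning the result with a reference held.
 */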
661 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
662 				      struct flowi *fl, int flags)
663 {
664 	struct fib6_node *fn;
665 	struct rt6_info *rt, *nrt;
666 	int strict = 0;
667 	int attempts = 3;
668 	int err;
669 	int reachable = ipv6_devconf.forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
670 
671 	strict |= flags & RT6_LOOKUP_F_IFACE;
672 
673 relookup:
674 	read_lock_bh(&table->tb6_lock);
675 
676 restart_2:
677 	fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
678 
679 restart:
680 	rt = rt6_select(fn, oif, strict | reachable);
681 
682 	BACKTRACK(net, &fl->fl6_src);
683 	if (rt == net->ipv6.ip6_null_entry ||
684 	    rt->rt6i_flags & RTF_CACHE)
685 		goto out;
686 
687 	dst_hold(&rt->u.dst);
688 	read_unlock_bh(&table->tb6_lock);
689 
690 	if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
691 		nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
692 	else {
693 #if CLONE_OFFLINK_ROUTE
694 		nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
695 #else
696 		goto out2;
697 #endif
698 	}
699 
700 	dst_release(&rt->u.dst);
701 	rt = nrt ? : net->ipv6.ip6_null_entry;
702 
703 	dst_hold(&rt->u.dst);
704 	if (nrt) {
705 		err = ip6_ins_rt(nrt);
706 		if (!err)
707 			goto out2;
708 	}
709 
710 	if (--attempts <= 0)
711 		goto out2;
712 
713 	/*
714 	 * Race condition! In the gap while table->tb6_lock was
715 	 * released, someone could have inserted this route.  Relookup.
716 	 */
717 	dst_release(&rt->u.dst);
718 	goto relookup;
719 
720 out:
721 	if (reachable) {
722 		reachable = 0;
723 		goto restart_2;
724 	}
725 	dst_hold(&rt->u.dst);
726 	read_unlock_bh(&table->tb6_lock);
727 out2:
728 	rt->u.dst.lastuse = jiffies;
729 	rt->u.dst.__use++;
730 
731 	return rt;
732 }
733 
734 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
735 					    struct flowi *fl, int flags)
736 {
737 	return ip6_pol_route(net, table, fl->iif, fl, flags);
738 }
739 
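/*
 * Route an incoming packet: build a flowi from the IPv6 header and
 * attach the looked-up dst to the skb.  Link-local and multicast
 * destinations force a strict interface match.
 */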
740 void ip6_route_input(struct sk_buff *skb)
741 {
742 	struct ipv6hdr *iph = ipv6_hdr(skb);
743 	struct net *net = dev_net(skb->dev);
744 	int flags = RT6_LOOKUP_F_HAS_SADDR;
745 	struct flowi fl = {
746 		.iif = skb->dev->ifindex,
747 		.nl_u = {
748 			.ip6_u = {
749 				.daddr = iph->daddr,
750 				.saddr = iph->saddr,
751 				.flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
752 			},
753 		},
754 		.mark = skb->mark,
755 		.proto = iph->nexthdr,
756 	};
757 
758 	if (rt6_need_strict(&iph->daddr))
759 		flags |= RT6_LOOKUP_F_IFACE;
760 
761 	skb->dst = fib6_rule_lookup(net, &fl, flags, ip6_pol_route_input);
762 }
763 
764 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
765 					     struct flowi *fl, int flags)
766 {
767 	return ip6_pol_route(net, table, fl->oif, fl, flags);
768 }
769 
770 struct dst_entry * ip6_route_output(struct net *net, struct sock *sk,
771 				    struct flowi *fl)
772 {
773 	int flags = 0;
774 
775 	if (rt6_need_strict(&fl->fl6_dst))
776 		flags |= RT6_LOOKUP_F_IFACE;
777 
778 	if (!ipv6_addr_any(&fl->fl6_src))
779 		flags |= RT6_LOOKUP_F_HAS_SADDR;
780 	else if (sk) {
781 		unsigned int prefs = inet6_sk(sk)->srcprefs;
782 		if (prefs & IPV6_PREFER_SRC_TMP)
783 			flags |= RT6_LOOKUP_F_SRCPREF_TMP;
784 		if (prefs & IPV6_PREFER_SRC_PUBLIC)
785 			flags |= RT6_LOOKUP_F_SRCPREF_PUBLIC;
786 		if (prefs & IPV6_PREFER_SRC_COA)
787 			flags |= RT6_LOOKUP_F_SRCPREF_COA;
788 	}
789 
790 	return fib6_rule_lookup(net, fl, flags, ip6_pol_route_output);
791 }
792 
793 EXPORT_SYMBOL(ip6_route_output);
794 
795 int ip6_dst_blackhole(struct sock *sk, struct dst_entry **dstp, struct flowi *fl)
796 {
797 	struct rt6_info *ort = (struct rt6_info *) *dstp;
798 	struct rt6_info *rt = (struct rt6_info *)
799 		dst_alloc(&ip6_dst_blackhole_ops);
800 	struct dst_entry *new = NULL;
801 
802 	if (rt) {
803 		new = &rt->u.dst;
804 
805 		atomic_set(&new->__refcnt, 1);
806 		new->__use = 1;
807 		new->input = dst_discard;
808 		new->output = dst_discard;
809 
810 		memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
811 		new->dev = ort->u.dst.dev;
812 		if (new->dev)
813 			dev_hold(new->dev);
814 		rt->rt6i_idev = ort->rt6i_idev;
815 		if (rt->rt6i_idev)
816 			in6_dev_hold(rt->rt6i_idev);
817 		rt->rt6i_expires = 0;
818 
819 		ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
820 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
821 		rt->rt6i_metric = 0;
822 
823 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
824 #ifdef CONFIG_IPV6_SUBTREES
825 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
826 #endif
827 
828 		dst_free(new);
829 	}
830 
831 	dst_release(*dstp);
832 	*dstp = new;
833 	return (new ? 0 : -ENOMEM);
834 }
835 EXPORT_SYMBOL_GPL(ip6_dst_blackhole);
836 
837 /*
838  *	Destination cache support functions
839  */
840 
841 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
842 {
843 	struct rt6_info *rt;
844 
845 	rt = (struct rt6_info *) dst;
846 
847 	if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
848 		return dst;
849 
850 	return NULL;
851 }
852 
853 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
854 {
855 	struct rt6_info *rt = (struct rt6_info *) dst;
856 
857 	if (rt) {
858 		if (rt->rt6i_flags & RTF_CACHE)
859 			ip6_del_rt(rt);
860 		else
861 			dst_release(dst);
862 	}
863 	return NULL;
864 }
865 
866 static void ip6_link_failure(struct sk_buff *skb)
867 {
868 	struct rt6_info *rt;
869 
870 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
871 
872 	rt = (struct rt6_info *) skb->dst;
873 	if (rt) {
874 		if (rt->rt6i_flags&RTF_CACHE) {
875 			dst_set_expires(&rt->u.dst, 0);
876 			rt->rt6i_flags |= RTF_EXPIRES;
877 		} else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
878 			rt->rt6i_node->fn_sernum = -1;
879 	}
880 }
881 
882 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
883 {
884 	struct rt6_info *rt6 = (struct rt6_info*)dst;
885 
886 	if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
887 		rt6->rt6i_flags |= RTF_MODIFIED;
888 		if (mtu < IPV6_MIN_MTU) {
889 			mtu = IPV6_MIN_MTU;
890 			dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
891 		}
892 		dst->metrics[RTAX_MTU-1] = mtu;
893 		call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
894 	}
895 }
896 
897 static int ipv6_get_mtu(struct net_device *dev);
898 
899 static inline unsigned int ipv6_advmss(struct net *net, unsigned int mtu)
900 {
901 	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
902 
903 	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
904 		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
905 
906 	/*
907 	 * The maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and the
908 	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
909 	 * IPV6_MAXPLEN is also valid and means: "any MSS,
910 	 * rely only on PMTU discovery".
911 	 */
912 	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
913 		mtu = IPV6_MAXPLEN;
914 	return mtu;
915 }
916 
917 static struct dst_entry *icmp6_dst_gc_list;
918 static DEFINE_SPINLOCK(icmp6_dst_lock);
919 
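/*
 * icmp6_dst_alloc - allocate a dst that never enters the FIB; it is
 * chained on icmp6_dst_gc_list and reclaimed by icmp6_dst_gc() once its
 * refcount drops to zero.  Used by ndisc for its replies.
 */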
920 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
921 				  struct neighbour *neigh,
922 				  const struct in6_addr *addr)
923 {
924 	struct rt6_info *rt;
925 	struct inet6_dev *idev = in6_dev_get(dev);
926 	struct net *net = dev_net(dev);
927 
928 	if (unlikely(idev == NULL))
929 		return NULL;
930 
931 	rt = ip6_dst_alloc(net->ipv6.ip6_dst_ops);
932 	if (unlikely(rt == NULL)) {
933 		in6_dev_put(idev);
934 		goto out;
935 	}
936 
937 	dev_hold(dev);
938 	if (neigh)
939 		neigh_hold(neigh);
940 	else
941 		neigh = ndisc_get_neigh(dev, addr);
942 
943 	rt->rt6i_dev	  = dev;
944 	rt->rt6i_idev     = idev;
945 	rt->rt6i_nexthop  = neigh;
946 	atomic_set(&rt->u.dst.__refcnt, 1);
947 	rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
948 	rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
949 	rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->u.dst));
950 	rt->u.dst.output  = ip6_output;
951 
952 #if 0	/* there's no chance to use these for ndisc */
953 	rt->u.dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
954 				? DST_HOST
955 				: 0;
956 	ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
957 	rt->rt6i_dst.plen = 128;
958 #endif
959 
960 	spin_lock_bh(&icmp6_dst_lock);
961 	rt->u.dst.next = icmp6_dst_gc_list;
962 	icmp6_dst_gc_list = &rt->u.dst;
963 	spin_unlock_bh(&icmp6_dst_lock);
964 
965 	fib6_force_start_gc(net);
966 
967 out:
968 	return &rt->u.dst;
969 }
970 
971 int icmp6_dst_gc(int *more)
972 {
973 	struct dst_entry *dst, *next, **pprev;
974 	int freed;
975 
976 	next = NULL;
977 	freed = 0;
978 
979 	spin_lock_bh(&icmp6_dst_lock);
980 	pprev = &icmp6_dst_gc_list;
981 
982 	while ((dst = *pprev) != NULL) {
983 		if (!atomic_read(&dst->__refcnt)) {
984 			*pprev = dst->next;
985 			dst_free(dst);
986 			freed++;
987 		} else {
988 			pprev = &dst->next;
989 			(*more)++;
990 		}
991 	}
992 
993 	spin_unlock_bh(&icmp6_dst_lock);
994 
995 	return freed;
996 }
997 
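/*
 * ip6_dst_gc - dst_ops garbage collector: unless the minimum interval
 * has not yet elapsed and the table is within ip6_rt_max_size, run
 * fib6_run_gc() with an adaptive expiry, then report whether the table
 * is still over the limit.
 */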
998 static int ip6_dst_gc(struct dst_ops *ops)
999 {
1000 	unsigned long now = jiffies;
1001 	struct net *net = ops->dst_net;
1002 	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1003 	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1004 	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1005 	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1006 	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1007 
1008 	if (time_after(rt_last_gc + rt_min_interval, now) &&
1009 	    atomic_read(&ops->entries) <= rt_max_size)
1010 		goto out;
1011 
1012 	net->ipv6.ip6_rt_gc_expire++;
1013 	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1014 	net->ipv6.ip6_rt_last_gc = now;
1015 	if (atomic_read(&ops->entries) < ops->gc_thresh)
1016 		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1017 out:
1018 	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1019 	return (atomic_read(&ops->entries) > rt_max_size);
1020 }
1021 
1022 /* Clean the host part of a prefix. Not necessary in a radix tree,
1023    but it results in cleaner routing tables.
1024 
1025    Remove this only once everything else works!
1026  */
1027 
1028 static int ipv6_get_mtu(struct net_device *dev)
1029 {
1030 	int mtu = IPV6_MIN_MTU;
1031 	struct inet6_dev *idev;
1032 
1033 	idev = in6_dev_get(dev);
1034 	if (idev) {
1035 		mtu = idev->cnf.mtu6;
1036 		in6_dev_put(idev);
1037 	}
1038 	return mtu;
1039 }
1040 
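/*
 * Hop limit for a dst: the RTAX_HOPLIMIT metric if set, otherwise the
 * egress device's configured hop limit, falling back to the global
 * ipv6_devconf default.
 */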
1041 int ip6_dst_hoplimit(struct dst_entry *dst)
1042 {
1043 	int hoplimit = dst_metric(dst, RTAX_HOPLIMIT);
1044 	if (hoplimit < 0) {
1045 		struct net_device *dev = dst->dev;
1046 		struct inet6_dev *idev = in6_dev_get(dev);
1047 		if (idev) {
1048 			hoplimit = idev->cnf.hop_limit;
1049 			in6_dev_put(idev);
1050 		} else
1051 			hoplimit = ipv6_devconf.hop_limit;
1052 	}
1053 	return hoplimit;
1054 }
1055 
1056 /*
1057  *	Add a route described by a fib6_config to the FIB.
1058  */
1059 
1060 int ip6_route_add(struct fib6_config *cfg)
1061 {
1062 	int err;
1063 	struct net *net = cfg->fc_nlinfo.nl_net;
1064 	struct rt6_info *rt = NULL;
1065 	struct net_device *dev = NULL;
1066 	struct inet6_dev *idev = NULL;
1067 	struct fib6_table *table;
1068 	int addr_type;
1069 
1070 	if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1071 		return -EINVAL;
1072 #ifndef CONFIG_IPV6_SUBTREES
1073 	if (cfg->fc_src_len)
1074 		return -EINVAL;
1075 #endif
1076 	if (cfg->fc_ifindex) {
1077 		err = -ENODEV;
1078 		dev = dev_get_by_index(net, cfg->fc_ifindex);
1079 		if (!dev)
1080 			goto out;
1081 		idev = in6_dev_get(dev);
1082 		if (!idev)
1083 			goto out;
1084 	}
1085 
1086 	if (cfg->fc_metric == 0)
1087 		cfg->fc_metric = IP6_RT_PRIO_USER;
1088 
1089 	table = fib6_new_table(net, cfg->fc_table);
1090 	if (table == NULL) {
1091 		err = -ENOBUFS;
1092 		goto out;
1093 	}
1094 
1095 	rt = ip6_dst_alloc(net->ipv6.ip6_dst_ops);
1096 
1097 	if (rt == NULL) {
1098 		err = -ENOMEM;
1099 		goto out;
1100 	}
1101 
1102 	rt->u.dst.obsolete = -1;
1103 	rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
1104 				jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1105 				0;
1106 
1107 	if (cfg->fc_protocol == RTPROT_UNSPEC)
1108 		cfg->fc_protocol = RTPROT_BOOT;
1109 	rt->rt6i_protocol = cfg->fc_protocol;
1110 
1111 	addr_type = ipv6_addr_type(&cfg->fc_dst);
1112 
1113 	if (addr_type & IPV6_ADDR_MULTICAST)
1114 		rt->u.dst.input = ip6_mc_input;
1115 	else
1116 		rt->u.dst.input = ip6_forward;
1117 
1118 	rt->u.dst.output = ip6_output;
1119 
1120 	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1121 	rt->rt6i_dst.plen = cfg->fc_dst_len;
1122 	if (rt->rt6i_dst.plen == 128)
1123 	       rt->u.dst.flags = DST_HOST;
1124 
1125 #ifdef CONFIG_IPV6_SUBTREES
1126 	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1127 	rt->rt6i_src.plen = cfg->fc_src_len;
1128 #endif
1129 
1130 	rt->rt6i_metric = cfg->fc_metric;
1131 
1132 	/* We cannot add true routes via loopback here;
1133 	   they would result in kernel looping.  Promote them to reject routes.
1134 	 */
1135 	if ((cfg->fc_flags & RTF_REJECT) ||
1136 	    (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
1137 		/* hold loopback dev/idev if we haven't done so. */
1138 		if (dev != net->loopback_dev) {
1139 			if (dev) {
1140 				dev_put(dev);
1141 				in6_dev_put(idev);
1142 			}
1143 			dev = net->loopback_dev;
1144 			dev_hold(dev);
1145 			idev = in6_dev_get(dev);
1146 			if (!idev) {
1147 				err = -ENODEV;
1148 				goto out;
1149 			}
1150 		}
1151 		rt->u.dst.output = ip6_pkt_discard_out;
1152 		rt->u.dst.input = ip6_pkt_discard;
1153 		rt->u.dst.error = -ENETUNREACH;
1154 		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1155 		goto install_route;
1156 	}
1157 
1158 	if (cfg->fc_flags & RTF_GATEWAY) {
1159 		struct in6_addr *gw_addr;
1160 		int gwa_type;
1161 
1162 		gw_addr = &cfg->fc_gateway;
1163 		ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1164 		gwa_type = ipv6_addr_type(gw_addr);
1165 
1166 		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1167 			struct rt6_info *grt;
1168 
1169 			/* IPv6 strictly inhibits using non-link-local
1170 			   addresses as nexthop addresses.
1171 			   Otherwise, the router will not be able to send redirects.
1172 			   That is a good thing, but in some (rare!) circumstances
1173 			   (SIT, PtP, NBMA NOARP links) it is handy to allow
1174 			   some exceptions. --ANK
1175 			 */
1176 			err = -EINVAL;
1177 			if (!(gwa_type&IPV6_ADDR_UNICAST))
1178 				goto out;
1179 
1180 			grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1181 
1182 			err = -EHOSTUNREACH;
1183 			if (grt == NULL)
1184 				goto out;
1185 			if (dev) {
1186 				if (dev != grt->rt6i_dev) {
1187 					dst_release(&grt->u.dst);
1188 					goto out;
1189 				}
1190 			} else {
1191 				dev = grt->rt6i_dev;
1192 				idev = grt->rt6i_idev;
1193 				dev_hold(dev);
1194 				in6_dev_hold(grt->rt6i_idev);
1195 			}
1196 			if (!(grt->rt6i_flags&RTF_GATEWAY))
1197 				err = 0;
1198 			dst_release(&grt->u.dst);
1199 
1200 			if (err)
1201 				goto out;
1202 		}
1203 		err = -EINVAL;
1204 		if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1205 			goto out;
1206 	}
1207 
1208 	err = -ENODEV;
1209 	if (dev == NULL)
1210 		goto out;
1211 
1212 	if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1213 		rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1214 		if (IS_ERR(rt->rt6i_nexthop)) {
1215 			err = PTR_ERR(rt->rt6i_nexthop);
1216 			rt->rt6i_nexthop = NULL;
1217 			goto out;
1218 		}
1219 	}
1220 
1221 	rt->rt6i_flags = cfg->fc_flags;
1222 
1223 install_route:
1224 	if (cfg->fc_mx) {
1225 		struct nlattr *nla;
1226 		int remaining;
1227 
1228 		nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1229 			int type = nla_type(nla);
1230 
1231 			if (type) {
1232 				if (type > RTAX_MAX) {
1233 					err = -EINVAL;
1234 					goto out;
1235 				}
1236 
1237 				rt->u.dst.metrics[type - 1] = nla_get_u32(nla);
1238 			}
1239 		}
1240 	}
1241 
1242 	if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
1243 		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1244 	if (!dst_metric(&rt->u.dst, RTAX_MTU))
1245 		rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1246 	if (!dst_metric(&rt->u.dst, RTAX_ADVMSS))
1247 		rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->u.dst));
1248 	rt->u.dst.dev = dev;
1249 	rt->rt6i_idev = idev;
1250 	rt->rt6i_table = table;
1251 
1252 	cfg->fc_nlinfo.nl_net = dev_net(dev);
1253 
1254 	return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1255 
1256 out:
1257 	if (dev)
1258 		dev_put(dev);
1259 	if (idev)
1260 		in6_dev_put(idev);
1261 	if (rt)
1262 		dst_free(&rt->u.dst);
1263 	return err;
1264 }
1265 
1266 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1267 {
1268 	int err;
1269 	struct fib6_table *table;
1270 	struct net *net = dev_net(rt->rt6i_dev);
1271 
1272 	if (rt == net->ipv6.ip6_null_entry)
1273 		return -ENOENT;
1274 
1275 	table = rt->rt6i_table;
1276 	write_lock_bh(&table->tb6_lock);
1277 
1278 	err = fib6_del(rt, info);
1279 	dst_release(&rt->u.dst);
1280 
1281 	write_unlock_bh(&table->tb6_lock);
1282 
1283 	return err;
1284 }
1285 
1286 int ip6_del_rt(struct rt6_info *rt)
1287 {
1288 	struct nl_info info = {
1289 		.nl_net = dev_net(rt->rt6i_dev),
1290 	};
1291 	return __ip6_del_rt(rt, &info);
1292 }
1293 
1294 static int ip6_route_del(struct fib6_config *cfg)
1295 {
1296 	struct fib6_table *table;
1297 	struct fib6_node *fn;
1298 	struct rt6_info *rt;
1299 	int err = -ESRCH;
1300 
1301 	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1302 	if (table == NULL)
1303 		return err;
1304 
1305 	read_lock_bh(&table->tb6_lock);
1306 
1307 	fn = fib6_locate(&table->tb6_root,
1308 			 &cfg->fc_dst, cfg->fc_dst_len,
1309 			 &cfg->fc_src, cfg->fc_src_len);
1310 
1311 	if (fn) {
1312 		for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1313 			if (cfg->fc_ifindex &&
1314 			    (rt->rt6i_dev == NULL ||
1315 			     rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1316 				continue;
1317 			if (cfg->fc_flags & RTF_GATEWAY &&
1318 			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1319 				continue;
1320 			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1321 				continue;
1322 			dst_hold(&rt->u.dst);
1323 			read_unlock_bh(&table->tb6_lock);
1324 
1325 			return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1326 		}
1327 	}
1328 	read_unlock_bh(&table->tb6_lock);
1329 
1330 	return err;
1331 }
1332 
1333 /*
1334  *	Handle redirects
1335  */
1336 struct ip6rd_flowi {
1337 	struct flowi fl;
1338 	struct in6_addr gateway;
1339 };
1340 
1341 static struct rt6_info *__ip6_route_redirect(struct net *net,
1342 					     struct fib6_table *table,
1343 					     struct flowi *fl,
1344 					     int flags)
1345 {
1346 	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl;
1347 	struct rt6_info *rt;
1348 	struct fib6_node *fn;
1349 
1350 	/*
1351 	 * Get the "current" route for this destination and
1352 	 * check whether the redirect has come from an appropriate router.
1353 	 *
1354 	 * RFC 2461 specifies that redirects should only be
1355 	 * accepted if they come from the nexthop to the target.
1356 	 * Due to the way the routes are chosen, this notion
1357 	 * is a bit fuzzy and one might need to check all possible
1358 	 * routes.
1359 	 */
1360 
1361 	read_lock_bh(&table->tb6_lock);
1362 	fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
1363 restart:
1364 	for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1365 		/*
1366 		 * Current route is on-link; redirect is always invalid.
1367 		 *
1368 		 * It seems the previous statement is not quite true: the target
1369 		 * could be a node that regards us as on-link (e.g. proxy ndisc),
1370 		 * but then the router serving it might decide that we should
1371 		 * learn the truth 8)8) --ANK (980726).
1372 		 */
1373 		if (rt6_check_expired(rt))
1374 			continue;
1375 		if (!(rt->rt6i_flags & RTF_GATEWAY))
1376 			continue;
1377 		if (fl->oif != rt->rt6i_dev->ifindex)
1378 			continue;
1379 		if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1380 			continue;
1381 		break;
1382 	}
1383 
1384 	if (!rt)
1385 		rt = net->ipv6.ip6_null_entry;
1386 	BACKTRACK(net, &fl->fl6_src);
1387 out:
1388 	dst_hold(&rt->u.dst);
1389 
1390 	read_unlock_bh(&table->tb6_lock);
1391 
1392 	return rt;
1393 };
1394 
1395 static struct rt6_info *ip6_route_redirect(struct in6_addr *dest,
1396 					   struct in6_addr *src,
1397 					   struct in6_addr *gateway,
1398 					   struct net_device *dev)
1399 {
1400 	int flags = RT6_LOOKUP_F_HAS_SADDR;
1401 	struct net *net = dev_net(dev);
1402 	struct ip6rd_flowi rdfl = {
1403 		.fl = {
1404 			.oif = dev->ifindex,
1405 			.nl_u = {
1406 				.ip6_u = {
1407 					.daddr = *dest,
1408 					.saddr = *src,
1409 				},
1410 			},
1411 		},
1412 		.gateway = *gateway,
1413 	};
1414 
1415 	if (rt6_need_strict(dest))
1416 		flags |= RT6_LOOKUP_F_IFACE;
1417 
1418 	return (struct rt6_info *)fib6_rule_lookup(net, (struct flowi *)&rdfl,
1419 						   flags, __ip6_route_redirect);
1420 }
1421 
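/*
 * rt6_redirect - handle an ICMPv6 redirect that passed validation:
 * update the neighbour cache entry for the new first hop, install a
 * /128 RTF_CACHE route through it, and remove the old cached route if
 * there was one.
 */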
1422 void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
1423 		  struct in6_addr *saddr,
1424 		  struct neighbour *neigh, u8 *lladdr, int on_link)
1425 {
1426 	struct rt6_info *rt, *nrt = NULL;
1427 	struct netevent_redirect netevent;
1428 	struct net *net = dev_net(neigh->dev);
1429 
1430 	rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1431 
1432 	if (rt == net->ipv6.ip6_null_entry) {
1433 		if (net_ratelimit())
1434 			printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1435 			       "for redirect target\n");
1436 		goto out;
1437 	}
1438 
1439 	/*
1440 	 *	We have finally decided to accept it.
1441 	 */
1442 
1443 	neigh_update(neigh, lladdr, NUD_STALE,
1444 		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
1445 		     NEIGH_UPDATE_F_OVERRIDE|
1446 		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1447 				     NEIGH_UPDATE_F_ISROUTER))
1448 		     );
1449 
1450 	/*
1451 	 * Redirect received -> path was valid.
1452 	 * Look, redirects are sent only in response to data packets,
1453 	 * so this nexthop apparently is reachable. --ANK
1454 	 */
1455 	dst_confirm(&rt->u.dst);
1456 
1457 	/* Duplicate redirect: silently ignore. */
1458 	if (neigh == rt->u.dst.neighbour)
1459 		goto out;
1460 
1461 	nrt = ip6_rt_copy(rt);
1462 	if (nrt == NULL)
1463 		goto out;
1464 
1465 	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1466 	if (on_link)
1467 		nrt->rt6i_flags &= ~RTF_GATEWAY;
1468 
1469 	ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1470 	nrt->rt6i_dst.plen = 128;
1471 	nrt->u.dst.flags |= DST_HOST;
1472 
1473 	ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1474 	nrt->rt6i_nexthop = neigh_clone(neigh);
1475 	/* Reset pmtu, it may be better */
1476 	nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1477 	nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dev_net(neigh->dev),
1478 							dst_mtu(&nrt->u.dst));
1479 
1480 	if (ip6_ins_rt(nrt))
1481 		goto out;
1482 
1483 	netevent.old = &rt->u.dst;
1484 	netevent.new = &nrt->u.dst;
1485 	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1486 
1487 	if (rt->rt6i_flags&RTF_CACHE) {
1488 		ip6_del_rt(rt);
1489 		return;
1490 	}
1491 
1492 out:
1493 	dst_release(&rt->u.dst);
1494 	return;
1495 }
1496 
1497 /*
1498  *	Handle ICMP "packet too big" messages
1499  *	i.e. Path MTU discovery
1500  */
1501 
1502 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1503 			struct net_device *dev, u32 pmtu)
1504 {
1505 	struct rt6_info *rt, *nrt;
1506 	struct net *net = dev_net(dev);
1507 	int allfrag = 0;
1508 
1509 	rt = rt6_lookup(net, daddr, saddr, dev->ifindex, 0);
1510 	if (rt == NULL)
1511 		return;
1512 
1513 	if (pmtu >= dst_mtu(&rt->u.dst))
1514 		goto out;
1515 
1516 	if (pmtu < IPV6_MIN_MTU) {
1517 		/*
1518 		 * According to RFC 2460, after a node receives a Packet Too Big
1519 		 * message reporting a PMTU below the IPv6 Minimum Link MTU (1280),
1520 		 * the PMTU is set to that minimum and a Fragment header should
1521 		 * always be included.
1522 		 */
1523 		pmtu = IPV6_MIN_MTU;
1524 		allfrag = 1;
1525 	}
1526 
1527 	/* New mtu received -> path was valid.
1528 	/* New MTU received -> the path was valid.
1529 	   Packet Too Big messages are sent only in response to data packets,
1530 	   so this nexthop apparently is reachable. --ANK
1531 	dst_confirm(&rt->u.dst);
1532 
1533 	/* Host route. If it is static, it would be better
1534 	   not to override it but to add a new one, so that
1535 	   when the cache entry expires the old pmtu
1536 	   is restored automatically.
1537 	 */
1538 	if (rt->rt6i_flags & RTF_CACHE) {
1539 		rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1540 		if (allfrag)
1541 			rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1542 		dst_set_expires(&rt->u.dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1543 		rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1544 		goto out;
1545 	}
1546 
1547 	/* Network route.
1548 	   Two cases are possible:
1549 	   1. It is a connected route. Action: COW it.
1550 	   2. It is a gatewayed or NONEXTHOP route. Action: clone it.
1551 	 */
1552 	if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1553 		nrt = rt6_alloc_cow(rt, daddr, saddr);
1554 	else
1555 		nrt = rt6_alloc_clone(rt, daddr);
1556 
1557 	if (nrt) {
1558 		nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1559 		if (allfrag)
1560 			nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1561 
1562 		/* According to RFC 1981, probing for a PMTU increase shouldn't
1563 		 * happen within 5 minutes; the recommended timer is 10 minutes.
1564 		 * Here the route expiration time is set to ip6_rt_mtu_expires,
1565 		 * which is 10 minutes. After 10 minutes the decreased pmtu expires
1566 		 * and detection of a PMTU increase happens automatically.
1567 		 */
1568 		dst_set_expires(&nrt->u.dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1569 		nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1570 
1571 		ip6_ins_rt(nrt);
1572 	}
1573 out:
1574 	dst_release(&rt->u.dst);
1575 }
1576 
1577 /*
1578  *	Misc support functions
1579  */
1580 
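/*
 * ip6_rt_copy - duplicate a route (metrics, device, idev, gateway and
 * keys) without its expiry state; callers adjust the copy before
 * inserting it.
 */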
1581 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1582 {
1583 	struct net *net = dev_net(ort->rt6i_dev);
1584 	struct rt6_info *rt = ip6_dst_alloc(net->ipv6.ip6_dst_ops);
1585 
1586 	if (rt) {
1587 		rt->u.dst.input = ort->u.dst.input;
1588 		rt->u.dst.output = ort->u.dst.output;
1589 
1590 		memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1591 		rt->u.dst.error = ort->u.dst.error;
1592 		rt->u.dst.dev = ort->u.dst.dev;
1593 		if (rt->u.dst.dev)
1594 			dev_hold(rt->u.dst.dev);
1595 		rt->rt6i_idev = ort->rt6i_idev;
1596 		if (rt->rt6i_idev)
1597 			in6_dev_hold(rt->rt6i_idev);
1598 		rt->u.dst.lastuse = jiffies;
1599 		rt->rt6i_expires = 0;
1600 
1601 		ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1602 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1603 		rt->rt6i_metric = 0;
1604 
1605 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1606 #ifdef CONFIG_IPV6_SUBTREES
1607 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1608 #endif
1609 		rt->rt6i_table = ort->rt6i_table;
1610 	}
1611 	return rt;
1612 }
1613 
1614 #ifdef CONFIG_IPV6_ROUTE_INFO
1615 static struct rt6_info *rt6_get_route_info(struct net *net,
1616 					   struct in6_addr *prefix, int prefixlen,
1617 					   struct in6_addr *gwaddr, int ifindex)
1618 {
1619 	struct fib6_node *fn;
1620 	struct rt6_info *rt = NULL;
1621 	struct fib6_table *table;
1622 
1623 	table = fib6_get_table(net, RT6_TABLE_INFO);
1624 	if (table == NULL)
1625 		return NULL;
1626 
1627 	write_lock_bh(&table->tb6_lock);
1628 	fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1629 	if (!fn)
1630 		goto out;
1631 
1632 	for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1633 		if (rt->rt6i_dev->ifindex != ifindex)
1634 			continue;
1635 		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1636 			continue;
1637 		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1638 			continue;
1639 		dst_hold(&rt->u.dst);
1640 		break;
1641 	}
1642 out:
1643 	write_unlock_bh(&table->tb6_lock);
1644 	return rt;
1645 }
1646 
1647 static struct rt6_info *rt6_add_route_info(struct net *net,
1648 					   struct in6_addr *prefix, int prefixlen,
1649 					   struct in6_addr *gwaddr, int ifindex,
1650 					   unsigned pref)
1651 {
1652 	struct fib6_config cfg = {
1653 		.fc_table	= RT6_TABLE_INFO,
1654 		.fc_metric	= IP6_RT_PRIO_USER,
1655 		.fc_ifindex	= ifindex,
1656 		.fc_dst_len	= prefixlen,
1657 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1658 				  RTF_UP | RTF_PREF(pref),
1659 		.fc_nlinfo.pid = 0,
1660 		.fc_nlinfo.nlh = NULL,
1661 		.fc_nlinfo.nl_net = net,
1662 	};
1663 
1664 	ipv6_addr_copy(&cfg.fc_dst, prefix);
1665 	ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1666 
1667 	/* We should treat it as a default route if prefix length is 0. */
1668 	if (!prefixlen)
1669 		cfg.fc_flags |= RTF_DEFAULT;
1670 
1671 	ip6_route_add(&cfg);
1672 
1673 	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1674 }
1675 #endif
1676 
1677 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1678 {
1679 	struct rt6_info *rt;
1680 	struct fib6_table *table;
1681 
1682 	table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1683 	if (table == NULL)
1684 		return NULL;
1685 
1686 	write_lock_bh(&table->tb6_lock);
1687 	for (rt = table->tb6_root.leaf; rt; rt=rt->u.dst.rt6_next) {
1688 		if (dev == rt->rt6i_dev &&
1689 		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1690 		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
1691 			break;
1692 	}
1693 	if (rt)
1694 		dst_hold(&rt->u.dst);
1695 	write_unlock_bh(&table->tb6_lock);
1696 	return rt;
1697 }
1698 
1699 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1700 				     struct net_device *dev,
1701 				     unsigned int pref)
1702 {
1703 	struct fib6_config cfg = {
1704 		.fc_table	= RT6_TABLE_DFLT,
1705 		.fc_metric	= IP6_RT_PRIO_USER,
1706 		.fc_ifindex	= dev->ifindex,
1707 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1708 				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1709 		.fc_nlinfo.pid = 0,
1710 		.fc_nlinfo.nlh = NULL,
1711 		.fc_nlinfo.nl_net = dev_net(dev),
1712 	};
1713 
1714 	ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1715 
1716 	ip6_route_add(&cfg);
1717 
1718 	return rt6_get_dflt_router(gwaddr, dev);
1719 }
1720 
1721 void rt6_purge_dflt_routers(struct net *net)
1722 {
1723 	struct rt6_info *rt;
1724 	struct fib6_table *table;
1725 
1726 	/* NOTE: Keep consistent with rt6_get_dflt_router */
1727 	table = fib6_get_table(net, RT6_TABLE_DFLT);
1728 	if (table == NULL)
1729 		return;
1730 
1731 restart:
1732 	read_lock_bh(&table->tb6_lock);
1733 	for (rt = table->tb6_root.leaf; rt; rt = rt->u.dst.rt6_next) {
1734 		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1735 			dst_hold(&rt->u.dst);
1736 			read_unlock_bh(&table->tb6_lock);
1737 			ip6_del_rt(rt);
1738 			goto restart;
1739 		}
1740 	}
1741 	read_unlock_bh(&table->tb6_lock);
1742 }
1743 
1744 static void rtmsg_to_fib6_config(struct net *net,
1745 				 struct in6_rtmsg *rtmsg,
1746 				 struct fib6_config *cfg)
1747 {
1748 	memset(cfg, 0, sizeof(*cfg));
1749 
1750 	cfg->fc_table = RT6_TABLE_MAIN;
1751 	cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1752 	cfg->fc_metric = rtmsg->rtmsg_metric;
1753 	cfg->fc_expires = rtmsg->rtmsg_info;
1754 	cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1755 	cfg->fc_src_len = rtmsg->rtmsg_src_len;
1756 	cfg->fc_flags = rtmsg->rtmsg_flags;
1757 
1758 	cfg->fc_nlinfo.nl_net = net;
1759 
1760 	ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1761 	ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1762 	ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1763 }
1764 
1765 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1766 {
1767 	struct fib6_config cfg;
1768 	struct in6_rtmsg rtmsg;
1769 	int err;
1770 
1771 	switch(cmd) {
1772 	case SIOCADDRT:		/* Add a route */
1773 	case SIOCDELRT:		/* Delete a route */
1774 		if (!capable(CAP_NET_ADMIN))
1775 			return -EPERM;
1776 		err = copy_from_user(&rtmsg, arg,
1777 				     sizeof(struct in6_rtmsg));
1778 		if (err)
1779 			return -EFAULT;
1780 
1781 		rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1782 
1783 		rtnl_lock();
1784 		switch (cmd) {
1785 		case SIOCADDRT:
1786 			err = ip6_route_add(&cfg);
1787 			break;
1788 		case SIOCDELRT:
1789 			err = ip6_route_del(&cfg);
1790 			break;
1791 		default:
1792 			err = -EINVAL;
1793 		}
1794 		rtnl_unlock();
1795 
1796 		return err;
1797 	}
1798 
1799 	return -EINVAL;
1800 }
1801 
1802 /*
1803  *	Drop the packet on the floor
1804  */
1805 
1806 static int ip6_pkt_drop(struct sk_buff *skb, int code, int ipstats_mib_noroutes)
1807 {
1808 	int type;
1809 	switch (ipstats_mib_noroutes) {
1810 	case IPSTATS_MIB_INNOROUTES:
1811 		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1812 		if (type == IPV6_ADDR_ANY || type == IPV6_ADDR_RESERVED) {
1813 			IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_INADDRERRORS);
1814 			break;
1815 		}
1816 		/* FALLTHROUGH */
1817 	case IPSTATS_MIB_OUTNOROUTES:
1818 		IP6_INC_STATS(ip6_dst_idev(skb->dst), ipstats_mib_noroutes);
1819 		break;
1820 	}
1821 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0, skb->dev);
1822 	kfree_skb(skb);
1823 	return 0;
1824 }
1825 
1826 static int ip6_pkt_discard(struct sk_buff *skb)
1827 {
1828 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
1829 }
1830 
1831 static int ip6_pkt_discard_out(struct sk_buff *skb)
1832 {
1833 	skb->dev = skb->dst->dev;
1834 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
1835 }
1836 
1837 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1838 
1839 static int ip6_pkt_prohibit(struct sk_buff *skb)
1840 {
1841 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
1842 }
1843 
1844 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
1845 {
1846 	skb->dev = skb->dst->dev;
1847 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
1848 }
1849 
1850 #endif
1851 
1852 /*
1853  *	Allocate a dst for local (unicast / anycast) address.
1854  */
1855 
1856 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1857 				    const struct in6_addr *addr,
1858 				    int anycast)
1859 {
1860 	struct net *net = dev_net(idev->dev);
1861 	struct rt6_info *rt = ip6_dst_alloc(net->ipv6.ip6_dst_ops);
1862 
1863 	if (rt == NULL)
1864 		return ERR_PTR(-ENOMEM);
1865 
1866 	dev_hold(net->loopback_dev);
1867 	in6_dev_hold(idev);
1868 
1869 	rt->u.dst.flags = DST_HOST;
1870 	rt->u.dst.input = ip6_input;
1871 	rt->u.dst.output = ip6_output;
1872 	rt->rt6i_dev = net->loopback_dev;
1873 	rt->rt6i_idev = idev;
1874 	rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1875 	rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->u.dst));
1876 	rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1877 	rt->u.dst.obsolete = -1;
1878 
1879 	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1880 	if (anycast)
1881 		rt->rt6i_flags |= RTF_ANYCAST;
1882 	else
1883 		rt->rt6i_flags |= RTF_LOCAL;
1884 	rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1885 	if (rt->rt6i_nexthop == NULL) {
1886 		dst_free(&rt->u.dst);
1887 		return ERR_PTR(-ENOMEM);
1888 	}
1889 
1890 	ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1891 	rt->rt6i_dst.plen = 128;
1892 	rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
1893 
1894 	atomic_set(&rt->u.dst.__refcnt, 1);
1895 
1896 	return rt;
1897 }
1898 
1899 struct arg_dev_net {
1900 	struct net_device *dev;
1901 	struct net *net;
1902 };
1903 
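/* fib6_clean_all() callback: request deletion of every route bound to
 * the departing device (or to any device when dev == NULL), sparing the
 * per-namespace null entry.
 */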
1904 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1905 {
1906 	struct net_device *dev = ((struct arg_dev_net *)arg)->dev;
1907 	struct net *net = ((struct arg_dev_net *)arg)->net;
1908 
1909 	if (((void *)rt->rt6i_dev == dev || dev == NULL) &&
1910 	    rt != net->ipv6.ip6_null_entry) {
1911 		RT6_TRACE("deleted by ifdown %p\n", rt);
1912 		return -1;
1913 	}
1914 	return 0;
1915 }
1916 
1917 void rt6_ifdown(struct net *net, struct net_device *dev)
1918 {
1919 	struct arg_dev_net adn = {
1920 		.dev = dev,
1921 		.net = net,
1922 	};
1923 
1924 	fib6_clean_all(net, fib6_ifdown, 0, &adn);
1925 }
1926 
1927 struct rt6_mtu_change_arg
1928 {
1929 	struct net_device *dev;
1930 	unsigned mtu;
1931 };
1932 
1933 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
1934 {
1935 	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
1936 	struct inet6_dev *idev;
1937 	struct net *net = dev_net(arg->dev);
1938 
1939 	/* In IPv6 pmtu discovery is not optional,
1940 	   so that RTAX_MTU lock cannot disable it.
1941 	   so the RTAX_MTU lock cannot disable it.
1942 	   caused by addrconf/ndisc.
1943 	*/
1944 
1945 	idev = __in6_dev_get(arg->dev);
1946 	if (idev == NULL)
1947 		return 0;
1948 
1949 	/* For an administrative MTU increase, there is no way to discover
1950 	   an IPv6 PMTU increase, so the PMTU should be updated here.
1951 	   Since RFC 1981 doesn't cover administrative MTU increases,
1952 	   updating the PMTU on an increase is a MUST here (e.g. jumbo frames).
1953 	 */
1954 	/*
1955 	   If the new MTU is less than the route PMTU, the new MTU will be
1956 	   the lowest MTU in the path; update the route PMTU to reflect the
1957 	   decrease. If the new MTU is greater than the route PMTU, and the
1958 	   old MTU was the lowest MTU in the path, update the route PMTU
1959 	   to reflect the increase. In that case, if another node's MTU is
1960 	   now the lowest in the path, a Packet Too Big message will trigger
1961 	   PMTU discovery again.
1962 	 */
1963 	if (rt->rt6i_dev == arg->dev &&
1964 	    !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1965 	    (dst_mtu(&rt->u.dst) >= arg->mtu ||
1966 	     (dst_mtu(&rt->u.dst) < arg->mtu &&
1967 	      dst_mtu(&rt->u.dst) == idev->cnf.mtu6))) {
1968 		rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
1969 		rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, arg->mtu);
1970 	}
1971 	return 0;
1972 }
1973 
1974 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
1975 {
1976 	struct rt6_mtu_change_arg arg = {
1977 		.dev = dev,
1978 		.mtu = mtu,
1979 	};
1980 
1981 	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
1982 }
1983 
1984 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
1985 	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
1986 	[RTA_OIF]               = { .type = NLA_U32 },
1987 	[RTA_IIF]		= { .type = NLA_U32 },
1988 	[RTA_PRIORITY]          = { .type = NLA_U32 },
1989 	[RTA_METRICS]           = { .type = NLA_NESTED },
1990 };
1991 
1992 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
1993 			      struct fib6_config *cfg)
1994 {
1995 	struct rtmsg *rtm;
1996 	struct nlattr *tb[RTA_MAX+1];
1997 	int err;
1998 
1999 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2000 	if (err < 0)
2001 		goto errout;
2002 
2003 	err = -EINVAL;
2004 	rtm = nlmsg_data(nlh);
2005 	memset(cfg, 0, sizeof(*cfg));
2006 
2007 	cfg->fc_table = rtm->rtm_table;
2008 	cfg->fc_dst_len = rtm->rtm_dst_len;
2009 	cfg->fc_src_len = rtm->rtm_src_len;
2010 	cfg->fc_flags = RTF_UP;
2011 	cfg->fc_protocol = rtm->rtm_protocol;
2012 
2013 	if (rtm->rtm_type == RTN_UNREACHABLE)
2014 		cfg->fc_flags |= RTF_REJECT;
2015 
2016 	cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2017 	cfg->fc_nlinfo.nlh = nlh;
2018 	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2019 
2020 	if (tb[RTA_GATEWAY]) {
2021 		nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2022 		cfg->fc_flags |= RTF_GATEWAY;
2023 	}
2024 
2025 	if (tb[RTA_DST]) {
2026 		int plen = (rtm->rtm_dst_len + 7) >> 3;
2027 
2028 		if (nla_len(tb[RTA_DST]) < plen)
2029 			goto errout;
2030 
2031 		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2032 	}
2033 
2034 	if (tb[RTA_SRC]) {
2035 		int plen = (rtm->rtm_src_len + 7) >> 3;
2036 
2037 		if (nla_len(tb[RTA_SRC]) < plen)
2038 			goto errout;
2039 
2040 		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2041 	}
2042 
2043 	if (tb[RTA_OIF])
2044 		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2045 
2046 	if (tb[RTA_PRIORITY])
2047 		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2048 
2049 	if (tb[RTA_METRICS]) {
2050 		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2051 		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2052 	}
2053 
2054 	if (tb[RTA_TABLE])
2055 		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2056 
2057 	err = 0;
2058 errout:
2059 	return err;
2060 }
2061 
2062 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2063 {
2064 	struct fib6_config cfg;
2065 	int err;
2066 
2067 	err = rtm_to_fib6_config(skb, nlh, &cfg);
2068 	if (err < 0)
2069 		return err;
2070 
2071 	return ip6_route_del(&cfg);
2072 }
2073 
2074 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2075 {
2076 	struct fib6_config cfg;
2077 	int err;
2078 
2079 	err = rtm_to_fib6_config(skb, nlh, &cfg);
2080 	if (err < 0)
2081 		return err;
2082 
2083 	return ip6_route_add(&cfg);
2084 }
2085 
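/*
 * Worst-case size of a route netlink message; used to size the
 * notification skb allocated in inet6_rt_notify().
 */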
2086 static inline size_t rt6_nlmsg_size(void)
2087 {
2088 	return NLMSG_ALIGN(sizeof(struct rtmsg))
2089 	       + nla_total_size(16) /* RTA_SRC */
2090 	       + nla_total_size(16) /* RTA_DST */
2091 	       + nla_total_size(16) /* RTA_GATEWAY */
2092 	       + nla_total_size(16) /* RTA_PREFSRC */
2093 	       + nla_total_size(4) /* RTA_TABLE */
2094 	       + nla_total_size(4) /* RTA_IIF */
2095 	       + nla_total_size(4) /* RTA_OIF */
2096 	       + nla_total_size(4) /* RTA_PRIORITY */
2097 	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2098 	       + nla_total_size(sizeof(struct rta_cacheinfo));
2099 }
2100 
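/*
 * Encode one route into a netlink message.  Returns 1 when a prefix-only
 * dump skips a non-prefix route, and -EMSGSIZE (after cancelling the
 * partially built message) when the skb runs out of room.
 */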
2101 static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
2102 			 struct in6_addr *dst, struct in6_addr *src,
2103 			 int iif, int type, u32 pid, u32 seq,
2104 			 int prefix, int nowait, unsigned int flags)
2105 {
2106 	struct rtmsg *rtm;
2107 	struct nlmsghdr *nlh;
2108 	long expires;
2109 	u32 table;
2110 
2111 	if (prefix) {	/* user wants prefix routes only */
2112 		if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2113 			/* success since this is not a prefix route */
2114 			return 1;
2115 		}
2116 	}
2117 
2118 	nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2119 	if (nlh == NULL)
2120 		return -EMSGSIZE;
2121 
2122 	rtm = nlmsg_data(nlh);
2123 	rtm->rtm_family = AF_INET6;
2124 	rtm->rtm_dst_len = rt->rt6i_dst.plen;
2125 	rtm->rtm_src_len = rt->rt6i_src.plen;
2126 	rtm->rtm_tos = 0;
2127 	if (rt->rt6i_table)
2128 		table = rt->rt6i_table->tb6_id;
2129 	else
2130 		table = RT6_TABLE_UNSPEC;
2131 	rtm->rtm_table = table;
2132 	NLA_PUT_U32(skb, RTA_TABLE, table);
2133 	if (rt->rt6i_flags&RTF_REJECT)
2134 		rtm->rtm_type = RTN_UNREACHABLE;
2135 	else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2136 		rtm->rtm_type = RTN_LOCAL;
2137 	else
2138 		rtm->rtm_type = RTN_UNICAST;
2139 	rtm->rtm_flags = 0;
2140 	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2141 	rtm->rtm_protocol = rt->rt6i_protocol;
2142 	if (rt->rt6i_flags&RTF_DYNAMIC)
2143 		rtm->rtm_protocol = RTPROT_REDIRECT;
2144 	else if (rt->rt6i_flags & RTF_ADDRCONF)
2145 		rtm->rtm_protocol = RTPROT_KERNEL;
2146 	else if (rt->rt6i_flags&RTF_DEFAULT)
2147 		rtm->rtm_protocol = RTPROT_RA;
2148 
2149 	if (rt->rt6i_flags&RTF_CACHE)
2150 		rtm->rtm_flags |= RTM_F_CLONED;
2151 
2152 	if (dst) {
2153 		NLA_PUT(skb, RTA_DST, 16, dst);
2154 		rtm->rtm_dst_len = 128;
2155 	} else if (rtm->rtm_dst_len)
2156 		NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2157 #ifdef CONFIG_IPV6_SUBTREES
2158 	if (src) {
2159 		NLA_PUT(skb, RTA_SRC, 16, src);
2160 		rtm->rtm_src_len = 128;
2161 	} else if (rtm->rtm_src_len)
2162 		NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2163 #endif
2164 	if (iif) {
2165 #ifdef CONFIG_IPV6_MROUTE
2166 		if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2167 			int err = ip6mr_get_route(skb, rtm, nowait);
2168 			if (err <= 0) {
2169 				if (!nowait) {
2170 					if (err == 0)
2171 						return 0;
2172 					goto nla_put_failure;
2173 				} else {
2174 					if (err == -EMSGSIZE)
2175 						goto nla_put_failure;
2176 				}
2177 			}
2178 		} else
2179 #endif
2180 			NLA_PUT_U32(skb, RTA_IIF, iif);
2181 	} else if (dst) {
2182 		struct in6_addr saddr_buf;
2183 		if (ipv6_dev_get_saddr(ip6_dst_idev(&rt->u.dst)->dev,
2184 				       dst, 0, &saddr_buf) == 0)
2185 			NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2186 	}
2187 
2188 	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2189 		goto nla_put_failure;
2190 
2191 	if (rt->u.dst.neighbour)
2192 		NLA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
2193 
2194 	if (rt->u.dst.dev)
2195 		NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2196 
2197 	NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2198 
2199 	expires = (rt->rt6i_flags & RTF_EXPIRES) ?
2200 			rt->rt6i_expires - jiffies : 0;
2201 
2202 	if (rtnl_put_cacheinfo(skb, &rt->u.dst, 0, 0, 0,
2203 			       expires, rt->u.dst.error) < 0)
2204 		goto nla_put_failure;
2205 
2206 	return nlmsg_end(skb, nlh);
2207 
2208 nla_put_failure:
2209 	nlmsg_cancel(skb, nlh);
2210 	return -EMSGSIZE;
2211 }
2212 
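/*
 * Per-route callback for rtnetlink route dumps: honour the RTM_F_PREFIX
 * filter from the request and emit one RTM_NEWROUTE record.
 */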
2213 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2214 {
2215 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2216 	int prefix;
2217 
2218 	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2219 		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2220 		prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2221 	} else
2222 		prefix = 0;
2223 
2224 	return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2225 		     NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2226 		     prefix, 0, NLM_F_MULTI);
2227 }
2228 
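/*
 * RTM_GETROUTE handler: resolve the requested flow with ip6_route_output()
 * and unicast the resulting route back to the requester.
 */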
2229 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2230 {
2231 	struct net *net = sock_net(in_skb->sk);
2232 	struct nlattr *tb[RTA_MAX+1];
2233 	struct rt6_info *rt;
2234 	struct sk_buff *skb;
2235 	struct rtmsg *rtm;
2236 	struct flowi fl;
2237 	int err, iif = 0;
2238 
2239 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2240 	if (err < 0)
2241 		goto errout;
2242 
2243 	err = -EINVAL;
2244 	memset(&fl, 0, sizeof(fl));
2245 
2246 	if (tb[RTA_SRC]) {
2247 		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2248 			goto errout;
2249 
2250 		ipv6_addr_copy(&fl.fl6_src, nla_data(tb[RTA_SRC]));
2251 	}
2252 
2253 	if (tb[RTA_DST]) {
2254 		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2255 			goto errout;
2256 
2257 		ipv6_addr_copy(&fl.fl6_dst, nla_data(tb[RTA_DST]));
2258 	}
2259 
2260 	if (tb[RTA_IIF])
2261 		iif = nla_get_u32(tb[RTA_IIF]);
2262 
2263 	if (tb[RTA_OIF])
2264 		fl.oif = nla_get_u32(tb[RTA_OIF]);
2265 
2266 	if (iif) {
2267 		struct net_device *dev;
2268 		dev = __dev_get_by_index(net, iif);
2269 		if (!dev) {
2270 			err = -ENODEV;
2271 			goto errout;
2272 		}
2273 	}
2274 
2275 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2276 	if (skb == NULL) {
2277 		err = -ENOBUFS;
2278 		goto errout;
2279 	}
2280 
2281 	/* Reserve room for dummy headers; this skb can pass
2282 	   through a good chunk of the routing engine.
2283 	 */
2284 	skb_reset_mac_header(skb);
2285 	skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2286 
2287 	rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl);
2288 	skb->dst = &rt->u.dst;
2289 
2290 	err = rt6_fill_node(skb, rt, &fl.fl6_dst, &fl.fl6_src, iif,
2291 			    RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2292 			    nlh->nlmsg_seq, 0, 0, 0);
2293 	if (err < 0) {
2294 		kfree_skb(skb);
2295 		goto errout;
2296 	}
2297 
2298 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2299 errout:
2300 	return err;
2301 }
2302 
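/*
 * Notify listeners on the RTNLGRP_IPV6_ROUTE netlink group about a route
 * change; on allocation or encoding failure, record the error on the group.
 */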
2303 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2304 {
2305 	struct sk_buff *skb;
2306 	struct net *net = info->nl_net;
2307 	u32 seq;
2308 	int err;
2309 
2310 	err = -ENOBUFS;
2311 	seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2312 
2313 	skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2314 	if (skb == NULL)
2315 		goto errout;
2316 
2317 	err = rt6_fill_node(skb, rt, NULL, NULL, 0,
2318 				event, info->pid, seq, 0, 0, 0);
2319 	if (err < 0) {
2320 		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2321 		WARN_ON(err == -EMSGSIZE);
2322 		kfree_skb(skb);
2323 		goto errout;
2324 	}
2325 	err = rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2326 			  info->nlh, gfp_any());
2327 errout:
2328 	if (err < 0)
2329 		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2330 }
2331 
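/*
 * Netdevice notifier: once the per-namespace loopback device registers,
 * bind the null (and, with multiple tables, prohibit and blackhole)
 * template routes to it.
 */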
2332 static int ip6_route_dev_notify(struct notifier_block *this,
2333 				unsigned long event, void *data)
2334 {
2335 	struct net_device *dev = (struct net_device *)data;
2336 	struct net *net = dev_net(dev);
2337 
2338 	if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2339 		net->ipv6.ip6_null_entry->u.dst.dev = dev;
2340 		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2341 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2342 		net->ipv6.ip6_prohibit_entry->u.dst.dev = dev;
2343 		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2344 		net->ipv6.ip6_blk_hole_entry->u.dst.dev = dev;
2345 		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2346 #endif
2347 	}
2348 
2349 	return NOTIFY_OK;
2350 }
2351 
2352 /*
2353  *	/proc
2354  */
2355 
2356 #ifdef CONFIG_PROC_FS
2357 
2358 #define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)
2359 
2360 struct rt6_proc_arg
2361 {
2362 	char *buffer;
2363 	int offset;
2364 	int length;
2365 	int skip;
2366 	int len;
2367 };
2368 
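/*
 * Emit one /proc/net/ipv6_route line: destination, source (with subtrees),
 * next hop, metric, refcount, use count, flags and device name.
 */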
2369 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2370 {
2371 	struct seq_file *m = p_arg;
2372 
2373 	seq_printf(m, NIP6_SEQFMT " %02x ", NIP6(rt->rt6i_dst.addr),
2374 		   rt->rt6i_dst.plen);
2375 
2376 #ifdef CONFIG_IPV6_SUBTREES
2377 	seq_printf(m, NIP6_SEQFMT " %02x ", NIP6(rt->rt6i_src.addr),
2378 		   rt->rt6i_src.plen);
2379 #else
2380 	seq_puts(m, "00000000000000000000000000000000 00 ");
2381 #endif
2382 
2383 	if (rt->rt6i_nexthop) {
2384 		seq_printf(m, NIP6_SEQFMT,
2385 			   NIP6(*((struct in6_addr *)rt->rt6i_nexthop->primary_key)));
2386 	} else {
2387 		seq_puts(m, "00000000000000000000000000000000");
2388 	}
2389 	seq_printf(m, " %08x %08x %08x %08x %8s\n",
2390 		   rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
2391 		   rt->u.dst.__use, rt->rt6i_flags,
2392 		   rt->rt6i_dev ? rt->rt6i_dev->name : "");
2393 	return 0;
2394 }
2395 
2396 static int ipv6_route_show(struct seq_file *m, void *v)
2397 {
2398 	struct net *net = (struct net *)m->private;
2399 	fib6_clean_all(net, rt6_info_route, 0, m);
2400 	return 0;
2401 }
2402 
2403 static int ipv6_route_open(struct inode *inode, struct file *file)
2404 {
2405 	int err;
2406 	struct net *net = get_proc_net(inode);
2407 	if (!net)
2408 		return -ENXIO;
2409 
2410 	err = single_open(file, ipv6_route_show, net);
2411 	if (err < 0) {
2412 		put_net(net);
2413 		return err;
2414 	}
2415 
2416 	return 0;
2417 }
2418 
2419 static int ipv6_route_release(struct inode *inode, struct file *file)
2420 {
2421 	struct seq_file *seq = file->private_data;
2422 	struct net *net = seq->private;
2423 	put_net(net);
2424 	return single_release(inode, file);
2425 }
2426 
2427 static const struct file_operations ipv6_route_proc_fops = {
2428 	.owner		= THIS_MODULE,
2429 	.open		= ipv6_route_open,
2430 	.read		= seq_read,
2431 	.llseek		= seq_lseek,
2432 	.release	= ipv6_route_release,
2433 };
2434 
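/*
 * /proc/net/rt6_stats: FIB node and route entry counters for this
 * namespace, plus the current number of allocated dst entries.
 */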
2435 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2436 {
2437 	struct net *net = (struct net *)seq->private;
2438 	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2439 		   net->ipv6.rt6_stats->fib_nodes,
2440 		   net->ipv6.rt6_stats->fib_route_nodes,
2441 		   net->ipv6.rt6_stats->fib_rt_alloc,
2442 		   net->ipv6.rt6_stats->fib_rt_entries,
2443 		   net->ipv6.rt6_stats->fib_rt_cache,
2444 		   atomic_read(&net->ipv6.ip6_dst_ops->entries),
2445 		   net->ipv6.rt6_stats->fib_discarded_routes);
2446 
2447 	return 0;
2448 }
2449 
2450 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2451 {
2452 	int err;
2453 	struct net *net = get_proc_net(inode);
2454 	if (!net)
2455 		return -ENXIO;
2456 
2457 	err = single_open(file, rt6_stats_seq_show, net);
2458 	if (err < 0) {
2459 		put_net(net);
2460 		return err;
2461 	}
2462 
2463 	return 0;
2464 }
2465 
2466 static int rt6_stats_seq_release(struct inode *inode, struct file *file)
2467 {
2468 	struct seq_file *seq = file->private_data;
2469 	struct net *net = (struct net *)seq->private;
2470 	put_net(net);
2471 	return single_release(inode, file);
2472 }
2473 
2474 static const struct file_operations rt6_stats_seq_fops = {
2475 	.owner	 = THIS_MODULE,
2476 	.open	 = rt6_stats_seq_open,
2477 	.read	 = seq_read,
2478 	.llseek	 = seq_lseek,
2479 	.release = rt6_stats_seq_release,
2480 };
2481 #endif	/* CONFIG_PROC_FS */
2482 
2483 #ifdef CONFIG_SYSCTL
2484 
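/*
 * Write-only handler behind the "flush" sysctl: writing to it triggers an
 * immediate fib6 garbage-collection run; reads are rejected with -EINVAL.
 */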
2485 static
2486 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
2487 			      void __user *buffer, size_t *lenp, loff_t *ppos)
2488 {
2489 	struct net *net = current->nsproxy->net_ns;
2490 	int delay = net->ipv6.sysctl.flush_delay;
2491 	if (write) {
2492 		proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2493 		fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2494 		return 0;
2495 	} else
2496 		return -EINVAL;
2497 }
2498 
2499 ctl_table ipv6_route_table_template[] = {
2500 	{
2501 		.procname	=	"flush",
2502 		.data		=	&init_net.ipv6.sysctl.flush_delay,
2503 		.maxlen		=	sizeof(int),
2504 		.mode		=	0200,
2505 		.proc_handler	=	&ipv6_sysctl_rtcache_flush
2506 	},
2507 	{
2508 		.ctl_name	=	NET_IPV6_ROUTE_GC_THRESH,
2509 		.procname	=	"gc_thresh",
2510 		.data		=	&ip6_dst_ops_template.gc_thresh,
2511 		.maxlen		=	sizeof(int),
2512 		.mode		=	0644,
2513 		.proc_handler	=	&proc_dointvec,
2514 	},
2515 	{
2516 		.ctl_name	=	NET_IPV6_ROUTE_MAX_SIZE,
2517 		.procname	=	"max_size",
2518 		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
2519 		.maxlen		=	sizeof(int),
2520 		.mode		=	0644,
2521 		.proc_handler	=	&proc_dointvec,
2522 	},
2523 	{
2524 		.ctl_name	=	NET_IPV6_ROUTE_GC_MIN_INTERVAL,
2525 		.procname	=	"gc_min_interval",
2526 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2527 		.maxlen		=	sizeof(int),
2528 		.mode		=	0644,
2529 		.proc_handler	=	&proc_dointvec_jiffies,
2530 		.strategy	=	&sysctl_jiffies,
2531 	},
2532 	{
2533 		.ctl_name	=	NET_IPV6_ROUTE_GC_TIMEOUT,
2534 		.procname	=	"gc_timeout",
2535 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2536 		.maxlen		=	sizeof(int),
2537 		.mode		=	0644,
2538 		.proc_handler	=	&proc_dointvec_jiffies,
2539 		.strategy	=	&sysctl_jiffies,
2540 	},
2541 	{
2542 		.ctl_name	=	NET_IPV6_ROUTE_GC_INTERVAL,
2543 		.procname	=	"gc_interval",
2544 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
2545 		.maxlen		=	sizeof(int),
2546 		.mode		=	0644,
2547 		.proc_handler	=	&proc_dointvec_jiffies,
2548 		.strategy	=	&sysctl_jiffies,
2549 	},
2550 	{
2551 		.ctl_name	=	NET_IPV6_ROUTE_GC_ELASTICITY,
2552 		.procname	=	"gc_elasticity",
2553 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2554 		.maxlen		=	sizeof(int),
2555 		.mode		=	0644,
2556 		.proc_handler	=	&proc_dointvec_jiffies,
2557 		.strategy	=	&sysctl_jiffies,
2558 	},
2559 	{
2560 		.ctl_name	=	NET_IPV6_ROUTE_MTU_EXPIRES,
2561 		.procname	=	"mtu_expires",
2562 		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2563 		.maxlen		=	sizeof(int),
2564 		.mode		=	0644,
2565 		.proc_handler	=	&proc_dointvec_jiffies,
2566 		.strategy	=	&sysctl_jiffies,
2567 	},
2568 	{
2569 		.ctl_name	=	NET_IPV6_ROUTE_MIN_ADVMSS,
2570 		.procname	=	"min_adv_mss",
2571 		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
2572 		.maxlen		=	sizeof(int),
2573 		.mode		=	0644,
2574 		.proc_handler	=	&proc_dointvec_jiffies,
2575 		.strategy	=	&sysctl_jiffies,
2576 	},
2577 	{
2578 		.ctl_name	=	NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,
2579 		.procname	=	"gc_min_interval_ms",
2580 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2581 		.maxlen		=	sizeof(int),
2582 		.mode		=	0644,
2583 		.proc_handler	=	&proc_dointvec_ms_jiffies,
2584 		.strategy	=	&sysctl_ms_jiffies,
2585 	},
2586 	{ .ctl_name = 0 }
2587 };
2588 
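/*
 * Duplicate the sysctl template for a namespace and re-point each entry's
 * data at that namespace's own tunables; the indices below must stay in
 * sync with the template above.
 */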
2589 struct ctl_table *ipv6_route_sysctl_init(struct net *net)
2590 {
2591 	struct ctl_table *table;
2592 
2593 	table = kmemdup(ipv6_route_table_template,
2594 			sizeof(ipv6_route_table_template),
2595 			GFP_KERNEL);
2596 
2597 	if (table) {
2598 		table[0].data = &net->ipv6.sysctl.flush_delay;
2599 		table[1].data = &net->ipv6.ip6_dst_ops->gc_thresh;
2600 		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2601 		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2602 		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2603 		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2604 		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2605 		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2606 		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2607 	}
2608 
2609 	return table;
2610 }
2611 #endif
2612 
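/*
 * Per-namespace initialisation: clone the dst_ops and the special route
 * templates, create the /proc entries and set the initial GC expiry.
 */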
2613 static int ip6_route_net_init(struct net *net)
2614 {
2615 	int ret = -ENOMEM;
2616 
2617 	net->ipv6.ip6_dst_ops = kmemdup(&ip6_dst_ops_template,
2618 					sizeof(*net->ipv6.ip6_dst_ops),
2619 					GFP_KERNEL);
2620 	if (!net->ipv6.ip6_dst_ops)
2621 		goto out;
2622 	net->ipv6.ip6_dst_ops->dst_net = hold_net(net);
2623 
2624 	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2625 					   sizeof(*net->ipv6.ip6_null_entry),
2626 					   GFP_KERNEL);
2627 	if (!net->ipv6.ip6_null_entry)
2628 		goto out_ip6_dst_ops;
2629 	net->ipv6.ip6_null_entry->u.dst.path =
2630 		(struct dst_entry *)net->ipv6.ip6_null_entry;
2631 	net->ipv6.ip6_null_entry->u.dst.ops = net->ipv6.ip6_dst_ops;
2632 
2633 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2634 	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2635 					       sizeof(*net->ipv6.ip6_prohibit_entry),
2636 					       GFP_KERNEL);
2637 	if (!net->ipv6.ip6_prohibit_entry) {
2638 		kfree(net->ipv6.ip6_null_entry);
2639 		goto out_ip6_dst_ops;
2640 	}
2641 	net->ipv6.ip6_prohibit_entry->u.dst.path =
2642 		(struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2643 	net->ipv6.ip6_prohibit_entry->u.dst.ops = net->ipv6.ip6_dst_ops;
2644 
2645 	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2646 					       sizeof(*net->ipv6.ip6_blk_hole_entry),
2647 					       GFP_KERNEL);
2648 	if (!net->ipv6.ip6_blk_hole_entry) {
2649 		kfree(net->ipv6.ip6_null_entry);
2650 		kfree(net->ipv6.ip6_prohibit_entry);
2651 		goto out_ip6_dst_ops;
2652 	}
2653 	net->ipv6.ip6_blk_hole_entry->u.dst.path =
2654 		(struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2655 	net->ipv6.ip6_blk_hole_entry->u.dst.ops = net->ipv6.ip6_dst_ops;
2656 #endif
2657 
2658 #ifdef CONFIG_PROC_FS
2659 	proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2660 	proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2661 #endif
2662 	net->ipv6.ip6_rt_gc_expire = 30*HZ;
2663 
2664 	ret = 0;
2665 out:
2666 	return ret;
2667 
2668 out_ip6_dst_ops:
2669 	release_net(net->ipv6.ip6_dst_ops->dst_net);
2670 	kfree(net->ipv6.ip6_dst_ops);
2671 	goto out;
2672 }
2673 
2674 static void ip6_route_net_exit(struct net *net)
2675 {
2676 #ifdef CONFIG_PROC_FS
2677 	proc_net_remove(net, "ipv6_route");
2678 	proc_net_remove(net, "rt6_stats");
2679 #endif
2680 	kfree(net->ipv6.ip6_null_entry);
2681 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2682 	kfree(net->ipv6.ip6_prohibit_entry);
2683 	kfree(net->ipv6.ip6_blk_hole_entry);
2684 #endif
2685 	release_net(net->ipv6.ip6_dst_ops->dst_net);
2686 	kfree(net->ipv6.ip6_dst_ops);
2687 }
2688 
2689 static struct pernet_operations ip6_route_net_ops = {
2690 	.init = ip6_route_net_init,
2691 	.exit = ip6_route_net_exit,
2692 };
2693 
2694 static struct notifier_block ip6_route_dev_notifier = {
2695 	.notifier_call = ip6_route_dev_notify,
2696 	.priority = 0,
2697 };
2698 
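/*
 * Subsystem init: create the rt6_info slab cache, register the pernet
 * operations, fib6, xfrm6 and policy rules, the rtnetlink message handlers
 * and the device notifier.  Error paths unwind in reverse order.
 */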
2699 int __init ip6_route_init(void)
2700 {
2701 	int ret;
2702 
2703 	ret = -ENOMEM;
2704 	ip6_dst_ops_template.kmem_cachep =
2705 		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2706 				  SLAB_HWCACHE_ALIGN, NULL);
2707 	if (!ip6_dst_ops_template.kmem_cachep)
2708 		goto out;
2709 
2710 	ret = register_pernet_subsys(&ip6_route_net_ops);
2711 	if (ret)
2712 		goto out_kmem_cache;
2713 
2714 	/* The loopback device is registered before this code runs, so the
2715 	 * notifier has not taken the loopback reference for the rt6_info
2716 	 * entries of init_net; take it manually here. */
2717 	init_net.ipv6.ip6_null_entry->u.dst.dev = init_net.loopback_dev;
2718 	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2719 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2720 	init_net.ipv6.ip6_prohibit_entry->u.dst.dev = init_net.loopback_dev;
2721 	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2722 	init_net.ipv6.ip6_blk_hole_entry->u.dst.dev = init_net.loopback_dev;
2723 	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2724 #endif
2725 	ret = fib6_init();
2726 	if (ret)
2727 		goto out_register_subsys;
2728 
2729 	ret = xfrm6_init();
2730 	if (ret)
2731 		goto out_fib6_init;
2732 
2733 	ret = fib6_rules_init();
2734 	if (ret)
2735 		goto xfrm6_init;
2736 
2737 	ret = -ENOBUFS;
2738 	if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL) ||
2739 	    __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL) ||
2740 	    __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL))
2741 		goto fib6_rules_init;
2742 
2743 	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
2744 	if (ret)
2745 		goto fib6_rules_init;
2746 
2747 out:
2748 	return ret;
2749 
2750 fib6_rules_init:
2751 	fib6_rules_cleanup();
2752 xfrm6_init:
2753 	xfrm6_fini();
2754 out_fib6_init:
2755 	fib6_gc_cleanup();
2756 out_register_subsys:
2757 	unregister_pernet_subsys(&ip6_route_net_ops);
2758 out_kmem_cache:
2759 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2760 	goto out;
2761 }
2762 
2763 void ip6_route_cleanup(void)
2764 {
2765 	unregister_netdevice_notifier(&ip6_route_dev_notifier);
2766 	fib6_rules_cleanup();
2767 	xfrm6_fini();
2768 	fib6_gc_cleanup();
2769 	unregister_pernet_subsys(&ip6_route_net_ops);
2770 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2771 }
2772