xref: /openbmc/linux/net/ipv6/route.c (revision f42b3800)
1 /*
2  *	Linux INET6 implementation
3  *	FIB front-end.
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	$Id: route.c,v 1.56 2001/10/31 21:55:55 davem Exp $
9  *
10  *	This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  */
15 
16 /*	Changes:
17  *
18  *	YOSHIFUJI Hideaki @USAGI
19  *		reworked default router selection.
20  *		- respect outgoing interface
21  *		- select from (probably) reachable routers (i.e.
22  *		routers in REACHABLE, STALE, DELAY or PROBE states).
23  *		- always select the same router if it is (probably)
24  *		reachable.  otherwise, round-robin the list.
25  *	Ville Nuorvala
26  *		Fixed routing subtrees.
27  */
28 
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/types.h>
32 #include <linux/times.h>
33 #include <linux/socket.h>
34 #include <linux/sockios.h>
35 #include <linux/net.h>
36 #include <linux/route.h>
37 #include <linux/netdevice.h>
38 #include <linux/in6.h>
39 #include <linux/mroute6.h>
40 #include <linux/init.h>
41 #include <linux/if_arp.h>
42 #include <linux/proc_fs.h>
43 #include <linux/seq_file.h>
44 #include <linux/nsproxy.h>
45 #include <net/net_namespace.h>
46 #include <net/snmp.h>
47 #include <net/ipv6.h>
48 #include <net/ip6_fib.h>
49 #include <net/ip6_route.h>
50 #include <net/ndisc.h>
51 #include <net/addrconf.h>
52 #include <net/tcp.h>
53 #include <linux/rtnetlink.h>
54 #include <net/dst.h>
55 #include <net/xfrm.h>
56 #include <net/netevent.h>
57 #include <net/netlink.h>
58 
59 #include <asm/uaccess.h>
60 
61 #ifdef CONFIG_SYSCTL
62 #include <linux/sysctl.h>
63 #endif
64 
65 /* Set to 3 to get tracing. */
66 #define RT6_DEBUG 2
67 
68 #if RT6_DEBUG >= 3
69 #define RDBG(x) printk x
70 #define RT6_TRACE(x...) printk(KERN_DEBUG x)
71 #else
72 #define RDBG(x)
73 #define RT6_TRACE(x...) do { ; } while (0)
74 #endif
75 
76 #define CLONE_OFFLINK_ROUTE 0
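/*
 * With CLONE_OFFLINK_ROUTE set to 0, ip6_pol_route() below never makes a
 * per-destination copy of a route that already carries a nexthop or is
 * flagged RTF_NONEXTHOP; only routes that still need a neighbour binding
 * are COWed into RTF_CACHE host routes.
 */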
77 
78 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
79 static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
80 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
81 static void		ip6_dst_destroy(struct dst_entry *);
82 static void		ip6_dst_ifdown(struct dst_entry *,
83 				       struct net_device *dev, int how);
84 static int		 ip6_dst_gc(struct dst_ops *ops);
85 
86 static int		ip6_pkt_discard(struct sk_buff *skb);
87 static int		ip6_pkt_discard_out(struct sk_buff *skb);
88 static void		ip6_link_failure(struct sk_buff *skb);
89 static void		ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
90 
91 #ifdef CONFIG_IPV6_ROUTE_INFO
92 static struct rt6_info *rt6_add_route_info(struct net *net,
93 					   struct in6_addr *prefix, int prefixlen,
94 					   struct in6_addr *gwaddr, int ifindex,
95 					   unsigned pref);
96 static struct rt6_info *rt6_get_route_info(struct net *net,
97 					   struct in6_addr *prefix, int prefixlen,
98 					   struct in6_addr *gwaddr, int ifindex);
99 #endif
100 
101 static struct dst_ops ip6_dst_ops_template = {
102 	.family			=	AF_INET6,
103 	.protocol		=	__constant_htons(ETH_P_IPV6),
104 	.gc			=	ip6_dst_gc,
105 	.gc_thresh		=	1024,
106 	.check			=	ip6_dst_check,
107 	.destroy		=	ip6_dst_destroy,
108 	.ifdown			=	ip6_dst_ifdown,
109 	.negative_advice	=	ip6_negative_advice,
110 	.link_failure		=	ip6_link_failure,
111 	.update_pmtu		=	ip6_rt_update_pmtu,
112 	.local_out		=	ip6_local_out,
113 	.entry_size		=	sizeof(struct rt6_info),
114 	.entries		=	ATOMIC_INIT(0),
115 };
116 
117 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
118 {
119 }
120 
121 static struct dst_ops ip6_dst_blackhole_ops = {
122 	.family			=	AF_INET6,
123 	.protocol		=	__constant_htons(ETH_P_IPV6),
124 	.destroy		=	ip6_dst_destroy,
125 	.check			=	ip6_dst_check,
126 	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
127 	.entry_size		=	sizeof(struct rt6_info),
128 	.entries		=	ATOMIC_INIT(0),
129 };
130 
131 static struct rt6_info ip6_null_entry_template = {
132 	.u = {
133 		.dst = {
134 			.__refcnt	= ATOMIC_INIT(1),
135 			.__use		= 1,
136 			.obsolete	= -1,
137 			.error		= -ENETUNREACH,
138 			.metrics	= { [RTAX_HOPLIMIT - 1] = 255, },
139 			.input		= ip6_pkt_discard,
140 			.output		= ip6_pkt_discard_out,
141 		}
142 	},
143 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
144 	.rt6i_metric	= ~(u32) 0,
145 	.rt6i_ref	= ATOMIC_INIT(1),
146 };
147 
148 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
149 
150 static int ip6_pkt_prohibit(struct sk_buff *skb);
151 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
152 
153 struct rt6_info ip6_prohibit_entry_template = {
154 	.u = {
155 		.dst = {
156 			.__refcnt	= ATOMIC_INIT(1),
157 			.__use		= 1,
158 			.obsolete	= -1,
159 			.error		= -EACCES,
160 			.metrics	= { [RTAX_HOPLIMIT - 1] = 255, },
161 			.input		= ip6_pkt_prohibit,
162 			.output		= ip6_pkt_prohibit_out,
163 		}
164 	},
165 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
166 	.rt6i_metric	= ~(u32) 0,
167 	.rt6i_ref	= ATOMIC_INIT(1),
168 };
169 
170 static struct rt6_info ip6_blk_hole_entry_template = {
171 	.u = {
172 		.dst = {
173 			.__refcnt	= ATOMIC_INIT(1),
174 			.__use		= 1,
175 			.obsolete	= -1,
176 			.error		= -EINVAL,
177 			.metrics	= { [RTAX_HOPLIMIT - 1] = 255, },
178 			.input		= dst_discard,
179 			.output		= dst_discard,
180 		}
181 	},
182 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
183 	.rt6i_metric	= ~(u32) 0,
184 	.rt6i_ref	= ATOMIC_INIT(1),
185 };
186 
187 #endif
188 
189 /* allocate dst with ip6_dst_ops */
190 static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops)
191 {
192 	return (struct rt6_info *)dst_alloc(ops);
193 }
194 
195 static void ip6_dst_destroy(struct dst_entry *dst)
196 {
197 	struct rt6_info *rt = (struct rt6_info *)dst;
198 	struct inet6_dev *idev = rt->rt6i_idev;
199 
200 	if (idev != NULL) {
201 		rt->rt6i_idev = NULL;
202 		in6_dev_put(idev);
203 	}
204 }
205 
206 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
207 			   int how)
208 {
209 	struct rt6_info *rt = (struct rt6_info *)dst;
210 	struct inet6_dev *idev = rt->rt6i_idev;
211 	struct net_device *loopback_dev =
212 		dev_net(dev)->loopback_dev;
213 
214 	if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
215 		struct inet6_dev *loopback_idev =
216 			in6_dev_get(loopback_dev);
217 		if (loopback_idev != NULL) {
218 			rt->rt6i_idev = loopback_idev;
219 			in6_dev_put(idev);
220 		}
221 	}
222 }
223 
224 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
225 {
226 	return (rt->rt6i_flags & RTF_EXPIRES &&
227 		time_after(jiffies, rt->rt6i_expires));
228 }
229 
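/*
 * Multicast and link-local destinations are only meaningful relative to a
 * particular interface, so callers use rt6_need_strict() to decide whether
 * a lookup must honour the requested interface (RT6_LOOKUP_F_IFACE).
 */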
230 static inline int rt6_need_strict(struct in6_addr *daddr)
231 {
232 	return (ipv6_addr_type(daddr) &
233 		(IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL));
234 }
235 
236 /*
237  *	Route lookup. Any table->tb6_lock is implied.
238  */
239 
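/*
 * rt6_device_match() walks the routes sharing this prefix and, when an
 * output interface is given, prefers one whose device matches it exactly.
 * Loopback routes are remembered as a fallback, preferring one whose
 * inet6_dev belongs to the requested interface; with strict matching and
 * no match at all, the per-namespace null entry (-ENETUNREACH) is returned.
 */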
240 static inline struct rt6_info *rt6_device_match(struct net *net,
241 						    struct rt6_info *rt,
242 						    int oif,
243 						    int strict)
244 {
245 	struct rt6_info *local = NULL;
246 	struct rt6_info *sprt;
247 
248 	if (oif) {
249 		for (sprt = rt; sprt; sprt = sprt->u.dst.rt6_next) {
250 			struct net_device *dev = sprt->rt6i_dev;
251 			if (dev->ifindex == oif)
252 				return sprt;
253 			if (dev->flags & IFF_LOOPBACK) {
254 				if (sprt->rt6i_idev == NULL ||
255 				    sprt->rt6i_idev->dev->ifindex != oif) {
256 					if (strict && oif)
257 						continue;
258 					if (local && (!oif ||
259 						      local->rt6i_idev->dev->ifindex == oif))
260 						continue;
261 				}
262 				local = sprt;
263 			}
264 		}
265 
266 		if (local)
267 			return local;
268 
269 		if (strict)
270 			return net->ipv6.ip6_null_entry;
271 	}
272 	return rt;
273 }
274 
275 #ifdef CONFIG_IPV6_ROUTER_PREF
276 static void rt6_probe(struct rt6_info *rt)
277 {
278 	struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
279 	/*
280 	 * Okay, this does not seem to be appropriate
281 	 * Okay, this does not seem to be appropriate
282 	 * for now; however, we need to check whether it
283 	 * really is, i.e. Router Reachability Probing.
284 	 * Router Reachability Probe MUST be rate-limited
285 	 * to no more than one per minute.
286 	 */
287 	if (!neigh || (neigh->nud_state & NUD_VALID))
288 		return;
289 	read_lock_bh(&neigh->lock);
290 	if (!(neigh->nud_state & NUD_VALID) &&
291 	    time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
292 		struct in6_addr mcaddr;
293 		struct in6_addr *target;
294 
295 		neigh->updated = jiffies;
296 		read_unlock_bh(&neigh->lock);
297 
298 		target = (struct in6_addr *)&neigh->primary_key;
299 		addrconf_addr_solict_mult(target, &mcaddr);
300 		ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
301 	} else
302 		read_unlock_bh(&neigh->lock);
303 }
304 #else
305 static inline void rt6_probe(struct rt6_info *rt)
306 {
307 	return;
308 }
309 #endif
310 
311 /*
312  * Default Router Selection (RFC 2461 6.3.6)
313  */
314 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
315 {
316 	struct net_device *dev = rt->rt6i_dev;
317 	if (!oif || dev->ifindex == oif)
318 		return 2;
319 	if ((dev->flags & IFF_LOOPBACK) &&
320 	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
321 		return 1;
322 	return 0;
323 }
324 
325 static inline int rt6_check_neigh(struct rt6_info *rt)
326 {
327 	struct neighbour *neigh = rt->rt6i_nexthop;
328 	int m;
329 	if (rt->rt6i_flags & RTF_NONEXTHOP ||
330 	    !(rt->rt6i_flags & RTF_GATEWAY))
331 		m = 1;
332 	else if (neigh) {
333 		read_lock_bh(&neigh->lock);
334 		if (neigh->nud_state & NUD_VALID)
335 			m = 2;
336 #ifdef CONFIG_IPV6_ROUTER_PREF
337 		else if (neigh->nud_state & NUD_FAILED)
338 			m = 0;
339 #endif
340 		else
341 			m = 1;
342 		read_unlock_bh(&neigh->lock);
343 	} else
344 		m = 0;
345 	return m;
346 }
347 
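/*
 * Score layout used when picking among equal-metric routes: rt6_check_dev()
 * contributes 2 for an exact (or unrestricted) interface match and 1 for a
 * loopback route bound to the interface; with CONFIG_IPV6_ROUTER_PREF the
 * decoded router preference is OR-ed in from bit 2 upwards.  A score of -1
 * means a mandatory criterion (interface or reachability) failed and the
 * route must be skipped.
 */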
348 static int rt6_score_route(struct rt6_info *rt, int oif,
349 			   int strict)
350 {
351 	int m, n;
352 
353 	m = rt6_check_dev(rt, oif);
354 	if (!m && (strict & RT6_LOOKUP_F_IFACE))
355 		return -1;
356 #ifdef CONFIG_IPV6_ROUTER_PREF
357 	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
358 #endif
359 	n = rt6_check_neigh(rt);
360 	if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
361 		return -1;
362 	return m;
363 }
364 
365 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
366 				   int *mpri, struct rt6_info *match)
367 {
368 	int m;
369 
370 	if (rt6_check_expired(rt))
371 		goto out;
372 
373 	m = rt6_score_route(rt, oif, strict);
374 	if (m < 0)
375 		goto out;
376 
377 	if (m > *mpri) {
378 		if (strict & RT6_LOOKUP_F_REACHABLE)
379 			rt6_probe(match);
380 		*mpri = m;
381 		match = rt;
382 	} else if (strict & RT6_LOOKUP_F_REACHABLE) {
383 		rt6_probe(rt);
384 	}
385 
386 out:
387 	return match;
388 }
389 
390 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
391 				     struct rt6_info *rr_head,
392 				     u32 metric, int oif, int strict)
393 {
394 	struct rt6_info *rt, *match;
395 	int mpri = -1;
396 
397 	match = NULL;
398 	for (rt = rr_head; rt && rt->rt6i_metric == metric;
399 	     rt = rt->u.dst.rt6_next)
400 		match = find_match(rt, oif, strict, &mpri, match);
401 	for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
402 	     rt = rt->u.dst.rt6_next)
403 		match = find_match(rt, oif, strict, &mpri, match);
404 
405 	return match;
406 }
407 
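/*
 * rt6_select() implements the round-robin described in the changelog above:
 * the search starts at fn->rr_ptr and wraps around via fn->leaf among routes
 * of the same metric.  If nothing (probably) reachable matched, rr_ptr is
 * advanced to the next equal-metric route so that later lookups try a
 * different router first.
 */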
408 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
409 {
410 	struct rt6_info *match, *rt0;
411 	struct net *net;
412 
413 	RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
414 		  __func__, fn->leaf, oif);
415 
416 	rt0 = fn->rr_ptr;
417 	if (!rt0)
418 		fn->rr_ptr = rt0 = fn->leaf;
419 
420 	match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
421 
422 	if (!match &&
423 	    (strict & RT6_LOOKUP_F_REACHABLE)) {
424 		struct rt6_info *next = rt0->u.dst.rt6_next;
425 
426 		/* no entries matched; do round-robin */
427 		if (!next || next->rt6i_metric != rt0->rt6i_metric)
428 			next = fn->leaf;
429 
430 		if (next != rt0)
431 			fn->rr_ptr = next;
432 	}
433 
434 	RT6_TRACE("%s() => %p\n",
435 		  __func__, match);
436 
437 	net = dev_net(rt0->rt6i_dev);
438 	return (match ? match : net->ipv6.ip6_null_entry);
439 }
440 
441 #ifdef CONFIG_IPV6_ROUTE_INFO
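/*
 * rt6_route_rcv() digests a Route Information option from a Router
 * Advertisement: an option length of 3 carries a full 128-bit prefix,
 * otherwise the prefix is rebuilt from prefix_len.  A lifetime of zero
 * removes any existing RTF_ROUTEINFO route, 0xffffffff makes the route
 * permanent (RTF_EXPIRES cleared), and anything else arms rt6i_expires.
 */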
442 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
443 		  struct in6_addr *gwaddr)
444 {
445 	struct net *net = dev_net(dev);
446 	struct route_info *rinfo = (struct route_info *) opt;
447 	struct in6_addr prefix_buf, *prefix;
448 	unsigned int pref;
449 	u32 lifetime;
450 	struct rt6_info *rt;
451 
452 	if (len < sizeof(struct route_info)) {
453 		return -EINVAL;
454 	}
455 
456 	/* Sanity check for prefix_len and length */
457 	if (rinfo->length > 3) {
458 		return -EINVAL;
459 	} else if (rinfo->prefix_len > 128) {
460 		return -EINVAL;
461 	} else if (rinfo->prefix_len > 64) {
462 		if (rinfo->length < 2) {
463 			return -EINVAL;
464 		}
465 	} else if (rinfo->prefix_len > 0) {
466 		if (rinfo->length < 1) {
467 			return -EINVAL;
468 		}
469 	}
470 
471 	pref = rinfo->route_pref;
472 	if (pref == ICMPV6_ROUTER_PREF_INVALID)
473 		pref = ICMPV6_ROUTER_PREF_MEDIUM;
474 
475 	lifetime = ntohl(rinfo->lifetime);
476 	if (lifetime == 0xffffffff) {
477 		/* infinity */
478 	} else if (lifetime > 0x7fffffff/HZ) {
479 		/* Avoid arithmetic overflow */
480 		lifetime = 0x7fffffff/HZ - 1;
481 	}
482 
483 	if (rinfo->length == 3)
484 		prefix = (struct in6_addr *)rinfo->prefix;
485 	else {
486 		/* this function is safe */
487 		/* ipv6_addr_prefix() is safe here: prefix_len was validated above */
488 				 (struct in6_addr *)rinfo->prefix,
489 				 rinfo->prefix_len);
490 		prefix = &prefix_buf;
491 	}
492 
493 	rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
494 				dev->ifindex);
495 
496 	if (rt && !lifetime) {
497 		ip6_del_rt(rt);
498 		rt = NULL;
499 	}
500 
501 	if (!rt && lifetime)
502 		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
503 					pref);
504 	else if (rt)
505 		rt->rt6i_flags = RTF_ROUTEINFO |
506 				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
507 
508 	if (rt) {
509 		if (lifetime == 0xffffffff) {
510 			rt->rt6i_flags &= ~RTF_EXPIRES;
511 		} else {
512 			rt->rt6i_expires = jiffies + HZ * lifetime;
513 			rt->rt6i_flags |= RTF_EXPIRES;
514 		}
515 		dst_release(&rt->u.dst);
516 	}
517 	return 0;
518 }
519 #endif
520 
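/*
 * BACKTRACK() is used by the lookup functions below: when the selected route
 * turned out to be the null entry, walk back up the fib6 tree (descending
 * into source-address subtrees via FIB6_SUBTREE() where they exist) until a
 * node carrying route information (RTN_RTINFO) is found, then jump back to
 * the caller's restart label; give up at the tree root (RTN_TL_ROOT).
 */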
521 #define BACKTRACK(__net, saddr)			\
522 do { \
523 	if (rt == __net->ipv6.ip6_null_entry) {	\
524 		struct fib6_node *pn; \
525 		while (1) { \
526 			if (fn->fn_flags & RTN_TL_ROOT) \
527 				goto out; \
528 			pn = fn->parent; \
529 			if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
530 				fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
531 			else \
532 				fn = pn; \
533 			if (fn->fn_flags & RTN_RTINFO) \
534 				goto restart; \
535 		} \
536 	} \
537 } while(0)
538 
539 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
540 					     struct fib6_table *table,
541 					     struct flowi *fl, int flags)
542 {
543 	struct fib6_node *fn;
544 	struct rt6_info *rt;
545 
546 	read_lock_bh(&table->tb6_lock);
547 	fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
548 restart:
549 	rt = fn->leaf;
550 	rt = rt6_device_match(net, rt, fl->oif, flags);
551 	BACKTRACK(net, &fl->fl6_src);
552 out:
553 	dst_use(&rt->u.dst, jiffies);
554 	read_unlock_bh(&table->tb6_lock);
555 	return rt;
556 
557 }
558 
559 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
560 			    const struct in6_addr *saddr, int oif, int strict)
561 {
562 	struct flowi fl = {
563 		.oif = oif,
564 		.nl_u = {
565 			.ip6_u = {
566 				.daddr = *daddr,
567 			},
568 		},
569 	};
570 	struct dst_entry *dst;
571 	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
572 
573 	if (saddr) {
574 		memcpy(&fl.fl6_src, saddr, sizeof(*saddr));
575 		flags |= RT6_LOOKUP_F_HAS_SADDR;
576 	}
577 
578 	dst = fib6_rule_lookup(net, &fl, flags, ip6_pol_route_lookup);
579 	if (dst->error == 0)
580 		return (struct rt6_info *) dst;
581 
582 	dst_release(dst);
583 
584 	return NULL;
585 }
586 
587 EXPORT_SYMBOL(rt6_lookup);
588 
589 /* ip6_ins_rt is called with FREE table->tb6_lock.
590    It takes a new route entry; if the addition fails for any reason, the
591    route is freed.  In any case, if the caller does not hold a reference
592    to it, it may be destroyed.
593  */
594 
595 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
596 {
597 	int err;
598 	struct fib6_table *table;
599 
600 	table = rt->rt6i_table;
601 	write_lock_bh(&table->tb6_lock);
602 	err = fib6_add(&table->tb6_root, rt, info);
603 	write_unlock_bh(&table->tb6_lock);
604 
605 	return err;
606 }
607 
608 int ip6_ins_rt(struct rt6_info *rt)
609 {
610 	struct nl_info info = {
611 		.nl_net = dev_net(rt->rt6i_dev),
612 	};
613 	return __ip6_ins_rt(rt, &info);
614 }
615 
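/*
 * Two ways of making a cached per-destination copy of a route:
 * rt6_alloc_cow() resolves a neighbour entry for the copy's nexthop via
 * ndisc_get_neigh() (and may mark the copy RTF_ANYCAST), while
 * rt6_alloc_clone() simply takes another reference on the parent's nexthop
 * neighbour.  Both produce /128 host routes flagged RTF_CACHE | DST_HOST.
 */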
616 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
617 				      struct in6_addr *saddr)
618 {
619 	struct rt6_info *rt;
620 
621 	/*
622 	 *	Clone the route.
623 	 */
624 
625 	rt = ip6_rt_copy(ort);
626 
627 	if (rt) {
628 		if (!(rt->rt6i_flags&RTF_GATEWAY)) {
629 			if (rt->rt6i_dst.plen != 128 &&
630 			    ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
631 				rt->rt6i_flags |= RTF_ANYCAST;
632 			ipv6_addr_copy(&rt->rt6i_gateway, daddr);
633 		}
634 
635 		ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
636 		rt->rt6i_dst.plen = 128;
637 		rt->rt6i_flags |= RTF_CACHE;
638 		rt->u.dst.flags |= DST_HOST;
639 
640 #ifdef CONFIG_IPV6_SUBTREES
641 		if (rt->rt6i_src.plen && saddr) {
642 			ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
643 			rt->rt6i_src.plen = 128;
644 		}
645 #endif
646 
647 		rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
648 
649 	}
650 
651 	return rt;
652 }
653 
654 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
655 {
656 	struct rt6_info *rt = ip6_rt_copy(ort);
657 	if (rt) {
658 		ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
659 		rt->rt6i_dst.plen = 128;
660 		rt->rt6i_flags |= RTF_CACHE;
661 		rt->u.dst.flags |= DST_HOST;
662 		rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
663 	}
664 	return rt;
665 }
666 
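/*
 * ip6_pol_route() is the common worker for input and output lookups.  It
 * first tries routers that are (probably) reachable (RT6_LOOKUP_F_REACHABLE)
 * unless forwarding is enabled, falling back to an unrestricted search via
 * restart_2.  Because tb6_lock is dropped while the per-destination copy is
 * allocated, insertion can race with another CPU; up to three attempts are
 * made before giving up and returning whatever was found.
 */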
667 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
668 				      struct flowi *fl, int flags)
669 {
670 	struct fib6_node *fn;
671 	struct rt6_info *rt, *nrt;
672 	int strict = 0;
673 	int attempts = 3;
674 	int err;
675 	int reachable = ipv6_devconf.forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
676 
677 	strict |= flags & RT6_LOOKUP_F_IFACE;
678 
679 relookup:
680 	read_lock_bh(&table->tb6_lock);
681 
682 restart_2:
683 	fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
684 
685 restart:
686 	rt = rt6_select(fn, oif, strict | reachable);
687 
688 	BACKTRACK(net, &fl->fl6_src);
689 	if (rt == net->ipv6.ip6_null_entry ||
690 	    rt->rt6i_flags & RTF_CACHE)
691 		goto out;
692 
693 	dst_hold(&rt->u.dst);
694 	read_unlock_bh(&table->tb6_lock);
695 
696 	if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
697 		nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
698 	else {
699 #if CLONE_OFFLINK_ROUTE
700 		nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
701 #else
702 		goto out2;
703 #endif
704 	}
705 
706 	dst_release(&rt->u.dst);
707 	rt = nrt ? : net->ipv6.ip6_null_entry;
708 
709 	dst_hold(&rt->u.dst);
710 	if (nrt) {
711 		err = ip6_ins_rt(nrt);
712 		if (!err)
713 			goto out2;
714 	}
715 
716 	if (--attempts <= 0)
717 		goto out2;
718 
719 	/*
720 	 * Race condition! In the gap, when table->tb6_lock was
721 	 * released someone could insert this route.  Relookup.
722 	 */
723 	dst_release(&rt->u.dst);
724 	goto relookup;
725 
726 out:
727 	if (reachable) {
728 		reachable = 0;
729 		goto restart_2;
730 	}
731 	dst_hold(&rt->u.dst);
732 	read_unlock_bh(&table->tb6_lock);
733 out2:
734 	rt->u.dst.lastuse = jiffies;
735 	rt->u.dst.__use++;
736 
737 	return rt;
738 }
739 
740 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
741 					    struct flowi *fl, int flags)
742 {
743 	return ip6_pol_route(net, table, fl->iif, fl, flags);
744 }
745 
746 void ip6_route_input(struct sk_buff *skb)
747 {
748 	struct ipv6hdr *iph = ipv6_hdr(skb);
749 	struct net *net = dev_net(skb->dev);
750 	int flags = RT6_LOOKUP_F_HAS_SADDR;
751 	struct flowi fl = {
752 		.iif = skb->dev->ifindex,
753 		.nl_u = {
754 			.ip6_u = {
755 				.daddr = iph->daddr,
756 				.saddr = iph->saddr,
757 				.flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
758 			},
759 		},
760 		.mark = skb->mark,
761 		.proto = iph->nexthdr,
762 	};
763 
764 	if (rt6_need_strict(&iph->daddr))
765 		flags |= RT6_LOOKUP_F_IFACE;
766 
767 	skb->dst = fib6_rule_lookup(net, &fl, flags, ip6_pol_route_input);
768 }
769 
770 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
771 					     struct flowi *fl, int flags)
772 {
773 	return ip6_pol_route(net, table, fl->oif, fl, flags);
774 }
775 
776 struct dst_entry * ip6_route_output(struct net *net, struct sock *sk,
777 				    struct flowi *fl)
778 {
779 	int flags = 0;
780 
781 	if (rt6_need_strict(&fl->fl6_dst))
782 		flags |= RT6_LOOKUP_F_IFACE;
783 
784 	if (!ipv6_addr_any(&fl->fl6_src))
785 		flags |= RT6_LOOKUP_F_HAS_SADDR;
786 	else if (sk) {
787 		unsigned int prefs = inet6_sk(sk)->srcprefs;
788 		if (prefs & IPV6_PREFER_SRC_TMP)
789 			flags |= RT6_LOOKUP_F_SRCPREF_TMP;
790 		if (prefs & IPV6_PREFER_SRC_PUBLIC)
791 			flags |= RT6_LOOKUP_F_SRCPREF_PUBLIC;
792 		if (prefs & IPV6_PREFER_SRC_COA)
793 			flags |= RT6_LOOKUP_F_SRCPREF_COA;
794 	}
795 
796 	return fib6_rule_lookup(net, fl, flags, ip6_pol_route_output);
797 }
798 
799 EXPORT_SYMBOL(ip6_route_output);
800 
801 int ip6_dst_blackhole(struct sock *sk, struct dst_entry **dstp, struct flowi *fl)
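/*
 * ip6_dst_blackhole() replaces *dstp with a copy that keeps the original
 * route's metrics, device and addresses but discards every packet (both
 * input and output are dst_discard).  The copy is allocated from
 * ip6_dst_blackhole_ops, so it stays out of the routing-table GC, and
 * dst_free() arranges for it to be reclaimed once the last reference is
 * dropped.
 */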
802 {
803 	struct rt6_info *ort = (struct rt6_info *) *dstp;
804 	struct rt6_info *rt = (struct rt6_info *)
805 		dst_alloc(&ip6_dst_blackhole_ops);
806 	struct dst_entry *new = NULL;
807 
808 	if (rt) {
809 		new = &rt->u.dst;
810 
811 		atomic_set(&new->__refcnt, 1);
812 		new->__use = 1;
813 		new->input = dst_discard;
814 		new->output = dst_discard;
815 
816 		memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
817 		new->dev = ort->u.dst.dev;
818 		if (new->dev)
819 			dev_hold(new->dev);
820 		rt->rt6i_idev = ort->rt6i_idev;
821 		if (rt->rt6i_idev)
822 			in6_dev_hold(rt->rt6i_idev);
823 		rt->rt6i_expires = 0;
824 
825 		ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
826 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
827 		rt->rt6i_metric = 0;
828 
829 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
830 #ifdef CONFIG_IPV6_SUBTREES
831 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
832 #endif
833 
834 		dst_free(new);
835 	}
836 
837 	dst_release(*dstp);
838 	*dstp = new;
839 	return (new ? 0 : -ENOMEM);
840 }
841 EXPORT_SYMBOL_GPL(ip6_dst_blackhole);
842 
843 /*
844  *	Destination cache support functions
845  */
846 
847 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
848 {
849 	struct rt6_info *rt;
850 
851 	rt = (struct rt6_info *) dst;
852 
853 	if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
854 		return dst;
855 
856 	return NULL;
857 }
858 
859 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
860 {
861 	struct rt6_info *rt = (struct rt6_info *) dst;
862 
863 	if (rt) {
864 		if (rt->rt6i_flags & RTF_CACHE)
865 			ip6_del_rt(rt);
866 		else
867 			dst_release(dst);
868 	}
869 	return NULL;
870 }
871 
872 static void ip6_link_failure(struct sk_buff *skb)
873 {
874 	struct rt6_info *rt;
875 
876 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
877 
878 	rt = (struct rt6_info *) skb->dst;
879 	if (rt) {
880 		if (rt->rt6i_flags&RTF_CACHE) {
881 			dst_set_expires(&rt->u.dst, 0);
882 			rt->rt6i_flags |= RTF_EXPIRES;
883 		} else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
884 			rt->rt6i_node->fn_sernum = -1;
885 	}
886 }
887 
888 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
889 {
890 	struct rt6_info *rt6 = (struct rt6_info*)dst;
891 
892 	if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
893 		rt6->rt6i_flags |= RTF_MODIFIED;
894 		if (mtu < IPV6_MIN_MTU) {
895 			mtu = IPV6_MIN_MTU;
896 			dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
897 		}
898 		dst->metrics[RTAX_MTU-1] = mtu;
899 		call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
900 	}
901 }
902 
903 static int ipv6_get_mtu(struct net_device *dev);
904 
905 static inline unsigned int ipv6_advmss(struct net *net, unsigned int mtu)
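/*
 * Advertised MSS derived from the path MTU: strip the 40-byte IPv6 header
 * and 20-byte TCP header (e.g. a 1500-byte MTU yields 1500 - 60 = 1440),
 * clamp to the ip6_rt_min_advmss sysctl, and cap at IPV6_MAXPLEN, which
 * tells the peer to rely purely on PMTU discovery.
 */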
906 {
907 	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
908 
909 	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
910 		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
911 
912 	/*
913 	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
914 	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
915 	 * IPV6_MAXPLEN is also valid and means: "any MSS,
916 	 * rely only on pmtu discovery"
917 	 */
918 	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
919 		mtu = IPV6_MAXPLEN;
920 	return mtu;
921 }
922 
923 static struct dst_entry *icmp6_dst_gc_list;
924 static DEFINE_SPINLOCK(icmp6_dst_lock);
925 
926 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
927 				  struct neighbour *neigh,
928 				  const struct in6_addr *addr)
929 {
930 	struct rt6_info *rt;
931 	struct inet6_dev *idev = in6_dev_get(dev);
932 	struct net *net = dev_net(dev);
933 
934 	if (unlikely(idev == NULL))
935 		return NULL;
936 
937 	rt = ip6_dst_alloc(net->ipv6.ip6_dst_ops);
938 	if (unlikely(rt == NULL)) {
939 		in6_dev_put(idev);
940 		goto out;
941 	}
942 
943 	dev_hold(dev);
944 	if (neigh)
945 		neigh_hold(neigh);
946 	else
947 		neigh = ndisc_get_neigh(dev, addr);
948 
949 	rt->rt6i_dev	  = dev;
950 	rt->rt6i_idev     = idev;
951 	rt->rt6i_nexthop  = neigh;
952 	atomic_set(&rt->u.dst.__refcnt, 1);
953 	rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
954 	rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
955 	rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->u.dst));
956 	rt->u.dst.output  = ip6_output;
957 
958 #if 0	/* there's no chance to use these for ndisc */
959 	rt->u.dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
960 				? DST_HOST
961 				: 0;
962 	ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
963 	rt->rt6i_dst.plen = 128;
964 #endif
965 
966 	spin_lock_bh(&icmp6_dst_lock);
967 	rt->u.dst.next = icmp6_dst_gc_list;
968 	icmp6_dst_gc_list = &rt->u.dst;
969 	spin_unlock_bh(&icmp6_dst_lock);
970 
971 	fib6_force_start_gc(net);
972 
973 out:
974 	return &rt->u.dst;
975 }
976 
977 int icmp6_dst_gc(int *more)
978 {
979 	struct dst_entry *dst, *next, **pprev;
980 	int freed;
981 
982 	next = NULL;
983 	freed = 0;
984 
985 	spin_lock_bh(&icmp6_dst_lock);
986 	pprev = &icmp6_dst_gc_list;
987 
988 	while ((dst = *pprev) != NULL) {
989 		if (!atomic_read(&dst->__refcnt)) {
990 			*pprev = dst->next;
991 			dst_free(dst);
992 			freed++;
993 		} else {
994 			pprev = &dst->next;
995 			(*more)++;
996 		}
997 	}
998 
999 	spin_unlock_bh(&icmp6_dst_lock);
1000 
1001 	return freed;
1002 }
1003 
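/*
 * dst garbage collection: skip the run entirely if the minimum interval has
 * not elapsed and the entry count is still below ip6_rt_max_size; otherwise
 * bump ip6_rt_gc_expire, let fib6_run_gc() age out cached routes, reset the
 * expire value to half the gc timeout once the count drops below gc_thresh,
 * and decay it by its elasticity fraction on every call.  Returns nonzero
 * while the table is still over ip6_rt_max_size.
 */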
1004 static int ip6_dst_gc(struct dst_ops *ops)
1005 {
1006 	unsigned long now = jiffies;
1007 	struct net *net = ops->dst_net;
1008 	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1009 	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1010 	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1011 	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1012 	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1013 
1014 	if (time_after(rt_last_gc + rt_min_interval, now) &&
1015 	    atomic_read(&ops->entries) <= rt_max_size)
1016 		goto out;
1017 
1018 	net->ipv6.ip6_rt_gc_expire++;
1019 	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1020 	net->ipv6.ip6_rt_last_gc = now;
1021 	if (atomic_read(&ops->entries) < ops->gc_thresh)
1022 		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1023 out:
1024 	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1025 	return (atomic_read(&ops->entries) > rt_max_size);
1026 }
1027 
1028 /* Clean host part of a prefix. Not necessary in radix tree,
1029    but results in cleaner routing tables.
1030 
1031    Remove it only once everything is known to work!
1032  */
1033 
1034 static int ipv6_get_mtu(struct net_device *dev)
1035 {
1036 	int mtu = IPV6_MIN_MTU;
1037 	struct inet6_dev *idev;
1038 
1039 	idev = in6_dev_get(dev);
1040 	if (idev) {
1041 		mtu = idev->cnf.mtu6;
1042 		in6_dev_put(idev);
1043 	}
1044 	return mtu;
1045 }
1046 
1047 int ip6_dst_hoplimit(struct dst_entry *dst)
1048 {
1049 	int hoplimit = dst_metric(dst, RTAX_HOPLIMIT);
1050 	if (hoplimit < 0) {
1051 		struct net_device *dev = dst->dev;
1052 		struct inet6_dev *idev = in6_dev_get(dev);
1053 		if (idev) {
1054 			hoplimit = idev->cnf.hop_limit;
1055 			in6_dev_put(idev);
1056 		} else
1057 			hoplimit = ipv6_devconf.hop_limit;
1058 	}
1059 	return hoplimit;
1060 }
1061 
1062 /*
1063  *
1064  */
1065 
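/*
 * ip6_route_add() turns a fib6_config (built from an rtnetlink message or
 * the SIOCADDRT ioctl) into an rt6_info and inserts it into the chosen
 * table.  For example, a request equivalent to
 * "ip -6 route add 2001:db8::/32 via fe80::1 dev eth0" (addresses purely
 * illustrative) would arrive with fc_dst_len = 32, RTF_GATEWAY set and
 * fc_ifindex identifying eth0.  Reject routes, and true routes via the
 * loopback device, are installed as reject entries bound to loopback.
 */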
1066 int ip6_route_add(struct fib6_config *cfg)
1067 {
1068 	int err;
1069 	struct net *net = cfg->fc_nlinfo.nl_net;
1070 	struct rt6_info *rt = NULL;
1071 	struct net_device *dev = NULL;
1072 	struct inet6_dev *idev = NULL;
1073 	struct fib6_table *table;
1074 	int addr_type;
1075 
1076 	if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1077 		return -EINVAL;
1078 #ifndef CONFIG_IPV6_SUBTREES
1079 	if (cfg->fc_src_len)
1080 		return -EINVAL;
1081 #endif
1082 	if (cfg->fc_ifindex) {
1083 		err = -ENODEV;
1084 		dev = dev_get_by_index(net, cfg->fc_ifindex);
1085 		if (!dev)
1086 			goto out;
1087 		idev = in6_dev_get(dev);
1088 		if (!idev)
1089 			goto out;
1090 	}
1091 
1092 	if (cfg->fc_metric == 0)
1093 		cfg->fc_metric = IP6_RT_PRIO_USER;
1094 
1095 	table = fib6_new_table(net, cfg->fc_table);
1096 	if (table == NULL) {
1097 		err = -ENOBUFS;
1098 		goto out;
1099 	}
1100 
1101 	rt = ip6_dst_alloc(net->ipv6.ip6_dst_ops);
1102 
1103 	if (rt == NULL) {
1104 		err = -ENOMEM;
1105 		goto out;
1106 	}
1107 
1108 	rt->u.dst.obsolete = -1;
1109 	rt->rt6i_expires = jiffies + clock_t_to_jiffies(cfg->fc_expires);
1110 
1111 	if (cfg->fc_protocol == RTPROT_UNSPEC)
1112 		cfg->fc_protocol = RTPROT_BOOT;
1113 	rt->rt6i_protocol = cfg->fc_protocol;
1114 
1115 	addr_type = ipv6_addr_type(&cfg->fc_dst);
1116 
1117 	if (addr_type & IPV6_ADDR_MULTICAST)
1118 		rt->u.dst.input = ip6_mc_input;
1119 	else
1120 		rt->u.dst.input = ip6_forward;
1121 
1122 	rt->u.dst.output = ip6_output;
1123 
1124 	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1125 	rt->rt6i_dst.plen = cfg->fc_dst_len;
1126 	if (rt->rt6i_dst.plen == 128)
1127 	       rt->u.dst.flags = DST_HOST;
1128 
1129 #ifdef CONFIG_IPV6_SUBTREES
1130 	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1131 	rt->rt6i_src.plen = cfg->fc_src_len;
1132 #endif
1133 
1134 	rt->rt6i_metric = cfg->fc_metric;
1135 
1136 	/* We cannot add true routes via loopback here,
1137 	   they would result in kernel looping; promote them to reject routes
1138 	 */
1139 	if ((cfg->fc_flags & RTF_REJECT) ||
1140 	    (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
1141 		/* hold loopback dev/idev if we haven't done so. */
1142 		if (dev != net->loopback_dev) {
1143 			if (dev) {
1144 				dev_put(dev);
1145 				in6_dev_put(idev);
1146 			}
1147 			dev = net->loopback_dev;
1148 			dev_hold(dev);
1149 			idev = in6_dev_get(dev);
1150 			if (!idev) {
1151 				err = -ENODEV;
1152 				goto out;
1153 			}
1154 		}
1155 		rt->u.dst.output = ip6_pkt_discard_out;
1156 		rt->u.dst.input = ip6_pkt_discard;
1157 		rt->u.dst.error = -ENETUNREACH;
1158 		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1159 		goto install_route;
1160 	}
1161 
1162 	if (cfg->fc_flags & RTF_GATEWAY) {
1163 		struct in6_addr *gw_addr;
1164 		int gwa_type;
1165 
1166 		gw_addr = &cfg->fc_gateway;
1167 		ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1168 		gwa_type = ipv6_addr_type(gw_addr);
1169 
1170 		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1171 			struct rt6_info *grt;
1172 
1173 			/* IPv6 strictly inhibits using non-link-local
1174 			   addresses as nexthop addresses.
1175 			   Otherwise, the router will not be able to send redirects.
1176 			   It is very good, but in some (rare!) circumstances
1177 			   (SIT, PtP, NBMA NOARP links) it is handy to allow
1178 			   some exceptions. --ANK
1179 			 */
1180 			err = -EINVAL;
1181 			if (!(gwa_type&IPV6_ADDR_UNICAST))
1182 				goto out;
1183 
1184 			grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1185 
1186 			err = -EHOSTUNREACH;
1187 			if (grt == NULL)
1188 				goto out;
1189 			if (dev) {
1190 				if (dev != grt->rt6i_dev) {
1191 					dst_release(&grt->u.dst);
1192 					goto out;
1193 				}
1194 			} else {
1195 				dev = grt->rt6i_dev;
1196 				idev = grt->rt6i_idev;
1197 				dev_hold(dev);
1198 				in6_dev_hold(grt->rt6i_idev);
1199 			}
1200 			if (!(grt->rt6i_flags&RTF_GATEWAY))
1201 				err = 0;
1202 			dst_release(&grt->u.dst);
1203 
1204 			if (err)
1205 				goto out;
1206 		}
1207 		err = -EINVAL;
1208 		if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1209 			goto out;
1210 	}
1211 
1212 	err = -ENODEV;
1213 	if (dev == NULL)
1214 		goto out;
1215 
1216 	if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1217 		rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1218 		if (IS_ERR(rt->rt6i_nexthop)) {
1219 			err = PTR_ERR(rt->rt6i_nexthop);
1220 			rt->rt6i_nexthop = NULL;
1221 			goto out;
1222 		}
1223 	}
1224 
1225 	rt->rt6i_flags = cfg->fc_flags;
1226 
1227 install_route:
1228 	if (cfg->fc_mx) {
1229 		struct nlattr *nla;
1230 		int remaining;
1231 
1232 		nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1233 			int type = nla_type(nla);
1234 
1235 			if (type) {
1236 				if (type > RTAX_MAX) {
1237 					err = -EINVAL;
1238 					goto out;
1239 				}
1240 
1241 				rt->u.dst.metrics[type - 1] = nla_get_u32(nla);
1242 			}
1243 		}
1244 	}
1245 
1246 	if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1247 		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1248 	if (!rt->u.dst.metrics[RTAX_MTU-1])
1249 		rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1250 	if (!rt->u.dst.metrics[RTAX_ADVMSS-1])
1251 		rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->u.dst));
1252 	rt->u.dst.dev = dev;
1253 	rt->rt6i_idev = idev;
1254 	rt->rt6i_table = table;
1255 
1256 	cfg->fc_nlinfo.nl_net = dev_net(dev);
1257 
1258 	return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1259 
1260 out:
1261 	if (dev)
1262 		dev_put(dev);
1263 	if (idev)
1264 		in6_dev_put(idev);
1265 	if (rt)
1266 		dst_free(&rt->u.dst);
1267 	return err;
1268 }
1269 
1270 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1271 {
1272 	int err;
1273 	struct fib6_table *table;
1274 	struct net *net = dev_net(rt->rt6i_dev);
1275 
1276 	if (rt == net->ipv6.ip6_null_entry)
1277 		return -ENOENT;
1278 
1279 	table = rt->rt6i_table;
1280 	write_lock_bh(&table->tb6_lock);
1281 
1282 	err = fib6_del(rt, info);
1283 	dst_release(&rt->u.dst);
1284 
1285 	write_unlock_bh(&table->tb6_lock);
1286 
1287 	return err;
1288 }
1289 
1290 int ip6_del_rt(struct rt6_info *rt)
1291 {
1292 	struct nl_info info = {
1293 		.nl_net = dev_net(rt->rt6i_dev),
1294 	};
1295 	return __ip6_del_rt(rt, &info);
1296 }
1297 
1298 static int ip6_route_del(struct fib6_config *cfg)
1299 {
1300 	struct fib6_table *table;
1301 	struct fib6_node *fn;
1302 	struct rt6_info *rt;
1303 	int err = -ESRCH;
1304 
1305 	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1306 	if (table == NULL)
1307 		return err;
1308 
1309 	read_lock_bh(&table->tb6_lock);
1310 
1311 	fn = fib6_locate(&table->tb6_root,
1312 			 &cfg->fc_dst, cfg->fc_dst_len,
1313 			 &cfg->fc_src, cfg->fc_src_len);
1314 
1315 	if (fn) {
1316 		for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1317 			if (cfg->fc_ifindex &&
1318 			    (rt->rt6i_dev == NULL ||
1319 			     rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1320 				continue;
1321 			if (cfg->fc_flags & RTF_GATEWAY &&
1322 			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1323 				continue;
1324 			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1325 				continue;
1326 			dst_hold(&rt->u.dst);
1327 			read_unlock_bh(&table->tb6_lock);
1328 
1329 			return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1330 		}
1331 	}
1332 	read_unlock_bh(&table->tb6_lock);
1333 
1334 	return err;
1335 }
1336 
1337 /*
1338  *	Handle redirects
1339  */
1340 struct ip6rd_flowi {
1341 	struct flowi fl;
1342 	struct in6_addr gateway;
1343 };
1344 
1345 static struct rt6_info *__ip6_route_redirect(struct net *net,
1346 					     struct fib6_table *table,
1347 					     struct flowi *fl,
1348 					     int flags)
1349 {
1350 	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl;
1351 	struct rt6_info *rt;
1352 	struct fib6_node *fn;
1353 
1354 	/*
1355 	 * Get the "current" route for this destination and
1356 	 * check if the redirect has come from the appropriate router.
1357 	 *
1358 	 * RFC 2461 specifies that redirects should only be
1359 	 * accepted if they come from the nexthop to the target.
1360 	 * Due to the way the routes are chosen, this notion
1361 	 * is a bit fuzzy and one might need to check all possible
1362 	 * routes.
1363 	 */
1364 
1365 	read_lock_bh(&table->tb6_lock);
1366 	fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
1367 restart:
1368 	for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1369 		/*
1370 		 * Current route is on-link; redirect is always invalid.
1371 		 *
1372 		 * It seems the previous statement is not quite true: some
1373 		 * node may regard us as on-link (e.g. proxy ndisc), and the
1374 		 * router serving it might then decide that we should learn
1375 		 * the truth 8)8) --ANK (980726).
1376 		 */
1377 		if (rt6_check_expired(rt))
1378 			continue;
1379 		if (!(rt->rt6i_flags & RTF_GATEWAY))
1380 			continue;
1381 		if (fl->oif != rt->rt6i_dev->ifindex)
1382 			continue;
1383 		if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1384 			continue;
1385 		break;
1386 	}
1387 
1388 	if (!rt)
1389 		rt = net->ipv6.ip6_null_entry;
1390 	BACKTRACK(net, &fl->fl6_src);
1391 out:
1392 	dst_hold(&rt->u.dst);
1393 
1394 	read_unlock_bh(&table->tb6_lock);
1395 
1396 	return rt;
1397 };
1398 
1399 static struct rt6_info *ip6_route_redirect(struct in6_addr *dest,
1400 					   struct in6_addr *src,
1401 					   struct in6_addr *gateway,
1402 					   struct net_device *dev)
1403 {
1404 	int flags = RT6_LOOKUP_F_HAS_SADDR;
1405 	struct net *net = dev_net(dev);
1406 	struct ip6rd_flowi rdfl = {
1407 		.fl = {
1408 			.oif = dev->ifindex,
1409 			.nl_u = {
1410 				.ip6_u = {
1411 					.daddr = *dest,
1412 					.saddr = *src,
1413 				},
1414 			},
1415 		},
1416 		.gateway = *gateway,
1417 	};
1418 
1419 	if (rt6_need_strict(dest))
1420 		flags |= RT6_LOOKUP_F_IFACE;
1421 
1422 	return (struct rt6_info *)fib6_rule_lookup(net, (struct flowi *)&rdfl,
1423 						   flags, __ip6_route_redirect);
1424 }
1425 
1426 void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
1427 		  struct in6_addr *saddr,
1428 		  struct neighbour *neigh, u8 *lladdr, int on_link)
1429 {
1430 	struct rt6_info *rt, *nrt = NULL;
1431 	struct netevent_redirect netevent;
1432 	struct net *net = dev_net(neigh->dev);
1433 
1434 	rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1435 
1436 	if (rt == net->ipv6.ip6_null_entry) {
1437 		if (net_ratelimit())
1438 			printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1439 			       "for redirect target\n");
1440 		goto out;
1441 	}
1442 
1443 	/*
1444 	 *	We have finally decided to accept it.
1445 	 */
1446 
1447 	neigh_update(neigh, lladdr, NUD_STALE,
1448 		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
1449 		     NEIGH_UPDATE_F_OVERRIDE|
1450 		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1451 				     NEIGH_UPDATE_F_ISROUTER))
1452 		     );
1453 
1454 	/*
1455 	 * Redirect received -> path was valid.
1456 	 * Look, redirects are sent only in response to data packets,
1457 	 * so that this nexthop apparently is reachable. --ANK
1458 	 */
1459 	dst_confirm(&rt->u.dst);
1460 
1461 	/* Duplicate redirect: silently ignore. */
1462 	if (neigh == rt->u.dst.neighbour)
1463 		goto out;
1464 
1465 	nrt = ip6_rt_copy(rt);
1466 	if (nrt == NULL)
1467 		goto out;
1468 
1469 	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1470 	if (on_link)
1471 		nrt->rt6i_flags &= ~RTF_GATEWAY;
1472 
1473 	ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1474 	nrt->rt6i_dst.plen = 128;
1475 	nrt->u.dst.flags |= DST_HOST;
1476 
1477 	ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1478 	nrt->rt6i_nexthop = neigh_clone(neigh);
1479 	/* Reset pmtu, it may be better */
1480 	nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1481 	nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dev_net(neigh->dev),
1482 							dst_mtu(&nrt->u.dst));
1483 
1484 	if (ip6_ins_rt(nrt))
1485 		goto out;
1486 
1487 	netevent.old = &rt->u.dst;
1488 	netevent.new = &nrt->u.dst;
1489 	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1490 
1491 	if (rt->rt6i_flags&RTF_CACHE) {
1492 		ip6_del_rt(rt);
1493 		return;
1494 	}
1495 
1496 out:
1497 	dst_release(&rt->u.dst);
1498 	return;
1499 }
1500 
1501 /*
1502  *	Handle ICMP "packet too big" messages
1503  *	i.e. Path MTU discovery
1504  */
1505 
1506 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1507 			struct net_device *dev, u32 pmtu)
1508 {
1509 	struct rt6_info *rt, *nrt;
1510 	struct net *net = dev_net(dev);
1511 	int allfrag = 0;
1512 
1513 	rt = rt6_lookup(net, daddr, saddr, dev->ifindex, 0);
1514 	if (rt == NULL)
1515 		return;
1516 
1517 	if (pmtu >= dst_mtu(&rt->u.dst))
1518 		goto out;
1519 
1520 	if (pmtu < IPV6_MIN_MTU) {
1521 		/*
1522 		 * According to RFC 2460, when a node receives a Packet Too Big
1523 		 * message reporting a PMTU less than the IPv6 Minimum Link MTU
1524 		 * (1280), it sets the PMTU to 1280 and must include a fragment
1525 		 * header in all subsequent packets on that path.
1526 		 */
1527 		pmtu = IPV6_MIN_MTU;
1528 		allfrag = 1;
1529 	}
1530 
1531 	/* New mtu received -> path was valid.
1532 	   Packet Too Big messages are sent only in response to data packets,
1533 	   so this nexthop is apparently reachable. --ANK
1534 	 */
1535 	dst_confirm(&rt->u.dst);
1536 
1537 	/* Host route. If it is static, it would be better
1538 	   not to override it but to add a new one, so that
1539 	   when the cache entry expires the old pmtu
1540 	   is restored automatically.
1541 	 */
1542 	if (rt->rt6i_flags & RTF_CACHE) {
1543 		rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1544 		if (allfrag)
1545 			rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1546 		dst_set_expires(&rt->u.dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1547 		rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1548 		goto out;
1549 	}
1550 
1551 	/* Network route.
1552 	   Two cases are possible:
1553 	   1. It is a connected route. Action: COW it.
1554 	   2. It is a gatewayed or NONEXTHOP route. Action: clone it.
1555 	 */
1556 	if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1557 		nrt = rt6_alloc_cow(rt, daddr, saddr);
1558 	else
1559 		nrt = rt6_alloc_clone(rt, daddr);
1560 
1561 	if (nrt) {
1562 		nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1563 		if (allfrag)
1564 			nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1565 
1566 		/* According to RFC 1981, a PMTU increase should not be detected
1567 		 * within 5 minutes; the recommended timer is 10 minutes.  Here
1568 		 * the route expiration time is set to ip6_rt_mtu_expires, which
1569 		 * defaults to 10 minutes.  After that the decreased pmtu expires
1570 		 * and any PMTU increase is detected automatically.
1571 		 */
1572 		dst_set_expires(&nrt->u.dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1573 		nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1574 
1575 		ip6_ins_rt(nrt);
1576 	}
1577 out:
1578 	dst_release(&rt->u.dst);
1579 }
1580 
1581 /*
1582  *	Misc support functions
1583  */
1584 
1585 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1586 {
1587 	struct net *net = dev_net(ort->rt6i_dev);
1588 	struct rt6_info *rt = ip6_dst_alloc(net->ipv6.ip6_dst_ops);
1589 
1590 	if (rt) {
1591 		rt->u.dst.input = ort->u.dst.input;
1592 		rt->u.dst.output = ort->u.dst.output;
1593 
1594 		memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1595 		rt->u.dst.error = ort->u.dst.error;
1596 		rt->u.dst.dev = ort->u.dst.dev;
1597 		if (rt->u.dst.dev)
1598 			dev_hold(rt->u.dst.dev);
1599 		rt->rt6i_idev = ort->rt6i_idev;
1600 		if (rt->rt6i_idev)
1601 			in6_dev_hold(rt->rt6i_idev);
1602 		rt->u.dst.lastuse = jiffies;
1603 		rt->rt6i_expires = 0;
1604 
1605 		ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1606 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1607 		rt->rt6i_metric = 0;
1608 
1609 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1610 #ifdef CONFIG_IPV6_SUBTREES
1611 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1612 #endif
1613 		rt->rt6i_table = ort->rt6i_table;
1614 	}
1615 	return rt;
1616 }
1617 
1618 #ifdef CONFIG_IPV6_ROUTE_INFO
1619 static struct rt6_info *rt6_get_route_info(struct net *net,
1620 					   struct in6_addr *prefix, int prefixlen,
1621 					   struct in6_addr *gwaddr, int ifindex)
1622 {
1623 	struct fib6_node *fn;
1624 	struct rt6_info *rt = NULL;
1625 	struct fib6_table *table;
1626 
1627 	table = fib6_get_table(net, RT6_TABLE_INFO);
1628 	if (table == NULL)
1629 		return NULL;
1630 
1631 	write_lock_bh(&table->tb6_lock);
1632 	fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1633 	if (!fn)
1634 		goto out;
1635 
1636 	for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1637 		if (rt->rt6i_dev->ifindex != ifindex)
1638 			continue;
1639 		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1640 			continue;
1641 		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1642 			continue;
1643 		dst_hold(&rt->u.dst);
1644 		break;
1645 	}
1646 out:
1647 	write_unlock_bh(&table->tb6_lock);
1648 	return rt;
1649 }
1650 
1651 static struct rt6_info *rt6_add_route_info(struct net *net,
1652 					   struct in6_addr *prefix, int prefixlen,
1653 					   struct in6_addr *gwaddr, int ifindex,
1654 					   unsigned pref)
1655 {
1656 	struct fib6_config cfg = {
1657 		.fc_table	= RT6_TABLE_INFO,
1658 		.fc_metric	= IP6_RT_PRIO_USER,
1659 		.fc_ifindex	= ifindex,
1660 		.fc_dst_len	= prefixlen,
1661 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1662 				  RTF_UP | RTF_PREF(pref),
1663 		.fc_nlinfo.pid = 0,
1664 		.fc_nlinfo.nlh = NULL,
1665 		.fc_nlinfo.nl_net = net,
1666 	};
1667 
1668 	ipv6_addr_copy(&cfg.fc_dst, prefix);
1669 	ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1670 
1671 	/* We should treat it as a default route if prefix length is 0. */
1672 	if (!prefixlen)
1673 		cfg.fc_flags |= RTF_DEFAULT;
1674 
1675 	ip6_route_add(&cfg);
1676 
1677 	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1678 }
1679 #endif
1680 
1681 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1682 {
1683 	struct rt6_info *rt;
1684 	struct fib6_table *table;
1685 
1686 	table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1687 	if (table == NULL)
1688 		return NULL;
1689 
1690 	write_lock_bh(&table->tb6_lock);
1691 	for (rt = table->tb6_root.leaf; rt; rt=rt->u.dst.rt6_next) {
1692 		if (dev == rt->rt6i_dev &&
1693 		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1694 		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
1695 			break;
1696 	}
1697 	if (rt)
1698 		dst_hold(&rt->u.dst);
1699 	write_unlock_bh(&table->tb6_lock);
1700 	return rt;
1701 }
1702 
1703 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1704 				     struct net_device *dev,
1705 				     unsigned int pref)
1706 {
1707 	struct fib6_config cfg = {
1708 		.fc_table	= RT6_TABLE_DFLT,
1709 		.fc_metric	= IP6_RT_PRIO_USER,
1710 		.fc_ifindex	= dev->ifindex,
1711 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1712 				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1713 		.fc_nlinfo.pid = 0,
1714 		.fc_nlinfo.nlh = NULL,
1715 		.fc_nlinfo.nl_net = dev_net(dev),
1716 	};
1717 
1718 	ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1719 
1720 	ip6_route_add(&cfg);
1721 
1722 	return rt6_get_dflt_router(gwaddr, dev);
1723 }
1724 
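/*
 * rt6_purge_dflt_routers() deletes every route in the default-router table
 * carrying RTF_DEFAULT or RTF_ADDRCONF.  ip6_del_rt() needs the table write
 * lock, so the read lock is dropped before each deletion and the scan
 * restarts from the head of the table.
 */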
1725 void rt6_purge_dflt_routers(struct net *net)
1726 {
1727 	struct rt6_info *rt;
1728 	struct fib6_table *table;
1729 
1730 	/* NOTE: Keep consistent with rt6_get_dflt_router */
1731 	table = fib6_get_table(net, RT6_TABLE_DFLT);
1732 	if (table == NULL)
1733 		return;
1734 
1735 restart:
1736 	read_lock_bh(&table->tb6_lock);
1737 	for (rt = table->tb6_root.leaf; rt; rt = rt->u.dst.rt6_next) {
1738 		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1739 			dst_hold(&rt->u.dst);
1740 			read_unlock_bh(&table->tb6_lock);
1741 			ip6_del_rt(rt);
1742 			goto restart;
1743 		}
1744 	}
1745 	read_unlock_bh(&table->tb6_lock);
1746 }
1747 
1748 static void rtmsg_to_fib6_config(struct net *net,
1749 				 struct in6_rtmsg *rtmsg,
1750 				 struct fib6_config *cfg)
1751 {
1752 	memset(cfg, 0, sizeof(*cfg));
1753 
1754 	cfg->fc_table = RT6_TABLE_MAIN;
1755 	cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1756 	cfg->fc_metric = rtmsg->rtmsg_metric;
1757 	cfg->fc_expires = rtmsg->rtmsg_info;
1758 	cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1759 	cfg->fc_src_len = rtmsg->rtmsg_src_len;
1760 	cfg->fc_flags = rtmsg->rtmsg_flags;
1761 
1762 	cfg->fc_nlinfo.nl_net = net;
1763 
1764 	ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1765 	ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1766 	ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1767 }
1768 
1769 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1770 {
1771 	struct fib6_config cfg;
1772 	struct in6_rtmsg rtmsg;
1773 	int err;
1774 
1775 	switch(cmd) {
1776 	case SIOCADDRT:		/* Add a route */
1777 	case SIOCDELRT:		/* Delete a route */
1778 		if (!capable(CAP_NET_ADMIN))
1779 			return -EPERM;
1780 		err = copy_from_user(&rtmsg, arg,
1781 				     sizeof(struct in6_rtmsg));
1782 		if (err)
1783 			return -EFAULT;
1784 
1785 		rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1786 
1787 		rtnl_lock();
1788 		switch (cmd) {
1789 		case SIOCADDRT:
1790 			err = ip6_route_add(&cfg);
1791 			break;
1792 		case SIOCDELRT:
1793 			err = ip6_route_del(&cfg);
1794 			break;
1795 		default:
1796 			err = -EINVAL;
1797 		}
1798 		rtnl_unlock();
1799 
1800 		return err;
1801 	}
1802 
1803 	return -EINVAL;
1804 }
1805 
1806 /*
1807  *	Drop the packet on the floor
1808  */
1809 
1810 static int ip6_pkt_drop(struct sk_buff *skb, int code, int ipstats_mib_noroutes)
1811 {
1812 	int type;
1813 	switch (ipstats_mib_noroutes) {
1814 	case IPSTATS_MIB_INNOROUTES:
1815 		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1816 		if (type == IPV6_ADDR_ANY || type == IPV6_ADDR_RESERVED) {
1817 			IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_INADDRERRORS);
1818 			break;
1819 		}
1820 		/* FALLTHROUGH */
1821 	case IPSTATS_MIB_OUTNOROUTES:
1822 		IP6_INC_STATS(ip6_dst_idev(skb->dst), ipstats_mib_noroutes);
1823 		break;
1824 	}
1825 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0, skb->dev);
1826 	kfree_skb(skb);
1827 	return 0;
1828 }
1829 
1830 static int ip6_pkt_discard(struct sk_buff *skb)
1831 {
1832 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
1833 }
1834 
1835 static int ip6_pkt_discard_out(struct sk_buff *skb)
1836 {
1837 	skb->dev = skb->dst->dev;
1838 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
1839 }
1840 
1841 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1842 
1843 static int ip6_pkt_prohibit(struct sk_buff *skb)
1844 {
1845 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
1846 }
1847 
1848 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
1849 {
1850 	skb->dev = skb->dst->dev;
1851 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
1852 }
1853 
1854 #endif
1855 
1856 /*
1857  *	Allocate a dst for local (unicast / anycast) address.
1858  */
1859 
1860 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1861 				    const struct in6_addr *addr,
1862 				    int anycast)
1863 {
1864 	struct net *net = dev_net(idev->dev);
1865 	struct rt6_info *rt = ip6_dst_alloc(net->ipv6.ip6_dst_ops);
1866 
1867 	if (rt == NULL)
1868 		return ERR_PTR(-ENOMEM);
1869 
1870 	dev_hold(net->loopback_dev);
1871 	in6_dev_hold(idev);
1872 
1873 	rt->u.dst.flags = DST_HOST;
1874 	rt->u.dst.input = ip6_input;
1875 	rt->u.dst.output = ip6_output;
1876 	rt->rt6i_dev = net->loopback_dev;
1877 	rt->rt6i_idev = idev;
1878 	rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1879 	rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->u.dst));
1880 	rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1881 	rt->u.dst.obsolete = -1;
1882 
1883 	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1884 	if (anycast)
1885 		rt->rt6i_flags |= RTF_ANYCAST;
1886 	else
1887 		rt->rt6i_flags |= RTF_LOCAL;
1888 	rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1889 	if (rt->rt6i_nexthop == NULL) {
1890 		dst_free(&rt->u.dst);
1891 		return ERR_PTR(-ENOMEM);
1892 	}
1893 
1894 	ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1895 	rt->rt6i_dst.plen = 128;
1896 	rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
1897 
1898 	atomic_set(&rt->u.dst.__refcnt, 1);
1899 
1900 	return rt;
1901 }
1902 
1903 struct arg_dev_net {
1904 	struct net_device *dev;
1905 	struct net *net;
1906 };
1907 
1908 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1909 {
1910 	struct net_device *dev = ((struct arg_dev_net *)arg)->dev;
1911 	struct net *net = ((struct arg_dev_net *)arg)->net;
1912 
1913 	if (((void *)rt->rt6i_dev == dev || dev == NULL) &&
1914 	    rt != net->ipv6.ip6_null_entry) {
1915 		RT6_TRACE("deleted by ifdown %p\n", rt);
1916 		return -1;
1917 	}
1918 	return 0;
1919 }
1920 
1921 void rt6_ifdown(struct net *net, struct net_device *dev)
1922 {
1923 	struct arg_dev_net adn = {
1924 		.dev = dev,
1925 		.net = net,
1926 	};
1927 
1928 	fib6_clean_all(net, fib6_ifdown, 0, &adn);
1929 }
1930 
1931 struct rt6_mtu_change_arg
1932 {
1933 	struct net_device *dev;
1934 	unsigned mtu;
1935 };
1936 
1937 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
1938 {
1939 	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
1940 	struct inet6_dev *idev;
1941 	struct net *net = dev_net(arg->dev);
1942 
1943 	/* In IPv6 pmtu discovery is not optional,
1944 	   so the RTAX_MTU lock cannot disable it.
1945 	   We still use this lock to block changes
1946 	   caused by addrconf/ndisc.
1947 	*/
1948 
1949 	idev = __in6_dev_get(arg->dev);
1950 	if (idev == NULL)
1951 		return 0;
1952 
1953 	/* For an administrative MTU increase, there is no way to discover
1954 	   an IPv6 PMTU increase, so the PMTU increase should be applied here.
1955 	   Since RFC 1981 doesn't cover administrative MTU increases, updating
1956 	   the PMTU on such an increase is a MUST (e.g. for jumbo frames).
1957 	 */
1958 	/*
1959 	   If the new MTU is less than the route PMTU, this new MTU will be
1960 	   the lowest MTU in the path; update the route PMTU to reflect the
1961 	   decrease.  If the new MTU is greater than the route PMTU, and the
1962 	   old MTU was the lowest MTU in the path, update the route PMTU to
1963 	   reflect the increase.  In that case, if another node on the path
1964 	   still has a lower MTU, its Packet Too Big message will trigger
1965 	   PMTU discovery again.
1966 	 */
1967 	if (rt->rt6i_dev == arg->dev &&
1968 	    !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1969 	    (dst_mtu(&rt->u.dst) >= arg->mtu ||
1970 	     (dst_mtu(&rt->u.dst) < arg->mtu &&
1971 	      dst_mtu(&rt->u.dst) == idev->cnf.mtu6))) {
1972 		rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
1973 		rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, arg->mtu);
1974 	}
1975 	return 0;
1976 }
1977 
1978 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
1979 {
1980 	struct rt6_mtu_change_arg arg = {
1981 		.dev = dev,
1982 		.mtu = mtu,
1983 	};
1984 
1985 	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
1986 }
1987 
1988 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
1989 	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
1990 	[RTA_OIF]               = { .type = NLA_U32 },
1991 	[RTA_IIF]		= { .type = NLA_U32 },
1992 	[RTA_PRIORITY]          = { .type = NLA_U32 },
1993 	[RTA_METRICS]           = { .type = NLA_NESTED },
1994 };
1995 
1996 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
1997 			      struct fib6_config *cfg)
1998 {
1999 	struct rtmsg *rtm;
2000 	struct nlattr *tb[RTA_MAX+1];
2001 	int err;
2002 
2003 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2004 	if (err < 0)
2005 		goto errout;
2006 
2007 	err = -EINVAL;
2008 	rtm = nlmsg_data(nlh);
2009 	memset(cfg, 0, sizeof(*cfg));
2010 
2011 	cfg->fc_table = rtm->rtm_table;
2012 	cfg->fc_dst_len = rtm->rtm_dst_len;
2013 	cfg->fc_src_len = rtm->rtm_src_len;
2014 	cfg->fc_flags = RTF_UP;
2015 	cfg->fc_protocol = rtm->rtm_protocol;
2016 
2017 	if (rtm->rtm_type == RTN_UNREACHABLE)
2018 		cfg->fc_flags |= RTF_REJECT;
2019 
2020 	cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2021 	cfg->fc_nlinfo.nlh = nlh;
2022 	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2023 
2024 	if (tb[RTA_GATEWAY]) {
2025 		nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2026 		cfg->fc_flags |= RTF_GATEWAY;
2027 	}
2028 
2029 	if (tb[RTA_DST]) {
2030 		int plen = (rtm->rtm_dst_len + 7) >> 3;
2031 
2032 		if (nla_len(tb[RTA_DST]) < plen)
2033 			goto errout;
2034 
2035 		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2036 	}
2037 
2038 	if (tb[RTA_SRC]) {
2039 		int plen = (rtm->rtm_src_len + 7) >> 3;
2040 
2041 		if (nla_len(tb[RTA_SRC]) < plen)
2042 			goto errout;
2043 
2044 		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2045 	}
2046 
2047 	if (tb[RTA_OIF])
2048 		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2049 
2050 	if (tb[RTA_PRIORITY])
2051 		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2052 
2053 	if (tb[RTA_METRICS]) {
2054 		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2055 		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2056 	}
2057 
2058 	if (tb[RTA_TABLE])
2059 		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2060 
2061 	err = 0;
2062 errout:
2063 	return err;
2064 }
2065 
2066 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2067 {
2068 	struct fib6_config cfg;
2069 	int err;
2070 
2071 	err = rtm_to_fib6_config(skb, nlh, &cfg);
2072 	if (err < 0)
2073 		return err;
2074 
2075 	return ip6_route_del(&cfg);
2076 }
2077 
2078 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2079 {
2080 	struct fib6_config cfg;
2081 	int err;
2082 
2083 	err = rtm_to_fib6_config(skb, nlh, &cfg);
2084 	if (err < 0)
2085 		return err;
2086 
2087 	return ip6_route_add(&cfg);
2088 }
2089 
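/*
 * Worst-case netlink message size for a single IPv6 route; used to size the
 * notification skb allocated in inet6_rt_notify().
 */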
2090 static inline size_t rt6_nlmsg_size(void)
2091 {
2092 	return NLMSG_ALIGN(sizeof(struct rtmsg))
2093 	       + nla_total_size(16) /* RTA_SRC */
2094 	       + nla_total_size(16) /* RTA_DST */
2095 	       + nla_total_size(16) /* RTA_GATEWAY */
2096 	       + nla_total_size(16) /* RTA_PREFSRC */
2097 	       + nla_total_size(4) /* RTA_TABLE */
2098 	       + nla_total_size(4) /* RTA_IIF */
2099 	       + nla_total_size(4) /* RTA_OIF */
2100 	       + nla_total_size(4) /* RTA_PRIORITY */
2101 	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2102 	       + nla_total_size(sizeof(struct rta_cacheinfo));
2103 }
2104 
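/*
 * Fill one rtnetlink message describing @rt.  When @prefix is set only
 * prefix routes (RTF_PREFIX_RT) are reported and other routes are skipped
 * with a positive return value; -EMSGSIZE means the skb ran out of room.
 */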
2105 static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
2106 			 struct in6_addr *dst, struct in6_addr *src,
2107 			 int iif, int type, u32 pid, u32 seq,
2108 			 int prefix, int nowait, unsigned int flags)
2109 {
2110 	struct rtmsg *rtm;
2111 	struct nlmsghdr *nlh;
2112 	long expires;
2113 	u32 table;
2114 
2115 	if (prefix) {	/* user wants prefix routes only */
2116 		if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2117 			/* success since this is not a prefix route */
2118 			return 1;
2119 		}
2120 	}
2121 
2122 	nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2123 	if (nlh == NULL)
2124 		return -EMSGSIZE;
2125 
2126 	rtm = nlmsg_data(nlh);
2127 	rtm->rtm_family = AF_INET6;
2128 	rtm->rtm_dst_len = rt->rt6i_dst.plen;
2129 	rtm->rtm_src_len = rt->rt6i_src.plen;
2130 	rtm->rtm_tos = 0;
2131 	if (rt->rt6i_table)
2132 		table = rt->rt6i_table->tb6_id;
2133 	else
2134 		table = RT6_TABLE_UNSPEC;
2135 	rtm->rtm_table = table;
2136 	NLA_PUT_U32(skb, RTA_TABLE, table);
2137 	if (rt->rt6i_flags&RTF_REJECT)
2138 		rtm->rtm_type = RTN_UNREACHABLE;
2139 	else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2140 		rtm->rtm_type = RTN_LOCAL;
2141 	else
2142 		rtm->rtm_type = RTN_UNICAST;
2143 	rtm->rtm_flags = 0;
2144 	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2145 	rtm->rtm_protocol = rt->rt6i_protocol;
2146 	if (rt->rt6i_flags&RTF_DYNAMIC)
2147 		rtm->rtm_protocol = RTPROT_REDIRECT;
2148 	else if (rt->rt6i_flags & RTF_ADDRCONF)
2149 		rtm->rtm_protocol = RTPROT_KERNEL;
2150 	else if (rt->rt6i_flags&RTF_DEFAULT)
2151 		rtm->rtm_protocol = RTPROT_RA;
2152 
2153 	if (rt->rt6i_flags&RTF_CACHE)
2154 		rtm->rtm_flags |= RTM_F_CLONED;
2155 
2156 	if (dst) {
2157 		NLA_PUT(skb, RTA_DST, 16, dst);
2158 		rtm->rtm_dst_len = 128;
2159 	} else if (rtm->rtm_dst_len)
2160 		NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2161 #ifdef CONFIG_IPV6_SUBTREES
2162 	if (src) {
2163 		NLA_PUT(skb, RTA_SRC, 16, src);
2164 		rtm->rtm_src_len = 128;
2165 	} else if (rtm->rtm_src_len)
2166 		NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2167 #endif
2168 	if (iif) {
2169 #ifdef CONFIG_IPV6_MROUTE
2170 		if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2171 			int err = ip6mr_get_route(skb, rtm, nowait);
2172 			if (err <= 0) {
2173 				if (!nowait) {
2174 					if (err == 0)
2175 						return 0;
2176 					goto nla_put_failure;
2177 				} else {
2178 					if (err == -EMSGSIZE)
2179 						goto nla_put_failure;
2180 				}
2181 			}
2182 		} else
2183 #endif
2184 			NLA_PUT_U32(skb, RTA_IIF, iif);
2185 	} else if (dst) {
2186 		struct in6_addr saddr_buf;
2187 		if (ipv6_dev_get_saddr(ip6_dst_idev(&rt->u.dst)->dev,
2188 				       dst, 0, &saddr_buf) == 0)
2189 			NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2190 	}
2191 
2192 	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2193 		goto nla_put_failure;
2194 
2195 	if (rt->u.dst.neighbour)
2196 		NLA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
2197 
2198 	if (rt->u.dst.dev)
2199 		NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2200 
2201 	NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2202 
2203 	expires = rt->rt6i_expires ? rt->rt6i_expires - jiffies : 0;
2204 	if (rtnl_put_cacheinfo(skb, &rt->u.dst, 0, 0, 0,
2205 			       expires, rt->u.dst.error) < 0)
2206 		goto nla_put_failure;
2207 
2208 	return nlmsg_end(skb, nlh);
2209 
2210 nla_put_failure:
2211 	nlmsg_cancel(skb, nlh);
2212 	return -EMSGSIZE;
2213 }
2214 
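/*
 * fib6 dump walker callback for RTM_GETROUTE dumps.  If the request carried
 * RTM_F_PREFIX, only prefix routes are emitted.
 */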
2215 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2216 {
2217 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2218 	int prefix;
2219 
2220 	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2221 		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2222 		prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2223 	} else
2224 		prefix = 0;
2225 
2226 	return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2227 		     NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2228 		     prefix, 0, NLM_F_MULTI);
2229 }
2230 
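/*
 * RTM_GETROUTE handler: build a flow from the request attributes, resolve it
 * via ip6_route_output() and unicast the resulting route back to the sender.
 */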
2231 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2232 {
2233 	struct net *net = sock_net(in_skb->sk);
2234 	struct nlattr *tb[RTA_MAX+1];
2235 	struct rt6_info *rt;
2236 	struct sk_buff *skb;
2237 	struct rtmsg *rtm;
2238 	struct flowi fl;
2239 	int err, iif = 0;
2240 
2241 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2242 	if (err < 0)
2243 		goto errout;
2244 
2245 	err = -EINVAL;
2246 	memset(&fl, 0, sizeof(fl));
2247 
2248 	if (tb[RTA_SRC]) {
2249 		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2250 			goto errout;
2251 
2252 		ipv6_addr_copy(&fl.fl6_src, nla_data(tb[RTA_SRC]));
2253 	}
2254 
2255 	if (tb[RTA_DST]) {
2256 		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2257 			goto errout;
2258 
2259 		ipv6_addr_copy(&fl.fl6_dst, nla_data(tb[RTA_DST]));
2260 	}
2261 
2262 	if (tb[RTA_IIF])
2263 		iif = nla_get_u32(tb[RTA_IIF]);
2264 
2265 	if (tb[RTA_OIF])
2266 		fl.oif = nla_get_u32(tb[RTA_OIF]);
2267 
2268 	if (iif) {
2269 		struct net_device *dev;
2270 		dev = __dev_get_by_index(net, iif);
2271 		if (!dev) {
2272 			err = -ENODEV;
2273 			goto errout;
2274 		}
2275 	}
2276 
2277 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2278 	if (skb == NULL) {
2279 		err = -ENOBUFS;
2280 		goto errout;
2281 	}
2282 
2283 	/* Reserve room for dummy headers; this skb can pass
2284 	   through a good chunk of the routing engine.
2285 	 */
2286 	skb_reset_mac_header(skb);
2287 	skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2288 
2289 	rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl);
2290 	skb->dst = &rt->u.dst;
2291 
2292 	err = rt6_fill_node(skb, rt, &fl.fl6_dst, &fl.fl6_src, iif,
2293 			    RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2294 			    nlh->nlmsg_seq, 0, 0, 0);
2295 	if (err < 0) {
2296 		kfree_skb(skb);
2297 		goto errout;
2298 	}
2299 
2300 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2301 errout:
2302 	return err;
2303 }
2304 
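/*
 * Notify RTNLGRP_IPV6_ROUTE listeners about a route change; @event is
 * typically RTM_NEWROUTE or RTM_DELROUTE.
 */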
2305 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2306 {
2307 	struct sk_buff *skb;
2308 	struct net *net = info->nl_net;
2309 	u32 seq;
2310 	int err;
2311 
2312 	err = -ENOBUFS;
2313 	seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2314 
2315 	skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2316 	if (skb == NULL)
2317 		goto errout;
2318 
2319 	err = rt6_fill_node(skb, rt, NULL, NULL, 0,
2320 				event, info->pid, seq, 0, 0, 0);
2321 	if (err < 0) {
2322 		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2323 		WARN_ON(err == -EMSGSIZE);
2324 		kfree_skb(skb);
2325 		goto errout;
2326 	}
2327 	err = rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2328 			  info->nlh, gfp_any());
2329 errout:
2330 	if (err < 0)
2331 		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2332 }
2333 
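/*
 * Netdevice notifier: when the loopback device registers, attach it to the
 * per-namespace null (and, with multiple tables, prohibit/blackhole) routes
 * so they have a valid output device.
 */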
2334 static int ip6_route_dev_notify(struct notifier_block *this,
2335 				unsigned long event, void *data)
2336 {
2337 	struct net_device *dev = (struct net_device *)data;
2338 	struct net *net = dev_net(dev);
2339 
2340 	if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2341 		net->ipv6.ip6_null_entry->u.dst.dev = dev;
2342 		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2343 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2344 		net->ipv6.ip6_prohibit_entry->u.dst.dev = dev;
2345 		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2346 		net->ipv6.ip6_blk_hole_entry->u.dst.dev = dev;
2347 		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2348 #endif
2349 	}
2350 
2351 	return NOTIFY_OK;
2352 }
2353 
2354 /*
2355  *	/proc
2356  */
2357 
2358 #ifdef CONFIG_PROC_FS
2359 
2360 #define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)
2361 
2362 struct rt6_proc_arg
2363 {
2364 	char *buffer;
2365 	int offset;
2366 	int length;
2367 	int skip;
2368 	int len;
2369 };
2370 
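/*
 * Emit one /proc/net/ipv6_route line for @rt: destination, source, next hop,
 * metric, refcount, use count, flags and device name.
 */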
2371 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2372 {
2373 	struct seq_file *m = p_arg;
2374 
2375 	seq_printf(m, NIP6_SEQFMT " %02x ", NIP6(rt->rt6i_dst.addr),
2376 		   rt->rt6i_dst.plen);
2377 
2378 #ifdef CONFIG_IPV6_SUBTREES
2379 	seq_printf(m, NIP6_SEQFMT " %02x ", NIP6(rt->rt6i_src.addr),
2380 		   rt->rt6i_src.plen);
2381 #else
2382 	seq_puts(m, "00000000000000000000000000000000 00 ");
2383 #endif
2384 
2385 	if (rt->rt6i_nexthop) {
2386 		seq_printf(m, NIP6_SEQFMT,
2387 			   NIP6(*((struct in6_addr *)rt->rt6i_nexthop->primary_key)));
2388 	} else {
2389 		seq_puts(m, "00000000000000000000000000000000");
2390 	}
2391 	seq_printf(m, " %08x %08x %08x %08x %8s\n",
2392 		   rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
2393 		   rt->u.dst.__use, rt->rt6i_flags,
2394 		   rt->rt6i_dev ? rt->rt6i_dev->name : "");
2395 	return 0;
2396 }
2397 
2398 static int ipv6_route_show(struct seq_file *m, void *v)
2399 {
2400 	struct net *net = (struct net *)m->private;
2401 	fib6_clean_all(net, rt6_info_route, 0, m);
2402 	return 0;
2403 }
2404 
2405 static int ipv6_route_open(struct inode *inode, struct file *file)
2406 {
2407 	int err;
2408 	struct net *net = get_proc_net(inode);
2409 	if (!net)
2410 		return -ENXIO;
2411 
2412 	err = single_open(file, ipv6_route_show, net);
2413 	if (err < 0) {
2414 		put_net(net);
2415 		return err;
2416 	}
2417 
2418 	return 0;
2419 }
2420 
2421 static int ipv6_route_release(struct inode *inode, struct file *file)
2422 {
2423 	struct seq_file *seq = file->private_data;
2424 	struct net *net = seq->private;
2425 	put_net(net);
2426 	return single_release(inode, file);
2427 }
2428 
2429 static const struct file_operations ipv6_route_proc_fops = {
2430 	.owner		= THIS_MODULE,
2431 	.open		= ipv6_route_open,
2432 	.read		= seq_read,
2433 	.llseek		= seq_lseek,
2434 	.release	= ipv6_route_release,
2435 };
2436 
2437 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2438 {
2439 	struct net *net = (struct net *)seq->private;
2440 	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2441 		   net->ipv6.rt6_stats->fib_nodes,
2442 		   net->ipv6.rt6_stats->fib_route_nodes,
2443 		   net->ipv6.rt6_stats->fib_rt_alloc,
2444 		   net->ipv6.rt6_stats->fib_rt_entries,
2445 		   net->ipv6.rt6_stats->fib_rt_cache,
2446 		   atomic_read(&net->ipv6.ip6_dst_ops->entries),
2447 		   net->ipv6.rt6_stats->fib_discarded_routes);
2448 
2449 	return 0;
2450 }
2451 
2452 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2453 {
2454 	int err;
2455 	struct net *net = get_proc_net(inode);
2456 	if (!net)
2457 		return -ENXIO;
2458 
2459 	err = single_open(file, rt6_stats_seq_show, net);
2460 	if (err < 0) {
2461 		put_net(net);
2462 		return err;
2463 	}
2464 
2465 	return 0;
2466 }
2467 
2468 static int rt6_stats_seq_release(struct inode *inode, struct file *file)
2469 {
2470 	struct seq_file *seq = file->private_data;
2471 	struct net *net = (struct net *)seq->private;
2472 	put_net(net);
2473 	return single_release(inode, file);
2474 }
2475 
2476 static const struct file_operations rt6_stats_seq_fops = {
2477 	.owner	 = THIS_MODULE,
2478 	.open	 = rt6_stats_seq_open,
2479 	.read	 = seq_read,
2480 	.llseek	 = seq_lseek,
2481 	.release = rt6_stats_seq_release,
2482 };
2483 #endif	/* CONFIG_PROC_FS */
2484 
2485 #ifdef CONFIG_SYSCTL
2486 
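/*
 * Write-only handler for the "flush" route sysctl: a write triggers an
 * immediate fib6 garbage collection run, reads are rejected with -EINVAL.
 */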
2487 static
2488 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
2489 			      void __user *buffer, size_t *lenp, loff_t *ppos)
2490 {
2491 	struct net *net = current->nsproxy->net_ns;
2492 	int delay = net->ipv6.sysctl.flush_delay;
2493 	if (write) {
2494 		proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2495 		fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2496 		return 0;
2497 	} else
2498 		return -EINVAL;
2499 }
2500 
2501 ctl_table ipv6_route_table_template[] = {
2502 	{
2503 		.procname	=	"flush",
2504 		.data		=	&init_net.ipv6.sysctl.flush_delay,
2505 		.maxlen		=	sizeof(int),
2506 		.mode		=	0200,
2507 		.proc_handler	=	&ipv6_sysctl_rtcache_flush
2508 	},
2509 	{
2510 		.ctl_name	=	NET_IPV6_ROUTE_GC_THRESH,
2511 		.procname	=	"gc_thresh",
2512 		.data		=	&ip6_dst_ops_template.gc_thresh,
2513 		.maxlen		=	sizeof(int),
2514 		.mode		=	0644,
2515 		.proc_handler	=	&proc_dointvec,
2516 	},
2517 	{
2518 		.ctl_name	=	NET_IPV6_ROUTE_MAX_SIZE,
2519 		.procname	=	"max_size",
2520 		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
2521 		.maxlen		=	sizeof(int),
2522 		.mode		=	0644,
2523 		.proc_handler	=	&proc_dointvec,
2524 	},
2525 	{
2526 		.ctl_name	=	NET_IPV6_ROUTE_GC_MIN_INTERVAL,
2527 		.procname	=	"gc_min_interval",
2528 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2529 		.maxlen		=	sizeof(int),
2530 		.mode		=	0644,
2531 		.proc_handler	=	&proc_dointvec_jiffies,
2532 		.strategy	=	&sysctl_jiffies,
2533 	},
2534 	{
2535 		.ctl_name	=	NET_IPV6_ROUTE_GC_TIMEOUT,
2536 		.procname	=	"gc_timeout",
2537 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2538 		.maxlen		=	sizeof(int),
2539 		.mode		=	0644,
2540 		.proc_handler	=	&proc_dointvec_jiffies,
2541 		.strategy	=	&sysctl_jiffies,
2542 	},
2543 	{
2544 		.ctl_name	=	NET_IPV6_ROUTE_GC_INTERVAL,
2545 		.procname	=	"gc_interval",
2546 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
2547 		.maxlen		=	sizeof(int),
2548 		.mode		=	0644,
2549 		.proc_handler	=	&proc_dointvec_jiffies,
2550 		.strategy	=	&sysctl_jiffies,
2551 	},
2552 	{
2553 		.ctl_name	=	NET_IPV6_ROUTE_GC_ELASTICITY,
2554 		.procname	=	"gc_elasticity",
2555 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2556 		.maxlen		=	sizeof(int),
2557 		.mode		=	0644,
2558 		.proc_handler	=	&proc_dointvec_jiffies,
2559 		.strategy	=	&sysctl_jiffies,
2560 	},
2561 	{
2562 		.ctl_name	=	NET_IPV6_ROUTE_MTU_EXPIRES,
2563 		.procname	=	"mtu_expires",
2564 		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2565 		.maxlen		=	sizeof(int),
2566 		.mode		=	0644,
2567 		.proc_handler	=	&proc_dointvec_jiffies,
2568 		.strategy	=	&sysctl_jiffies,
2569 	},
2570 	{
2571 		.ctl_name	=	NET_IPV6_ROUTE_MIN_ADVMSS,
2572 		.procname	=	"min_adv_mss",
2573 		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
2574 		.maxlen		=	sizeof(int),
2575 		.mode		=	0644,
2576 		.proc_handler	=	&proc_dointvec_jiffies,
2577 		.strategy	=	&sysctl_jiffies,
2578 	},
2579 	{
2580 		.ctl_name	=	NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,
2581 		.procname	=	"gc_min_interval_ms",
2582 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2583 		.maxlen		=	sizeof(int),
2584 		.mode		=	0644,
2585 		.proc_handler	=	&proc_dointvec_ms_jiffies,
2586 		.strategy	=	&sysctl_ms_jiffies,
2587 	},
2588 	{ .ctl_name = 0 }
2589 };
2590 
2591 struct ctl_table *ipv6_route_sysctl_init(struct net *net)
2592 {
2593 	struct ctl_table *table;
2594 
2595 	table = kmemdup(ipv6_route_table_template,
2596 			sizeof(ipv6_route_table_template),
2597 			GFP_KERNEL);
2598 
2599 	if (table) {
2600 		table[0].data = &net->ipv6.sysctl.flush_delay;
2601 		table[1].data = &net->ipv6.ip6_dst_ops->gc_thresh;
2602 		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2603 		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2604 		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2605 		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2606 		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2607 		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2608 		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2609 	}
2610 
2611 	return table;
2612 }
2613 #endif
2614 
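/*
 * Per-namespace setup: duplicate the dst_ops template and the special route
 * entries (null and, with multiple tables, prohibit/blackhole) for this
 * namespace and create its /proc entries.
 */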
2615 static int ip6_route_net_init(struct net *net)
2616 {
2617 	int ret = 0;
2618 
2619 	ret = -ENOMEM;
2620 	net->ipv6.ip6_dst_ops = kmemdup(&ip6_dst_ops_template,
2621 					sizeof(*net->ipv6.ip6_dst_ops),
2622 					GFP_KERNEL);
2623 	if (!net->ipv6.ip6_dst_ops)
2624 		goto out;
2625 	net->ipv6.ip6_dst_ops->dst_net = hold_net(net);
2626 
2627 	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2628 					   sizeof(*net->ipv6.ip6_null_entry),
2629 					   GFP_KERNEL);
2630 	if (!net->ipv6.ip6_null_entry)
2631 		goto out_ip6_dst_ops;
2632 	net->ipv6.ip6_null_entry->u.dst.path =
2633 		(struct dst_entry *)net->ipv6.ip6_null_entry;
2634 	net->ipv6.ip6_null_entry->u.dst.ops = net->ipv6.ip6_dst_ops;
2635 
2636 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2637 	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2638 					       sizeof(*net->ipv6.ip6_prohibit_entry),
2639 					       GFP_KERNEL);
2640 	if (!net->ipv6.ip6_prohibit_entry) {
2641 		kfree(net->ipv6.ip6_null_entry);
2642 		goto out;
2643 	}
2644 	net->ipv6.ip6_prohibit_entry->u.dst.path =
2645 		(struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2646 	net->ipv6.ip6_prohibit_entry->u.dst.ops = net->ipv6.ip6_dst_ops;
2647 
2648 	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2649 					       sizeof(*net->ipv6.ip6_blk_hole_entry),
2650 					       GFP_KERNEL);
2651 	if (!net->ipv6.ip6_blk_hole_entry) {
2652 		kfree(net->ipv6.ip6_null_entry);
2653 		kfree(net->ipv6.ip6_prohibit_entry);
2654 		goto out;
2655 	}
2656 	net->ipv6.ip6_blk_hole_entry->u.dst.path =
2657 		(struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2658 	net->ipv6.ip6_blk_hole_entry->u.dst.ops = net->ipv6.ip6_dst_ops;
2659 #endif
2660 
2661 #ifdef CONFIG_PROC_FS
2662 	proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2663 	proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2664 #endif
2665 	net->ipv6.ip6_rt_gc_expire = 30*HZ;
2666 
2667 	ret = 0;
2668 out:
2669 	return ret;
2670 
2671 out_ip6_dst_ops:
2672 	release_net(net->ipv6.ip6_dst_ops->dst_net);
2673 	kfree(net->ipv6.ip6_dst_ops);
2674 	goto out;
2675 }
2676 
2677 static void ip6_route_net_exit(struct net *net)
2678 {
2679 #ifdef CONFIG_PROC_FS
2680 	proc_net_remove(net, "ipv6_route");
2681 	proc_net_remove(net, "rt6_stats");
2682 #endif
2683 	kfree(net->ipv6.ip6_null_entry);
2684 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2685 	kfree(net->ipv6.ip6_prohibit_entry);
2686 	kfree(net->ipv6.ip6_blk_hole_entry);
2687 #endif
2688 	release_net(net->ipv6.ip6_dst_ops->dst_net);
2689 	kfree(net->ipv6.ip6_dst_ops);
2690 }
2691 
2692 static struct pernet_operations ip6_route_net_ops = {
2693 	.init = ip6_route_net_init,
2694 	.exit = ip6_route_net_exit,
2695 };
2696 
2697 static struct notifier_block ip6_route_dev_notifier = {
2698 	.notifier_call = ip6_route_dev_notify,
2699 	.priority = 0,
2700 };
2701 
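/*
 * Subsystem init: create the rt6_info slab cache, register the per-namespace
 * operations, wire the init_net special routes to the loopback device, then
 * bring up fib6, xfrm6 and the fib rules before registering the rtnetlink
 * handlers and the netdevice notifier.
 */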
2702 int __init ip6_route_init(void)
2703 {
2704 	int ret;
2705 
2706 	ret = -ENOMEM;
2707 	ip6_dst_ops_template.kmem_cachep =
2708 		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2709 				  SLAB_HWCACHE_ALIGN, NULL);
2710 	if (!ip6_dst_ops_template.kmem_cachep)
2711 		goto out;
2712 
2713 	ret = register_pernet_subsys(&ip6_route_net_ops);
2714 	if (ret)
2715 		goto out_kmem_cache;
2716 
2717 	/* The loopback device is registered before this code runs, so the
2718 	 * loopback reference in rt6_info is not taken automatically; do it
2719 	 * manually for init_net. */
2720 	init_net.ipv6.ip6_null_entry->u.dst.dev = init_net.loopback_dev;
2721 	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2722 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2723 	init_net.ipv6.ip6_prohibit_entry->u.dst.dev = init_net.loopback_dev;
2724 	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2725 	init_net.ipv6.ip6_blk_hole_entry->u.dst.dev = init_net.loopback_dev;
2726 	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2727 #endif
2728 	ret = fib6_init();
2729 	if (ret)
2730 		goto out_register_subsys;
2731 
2732 	ret = xfrm6_init();
2733 	if (ret)
2734 		goto out_fib6_init;
2735 
2736 	ret = fib6_rules_init();
2737 	if (ret)
2738 		goto xfrm6_init;
2739 
2740 	ret = -ENOBUFS;
2741 	if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL) ||
2742 	    __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL) ||
2743 	    __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL))
2744 		goto fib6_rules_init;
2745 
2746 	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
2747 	if (ret)
2748 		goto fib6_rules_init;
2749 
2750 out:
2751 	return ret;
2752 
2753 fib6_rules_init:
2754 	fib6_rules_cleanup();
2755 xfrm6_init:
2756 	xfrm6_fini();
2757 out_fib6_init:
2758 	fib6_gc_cleanup();
2759 out_register_subsys:
2760 	unregister_pernet_subsys(&ip6_route_net_ops);
2761 out_kmem_cache:
2762 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2763 	goto out;
2764 }
2765 
2766 void ip6_route_cleanup(void)
2767 {
2768 	unregister_netdevice_notifier(&ip6_route_dev_notifier);
2769 	fib6_rules_cleanup();
2770 	xfrm6_fini();
2771 	fib6_gc_cleanup();
2772 	unregister_pernet_subsys(&ip6_route_net_ops);
2773 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2774 }
2775