xref: /openbmc/linux/net/ipv6/route.c (revision f30828a6)
1 /*
2  *	Linux INET6 implementation
3  *	FIB front-end.
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	$Id: route.c,v 1.56 2001/10/31 21:55:55 davem Exp $
9  *
10  *	This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  */
15 
16 /*	Changes:
17  *
18  *	YOSHIFUJI Hideaki @USAGI
19  *		reworked default router selection.
20  *		- respect outgoing interface
21  *		- select from (probably) reachable routers (i.e.
22  *		routers in REACHABLE, STALE, DELAY or PROBE states).
23  *		- always select the same router if it is (probably)
24  *		reachable.  otherwise, round-robin the list.
25  *	Ville Nuorvala
26  *		Fixed routing subtrees.
27  */
28 
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/types.h>
32 #include <linux/times.h>
33 #include <linux/socket.h>
34 #include <linux/sockios.h>
35 #include <linux/net.h>
36 #include <linux/route.h>
37 #include <linux/netdevice.h>
38 #include <linux/in6.h>
39 #include <linux/mroute6.h>
40 #include <linux/init.h>
41 #include <linux/if_arp.h>
42 #include <linux/proc_fs.h>
43 #include <linux/seq_file.h>
44 #include <linux/nsproxy.h>
45 #include <net/net_namespace.h>
46 #include <net/snmp.h>
47 #include <net/ipv6.h>
48 #include <net/ip6_fib.h>
49 #include <net/ip6_route.h>
50 #include <net/ndisc.h>
51 #include <net/addrconf.h>
52 #include <net/tcp.h>
53 #include <linux/rtnetlink.h>
54 #include <net/dst.h>
55 #include <net/xfrm.h>
56 #include <net/netevent.h>
57 #include <net/netlink.h>
58 
59 #include <asm/uaccess.h>
60 
61 #ifdef CONFIG_SYSCTL
62 #include <linux/sysctl.h>
63 #endif
64 
65 /* Set to 3 to get tracing. */
66 #define RT6_DEBUG 2
67 
68 #if RT6_DEBUG >= 3
69 #define RDBG(x) printk x
70 #define RT6_TRACE(x...) printk(KERN_DEBUG x)
71 #else
72 #define RDBG(x)
73 #define RT6_TRACE(x...) do { ; } while (0)
74 #endif
75 
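/*
 * CLONE_OFFLINK_ROUTE controls ip6_pol_route(): when non-zero, routes that
 * already carry a nexthop (gatewayed or RTF_NONEXTHOP entries) also get a
 * per-destination RTF_CACHE clone via rt6_alloc_clone(); with the default
 * of 0, such routes are used directly without cloning.
 */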
76 #define CLONE_OFFLINK_ROUTE 0
77 
78 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
79 static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
80 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
81 static void		ip6_dst_destroy(struct dst_entry *);
82 static void		ip6_dst_ifdown(struct dst_entry *,
83 				       struct net_device *dev, int how);
84 static int		 ip6_dst_gc(struct dst_ops *ops);
85 
86 static int		ip6_pkt_discard(struct sk_buff *skb);
87 static int		ip6_pkt_discard_out(struct sk_buff *skb);
88 static void		ip6_link_failure(struct sk_buff *skb);
89 static void		ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
90 
91 #ifdef CONFIG_IPV6_ROUTE_INFO
92 static struct rt6_info *rt6_add_route_info(struct net *net,
93 					   struct in6_addr *prefix, int prefixlen,
94 					   struct in6_addr *gwaddr, int ifindex,
95 					   unsigned pref);
96 static struct rt6_info *rt6_get_route_info(struct net *net,
97 					   struct in6_addr *prefix, int prefixlen,
98 					   struct in6_addr *gwaddr, int ifindex);
99 #endif
100 
101 static struct dst_ops ip6_dst_ops_template = {
102 	.family			=	AF_INET6,
103 	.protocol		=	__constant_htons(ETH_P_IPV6),
104 	.gc			=	ip6_dst_gc,
105 	.gc_thresh		=	1024,
106 	.check			=	ip6_dst_check,
107 	.destroy		=	ip6_dst_destroy,
108 	.ifdown			=	ip6_dst_ifdown,
109 	.negative_advice	=	ip6_negative_advice,
110 	.link_failure		=	ip6_link_failure,
111 	.update_pmtu		=	ip6_rt_update_pmtu,
112 	.local_out		=	__ip6_local_out,
113 	.entry_size		=	sizeof(struct rt6_info),
114 	.entries		=	ATOMIC_INIT(0),
115 };
116 
117 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
118 {
119 }
120 
121 static struct dst_ops ip6_dst_blackhole_ops = {
122 	.family			=	AF_INET6,
123 	.protocol		=	__constant_htons(ETH_P_IPV6),
124 	.destroy		=	ip6_dst_destroy,
125 	.check			=	ip6_dst_check,
126 	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
127 	.entry_size		=	sizeof(struct rt6_info),
128 	.entries		=	ATOMIC_INIT(0),
129 };
130 
131 static struct rt6_info ip6_null_entry_template = {
132 	.u = {
133 		.dst = {
134 			.__refcnt	= ATOMIC_INIT(1),
135 			.__use		= 1,
136 			.obsolete	= -1,
137 			.error		= -ENETUNREACH,
138 			.metrics	= { [RTAX_HOPLIMIT - 1] = 255, },
139 			.input		= ip6_pkt_discard,
140 			.output		= ip6_pkt_discard_out,
141 		}
142 	},
143 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
144 	.rt6i_metric	= ~(u32) 0,
145 	.rt6i_ref	= ATOMIC_INIT(1),
146 };
147 
148 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
149 
150 static int ip6_pkt_prohibit(struct sk_buff *skb);
151 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
152 
153 static struct rt6_info ip6_prohibit_entry_template = {
154 	.u = {
155 		.dst = {
156 			.__refcnt	= ATOMIC_INIT(1),
157 			.__use		= 1,
158 			.obsolete	= -1,
159 			.error		= -EACCES,
160 			.metrics	= { [RTAX_HOPLIMIT - 1] = 255, },
161 			.input		= ip6_pkt_prohibit,
162 			.output		= ip6_pkt_prohibit_out,
163 		}
164 	},
165 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
166 	.rt6i_metric	= ~(u32) 0,
167 	.rt6i_ref	= ATOMIC_INIT(1),
168 };
169 
170 static struct rt6_info ip6_blk_hole_entry_template = {
171 	.u = {
172 		.dst = {
173 			.__refcnt	= ATOMIC_INIT(1),
174 			.__use		= 1,
175 			.obsolete	= -1,
176 			.error		= -EINVAL,
177 			.metrics	= { [RTAX_HOPLIMIT - 1] = 255, },
178 			.input		= dst_discard,
179 			.output		= dst_discard,
180 		}
181 	},
182 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
183 	.rt6i_metric	= ~(u32) 0,
184 	.rt6i_ref	= ATOMIC_INIT(1),
185 };
186 
187 #endif
188 
189 /* allocate dst with ip6_dst_ops */
190 static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops)
191 {
192 	return (struct rt6_info *)dst_alloc(ops);
193 }
194 
195 static void ip6_dst_destroy(struct dst_entry *dst)
196 {
197 	struct rt6_info *rt = (struct rt6_info *)dst;
198 	struct inet6_dev *idev = rt->rt6i_idev;
199 
200 	if (idev != NULL) {
201 		rt->rt6i_idev = NULL;
202 		in6_dev_put(idev);
203 	}
204 }
205 
206 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
207 			   int how)
208 {
209 	struct rt6_info *rt = (struct rt6_info *)dst;
210 	struct inet6_dev *idev = rt->rt6i_idev;
211 	struct net_device *loopback_dev =
212 		dev_net(dev)->loopback_dev;
213 
214 	if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
215 		struct inet6_dev *loopback_idev =
216 			in6_dev_get(loopback_dev);
217 		if (loopback_idev != NULL) {
218 			rt->rt6i_idev = loopback_idev;
219 			in6_dev_put(idev);
220 		}
221 	}
222 }
223 
224 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
225 {
226 	return (rt->rt6i_flags & RTF_EXPIRES &&
227 		time_after(jiffies, rt->rt6i_expires));
228 }
229 
230 static inline int rt6_need_strict(struct in6_addr *daddr)
231 {
232 	return (ipv6_addr_type(daddr) &
233 		(IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL));
234 }
235 
236 /*
237  *	Route lookup. The appropriate table->tb6_lock is assumed to be held.
238  */
239 
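/*
 * rt6_device_match - walk the list of routes hanging off a fib6 leaf and
 * return the one bound to the requested outgoing interface.  Routes on a
 * loopback device are remembered as a local fallback; if nothing matches
 * and RT6_LOOKUP_F_IFACE is set, ip6_null_entry is returned instead of the
 * head of the list.
 */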
240 static inline struct rt6_info *rt6_device_match(struct net *net,
241 						    struct rt6_info *rt,
242 						    int oif,
243 						    int flags)
244 {
245 	struct rt6_info *local = NULL;
246 	struct rt6_info *sprt;
247 
248 	if (oif) {
249 		for (sprt = rt; sprt; sprt = sprt->u.dst.rt6_next) {
250 			struct net_device *dev = sprt->rt6i_dev;
251 			if (dev->ifindex == oif)
252 				return sprt;
253 			if (dev->flags & IFF_LOOPBACK) {
254 				if (sprt->rt6i_idev == NULL ||
255 				    sprt->rt6i_idev->dev->ifindex != oif) {
256 					if (flags & RT6_LOOKUP_F_IFACE && oif)
257 						continue;
258 					if (local && (!oif ||
259 						      local->rt6i_idev->dev->ifindex == oif))
260 						continue;
261 				}
262 				local = sprt;
263 			}
264 		}
265 
266 		if (local)
267 			return local;
268 
269 		if (flags & RT6_LOOKUP_F_IFACE)
270 			return net->ipv6.ip6_null_entry;
271 	}
272 	return rt;
273 }
274 
275 #ifdef CONFIG_IPV6_ROUTER_PREF
276 static void rt6_probe(struct rt6_info *rt)
277 {
278 	struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
279 	/*
280 	 * Okay, this does not seem to be appropriate
281 	 * for now, however, we need to check if it
282 	 * is really so; aka Router Reachability Probing.
283 	 *
284 	 * Router Reachability Probe MUST be rate-limited
285 	 * to no more than one per minute.
286 	 */
287 	if (!neigh || (neigh->nud_state & NUD_VALID))
288 		return;
289 	read_lock_bh(&neigh->lock);
290 	if (!(neigh->nud_state & NUD_VALID) &&
291 	    time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
292 		struct in6_addr mcaddr;
293 		struct in6_addr *target;
294 
295 		neigh->updated = jiffies;
296 		read_unlock_bh(&neigh->lock);
297 
298 		target = (struct in6_addr *)&neigh->primary_key;
299 		addrconf_addr_solict_mult(target, &mcaddr);
300 		ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
301 	} else
302 		read_unlock_bh(&neigh->lock);
303 }
304 #else
305 static inline void rt6_probe(struct rt6_info *rt)
306 {
307 	return;
308 }
309 #endif
310 
311 /*
312  * Default Router Selection (RFC 2461 6.3.6)
313  */
314 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
315 {
316 	struct net_device *dev = rt->rt6i_dev;
317 	if (!oif || dev->ifindex == oif)
318 		return 2;
319 	if ((dev->flags & IFF_LOOPBACK) &&
320 	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
321 		return 1;
322 	return 0;
323 }
324 
325 static inline int rt6_check_neigh(struct rt6_info *rt)
326 {
327 	struct neighbour *neigh = rt->rt6i_nexthop;
328 	int m;
329 	if (rt->rt6i_flags & RTF_NONEXTHOP ||
330 	    !(rt->rt6i_flags & RTF_GATEWAY))
331 		m = 1;
332 	else if (neigh) {
333 		read_lock_bh(&neigh->lock);
334 		if (neigh->nud_state & NUD_VALID)
335 			m = 2;
336 #ifdef CONFIG_IPV6_ROUTER_PREF
337 		else if (neigh->nud_state & NUD_FAILED)
338 			m = 0;
339 #endif
340 		else
341 			m = 1;
342 		read_unlock_bh(&neigh->lock);
343 	} else
344 		m = 0;
345 	return m;
346 }
347 
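/*
 * rt6_score_route - rank a candidate router.  The score combines the
 * interface match from rt6_check_dev() with the router preference bits
 * (under CONFIG_IPV6_ROUTER_PREF), and the route is rejected outright (-1)
 * when strict interface or reachability requirements are not met.
 */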
348 static int rt6_score_route(struct rt6_info *rt, int oif,
349 			   int strict)
350 {
351 	int m, n;
352 
353 	m = rt6_check_dev(rt, oif);
354 	if (!m && (strict & RT6_LOOKUP_F_IFACE))
355 		return -1;
356 #ifdef CONFIG_IPV6_ROUTER_PREF
357 	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
358 #endif
359 	n = rt6_check_neigh(rt);
360 	if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
361 		return -1;
362 	return m;
363 }
364 
365 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
366 				   int *mpri, struct rt6_info *match)
367 {
368 	int m;
369 
370 	if (rt6_check_expired(rt))
371 		goto out;
372 
373 	m = rt6_score_route(rt, oif, strict);
374 	if (m < 0)
375 		goto out;
376 
377 	if (m > *mpri) {
378 		if (strict & RT6_LOOKUP_F_REACHABLE)
379 			rt6_probe(match);
380 		*mpri = m;
381 		match = rt;
382 	} else if (strict & RT6_LOOKUP_F_REACHABLE) {
383 		rt6_probe(rt);
384 	}
385 
386 out:
387 	return match;
388 }
389 
390 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
391 				     struct rt6_info *rr_head,
392 				     u32 metric, int oif, int strict)
393 {
394 	struct rt6_info *rt, *match;
395 	int mpri = -1;
396 
397 	match = NULL;
398 	for (rt = rr_head; rt && rt->rt6i_metric == metric;
399 	     rt = rt->u.dst.rt6_next)
400 		match = find_match(rt, oif, strict, &mpri, match);
401 	for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
402 	     rt = rt->u.dst.rt6_next)
403 		match = find_match(rt, oif, strict, &mpri, match);
404 
405 	return match;
406 }
407 
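/*
 * rt6_select - pick the best route among the same-metric siblings of a
 * fib6 node, starting the scan at fn->rr_ptr.  When the reachability
 * requirement left no match, rr_ptr is advanced to the next sibling so
 * that subsequent lookups round-robin over equivalent routers.
 */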
408 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
409 {
410 	struct rt6_info *match, *rt0;
411 	struct net *net;
412 
413 	RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
414 		  __func__, fn->leaf, oif);
415 
416 	rt0 = fn->rr_ptr;
417 	if (!rt0)
418 		fn->rr_ptr = rt0 = fn->leaf;
419 
420 	match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
421 
422 	if (!match &&
423 	    (strict & RT6_LOOKUP_F_REACHABLE)) {
424 		struct rt6_info *next = rt0->u.dst.rt6_next;
425 
426 		/* no entries matched; do round-robin */
427 		if (!next || next->rt6i_metric != rt0->rt6i_metric)
428 			next = fn->leaf;
429 
430 		if (next != rt0)
431 			fn->rr_ptr = next;
432 	}
433 
434 	RT6_TRACE("%s() => %p\n",
435 		  __func__, match);
436 
437 	net = dev_net(rt0->rt6i_dev);
438 	return (match ? match : net->ipv6.ip6_null_entry);
439 }
440 
441 #ifdef CONFIG_IPV6_ROUTE_INFO
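/*
 * rt6_route_rcv - process a Route Information option received in a Router
 * Advertisement (RFC 4191): validate its length and prefix length, then
 * add, refresh or delete the corresponding RTF_ROUTEINFO route depending
 * on the advertised lifetime and preference.
 */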
442 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
443 		  struct in6_addr *gwaddr)
444 {
445 	struct net *net = dev_net(dev);
446 	struct route_info *rinfo = (struct route_info *) opt;
447 	struct in6_addr prefix_buf, *prefix;
448 	unsigned int pref;
449 	unsigned long lifetime;
450 	struct rt6_info *rt;
451 
452 	if (len < sizeof(struct route_info)) {
453 		return -EINVAL;
454 	}
455 
456 	/* Sanity check for prefix_len and length */
457 	if (rinfo->length > 3) {
458 		return -EINVAL;
459 	} else if (rinfo->prefix_len > 128) {
460 		return -EINVAL;
461 	} else if (rinfo->prefix_len > 64) {
462 		if (rinfo->length < 2) {
463 			return -EINVAL;
464 		}
465 	} else if (rinfo->prefix_len > 0) {
466 		if (rinfo->length < 1) {
467 			return -EINVAL;
468 		}
469 	}
470 
471 	pref = rinfo->route_pref;
472 	if (pref == ICMPV6_ROUTER_PREF_INVALID)
473 		pref = ICMPV6_ROUTER_PREF_MEDIUM;
474 
475 	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
476 
477 	if (rinfo->length == 3)
478 		prefix = (struct in6_addr *)rinfo->prefix;
479 	else {
480 		/* this function is safe */
481 		ipv6_addr_prefix(&prefix_buf,
482 				 (struct in6_addr *)rinfo->prefix,
483 				 rinfo->prefix_len);
484 		prefix = &prefix_buf;
485 	}
486 
487 	rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
488 				dev->ifindex);
489 
490 	if (rt && !lifetime) {
491 		ip6_del_rt(rt);
492 		rt = NULL;
493 	}
494 
495 	if (!rt && lifetime)
496 		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
497 					pref);
498 	else if (rt)
499 		rt->rt6i_flags = RTF_ROUTEINFO |
500 				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
501 
502 	if (rt) {
503 		if (!addrconf_finite_timeout(lifetime)) {
504 			rt->rt6i_flags &= ~RTF_EXPIRES;
505 		} else {
506 			rt->rt6i_expires = jiffies + HZ * lifetime;
507 			rt->rt6i_flags |= RTF_EXPIRES;
508 		}
509 		dst_release(&rt->u.dst);
510 	}
511 	return 0;
512 }
513 #endif
514 
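/*
 * BACKTRACK() is expanded inside the lookup functions below.  When the
 * match is ip6_null_entry it walks back up the fib6 tree, descending into
 * source-routing subtrees (FIB6_SUBTREE) where present, and jumps to the
 * enclosing function's "restart" label on a node carrying routes, or to
 * "out" once the tree root is reached.
 */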
515 #define BACKTRACK(__net, saddr)			\
516 do { \
517 	if (rt == __net->ipv6.ip6_null_entry) {	\
518 		struct fib6_node *pn; \
519 		while (1) { \
520 			if (fn->fn_flags & RTN_TL_ROOT) \
521 				goto out; \
522 			pn = fn->parent; \
523 			if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
524 				fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
525 			else \
526 				fn = pn; \
527 			if (fn->fn_flags & RTN_RTINFO) \
528 				goto restart; \
529 		} \
530 	} \
531 } while(0)
532 
533 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
534 					     struct fib6_table *table,
535 					     struct flowi *fl, int flags)
536 {
537 	struct fib6_node *fn;
538 	struct rt6_info *rt;
539 
540 	read_lock_bh(&table->tb6_lock);
541 	fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
542 restart:
543 	rt = fn->leaf;
544 	rt = rt6_device_match(net, rt, fl->oif, flags);
545 	BACKTRACK(net, &fl->fl6_src);
546 out:
547 	dst_use(&rt->u.dst, jiffies);
548 	read_unlock_bh(&table->tb6_lock);
549 	return rt;
550 
551 }
552 
553 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
554 			    const struct in6_addr *saddr, int oif, int strict)
555 {
556 	struct flowi fl = {
557 		.oif = oif,
558 		.nl_u = {
559 			.ip6_u = {
560 				.daddr = *daddr,
561 			},
562 		},
563 	};
564 	struct dst_entry *dst;
565 	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
566 
567 	if (saddr) {
568 		memcpy(&fl.fl6_src, saddr, sizeof(*saddr));
569 		flags |= RT6_LOOKUP_F_HAS_SADDR;
570 	}
571 
572 	dst = fib6_rule_lookup(net, &fl, flags, ip6_pol_route_lookup);
573 	if (dst->error == 0)
574 		return (struct rt6_info *) dst;
575 
576 	dst_release(dst);
577 
578 	return NULL;
579 }
580 
581 EXPORT_SYMBOL(rt6_lookup);
582 
583 /* ip6_ins_rt is called with table->tb6_lock NOT held.
584    It takes ownership of the new route entry; if the addition fails
585    for any reason, the route is freed. In any case, if the caller does
586    not hold a reference, the route may be destroyed.
587  */
588 
589 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
590 {
591 	int err;
592 	struct fib6_table *table;
593 
594 	table = rt->rt6i_table;
595 	write_lock_bh(&table->tb6_lock);
596 	err = fib6_add(&table->tb6_root, rt, info);
597 	write_unlock_bh(&table->tb6_lock);
598 
599 	return err;
600 }
601 
602 int ip6_ins_rt(struct rt6_info *rt)
603 {
604 	struct nl_info info = {
605 		.nl_net = dev_net(rt->rt6i_dev),
606 	};
607 	return __ip6_ins_rt(rt, &info);
608 }
609 
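/*
 * rt6_alloc_cow() and rt6_alloc_clone() create per-destination RTF_CACHE
 * copies of a routing table entry, narrowed to a /128 host route for daddr
 * and marked DST_HOST.  The COW variant binds a fresh neighbour entry for
 * the gateway (which, for on-link routes, is the destination itself),
 * while the clone variant reuses the original route's neighbour.
 */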
610 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
611 				      struct in6_addr *saddr)
612 {
613 	struct rt6_info *rt;
614 
615 	/*
616 	 *	Clone the route.
617 	 */
618 
619 	rt = ip6_rt_copy(ort);
620 
621 	if (rt) {
622 		if (!(rt->rt6i_flags&RTF_GATEWAY)) {
623 			if (rt->rt6i_dst.plen != 128 &&
624 			    ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
625 				rt->rt6i_flags |= RTF_ANYCAST;
626 			ipv6_addr_copy(&rt->rt6i_gateway, daddr);
627 		}
628 
629 		ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
630 		rt->rt6i_dst.plen = 128;
631 		rt->rt6i_flags |= RTF_CACHE;
632 		rt->u.dst.flags |= DST_HOST;
633 
634 #ifdef CONFIG_IPV6_SUBTREES
635 		if (rt->rt6i_src.plen && saddr) {
636 			ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
637 			rt->rt6i_src.plen = 128;
638 		}
639 #endif
640 
641 		rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
642 
643 	}
644 
645 	return rt;
646 }
647 
648 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
649 {
650 	struct rt6_info *rt = ip6_rt_copy(ort);
651 	if (rt) {
652 		ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
653 		rt->rt6i_dst.plen = 128;
654 		rt->rt6i_flags |= RTF_CACHE;
655 		rt->u.dst.flags |= DST_HOST;
656 		rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
657 	}
658 	return rt;
659 }
660 
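/*
 * ip6_pol_route - core policy routing lookup used for both input and
 * output.  It selects the best route for the flow (preferring reachable
 * routers unless forwarding is enabled), and for connected routes creates
 * and inserts an RTF_CACHE clone, retrying a few times if another CPU
 * races and inserts the clone first.
 */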
661 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
662 				      struct flowi *fl, int flags)
663 {
664 	struct fib6_node *fn;
665 	struct rt6_info *rt, *nrt;
666 	int strict = 0;
667 	int attempts = 3;
668 	int err;
669 	int reachable = ipv6_devconf.forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
670 
671 	strict |= flags & RT6_LOOKUP_F_IFACE;
672 
673 relookup:
674 	read_lock_bh(&table->tb6_lock);
675 
676 restart_2:
677 	fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
678 
679 restart:
680 	rt = rt6_select(fn, oif, strict | reachable);
681 
682 	BACKTRACK(net, &fl->fl6_src);
683 	if (rt == net->ipv6.ip6_null_entry ||
684 	    rt->rt6i_flags & RTF_CACHE)
685 		goto out;
686 
687 	dst_hold(&rt->u.dst);
688 	read_unlock_bh(&table->tb6_lock);
689 
690 	if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
691 		nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
692 	else {
693 #if CLONE_OFFLINK_ROUTE
694 		nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
695 #else
696 		goto out2;
697 #endif
698 	}
699 
700 	dst_release(&rt->u.dst);
701 	rt = nrt ? : net->ipv6.ip6_null_entry;
702 
703 	dst_hold(&rt->u.dst);
704 	if (nrt) {
705 		err = ip6_ins_rt(nrt);
706 		if (!err)
707 			goto out2;
708 	}
709 
710 	if (--attempts <= 0)
711 		goto out2;
712 
713 	/*
714 	 * Race condition! In the gap, when table->tb6_lock was
715 	 * released someone could insert this route.  Relookup.
716 	 */
717 	dst_release(&rt->u.dst);
718 	goto relookup;
719 
720 out:
721 	if (reachable) {
722 		reachable = 0;
723 		goto restart_2;
724 	}
725 	dst_hold(&rt->u.dst);
726 	read_unlock_bh(&table->tb6_lock);
727 out2:
728 	rt->u.dst.lastuse = jiffies;
729 	rt->u.dst.__use++;
730 
731 	return rt;
732 }
733 
734 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
735 					    struct flowi *fl, int flags)
736 {
737 	return ip6_pol_route(net, table, fl->iif, fl, flags);
738 }
739 
740 void ip6_route_input(struct sk_buff *skb)
741 {
742 	struct ipv6hdr *iph = ipv6_hdr(skb);
743 	struct net *net = dev_net(skb->dev);
744 	int flags = RT6_LOOKUP_F_HAS_SADDR;
745 	struct flowi fl = {
746 		.iif = skb->dev->ifindex,
747 		.nl_u = {
748 			.ip6_u = {
749 				.daddr = iph->daddr,
750 				.saddr = iph->saddr,
751 				.flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
752 			},
753 		},
754 		.mark = skb->mark,
755 		.proto = iph->nexthdr,
756 	};
757 
758 	if (rt6_need_strict(&iph->daddr))
759 		flags |= RT6_LOOKUP_F_IFACE;
760 
761 	skb->dst = fib6_rule_lookup(net, &fl, flags, ip6_pol_route_input);
762 }
763 
764 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
765 					     struct flowi *fl, int flags)
766 {
767 	return ip6_pol_route(net, table, fl->oif, fl, flags);
768 }
769 
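/*
 * ip6_route_output - route lookup for the output path.  Strict interface
 * matching is requested for multicast and link-local destinations; when no
 * source address is given, the socket's srcprefs are translated into
 * RT6_LOOKUP_F_SRCPREF_* hints for later source address selection.
 */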
770 struct dst_entry * ip6_route_output(struct net *net, struct sock *sk,
771 				    struct flowi *fl)
772 {
773 	int flags = 0;
774 
775 	if (rt6_need_strict(&fl->fl6_dst))
776 		flags |= RT6_LOOKUP_F_IFACE;
777 
778 	if (!ipv6_addr_any(&fl->fl6_src))
779 		flags |= RT6_LOOKUP_F_HAS_SADDR;
780 	else if (sk) {
781 		unsigned int prefs = inet6_sk(sk)->srcprefs;
782 		if (prefs & IPV6_PREFER_SRC_TMP)
783 			flags |= RT6_LOOKUP_F_SRCPREF_TMP;
784 		if (prefs & IPV6_PREFER_SRC_PUBLIC)
785 			flags |= RT6_LOOKUP_F_SRCPREF_PUBLIC;
786 		if (prefs & IPV6_PREFER_SRC_COA)
787 			flags |= RT6_LOOKUP_F_SRCPREF_COA;
788 	}
789 
790 	return fib6_rule_lookup(net, fl, flags, ip6_pol_route_output);
791 }
792 
793 EXPORT_SYMBOL(ip6_route_output);
794 
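/*
 * ip6_dst_blackhole - replace *dstp with a copy allocated from
 * ip6_dst_blackhole_ops whose input and output handlers simply discard
 * packets.  Metrics, device and addresses are copied from the original
 * route, which is then released.
 */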
795 int ip6_dst_blackhole(struct sock *sk, struct dst_entry **dstp, struct flowi *fl)
796 {
797 	struct rt6_info *ort = (struct rt6_info *) *dstp;
798 	struct rt6_info *rt = (struct rt6_info *)
799 		dst_alloc(&ip6_dst_blackhole_ops);
800 	struct dst_entry *new = NULL;
801 
802 	if (rt) {
803 		new = &rt->u.dst;
804 
805 		atomic_set(&new->__refcnt, 1);
806 		new->__use = 1;
807 		new->input = dst_discard;
808 		new->output = dst_discard;
809 
810 		memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
811 		new->dev = ort->u.dst.dev;
812 		if (new->dev)
813 			dev_hold(new->dev);
814 		rt->rt6i_idev = ort->rt6i_idev;
815 		if (rt->rt6i_idev)
816 			in6_dev_hold(rt->rt6i_idev);
817 		rt->rt6i_expires = 0;
818 
819 		ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
820 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
821 		rt->rt6i_metric = 0;
822 
823 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
824 #ifdef CONFIG_IPV6_SUBTREES
825 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
826 #endif
827 
828 		dst_free(new);
829 	}
830 
831 	dst_release(*dstp);
832 	*dstp = new;
833 	return (new ? 0 : -ENOMEM);
834 }
835 EXPORT_SYMBOL_GPL(ip6_dst_blackhole);
836 
837 /*
838  *	Destination cache support functions
839  */
840 
841 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
842 {
843 	struct rt6_info *rt;
844 
845 	rt = (struct rt6_info *) dst;
846 
847 	if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
848 		return dst;
849 
850 	return NULL;
851 }
852 
853 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
854 {
855 	struct rt6_info *rt = (struct rt6_info *) dst;
856 
857 	if (rt) {
858 		if (rt->rt6i_flags & RTF_CACHE)
859 			ip6_del_rt(rt);
860 		else
861 			dst_release(dst);
862 	}
863 	return NULL;
864 }
865 
866 static void ip6_link_failure(struct sk_buff *skb)
867 {
868 	struct rt6_info *rt;
869 
870 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
871 
872 	rt = (struct rt6_info *) skb->dst;
873 	if (rt) {
874 		if (rt->rt6i_flags&RTF_CACHE) {
875 			dst_set_expires(&rt->u.dst, 0);
876 			rt->rt6i_flags |= RTF_EXPIRES;
877 		} else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
878 			rt->rt6i_node->fn_sernum = -1;
879 	}
880 }
881 
882 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
883 {
884 	struct rt6_info *rt6 = (struct rt6_info*)dst;
885 
886 	if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
887 		rt6->rt6i_flags |= RTF_MODIFIED;
888 		if (mtu < IPV6_MIN_MTU) {
889 			mtu = IPV6_MIN_MTU;
890 			dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
891 		}
892 		dst->metrics[RTAX_MTU-1] = mtu;
893 		call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
894 	}
895 }
896 
897 static int ipv6_get_mtu(struct net_device *dev);
898 
899 static inline unsigned int ipv6_advmss(struct net *net, unsigned int mtu)
900 {
901 	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
902 
903 	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
904 		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
905 
906 	/*
907 	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
908 	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
909 	 * IPV6_MAXPLEN is also valid and means: "any MSS,
910 	 * rely only on pmtu discovery"
911 	 */
912 	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
913 		mtu = IPV6_MAXPLEN;
914 	return mtu;
915 }
916 
917 static struct dst_entry *icmp6_dst_gc_list;
918 static DEFINE_SPINLOCK(icmp6_dst_lock);
919 
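/*
 * icmp6_dst_alloc - allocate a dst entry for ndisc packets without
 * inserting it into the FIB.  The entry is chained on icmp6_dst_gc_list
 * and reclaimed by icmp6_dst_gc() once its refcount drops to zero.
 */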
920 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
921 				  struct neighbour *neigh,
922 				  const struct in6_addr *addr)
923 {
924 	struct rt6_info *rt;
925 	struct inet6_dev *idev = in6_dev_get(dev);
926 	struct net *net = dev_net(dev);
927 
928 	if (unlikely(idev == NULL))
929 		return NULL;
930 
931 	rt = ip6_dst_alloc(net->ipv6.ip6_dst_ops);
932 	if (unlikely(rt == NULL)) {
933 		in6_dev_put(idev);
934 		goto out;
935 	}
936 
937 	dev_hold(dev);
938 	if (neigh)
939 		neigh_hold(neigh);
940 	else
941 		neigh = ndisc_get_neigh(dev, addr);
942 
943 	rt->rt6i_dev	  = dev;
944 	rt->rt6i_idev     = idev;
945 	rt->rt6i_nexthop  = neigh;
946 	atomic_set(&rt->u.dst.__refcnt, 1);
947 	rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
948 	rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
949 	rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->u.dst));
950 	rt->u.dst.output  = ip6_output;
951 
952 #if 0	/* there's no chance to use these for ndisc */
953 	rt->u.dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
954 				? DST_HOST
955 				: 0;
956 	ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
957 	rt->rt6i_dst.plen = 128;
958 #endif
959 
960 	spin_lock_bh(&icmp6_dst_lock);
961 	rt->u.dst.next = icmp6_dst_gc_list;
962 	icmp6_dst_gc_list = &rt->u.dst;
963 	spin_unlock_bh(&icmp6_dst_lock);
964 
965 	fib6_force_start_gc(net);
966 
967 out:
968 	return &rt->u.dst;
969 }
970 
971 int icmp6_dst_gc(int *more)
972 {
973 	struct dst_entry *dst, *next, **pprev;
974 	int freed;
975 
976 	next = NULL;
977 	freed = 0;
978 
979 	spin_lock_bh(&icmp6_dst_lock);
980 	pprev = &icmp6_dst_gc_list;
981 
982 	while ((dst = *pprev) != NULL) {
983 		if (!atomic_read(&dst->__refcnt)) {
984 			*pprev = dst->next;
985 			dst_free(dst);
986 			freed++;
987 		} else {
988 			pprev = &dst->next;
989 			(*more)++;
990 		}
991 	}
992 
993 	spin_unlock_bh(&icmp6_dst_lock);
994 
995 	return freed;
996 }
997 
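/*
 * ip6_dst_gc - garbage-collect cached routes.  The pass is skipped while
 * the previous run is younger than ip6_rt_gc_min_interval and the entry
 * count stays at or below ip6_rt_max_size; otherwise fib6_run_gc() runs
 * with an expiry value that grows under pressure and decays according to
 * ip6_rt_gc_elasticity.
 */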
998 static int ip6_dst_gc(struct dst_ops *ops)
999 {
1000 	unsigned long now = jiffies;
1001 	struct net *net = ops->dst_net;
1002 	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1003 	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1004 	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1005 	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1006 	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1007 
1008 	if (time_after(rt_last_gc + rt_min_interval, now) &&
1009 	    atomic_read(&ops->entries) <= rt_max_size)
1010 		goto out;
1011 
1012 	net->ipv6.ip6_rt_gc_expire++;
1013 	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1014 	net->ipv6.ip6_rt_last_gc = now;
1015 	if (atomic_read(&ops->entries) < ops->gc_thresh)
1016 		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1017 out:
1018 	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1019 	return (atomic_read(&ops->entries) > rt_max_size);
1020 }
1021 
1022 /* Clean host part of a prefix. Not necessary in radix tree,
1023    but results in cleaner routing tables.
1024 
1025    Remove it only when everything works!
1026  */
1027 
1028 static int ipv6_get_mtu(struct net_device *dev)
1029 {
1030 	int mtu = IPV6_MIN_MTU;
1031 	struct inet6_dev *idev;
1032 
1033 	idev = in6_dev_get(dev);
1034 	if (idev) {
1035 		mtu = idev->cnf.mtu6;
1036 		in6_dev_put(idev);
1037 	}
1038 	return mtu;
1039 }
1040 
1041 int ip6_dst_hoplimit(struct dst_entry *dst)
1042 {
1043 	int hoplimit = dst_metric(dst, RTAX_HOPLIMIT);
1044 	if (hoplimit < 0) {
1045 		struct net_device *dev = dst->dev;
1046 		struct inet6_dev *idev = in6_dev_get(dev);
1047 		if (idev) {
1048 			hoplimit = idev->cnf.hop_limit;
1049 			in6_dev_put(idev);
1050 		} else
1051 			hoplimit = ipv6_devconf.hop_limit;
1052 	}
1053 	return hoplimit;
1054 }
1055 
1056 /*
1057  *	Add a route described by a fib6_config to the routing tables.
1058  */
1059 
1060 int ip6_route_add(struct fib6_config *cfg)
1061 {
1062 	int err;
1063 	struct net *net = cfg->fc_nlinfo.nl_net;
1064 	struct rt6_info *rt = NULL;
1065 	struct net_device *dev = NULL;
1066 	struct inet6_dev *idev = NULL;
1067 	struct fib6_table *table;
1068 	int addr_type;
1069 
1070 	if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1071 		return -EINVAL;
1072 #ifndef CONFIG_IPV6_SUBTREES
1073 	if (cfg->fc_src_len)
1074 		return -EINVAL;
1075 #endif
1076 	if (cfg->fc_ifindex) {
1077 		err = -ENODEV;
1078 		dev = dev_get_by_index(net, cfg->fc_ifindex);
1079 		if (!dev)
1080 			goto out;
1081 		idev = in6_dev_get(dev);
1082 		if (!idev)
1083 			goto out;
1084 	}
1085 
1086 	if (cfg->fc_metric == 0)
1087 		cfg->fc_metric = IP6_RT_PRIO_USER;
1088 
1089 	table = fib6_new_table(net, cfg->fc_table);
1090 	if (table == NULL) {
1091 		err = -ENOBUFS;
1092 		goto out;
1093 	}
1094 
1095 	rt = ip6_dst_alloc(net->ipv6.ip6_dst_ops);
1096 
1097 	if (rt == NULL) {
1098 		err = -ENOMEM;
1099 		goto out;
1100 	}
1101 
1102 	rt->u.dst.obsolete = -1;
1103 	rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
1104 				jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1105 				0;
1106 
1107 	if (cfg->fc_protocol == RTPROT_UNSPEC)
1108 		cfg->fc_protocol = RTPROT_BOOT;
1109 	rt->rt6i_protocol = cfg->fc_protocol;
1110 
1111 	addr_type = ipv6_addr_type(&cfg->fc_dst);
1112 
1113 	if (addr_type & IPV6_ADDR_MULTICAST)
1114 		rt->u.dst.input = ip6_mc_input;
1115 	else
1116 		rt->u.dst.input = ip6_forward;
1117 
1118 	rt->u.dst.output = ip6_output;
1119 
1120 	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1121 	rt->rt6i_dst.plen = cfg->fc_dst_len;
1122 	if (rt->rt6i_dst.plen == 128)
1123 	       rt->u.dst.flags = DST_HOST;
1124 
1125 #ifdef CONFIG_IPV6_SUBTREES
1126 	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1127 	rt->rt6i_src.plen = cfg->fc_src_len;
1128 #endif
1129 
1130 	rt->rt6i_metric = cfg->fc_metric;
1131 
1132 	/* We cannot add true routes via loopback here,
1133 	   they would result in kernel looping; promote them to reject routes
1134 	 */
1135 	if ((cfg->fc_flags & RTF_REJECT) ||
1136 	    (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
1137 		/* hold loopback dev/idev if we haven't done so. */
1138 		if (dev != net->loopback_dev) {
1139 			if (dev) {
1140 				dev_put(dev);
1141 				in6_dev_put(idev);
1142 			}
1143 			dev = net->loopback_dev;
1144 			dev_hold(dev);
1145 			idev = in6_dev_get(dev);
1146 			if (!idev) {
1147 				err = -ENODEV;
1148 				goto out;
1149 			}
1150 		}
1151 		rt->u.dst.output = ip6_pkt_discard_out;
1152 		rt->u.dst.input = ip6_pkt_discard;
1153 		rt->u.dst.error = -ENETUNREACH;
1154 		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1155 		goto install_route;
1156 	}
1157 
1158 	if (cfg->fc_flags & RTF_GATEWAY) {
1159 		struct in6_addr *gw_addr;
1160 		int gwa_type;
1161 
1162 		gw_addr = &cfg->fc_gateway;
1163 		ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1164 		gwa_type = ipv6_addr_type(gw_addr);
1165 
1166 		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1167 			struct rt6_info *grt;
1168 
1169 			/* IPv6 strictly forbids using non-link-local
1170 			   addresses as a nexthop address.
1171 			   Otherwise, the router will not be able to send redirects.
1172 			   That is very good, but in some (rare!) circumstances
1173 			   (SIT, PtP, NBMA NOARP links) it is handy to allow
1174 			   some exceptions. --ANK
1175 			 */
1176 			err = -EINVAL;
1177 			if (!(gwa_type&IPV6_ADDR_UNICAST))
1178 				goto out;
1179 
1180 			grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1181 
1182 			err = -EHOSTUNREACH;
1183 			if (grt == NULL)
1184 				goto out;
1185 			if (dev) {
1186 				if (dev != grt->rt6i_dev) {
1187 					dst_release(&grt->u.dst);
1188 					goto out;
1189 				}
1190 			} else {
1191 				dev = grt->rt6i_dev;
1192 				idev = grt->rt6i_idev;
1193 				dev_hold(dev);
1194 				in6_dev_hold(grt->rt6i_idev);
1195 			}
1196 			if (!(grt->rt6i_flags&RTF_GATEWAY))
1197 				err = 0;
1198 			dst_release(&grt->u.dst);
1199 
1200 			if (err)
1201 				goto out;
1202 		}
1203 		err = -EINVAL;
1204 		if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1205 			goto out;
1206 	}
1207 
1208 	err = -ENODEV;
1209 	if (dev == NULL)
1210 		goto out;
1211 
1212 	if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1213 		rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1214 		if (IS_ERR(rt->rt6i_nexthop)) {
1215 			err = PTR_ERR(rt->rt6i_nexthop);
1216 			rt->rt6i_nexthop = NULL;
1217 			goto out;
1218 		}
1219 	}
1220 
1221 	rt->rt6i_flags = cfg->fc_flags;
1222 
1223 install_route:
1224 	if (cfg->fc_mx) {
1225 		struct nlattr *nla;
1226 		int remaining;
1227 
1228 		nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1229 			int type = nla_type(nla);
1230 
1231 			if (type) {
1232 				if (type > RTAX_MAX) {
1233 					err = -EINVAL;
1234 					goto out;
1235 				}
1236 
1237 				rt->u.dst.metrics[type - 1] = nla_get_u32(nla);
1238 			}
1239 		}
1240 	}
1241 
1242 	if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
1243 		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1244 	if (!dst_metric(&rt->u.dst, RTAX_MTU))
1245 		rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1246 	if (!dst_metric(&rt->u.dst, RTAX_ADVMSS))
1247 		rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->u.dst));
1248 	rt->u.dst.dev = dev;
1249 	rt->rt6i_idev = idev;
1250 	rt->rt6i_table = table;
1251 
1252 	cfg->fc_nlinfo.nl_net = dev_net(dev);
1253 
1254 	return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1255 
1256 out:
1257 	if (dev)
1258 		dev_put(dev);
1259 	if (idev)
1260 		in6_dev_put(idev);
1261 	if (rt)
1262 		dst_free(&rt->u.dst);
1263 	return err;
1264 }
1265 
1266 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1267 {
1268 	int err;
1269 	struct fib6_table *table;
1270 	struct net *net = dev_net(rt->rt6i_dev);
1271 
1272 	if (rt == net->ipv6.ip6_null_entry)
1273 		return -ENOENT;
1274 
1275 	table = rt->rt6i_table;
1276 	write_lock_bh(&table->tb6_lock);
1277 
1278 	err = fib6_del(rt, info);
1279 	dst_release(&rt->u.dst);
1280 
1281 	write_unlock_bh(&table->tb6_lock);
1282 
1283 	return err;
1284 }
1285 
1286 int ip6_del_rt(struct rt6_info *rt)
1287 {
1288 	struct nl_info info = {
1289 		.nl_net = dev_net(rt->rt6i_dev),
1290 	};
1291 	return __ip6_del_rt(rt, &info);
1292 }
1293 
1294 static int ip6_route_del(struct fib6_config *cfg)
1295 {
1296 	struct fib6_table *table;
1297 	struct fib6_node *fn;
1298 	struct rt6_info *rt;
1299 	int err = -ESRCH;
1300 
1301 	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1302 	if (table == NULL)
1303 		return err;
1304 
1305 	read_lock_bh(&table->tb6_lock);
1306 
1307 	fn = fib6_locate(&table->tb6_root,
1308 			 &cfg->fc_dst, cfg->fc_dst_len,
1309 			 &cfg->fc_src, cfg->fc_src_len);
1310 
1311 	if (fn) {
1312 		for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1313 			if (cfg->fc_ifindex &&
1314 			    (rt->rt6i_dev == NULL ||
1315 			     rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1316 				continue;
1317 			if (cfg->fc_flags & RTF_GATEWAY &&
1318 			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1319 				continue;
1320 			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1321 				continue;
1322 			dst_hold(&rt->u.dst);
1323 			read_unlock_bh(&table->tb6_lock);
1324 
1325 			return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1326 		}
1327 	}
1328 	read_unlock_bh(&table->tb6_lock);
1329 
1330 	return err;
1331 }
1332 
1333 /*
1334  *	Handle redirects
1335  */
1336 struct ip6rd_flowi {
1337 	struct flowi fl;
1338 	struct in6_addr gateway;
1339 };
1340 
1341 static struct rt6_info *__ip6_route_redirect(struct net *net,
1342 					     struct fib6_table *table,
1343 					     struct flowi *fl,
1344 					     int flags)
1345 {
1346 	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl;
1347 	struct rt6_info *rt;
1348 	struct fib6_node *fn;
1349 
1350 	/*
1351 	 * Get the "current" route for this destination and
1352 	 * check if the redirect came from an appropriate router.
1353 	 *
1354 	 * RFC 2461 specifies that redirects should only be
1355 	 * accepted if they come from the nexthop to the target.
1356 	 * Due to the way the routes are chosen, this notion
1357 	 * is a bit fuzzy and one might need to check all possible
1358 	 * routes.
1359 	 */
1360 
1361 	read_lock_bh(&table->tb6_lock);
1362 	fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
1363 restart:
1364 	for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1365 		/*
1366 		 * Current route is on-link; redirect is always invalid.
1367 		 *
1368 		 * It seems the previous statement is not true: the node
1369 		 * may regard us as on-link (e.g. proxy ndisc), but then
1370 		 * the router serving it might decide that we should know
1371 		 * the truth 8)8) --ANK (980726).
1372 		 */
1373 		if (rt6_check_expired(rt))
1374 			continue;
1375 		if (!(rt->rt6i_flags & RTF_GATEWAY))
1376 			continue;
1377 		if (fl->oif != rt->rt6i_dev->ifindex)
1378 			continue;
1379 		if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1380 			continue;
1381 		break;
1382 	}
1383 
1384 	if (!rt)
1385 		rt = net->ipv6.ip6_null_entry;
1386 	BACKTRACK(net, &fl->fl6_src);
1387 out:
1388 	dst_hold(&rt->u.dst);
1389 
1390 	read_unlock_bh(&table->tb6_lock);
1391 
1392 	return rt;
1393 };
1394 
1395 static struct rt6_info *ip6_route_redirect(struct in6_addr *dest,
1396 					   struct in6_addr *src,
1397 					   struct in6_addr *gateway,
1398 					   struct net_device *dev)
1399 {
1400 	int flags = RT6_LOOKUP_F_HAS_SADDR;
1401 	struct net *net = dev_net(dev);
1402 	struct ip6rd_flowi rdfl = {
1403 		.fl = {
1404 			.oif = dev->ifindex,
1405 			.nl_u = {
1406 				.ip6_u = {
1407 					.daddr = *dest,
1408 					.saddr = *src,
1409 				},
1410 			},
1411 		},
1412 		.gateway = *gateway,
1413 	};
1414 
1415 	if (rt6_need_strict(dest))
1416 		flags |= RT6_LOOKUP_F_IFACE;
1417 
1418 	return (struct rt6_info *)fib6_rule_lookup(net, (struct flowi *)&rdfl,
1419 						   flags, __ip6_route_redirect);
1420 }
1421 
1422 void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
1423 		  struct in6_addr *saddr,
1424 		  struct neighbour *neigh, u8 *lladdr, int on_link)
1425 {
1426 	struct rt6_info *rt, *nrt = NULL;
1427 	struct netevent_redirect netevent;
1428 	struct net *net = dev_net(neigh->dev);
1429 
1430 	rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1431 
1432 	if (rt == net->ipv6.ip6_null_entry) {
1433 		if (net_ratelimit())
1434 			printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1435 			       "for redirect target\n");
1436 		goto out;
1437 	}
1438 
1439 	/*
1440 	 *	We have finally decided to accept it.
1441 	 */
1442 
1443 	neigh_update(neigh, lladdr, NUD_STALE,
1444 		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
1445 		     NEIGH_UPDATE_F_OVERRIDE|
1446 		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1447 				     NEIGH_UPDATE_F_ISROUTER))
1448 		     );
1449 
1450 	/*
1451 	 * Redirect received -> path was valid.
1452 	 * Look, redirects are sent only in response to data packets,
1453 	 * so this nexthop is apparently reachable. --ANK
1454 	 */
1455 	dst_confirm(&rt->u.dst);
1456 
1457 	/* Duplicate redirect: silently ignore. */
1458 	if (neigh == rt->u.dst.neighbour)
1459 		goto out;
1460 
1461 	nrt = ip6_rt_copy(rt);
1462 	if (nrt == NULL)
1463 		goto out;
1464 
1465 	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1466 	if (on_link)
1467 		nrt->rt6i_flags &= ~RTF_GATEWAY;
1468 
1469 	ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1470 	nrt->rt6i_dst.plen = 128;
1471 	nrt->u.dst.flags |= DST_HOST;
1472 
1473 	ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1474 	nrt->rt6i_nexthop = neigh_clone(neigh);
1475 	/* Reset pmtu, it may be better */
1476 	nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1477 	nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dev_net(neigh->dev),
1478 							dst_mtu(&nrt->u.dst));
1479 
1480 	if (ip6_ins_rt(nrt))
1481 		goto out;
1482 
1483 	netevent.old = &rt->u.dst;
1484 	netevent.new = &nrt->u.dst;
1485 	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1486 
1487 	if (rt->rt6i_flags&RTF_CACHE) {
1488 		ip6_del_rt(rt);
1489 		return;
1490 	}
1491 
1492 out:
1493 	dst_release(&rt->u.dst);
1494 	return;
1495 }
1496 
1497 /*
1498  *	Handle ICMP "packet too big" messages
1499  *	i.e. Path MTU discovery
1500  */
1501 
1502 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1503 			struct net_device *dev, u32 pmtu)
1504 {
1505 	struct rt6_info *rt, *nrt;
1506 	struct net *net = dev_net(dev);
1507 	int allfrag = 0;
1508 
1509 	rt = rt6_lookup(net, daddr, saddr, dev->ifindex, 0);
1510 	if (rt == NULL)
1511 		return;
1512 
1513 	if (pmtu >= dst_mtu(&rt->u.dst))
1514 		goto out;
1515 
1516 	if (pmtu < IPV6_MIN_MTU) {
1517 		/*
1518 		 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1519 		 * According to RFC 2460, the PMTU is set to the IPv6 Minimum
1520 		 * Link MTU (1280), and a fragment header should always be
1521 		 * included once a node has received a Packet Too Big message
1522 		 * reporting a PMTU below the IPv6 Minimum Link MTU.
1523 		pmtu = IPV6_MIN_MTU;
1524 		allfrag = 1;
1525 	}
1526 
1527 	/* New mtu received -> path was valid.
1528 	   Packet Too Big messages are sent only in response to data packets,
1529 	   so this nexthop is apparently reachable. --ANK
1530 	 */
1531 	dst_confirm(&rt->u.dst);
1532 
1533 	/* Host route. If it is static, it would be better
1534 	   not to override it but to add a new one, so that
1535 	   when the cache entry expires the old pmtu
1536 	   is restored automatically.
1537 	 */
1538 	if (rt->rt6i_flags & RTF_CACHE) {
1539 		rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1540 		if (allfrag)
1541 			rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1542 		dst_set_expires(&rt->u.dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1543 		rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1544 		goto out;
1545 	}
1546 
1547 	/* Network route.
1548 	   Two cases are possible:
1549 	   1. It is a connected route. Action: COW it.
1550 	   2. It is a gatewayed or NONEXTHOP route. Action: clone it.
1551 	 */
1552 	if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1553 		nrt = rt6_alloc_cow(rt, daddr, saddr);
1554 	else
1555 		nrt = rt6_alloc_clone(rt, daddr);
1556 
1557 	if (nrt) {
1558 		nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1559 		if (allfrag)
1560 			nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1561 
1562 		/* According to RFC 1981, a PMTU increase should not be
1563 		 * detected within 5 minutes; the recommended timer is 10 minutes.
1564 		 * Here the route expiration time is set to ip6_rt_mtu_expires,
1565 		 * which defaults to 10 minutes. Once it expires, the decreased
1566 		 * pmtu is discarded and PMTU increases are detected automatically.
1567 		 */
1568 		dst_set_expires(&nrt->u.dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1569 		nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1570 
1571 		ip6_ins_rt(nrt);
1572 	}
1573 out:
1574 	dst_release(&rt->u.dst);
1575 }
1576 
1577 /*
1578  *	Misc support functions
1579  */
1580 
1581 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1582 {
1583 	struct net *net = dev_net(ort->rt6i_dev);
1584 	struct rt6_info *rt = ip6_dst_alloc(net->ipv6.ip6_dst_ops);
1585 
1586 	if (rt) {
1587 		rt->u.dst.input = ort->u.dst.input;
1588 		rt->u.dst.output = ort->u.dst.output;
1589 
1590 		memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1591 		rt->u.dst.error = ort->u.dst.error;
1592 		rt->u.dst.dev = ort->u.dst.dev;
1593 		if (rt->u.dst.dev)
1594 			dev_hold(rt->u.dst.dev);
1595 		rt->rt6i_idev = ort->rt6i_idev;
1596 		if (rt->rt6i_idev)
1597 			in6_dev_hold(rt->rt6i_idev);
1598 		rt->u.dst.lastuse = jiffies;
1599 		rt->rt6i_expires = 0;
1600 
1601 		ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1602 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1603 		rt->rt6i_metric = 0;
1604 
1605 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1606 #ifdef CONFIG_IPV6_SUBTREES
1607 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1608 #endif
1609 		rt->rt6i_table = ort->rt6i_table;
1610 	}
1611 	return rt;
1612 }
1613 
1614 #ifdef CONFIG_IPV6_ROUTE_INFO
1615 static struct rt6_info *rt6_get_route_info(struct net *net,
1616 					   struct in6_addr *prefix, int prefixlen,
1617 					   struct in6_addr *gwaddr, int ifindex)
1618 {
1619 	struct fib6_node *fn;
1620 	struct rt6_info *rt = NULL;
1621 	struct fib6_table *table;
1622 
1623 	table = fib6_get_table(net, RT6_TABLE_INFO);
1624 	if (table == NULL)
1625 		return NULL;
1626 
1627 	write_lock_bh(&table->tb6_lock);
1628 	fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1629 	if (!fn)
1630 		goto out;
1631 
1632 	for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1633 		if (rt->rt6i_dev->ifindex != ifindex)
1634 			continue;
1635 		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1636 			continue;
1637 		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1638 			continue;
1639 		dst_hold(&rt->u.dst);
1640 		break;
1641 	}
1642 out:
1643 	write_unlock_bh(&table->tb6_lock);
1644 	return rt;
1645 }
1646 
1647 static struct rt6_info *rt6_add_route_info(struct net *net,
1648 					   struct in6_addr *prefix, int prefixlen,
1649 					   struct in6_addr *gwaddr, int ifindex,
1650 					   unsigned pref)
1651 {
1652 	struct fib6_config cfg = {
1653 		.fc_table	= RT6_TABLE_INFO,
1654 		.fc_metric	= IP6_RT_PRIO_USER,
1655 		.fc_ifindex	= ifindex,
1656 		.fc_dst_len	= prefixlen,
1657 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1658 				  RTF_UP | RTF_PREF(pref),
1659 		.fc_nlinfo.pid = 0,
1660 		.fc_nlinfo.nlh = NULL,
1661 		.fc_nlinfo.nl_net = net,
1662 	};
1663 
1664 	ipv6_addr_copy(&cfg.fc_dst, prefix);
1665 	ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1666 
1667 	/* We should treat it as a default route if prefix length is 0. */
1668 	if (!prefixlen)
1669 		cfg.fc_flags |= RTF_DEFAULT;
1670 
1671 	ip6_route_add(&cfg);
1672 
1673 	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1674 }
1675 #endif
1676 
1677 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1678 {
1679 	struct rt6_info *rt;
1680 	struct fib6_table *table;
1681 
1682 	table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1683 	if (table == NULL)
1684 		return NULL;
1685 
1686 	write_lock_bh(&table->tb6_lock);
1687 	for (rt = table->tb6_root.leaf; rt; rt=rt->u.dst.rt6_next) {
1688 		if (dev == rt->rt6i_dev &&
1689 		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1690 		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
1691 			break;
1692 	}
1693 	if (rt)
1694 		dst_hold(&rt->u.dst);
1695 	write_unlock_bh(&table->tb6_lock);
1696 	return rt;
1697 }
1698 
1699 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1700 				     struct net_device *dev,
1701 				     unsigned int pref)
1702 {
1703 	struct fib6_config cfg = {
1704 		.fc_table	= RT6_TABLE_DFLT,
1705 		.fc_metric	= IP6_RT_PRIO_USER,
1706 		.fc_ifindex	= dev->ifindex,
1707 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1708 				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1709 		.fc_nlinfo.pid = 0,
1710 		.fc_nlinfo.nlh = NULL,
1711 		.fc_nlinfo.nl_net = dev_net(dev),
1712 	};
1713 
1714 	ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1715 
1716 	ip6_route_add(&cfg);
1717 
1718 	return rt6_get_dflt_router(gwaddr, dev);
1719 }
1720 
1721 void rt6_purge_dflt_routers(struct net *net)
1722 {
1723 	struct rt6_info *rt;
1724 	struct fib6_table *table;
1725 
1726 	/* NOTE: Keep consistent with rt6_get_dflt_router */
1727 	table = fib6_get_table(net, RT6_TABLE_DFLT);
1728 	if (table == NULL)
1729 		return;
1730 
1731 restart:
1732 	read_lock_bh(&table->tb6_lock);
1733 	for (rt = table->tb6_root.leaf; rt; rt = rt->u.dst.rt6_next) {
1734 		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1735 			dst_hold(&rt->u.dst);
1736 			read_unlock_bh(&table->tb6_lock);
1737 			ip6_del_rt(rt);
1738 			goto restart;
1739 		}
1740 	}
1741 	read_unlock_bh(&table->tb6_lock);
1742 }
1743 
1744 static void rtmsg_to_fib6_config(struct net *net,
1745 				 struct in6_rtmsg *rtmsg,
1746 				 struct fib6_config *cfg)
1747 {
1748 	memset(cfg, 0, sizeof(*cfg));
1749 
1750 	cfg->fc_table = RT6_TABLE_MAIN;
1751 	cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1752 	cfg->fc_metric = rtmsg->rtmsg_metric;
1753 	cfg->fc_expires = rtmsg->rtmsg_info;
1754 	cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1755 	cfg->fc_src_len = rtmsg->rtmsg_src_len;
1756 	cfg->fc_flags = rtmsg->rtmsg_flags;
1757 
1758 	cfg->fc_nlinfo.nl_net = net;
1759 
1760 	ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1761 	ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1762 	ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1763 }
1764 
1765 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1766 {
1767 	struct fib6_config cfg;
1768 	struct in6_rtmsg rtmsg;
1769 	int err;
1770 
1771 	switch(cmd) {
1772 	case SIOCADDRT:		/* Add a route */
1773 	case SIOCDELRT:		/* Delete a route */
1774 		if (!capable(CAP_NET_ADMIN))
1775 			return -EPERM;
1776 		err = copy_from_user(&rtmsg, arg,
1777 				     sizeof(struct in6_rtmsg));
1778 		if (err)
1779 			return -EFAULT;
1780 
1781 		rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1782 
1783 		rtnl_lock();
1784 		switch (cmd) {
1785 		case SIOCADDRT:
1786 			err = ip6_route_add(&cfg);
1787 			break;
1788 		case SIOCDELRT:
1789 			err = ip6_route_del(&cfg);
1790 			break;
1791 		default:
1792 			err = -EINVAL;
1793 		}
1794 		rtnl_unlock();
1795 
1796 		return err;
1797 	}
1798 
1799 	return -EINVAL;
1800 }
1801 
1802 /*
1803  *	Drop the packet on the floor
1804  */
1805 
1806 static int ip6_pkt_drop(struct sk_buff *skb, int code, int ipstats_mib_noroutes)
1807 {
1808 	int type;
1809 	switch (ipstats_mib_noroutes) {
1810 	case IPSTATS_MIB_INNOROUTES:
1811 		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1812 		if (type == IPV6_ADDR_ANY || type == IPV6_ADDR_RESERVED) {
1813 			IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_INADDRERRORS);
1814 			break;
1815 		}
1816 		/* FALLTHROUGH */
1817 	case IPSTATS_MIB_OUTNOROUTES:
1818 		IP6_INC_STATS(ip6_dst_idev(skb->dst), ipstats_mib_noroutes);
1819 		break;
1820 	}
1821 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0, skb->dev);
1822 	kfree_skb(skb);
1823 	return 0;
1824 }
1825 
1826 static int ip6_pkt_discard(struct sk_buff *skb)
1827 {
1828 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
1829 }
1830 
1831 static int ip6_pkt_discard_out(struct sk_buff *skb)
1832 {
1833 	skb->dev = skb->dst->dev;
1834 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
1835 }
1836 
1837 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1838 
1839 static int ip6_pkt_prohibit(struct sk_buff *skb)
1840 {
1841 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
1842 }
1843 
1844 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
1845 {
1846 	skb->dev = skb->dst->dev;
1847 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
1848 }
1849 
1850 #endif
1851 
1852 /*
1853  *	Allocate a dst for local (unicast / anycast) address.
1854  */
1855 
1856 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1857 				    const struct in6_addr *addr,
1858 				    int anycast)
1859 {
1860 	struct net *net = dev_net(idev->dev);
1861 	struct rt6_info *rt = ip6_dst_alloc(net->ipv6.ip6_dst_ops);
1862 
1863 	if (rt == NULL)
1864 		return ERR_PTR(-ENOMEM);
1865 
1866 	dev_hold(net->loopback_dev);
1867 	in6_dev_hold(idev);
1868 
1869 	rt->u.dst.flags = DST_HOST;
1870 	rt->u.dst.input = ip6_input;
1871 	rt->u.dst.output = ip6_output;
1872 	rt->rt6i_dev = net->loopback_dev;
1873 	rt->rt6i_idev = idev;
1874 	rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1875 	rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->u.dst));
1876 	rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1877 	rt->u.dst.obsolete = -1;
1878 
1879 	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1880 	if (anycast)
1881 		rt->rt6i_flags |= RTF_ANYCAST;
1882 	else
1883 		rt->rt6i_flags |= RTF_LOCAL;
1884 	rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1885 	if (rt->rt6i_nexthop == NULL) {
1886 		dst_free(&rt->u.dst);
1887 		return ERR_PTR(-ENOMEM);
1888 	}
1889 
1890 	ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1891 	rt->rt6i_dst.plen = 128;
1892 	rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
1893 
1894 	atomic_set(&rt->u.dst.__refcnt, 1);
1895 
1896 	return rt;
1897 }
1898 
1899 struct arg_dev_net {
1900 	struct net_device *dev;
1901 	struct net *net;
1902 };
1903 
1904 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1905 {
1906 	struct net_device *dev = ((struct arg_dev_net *)arg)->dev;
1907 	struct net *net = ((struct arg_dev_net *)arg)->net;
1908 
1909 	if (((void *)rt->rt6i_dev == dev || dev == NULL) &&
1910 	    rt != net->ipv6.ip6_null_entry) {
1911 		RT6_TRACE("deleted by ifdown %p\n", rt);
1912 		return -1;
1913 	}
1914 	return 0;
1915 }
1916 
1917 void rt6_ifdown(struct net *net, struct net_device *dev)
1918 {
1919 	struct arg_dev_net adn = {
1920 		.dev = dev,
1921 		.net = net,
1922 	};
1923 
1924 	fib6_clean_all(net, fib6_ifdown, 0, &adn);
1925 }
1926 
1927 struct rt6_mtu_change_arg
1928 {
1929 	struct net_device *dev;
1930 	unsigned mtu;
1931 };
1932 
1933 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
1934 {
1935 	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
1936 	struct inet6_dev *idev;
1937 	struct net *net = dev_net(arg->dev);
1938 
1939 	/* In IPv6, pmtu discovery is not optional,
1940 	   so the RTAX_MTU lock cannot disable it.
1941 	   We still use this lock to block changes
1942 	   caused by addrconf/ndisc.
1943 	*/
1944 
1945 	idev = __in6_dev_get(arg->dev);
1946 	if (idev == NULL)
1947 		return 0;
1948 
1949 	/* For an administrative MTU increase, there is no way to discover
1950 	   an IPv6 PMTU increase, so the PMTU increase must be applied here.
1951 	   Since RFC 1981 doesn't cover administrative MTU increases,
1952 	   updating the PMTU on such an increase is a MUST (e.g. jumbo frames).
1953 	 */
1954 	/*
1955 	   If the new MTU is less than the route PMTU, the new MTU will be
1956 	   the lowest MTU in the path; update the route PMTU to reflect the
1957 	   decrease. If the new MTU is greater than the route PMTU, and the
1958 	   old MTU was the lowest MTU in the path, update the route PMTU to
1959 	   reflect the increase. In that case, if the other nodes also have
1960 	   the lowest MTU, a Packet Too Big message will trigger
1961 	   PMTU discovery.
1962 	 */
1963 	if (rt->rt6i_dev == arg->dev &&
1964 	    !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1965 	    (dst_mtu(&rt->u.dst) >= arg->mtu ||
1966 	     (dst_mtu(&rt->u.dst) < arg->mtu &&
1967 	      dst_mtu(&rt->u.dst) == idev->cnf.mtu6))) {
1968 		rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
1969 		rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, arg->mtu);
1970 	}
1971 	return 0;
1972 }
1973 
1974 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
1975 {
1976 	struct rt6_mtu_change_arg arg = {
1977 		.dev = dev,
1978 		.mtu = mtu,
1979 	};
1980 
1981 	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
1982 }
1983 
1984 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
1985 	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
1986 	[RTA_OIF]               = { .type = NLA_U32 },
1987 	[RTA_IIF]		= { .type = NLA_U32 },
1988 	[RTA_PRIORITY]          = { .type = NLA_U32 },
1989 	[RTA_METRICS]           = { .type = NLA_NESTED },
1990 };
1991 
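/*
 * rtm_to_fib6_config - translate an RTM_NEWROUTE/RTM_DELROUTE netlink
 * request (struct rtmsg plus RTA_* attributes) into the fib6_config used
 * by ip6_route_add()/ip6_route_del().
 */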
1992 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
1993 			      struct fib6_config *cfg)
1994 {
1995 	struct rtmsg *rtm;
1996 	struct nlattr *tb[RTA_MAX+1];
1997 	int err;
1998 
1999 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2000 	if (err < 0)
2001 		goto errout;
2002 
2003 	err = -EINVAL;
2004 	rtm = nlmsg_data(nlh);
2005 	memset(cfg, 0, sizeof(*cfg));
2006 
2007 	cfg->fc_table = rtm->rtm_table;
2008 	cfg->fc_dst_len = rtm->rtm_dst_len;
2009 	cfg->fc_src_len = rtm->rtm_src_len;
2010 	cfg->fc_flags = RTF_UP;
2011 	cfg->fc_protocol = rtm->rtm_protocol;
2012 
2013 	if (rtm->rtm_type == RTN_UNREACHABLE)
2014 		cfg->fc_flags |= RTF_REJECT;
2015 
2016 	cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2017 	cfg->fc_nlinfo.nlh = nlh;
2018 	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2019 
2020 	if (tb[RTA_GATEWAY]) {
2021 		nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2022 		cfg->fc_flags |= RTF_GATEWAY;
2023 	}
2024 
2025 	if (tb[RTA_DST]) {
2026 		int plen = (rtm->rtm_dst_len + 7) >> 3;
2027 
2028 		if (nla_len(tb[RTA_DST]) < plen)
2029 			goto errout;
2030 
2031 		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2032 	}
2033 
2034 	if (tb[RTA_SRC]) {
2035 		int plen = (rtm->rtm_src_len + 7) >> 3;
2036 
2037 		if (nla_len(tb[RTA_SRC]) < plen)
2038 			goto errout;
2039 
2040 		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2041 	}
2042 
2043 	if (tb[RTA_OIF])
2044 		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2045 
2046 	if (tb[RTA_PRIORITY])
2047 		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2048 
2049 	if (tb[RTA_METRICS]) {
2050 		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2051 		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2052 	}
2053 
2054 	if (tb[RTA_TABLE])
2055 		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2056 
2057 	err = 0;
2058 errout:
2059 	return err;
2060 }
2061 
2062 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2063 {
2064 	struct fib6_config cfg;
2065 	int err;
2066 
2067 	err = rtm_to_fib6_config(skb, nlh, &cfg);
2068 	if (err < 0)
2069 		return err;
2070 
2071 	return ip6_route_del(&cfg);
2072 }
2073 
2074 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2075 {
2076 	struct fib6_config cfg;
2077 	int err;
2078 
2079 	err = rtm_to_fib6_config(skb, nlh, &cfg);
2080 	if (err < 0)
2081 		return err;
2082 
2083 	return ip6_route_add(&cfg);
2084 }
2085 
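/* Worst-case netlink message size for a single route, used to size the
 * skb allocated in inet6_rt_notify().
 */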
2086 static inline size_t rt6_nlmsg_size(void)
2087 {
2088 	return NLMSG_ALIGN(sizeof(struct rtmsg))
2089 	       + nla_total_size(16) /* RTA_SRC */
2090 	       + nla_total_size(16) /* RTA_DST */
2091 	       + nla_total_size(16) /* RTA_GATEWAY */
2092 	       + nla_total_size(16) /* RTA_PREFSRC */
2093 	       + nla_total_size(4) /* RTA_TABLE */
2094 	       + nla_total_size(4) /* RTA_IIF */
2095 	       + nla_total_size(4) /* RTA_OIF */
2096 	       + nla_total_size(4) /* RTA_PRIORITY */
2097 	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2098 	       + nla_total_size(sizeof(struct rta_cacheinfo));
2099 }
2100 
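/* Serialize one rt6_info into an rtnetlink route message.  Returns 1 if
 * @prefix filtering skipped the route, -EMSGSIZE if the skb ran out of
 * room, and the result of nlmsg_end() on success.
 */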
2101 static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
2102 			 struct in6_addr *dst, struct in6_addr *src,
2103 			 int iif, int type, u32 pid, u32 seq,
2104 			 int prefix, int nowait, unsigned int flags)
2105 {
2106 	struct rtmsg *rtm;
2107 	struct nlmsghdr *nlh;
2108 	long expires;
2109 	u32 table;
2110 
2111 	if (prefix) {	/* user wants prefix routes only */
2112 		if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2113 			/* success since this is not a prefix route */
2114 			return 1;
2115 		}
2116 	}
2117 
2118 	nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2119 	if (nlh == NULL)
2120 		return -EMSGSIZE;
2121 
2122 	rtm = nlmsg_data(nlh);
2123 	rtm->rtm_family = AF_INET6;
2124 	rtm->rtm_dst_len = rt->rt6i_dst.plen;
2125 	rtm->rtm_src_len = rt->rt6i_src.plen;
2126 	rtm->rtm_tos = 0;
2127 	if (rt->rt6i_table)
2128 		table = rt->rt6i_table->tb6_id;
2129 	else
2130 		table = RT6_TABLE_UNSPEC;
2131 	rtm->rtm_table = table;
2132 	NLA_PUT_U32(skb, RTA_TABLE, table);
2133 	if (rt->rt6i_flags&RTF_REJECT)
2134 		rtm->rtm_type = RTN_UNREACHABLE;
2135 	else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2136 		rtm->rtm_type = RTN_LOCAL;
2137 	else
2138 		rtm->rtm_type = RTN_UNICAST;
2139 	rtm->rtm_flags = 0;
2140 	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2141 	rtm->rtm_protocol = rt->rt6i_protocol;
2142 	if (rt->rt6i_flags&RTF_DYNAMIC)
2143 		rtm->rtm_protocol = RTPROT_REDIRECT;
2144 	else if (rt->rt6i_flags & RTF_ADDRCONF)
2145 		rtm->rtm_protocol = RTPROT_KERNEL;
2146 	else if (rt->rt6i_flags&RTF_DEFAULT)
2147 		rtm->rtm_protocol = RTPROT_RA;
2148 
2149 	if (rt->rt6i_flags&RTF_CACHE)
2150 		rtm->rtm_flags |= RTM_F_CLONED;
2151 
2152 	if (dst) {
2153 		NLA_PUT(skb, RTA_DST, 16, dst);
2154 		rtm->rtm_dst_len = 128;
2155 	} else if (rtm->rtm_dst_len)
2156 		NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2157 #ifdef CONFIG_IPV6_SUBTREES
2158 	if (src) {
2159 		NLA_PUT(skb, RTA_SRC, 16, src);
2160 		rtm->rtm_src_len = 128;
2161 	} else if (rtm->rtm_src_len)
2162 		NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2163 #endif
2164 	if (iif) {
2165 #ifdef CONFIG_IPV6_MROUTE
2166 		if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2167 			int err = ip6mr_get_route(skb, rtm, nowait);
2168 			if (err <= 0) {
2169 				if (!nowait) {
2170 					if (err == 0)
2171 						return 0;
2172 					goto nla_put_failure;
2173 				} else {
2174 					if (err == -EMSGSIZE)
2175 						goto nla_put_failure;
2176 				}
2177 			}
2178 		} else
2179 #endif
2180 			NLA_PUT_U32(skb, RTA_IIF, iif);
2181 	} else if (dst) {
2182 		struct in6_addr saddr_buf;
2183 		if (ipv6_dev_get_saddr(ip6_dst_idev(&rt->u.dst)->dev,
2184 				       dst, 0, &saddr_buf) == 0)
2185 			NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2186 	}
2187 
2188 	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2189 		goto nla_put_failure;
2190 
2191 	if (rt->u.dst.neighbour)
2192 		NLA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
2193 
2194 	if (rt->u.dst.dev)
2195 		NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2196 
2197 	NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2198 
2199 	if (!(rt->rt6i_flags & RTF_EXPIRES))
2200 		expires = 0;
2201 	else if (rt->rt6i_expires - jiffies < INT_MAX)
2202 		expires = rt->rt6i_expires - jiffies;
2203 	else
2204 		expires = INT_MAX;
2205 
2206 	if (rtnl_put_cacheinfo(skb, &rt->u.dst, 0, 0, 0,
2207 			       expires, rt->u.dst.error) < 0)
2208 		goto nla_put_failure;
2209 
2210 	return nlmsg_end(skb, nlh);
2211 
2212 nla_put_failure:
2213 	nlmsg_cancel(skb, nlh);
2214 	return -EMSGSIZE;
2215 }
2216 
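/* Callback invoked for every route during an RTM_GETROUTE dump; honours
 * the RTM_F_PREFIX filter when the request supplied one.
 */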
2217 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2218 {
2219 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2220 	int prefix;
2221 
2222 	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2223 		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2224 		prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2225 	} else
2226 		prefix = 0;
2227 
2228 	return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2229 		     NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2230 		     prefix, 0, NLM_F_MULTI);
2231 }
2232 
2233 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2234 {
2235 	struct net *net = sock_net(in_skb->sk);
2236 	struct nlattr *tb[RTA_MAX+1];
2237 	struct rt6_info *rt;
2238 	struct sk_buff *skb;
2239 	struct rtmsg *rtm;
2240 	struct flowi fl;
2241 	int err, iif = 0;
2242 
2243 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2244 	if (err < 0)
2245 		goto errout;
2246 
2247 	err = -EINVAL;
2248 	memset(&fl, 0, sizeof(fl));
2249 
2250 	if (tb[RTA_SRC]) {
2251 		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2252 			goto errout;
2253 
2254 		ipv6_addr_copy(&fl.fl6_src, nla_data(tb[RTA_SRC]));
2255 	}
2256 
2257 	if (tb[RTA_DST]) {
2258 		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2259 			goto errout;
2260 
2261 		ipv6_addr_copy(&fl.fl6_dst, nla_data(tb[RTA_DST]));
2262 	}
2263 
2264 	if (tb[RTA_IIF])
2265 		iif = nla_get_u32(tb[RTA_IIF]);
2266 
2267 	if (tb[RTA_OIF])
2268 		fl.oif = nla_get_u32(tb[RTA_OIF]);
2269 
2270 	if (iif) {
2271 		struct net_device *dev;
2272 		dev = __dev_get_by_index(net, iif);
2273 		if (!dev) {
2274 			err = -ENODEV;
2275 			goto errout;
2276 		}
2277 	}
2278 
2279 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2280 	if (skb == NULL) {
2281 		err = -ENOBUFS;
2282 		goto errout;
2283 	}
2284 
2285 	/* Reserve room for dummy headers; this skb can pass
2286 	   through a good chunk of the routing engine.
2287 	 */
2288 	skb_reset_mac_header(skb);
2289 	skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2290 
2291 	rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl);
2292 	skb->dst = &rt->u.dst;
2293 
2294 	err = rt6_fill_node(skb, rt, &fl.fl6_dst, &fl.fl6_src, iif,
2295 			    RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2296 			    nlh->nlmsg_seq, 0, 0, 0);
2297 	if (err < 0) {
2298 		kfree_skb(skb);
2299 		goto errout;
2300 	}
2301 
2302 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2303 errout:
2304 	return err;
2305 }
2306 
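/* Broadcast a route change (the @event message type) to
 * RTNLGRP_IPV6_ROUTE listeners, using rt6_nlmsg_size() to bound the
 * message size.
 */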
2307 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2308 {
2309 	struct sk_buff *skb;
2310 	struct net *net = info->nl_net;
2311 	u32 seq;
2312 	int err;
2313 
2314 	err = -ENOBUFS;
2315 	seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2316 
2317 	skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2318 	if (skb == NULL)
2319 		goto errout;
2320 
2321 	err = rt6_fill_node(skb, rt, NULL, NULL, 0,
2322 				event, info->pid, seq, 0, 0, 0);
2323 	if (err < 0) {
2324 		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2325 		WARN_ON(err == -EMSGSIZE);
2326 		kfree_skb(skb);
2327 		goto errout;
2328 	}
2329 	err = rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2330 			  info->nlh, gfp_any());
2331 errout:
2332 	if (err < 0)
2333 		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2334 }
2335 
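/* When a loopback device registers, bind the special null (and, with
 * multiple tables, prohibit/blackhole) route entries of its namespace
 * to that device.
 */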
2336 static int ip6_route_dev_notify(struct notifier_block *this,
2337 				unsigned long event, void *data)
2338 {
2339 	struct net_device *dev = (struct net_device *)data;
2340 	struct net *net = dev_net(dev);
2341 
2342 	if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2343 		net->ipv6.ip6_null_entry->u.dst.dev = dev;
2344 		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2345 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2346 		net->ipv6.ip6_prohibit_entry->u.dst.dev = dev;
2347 		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2348 		net->ipv6.ip6_blk_hole_entry->u.dst.dev = dev;
2349 		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2350 #endif
2351 	}
2352 
2353 	return NOTIFY_OK;
2354 }
2355 
2356 /*
2357  *	/proc
2358  */
2359 
2360 #ifdef CONFIG_PROC_FS
2361 
2362 #define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)
2363 
2364 struct rt6_proc_arg
2365 {
2366 	char *buffer;
2367 	int offset;
2368 	int length;
2369 	int skip;
2370 	int len;
2371 };
2372 
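/* Emit one /proc/net/ipv6_route line: destination/plen, source/plen,
 * next hop, metric, refcount, use count, flags and device name.
 */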
2373 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2374 {
2375 	struct seq_file *m = p_arg;
2376 
2377 	seq_printf(m, NIP6_SEQFMT " %02x ", NIP6(rt->rt6i_dst.addr),
2378 		   rt->rt6i_dst.plen);
2379 
2380 #ifdef CONFIG_IPV6_SUBTREES
2381 	seq_printf(m, NIP6_SEQFMT " %02x ", NIP6(rt->rt6i_src.addr),
2382 		   rt->rt6i_src.plen);
2383 #else
2384 	seq_puts(m, "00000000000000000000000000000000 00 ");
2385 #endif
2386 
2387 	if (rt->rt6i_nexthop) {
2388 		seq_printf(m, NIP6_SEQFMT,
2389 			   NIP6(*((struct in6_addr *)rt->rt6i_nexthop->primary_key)));
2390 	} else {
2391 		seq_puts(m, "00000000000000000000000000000000");
2392 	}
2393 	seq_printf(m, " %08x %08x %08x %08x %8s\n",
2394 		   rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
2395 		   rt->u.dst.__use, rt->rt6i_flags,
2396 		   rt->rt6i_dev ? rt->rt6i_dev->name : "");
2397 	return 0;
2398 }
2399 
2400 static int ipv6_route_show(struct seq_file *m, void *v)
2401 {
2402 	struct net *net = (struct net *)m->private;
2403 	fib6_clean_all(net, rt6_info_route, 0, m);
2404 	return 0;
2405 }
2406 
2407 static int ipv6_route_open(struct inode *inode, struct file *file)
2408 {
2409 	int err;
2410 	struct net *net = get_proc_net(inode);
2411 	if (!net)
2412 		return -ENXIO;
2413 
2414 	err = single_open(file, ipv6_route_show, net);
2415 	if (err < 0) {
2416 		put_net(net);
2417 		return err;
2418 	}
2419 
2420 	return 0;
2421 }
2422 
2423 static int ipv6_route_release(struct inode *inode, struct file *file)
2424 {
2425 	struct seq_file *seq = file->private_data;
2426 	struct net *net = seq->private;
2427 	put_net(net);
2428 	return single_release(inode, file);
2429 }
2430 
2431 static const struct file_operations ipv6_route_proc_fops = {
2432 	.owner		= THIS_MODULE,
2433 	.open		= ipv6_route_open,
2434 	.read		= seq_read,
2435 	.llseek		= seq_lseek,
2436 	.release	= ipv6_route_release,
2437 };
2438 
2439 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2440 {
2441 	struct net *net = (struct net *)seq->private;
2442 	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2443 		   net->ipv6.rt6_stats->fib_nodes,
2444 		   net->ipv6.rt6_stats->fib_route_nodes,
2445 		   net->ipv6.rt6_stats->fib_rt_alloc,
2446 		   net->ipv6.rt6_stats->fib_rt_entries,
2447 		   net->ipv6.rt6_stats->fib_rt_cache,
2448 		   atomic_read(&net->ipv6.ip6_dst_ops->entries),
2449 		   net->ipv6.rt6_stats->fib_discarded_routes);
2450 
2451 	return 0;
2452 }
2453 
2454 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2455 {
2456 	int err;
2457 	struct net *net = get_proc_net(inode);
2458 	if (!net)
2459 		return -ENXIO;
2460 
2461 	err = single_open(file, rt6_stats_seq_show, net);
2462 	if (err < 0) {
2463 		put_net(net);
2464 		return err;
2465 	}
2466 
2467 	return 0;
2468 }
2469 
2470 static int rt6_stats_seq_release(struct inode *inode, struct file *file)
2471 {
2472 	struct seq_file *seq = file->private_data;
2473 	struct net *net = (struct net *)seq->private;
2474 	put_net(net);
2475 	return single_release(inode, file);
2476 }
2477 
2478 static const struct file_operations rt6_stats_seq_fops = {
2479 	.owner	 = THIS_MODULE,
2480 	.open	 = rt6_stats_seq_open,
2481 	.read	 = seq_read,
2482 	.llseek	 = seq_lseek,
2483 	.release = rt6_stats_seq_release,
2484 };
2485 #endif	/* CONFIG_PROC_FS */
2486 
2487 #ifdef CONFIG_SYSCTL
2488 
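/* Write-only "flush" sysctl: writing a delay value triggers a fib6
 * garbage-collection run; reads return -EINVAL.
 */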
2489 static
2490 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
2491 			      void __user *buffer, size_t *lenp, loff_t *ppos)
2492 {
2493 	struct net *net = current->nsproxy->net_ns;
2494 	int delay = net->ipv6.sysctl.flush_delay;
2495 	if (write) {
2496 		proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2497 		fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2498 		return 0;
2499 	} else
2500 		return -EINVAL;
2501 }
2502 
2503 ctl_table ipv6_route_table_template[] = {
2504 	{
2505 		.procname	=	"flush",
2506 		.data		=	&init_net.ipv6.sysctl.flush_delay,
2507 		.maxlen		=	sizeof(int),
2508 		.mode		=	0200,
2509 		.proc_handler	=	&ipv6_sysctl_rtcache_flush
2510 	},
2511 	{
2512 		.ctl_name	=	NET_IPV6_ROUTE_GC_THRESH,
2513 		.procname	=	"gc_thresh",
2514 		.data		=	&ip6_dst_ops_template.gc_thresh,
2515 		.maxlen		=	sizeof(int),
2516 		.mode		=	0644,
2517 		.proc_handler	=	&proc_dointvec,
2518 	},
2519 	{
2520 		.ctl_name	=	NET_IPV6_ROUTE_MAX_SIZE,
2521 		.procname	=	"max_size",
2522 		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
2523 		.maxlen		=	sizeof(int),
2524 		.mode		=	0644,
2525 		.proc_handler	=	&proc_dointvec,
2526 	},
2527 	{
2528 		.ctl_name	=	NET_IPV6_ROUTE_GC_MIN_INTERVAL,
2529 		.procname	=	"gc_min_interval",
2530 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2531 		.maxlen		=	sizeof(int),
2532 		.mode		=	0644,
2533 		.proc_handler	=	&proc_dointvec_jiffies,
2534 		.strategy	=	&sysctl_jiffies,
2535 	},
2536 	{
2537 		.ctl_name	=	NET_IPV6_ROUTE_GC_TIMEOUT,
2538 		.procname	=	"gc_timeout",
2539 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2540 		.maxlen		=	sizeof(int),
2541 		.mode		=	0644,
2542 		.proc_handler	=	&proc_dointvec_jiffies,
2543 		.strategy	=	&sysctl_jiffies,
2544 	},
2545 	{
2546 		.ctl_name	=	NET_IPV6_ROUTE_GC_INTERVAL,
2547 		.procname	=	"gc_interval",
2548 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
2549 		.maxlen		=	sizeof(int),
2550 		.mode		=	0644,
2551 		.proc_handler	=	&proc_dointvec_jiffies,
2552 		.strategy	=	&sysctl_jiffies,
2553 	},
2554 	{
2555 		.ctl_name	=	NET_IPV6_ROUTE_GC_ELASTICITY,
2556 		.procname	=	"gc_elasticity",
2557 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2558 		.maxlen		=	sizeof(int),
2559 		.mode		=	0644,
2560 		.proc_handler	=	&proc_dointvec_jiffies,
2561 		.strategy	=	&sysctl_jiffies,
2562 	},
2563 	{
2564 		.ctl_name	=	NET_IPV6_ROUTE_MTU_EXPIRES,
2565 		.procname	=	"mtu_expires",
2566 		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2567 		.maxlen		=	sizeof(int),
2568 		.mode		=	0644,
2569 		.proc_handler	=	&proc_dointvec_jiffies,
2570 		.strategy	=	&sysctl_jiffies,
2571 	},
2572 	{
2573 		.ctl_name	=	NET_IPV6_ROUTE_MIN_ADVMSS,
2574 		.procname	=	"min_adv_mss",
2575 		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
2576 		.maxlen		=	sizeof(int),
2577 		.mode		=	0644,
2578 		.proc_handler	=	&proc_dointvec_jiffies,
2579 		.strategy	=	&sysctl_jiffies,
2580 	},
2581 	{
2582 		.ctl_name	=	NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,
2583 		.procname	=	"gc_min_interval_ms",
2584 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2585 		.maxlen		=	sizeof(int),
2586 		.mode		=	0644,
2587 		.proc_handler	=	&proc_dointvec_ms_jiffies,
2588 		.strategy	=	&sysctl_ms_jiffies,
2589 	},
2590 	{ .ctl_name = 0 }
2591 };
2592 
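/* Duplicate the sysctl template for a namespace and rebind each entry's
 * data pointer to that namespace's copy of the tunable.
 */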
2593 struct ctl_table *ipv6_route_sysctl_init(struct net *net)
2594 {
2595 	struct ctl_table *table;
2596 
2597 	table = kmemdup(ipv6_route_table_template,
2598 			sizeof(ipv6_route_table_template),
2599 			GFP_KERNEL);
2600 
2601 	if (table) {
2602 		table[0].data = &net->ipv6.sysctl.flush_delay;
2603 		table[1].data = &net->ipv6.ip6_dst_ops->gc_thresh;
2604 		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2605 		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2606 		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2607 		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2608 		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2609 		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2610 		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2611 	}
2612 
2613 	return table;
2614 }
2615 #endif
2616 
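/* Per-namespace setup: clone the dst_ops template and the special null
 * (and optionally prohibit/blackhole) route entries, then create the
 * /proc files and set the initial GC expiry.
 */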
2617 static int ip6_route_net_init(struct net *net)
2618 {
2619 	int ret = -ENOMEM;
2620 
2621 	net->ipv6.ip6_dst_ops = kmemdup(&ip6_dst_ops_template,
2622 					sizeof(*net->ipv6.ip6_dst_ops),
2623 					GFP_KERNEL);
2624 	if (!net->ipv6.ip6_dst_ops)
2625 		goto out;
2626 	net->ipv6.ip6_dst_ops->dst_net = hold_net(net);
2627 
2628 	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2629 					   sizeof(*net->ipv6.ip6_null_entry),
2630 					   GFP_KERNEL);
2631 	if (!net->ipv6.ip6_null_entry)
2632 		goto out_ip6_dst_ops;
2633 	net->ipv6.ip6_null_entry->u.dst.path =
2634 		(struct dst_entry *)net->ipv6.ip6_null_entry;
2635 	net->ipv6.ip6_null_entry->u.dst.ops = net->ipv6.ip6_dst_ops;
2636 
2637 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2638 	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2639 					       sizeof(*net->ipv6.ip6_prohibit_entry),
2640 					       GFP_KERNEL);
2641 	if (!net->ipv6.ip6_prohibit_entry) {
2642 		kfree(net->ipv6.ip6_null_entry);
2643 		goto out_ip6_dst_ops;
2644 	}
2645 	net->ipv6.ip6_prohibit_entry->u.dst.path =
2646 		(struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2647 	net->ipv6.ip6_prohibit_entry->u.dst.ops = net->ipv6.ip6_dst_ops;
2648 
2649 	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2650 					       sizeof(*net->ipv6.ip6_blk_hole_entry),
2651 					       GFP_KERNEL);
2652 	if (!net->ipv6.ip6_blk_hole_entry) {
2653 		kfree(net->ipv6.ip6_null_entry);
2654 		kfree(net->ipv6.ip6_prohibit_entry);
2655 		goto out_ip6_dst_ops;
2656 	}
2657 	net->ipv6.ip6_blk_hole_entry->u.dst.path =
2658 		(struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2659 	net->ipv6.ip6_blk_hole_entry->u.dst.ops = net->ipv6.ip6_dst_ops;
2660 #endif
2661 
2662 #ifdef CONFIG_PROC_FS
2663 	proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2664 	proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2665 #endif
2666 	net->ipv6.ip6_rt_gc_expire = 30*HZ;
2667 
2668 	ret = 0;
2669 out:
2670 	return ret;
2671 
2672 out_ip6_dst_ops:
2673 	release_net(net->ipv6.ip6_dst_ops->dst_net);
2674 	kfree(net->ipv6.ip6_dst_ops);
2675 	goto out;
2676 }
2677 
2678 static void ip6_route_net_exit(struct net *net)
2679 {
2680 #ifdef CONFIG_PROC_FS
2681 	proc_net_remove(net, "ipv6_route");
2682 	proc_net_remove(net, "rt6_stats");
2683 #endif
2684 	kfree(net->ipv6.ip6_null_entry);
2685 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2686 	kfree(net->ipv6.ip6_prohibit_entry);
2687 	kfree(net->ipv6.ip6_blk_hole_entry);
2688 #endif
2689 	release_net(net->ipv6.ip6_dst_ops->dst_net);
2690 	kfree(net->ipv6.ip6_dst_ops);
2691 }
2692 
2693 static struct pernet_operations ip6_route_net_ops = {
2694 	.init = ip6_route_net_init,
2695 	.exit = ip6_route_net_exit,
2696 };
2697 
2698 static struct notifier_block ip6_route_dev_notifier = {
2699 	.notifier_call = ip6_route_dev_notify,
2700 	.priority = 0,
2701 };
2702 
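/* Module init: create the rt6_info slab cache, register the per-namespace
 * operations, wire init_net's special routes to the loopback device, then
 * initialize fib6, xfrm6 and fib6 rules and register the rtnetlink
 * handlers and the netdevice notifier.
 */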
2703 int __init ip6_route_init(void)
2704 {
2705 	int ret;
2706 
2707 	ret = -ENOMEM;
2708 	ip6_dst_ops_template.kmem_cachep =
2709 		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2710 				  SLAB_HWCACHE_ALIGN, NULL);
2711 	if (!ip6_dst_ops_template.kmem_cachep)
2712 		goto out;
2713 
2714 	ret = register_pernet_subsys(&ip6_route_net_ops);
2715 	if (ret)
2716 		goto out_kmem_cache;
2717 
2718 	/* The loopback device is registered before this code runs, so the
2719 	 * loopback reference in rt6_info is not taken automatically; take
2720 	 * it manually for init_net. */
2721 	init_net.ipv6.ip6_null_entry->u.dst.dev = init_net.loopback_dev;
2722 	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2723 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2724 	init_net.ipv6.ip6_prohibit_entry->u.dst.dev = init_net.loopback_dev;
2725 	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2726 	init_net.ipv6.ip6_blk_hole_entry->u.dst.dev = init_net.loopback_dev;
2727 	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2728 #endif
2729 	ret = fib6_init();
2730 	if (ret)
2731 		goto out_register_subsys;
2732 
2733 	ret = xfrm6_init();
2734 	if (ret)
2735 		goto out_fib6_init;
2736 
2737 	ret = fib6_rules_init();
2738 	if (ret)
2739 		goto xfrm6_init;
2740 
2741 	ret = -ENOBUFS;
2742 	if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL) ||
2743 	    __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL) ||
2744 	    __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL))
2745 		goto fib6_rules_init;
2746 
2747 	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
2748 	if (ret)
2749 		goto fib6_rules_init;
2750 
2751 out:
2752 	return ret;
2753 
2754 fib6_rules_init:
2755 	fib6_rules_cleanup();
2756 xfrm6_init:
2757 	xfrm6_fini();
2758 out_fib6_init:
2759 	fib6_gc_cleanup();
2760 out_register_subsys:
2761 	unregister_pernet_subsys(&ip6_route_net_ops);
2762 out_kmem_cache:
2763 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2764 	goto out;
2765 }
2766 
2767 void ip6_route_cleanup(void)
2768 {
2769 	unregister_netdevice_notifier(&ip6_route_dev_notifier);
2770 	fib6_rules_cleanup();
2771 	xfrm6_fini();
2772 	fib6_gc_cleanup();
2773 	unregister_pernet_subsys(&ip6_route_net_ops);
2774 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2775 }
2776