xref: /openbmc/linux/net/ipv6/route.c (revision 5d4a2e29)
1 /*
2  *	Linux INET6 implementation
3  *	FIB front-end.
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13 
14 /*	Changes:
15  *
16  *	YOSHIFUJI Hideaki @USAGI
17  *		reworked default router selection.
18  *		- respect outgoing interface
19  *		- select from (probably) reachable routers (i.e.
20  *		routers in REACHABLE, STALE, DELAY or PROBE states).
21  *		- always select the same router if it is (probably)
22  *		reachable.  otherwise, round-robin the list.
23  *	Ville Nuorvala
24  *		Fixed routing subtrees.
25  */
26 
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/mroute6.h>
38 #include <linux/init.h>
39 #include <linux/if_arp.h>
40 #include <linux/proc_fs.h>
41 #include <linux/seq_file.h>
42 #include <linux/nsproxy.h>
43 #include <linux/slab.h>
44 #include <net/net_namespace.h>
45 #include <net/snmp.h>
46 #include <net/ipv6.h>
47 #include <net/ip6_fib.h>
48 #include <net/ip6_route.h>
49 #include <net/ndisc.h>
50 #include <net/addrconf.h>
51 #include <net/tcp.h>
52 #include <linux/rtnetlink.h>
53 #include <net/dst.h>
54 #include <net/xfrm.h>
55 #include <net/netevent.h>
56 #include <net/netlink.h>
57 
58 #include <asm/uaccess.h>
59 
60 #ifdef CONFIG_SYSCTL
61 #include <linux/sysctl.h>
62 #endif
63 
/*
 * Debug verbosity for this file. RDBG() and RT6_TRACE() expand to printk
 * calls only when RT6_DEBUG is 3 or higher; at lower levels they expand to
 * nothing (RDBG) or an empty statement (RT6_TRACE).
 */
64 /* Set to 3 to get tracing. */
65 #define RT6_DEBUG 2
66 
67 #if RT6_DEBUG >= 3
68 #define RDBG(x) printk x
69 #define RT6_TRACE(x...) printk(KERN_DEBUG x)
70 #else
71 #define RDBG(x)
72 #define RT6_TRACE(x...) do { ; } while (0)
73 #endif
74 
/*
 * Compile-time switch consulted by ip6_pol_route(): when non-zero, routes
 * that already have a nexthop are cloned with rt6_alloc_clone(); when zero
 * they are returned as-is.
 */
75 #define CLONE_OFFLINK_ROUTE 0
76 
/*
 * Forward declarations of file-local (static) functions that are referenced
 * (e.g. from the dst_ops and rt6_info templates below) before they are defined.
 */
77 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
78 static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
79 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
80 static void		ip6_dst_destroy(struct dst_entry *);
81 static void		ip6_dst_ifdown(struct dst_entry *,
82 				       struct net_device *dev, int how);
83 static int		 ip6_dst_gc(struct dst_ops *ops);
84 
85 static int		ip6_pkt_discard(struct sk_buff *skb);
86 static int		ip6_pkt_discard_out(struct sk_buff *skb);
87 static void		ip6_link_failure(struct sk_buff *skb);
88 static void		ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
89 
/* Route Information option helpers; only built with CONFIG_IPV6_ROUTE_INFO. */
90 #ifdef CONFIG_IPV6_ROUTE_INFO
91 static struct rt6_info *rt6_add_route_info(struct net *net,
92 					   struct in6_addr *prefix, int prefixlen,
93 					   struct in6_addr *gwaddr, int ifindex,
94 					   unsigned pref);
95 static struct rt6_info *rt6_get_route_info(struct net *net,
96 					   struct in6_addr *prefix, int prefixlen,
97 					   struct in6_addr *gwaddr, int ifindex);
98 #endif
99 
/*
 * dst_ops for regular IPv6 routes: wires the generic destination-cache hooks
 * (gc, check, destroy, ifdown, negative_advice, link_failure, update_pmtu)
 * to the ip6_* handlers in this file.
 * NOTE(review): named "template"; presumably copied into the per-namespace
 * net->ipv6.ip6_dst_ops used elsewhere in this file - the copy is not
 * visible in this section.
 */
100 static struct dst_ops ip6_dst_ops_template = {
101 	.family			=	AF_INET6,
102 	.protocol		=	cpu_to_be16(ETH_P_IPV6),
103 	.gc			=	ip6_dst_gc,
104 	.gc_thresh		=	1024,
105 	.check			=	ip6_dst_check,
106 	.destroy		=	ip6_dst_destroy,
107 	.ifdown			=	ip6_dst_ifdown,
108 	.negative_advice	=	ip6_negative_advice,
109 	.link_failure		=	ip6_link_failure,
110 	.update_pmtu		=	ip6_rt_update_pmtu,
111 	.local_out		=	__ip6_local_out,
112 	.entries		=	ATOMIC_INIT(0),
113 };
114 
/*
 * update_pmtu handler for blackhole dsts (see ip6_dst_blackhole_ops):
 * intentionally does nothing, so PMTU updates are ignored.
 */
115 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
116 {
117 }
118 
/*
 * dst_ops for the blackhole dsts allocated by ip6_dst_blackhole(). Shares
 * destroy/check with the regular IPv6 ops but has no gc hook and uses the
 * no-op update_pmtu handler.
 */
119 static struct dst_ops ip6_dst_blackhole_ops = {
120 	.family			=	AF_INET6,
121 	.protocol		=	cpu_to_be16(ETH_P_IPV6),
122 	.destroy		=	ip6_dst_destroy,
123 	.check			=	ip6_dst_check,
124 	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
125 	.entries		=	ATOMIC_INIT(0),
126 };
127 
/*
 * Template for the "null" route: a reject route whose dst error is
 * -ENETUNREACH and whose input/output handlers are ip6_pkt_discard /
 * ip6_pkt_discard_out. Lookups in this file return net->ipv6.ip6_null_entry
 * when nothing matches.
 * NOTE(review): presumably the per-namespace ip6_null_entry is created from
 * this template - the copy is not visible in this section.
 */
128 static struct rt6_info ip6_null_entry_template = {
129 	.u = {
130 		.dst = {
131 			.__refcnt	= ATOMIC_INIT(1),
132 			.__use		= 1,
133 			.obsolete	= -1,
134 			.error		= -ENETUNREACH,
135 			.metrics	= { [RTAX_HOPLIMIT - 1] = 255, },
136 			.input		= ip6_pkt_discard,
137 			.output		= ip6_pkt_discard_out,
138 		}
139 	},
140 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
141 	.rt6i_protocol  = RTPROT_KERNEL,
	/* largest possible metric value */
142 	.rt6i_metric	= ~(u32) 0,
143 	.rt6i_ref	= ATOMIC_INIT(1),
144 };
145 
146 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
147 
/* Packet handlers used by the prohibit route template below. */
148 static int ip6_pkt_prohibit(struct sk_buff *skb);
149 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
150 
/*
 * Template for the "prohibit" route (CONFIG_IPV6_MULTIPLE_TABLES only):
 * same shape as the null entry template, but the dst error is -EACCES and
 * packets go to ip6_pkt_prohibit / ip6_pkt_prohibit_out.
 */
151 static struct rt6_info ip6_prohibit_entry_template = {
152 	.u = {
153 		.dst = {
154 			.__refcnt	= ATOMIC_INIT(1),
155 			.__use		= 1,
156 			.obsolete	= -1,
157 			.error		= -EACCES,
158 			.metrics	= { [RTAX_HOPLIMIT - 1] = 255, },
159 			.input		= ip6_pkt_prohibit,
160 			.output		= ip6_pkt_prohibit_out,
161 		}
162 	},
163 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
164 	.rt6i_protocol  = RTPROT_KERNEL,
165 	.rt6i_metric	= ~(u32) 0,
166 	.rt6i_ref	= ATOMIC_INIT(1),
167 };
168 
/*
 * Template for the "blackhole" route (CONFIG_IPV6_MULTIPLE_TABLES only):
 * dst error is -EINVAL and both input and output use dst_discard.
 */
169 static struct rt6_info ip6_blk_hole_entry_template = {
170 	.u = {
171 		.dst = {
172 			.__refcnt	= ATOMIC_INIT(1),
173 			.__use		= 1,
174 			.obsolete	= -1,
175 			.error		= -EINVAL,
176 			.metrics	= { [RTAX_HOPLIMIT - 1] = 255, },
177 			.input		= dst_discard,
178 			.output		= dst_discard,
179 		}
180 	},
181 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
182 	.rt6i_protocol  = RTPROT_KERNEL,
183 	.rt6i_metric	= ~(u32) 0,
184 	.rt6i_ref	= ATOMIC_INIT(1),
185 };
186 
187 #endif
188 
/*
 * Allocate a destination entry from @ops and hand it back typed as an
 * rt6_info. Returns whatever dst_alloc() returned (NULL on failure).
 */
static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops)
{
	struct rt6_info *rt = (struct rt6_info *)dst_alloc(ops);

	return rt;
}
194 
195 static void ip6_dst_destroy(struct dst_entry *dst)
196 {
197 	struct rt6_info *rt = (struct rt6_info *)dst;
198 	struct inet6_dev *idev = rt->rt6i_idev;
199 
200 	if (idev != NULL) {
201 		rt->rt6i_idev = NULL;
202 		in6_dev_put(idev);
203 	}
204 }
205 
206 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
207 			   int how)
208 {
209 	struct rt6_info *rt = (struct rt6_info *)dst;
210 	struct inet6_dev *idev = rt->rt6i_idev;
211 	struct net_device *loopback_dev =
212 		dev_net(dev)->loopback_dev;
213 
214 	if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
215 		struct inet6_dev *loopback_idev =
216 			in6_dev_get(loopback_dev);
217 		if (loopback_idev != NULL) {
218 			rt->rt6i_idev = loopback_idev;
219 			in6_dev_put(idev);
220 		}
221 	}
222 }
223 
224 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
225 {
226 	return (rt->rt6i_flags & RTF_EXPIRES &&
227 		time_after(jiffies, rt->rt6i_expires));
228 }
229 
230 static inline int rt6_need_strict(struct in6_addr *daddr)
231 {
232 	return (ipv6_addr_type(daddr) &
233 		(IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK));
234 }
235 
236 /*
237  *	Route lookup. Any table->tb6_lock is implied.
238  */
239 
/*
 * Pick, from the list of routes starting at @rt (linked via u.dst.rt6_next),
 * the one that fits the requested interface or source address.
 *
 * @net:   namespace; supplies ip6_null_entry and is passed to ipv6_chk_addr()
 * @rt:    head of the route list to scan
 * @saddr: source address, only consulted when @oif is 0
 * @oif:   requested interface index, 0 for "any"
 * @flags: RT6_LOOKUP_F_* flags; only RT6_LOOKUP_F_IFACE is examined here
 *
 * Returns the first route whose device has ifindex @oif; failing that, a
 * loopback-device route remembered in 'local'; failing that, the namespace
 * null entry when RT6_LOOKUP_F_IFACE is set, otherwise @rt itself.
 * With no @oif, returns the first route for which ipv6_chk_addr() accepts
 * @saddr on the route's device, otherwise @rt.
 */
240 static inline struct rt6_info *rt6_device_match(struct net *net,
241 						    struct rt6_info *rt,
242 						    struct in6_addr *saddr,
243 						    int oif,
244 						    int flags)
245 {
246 	struct rt6_info *local = NULL;
247 	struct rt6_info *sprt;
248 
	/* Nothing to match against: neither an interface nor a source address. */
249 	if (!oif && ipv6_addr_any(saddr))
250 		goto out;
251 
252 	for (sprt = rt; sprt; sprt = sprt->u.dst.rt6_next) {
253 		struct net_device *dev = sprt->rt6i_dev;
254 
255 		if (oif) {
			/* Exact device match wins immediately. */
256 			if (dev->ifindex == oif)
257 				return sprt;
258 			if (dev->flags & IFF_LOOPBACK) {
				/*
				 * Loopback route whose inet6_dev is missing or is
				 * not the requested interface: skip it in strict
				 * mode, or when an earlier candidate whose
				 * inet6_dev does match @oif is already recorded.
				 */
259 				if (sprt->rt6i_idev == NULL ||
260 				    sprt->rt6i_idev->dev->ifindex != oif) {
261 					if (flags & RT6_LOOKUP_F_IFACE && oif)
262 						continue;
263 					if (local && (!oif ||
264 						      local->rt6i_idev->dev->ifindex == oif))
265 						continue;
266 				}
267 				local = sprt;
268 			}
269 		} else {
			/* No oif: match on the source address instead. */
270 			if (ipv6_chk_addr(net, saddr, dev,
271 					  flags & RT6_LOOKUP_F_IFACE))
272 				return sprt;
273 		}
274 	}
275 
276 	if (oif) {
277 		if (local)
278 			return local;
279 
		/* Strict lookup and nothing on the requested interface. */
280 		if (flags & RT6_LOOKUP_F_IFACE)
281 			return net->ipv6.ip6_null_entry;
282 	}
283 out:
284 	return rt;
285 }
286 
/*
 * rt6_probe - send a Neighbour Solicitation towards a route's nexthop when
 * its neighbour entry is not in a NUD_VALID state.
 *
 * With CONFIG_IPV6_ROUTER_PREF: does nothing if @rt is NULL, has no nexthop,
 * or the nexthop is already NUD_VALID. Otherwise, if more than the
 * inet6_dev's cnf.rtr_probe_interval has elapsed since neigh->updated, the
 * timestamp is refreshed and a solicitation is sent to the target's
 * solicited-node multicast address on the route's device.
 * Without CONFIG_IPV6_ROUTER_PREF this is an empty stub.
 *
 * NOTE(review): neigh->updated is written while only the read side of
 * neigh->lock is held - confirm this is acceptable for concurrent probers.
 */
287 #ifdef CONFIG_IPV6_ROUTER_PREF
288 static void rt6_probe(struct rt6_info *rt)
289 {
290 	struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
291 	/*
292 	 * Okay, this does not seem to be appropriate
293 	 * for now, however, we need to check if it
294 	 * is really so; aka Router Reachability Probing.
295 	 *
296 	 * Router Reachability Probe MUST be rate-limited
297 	 * to no more than one per minute.
298 	 */
	/* Cheap unlocked pre-check; re-checked under the lock below. */
299 	if (!neigh || (neigh->nud_state & NUD_VALID))
300 		return;
301 	read_lock_bh(&neigh->lock);
302 	if (!(neigh->nud_state & NUD_VALID) &&
303 	    time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
304 		struct in6_addr mcaddr;
305 		struct in6_addr *target;
306 
307 		neigh->updated = jiffies;
		/* Drop the lock before transmitting the solicitation. */
308 		read_unlock_bh(&neigh->lock);
309 
310 		target = (struct in6_addr *)&neigh->primary_key;
311 		addrconf_addr_solict_mult(target, &mcaddr);
312 		ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
313 	} else
314 		read_unlock_bh(&neigh->lock);
315 }
316 #else
317 static inline void rt6_probe(struct rt6_info *rt)
318 {
319 }
320 #endif
321 
322 /*
323  * Default Router Selection (RFC 2461 6.3.6)
324  */
325 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
326 {
327 	struct net_device *dev = rt->rt6i_dev;
328 	if (!oif || dev->ifindex == oif)
329 		return 2;
330 	if ((dev->flags & IFF_LOOPBACK) &&
331 	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
332 		return 1;
333 	return 0;
334 }
335 
336 static inline int rt6_check_neigh(struct rt6_info *rt)
337 {
338 	struct neighbour *neigh = rt->rt6i_nexthop;
339 	int m;
340 	if (rt->rt6i_flags & RTF_NONEXTHOP ||
341 	    !(rt->rt6i_flags & RTF_GATEWAY))
342 		m = 1;
343 	else if (neigh) {
344 		read_lock_bh(&neigh->lock);
345 		if (neigh->nud_state & NUD_VALID)
346 			m = 2;
347 #ifdef CONFIG_IPV6_ROUTER_PREF
348 		else if (neigh->nud_state & NUD_FAILED)
349 			m = 0;
350 #endif
351 		else
352 			m = 1;
353 		read_unlock_bh(&neigh->lock);
354 	} else
355 		m = 0;
356 	return m;
357 }
358 
359 static int rt6_score_route(struct rt6_info *rt, int oif,
360 			   int strict)
361 {
362 	int m, n;
363 
364 	m = rt6_check_dev(rt, oif);
365 	if (!m && (strict & RT6_LOOKUP_F_IFACE))
366 		return -1;
367 #ifdef CONFIG_IPV6_ROUTER_PREF
368 	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
369 #endif
370 	n = rt6_check_neigh(rt);
371 	if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
372 		return -1;
373 	return m;
374 }
375 
376 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
377 				   int *mpri, struct rt6_info *match)
378 {
379 	int m;
380 
381 	if (rt6_check_expired(rt))
382 		goto out;
383 
384 	m = rt6_score_route(rt, oif, strict);
385 	if (m < 0)
386 		goto out;
387 
388 	if (m > *mpri) {
389 		if (strict & RT6_LOOKUP_F_REACHABLE)
390 			rt6_probe(match);
391 		*mpri = m;
392 		match = rt;
393 	} else if (strict & RT6_LOOKUP_F_REACHABLE) {
394 		rt6_probe(rt);
395 	}
396 
397 out:
398 	return match;
399 }
400 
401 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
402 				     struct rt6_info *rr_head,
403 				     u32 metric, int oif, int strict)
404 {
405 	struct rt6_info *rt, *match;
406 	int mpri = -1;
407 
408 	match = NULL;
409 	for (rt = rr_head; rt && rt->rt6i_metric == metric;
410 	     rt = rt->u.dst.rt6_next)
411 		match = find_match(rt, oif, strict, &mpri, match);
412 	for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
413 	     rt = rt->u.dst.rt6_next)
414 		match = find_match(rt, oif, strict, &mpri, match);
415 
416 	return match;
417 }
418 
419 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
420 {
421 	struct rt6_info *match, *rt0;
422 	struct net *net;
423 
424 	RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
425 		  __func__, fn->leaf, oif);
426 
427 	rt0 = fn->rr_ptr;
428 	if (!rt0)
429 		fn->rr_ptr = rt0 = fn->leaf;
430 
431 	match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
432 
433 	if (!match &&
434 	    (strict & RT6_LOOKUP_F_REACHABLE)) {
435 		struct rt6_info *next = rt0->u.dst.rt6_next;
436 
437 		/* no entries matched; do round-robin */
438 		if (!next || next->rt6i_metric != rt0->rt6i_metric)
439 			next = fn->leaf;
440 
441 		if (next != rt0)
442 			fn->rr_ptr = next;
443 	}
444 
445 	RT6_TRACE("%s() => %p\n",
446 		  __func__, match);
447 
448 	net = dev_net(rt0->rt6i_dev);
449 	return (match ? match : net->ipv6.ip6_null_entry);
450 }
451 
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process a received Route Information option.
 *
 * @dev:    device the option arrived on
 * @opt:    start of the option, interpreted as struct route_info
 * @len:    number of bytes available at @opt
 * @gwaddr: address of the advertising router
 *
 * Validates the option (overall size, length/prefix_len consistency,
 * preference value), then adds, refreshes or - for a zero lifetime -
 * deletes the matching route. Returns 0 on success or -EINVAL when the
 * option is malformed.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr masked, *prefix;
	struct rt6_info *rt;
	unsigned long lifetime;
	unsigned int pref;

	if (len < sizeof(struct route_info))
		return -EINVAL;

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3 || rinfo->prefix_len > 128)
		return -EINVAL;
	if (rinfo->prefix_len > 64 && rinfo->length < 2)
		return -EINVAL;
	if (rinfo->prefix_len > 0 && rinfo->length < 1)
		return -EINVAL;

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	/*
	 * A length of 3 means the option's prefix field is used directly;
	 * for shorter options it is masked into a local buffer according
	 * to prefix_len.
	 */
	prefix = (struct in6_addr *)rinfo->prefix;
	if (rinfo->length != 3) {
		/* this function is safe */
		ipv6_addr_prefix(&masked,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &masked;
	}

	rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
				dev->ifindex);

	/* Zero lifetime withdraws an existing route. */
	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (rt) {
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
	} else if (lifetime) {
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev->ifindex, pref);
	}

	if (!rt)
		return 0;

	if (addrconf_finite_timeout(lifetime)) {
		rt->rt6i_expires = jiffies + HZ * lifetime;
		rt->rt6i_flags |= RTF_EXPIRES;
	} else {
		rt->rt6i_flags &= ~RTF_EXPIRES;
	}
	dst_release(&rt->u.dst);

	return 0;
}
#endif
525 
/*
 * BACKTRACK(net, saddr) - retry a failed lookup further up the FIB tree.
 *
 * Not hygienic: it expects the expanding function to provide local
 * variables 'rt' (struct rt6_info *) and 'fn' (struct fib6_node *), plus
 * the labels 'out' and 'restart'.
 *
 * If 'rt' is the namespace's null entry, walk from 'fn' towards the root:
 * stop with 'goto out' once a node flagged RTN_TL_ROOT is reached; otherwise
 * move to the parent - or, when the parent has a subtree that is not the
 * current node, look @saddr up in that subtree - and 'goto restart' as soon
 * as a node carrying RTN_RTINFO is found. If 'rt' is not the null entry the
 * macro does nothing.
 */
526 #define BACKTRACK(__net, saddr)			\
527 do { \
528 	if (rt == __net->ipv6.ip6_null_entry) {	\
529 		struct fib6_node *pn; \
530 		while (1) { \
531 			if (fn->fn_flags & RTN_TL_ROOT) \
532 				goto out; \
533 			pn = fn->parent; \
534 			if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
535 				fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
536 			else \
537 				fn = pn; \
538 			if (fn->fn_flags & RTN_RTINFO) \
539 				goto restart; \
540 		} \
541 	} \
542 } while(0)
543 
/*
 * Per-table lookup callback used by rt6_lookup() through fib6_rule_lookup().
 *
 * Under the table's read lock: find the node for the flow's destination and
 * source, pick a route from its leaf list with rt6_device_match(), and
 * backtrack towards the root while the result is the null entry. The chosen
 * route is passed to dst_use() before the lock is dropped, and returned.
 */
544 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
545 					     struct fib6_table *table,
546 					     struct flowi *fl, int flags)
547 {
548 	struct fib6_node *fn;
549 	struct rt6_info *rt;
550 
551 	read_lock_bh(&table->tb6_lock);
552 	fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
/* 'restart' and 'out' are the jump targets required by BACKTRACK(). */
553 restart:
554 	rt = fn->leaf;
555 	rt = rt6_device_match(net, rt, &fl->fl6_src, fl->oif, flags);
556 	BACKTRACK(net, &fl->fl6_src);
557 out:
558 	dst_use(&rt->u.dst, jiffies);
559 	read_unlock_bh(&table->tb6_lock);
560 	return rt;
561 
562 }
563 
564 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
565 			    const struct in6_addr *saddr, int oif, int strict)
566 {
567 	struct flowi fl = {
568 		.oif = oif,
569 		.nl_u = {
570 			.ip6_u = {
571 				.daddr = *daddr,
572 			},
573 		},
574 	};
575 	struct dst_entry *dst;
576 	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
577 
578 	if (saddr) {
579 		memcpy(&fl.fl6_src, saddr, sizeof(*saddr));
580 		flags |= RT6_LOOKUP_F_HAS_SADDR;
581 	}
582 
583 	dst = fib6_rule_lookup(net, &fl, flags, ip6_pol_route_lookup);
584 	if (dst->error == 0)
585 		return (struct rt6_info *) dst;
586 
587 	dst_release(dst);
588 
589 	return NULL;
590 }
591 
592 EXPORT_SYMBOL(rt6_lookup);
593 
594 /* ip6_ins_rt is called with table->tb6_lock NOT held.
595    It takes a new route entry; if the addition fails for any reason, the
596    route is freed. In any case, if the caller does not hold a reference
597    to it, it may be destroyed.
598  */
599 
600 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
601 {
602 	int err;
603 	struct fib6_table *table;
604 
605 	table = rt->rt6i_table;
606 	write_lock_bh(&table->tb6_lock);
607 	err = fib6_add(&table->tb6_root, rt, info);
608 	write_unlock_bh(&table->tb6_lock);
609 
610 	return err;
611 }
612 
613 int ip6_ins_rt(struct rt6_info *rt)
614 {
615 	struct nl_info info = {
616 		.nl_net = dev_net(rt->rt6i_dev),
617 	};
618 	return __ip6_ins_rt(rt, &info);
619 }
620 
/*
 * Create a host (/128) cache copy of @ort for destination @daddr and bind a
 * neighbour entry to it.
 *
 * @ort:   route to copy (via ip6_rt_copy())
 * @daddr: destination the copy is specialised for
 * @saddr: source address; only used with CONFIG_IPV6_SUBTREES when the copy
 *         has a source prefix
 *
 * Returns the new route with rt6i_nexthop set, or NULL when the copy could
 * not be allocated or no neighbour entry could be obtained (in which case
 * the copy is released with dst_free()).
 */
621 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
622 				      struct in6_addr *saddr)
623 {
624 	struct rt6_info *rt;
625 
626 	/*
627 	 *	Clone the route.
628 	 */
629 
630 	rt = ip6_rt_copy(ort);
631 
632 	if (rt) {
633 		struct neighbour *neigh;
		/* One GC-and-retry is allowed, but only outside softirq context. */
634 		int attempts = !in_softirq();
635 
		/*
		 * Non-gateway route: the destination itself becomes the
		 * gateway. If the route is not already a host route and
		 * @daddr equals its prefix address, mark the copy anycast.
		 */
636 		if (!(rt->rt6i_flags&RTF_GATEWAY)) {
637 			if (rt->rt6i_dst.plen != 128 &&
638 			    ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
639 				rt->rt6i_flags |= RTF_ANYCAST;
640 			ipv6_addr_copy(&rt->rt6i_gateway, daddr);
641 		}
642 
643 		ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
644 		rt->rt6i_dst.plen = 128;
645 		rt->rt6i_flags |= RTF_CACHE;
646 		rt->u.dst.flags |= DST_HOST;
647 
648 #ifdef CONFIG_IPV6_SUBTREES
649 		if (rt->rt6i_src.plen && saddr) {
650 			ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
651 			rt->rt6i_src.plen = 128;
652 		}
653 #endif
654 
655 	retry:
656 		neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
657 		if (IS_ERR(neigh)) {
658 			struct net *net = dev_net(rt->rt6i_dev);
659 			int saved_rt_min_interval =
660 				net->ipv6.sysctl.ip6_rt_gc_min_interval;
661 			int saved_rt_elasticity =
662 				net->ipv6.sysctl.ip6_rt_gc_elasticity;
663 
			/*
			 * Temporarily override the GC sysctls (elasticity 1,
			 * min interval 0), run ip6_dst_gc(), restore them and
			 * try the neighbour lookup once more.
			 */
664 			if (attempts-- > 0) {
665 				net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
666 				net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
667 
668 				ip6_dst_gc(&net->ipv6.ip6_dst_ops);
669 
670 				net->ipv6.sysctl.ip6_rt_gc_elasticity =
671 					saved_rt_elasticity;
672 				net->ipv6.sysctl.ip6_rt_gc_min_interval =
673 					saved_rt_min_interval;
674 				goto retry;
675 			}
676 
			/* Out of retries: warn (rate-limited) and give up. */
677 			if (net_ratelimit())
678 				printk(KERN_WARNING
679 				       "Neighbour table overflow.\n");
680 			dst_free(&rt->u.dst);
681 			return NULL;
682 		}
683 		rt->rt6i_nexthop = neigh;
684 
685 	}
686 
687 	return rt;
688 }
689 
690 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
691 {
692 	struct rt6_info *rt = ip6_rt_copy(ort);
693 	if (rt) {
694 		ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
695 		rt->rt6i_dst.plen = 128;
696 		rt->rt6i_flags |= RTF_CACHE;
697 		rt->u.dst.flags |= DST_HOST;
698 		rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
699 	}
700 	return rt;
701 }
702 
/*
 * Core per-table route resolution shared by the input and output paths.
 *
 * @net:   network namespace
 * @table: FIB table to search
 * @oif:   interface index to match (callers pass fl->iif or fl->oif)
 * @fl:    flow being routed
 * @flags: RT6_LOOKUP_F_* flags; only RT6_LOOKUP_F_IFACE is carried into
 *         the selection
 *
 * Selects a route with rt6_select(), first requiring reachability (unless
 * forwarding is enabled in devconf_all) and then without that requirement.
 * A selected route that has no nexthop and is not RTF_NONEXTHOP is replaced
 * by a per-destination cache copy which is inserted into the table; if the
 * insert fails the lookup is repeated, up to 'attempts' times.
 *
 * Every return path takes a reference on the returned route with dst_hold()
 * and updates its lastuse/__use counters.
 */
703 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
704 				      struct flowi *fl, int flags)
705 {
706 	struct fib6_node *fn;
707 	struct rt6_info *rt, *nrt;
708 	int strict = 0;
709 	int attempts = 3;
710 	int err;
	/* Only insist on reachable routers when not forwarding. */
711 	int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
712 
713 	strict |= flags & RT6_LOOKUP_F_IFACE;
714 
715 relookup:
716 	read_lock_bh(&table->tb6_lock);
717 
718 restart_2:
719 	fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
720 
721 restart:
722 	rt = rt6_select(fn, oif, strict | reachable);
723 
	/* May jump to 'restart' or 'out' (see BACKTRACK). */
724 	BACKTRACK(net, &fl->fl6_src);
725 	if (rt == net->ipv6.ip6_null_entry ||
726 	    rt->rt6i_flags & RTF_CACHE)
727 		goto out;
728 
	/* Pin the route before dropping the table lock. */
729 	dst_hold(&rt->u.dst);
730 	read_unlock_bh(&table->tb6_lock);
731 
732 	if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
733 		nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
734 	else {
735 #if CLONE_OFFLINK_ROUTE
736 		nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
737 #else
		/* Use the selected route as-is; it is already held. */
738 		goto out2;
739 #endif
740 	}
741 
	/* Swap the original for the copy, or the null entry if copying failed. */
742 	dst_release(&rt->u.dst);
743 	rt = nrt ? : net->ipv6.ip6_null_entry;
744 
745 	dst_hold(&rt->u.dst);
746 	if (nrt) {
747 		err = ip6_ins_rt(nrt);
748 		if (!err)
749 			goto out2;
750 	}
751 
752 	if (--attempts <= 0)
753 		goto out2;
754 
755 	/*
756 	 * Race condition! In the gap, when table->tb6_lock was
757 	 * released someone could insert this route.  Relookup.
758 	 */
759 	dst_release(&rt->u.dst);
760 	goto relookup;
761 
762 out:
	/*
	 * The reachable-only pass ended in the null entry or a cached route:
	 * redo the lookup once without the reachability requirement (table
	 * lock still held).
	 */
763 	if (reachable) {
764 		reachable = 0;
765 		goto restart_2;
766 	}
767 	dst_hold(&rt->u.dst);
768 	read_unlock_bh(&table->tb6_lock);
769 out2:
770 	rt->u.dst.lastuse = jiffies;
771 	rt->u.dst.__use++;
772 
773 	return rt;
774 }
775 
776 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
777 					    struct flowi *fl, int flags)
778 {
779 	return ip6_pol_route(net, table, fl->iif, fl, flags);
780 }
781 
782 void ip6_route_input(struct sk_buff *skb)
783 {
784 	struct ipv6hdr *iph = ipv6_hdr(skb);
785 	struct net *net = dev_net(skb->dev);
786 	int flags = RT6_LOOKUP_F_HAS_SADDR;
787 	struct flowi fl = {
788 		.iif = skb->dev->ifindex,
789 		.nl_u = {
790 			.ip6_u = {
791 				.daddr = iph->daddr,
792 				.saddr = iph->saddr,
793 				.flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
794 			},
795 		},
796 		.mark = skb->mark,
797 		.proto = iph->nexthdr,
798 	};
799 
800 	if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
801 		flags |= RT6_LOOKUP_F_IFACE;
802 
803 	skb_dst_set(skb, fib6_rule_lookup(net, &fl, flags, ip6_pol_route_input));
804 }
805 
806 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
807 					     struct flowi *fl, int flags)
808 {
809 	return ip6_pol_route(net, table, fl->oif, fl, flags);
810 }
811 
812 struct dst_entry * ip6_route_output(struct net *net, struct sock *sk,
813 				    struct flowi *fl)
814 {
815 	int flags = 0;
816 
817 	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl->fl6_dst))
818 		flags |= RT6_LOOKUP_F_IFACE;
819 
820 	if (!ipv6_addr_any(&fl->fl6_src))
821 		flags |= RT6_LOOKUP_F_HAS_SADDR;
822 	else if (sk)
823 		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
824 
825 	return fib6_rule_lookup(net, fl, flags, ip6_pol_route_output);
826 }
827 
828 EXPORT_SYMBOL(ip6_route_output);
829 
830 int ip6_dst_blackhole(struct sock *sk, struct dst_entry **dstp, struct flowi *fl)
831 {
832 	struct rt6_info *ort = (struct rt6_info *) *dstp;
833 	struct rt6_info *rt = (struct rt6_info *)
834 		dst_alloc(&ip6_dst_blackhole_ops);
835 	struct dst_entry *new = NULL;
836 
837 	if (rt) {
838 		new = &rt->u.dst;
839 
840 		atomic_set(&new->__refcnt, 1);
841 		new->__use = 1;
842 		new->input = dst_discard;
843 		new->output = dst_discard;
844 
845 		memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
846 		new->dev = ort->u.dst.dev;
847 		if (new->dev)
848 			dev_hold(new->dev);
849 		rt->rt6i_idev = ort->rt6i_idev;
850 		if (rt->rt6i_idev)
851 			in6_dev_hold(rt->rt6i_idev);
852 		rt->rt6i_expires = 0;
853 
854 		ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
855 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
856 		rt->rt6i_metric = 0;
857 
858 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
859 #ifdef CONFIG_IPV6_SUBTREES
860 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
861 #endif
862 
863 		dst_free(new);
864 	}
865 
866 	dst_release(*dstp);
867 	*dstp = new;
868 	return (new ? 0 : -ENOMEM);
869 }
870 EXPORT_SYMBOL_GPL(ip6_dst_blackhole);
871 
872 /*
873  *	Destination cache support functions
874  */
875 
876 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
877 {
878 	struct rt6_info *rt;
879 
880 	rt = (struct rt6_info *) dst;
881 
882 	if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
883 		return dst;
884 
885 	return NULL;
886 }
887 
888 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
889 {
890 	struct rt6_info *rt = (struct rt6_info *) dst;
891 
892 	if (rt) {
893 		if (rt->rt6i_flags & RTF_CACHE) {
894 			if (rt6_check_expired(rt)) {
895 				ip6_del_rt(rt);
896 				dst = NULL;
897 			}
898 		} else {
899 			dst_release(dst);
900 			dst = NULL;
901 		}
902 	}
903 	return dst;
904 }
905 
906 static void ip6_link_failure(struct sk_buff *skb)
907 {
908 	struct rt6_info *rt;
909 
910 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
911 
912 	rt = (struct rt6_info *) skb_dst(skb);
913 	if (rt) {
914 		if (rt->rt6i_flags&RTF_CACHE) {
915 			dst_set_expires(&rt->u.dst, 0);
916 			rt->rt6i_flags |= RTF_EXPIRES;
917 		} else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
918 			rt->rt6i_node->fn_sernum = -1;
919 	}
920 }
921 
922 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
923 {
924 	struct rt6_info *rt6 = (struct rt6_info*)dst;
925 
926 	if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
927 		rt6->rt6i_flags |= RTF_MODIFIED;
928 		if (mtu < IPV6_MIN_MTU) {
929 			mtu = IPV6_MIN_MTU;
930 			dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
931 		}
932 		dst->metrics[RTAX_MTU-1] = mtu;
933 		call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
934 	}
935 }
936 
937 static int ipv6_get_mtu(struct net_device *dev);
938 
939 static inline unsigned int ipv6_advmss(struct net *net, unsigned int mtu)
940 {
941 	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
942 
943 	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
944 		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
945 
946 	/*
947 	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
948 	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
949 	 * IPV6_MAXPLEN is also valid and means: "any MSS,
950 	 * rely only on pmtu discovery"
951 	 */
952 	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
953 		mtu = IPV6_MAXPLEN;
954 	return mtu;
955 }
956 
/*
 * Singly linked list (through dst->next) of the dst entries created by
 * icmp6_dst_alloc(), and the spinlock that protects it. Entries are reaped
 * by icmp6_dst_gc() and icmp6_clean_all().
 */
957 static struct dst_entry *icmp6_dst_gc_list;
958 static DEFINE_SPINLOCK(icmp6_dst_lock);
959 
960 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
961 				  struct neighbour *neigh,
962 				  const struct in6_addr *addr)
963 {
964 	struct rt6_info *rt;
965 	struct inet6_dev *idev = in6_dev_get(dev);
966 	struct net *net = dev_net(dev);
967 
968 	if (unlikely(idev == NULL))
969 		return NULL;
970 
971 	rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
972 	if (unlikely(rt == NULL)) {
973 		in6_dev_put(idev);
974 		goto out;
975 	}
976 
977 	dev_hold(dev);
978 	if (neigh)
979 		neigh_hold(neigh);
980 	else {
981 		neigh = ndisc_get_neigh(dev, addr);
982 		if (IS_ERR(neigh))
983 			neigh = NULL;
984 	}
985 
986 	rt->rt6i_dev	  = dev;
987 	rt->rt6i_idev     = idev;
988 	rt->rt6i_nexthop  = neigh;
989 	atomic_set(&rt->u.dst.__refcnt, 1);
990 	rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
991 	rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
992 	rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->u.dst));
993 	rt->u.dst.output  = ip6_output;
994 
995 #if 0	/* there's no chance to use these for ndisc */
996 	rt->u.dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
997 				? DST_HOST
998 				: 0;
999 	ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1000 	rt->rt6i_dst.plen = 128;
1001 #endif
1002 
1003 	spin_lock_bh(&icmp6_dst_lock);
1004 	rt->u.dst.next = icmp6_dst_gc_list;
1005 	icmp6_dst_gc_list = &rt->u.dst;
1006 	spin_unlock_bh(&icmp6_dst_lock);
1007 
1008 	fib6_force_start_gc(net);
1009 
1010 out:
1011 	return &rt->u.dst;
1012 }
1013 
1014 int icmp6_dst_gc(void)
1015 {
1016 	struct dst_entry *dst, *next, **pprev;
1017 	int more = 0;
1018 
1019 	next = NULL;
1020 
1021 	spin_lock_bh(&icmp6_dst_lock);
1022 	pprev = &icmp6_dst_gc_list;
1023 
1024 	while ((dst = *pprev) != NULL) {
1025 		if (!atomic_read(&dst->__refcnt)) {
1026 			*pprev = dst->next;
1027 			dst_free(dst);
1028 		} else {
1029 			pprev = &dst->next;
1030 			++more;
1031 		}
1032 	}
1033 
1034 	spin_unlock_bh(&icmp6_dst_lock);
1035 
1036 	return more;
1037 }
1038 
1039 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1040 			    void *arg)
1041 {
1042 	struct dst_entry *dst, **pprev;
1043 
1044 	spin_lock_bh(&icmp6_dst_lock);
1045 	pprev = &icmp6_dst_gc_list;
1046 	while ((dst = *pprev) != NULL) {
1047 		struct rt6_info *rt = (struct rt6_info *) dst;
1048 		if (func(rt, arg)) {
1049 			*pprev = dst->next;
1050 			dst_free(dst);
1051 		} else {
1052 			pprev = &dst->next;
1053 		}
1054 	}
1055 	spin_unlock_bh(&icmp6_dst_lock);
1056 }
1057 
1058 static int ip6_dst_gc(struct dst_ops *ops)
1059 {
1060 	unsigned long now = jiffies;
1061 	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1062 	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1063 	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1064 	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1065 	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1066 	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1067 
1068 	if (time_after(rt_last_gc + rt_min_interval, now) &&
1069 	    atomic_read(&ops->entries) <= rt_max_size)
1070 		goto out;
1071 
1072 	net->ipv6.ip6_rt_gc_expire++;
1073 	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1074 	net->ipv6.ip6_rt_last_gc = now;
1075 	if (atomic_read(&ops->entries) < ops->gc_thresh)
1076 		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1077 out:
1078 	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1079 	return (atomic_read(&ops->entries) > rt_max_size);
1080 }
1081 
1082 /* Clean host part of a prefix. Not necessary in radix tree,
1083    but results in cleaner routing tables.
1084 
1085    Remove it only when all the things will work!
1086  */
1087 
1088 static int ipv6_get_mtu(struct net_device *dev)
1089 {
1090 	int mtu = IPV6_MIN_MTU;
1091 	struct inet6_dev *idev;
1092 
1093 	idev = in6_dev_get(dev);
1094 	if (idev) {
1095 		mtu = idev->cnf.mtu6;
1096 		in6_dev_put(idev);
1097 	}
1098 	return mtu;
1099 }
1100 
1101 int ip6_dst_hoplimit(struct dst_entry *dst)
1102 {
1103 	int hoplimit = dst_metric(dst, RTAX_HOPLIMIT);
1104 	if (hoplimit < 0) {
1105 		struct net_device *dev = dst->dev;
1106 		struct inet6_dev *idev = in6_dev_get(dev);
1107 		if (idev) {
1108 			hoplimit = idev->cnf.hop_limit;
1109 			in6_dev_put(idev);
1110 		} else
1111 			hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1112 	}
1113 	return hoplimit;
1114 }
1115 
1116 /*
1117  *
1118  */
1119 
1120 int ip6_route_add(struct fib6_config *cfg)
1121 {
1122 	int err;
1123 	struct net *net = cfg->fc_nlinfo.nl_net;
1124 	struct rt6_info *rt = NULL;
1125 	struct net_device *dev = NULL;
1126 	struct inet6_dev *idev = NULL;
1127 	struct fib6_table *table;
1128 	int addr_type;
1129 
1130 	if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1131 		return -EINVAL;
1132 #ifndef CONFIG_IPV6_SUBTREES
1133 	if (cfg->fc_src_len)
1134 		return -EINVAL;
1135 #endif
1136 	if (cfg->fc_ifindex) {
1137 		err = -ENODEV;
1138 		dev = dev_get_by_index(net, cfg->fc_ifindex);
1139 		if (!dev)
1140 			goto out;
1141 		idev = in6_dev_get(dev);
1142 		if (!idev)
1143 			goto out;
1144 	}
1145 
1146 	if (cfg->fc_metric == 0)
1147 		cfg->fc_metric = IP6_RT_PRIO_USER;
1148 
1149 	table = fib6_new_table(net, cfg->fc_table);
1150 	if (table == NULL) {
1151 		err = -ENOBUFS;
1152 		goto out;
1153 	}
1154 
1155 	rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1156 
1157 	if (rt == NULL) {
1158 		err = -ENOMEM;
1159 		goto out;
1160 	}
1161 
1162 	rt->u.dst.obsolete = -1;
1163 	rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
1164 				jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1165 				0;
1166 
1167 	if (cfg->fc_protocol == RTPROT_UNSPEC)
1168 		cfg->fc_protocol = RTPROT_BOOT;
1169 	rt->rt6i_protocol = cfg->fc_protocol;
1170 
1171 	addr_type = ipv6_addr_type(&cfg->fc_dst);
1172 
1173 	if (addr_type & IPV6_ADDR_MULTICAST)
1174 		rt->u.dst.input = ip6_mc_input;
1175 	else
1176 		rt->u.dst.input = ip6_forward;
1177 
1178 	rt->u.dst.output = ip6_output;
1179 
1180 	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1181 	rt->rt6i_dst.plen = cfg->fc_dst_len;
1182 	if (rt->rt6i_dst.plen == 128)
1183 	       rt->u.dst.flags = DST_HOST;
1184 
1185 #ifdef CONFIG_IPV6_SUBTREES
1186 	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1187 	rt->rt6i_src.plen = cfg->fc_src_len;
1188 #endif
1189 
1190 	rt->rt6i_metric = cfg->fc_metric;
1191 
1192 	/* We cannot add true routes via loopback here,
1193 	   they would result in kernel looping; promote them to reject routes
1194 	 */
1195 	if ((cfg->fc_flags & RTF_REJECT) ||
1196 	    (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
1197 		/* hold loopback dev/idev if we haven't done so. */
1198 		if (dev != net->loopback_dev) {
1199 			if (dev) {
1200 				dev_put(dev);
1201 				in6_dev_put(idev);
1202 			}
1203 			dev = net->loopback_dev;
1204 			dev_hold(dev);
1205 			idev = in6_dev_get(dev);
1206 			if (!idev) {
1207 				err = -ENODEV;
1208 				goto out;
1209 			}
1210 		}
1211 		rt->u.dst.output = ip6_pkt_discard_out;
1212 		rt->u.dst.input = ip6_pkt_discard;
1213 		rt->u.dst.error = -ENETUNREACH;
1214 		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1215 		goto install_route;
1216 	}
1217 
1218 	if (cfg->fc_flags & RTF_GATEWAY) {
1219 		struct in6_addr *gw_addr;
1220 		int gwa_type;
1221 
1222 		gw_addr = &cfg->fc_gateway;
1223 		ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1224 		gwa_type = ipv6_addr_type(gw_addr);
1225 
1226 		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1227 			struct rt6_info *grt;
1228 
1229 			/* IPv6 strictly inhibits using not link-local
1230 			   addresses as nexthop address.
1231 			   Otherwise, router will not able to send redirects.
1232 			   It is very good, but in some (rare!) circumstances
1233 			   (SIT, PtP, NBMA NOARP links) it is handy to allow
1234 			   some exceptions. --ANK
1235 			 */
1236 			err = -EINVAL;
1237 			if (!(gwa_type&IPV6_ADDR_UNICAST))
1238 				goto out;
1239 
1240 			grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1241 
1242 			err = -EHOSTUNREACH;
1243 			if (grt == NULL)
1244 				goto out;
1245 			if (dev) {
1246 				if (dev != grt->rt6i_dev) {
1247 					dst_release(&grt->u.dst);
1248 					goto out;
1249 				}
1250 			} else {
1251 				dev = grt->rt6i_dev;
1252 				idev = grt->rt6i_idev;
1253 				dev_hold(dev);
1254 				in6_dev_hold(grt->rt6i_idev);
1255 			}
1256 			if (!(grt->rt6i_flags&RTF_GATEWAY))
1257 				err = 0;
1258 			dst_release(&grt->u.dst);
1259 
1260 			if (err)
1261 				goto out;
1262 		}
1263 		err = -EINVAL;
1264 		if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1265 			goto out;
1266 	}
1267 
1268 	err = -ENODEV;
1269 	if (dev == NULL)
1270 		goto out;
1271 
1272 	if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1273 		rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1274 		if (IS_ERR(rt->rt6i_nexthop)) {
1275 			err = PTR_ERR(rt->rt6i_nexthop);
1276 			rt->rt6i_nexthop = NULL;
1277 			goto out;
1278 		}
1279 	}
1280 
1281 	rt->rt6i_flags = cfg->fc_flags;
1282 
1283 install_route:
1284 	if (cfg->fc_mx) {
1285 		struct nlattr *nla;
1286 		int remaining;
1287 
1288 		nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1289 			int type = nla_type(nla);
1290 
1291 			if (type) {
1292 				if (type > RTAX_MAX) {
1293 					err = -EINVAL;
1294 					goto out;
1295 				}
1296 
1297 				rt->u.dst.metrics[type - 1] = nla_get_u32(nla);
1298 			}
1299 		}
1300 	}
1301 
1302 	if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
1303 		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1304 	if (!dst_mtu(&rt->u.dst))
1305 		rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1306 	if (!dst_metric(&rt->u.dst, RTAX_ADVMSS))
1307 		rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->u.dst));
1308 	rt->u.dst.dev = dev;
1309 	rt->rt6i_idev = idev;
1310 	rt->rt6i_table = table;
1311 
1312 	cfg->fc_nlinfo.nl_net = dev_net(dev);
1313 
1314 	return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1315 
1316 out:
1317 	if (dev)
1318 		dev_put(dev);
1319 	if (idev)
1320 		in6_dev_put(idev);
1321 	if (rt)
1322 		dst_free(&rt->u.dst);
1323 	return err;
1324 }
1325 
1326 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1327 {
1328 	int err;
1329 	struct fib6_table *table;
1330 	struct net *net = dev_net(rt->rt6i_dev);
1331 
1332 	if (rt == net->ipv6.ip6_null_entry)
1333 		return -ENOENT;
1334 
1335 	table = rt->rt6i_table;
1336 	write_lock_bh(&table->tb6_lock);
1337 
1338 	err = fib6_del(rt, info);
1339 	dst_release(&rt->u.dst);
1340 
1341 	write_unlock_bh(&table->tb6_lock);
1342 
1343 	return err;
1344 }
1345 
1346 int ip6_del_rt(struct rt6_info *rt)
1347 {
1348 	struct nl_info info = {
1349 		.nl_net = dev_net(rt->rt6i_dev),
1350 	};
1351 	return __ip6_del_rt(rt, &info);
1352 }
1353 
1354 static int ip6_route_del(struct fib6_config *cfg)
1355 {
1356 	struct fib6_table *table;
1357 	struct fib6_node *fn;
1358 	struct rt6_info *rt;
1359 	int err = -ESRCH;
1360 
1361 	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1362 	if (table == NULL)
1363 		return err;
1364 
1365 	read_lock_bh(&table->tb6_lock);
1366 
1367 	fn = fib6_locate(&table->tb6_root,
1368 			 &cfg->fc_dst, cfg->fc_dst_len,
1369 			 &cfg->fc_src, cfg->fc_src_len);
1370 
1371 	if (fn) {
1372 		for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1373 			if (cfg->fc_ifindex &&
1374 			    (rt->rt6i_dev == NULL ||
1375 			     rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1376 				continue;
1377 			if (cfg->fc_flags & RTF_GATEWAY &&
1378 			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1379 				continue;
1380 			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1381 				continue;
1382 			dst_hold(&rt->u.dst);
1383 			read_unlock_bh(&table->tb6_lock);
1384 
1385 			return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1386 		}
1387 	}
1388 	read_unlock_bh(&table->tb6_lock);
1389 
1390 	return err;
1391 }
1392 
1393 /*
1394  *	Handle redirects
1395  */
/*
 * Lookup key for redirect handling: an ordinary flow plus the address of
 * the router that sent the redirect.
 */
struct ip6rd_flowi {
	/* kept first: pointers to this struct are cast to/from struct flowi * */
	struct flowi fl;
	/* address the candidate route's gateway is compared against */
	struct in6_addr gateway;
};
1400 
1401 static struct rt6_info *__ip6_route_redirect(struct net *net,
1402 					     struct fib6_table *table,
1403 					     struct flowi *fl,
1404 					     int flags)
1405 {
1406 	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl;
1407 	struct rt6_info *rt;
1408 	struct fib6_node *fn;
1409 
1410 	/*
1411 	 * Get the "current" route for this destination and
1412 	 * check if the redirect has come from approriate router.
1413 	 *
1414 	 * RFC 2461 specifies that redirects should only be
1415 	 * accepted if they come from the nexthop to the target.
1416 	 * Due to the way the routes are chosen, this notion
1417 	 * is a bit fuzzy and one might need to check all possible
1418 	 * routes.
1419 	 */
1420 
1421 	read_lock_bh(&table->tb6_lock);
1422 	fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
1423 restart:
1424 	for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1425 		/*
1426 		 * Current route is on-link; redirect is always invalid.
1427 		 *
1428 		 * Seems, previous statement is not true. It could
1429 		 * be node, which looks for us as on-link (f.e. proxy ndisc)
1430 		 * But then router serving it might decide, that we should
1431 		 * know truth 8)8) --ANK (980726).
1432 		 */
1433 		if (rt6_check_expired(rt))
1434 			continue;
1435 		if (!(rt->rt6i_flags & RTF_GATEWAY))
1436 			continue;
1437 		if (fl->oif != rt->rt6i_dev->ifindex)
1438 			continue;
1439 		if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1440 			continue;
1441 		break;
1442 	}
1443 
1444 	if (!rt)
1445 		rt = net->ipv6.ip6_null_entry;
1446 	BACKTRACK(net, &fl->fl6_src);
1447 out:
1448 	dst_hold(&rt->u.dst);
1449 
1450 	read_unlock_bh(&table->tb6_lock);
1451 
1452 	return rt;
1453 };
1454 
1455 static struct rt6_info *ip6_route_redirect(struct in6_addr *dest,
1456 					   struct in6_addr *src,
1457 					   struct in6_addr *gateway,
1458 					   struct net_device *dev)
1459 {
1460 	int flags = RT6_LOOKUP_F_HAS_SADDR;
1461 	struct net *net = dev_net(dev);
1462 	struct ip6rd_flowi rdfl = {
1463 		.fl = {
1464 			.oif = dev->ifindex,
1465 			.nl_u = {
1466 				.ip6_u = {
1467 					.daddr = *dest,
1468 					.saddr = *src,
1469 				},
1470 			},
1471 		},
1472 	};
1473 
1474 	ipv6_addr_copy(&rdfl.gateway, gateway);
1475 
1476 	if (rt6_need_strict(dest))
1477 		flags |= RT6_LOOKUP_F_IFACE;
1478 
1479 	return (struct rt6_info *)fib6_rule_lookup(net, (struct flowi *)&rdfl,
1480 						   flags, __ip6_route_redirect);
1481 }
1482 
1483 void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
1484 		  struct in6_addr *saddr,
1485 		  struct neighbour *neigh, u8 *lladdr, int on_link)
1486 {
1487 	struct rt6_info *rt, *nrt = NULL;
1488 	struct netevent_redirect netevent;
1489 	struct net *net = dev_net(neigh->dev);
1490 
1491 	rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1492 
1493 	if (rt == net->ipv6.ip6_null_entry) {
1494 		if (net_ratelimit())
1495 			printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1496 			       "for redirect target\n");
1497 		goto out;
1498 	}
1499 
1500 	/*
1501 	 *	We have finally decided to accept it.
1502 	 */
1503 
1504 	neigh_update(neigh, lladdr, NUD_STALE,
1505 		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
1506 		     NEIGH_UPDATE_F_OVERRIDE|
1507 		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1508 				     NEIGH_UPDATE_F_ISROUTER))
1509 		     );
1510 
1511 	/*
1512 	 * Redirect received -> path was valid.
1513 	 * Look, redirects are sent only in response to data packets,
1514 	 * so that this nexthop apparently is reachable. --ANK
1515 	 */
1516 	dst_confirm(&rt->u.dst);
1517 
1518 	/* Duplicate redirect: silently ignore. */
1519 	if (neigh == rt->u.dst.neighbour)
1520 		goto out;
1521 
1522 	nrt = ip6_rt_copy(rt);
1523 	if (nrt == NULL)
1524 		goto out;
1525 
1526 	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1527 	if (on_link)
1528 		nrt->rt6i_flags &= ~RTF_GATEWAY;
1529 
1530 	ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1531 	nrt->rt6i_dst.plen = 128;
1532 	nrt->u.dst.flags |= DST_HOST;
1533 
1534 	ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1535 	nrt->rt6i_nexthop = neigh_clone(neigh);
1536 	/* Reset pmtu, it may be better */
1537 	nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1538 	nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dev_net(neigh->dev),
1539 							dst_mtu(&nrt->u.dst));
1540 
1541 	if (ip6_ins_rt(nrt))
1542 		goto out;
1543 
1544 	netevent.old = &rt->u.dst;
1545 	netevent.new = &nrt->u.dst;
1546 	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1547 
1548 	if (rt->rt6i_flags&RTF_CACHE) {
1549 		ip6_del_rt(rt);
1550 		return;
1551 	}
1552 
1553 out:
1554 	dst_release(&rt->u.dst);
1555 }
1556 
1557 /*
1558  *	Handle ICMP "packet too big" messages
1559  *	i.e. Path MTU discovery
1560  */
1561 
1562 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1563 			struct net_device *dev, u32 pmtu)
1564 {
1565 	struct rt6_info *rt, *nrt;
1566 	struct net *net = dev_net(dev);
1567 	int allfrag = 0;
1568 
1569 	rt = rt6_lookup(net, daddr, saddr, dev->ifindex, 0);
1570 	if (rt == NULL)
1571 		return;
1572 
1573 	if (pmtu >= dst_mtu(&rt->u.dst))
1574 		goto out;
1575 
1576 	if (pmtu < IPV6_MIN_MTU) {
1577 		/*
1578 		 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1579 		 * MTU (1280) and a fragment header should always be included
1580 		 * after a node receiving Too Big message reporting PMTU is
1581 		 * less than the IPv6 Minimum Link MTU.
1582 		 */
1583 		pmtu = IPV6_MIN_MTU;
1584 		allfrag = 1;
1585 	}
1586 
1587 	/* New mtu received -> path was valid.
1588 	   They are sent only in response to data packets,
1589 	   so that this nexthop apparently is reachable. --ANK
1590 	 */
1591 	dst_confirm(&rt->u.dst);
1592 
1593 	/* Host route. If it is static, it would be better
1594 	   not to override it, but add new one, so that
1595 	   when cache entry will expire old pmtu
1596 	   would return automatically.
1597 	 */
1598 	if (rt->rt6i_flags & RTF_CACHE) {
1599 		rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1600 		if (allfrag)
1601 			rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1602 		dst_set_expires(&rt->u.dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1603 		rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1604 		goto out;
1605 	}
1606 
1607 	/* Network route.
1608 	   Two cases are possible:
1609 	   1. It is connected route. Action: COW
1610 	   2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1611 	 */
1612 	if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1613 		nrt = rt6_alloc_cow(rt, daddr, saddr);
1614 	else
1615 		nrt = rt6_alloc_clone(rt, daddr);
1616 
1617 	if (nrt) {
1618 		nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1619 		if (allfrag)
1620 			nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1621 
1622 		/* According to RFC 1981, detecting PMTU increase shouldn't be
1623 		 * happened within 5 mins, the recommended timer is 10 mins.
1624 		 * Here this route expiration time is set to ip6_rt_mtu_expires
1625 		 * which is 10 mins. After 10 mins the decreased pmtu is expired
1626 		 * and detecting PMTU increase will be automatically happened.
1627 		 */
1628 		dst_set_expires(&nrt->u.dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1629 		nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1630 
1631 		ip6_ins_rt(nrt);
1632 	}
1633 out:
1634 	dst_release(&rt->u.dst);
1635 }
1636 
1637 /*
1638  *	Misc support functions
1639  */
1640 
1641 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1642 {
1643 	struct net *net = dev_net(ort->rt6i_dev);
1644 	struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1645 
1646 	if (rt) {
1647 		rt->u.dst.input = ort->u.dst.input;
1648 		rt->u.dst.output = ort->u.dst.output;
1649 
1650 		memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1651 		rt->u.dst.error = ort->u.dst.error;
1652 		rt->u.dst.dev = ort->u.dst.dev;
1653 		if (rt->u.dst.dev)
1654 			dev_hold(rt->u.dst.dev);
1655 		rt->rt6i_idev = ort->rt6i_idev;
1656 		if (rt->rt6i_idev)
1657 			in6_dev_hold(rt->rt6i_idev);
1658 		rt->u.dst.lastuse = jiffies;
1659 		rt->rt6i_expires = 0;
1660 
1661 		ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1662 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1663 		rt->rt6i_metric = 0;
1664 
1665 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1666 #ifdef CONFIG_IPV6_SUBTREES
1667 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1668 #endif
1669 		rt->rt6i_table = ort->rt6i_table;
1670 	}
1671 	return rt;
1672 }
1673 
1674 #ifdef CONFIG_IPV6_ROUTE_INFO
1675 static struct rt6_info *rt6_get_route_info(struct net *net,
1676 					   struct in6_addr *prefix, int prefixlen,
1677 					   struct in6_addr *gwaddr, int ifindex)
1678 {
1679 	struct fib6_node *fn;
1680 	struct rt6_info *rt = NULL;
1681 	struct fib6_table *table;
1682 
1683 	table = fib6_get_table(net, RT6_TABLE_INFO);
1684 	if (table == NULL)
1685 		return NULL;
1686 
1687 	write_lock_bh(&table->tb6_lock);
1688 	fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1689 	if (!fn)
1690 		goto out;
1691 
1692 	for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1693 		if (rt->rt6i_dev->ifindex != ifindex)
1694 			continue;
1695 		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1696 			continue;
1697 		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1698 			continue;
1699 		dst_hold(&rt->u.dst);
1700 		break;
1701 	}
1702 out:
1703 	write_unlock_bh(&table->tb6_lock);
1704 	return rt;
1705 }
1706 
1707 static struct rt6_info *rt6_add_route_info(struct net *net,
1708 					   struct in6_addr *prefix, int prefixlen,
1709 					   struct in6_addr *gwaddr, int ifindex,
1710 					   unsigned pref)
1711 {
1712 	struct fib6_config cfg = {
1713 		.fc_table	= RT6_TABLE_INFO,
1714 		.fc_metric	= IP6_RT_PRIO_USER,
1715 		.fc_ifindex	= ifindex,
1716 		.fc_dst_len	= prefixlen,
1717 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1718 				  RTF_UP | RTF_PREF(pref),
1719 		.fc_nlinfo.pid = 0,
1720 		.fc_nlinfo.nlh = NULL,
1721 		.fc_nlinfo.nl_net = net,
1722 	};
1723 
1724 	ipv6_addr_copy(&cfg.fc_dst, prefix);
1725 	ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1726 
1727 	/* We should treat it as a default route if prefix length is 0. */
1728 	if (!prefixlen)
1729 		cfg.fc_flags |= RTF_DEFAULT;
1730 
1731 	ip6_route_add(&cfg);
1732 
1733 	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1734 }
1735 #endif
1736 
1737 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1738 {
1739 	struct rt6_info *rt;
1740 	struct fib6_table *table;
1741 
1742 	table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1743 	if (table == NULL)
1744 		return NULL;
1745 
1746 	write_lock_bh(&table->tb6_lock);
1747 	for (rt = table->tb6_root.leaf; rt; rt=rt->u.dst.rt6_next) {
1748 		if (dev == rt->rt6i_dev &&
1749 		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1750 		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
1751 			break;
1752 	}
1753 	if (rt)
1754 		dst_hold(&rt->u.dst);
1755 	write_unlock_bh(&table->tb6_lock);
1756 	return rt;
1757 }
1758 
1759 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1760 				     struct net_device *dev,
1761 				     unsigned int pref)
1762 {
1763 	struct fib6_config cfg = {
1764 		.fc_table	= RT6_TABLE_DFLT,
1765 		.fc_metric	= IP6_RT_PRIO_USER,
1766 		.fc_ifindex	= dev->ifindex,
1767 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1768 				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1769 		.fc_nlinfo.pid = 0,
1770 		.fc_nlinfo.nlh = NULL,
1771 		.fc_nlinfo.nl_net = dev_net(dev),
1772 	};
1773 
1774 	ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1775 
1776 	ip6_route_add(&cfg);
1777 
1778 	return rt6_get_dflt_router(gwaddr, dev);
1779 }
1780 
1781 void rt6_purge_dflt_routers(struct net *net)
1782 {
1783 	struct rt6_info *rt;
1784 	struct fib6_table *table;
1785 
1786 	/* NOTE: Keep consistent with rt6_get_dflt_router */
1787 	table = fib6_get_table(net, RT6_TABLE_DFLT);
1788 	if (table == NULL)
1789 		return;
1790 
1791 restart:
1792 	read_lock_bh(&table->tb6_lock);
1793 	for (rt = table->tb6_root.leaf; rt; rt = rt->u.dst.rt6_next) {
1794 		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1795 			dst_hold(&rt->u.dst);
1796 			read_unlock_bh(&table->tb6_lock);
1797 			ip6_del_rt(rt);
1798 			goto restart;
1799 		}
1800 	}
1801 	read_unlock_bh(&table->tb6_lock);
1802 }
1803 
1804 static void rtmsg_to_fib6_config(struct net *net,
1805 				 struct in6_rtmsg *rtmsg,
1806 				 struct fib6_config *cfg)
1807 {
1808 	memset(cfg, 0, sizeof(*cfg));
1809 
1810 	cfg->fc_table = RT6_TABLE_MAIN;
1811 	cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1812 	cfg->fc_metric = rtmsg->rtmsg_metric;
1813 	cfg->fc_expires = rtmsg->rtmsg_info;
1814 	cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1815 	cfg->fc_src_len = rtmsg->rtmsg_src_len;
1816 	cfg->fc_flags = rtmsg->rtmsg_flags;
1817 
1818 	cfg->fc_nlinfo.nl_net = net;
1819 
1820 	ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1821 	ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1822 	ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1823 }
1824 
1825 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1826 {
1827 	struct fib6_config cfg;
1828 	struct in6_rtmsg rtmsg;
1829 	int err;
1830 
1831 	switch(cmd) {
1832 	case SIOCADDRT:		/* Add a route */
1833 	case SIOCDELRT:		/* Delete a route */
1834 		if (!capable(CAP_NET_ADMIN))
1835 			return -EPERM;
1836 		err = copy_from_user(&rtmsg, arg,
1837 				     sizeof(struct in6_rtmsg));
1838 		if (err)
1839 			return -EFAULT;
1840 
1841 		rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1842 
1843 		rtnl_lock();
1844 		switch (cmd) {
1845 		case SIOCADDRT:
1846 			err = ip6_route_add(&cfg);
1847 			break;
1848 		case SIOCDELRT:
1849 			err = ip6_route_del(&cfg);
1850 			break;
1851 		default:
1852 			err = -EINVAL;
1853 		}
1854 		rtnl_unlock();
1855 
1856 		return err;
1857 	}
1858 
1859 	return -EINVAL;
1860 }
1861 
1862 /*
1863  *	Drop the packet on the floor
1864  */
1865 
1866 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
1867 {
1868 	int type;
1869 	struct dst_entry *dst = skb_dst(skb);
1870 	switch (ipstats_mib_noroutes) {
1871 	case IPSTATS_MIB_INNOROUTES:
1872 		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1873 		if (type == IPV6_ADDR_ANY) {
1874 			IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1875 				      IPSTATS_MIB_INADDRERRORS);
1876 			break;
1877 		}
1878 		/* FALLTHROUGH */
1879 	case IPSTATS_MIB_OUTNOROUTES:
1880 		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1881 			      ipstats_mib_noroutes);
1882 		break;
1883 	}
1884 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
1885 	kfree_skb(skb);
1886 	return 0;
1887 }
1888 
1889 static int ip6_pkt_discard(struct sk_buff *skb)
1890 {
1891 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
1892 }
1893 
1894 static int ip6_pkt_discard_out(struct sk_buff *skb)
1895 {
1896 	skb->dev = skb_dst(skb)->dev;
1897 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
1898 }
1899 
1900 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1901 
1902 static int ip6_pkt_prohibit(struct sk_buff *skb)
1903 {
1904 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
1905 }
1906 
1907 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
1908 {
1909 	skb->dev = skb_dst(skb)->dev;
1910 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
1911 }
1912 
1913 #endif
1914 
1915 /*
1916  *	Allocate a dst for local (unicast / anycast) address.
1917  */
1918 
1919 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1920 				    const struct in6_addr *addr,
1921 				    int anycast)
1922 {
1923 	struct net *net = dev_net(idev->dev);
1924 	struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1925 	struct neighbour *neigh;
1926 
1927 	if (rt == NULL)
1928 		return ERR_PTR(-ENOMEM);
1929 
1930 	dev_hold(net->loopback_dev);
1931 	in6_dev_hold(idev);
1932 
1933 	rt->u.dst.flags = DST_HOST;
1934 	rt->u.dst.input = ip6_input;
1935 	rt->u.dst.output = ip6_output;
1936 	rt->rt6i_dev = net->loopback_dev;
1937 	rt->rt6i_idev = idev;
1938 	rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1939 	rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->u.dst));
1940 	rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1941 	rt->u.dst.obsolete = -1;
1942 
1943 	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1944 	if (anycast)
1945 		rt->rt6i_flags |= RTF_ANYCAST;
1946 	else
1947 		rt->rt6i_flags |= RTF_LOCAL;
1948 	neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1949 	if (IS_ERR(neigh)) {
1950 		dst_free(&rt->u.dst);
1951 
1952 		/* We are casting this because that is the return
1953 		 * value type.  But an errno encoded pointer is the
1954 		 * same regardless of the underlying pointer type,
1955 		 * and that's what we are returning.  So this is OK.
1956 		 */
1957 		return (struct rt6_info *) neigh;
1958 	}
1959 	rt->rt6i_nexthop = neigh;
1960 
1961 	ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1962 	rt->rt6i_dst.plen = 128;
1963 	rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
1964 
1965 	atomic_set(&rt->u.dst.__refcnt, 1);
1966 
1967 	return rt;
1968 }
1969 
/* Argument bundle passed from rt6_ifdown() to the fib6_ifdown() callback. */
struct arg_dev_net {
	/* device going down; NULL makes fib6_ifdown() match every device */
	struct net_device *dev;
	/* namespace whose ip6_null_entry must be left alone */
	struct net *net;
};
1974 
1975 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1976 {
1977 	struct net_device *dev = ((struct arg_dev_net *)arg)->dev;
1978 	struct net *net = ((struct arg_dev_net *)arg)->net;
1979 
1980 	if (((void *)rt->rt6i_dev == dev || dev == NULL) &&
1981 	    rt != net->ipv6.ip6_null_entry) {
1982 		RT6_TRACE("deleted by ifdown %p\n", rt);
1983 		return -1;
1984 	}
1985 	return 0;
1986 }
1987 
1988 void rt6_ifdown(struct net *net, struct net_device *dev)
1989 {
1990 	struct arg_dev_net adn = {
1991 		.dev = dev,
1992 		.net = net,
1993 	};
1994 
1995 	fib6_clean_all(net, fib6_ifdown, 0, &adn);
1996 	icmp6_clean_all(fib6_ifdown, &adn);
1997 }
1998 
/* Argument bundle passed from rt6_mtu_change() to rt6_mtu_change_route(). */
struct rt6_mtu_change_arg
{
	/* device whose MTU changed; only routes on it are touched */
	struct net_device *dev;
	/* the new MTU value applied to qualifying routes */
	unsigned mtu;
};
2004 
2005 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2006 {
2007 	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2008 	struct inet6_dev *idev;
2009 	struct net *net = dev_net(arg->dev);
2010 
2011 	/* In IPv6 pmtu discovery is not optional,
2012 	   so that RTAX_MTU lock cannot disable it.
2013 	   We still use this lock to block changes
2014 	   caused by addrconf/ndisc.
2015 	*/
2016 
2017 	idev = __in6_dev_get(arg->dev);
2018 	if (idev == NULL)
2019 		return 0;
2020 
2021 	/* For administrative MTU increase, there is no way to discover
2022 	   IPv6 PMTU increase, so PMTU increase should be updated here.
2023 	   Since RFC 1981 doesn't include administrative MTU increase
2024 	   update PMTU increase is a MUST. (i.e. jumbo frame)
2025 	 */
2026 	/*
2027 	   If new MTU is less than route PMTU, this new MTU will be the
2028 	   lowest MTU in the path, update the route PMTU to reflect PMTU
2029 	   decreases; if new MTU is greater than route PMTU, and the
2030 	   old MTU is the lowest MTU in the path, update the route PMTU
2031 	   to reflect the increase. In this case if the other nodes' MTU
2032 	   also have the lowest MTU, TOO BIG MESSAGE will be lead to
2033 	   PMTU discouvery.
2034 	 */
2035 	if (rt->rt6i_dev == arg->dev &&
2036 	    !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
2037 	    (dst_mtu(&rt->u.dst) >= arg->mtu ||
2038 	     (dst_mtu(&rt->u.dst) < arg->mtu &&
2039 	      dst_mtu(&rt->u.dst) == idev->cnf.mtu6))) {
2040 		rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
2041 		rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, arg->mtu);
2042 	}
2043 	return 0;
2044 }
2045 
2046 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2047 {
2048 	struct rt6_mtu_change_arg arg = {
2049 		.dev = dev,
2050 		.mtu = mtu,
2051 	};
2052 
2053 	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2054 }
2055 
2056 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2057 	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2058 	[RTA_OIF]               = { .type = NLA_U32 },
2059 	[RTA_IIF]		= { .type = NLA_U32 },
2060 	[RTA_PRIORITY]          = { .type = NLA_U32 },
2061 	[RTA_METRICS]           = { .type = NLA_NESTED },
2062 };
2063 
2064 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2065 			      struct fib6_config *cfg)
2066 {
2067 	struct rtmsg *rtm;
2068 	struct nlattr *tb[RTA_MAX+1];
2069 	int err;
2070 
2071 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2072 	if (err < 0)
2073 		goto errout;
2074 
2075 	err = -EINVAL;
2076 	rtm = nlmsg_data(nlh);
2077 	memset(cfg, 0, sizeof(*cfg));
2078 
2079 	cfg->fc_table = rtm->rtm_table;
2080 	cfg->fc_dst_len = rtm->rtm_dst_len;
2081 	cfg->fc_src_len = rtm->rtm_src_len;
2082 	cfg->fc_flags = RTF_UP;
2083 	cfg->fc_protocol = rtm->rtm_protocol;
2084 
2085 	if (rtm->rtm_type == RTN_UNREACHABLE)
2086 		cfg->fc_flags |= RTF_REJECT;
2087 
2088 	cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2089 	cfg->fc_nlinfo.nlh = nlh;
2090 	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2091 
2092 	if (tb[RTA_GATEWAY]) {
2093 		nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2094 		cfg->fc_flags |= RTF_GATEWAY;
2095 	}
2096 
2097 	if (tb[RTA_DST]) {
2098 		int plen = (rtm->rtm_dst_len + 7) >> 3;
2099 
2100 		if (nla_len(tb[RTA_DST]) < plen)
2101 			goto errout;
2102 
2103 		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2104 	}
2105 
2106 	if (tb[RTA_SRC]) {
2107 		int plen = (rtm->rtm_src_len + 7) >> 3;
2108 
2109 		if (nla_len(tb[RTA_SRC]) < plen)
2110 			goto errout;
2111 
2112 		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2113 	}
2114 
2115 	if (tb[RTA_OIF])
2116 		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2117 
2118 	if (tb[RTA_PRIORITY])
2119 		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2120 
2121 	if (tb[RTA_METRICS]) {
2122 		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2123 		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2124 	}
2125 
2126 	if (tb[RTA_TABLE])
2127 		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2128 
2129 	err = 0;
2130 errout:
2131 	return err;
2132 }
2133 
2134 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2135 {
2136 	struct fib6_config cfg;
2137 	int err;
2138 
2139 	err = rtm_to_fib6_config(skb, nlh, &cfg);
2140 	if (err < 0)
2141 		return err;
2142 
2143 	return ip6_route_del(&cfg);
2144 }
2145 
2146 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2147 {
2148 	struct fib6_config cfg;
2149 	int err;
2150 
2151 	err = rtm_to_fib6_config(skb, nlh, &cfg);
2152 	if (err < 0)
2153 		return err;
2154 
2155 	return ip6_route_add(&cfg);
2156 }
2157 
2158 static inline size_t rt6_nlmsg_size(void)
2159 {
2160 	return NLMSG_ALIGN(sizeof(struct rtmsg))
2161 	       + nla_total_size(16) /* RTA_SRC */
2162 	       + nla_total_size(16) /* RTA_DST */
2163 	       + nla_total_size(16) /* RTA_GATEWAY */
2164 	       + nla_total_size(16) /* RTA_PREFSRC */
2165 	       + nla_total_size(4) /* RTA_TABLE */
2166 	       + nla_total_size(4) /* RTA_IIF */
2167 	       + nla_total_size(4) /* RTA_OIF */
2168 	       + nla_total_size(4) /* RTA_PRIORITY */
2169 	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2170 	       + nla_total_size(sizeof(struct rta_cacheinfo));
2171 }
2172 
2173 static int rt6_fill_node(struct net *net,
2174 			 struct sk_buff *skb, struct rt6_info *rt,
2175 			 struct in6_addr *dst, struct in6_addr *src,
2176 			 int iif, int type, u32 pid, u32 seq,
2177 			 int prefix, int nowait, unsigned int flags)
2178 {
2179 	struct rtmsg *rtm;
2180 	struct nlmsghdr *nlh;
2181 	long expires;
2182 	u32 table;
2183 
2184 	if (prefix) {	/* user wants prefix routes only */
2185 		if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2186 			/* success since this is not a prefix route */
2187 			return 1;
2188 		}
2189 	}
2190 
2191 	nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2192 	if (nlh == NULL)
2193 		return -EMSGSIZE;
2194 
2195 	rtm = nlmsg_data(nlh);
2196 	rtm->rtm_family = AF_INET6;
2197 	rtm->rtm_dst_len = rt->rt6i_dst.plen;
2198 	rtm->rtm_src_len = rt->rt6i_src.plen;
2199 	rtm->rtm_tos = 0;
2200 	if (rt->rt6i_table)
2201 		table = rt->rt6i_table->tb6_id;
2202 	else
2203 		table = RT6_TABLE_UNSPEC;
2204 	rtm->rtm_table = table;
2205 	NLA_PUT_U32(skb, RTA_TABLE, table);
2206 	if (rt->rt6i_flags&RTF_REJECT)
2207 		rtm->rtm_type = RTN_UNREACHABLE;
2208 	else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2209 		rtm->rtm_type = RTN_LOCAL;
2210 	else
2211 		rtm->rtm_type = RTN_UNICAST;
2212 	rtm->rtm_flags = 0;
2213 	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2214 	rtm->rtm_protocol = rt->rt6i_protocol;
2215 	if (rt->rt6i_flags&RTF_DYNAMIC)
2216 		rtm->rtm_protocol = RTPROT_REDIRECT;
2217 	else if (rt->rt6i_flags & RTF_ADDRCONF)
2218 		rtm->rtm_protocol = RTPROT_KERNEL;
2219 	else if (rt->rt6i_flags&RTF_DEFAULT)
2220 		rtm->rtm_protocol = RTPROT_RA;
2221 
2222 	if (rt->rt6i_flags&RTF_CACHE)
2223 		rtm->rtm_flags |= RTM_F_CLONED;
2224 
2225 	if (dst) {
2226 		NLA_PUT(skb, RTA_DST, 16, dst);
2227 		rtm->rtm_dst_len = 128;
2228 	} else if (rtm->rtm_dst_len)
2229 		NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2230 #ifdef CONFIG_IPV6_SUBTREES
2231 	if (src) {
2232 		NLA_PUT(skb, RTA_SRC, 16, src);
2233 		rtm->rtm_src_len = 128;
2234 	} else if (rtm->rtm_src_len)
2235 		NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2236 #endif
2237 	if (iif) {
2238 #ifdef CONFIG_IPV6_MROUTE
2239 		if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2240 			int err = ip6mr_get_route(net, skb, rtm, nowait);
2241 			if (err <= 0) {
2242 				if (!nowait) {
2243 					if (err == 0)
2244 						return 0;
2245 					goto nla_put_failure;
2246 				} else {
2247 					if (err == -EMSGSIZE)
2248 						goto nla_put_failure;
2249 				}
2250 			}
2251 		} else
2252 #endif
2253 			NLA_PUT_U32(skb, RTA_IIF, iif);
2254 	} else if (dst) {
2255 		struct inet6_dev *idev = ip6_dst_idev(&rt->u.dst);
2256 		struct in6_addr saddr_buf;
2257 		if (ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2258 				       dst, 0, &saddr_buf) == 0)
2259 			NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2260 	}
2261 
2262 	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2263 		goto nla_put_failure;
2264 
2265 	if (rt->u.dst.neighbour)
2266 		NLA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
2267 
2268 	if (rt->u.dst.dev)
2269 		NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2270 
2271 	NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2272 
2273 	if (!(rt->rt6i_flags & RTF_EXPIRES))
2274 		expires = 0;
2275 	else if (rt->rt6i_expires - jiffies < INT_MAX)
2276 		expires = rt->rt6i_expires - jiffies;
2277 	else
2278 		expires = INT_MAX;
2279 
2280 	if (rtnl_put_cacheinfo(skb, &rt->u.dst, 0, 0, 0,
2281 			       expires, rt->u.dst.error) < 0)
2282 		goto nla_put_failure;
2283 
2284 	return nlmsg_end(skb, nlh);
2285 
2286 nla_put_failure:
2287 	nlmsg_cancel(skb, nlh);
2288 	return -EMSGSIZE;
2289 }
2290 
2291 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2292 {
2293 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2294 	int prefix;
2295 
2296 	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2297 		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2298 		prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2299 	} else
2300 		prefix = 0;
2301 
2302 	return rt6_fill_node(arg->net,
2303 		     arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2304 		     NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2305 		     prefix, 0, NLM_F_MULTI);
2306 }
2307 
2308 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2309 {
2310 	struct net *net = sock_net(in_skb->sk);
2311 	struct nlattr *tb[RTA_MAX+1];
2312 	struct rt6_info *rt;
2313 	struct sk_buff *skb;
2314 	struct rtmsg *rtm;
2315 	struct flowi fl;
2316 	int err, iif = 0;
2317 
2318 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2319 	if (err < 0)
2320 		goto errout;
2321 
2322 	err = -EINVAL;
2323 	memset(&fl, 0, sizeof(fl));
2324 
2325 	if (tb[RTA_SRC]) {
2326 		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2327 			goto errout;
2328 
2329 		ipv6_addr_copy(&fl.fl6_src, nla_data(tb[RTA_SRC]));
2330 	}
2331 
2332 	if (tb[RTA_DST]) {
2333 		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2334 			goto errout;
2335 
2336 		ipv6_addr_copy(&fl.fl6_dst, nla_data(tb[RTA_DST]));
2337 	}
2338 
2339 	if (tb[RTA_IIF])
2340 		iif = nla_get_u32(tb[RTA_IIF]);
2341 
2342 	if (tb[RTA_OIF])
2343 		fl.oif = nla_get_u32(tb[RTA_OIF]);
2344 
2345 	if (iif) {
2346 		struct net_device *dev;
2347 		dev = __dev_get_by_index(net, iif);
2348 		if (!dev) {
2349 			err = -ENODEV;
2350 			goto errout;
2351 		}
2352 	}
2353 
2354 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2355 	if (skb == NULL) {
2356 		err = -ENOBUFS;
2357 		goto errout;
2358 	}
2359 
2360 	/* Reserve room for dummy headers, this skb can pass
2361 	   through good chunk of routing engine.
2362 	 */
2363 	skb_reset_mac_header(skb);
2364 	skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2365 
2366 	rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl);
2367 	skb_dst_set(skb, &rt->u.dst);
2368 
2369 	err = rt6_fill_node(net, skb, rt, &fl.fl6_dst, &fl.fl6_src, iif,
2370 			    RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2371 			    nlh->nlmsg_seq, 0, 0, 0);
2372 	if (err < 0) {
2373 		kfree_skb(skb);
2374 		goto errout;
2375 	}
2376 
2377 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2378 errout:
2379 	return err;
2380 }
2381 
2382 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2383 {
2384 	struct sk_buff *skb;
2385 	struct net *net = info->nl_net;
2386 	u32 seq;
2387 	int err;
2388 
2389 	err = -ENOBUFS;
2390 	seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2391 
2392 	skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2393 	if (skb == NULL)
2394 		goto errout;
2395 
2396 	err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2397 				event, info->pid, seq, 0, 0, 0);
2398 	if (err < 0) {
2399 		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2400 		WARN_ON(err == -EMSGSIZE);
2401 		kfree_skb(skb);
2402 		goto errout;
2403 	}
2404 	rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2405 		    info->nlh, gfp_any());
2406 	return;
2407 errout:
2408 	if (err < 0)
2409 		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2410 }
2411 
2412 static int ip6_route_dev_notify(struct notifier_block *this,
2413 				unsigned long event, void *data)
2414 {
2415 	struct net_device *dev = (struct net_device *)data;
2416 	struct net *net = dev_net(dev);
2417 
2418 	if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2419 		net->ipv6.ip6_null_entry->u.dst.dev = dev;
2420 		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2421 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2422 		net->ipv6.ip6_prohibit_entry->u.dst.dev = dev;
2423 		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2424 		net->ipv6.ip6_blk_hole_entry->u.dst.dev = dev;
2425 		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2426 #endif
2427 	}
2428 
2429 	return NOTIFY_OK;
2430 }
2431 
2432 /*
2433  *	/proc
2434  */
2435 
2436 #ifdef CONFIG_PROC_FS
2437 
/* NOTE(review): presumably the byte length of one /proc/net/ipv6_route
 * line; no user is visible in this part of the file - confirm before
 * removing.
 */
#define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)

/* NOTE(review): looks like bookkeeping for a buffer-based /proc reader;
 * the seq_file handlers below do not reference it - confirm whether it
 * is still used elsewhere.
 */
struct rt6_proc_arg {
	char *buffer;
	int offset;
	int length;
	int skip;
	int len;
};
2448 
2449 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2450 {
2451 	struct seq_file *m = p_arg;
2452 
2453 	seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2454 
2455 #ifdef CONFIG_IPV6_SUBTREES
2456 	seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2457 #else
2458 	seq_puts(m, "00000000000000000000000000000000 00 ");
2459 #endif
2460 
2461 	if (rt->rt6i_nexthop) {
2462 		seq_printf(m, "%pi6", rt->rt6i_nexthop->primary_key);
2463 	} else {
2464 		seq_puts(m, "00000000000000000000000000000000");
2465 	}
2466 	seq_printf(m, " %08x %08x %08x %08x %8s\n",
2467 		   rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
2468 		   rt->u.dst.__use, rt->rt6i_flags,
2469 		   rt->rt6i_dev ? rt->rt6i_dev->name : "");
2470 	return 0;
2471 }
2472 
2473 static int ipv6_route_show(struct seq_file *m, void *v)
2474 {
2475 	struct net *net = (struct net *)m->private;
2476 	fib6_clean_all(net, rt6_info_route, 0, m);
2477 	return 0;
2478 }
2479 
2480 static int ipv6_route_open(struct inode *inode, struct file *file)
2481 {
2482 	return single_open_net(inode, file, ipv6_route_show);
2483 }
2484 
2485 static const struct file_operations ipv6_route_proc_fops = {
2486 	.owner		= THIS_MODULE,
2487 	.open		= ipv6_route_open,
2488 	.read		= seq_read,
2489 	.llseek		= seq_lseek,
2490 	.release	= single_release_net,
2491 };
2492 
2493 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2494 {
2495 	struct net *net = (struct net *)seq->private;
2496 	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2497 		   net->ipv6.rt6_stats->fib_nodes,
2498 		   net->ipv6.rt6_stats->fib_route_nodes,
2499 		   net->ipv6.rt6_stats->fib_rt_alloc,
2500 		   net->ipv6.rt6_stats->fib_rt_entries,
2501 		   net->ipv6.rt6_stats->fib_rt_cache,
2502 		   atomic_read(&net->ipv6.ip6_dst_ops.entries),
2503 		   net->ipv6.rt6_stats->fib_discarded_routes);
2504 
2505 	return 0;
2506 }
2507 
2508 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2509 {
2510 	return single_open_net(inode, file, rt6_stats_seq_show);
2511 }
2512 
2513 static const struct file_operations rt6_stats_seq_fops = {
2514 	.owner	 = THIS_MODULE,
2515 	.open	 = rt6_stats_seq_open,
2516 	.read	 = seq_read,
2517 	.llseek	 = seq_lseek,
2518 	.release = single_release_net,
2519 };
2520 #endif	/* CONFIG_PROC_FS */
2521 
2522 #ifdef CONFIG_SYSCTL
2523 
2524 static
2525 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2526 			      void __user *buffer, size_t *lenp, loff_t *ppos)
2527 {
2528 	struct net *net = current->nsproxy->net_ns;
2529 	int delay = net->ipv6.sysctl.flush_delay;
2530 	if (write) {
2531 		proc_dointvec(ctl, write, buffer, lenp, ppos);
2532 		fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2533 		return 0;
2534 	} else
2535 		return -EINVAL;
2536 }
2537 
2538 ctl_table ipv6_route_table_template[] = {
2539 	{
2540 		.procname	=	"flush",
2541 		.data		=	&init_net.ipv6.sysctl.flush_delay,
2542 		.maxlen		=	sizeof(int),
2543 		.mode		=	0200,
2544 		.proc_handler	=	ipv6_sysctl_rtcache_flush
2545 	},
2546 	{
2547 		.procname	=	"gc_thresh",
2548 		.data		=	&ip6_dst_ops_template.gc_thresh,
2549 		.maxlen		=	sizeof(int),
2550 		.mode		=	0644,
2551 		.proc_handler	=	proc_dointvec,
2552 	},
2553 	{
2554 		.procname	=	"max_size",
2555 		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
2556 		.maxlen		=	sizeof(int),
2557 		.mode		=	0644,
2558 		.proc_handler	=	proc_dointvec,
2559 	},
2560 	{
2561 		.procname	=	"gc_min_interval",
2562 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2563 		.maxlen		=	sizeof(int),
2564 		.mode		=	0644,
2565 		.proc_handler	=	proc_dointvec_jiffies,
2566 	},
2567 	{
2568 		.procname	=	"gc_timeout",
2569 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2570 		.maxlen		=	sizeof(int),
2571 		.mode		=	0644,
2572 		.proc_handler	=	proc_dointvec_jiffies,
2573 	},
2574 	{
2575 		.procname	=	"gc_interval",
2576 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
2577 		.maxlen		=	sizeof(int),
2578 		.mode		=	0644,
2579 		.proc_handler	=	proc_dointvec_jiffies,
2580 	},
2581 	{
2582 		.procname	=	"gc_elasticity",
2583 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2584 		.maxlen		=	sizeof(int),
2585 		.mode		=	0644,
2586 		.proc_handler	=	proc_dointvec_jiffies,
2587 	},
2588 	{
2589 		.procname	=	"mtu_expires",
2590 		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2591 		.maxlen		=	sizeof(int),
2592 		.mode		=	0644,
2593 		.proc_handler	=	proc_dointvec_jiffies,
2594 	},
2595 	{
2596 		.procname	=	"min_adv_mss",
2597 		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
2598 		.maxlen		=	sizeof(int),
2599 		.mode		=	0644,
2600 		.proc_handler	=	proc_dointvec_jiffies,
2601 	},
2602 	{
2603 		.procname	=	"gc_min_interval_ms",
2604 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2605 		.maxlen		=	sizeof(int),
2606 		.mode		=	0644,
2607 		.proc_handler	=	proc_dointvec_ms_jiffies,
2608 	},
2609 	{ }
2610 };
2611 
2612 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2613 {
2614 	struct ctl_table *table;
2615 
2616 	table = kmemdup(ipv6_route_table_template,
2617 			sizeof(ipv6_route_table_template),
2618 			GFP_KERNEL);
2619 
2620 	if (table) {
2621 		table[0].data = &net->ipv6.sysctl.flush_delay;
2622 		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2623 		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2624 		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2625 		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2626 		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2627 		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2628 		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2629 		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2630 		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2631 	}
2632 
2633 	return table;
2634 }
2635 #endif
2636 
2637 static int __net_init ip6_route_net_init(struct net *net)
2638 {
2639 	int ret = -ENOMEM;
2640 
2641 	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2642 	       sizeof(net->ipv6.ip6_dst_ops));
2643 
2644 	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2645 					   sizeof(*net->ipv6.ip6_null_entry),
2646 					   GFP_KERNEL);
2647 	if (!net->ipv6.ip6_null_entry)
2648 		goto out_ip6_dst_ops;
2649 	net->ipv6.ip6_null_entry->u.dst.path =
2650 		(struct dst_entry *)net->ipv6.ip6_null_entry;
2651 	net->ipv6.ip6_null_entry->u.dst.ops = &net->ipv6.ip6_dst_ops;
2652 
2653 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2654 	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2655 					       sizeof(*net->ipv6.ip6_prohibit_entry),
2656 					       GFP_KERNEL);
2657 	if (!net->ipv6.ip6_prohibit_entry)
2658 		goto out_ip6_null_entry;
2659 	net->ipv6.ip6_prohibit_entry->u.dst.path =
2660 		(struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2661 	net->ipv6.ip6_prohibit_entry->u.dst.ops = &net->ipv6.ip6_dst_ops;
2662 
2663 	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2664 					       sizeof(*net->ipv6.ip6_blk_hole_entry),
2665 					       GFP_KERNEL);
2666 	if (!net->ipv6.ip6_blk_hole_entry)
2667 		goto out_ip6_prohibit_entry;
2668 	net->ipv6.ip6_blk_hole_entry->u.dst.path =
2669 		(struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2670 	net->ipv6.ip6_blk_hole_entry->u.dst.ops = &net->ipv6.ip6_dst_ops;
2671 #endif
2672 
2673 	net->ipv6.sysctl.flush_delay = 0;
2674 	net->ipv6.sysctl.ip6_rt_max_size = 4096;
2675 	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2676 	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2677 	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2678 	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2679 	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2680 	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2681 
2682 #ifdef CONFIG_PROC_FS
2683 	proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2684 	proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2685 #endif
2686 	net->ipv6.ip6_rt_gc_expire = 30*HZ;
2687 
2688 	ret = 0;
2689 out:
2690 	return ret;
2691 
2692 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2693 out_ip6_prohibit_entry:
2694 	kfree(net->ipv6.ip6_prohibit_entry);
2695 out_ip6_null_entry:
2696 	kfree(net->ipv6.ip6_null_entry);
2697 #endif
2698 out_ip6_dst_ops:
2699 	goto out;
2700 }
2701 
2702 static void __net_exit ip6_route_net_exit(struct net *net)
2703 {
2704 #ifdef CONFIG_PROC_FS
2705 	proc_net_remove(net, "ipv6_route");
2706 	proc_net_remove(net, "rt6_stats");
2707 #endif
2708 	kfree(net->ipv6.ip6_null_entry);
2709 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2710 	kfree(net->ipv6.ip6_prohibit_entry);
2711 	kfree(net->ipv6.ip6_blk_hole_entry);
2712 #endif
2713 }
2714 
2715 static struct pernet_operations ip6_route_net_ops = {
2716 	.init = ip6_route_net_init,
2717 	.exit = ip6_route_net_exit,
2718 };
2719 
2720 static struct notifier_block ip6_route_dev_notifier = {
2721 	.notifier_call = ip6_route_dev_notify,
2722 	.priority = 0,
2723 };
2724 
2725 int __init ip6_route_init(void)
2726 {
2727 	int ret;
2728 
2729 	ret = -ENOMEM;
2730 	ip6_dst_ops_template.kmem_cachep =
2731 		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2732 				  SLAB_HWCACHE_ALIGN, NULL);
2733 	if (!ip6_dst_ops_template.kmem_cachep)
2734 		goto out;
2735 
2736 	ret = register_pernet_subsys(&ip6_route_net_ops);
2737 	if (ret)
2738 		goto out_kmem_cache;
2739 
2740 	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2741 
2742 	/* Registering of the loopback is done before this portion of code,
2743 	 * the loopback reference in rt6_info will not be taken, do it
2744 	 * manually for init_net */
2745 	init_net.ipv6.ip6_null_entry->u.dst.dev = init_net.loopback_dev;
2746 	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2747   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2748 	init_net.ipv6.ip6_prohibit_entry->u.dst.dev = init_net.loopback_dev;
2749 	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2750 	init_net.ipv6.ip6_blk_hole_entry->u.dst.dev = init_net.loopback_dev;
2751 	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2752   #endif
2753 	ret = fib6_init();
2754 	if (ret)
2755 		goto out_register_subsys;
2756 
2757 	ret = xfrm6_init();
2758 	if (ret)
2759 		goto out_fib6_init;
2760 
2761 	ret = fib6_rules_init();
2762 	if (ret)
2763 		goto xfrm6_init;
2764 
2765 	ret = -ENOBUFS;
2766 	if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL) ||
2767 	    __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL) ||
2768 	    __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL))
2769 		goto fib6_rules_init;
2770 
2771 	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
2772 	if (ret)
2773 		goto fib6_rules_init;
2774 
2775 out:
2776 	return ret;
2777 
2778 fib6_rules_init:
2779 	fib6_rules_cleanup();
2780 xfrm6_init:
2781 	xfrm6_fini();
2782 out_fib6_init:
2783 	fib6_gc_cleanup();
2784 out_register_subsys:
2785 	unregister_pernet_subsys(&ip6_route_net_ops);
2786 out_kmem_cache:
2787 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2788 	goto out;
2789 }
2790 
2791 void ip6_route_cleanup(void)
2792 {
2793 	unregister_netdevice_notifier(&ip6_route_dev_notifier);
2794 	fib6_rules_cleanup();
2795 	xfrm6_fini();
2796 	fib6_gc_cleanup();
2797 	unregister_pernet_subsys(&ip6_route_net_ops);
2798 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2799 }
2800