xref: /openbmc/linux/net/ipv6/route.c (revision 8ac727c1)
1 /*
2  *	Linux INET6 implementation
3  *	FIB front-end.
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13 
14 /*	Changes:
15  *
16  *	YOSHIFUJI Hideaki @USAGI
17  *		reworked default router selection.
18  *		- respect outgoing interface
19  *		- select from (probably) reachable routers (i.e.
20  *		routers in REACHABLE, STALE, DELAY or PROBE states).
21  *		- always select the same router if it is (probably)
22  *		reachable.  otherwise, round-robin the list.
23  *	Ville Nuorvala
24  *		Fixed routing subtrees.
25  */
26 
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/mroute6.h>
38 #include <linux/init.h>
39 #include <linux/if_arp.h>
40 #include <linux/proc_fs.h>
41 #include <linux/seq_file.h>
42 #include <linux/nsproxy.h>
43 #include <linux/slab.h>
44 #include <net/net_namespace.h>
45 #include <net/snmp.h>
46 #include <net/ipv6.h>
47 #include <net/ip6_fib.h>
48 #include <net/ip6_route.h>
49 #include <net/ndisc.h>
50 #include <net/addrconf.h>
51 #include <net/tcp.h>
52 #include <linux/rtnetlink.h>
53 #include <net/dst.h>
54 #include <net/xfrm.h>
55 #include <net/netevent.h>
56 #include <net/netlink.h>
57 
58 #include <asm/uaccess.h>
59 
60 #ifdef CONFIG_SYSCTL
61 #include <linux/sysctl.h>
62 #endif
63 
/* Debug level for this file; raise it to 3 to compile the tracing output in. */
#define RT6_DEBUG 2

#if RT6_DEBUG < 3
/* Tracing compiled out: both helpers expand to nothing. */
#define RDBG(x)
#define RT6_TRACE(x...) do { ; } while (0)
#else
/* Tracing compiled in: both helpers forward to printk(). */
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#endif
74 
75 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
76 static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
77 static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
78 static unsigned int	 ip6_default_mtu(const struct dst_entry *dst);
79 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
80 static void		ip6_dst_destroy(struct dst_entry *);
81 static void		ip6_dst_ifdown(struct dst_entry *,
82 				       struct net_device *dev, int how);
83 static int		 ip6_dst_gc(struct dst_ops *ops);
84 
85 static int		ip6_pkt_discard(struct sk_buff *skb);
86 static int		ip6_pkt_discard_out(struct sk_buff *skb);
87 static void		ip6_link_failure(struct sk_buff *skb);
88 static void		ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
89 
#ifdef CONFIG_IPV6_ROUTE_INFO
/* Helpers used by rt6_route_rcv(); defined later in the file. */
static struct rt6_info *rt6_add_route_info(struct net *net,
					   struct in6_addr *prefix,
					   int prefixlen,
					   struct in6_addr *gwaddr,
					   int ifindex,
					   unsigned int pref);
static struct rt6_info *rt6_get_route_info(struct net *net,
					   struct in6_addr *prefix,
					   int prefixlen,
					   struct in6_addr *gwaddr,
					   int ifindex);
#endif
99 
100 static struct dst_ops ip6_dst_ops_template = {
101 	.family			=	AF_INET6,
102 	.protocol		=	cpu_to_be16(ETH_P_IPV6),
103 	.gc			=	ip6_dst_gc,
104 	.gc_thresh		=	1024,
105 	.check			=	ip6_dst_check,
106 	.default_advmss		=	ip6_default_advmss,
107 	.default_mtu		=	ip6_default_mtu,
108 	.destroy		=	ip6_dst_destroy,
109 	.ifdown			=	ip6_dst_ifdown,
110 	.negative_advice	=	ip6_negative_advice,
111 	.link_failure		=	ip6_link_failure,
112 	.update_pmtu		=	ip6_rt_update_pmtu,
113 	.local_out		=	__ip6_local_out,
114 };
115 
/*
 * default_mtu hook of ip6_dst_blackhole_ops: always reports 0,
 * whatever @dst is.
 */
static unsigned int ip6_blackhole_default_mtu(const struct dst_entry *dst)
{
	const unsigned int no_default_mtu = 0;

	return no_default_mtu;
}
120 
121 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
122 {
123 }
124 
125 static struct dst_ops ip6_dst_blackhole_ops = {
126 	.family			=	AF_INET6,
127 	.protocol		=	cpu_to_be16(ETH_P_IPV6),
128 	.destroy		=	ip6_dst_destroy,
129 	.check			=	ip6_dst_check,
130 	.default_mtu		=	ip6_blackhole_default_mtu,
131 	.default_advmss		=	ip6_default_advmss,
132 	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
133 };
134 
135 static struct rt6_info ip6_null_entry_template = {
136 	.dst = {
137 		.__refcnt	= ATOMIC_INIT(1),
138 		.__use		= 1,
139 		.obsolete	= -1,
140 		.error		= -ENETUNREACH,
141 		.input		= ip6_pkt_discard,
142 		.output		= ip6_pkt_discard_out,
143 	},
144 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
145 	.rt6i_protocol  = RTPROT_KERNEL,
146 	.rt6i_metric	= ~(u32) 0,
147 	.rt6i_ref	= ATOMIC_INIT(1),
148 };
149 
150 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
151 
152 static int ip6_pkt_prohibit(struct sk_buff *skb);
153 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
154 
155 static struct rt6_info ip6_prohibit_entry_template = {
156 	.dst = {
157 		.__refcnt	= ATOMIC_INIT(1),
158 		.__use		= 1,
159 		.obsolete	= -1,
160 		.error		= -EACCES,
161 		.input		= ip6_pkt_prohibit,
162 		.output		= ip6_pkt_prohibit_out,
163 	},
164 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
165 	.rt6i_protocol  = RTPROT_KERNEL,
166 	.rt6i_metric	= ~(u32) 0,
167 	.rt6i_ref	= ATOMIC_INIT(1),
168 };
169 
170 static struct rt6_info ip6_blk_hole_entry_template = {
171 	.dst = {
172 		.__refcnt	= ATOMIC_INIT(1),
173 		.__use		= 1,
174 		.obsolete	= -1,
175 		.error		= -EINVAL,
176 		.input		= dst_discard,
177 		.output		= dst_discard,
178 	},
179 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
180 	.rt6i_protocol  = RTPROT_KERNEL,
181 	.rt6i_metric	= ~(u32) 0,
182 	.rt6i_ref	= ATOMIC_INIT(1),
183 };
184 
185 #endif
186 
/*
 * Allocate a dst entry from @ops and return it viewed as an rt6_info.
 * The result is whatever dst_alloc() returned, cast; callers in this
 * file check it against NULL.
 */
static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops)
{
	struct dst_entry *dst = dst_alloc(ops);

	return (struct rt6_info *)dst;
}
192 
193 static void ip6_dst_destroy(struct dst_entry *dst)
194 {
195 	struct rt6_info *rt = (struct rt6_info *)dst;
196 	struct inet6_dev *idev = rt->rt6i_idev;
197 	struct inet_peer *peer = rt->rt6i_peer;
198 
199 	if (idev != NULL) {
200 		rt->rt6i_idev = NULL;
201 		in6_dev_put(idev);
202 	}
203 	if (peer) {
204 		rt->rt6i_peer = NULL;
205 		inet_putpeer(peer);
206 	}
207 }
208 
209 void rt6_bind_peer(struct rt6_info *rt, int create)
210 {
211 	struct inet_peer *peer;
212 
213 	peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
214 	if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
215 		inet_putpeer(peer);
216 }
217 
218 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
219 			   int how)
220 {
221 	struct rt6_info *rt = (struct rt6_info *)dst;
222 	struct inet6_dev *idev = rt->rt6i_idev;
223 	struct net_device *loopback_dev =
224 		dev_net(dev)->loopback_dev;
225 
226 	if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
227 		struct inet6_dev *loopback_idev =
228 			in6_dev_get(loopback_dev);
229 		if (loopback_idev != NULL) {
230 			rt->rt6i_idev = loopback_idev;
231 			in6_dev_put(idev);
232 		}
233 	}
234 }
235 
236 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
237 {
238 	return (rt->rt6i_flags & RTF_EXPIRES) &&
239 		time_after(jiffies, rt->rt6i_expires);
240 }
241 
242 static inline int rt6_need_strict(struct in6_addr *daddr)
243 {
244 	return ipv6_addr_type(daddr) &
245 		(IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
246 }
247 
248 /*
249  *	Route lookup. Any table->tb6_lock is implied.
250  */
251 
252 static inline struct rt6_info *rt6_device_match(struct net *net,
253 						    struct rt6_info *rt,
254 						    struct in6_addr *saddr,
255 						    int oif,
256 						    int flags)
257 {
258 	struct rt6_info *local = NULL;
259 	struct rt6_info *sprt;
260 
261 	if (!oif && ipv6_addr_any(saddr))
262 		goto out;
263 
264 	for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
265 		struct net_device *dev = sprt->rt6i_dev;
266 
267 		if (oif) {
268 			if (dev->ifindex == oif)
269 				return sprt;
270 			if (dev->flags & IFF_LOOPBACK) {
271 				if (sprt->rt6i_idev == NULL ||
272 				    sprt->rt6i_idev->dev->ifindex != oif) {
273 					if (flags & RT6_LOOKUP_F_IFACE && oif)
274 						continue;
275 					if (local && (!oif ||
276 						      local->rt6i_idev->dev->ifindex == oif))
277 						continue;
278 				}
279 				local = sprt;
280 			}
281 		} else {
282 			if (ipv6_chk_addr(net, saddr, dev,
283 					  flags & RT6_LOOKUP_F_IFACE))
284 				return sprt;
285 		}
286 	}
287 
288 	if (oif) {
289 		if (local)
290 			return local;
291 
292 		if (flags & RT6_LOOKUP_F_IFACE)
293 			return net->ipv6.ip6_null_entry;
294 	}
295 out:
296 	return rt;
297 }
298 
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing: if the next-hop neighbour of @rt is not
 * in a NUD_VALID state and the last update is older than the
 * interface's rtr_probe_interval, send a neighbour solicitation to the
 * solicited-node multicast address of the neighbour.
 *
 * A NULL @rt or a route without a next-hop neighbour is ignored.  The
 * neighbour's "updated" stamp is refreshed before sending, which is
 * what rate-limits the probes.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh;
	struct in6_addr mcaddr;
	struct in6_addr *target;
	int probe_due;

	if (!rt)
		return;

	neigh = rt->rt6i_nexthop;
	if (!neigh || (neigh->nud_state & NUD_VALID))
		return;

	/* Re-check the state and the rate limit under the neighbour lock. */
	read_lock_bh(&neigh->lock);
	probe_due = !(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval);
	if (probe_due)
		neigh->updated = jiffies;
	read_unlock_bh(&neigh->lock);

	if (!probe_due)
		return;

	target = (struct in6_addr *)&neigh->primary_key;
	addrconf_addr_solict_mult(target, &mcaddr);
	ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
}
#else
/* Without CONFIG_IPV6_ROUTER_PREF no probing is done. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
333 
334 /*
335  * Default Router Selection (RFC 2461 6.3.6)
336  */
337 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
338 {
339 	struct net_device *dev = rt->rt6i_dev;
340 	if (!oif || dev->ifindex == oif)
341 		return 2;
342 	if ((dev->flags & IFF_LOOPBACK) &&
343 	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
344 		return 1;
345 	return 0;
346 }
347 
348 static inline int rt6_check_neigh(struct rt6_info *rt)
349 {
350 	struct neighbour *neigh = rt->rt6i_nexthop;
351 	int m;
352 	if (rt->rt6i_flags & RTF_NONEXTHOP ||
353 	    !(rt->rt6i_flags & RTF_GATEWAY))
354 		m = 1;
355 	else if (neigh) {
356 		read_lock_bh(&neigh->lock);
357 		if (neigh->nud_state & NUD_VALID)
358 			m = 2;
359 #ifdef CONFIG_IPV6_ROUTER_PREF
360 		else if (neigh->nud_state & NUD_FAILED)
361 			m = 0;
362 #endif
363 		else
364 			m = 1;
365 		read_unlock_bh(&neigh->lock);
366 	} else
367 		m = 0;
368 	return m;
369 }
370 
371 static int rt6_score_route(struct rt6_info *rt, int oif,
372 			   int strict)
373 {
374 	int m, n;
375 
376 	m = rt6_check_dev(rt, oif);
377 	if (!m && (strict & RT6_LOOKUP_F_IFACE))
378 		return -1;
379 #ifdef CONFIG_IPV6_ROUTER_PREF
380 	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
381 #endif
382 	n = rt6_check_neigh(rt);
383 	if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
384 		return -1;
385 	return m;
386 }
387 
388 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
389 				   int *mpri, struct rt6_info *match)
390 {
391 	int m;
392 
393 	if (rt6_check_expired(rt))
394 		goto out;
395 
396 	m = rt6_score_route(rt, oif, strict);
397 	if (m < 0)
398 		goto out;
399 
400 	if (m > *mpri) {
401 		if (strict & RT6_LOOKUP_F_REACHABLE)
402 			rt6_probe(match);
403 		*mpri = m;
404 		match = rt;
405 	} else if (strict & RT6_LOOKUP_F_REACHABLE) {
406 		rt6_probe(rt);
407 	}
408 
409 out:
410 	return match;
411 }
412 
413 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
414 				     struct rt6_info *rr_head,
415 				     u32 metric, int oif, int strict)
416 {
417 	struct rt6_info *rt, *match;
418 	int mpri = -1;
419 
420 	match = NULL;
421 	for (rt = rr_head; rt && rt->rt6i_metric == metric;
422 	     rt = rt->dst.rt6_next)
423 		match = find_match(rt, oif, strict, &mpri, match);
424 	for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
425 	     rt = rt->dst.rt6_next)
426 		match = find_match(rt, oif, strict, &mpri, match);
427 
428 	return match;
429 }
430 
431 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
432 {
433 	struct rt6_info *match, *rt0;
434 	struct net *net;
435 
436 	RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
437 		  __func__, fn->leaf, oif);
438 
439 	rt0 = fn->rr_ptr;
440 	if (!rt0)
441 		fn->rr_ptr = rt0 = fn->leaf;
442 
443 	match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
444 
445 	if (!match &&
446 	    (strict & RT6_LOOKUP_F_REACHABLE)) {
447 		struct rt6_info *next = rt0->dst.rt6_next;
448 
449 		/* no entries matched; do round-robin */
450 		if (!next || next->rt6i_metric != rt0->rt6i_metric)
451 			next = fn->leaf;
452 
453 		if (next != rt0)
454 			fn->rr_ptr = next;
455 	}
456 
457 	RT6_TRACE("%s() => %p\n",
458 		  __func__, match);
459 
460 	net = dev_net(rt0->rt6i_dev);
461 	return match ? match : net->ipv6.ip6_null_entry;
462 }
463 
464 #ifdef CONFIG_IPV6_ROUTE_INFO
465 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
466 		  struct in6_addr *gwaddr)
467 {
468 	struct net *net = dev_net(dev);
469 	struct route_info *rinfo = (struct route_info *) opt;
470 	struct in6_addr prefix_buf, *prefix;
471 	unsigned int pref;
472 	unsigned long lifetime;
473 	struct rt6_info *rt;
474 
475 	if (len < sizeof(struct route_info)) {
476 		return -EINVAL;
477 	}
478 
479 	/* Sanity check for prefix_len and length */
480 	if (rinfo->length > 3) {
481 		return -EINVAL;
482 	} else if (rinfo->prefix_len > 128) {
483 		return -EINVAL;
484 	} else if (rinfo->prefix_len > 64) {
485 		if (rinfo->length < 2) {
486 			return -EINVAL;
487 		}
488 	} else if (rinfo->prefix_len > 0) {
489 		if (rinfo->length < 1) {
490 			return -EINVAL;
491 		}
492 	}
493 
494 	pref = rinfo->route_pref;
495 	if (pref == ICMPV6_ROUTER_PREF_INVALID)
496 		return -EINVAL;
497 
498 	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
499 
500 	if (rinfo->length == 3)
501 		prefix = (struct in6_addr *)rinfo->prefix;
502 	else {
503 		/* this function is safe */
504 		ipv6_addr_prefix(&prefix_buf,
505 				 (struct in6_addr *)rinfo->prefix,
506 				 rinfo->prefix_len);
507 		prefix = &prefix_buf;
508 	}
509 
510 	rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
511 				dev->ifindex);
512 
513 	if (rt && !lifetime) {
514 		ip6_del_rt(rt);
515 		rt = NULL;
516 	}
517 
518 	if (!rt && lifetime)
519 		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
520 					pref);
521 	else if (rt)
522 		rt->rt6i_flags = RTF_ROUTEINFO |
523 				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
524 
525 	if (rt) {
526 		if (!addrconf_finite_timeout(lifetime)) {
527 			rt->rt6i_flags &= ~RTF_EXPIRES;
528 		} else {
529 			rt->rt6i_expires = jiffies + HZ * lifetime;
530 			rt->rt6i_flags |= RTF_EXPIRES;
531 		}
532 		dst_release(&rt->dst);
533 	}
534 	return 0;
535 }
536 #endif
537 
/*
 * BACKTRACK(net, saddr) - climb the fib6 tree after a failed selection.
 *
 * Relies on the caller's scope: it reads/writes the locals "rt" and
 * "fn" and jumps to the caller's "out" and "restart" labels.
 *
 * Does nothing unless rt is the namespace's null entry.  Otherwise it
 * walks towards the root: when the parent has a source subtree that is
 * not the current node, the subtree is searched for @saddr; else the
 * parent itself becomes the current node.  Reaching a node flagged
 * RTN_RTINFO jumps to "restart"; hitting RTN_TL_ROOT jumps to "out".
 */
#define BACKTRACK(__net, saddr)						\
do {									\
	if (rt == __net->ipv6.ip6_null_entry) {				\
		struct fib6_node *parent_fn;				\
		for (;;) {						\
			if (fn->fn_flags & RTN_TL_ROOT)			\
				goto out;				\
			parent_fn = fn->parent;				\
			if (!FIB6_SUBTREE(parent_fn) ||			\
			    FIB6_SUBTREE(parent_fn) == fn)		\
				fn = parent_fn;				\
			else						\
				fn = fib6_lookup(FIB6_SUBTREE(parent_fn), \
						 NULL, saddr);		\
			if (fn->fn_flags & RTN_RTINFO)			\
				goto restart;				\
		}							\
	}								\
} while (0)
555 
556 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
557 					     struct fib6_table *table,
558 					     struct flowi *fl, int flags)
559 {
560 	struct fib6_node *fn;
561 	struct rt6_info *rt;
562 
563 	read_lock_bh(&table->tb6_lock);
564 	fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
565 restart:
566 	rt = fn->leaf;
567 	rt = rt6_device_match(net, rt, &fl->fl6_src, fl->oif, flags);
568 	BACKTRACK(net, &fl->fl6_src);
569 out:
570 	dst_use(&rt->dst, jiffies);
571 	read_unlock_bh(&table->tb6_lock);
572 	return rt;
573 
574 }
575 
576 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
577 			    const struct in6_addr *saddr, int oif, int strict)
578 {
579 	struct flowi fl = {
580 		.oif = oif,
581 		.fl6_dst = *daddr,
582 	};
583 	struct dst_entry *dst;
584 	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
585 
586 	if (saddr) {
587 		memcpy(&fl.fl6_src, saddr, sizeof(*saddr));
588 		flags |= RT6_LOOKUP_F_HAS_SADDR;
589 	}
590 
591 	dst = fib6_rule_lookup(net, &fl, flags, ip6_pol_route_lookup);
592 	if (dst->error == 0)
593 		return (struct rt6_info *) dst;
594 
595 	dst_release(dst);
596 
597 	return NULL;
598 }
599 
600 EXPORT_SYMBOL(rt6_lookup);
601 
/* ip6_ins_rt is called with table->tb6_lock NOT held; the lock is taken
   here.  It takes a new route entry; if the addition fails for any
   reason, the route is freed.  In any case, if the caller does not hold
   a reference to the route, it may be destroyed.
 */
607 
608 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
609 {
610 	int err;
611 	struct fib6_table *table;
612 
613 	table = rt->rt6i_table;
614 	write_lock_bh(&table->tb6_lock);
615 	err = fib6_add(&table->tb6_root, rt, info);
616 	write_unlock_bh(&table->tb6_lock);
617 
618 	return err;
619 }
620 
621 int ip6_ins_rt(struct rt6_info *rt)
622 {
623 	struct nl_info info = {
624 		.nl_net = dev_net(rt->rt6i_dev),
625 	};
626 	return __ip6_ins_rt(rt, &info);
627 }
628 
629 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
630 				      struct in6_addr *saddr)
631 {
632 	struct rt6_info *rt;
633 
634 	/*
635 	 *	Clone the route.
636 	 */
637 
638 	rt = ip6_rt_copy(ort);
639 
640 	if (rt) {
641 		struct neighbour *neigh;
642 		int attempts = !in_softirq();
643 
644 		if (!(rt->rt6i_flags&RTF_GATEWAY)) {
645 			if (rt->rt6i_dst.plen != 128 &&
646 			    ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
647 				rt->rt6i_flags |= RTF_ANYCAST;
648 			ipv6_addr_copy(&rt->rt6i_gateway, daddr);
649 		}
650 
651 		ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
652 		rt->rt6i_dst.plen = 128;
653 		rt->rt6i_flags |= RTF_CACHE;
654 		rt->dst.flags |= DST_HOST;
655 
656 #ifdef CONFIG_IPV6_SUBTREES
657 		if (rt->rt6i_src.plen && saddr) {
658 			ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
659 			rt->rt6i_src.plen = 128;
660 		}
661 #endif
662 
663 	retry:
664 		neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
665 		if (IS_ERR(neigh)) {
666 			struct net *net = dev_net(rt->rt6i_dev);
667 			int saved_rt_min_interval =
668 				net->ipv6.sysctl.ip6_rt_gc_min_interval;
669 			int saved_rt_elasticity =
670 				net->ipv6.sysctl.ip6_rt_gc_elasticity;
671 
672 			if (attempts-- > 0) {
673 				net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
674 				net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
675 
676 				ip6_dst_gc(&net->ipv6.ip6_dst_ops);
677 
678 				net->ipv6.sysctl.ip6_rt_gc_elasticity =
679 					saved_rt_elasticity;
680 				net->ipv6.sysctl.ip6_rt_gc_min_interval =
681 					saved_rt_min_interval;
682 				goto retry;
683 			}
684 
685 			if (net_ratelimit())
686 				printk(KERN_WARNING
687 				       "ipv6: Neighbour table overflow.\n");
688 			dst_free(&rt->dst);
689 			return NULL;
690 		}
691 		rt->rt6i_nexthop = neigh;
692 
693 	}
694 
695 	return rt;
696 }
697 
698 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
699 {
700 	struct rt6_info *rt = ip6_rt_copy(ort);
701 	if (rt) {
702 		ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
703 		rt->rt6i_dst.plen = 128;
704 		rt->rt6i_flags |= RTF_CACHE;
705 		rt->dst.flags |= DST_HOST;
706 		rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
707 	}
708 	return rt;
709 }
710 
711 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
712 				      struct flowi *fl, int flags)
713 {
714 	struct fib6_node *fn;
715 	struct rt6_info *rt, *nrt;
716 	int strict = 0;
717 	int attempts = 3;
718 	int err;
719 	int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
720 
721 	strict |= flags & RT6_LOOKUP_F_IFACE;
722 
723 relookup:
724 	read_lock_bh(&table->tb6_lock);
725 
726 restart_2:
727 	fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
728 
729 restart:
730 	rt = rt6_select(fn, oif, strict | reachable);
731 
732 	BACKTRACK(net, &fl->fl6_src);
733 	if (rt == net->ipv6.ip6_null_entry ||
734 	    rt->rt6i_flags & RTF_CACHE)
735 		goto out;
736 
737 	dst_hold(&rt->dst);
738 	read_unlock_bh(&table->tb6_lock);
739 
740 	if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
741 		nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
742 	else if (!(rt->dst.flags & DST_HOST))
743 		nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
744 	else
745 		goto out2;
746 
747 	dst_release(&rt->dst);
748 	rt = nrt ? : net->ipv6.ip6_null_entry;
749 
750 	dst_hold(&rt->dst);
751 	if (nrt) {
752 		err = ip6_ins_rt(nrt);
753 		if (!err)
754 			goto out2;
755 	}
756 
757 	if (--attempts <= 0)
758 		goto out2;
759 
760 	/*
761 	 * Race condition! In the gap, when table->tb6_lock was
762 	 * released someone could insert this route.  Relookup.
763 	 */
764 	dst_release(&rt->dst);
765 	goto relookup;
766 
767 out:
768 	if (reachable) {
769 		reachable = 0;
770 		goto restart_2;
771 	}
772 	dst_hold(&rt->dst);
773 	read_unlock_bh(&table->tb6_lock);
774 out2:
775 	rt->dst.lastuse = jiffies;
776 	rt->dst.__use++;
777 
778 	return rt;
779 }
780 
781 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
782 					    struct flowi *fl, int flags)
783 {
784 	return ip6_pol_route(net, table, fl->iif, fl, flags);
785 }
786 
787 void ip6_route_input(struct sk_buff *skb)
788 {
789 	struct ipv6hdr *iph = ipv6_hdr(skb);
790 	struct net *net = dev_net(skb->dev);
791 	int flags = RT6_LOOKUP_F_HAS_SADDR;
792 	struct flowi fl = {
793 		.iif = skb->dev->ifindex,
794 		.fl6_dst = iph->daddr,
795 		.fl6_src = iph->saddr,
796 		.fl6_flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
797 		.mark = skb->mark,
798 		.proto = iph->nexthdr,
799 	};
800 
801 	if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
802 		flags |= RT6_LOOKUP_F_IFACE;
803 
804 	skb_dst_set(skb, fib6_rule_lookup(net, &fl, flags, ip6_pol_route_input));
805 }
806 
807 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
808 					     struct flowi *fl, int flags)
809 {
810 	return ip6_pol_route(net, table, fl->oif, fl, flags);
811 }
812 
813 struct dst_entry * ip6_route_output(struct net *net, struct sock *sk,
814 				    struct flowi *fl)
815 {
816 	int flags = 0;
817 
818 	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl->fl6_dst))
819 		flags |= RT6_LOOKUP_F_IFACE;
820 
821 	if (!ipv6_addr_any(&fl->fl6_src))
822 		flags |= RT6_LOOKUP_F_HAS_SADDR;
823 	else if (sk)
824 		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
825 
826 	return fib6_rule_lookup(net, fl, flags, ip6_pol_route_output);
827 }
828 
829 EXPORT_SYMBOL(ip6_route_output);
830 
831 int ip6_dst_blackhole(struct sock *sk, struct dst_entry **dstp, struct flowi *fl)
832 {
833 	struct rt6_info *ort = (struct rt6_info *) *dstp;
834 	struct rt6_info *rt = (struct rt6_info *)
835 		dst_alloc(&ip6_dst_blackhole_ops);
836 	struct dst_entry *new = NULL;
837 
838 	if (rt) {
839 		new = &rt->dst;
840 
841 		atomic_set(&new->__refcnt, 1);
842 		new->__use = 1;
843 		new->input = dst_discard;
844 		new->output = dst_discard;
845 
846 		dst_copy_metrics(new, &ort->dst);
847 		new->dev = ort->dst.dev;
848 		if (new->dev)
849 			dev_hold(new->dev);
850 		rt->rt6i_idev = ort->rt6i_idev;
851 		if (rt->rt6i_idev)
852 			in6_dev_hold(rt->rt6i_idev);
853 		rt->rt6i_expires = 0;
854 
855 		ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
856 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
857 		rt->rt6i_metric = 0;
858 
859 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
860 #ifdef CONFIG_IPV6_SUBTREES
861 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
862 #endif
863 
864 		dst_free(new);
865 	}
866 
867 	dst_release(*dstp);
868 	*dstp = new;
869 	return new ? 0 : -ENOMEM;
870 }
871 EXPORT_SYMBOL_GPL(ip6_dst_blackhole);
872 
873 /*
874  *	Destination cache support functions
875  */
876 
877 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
878 {
879 	struct rt6_info *rt;
880 
881 	rt = (struct rt6_info *) dst;
882 
883 	if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
884 		return dst;
885 
886 	return NULL;
887 }
888 
889 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
890 {
891 	struct rt6_info *rt = (struct rt6_info *) dst;
892 
893 	if (rt) {
894 		if (rt->rt6i_flags & RTF_CACHE) {
895 			if (rt6_check_expired(rt)) {
896 				ip6_del_rt(rt);
897 				dst = NULL;
898 			}
899 		} else {
900 			dst_release(dst);
901 			dst = NULL;
902 		}
903 	}
904 	return dst;
905 }
906 
907 static void ip6_link_failure(struct sk_buff *skb)
908 {
909 	struct rt6_info *rt;
910 
911 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
912 
913 	rt = (struct rt6_info *) skb_dst(skb);
914 	if (rt) {
915 		if (rt->rt6i_flags&RTF_CACHE) {
916 			dst_set_expires(&rt->dst, 0);
917 			rt->rt6i_flags |= RTF_EXPIRES;
918 		} else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
919 			rt->rt6i_node->fn_sernum = -1;
920 	}
921 }
922 
923 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
924 {
925 	struct rt6_info *rt6 = (struct rt6_info*)dst;
926 
927 	if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
928 		rt6->rt6i_flags |= RTF_MODIFIED;
929 		if (mtu < IPV6_MIN_MTU) {
930 			u32 features = dst_metric(dst, RTAX_FEATURES);
931 			mtu = IPV6_MIN_MTU;
932 			features |= RTAX_FEATURE_ALLFRAG;
933 			dst_metric_set(dst, RTAX_FEATURES, features);
934 		}
935 		dst_metric_set(dst, RTAX_MTU, mtu);
936 		call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
937 	}
938 }
939 
940 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
941 {
942 	struct net_device *dev = dst->dev;
943 	unsigned int mtu = dst_mtu(dst);
944 	struct net *net = dev_net(dev);
945 
946 	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
947 
948 	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
949 		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
950 
951 	/*
952 	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
953 	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
954 	 * IPV6_MAXPLEN is also valid and means: "any MSS,
955 	 * rely only on pmtu discovery"
956 	 */
957 	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
958 		mtu = IPV6_MAXPLEN;
959 	return mtu;
960 }
961 
962 static unsigned int ip6_default_mtu(const struct dst_entry *dst)
963 {
964 	unsigned int mtu = IPV6_MIN_MTU;
965 	struct inet6_dev *idev;
966 
967 	rcu_read_lock();
968 	idev = __in6_dev_get(dst->dev);
969 	if (idev)
970 		mtu = idev->cnf.mtu6;
971 	rcu_read_unlock();
972 
973 	return mtu;
974 }
975 
/*
 * Singly linked list (via dst->next) of the entries handed out by
 * icmp6_dst_alloc(), and the spinlock that guards it.
 */
static DEFINE_SPINLOCK(icmp6_dst_lock);
static struct dst_entry *icmp6_dst_gc_list;
978 
979 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
980 				  struct neighbour *neigh,
981 				  const struct in6_addr *addr)
982 {
983 	struct rt6_info *rt;
984 	struct inet6_dev *idev = in6_dev_get(dev);
985 	struct net *net = dev_net(dev);
986 
987 	if (unlikely(idev == NULL))
988 		return NULL;
989 
990 	rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
991 	if (unlikely(rt == NULL)) {
992 		in6_dev_put(idev);
993 		goto out;
994 	}
995 
996 	dev_hold(dev);
997 	if (neigh)
998 		neigh_hold(neigh);
999 	else {
1000 		neigh = ndisc_get_neigh(dev, addr);
1001 		if (IS_ERR(neigh))
1002 			neigh = NULL;
1003 	}
1004 
1005 	rt->rt6i_dev	  = dev;
1006 	rt->rt6i_idev     = idev;
1007 	rt->rt6i_nexthop  = neigh;
1008 	atomic_set(&rt->dst.__refcnt, 1);
1009 	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
1010 	rt->dst.output  = ip6_output;
1011 
1012 #if 0	/* there's no chance to use these for ndisc */
1013 	rt->dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
1014 				? DST_HOST
1015 				: 0;
1016 	ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1017 	rt->rt6i_dst.plen = 128;
1018 #endif
1019 
1020 	spin_lock_bh(&icmp6_dst_lock);
1021 	rt->dst.next = icmp6_dst_gc_list;
1022 	icmp6_dst_gc_list = &rt->dst;
1023 	spin_unlock_bh(&icmp6_dst_lock);
1024 
1025 	fib6_force_start_gc(net);
1026 
1027 out:
1028 	return &rt->dst;
1029 }
1030 
1031 int icmp6_dst_gc(void)
1032 {
1033 	struct dst_entry *dst, *next, **pprev;
1034 	int more = 0;
1035 
1036 	next = NULL;
1037 
1038 	spin_lock_bh(&icmp6_dst_lock);
1039 	pprev = &icmp6_dst_gc_list;
1040 
1041 	while ((dst = *pprev) != NULL) {
1042 		if (!atomic_read(&dst->__refcnt)) {
1043 			*pprev = dst->next;
1044 			dst_free(dst);
1045 		} else {
1046 			pprev = &dst->next;
1047 			++more;
1048 		}
1049 	}
1050 
1051 	spin_unlock_bh(&icmp6_dst_lock);
1052 
1053 	return more;
1054 }
1055 
1056 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1057 			    void *arg)
1058 {
1059 	struct dst_entry *dst, **pprev;
1060 
1061 	spin_lock_bh(&icmp6_dst_lock);
1062 	pprev = &icmp6_dst_gc_list;
1063 	while ((dst = *pprev) != NULL) {
1064 		struct rt6_info *rt = (struct rt6_info *) dst;
1065 		if (func(rt, arg)) {
1066 			*pprev = dst->next;
1067 			dst_free(dst);
1068 		} else {
1069 			pprev = &dst->next;
1070 		}
1071 	}
1072 	spin_unlock_bh(&icmp6_dst_lock);
1073 }
1074 
1075 static int ip6_dst_gc(struct dst_ops *ops)
1076 {
1077 	unsigned long now = jiffies;
1078 	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1079 	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1080 	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1081 	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1082 	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1083 	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1084 	int entries;
1085 
1086 	entries = dst_entries_get_fast(ops);
1087 	if (time_after(rt_last_gc + rt_min_interval, now) &&
1088 	    entries <= rt_max_size)
1089 		goto out;
1090 
1091 	net->ipv6.ip6_rt_gc_expire++;
1092 	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1093 	net->ipv6.ip6_rt_last_gc = now;
1094 	entries = dst_entries_get_slow(ops);
1095 	if (entries < ops->gc_thresh)
1096 		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1097 out:
1098 	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1099 	return entries > rt_max_size;
1100 }
1101 
1102 /* Clean host part of a prefix. Not necessary in radix tree,
1103    but results in cleaner routing tables.
1104 
1105    Remove it only when all the things will work!
1106  */
1107 
1108 int ip6_dst_hoplimit(struct dst_entry *dst)
1109 {
1110 	int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1111 	if (hoplimit == 0) {
1112 		struct net_device *dev = dst->dev;
1113 		struct inet6_dev *idev;
1114 
1115 		rcu_read_lock();
1116 		idev = __in6_dev_get(dev);
1117 		if (idev)
1118 			hoplimit = idev->cnf.hop_limit;
1119 		else
1120 			hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1121 		rcu_read_unlock();
1122 	}
1123 	return hoplimit;
1124 }
1125 EXPORT_SYMBOL(ip6_dst_hoplimit);
1126 
1127 /*
1128  *
1129  */
1130 
1131 int ip6_route_add(struct fib6_config *cfg)
1132 {
1133 	int err;
1134 	struct net *net = cfg->fc_nlinfo.nl_net;
1135 	struct rt6_info *rt = NULL;
1136 	struct net_device *dev = NULL;
1137 	struct inet6_dev *idev = NULL;
1138 	struct fib6_table *table;
1139 	int addr_type;
1140 
1141 	if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1142 		return -EINVAL;
1143 #ifndef CONFIG_IPV6_SUBTREES
1144 	if (cfg->fc_src_len)
1145 		return -EINVAL;
1146 #endif
1147 	if (cfg->fc_ifindex) {
1148 		err = -ENODEV;
1149 		dev = dev_get_by_index(net, cfg->fc_ifindex);
1150 		if (!dev)
1151 			goto out;
1152 		idev = in6_dev_get(dev);
1153 		if (!idev)
1154 			goto out;
1155 	}
1156 
1157 	if (cfg->fc_metric == 0)
1158 		cfg->fc_metric = IP6_RT_PRIO_USER;
1159 
1160 	table = fib6_new_table(net, cfg->fc_table);
1161 	if (table == NULL) {
1162 		err = -ENOBUFS;
1163 		goto out;
1164 	}
1165 
1166 	rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1167 
1168 	if (rt == NULL) {
1169 		err = -ENOMEM;
1170 		goto out;
1171 	}
1172 
1173 	rt->dst.obsolete = -1;
1174 	rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
1175 				jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1176 				0;
1177 
1178 	if (cfg->fc_protocol == RTPROT_UNSPEC)
1179 		cfg->fc_protocol = RTPROT_BOOT;
1180 	rt->rt6i_protocol = cfg->fc_protocol;
1181 
1182 	addr_type = ipv6_addr_type(&cfg->fc_dst);
1183 
1184 	if (addr_type & IPV6_ADDR_MULTICAST)
1185 		rt->dst.input = ip6_mc_input;
1186 	else if (cfg->fc_flags & RTF_LOCAL)
1187 		rt->dst.input = ip6_input;
1188 	else
1189 		rt->dst.input = ip6_forward;
1190 
1191 	rt->dst.output = ip6_output;
1192 
1193 	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1194 	rt->rt6i_dst.plen = cfg->fc_dst_len;
1195 	if (rt->rt6i_dst.plen == 128)
1196 	       rt->dst.flags = DST_HOST;
1197 
1198 #ifdef CONFIG_IPV6_SUBTREES
1199 	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1200 	rt->rt6i_src.plen = cfg->fc_src_len;
1201 #endif
1202 
1203 	rt->rt6i_metric = cfg->fc_metric;
1204 
1205 	/* We cannot add true routes via loopback here,
1206 	   they would result in kernel looping; promote them to reject routes
1207 	 */
1208 	if ((cfg->fc_flags & RTF_REJECT) ||
1209 	    (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK)
1210 					      && !(cfg->fc_flags&RTF_LOCAL))) {
1211 		/* hold loopback dev/idev if we haven't done so. */
1212 		if (dev != net->loopback_dev) {
1213 			if (dev) {
1214 				dev_put(dev);
1215 				in6_dev_put(idev);
1216 			}
1217 			dev = net->loopback_dev;
1218 			dev_hold(dev);
1219 			idev = in6_dev_get(dev);
1220 			if (!idev) {
1221 				err = -ENODEV;
1222 				goto out;
1223 			}
1224 		}
1225 		rt->dst.output = ip6_pkt_discard_out;
1226 		rt->dst.input = ip6_pkt_discard;
1227 		rt->dst.error = -ENETUNREACH;
1228 		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1229 		goto install_route;
1230 	}
1231 
1232 	if (cfg->fc_flags & RTF_GATEWAY) {
1233 		struct in6_addr *gw_addr;
1234 		int gwa_type;
1235 
1236 		gw_addr = &cfg->fc_gateway;
1237 		ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1238 		gwa_type = ipv6_addr_type(gw_addr);
1239 
1240 		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1241 			struct rt6_info *grt;
1242 
1243 			/* IPv6 strictly inhibits using not link-local
1244 			   addresses as nexthop address.
1245 			   Otherwise, router will not able to send redirects.
1246 			   It is very good, but in some (rare!) circumstances
1247 			   (SIT, PtP, NBMA NOARP links) it is handy to allow
1248 			   some exceptions. --ANK
1249 			 */
1250 			err = -EINVAL;
1251 			if (!(gwa_type&IPV6_ADDR_UNICAST))
1252 				goto out;
1253 
1254 			grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1255 
1256 			err = -EHOSTUNREACH;
1257 			if (grt == NULL)
1258 				goto out;
1259 			if (dev) {
1260 				if (dev != grt->rt6i_dev) {
1261 					dst_release(&grt->dst);
1262 					goto out;
1263 				}
1264 			} else {
1265 				dev = grt->rt6i_dev;
1266 				idev = grt->rt6i_idev;
1267 				dev_hold(dev);
1268 				in6_dev_hold(grt->rt6i_idev);
1269 			}
1270 			if (!(grt->rt6i_flags&RTF_GATEWAY))
1271 				err = 0;
1272 			dst_release(&grt->dst);
1273 
1274 			if (err)
1275 				goto out;
1276 		}
1277 		err = -EINVAL;
1278 		if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1279 			goto out;
1280 	}
1281 
1282 	err = -ENODEV;
1283 	if (dev == NULL)
1284 		goto out;
1285 
1286 	if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1287 		rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1288 		if (IS_ERR(rt->rt6i_nexthop)) {
1289 			err = PTR_ERR(rt->rt6i_nexthop);
1290 			rt->rt6i_nexthop = NULL;
1291 			goto out;
1292 		}
1293 	}
1294 
1295 	rt->rt6i_flags = cfg->fc_flags;
1296 
1297 install_route:
1298 	if (cfg->fc_mx) {
1299 		struct nlattr *nla;
1300 		int remaining;
1301 
1302 		nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1303 			int type = nla_type(nla);
1304 
1305 			if (type) {
1306 				if (type > RTAX_MAX) {
1307 					err = -EINVAL;
1308 					goto out;
1309 				}
1310 
1311 				dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1312 			}
1313 		}
1314 	}
1315 
1316 	rt->dst.dev = dev;
1317 	rt->rt6i_idev = idev;
1318 	rt->rt6i_table = table;
1319 
1320 	cfg->fc_nlinfo.nl_net = dev_net(dev);
1321 
1322 	return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1323 
1324 out:
1325 	if (dev)
1326 		dev_put(dev);
1327 	if (idev)
1328 		in6_dev_put(idev);
1329 	if (rt)
1330 		dst_free(&rt->dst);
1331 	return err;
1332 }
1333 
1334 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1335 {
1336 	int err;
1337 	struct fib6_table *table;
1338 	struct net *net = dev_net(rt->rt6i_dev);
1339 
1340 	if (rt == net->ipv6.ip6_null_entry)
1341 		return -ENOENT;
1342 
1343 	table = rt->rt6i_table;
1344 	write_lock_bh(&table->tb6_lock);
1345 
1346 	err = fib6_del(rt, info);
1347 	dst_release(&rt->dst);
1348 
1349 	write_unlock_bh(&table->tb6_lock);
1350 
1351 	return err;
1352 }
1353 
1354 int ip6_del_rt(struct rt6_info *rt)
1355 {
1356 	struct nl_info info = {
1357 		.nl_net = dev_net(rt->rt6i_dev),
1358 	};
1359 	return __ip6_del_rt(rt, &info);
1360 }
1361 
1362 static int ip6_route_del(struct fib6_config *cfg)
1363 {
1364 	struct fib6_table *table;
1365 	struct fib6_node *fn;
1366 	struct rt6_info *rt;
1367 	int err = -ESRCH;
1368 
1369 	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1370 	if (table == NULL)
1371 		return err;
1372 
1373 	read_lock_bh(&table->tb6_lock);
1374 
1375 	fn = fib6_locate(&table->tb6_root,
1376 			 &cfg->fc_dst, cfg->fc_dst_len,
1377 			 &cfg->fc_src, cfg->fc_src_len);
1378 
1379 	if (fn) {
1380 		for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1381 			if (cfg->fc_ifindex &&
1382 			    (rt->rt6i_dev == NULL ||
1383 			     rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1384 				continue;
1385 			if (cfg->fc_flags & RTF_GATEWAY &&
1386 			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1387 				continue;
1388 			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1389 				continue;
1390 			dst_hold(&rt->dst);
1391 			read_unlock_bh(&table->tb6_lock);
1392 
1393 			return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1394 		}
1395 	}
1396 	read_unlock_bh(&table->tb6_lock);
1397 
1398 	return err;
1399 }
1400 
1401 /*
1402  *	Handle redirects
1403  */
/*
 * Flow key used for redirect lookups: a plain struct flowi extended
 * with the address of the gateway the redirect is checked against.
 * 'fl' has to stay the first member, because the lookup code in this
 * file casts between struct flowi * and struct ip6rd_flowi *.
 */
struct ip6rd_flowi {
	struct flowi fl;		/* regular flow description */
	struct in6_addr gateway;	/* gateway to match routes against */
};
1408 
1409 static struct rt6_info *__ip6_route_redirect(struct net *net,
1410 					     struct fib6_table *table,
1411 					     struct flowi *fl,
1412 					     int flags)
1413 {
1414 	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl;
1415 	struct rt6_info *rt;
1416 	struct fib6_node *fn;
1417 
1418 	/*
1419 	 * Get the "current" route for this destination and
1420 	 * check if the redirect has come from approriate router.
1421 	 *
1422 	 * RFC 2461 specifies that redirects should only be
1423 	 * accepted if they come from the nexthop to the target.
1424 	 * Due to the way the routes are chosen, this notion
1425 	 * is a bit fuzzy and one might need to check all possible
1426 	 * routes.
1427 	 */
1428 
1429 	read_lock_bh(&table->tb6_lock);
1430 	fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
1431 restart:
1432 	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1433 		/*
1434 		 * Current route is on-link; redirect is always invalid.
1435 		 *
1436 		 * Seems, previous statement is not true. It could
1437 		 * be node, which looks for us as on-link (f.e. proxy ndisc)
1438 		 * But then router serving it might decide, that we should
1439 		 * know truth 8)8) --ANK (980726).
1440 		 */
1441 		if (rt6_check_expired(rt))
1442 			continue;
1443 		if (!(rt->rt6i_flags & RTF_GATEWAY))
1444 			continue;
1445 		if (fl->oif != rt->rt6i_dev->ifindex)
1446 			continue;
1447 		if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1448 			continue;
1449 		break;
1450 	}
1451 
1452 	if (!rt)
1453 		rt = net->ipv6.ip6_null_entry;
1454 	BACKTRACK(net, &fl->fl6_src);
1455 out:
1456 	dst_hold(&rt->dst);
1457 
1458 	read_unlock_bh(&table->tb6_lock);
1459 
1460 	return rt;
1461 };
1462 
1463 static struct rt6_info *ip6_route_redirect(struct in6_addr *dest,
1464 					   struct in6_addr *src,
1465 					   struct in6_addr *gateway,
1466 					   struct net_device *dev)
1467 {
1468 	int flags = RT6_LOOKUP_F_HAS_SADDR;
1469 	struct net *net = dev_net(dev);
1470 	struct ip6rd_flowi rdfl = {
1471 		.fl = {
1472 			.oif = dev->ifindex,
1473 			.fl6_dst = *dest,
1474 			.fl6_src = *src,
1475 		},
1476 	};
1477 
1478 	ipv6_addr_copy(&rdfl.gateway, gateway);
1479 
1480 	if (rt6_need_strict(dest))
1481 		flags |= RT6_LOOKUP_F_IFACE;
1482 
1483 	return (struct rt6_info *)fib6_rule_lookup(net, (struct flowi *)&rdfl,
1484 						   flags, __ip6_route_redirect);
1485 }
1486 
1487 void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
1488 		  struct in6_addr *saddr,
1489 		  struct neighbour *neigh, u8 *lladdr, int on_link)
1490 {
1491 	struct rt6_info *rt, *nrt = NULL;
1492 	struct netevent_redirect netevent;
1493 	struct net *net = dev_net(neigh->dev);
1494 
1495 	rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1496 
1497 	if (rt == net->ipv6.ip6_null_entry) {
1498 		if (net_ratelimit())
1499 			printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1500 			       "for redirect target\n");
1501 		goto out;
1502 	}
1503 
1504 	/*
1505 	 *	We have finally decided to accept it.
1506 	 */
1507 
1508 	neigh_update(neigh, lladdr, NUD_STALE,
1509 		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
1510 		     NEIGH_UPDATE_F_OVERRIDE|
1511 		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1512 				     NEIGH_UPDATE_F_ISROUTER))
1513 		     );
1514 
1515 	/*
1516 	 * Redirect received -> path was valid.
1517 	 * Look, redirects are sent only in response to data packets,
1518 	 * so that this nexthop apparently is reachable. --ANK
1519 	 */
1520 	dst_confirm(&rt->dst);
1521 
1522 	/* Duplicate redirect: silently ignore. */
1523 	if (neigh == rt->dst.neighbour)
1524 		goto out;
1525 
1526 	nrt = ip6_rt_copy(rt);
1527 	if (nrt == NULL)
1528 		goto out;
1529 
1530 	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1531 	if (on_link)
1532 		nrt->rt6i_flags &= ~RTF_GATEWAY;
1533 
1534 	ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1535 	nrt->rt6i_dst.plen = 128;
1536 	nrt->dst.flags |= DST_HOST;
1537 
1538 	ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1539 	nrt->rt6i_nexthop = neigh_clone(neigh);
1540 
1541 	if (ip6_ins_rt(nrt))
1542 		goto out;
1543 
1544 	netevent.old = &rt->dst;
1545 	netevent.new = &nrt->dst;
1546 	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1547 
1548 	if (rt->rt6i_flags&RTF_CACHE) {
1549 		ip6_del_rt(rt);
1550 		return;
1551 	}
1552 
1553 out:
1554 	dst_release(&rt->dst);
1555 }
1556 
1557 /*
1558  *	Handle ICMP "packet too big" messages
1559  *	i.e. Path MTU discovery
1560  */
1561 
1562 static void rt6_do_pmtu_disc(struct in6_addr *daddr, struct in6_addr *saddr,
1563 			     struct net *net, u32 pmtu, int ifindex)
1564 {
1565 	struct rt6_info *rt, *nrt;
1566 	int allfrag = 0;
1567 again:
1568 	rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1569 	if (rt == NULL)
1570 		return;
1571 
1572 	if (rt6_check_expired(rt)) {
1573 		ip6_del_rt(rt);
1574 		goto again;
1575 	}
1576 
1577 	if (pmtu >= dst_mtu(&rt->dst))
1578 		goto out;
1579 
1580 	if (pmtu < IPV6_MIN_MTU) {
1581 		/*
1582 		 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1583 		 * MTU (1280) and a fragment header should always be included
1584 		 * after a node receiving Too Big message reporting PMTU is
1585 		 * less than the IPv6 Minimum Link MTU.
1586 		 */
1587 		pmtu = IPV6_MIN_MTU;
1588 		allfrag = 1;
1589 	}
1590 
1591 	/* New mtu received -> path was valid.
1592 	   They are sent only in response to data packets,
1593 	   so that this nexthop apparently is reachable. --ANK
1594 	 */
1595 	dst_confirm(&rt->dst);
1596 
1597 	/* Host route. If it is static, it would be better
1598 	   not to override it, but add new one, so that
1599 	   when cache entry will expire old pmtu
1600 	   would return automatically.
1601 	 */
1602 	if (rt->rt6i_flags & RTF_CACHE) {
1603 		dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
1604 		if (allfrag) {
1605 			u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1606 			features |= RTAX_FEATURE_ALLFRAG;
1607 			dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1608 		}
1609 		dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1610 		rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1611 		goto out;
1612 	}
1613 
1614 	/* Network route.
1615 	   Two cases are possible:
1616 	   1. It is connected route. Action: COW
1617 	   2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1618 	 */
1619 	if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1620 		nrt = rt6_alloc_cow(rt, daddr, saddr);
1621 	else
1622 		nrt = rt6_alloc_clone(rt, daddr);
1623 
1624 	if (nrt) {
1625 		dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1626 		if (allfrag) {
1627 			u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1628 			features |= RTAX_FEATURE_ALLFRAG;
1629 			dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
1630 		}
1631 
1632 		/* According to RFC 1981, detecting PMTU increase shouldn't be
1633 		 * happened within 5 mins, the recommended timer is 10 mins.
1634 		 * Here this route expiration time is set to ip6_rt_mtu_expires
1635 		 * which is 10 mins. After 10 mins the decreased pmtu is expired
1636 		 * and detecting PMTU increase will be automatically happened.
1637 		 */
1638 		dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1639 		nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1640 
1641 		ip6_ins_rt(nrt);
1642 	}
1643 out:
1644 	dst_release(&rt->dst);
1645 }
1646 
1647 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1648 			struct net_device *dev, u32 pmtu)
1649 {
1650 	struct net *net = dev_net(dev);
1651 
1652 	/*
1653 	 * RFC 1981 states that a node "MUST reduce the size of the packets it
1654 	 * is sending along the path" that caused the Packet Too Big message.
1655 	 * Since it's not possible in the general case to determine which
1656 	 * interface was used to send the original packet, we update the MTU
1657 	 * on the interface that will be used to send future packets. We also
1658 	 * update the MTU on the interface that received the Packet Too Big in
1659 	 * case the original packet was forced out that interface with
1660 	 * SO_BINDTODEVICE or similar. This is the next best thing to the
1661 	 * correct behaviour, which would be to update the MTU on all
1662 	 * interfaces.
1663 	 */
1664 	rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1665 	rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1666 }
1667 
1668 /*
1669  *	Misc support functions
1670  */
1671 
1672 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1673 {
1674 	struct net *net = dev_net(ort->rt6i_dev);
1675 	struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1676 
1677 	if (rt) {
1678 		rt->dst.input = ort->dst.input;
1679 		rt->dst.output = ort->dst.output;
1680 
1681 		dst_copy_metrics(&rt->dst, &ort->dst);
1682 		rt->dst.error = ort->dst.error;
1683 		rt->dst.dev = ort->dst.dev;
1684 		if (rt->dst.dev)
1685 			dev_hold(rt->dst.dev);
1686 		rt->rt6i_idev = ort->rt6i_idev;
1687 		if (rt->rt6i_idev)
1688 			in6_dev_hold(rt->rt6i_idev);
1689 		rt->dst.lastuse = jiffies;
1690 		rt->rt6i_expires = 0;
1691 
1692 		ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1693 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1694 		rt->rt6i_metric = 0;
1695 
1696 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1697 #ifdef CONFIG_IPV6_SUBTREES
1698 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1699 #endif
1700 		rt->rt6i_table = ort->rt6i_table;
1701 	}
1702 	return rt;
1703 }
1704 
1705 #ifdef CONFIG_IPV6_ROUTE_INFO
1706 static struct rt6_info *rt6_get_route_info(struct net *net,
1707 					   struct in6_addr *prefix, int prefixlen,
1708 					   struct in6_addr *gwaddr, int ifindex)
1709 {
1710 	struct fib6_node *fn;
1711 	struct rt6_info *rt = NULL;
1712 	struct fib6_table *table;
1713 
1714 	table = fib6_get_table(net, RT6_TABLE_INFO);
1715 	if (table == NULL)
1716 		return NULL;
1717 
1718 	write_lock_bh(&table->tb6_lock);
1719 	fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1720 	if (!fn)
1721 		goto out;
1722 
1723 	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1724 		if (rt->rt6i_dev->ifindex != ifindex)
1725 			continue;
1726 		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1727 			continue;
1728 		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1729 			continue;
1730 		dst_hold(&rt->dst);
1731 		break;
1732 	}
1733 out:
1734 	write_unlock_bh(&table->tb6_lock);
1735 	return rt;
1736 }
1737 
1738 static struct rt6_info *rt6_add_route_info(struct net *net,
1739 					   struct in6_addr *prefix, int prefixlen,
1740 					   struct in6_addr *gwaddr, int ifindex,
1741 					   unsigned pref)
1742 {
1743 	struct fib6_config cfg = {
1744 		.fc_table	= RT6_TABLE_INFO,
1745 		.fc_metric	= IP6_RT_PRIO_USER,
1746 		.fc_ifindex	= ifindex,
1747 		.fc_dst_len	= prefixlen,
1748 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1749 				  RTF_UP | RTF_PREF(pref),
1750 		.fc_nlinfo.pid = 0,
1751 		.fc_nlinfo.nlh = NULL,
1752 		.fc_nlinfo.nl_net = net,
1753 	};
1754 
1755 	ipv6_addr_copy(&cfg.fc_dst, prefix);
1756 	ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1757 
1758 	/* We should treat it as a default route if prefix length is 0. */
1759 	if (!prefixlen)
1760 		cfg.fc_flags |= RTF_DEFAULT;
1761 
1762 	ip6_route_add(&cfg);
1763 
1764 	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1765 }
1766 #endif
1767 
1768 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1769 {
1770 	struct rt6_info *rt;
1771 	struct fib6_table *table;
1772 
1773 	table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1774 	if (table == NULL)
1775 		return NULL;
1776 
1777 	write_lock_bh(&table->tb6_lock);
1778 	for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1779 		if (dev == rt->rt6i_dev &&
1780 		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1781 		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
1782 			break;
1783 	}
1784 	if (rt)
1785 		dst_hold(&rt->dst);
1786 	write_unlock_bh(&table->tb6_lock);
1787 	return rt;
1788 }
1789 
1790 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1791 				     struct net_device *dev,
1792 				     unsigned int pref)
1793 {
1794 	struct fib6_config cfg = {
1795 		.fc_table	= RT6_TABLE_DFLT,
1796 		.fc_metric	= IP6_RT_PRIO_USER,
1797 		.fc_ifindex	= dev->ifindex,
1798 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1799 				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1800 		.fc_nlinfo.pid = 0,
1801 		.fc_nlinfo.nlh = NULL,
1802 		.fc_nlinfo.nl_net = dev_net(dev),
1803 	};
1804 
1805 	ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1806 
1807 	ip6_route_add(&cfg);
1808 
1809 	return rt6_get_dflt_router(gwaddr, dev);
1810 }
1811 
1812 void rt6_purge_dflt_routers(struct net *net)
1813 {
1814 	struct rt6_info *rt;
1815 	struct fib6_table *table;
1816 
1817 	/* NOTE: Keep consistent with rt6_get_dflt_router */
1818 	table = fib6_get_table(net, RT6_TABLE_DFLT);
1819 	if (table == NULL)
1820 		return;
1821 
1822 restart:
1823 	read_lock_bh(&table->tb6_lock);
1824 	for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1825 		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1826 			dst_hold(&rt->dst);
1827 			read_unlock_bh(&table->tb6_lock);
1828 			ip6_del_rt(rt);
1829 			goto restart;
1830 		}
1831 	}
1832 	read_unlock_bh(&table->tb6_lock);
1833 }
1834 
1835 static void rtmsg_to_fib6_config(struct net *net,
1836 				 struct in6_rtmsg *rtmsg,
1837 				 struct fib6_config *cfg)
1838 {
1839 	memset(cfg, 0, sizeof(*cfg));
1840 
1841 	cfg->fc_table = RT6_TABLE_MAIN;
1842 	cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1843 	cfg->fc_metric = rtmsg->rtmsg_metric;
1844 	cfg->fc_expires = rtmsg->rtmsg_info;
1845 	cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1846 	cfg->fc_src_len = rtmsg->rtmsg_src_len;
1847 	cfg->fc_flags = rtmsg->rtmsg_flags;
1848 
1849 	cfg->fc_nlinfo.nl_net = net;
1850 
1851 	ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1852 	ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1853 	ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1854 }
1855 
1856 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1857 {
1858 	struct fib6_config cfg;
1859 	struct in6_rtmsg rtmsg;
1860 	int err;
1861 
1862 	switch(cmd) {
1863 	case SIOCADDRT:		/* Add a route */
1864 	case SIOCDELRT:		/* Delete a route */
1865 		if (!capable(CAP_NET_ADMIN))
1866 			return -EPERM;
1867 		err = copy_from_user(&rtmsg, arg,
1868 				     sizeof(struct in6_rtmsg));
1869 		if (err)
1870 			return -EFAULT;
1871 
1872 		rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1873 
1874 		rtnl_lock();
1875 		switch (cmd) {
1876 		case SIOCADDRT:
1877 			err = ip6_route_add(&cfg);
1878 			break;
1879 		case SIOCDELRT:
1880 			err = ip6_route_del(&cfg);
1881 			break;
1882 		default:
1883 			err = -EINVAL;
1884 		}
1885 		rtnl_unlock();
1886 
1887 		return err;
1888 	}
1889 
1890 	return -EINVAL;
1891 }
1892 
1893 /*
1894  *	Drop the packet on the floor
1895  */
1896 
1897 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
1898 {
1899 	int type;
1900 	struct dst_entry *dst = skb_dst(skb);
1901 	switch (ipstats_mib_noroutes) {
1902 	case IPSTATS_MIB_INNOROUTES:
1903 		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1904 		if (type == IPV6_ADDR_ANY) {
1905 			IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1906 				      IPSTATS_MIB_INADDRERRORS);
1907 			break;
1908 		}
1909 		/* FALLTHROUGH */
1910 	case IPSTATS_MIB_OUTNOROUTES:
1911 		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1912 			      ipstats_mib_noroutes);
1913 		break;
1914 	}
1915 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
1916 	kfree_skb(skb);
1917 	return 0;
1918 }
1919 
1920 static int ip6_pkt_discard(struct sk_buff *skb)
1921 {
1922 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
1923 }
1924 
1925 static int ip6_pkt_discard_out(struct sk_buff *skb)
1926 {
1927 	skb->dev = skb_dst(skb)->dev;
1928 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
1929 }
1930 
1931 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1932 
1933 static int ip6_pkt_prohibit(struct sk_buff *skb)
1934 {
1935 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
1936 }
1937 
1938 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
1939 {
1940 	skb->dev = skb_dst(skb)->dev;
1941 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
1942 }
1943 
1944 #endif
1945 
1946 /*
1947  *	Allocate a dst for local (unicast / anycast) address.
1948  */
1949 
1950 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1951 				    const struct in6_addr *addr,
1952 				    int anycast)
1953 {
1954 	struct net *net = dev_net(idev->dev);
1955 	struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1956 	struct neighbour *neigh;
1957 
1958 	if (rt == NULL) {
1959 		if (net_ratelimit())
1960 			pr_warning("IPv6:  Maximum number of routes reached,"
1961 				   " consider increasing route/max_size.\n");
1962 		return ERR_PTR(-ENOMEM);
1963 	}
1964 
1965 	dev_hold(net->loopback_dev);
1966 	in6_dev_hold(idev);
1967 
1968 	rt->dst.flags = DST_HOST;
1969 	rt->dst.input = ip6_input;
1970 	rt->dst.output = ip6_output;
1971 	rt->rt6i_dev = net->loopback_dev;
1972 	rt->rt6i_idev = idev;
1973 	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, -1);
1974 	rt->dst.obsolete = -1;
1975 
1976 	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1977 	if (anycast)
1978 		rt->rt6i_flags |= RTF_ANYCAST;
1979 	else
1980 		rt->rt6i_flags |= RTF_LOCAL;
1981 	neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1982 	if (IS_ERR(neigh)) {
1983 		dst_free(&rt->dst);
1984 
1985 		/* We are casting this because that is the return
1986 		 * value type.  But an errno encoded pointer is the
1987 		 * same regardless of the underlying pointer type,
1988 		 * and that's what we are returning.  So this is OK.
1989 		 */
1990 		return (struct rt6_info *) neigh;
1991 	}
1992 	rt->rt6i_nexthop = neigh;
1993 
1994 	ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1995 	rt->rt6i_dst.plen = 128;
1996 	rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
1997 
1998 	atomic_set(&rt->dst.__refcnt, 1);
1999 
2000 	return rt;
2001 }
2002 
/*
 * Argument bundle handed to fib6_ifdown() through the route walkers:
 * the device whose routes go away and the namespace being walked.
 */
struct arg_dev_net {
	struct net_device *dev;	/* device to match; NULL matches every route */
	struct net *net;	/* namespace whose ip6_null_entry is spared */
};
2007 
2008 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2009 {
2010 	const struct arg_dev_net *adn = arg;
2011 	const struct net_device *dev = adn->dev;
2012 
2013 	if ((rt->rt6i_dev == dev || dev == NULL) &&
2014 	    rt != adn->net->ipv6.ip6_null_entry) {
2015 		RT6_TRACE("deleted by ifdown %p\n", rt);
2016 		return -1;
2017 	}
2018 	return 0;
2019 }
2020 
2021 void rt6_ifdown(struct net *net, struct net_device *dev)
2022 {
2023 	struct arg_dev_net adn = {
2024 		.dev = dev,
2025 		.net = net,
2026 	};
2027 
2028 	fib6_clean_all(net, fib6_ifdown, 0, &adn);
2029 	icmp6_clean_all(fib6_ifdown, &adn);
2030 }
2031 
/*
 * Argument bundle for rt6_mtu_change_route(): the device whose MTU
 * changed and the new MTU value.
 */
struct rt6_mtu_change_arg
{
	struct net_device *dev;	/* device whose MTU changed */
	unsigned mtu;		/* new MTU of that device */
};
2037 
2038 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2039 {
2040 	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2041 	struct inet6_dev *idev;
2042 
2043 	/* In IPv6 pmtu discovery is not optional,
2044 	   so that RTAX_MTU lock cannot disable it.
2045 	   We still use this lock to block changes
2046 	   caused by addrconf/ndisc.
2047 	*/
2048 
2049 	idev = __in6_dev_get(arg->dev);
2050 	if (idev == NULL)
2051 		return 0;
2052 
2053 	/* For administrative MTU increase, there is no way to discover
2054 	   IPv6 PMTU increase, so PMTU increase should be updated here.
2055 	   Since RFC 1981 doesn't include administrative MTU increase
2056 	   update PMTU increase is a MUST. (i.e. jumbo frame)
2057 	 */
2058 	/*
2059 	   If new MTU is less than route PMTU, this new MTU will be the
2060 	   lowest MTU in the path, update the route PMTU to reflect PMTU
2061 	   decreases; if new MTU is greater than route PMTU, and the
2062 	   old MTU is the lowest MTU in the path, update the route PMTU
2063 	   to reflect the increase. In this case if the other nodes' MTU
2064 	   also have the lowest MTU, TOO BIG MESSAGE will be lead to
2065 	   PMTU discouvery.
2066 	 */
2067 	if (rt->rt6i_dev == arg->dev &&
2068 	    !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2069 	    (dst_mtu(&rt->dst) >= arg->mtu ||
2070 	     (dst_mtu(&rt->dst) < arg->mtu &&
2071 	      dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2072 		dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2073 	}
2074 	return 0;
2075 }
2076 
2077 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2078 {
2079 	struct rt6_mtu_change_arg arg = {
2080 		.dev = dev,
2081 		.mtu = mtu,
2082 	};
2083 
2084 	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2085 }
2086 
2087 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2088 	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2089 	[RTA_OIF]               = { .type = NLA_U32 },
2090 	[RTA_IIF]		= { .type = NLA_U32 },
2091 	[RTA_PRIORITY]          = { .type = NLA_U32 },
2092 	[RTA_METRICS]           = { .type = NLA_NESTED },
2093 };
2094 
2095 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2096 			      struct fib6_config *cfg)
2097 {
2098 	struct rtmsg *rtm;
2099 	struct nlattr *tb[RTA_MAX+1];
2100 	int err;
2101 
2102 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2103 	if (err < 0)
2104 		goto errout;
2105 
2106 	err = -EINVAL;
2107 	rtm = nlmsg_data(nlh);
2108 	memset(cfg, 0, sizeof(*cfg));
2109 
2110 	cfg->fc_table = rtm->rtm_table;
2111 	cfg->fc_dst_len = rtm->rtm_dst_len;
2112 	cfg->fc_src_len = rtm->rtm_src_len;
2113 	cfg->fc_flags = RTF_UP;
2114 	cfg->fc_protocol = rtm->rtm_protocol;
2115 
2116 	if (rtm->rtm_type == RTN_UNREACHABLE)
2117 		cfg->fc_flags |= RTF_REJECT;
2118 
2119 	if (rtm->rtm_type == RTN_LOCAL)
2120 		cfg->fc_flags |= RTF_LOCAL;
2121 
2122 	cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2123 	cfg->fc_nlinfo.nlh = nlh;
2124 	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2125 
2126 	if (tb[RTA_GATEWAY]) {
2127 		nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2128 		cfg->fc_flags |= RTF_GATEWAY;
2129 	}
2130 
2131 	if (tb[RTA_DST]) {
2132 		int plen = (rtm->rtm_dst_len + 7) >> 3;
2133 
2134 		if (nla_len(tb[RTA_DST]) < plen)
2135 			goto errout;
2136 
2137 		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2138 	}
2139 
2140 	if (tb[RTA_SRC]) {
2141 		int plen = (rtm->rtm_src_len + 7) >> 3;
2142 
2143 		if (nla_len(tb[RTA_SRC]) < plen)
2144 			goto errout;
2145 
2146 		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2147 	}
2148 
2149 	if (tb[RTA_OIF])
2150 		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2151 
2152 	if (tb[RTA_PRIORITY])
2153 		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2154 
2155 	if (tb[RTA_METRICS]) {
2156 		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2157 		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2158 	}
2159 
2160 	if (tb[RTA_TABLE])
2161 		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2162 
2163 	err = 0;
2164 errout:
2165 	return err;
2166 }
2167 
2168 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2169 {
2170 	struct fib6_config cfg;
2171 	int err;
2172 
2173 	err = rtm_to_fib6_config(skb, nlh, &cfg);
2174 	if (err < 0)
2175 		return err;
2176 
2177 	return ip6_route_del(&cfg);
2178 }
2179 
2180 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2181 {
2182 	struct fib6_config cfg;
2183 	int err;
2184 
2185 	err = rtm_to_fib6_config(skb, nlh, &cfg);
2186 	if (err < 0)
2187 		return err;
2188 
2189 	return ip6_route_add(&cfg);
2190 }
2191 
2192 static inline size_t rt6_nlmsg_size(void)
2193 {
2194 	return NLMSG_ALIGN(sizeof(struct rtmsg))
2195 	       + nla_total_size(16) /* RTA_SRC */
2196 	       + nla_total_size(16) /* RTA_DST */
2197 	       + nla_total_size(16) /* RTA_GATEWAY */
2198 	       + nla_total_size(16) /* RTA_PREFSRC */
2199 	       + nla_total_size(4) /* RTA_TABLE */
2200 	       + nla_total_size(4) /* RTA_IIF */
2201 	       + nla_total_size(4) /* RTA_OIF */
2202 	       + nla_total_size(4) /* RTA_PRIORITY */
2203 	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2204 	       + nla_total_size(sizeof(struct rta_cacheinfo));
2205 }
2206 
2207 static int rt6_fill_node(struct net *net,
2208 			 struct sk_buff *skb, struct rt6_info *rt,
2209 			 struct in6_addr *dst, struct in6_addr *src,
2210 			 int iif, int type, u32 pid, u32 seq,
2211 			 int prefix, int nowait, unsigned int flags)
2212 {
2213 	struct rtmsg *rtm;
2214 	struct nlmsghdr *nlh;
2215 	long expires;
2216 	u32 table;
2217 
2218 	if (prefix) {	/* user wants prefix routes only */
2219 		if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2220 			/* success since this is not a prefix route */
2221 			return 1;
2222 		}
2223 	}
2224 
2225 	nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2226 	if (nlh == NULL)
2227 		return -EMSGSIZE;
2228 
2229 	rtm = nlmsg_data(nlh);
2230 	rtm->rtm_family = AF_INET6;
2231 	rtm->rtm_dst_len = rt->rt6i_dst.plen;
2232 	rtm->rtm_src_len = rt->rt6i_src.plen;
2233 	rtm->rtm_tos = 0;
2234 	if (rt->rt6i_table)
2235 		table = rt->rt6i_table->tb6_id;
2236 	else
2237 		table = RT6_TABLE_UNSPEC;
2238 	rtm->rtm_table = table;
2239 	NLA_PUT_U32(skb, RTA_TABLE, table);
2240 	if (rt->rt6i_flags&RTF_REJECT)
2241 		rtm->rtm_type = RTN_UNREACHABLE;
2242 	else if (rt->rt6i_flags&RTF_LOCAL)
2243 		rtm->rtm_type = RTN_LOCAL;
2244 	else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2245 		rtm->rtm_type = RTN_LOCAL;
2246 	else
2247 		rtm->rtm_type = RTN_UNICAST;
2248 	rtm->rtm_flags = 0;
2249 	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2250 	rtm->rtm_protocol = rt->rt6i_protocol;
2251 	if (rt->rt6i_flags&RTF_DYNAMIC)
2252 		rtm->rtm_protocol = RTPROT_REDIRECT;
2253 	else if (rt->rt6i_flags & RTF_ADDRCONF)
2254 		rtm->rtm_protocol = RTPROT_KERNEL;
2255 	else if (rt->rt6i_flags&RTF_DEFAULT)
2256 		rtm->rtm_protocol = RTPROT_RA;
2257 
2258 	if (rt->rt6i_flags&RTF_CACHE)
2259 		rtm->rtm_flags |= RTM_F_CLONED;
2260 
2261 	if (dst) {
2262 		NLA_PUT(skb, RTA_DST, 16, dst);
2263 		rtm->rtm_dst_len = 128;
2264 	} else if (rtm->rtm_dst_len)
2265 		NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2266 #ifdef CONFIG_IPV6_SUBTREES
2267 	if (src) {
2268 		NLA_PUT(skb, RTA_SRC, 16, src);
2269 		rtm->rtm_src_len = 128;
2270 	} else if (rtm->rtm_src_len)
2271 		NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2272 #endif
2273 	if (iif) {
2274 #ifdef CONFIG_IPV6_MROUTE
2275 		if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2276 			int err = ip6mr_get_route(net, skb, rtm, nowait);
2277 			if (err <= 0) {
2278 				if (!nowait) {
2279 					if (err == 0)
2280 						return 0;
2281 					goto nla_put_failure;
2282 				} else {
2283 					if (err == -EMSGSIZE)
2284 						goto nla_put_failure;
2285 				}
2286 			}
2287 		} else
2288 #endif
2289 			NLA_PUT_U32(skb, RTA_IIF, iif);
2290 	} else if (dst) {
2291 		struct inet6_dev *idev = ip6_dst_idev(&rt->dst);
2292 		struct in6_addr saddr_buf;
2293 		if (ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2294 				       dst, 0, &saddr_buf) == 0)
2295 			NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2296 	}
2297 
2298 	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2299 		goto nla_put_failure;
2300 
2301 	if (rt->dst.neighbour)
2302 		NLA_PUT(skb, RTA_GATEWAY, 16, &rt->dst.neighbour->primary_key);
2303 
2304 	if (rt->dst.dev)
2305 		NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2306 
2307 	NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2308 
2309 	if (!(rt->rt6i_flags & RTF_EXPIRES))
2310 		expires = 0;
2311 	else if (rt->rt6i_expires - jiffies < INT_MAX)
2312 		expires = rt->rt6i_expires - jiffies;
2313 	else
2314 		expires = INT_MAX;
2315 
2316 	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
2317 			       expires, rt->dst.error) < 0)
2318 		goto nla_put_failure;
2319 
2320 	return nlmsg_end(skb, nlh);
2321 
2322 nla_put_failure:
2323 	nlmsg_cancel(skb, nlh);
2324 	return -EMSGSIZE;
2325 }
2326 
2327 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2328 {
2329 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2330 	int prefix;
2331 
2332 	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2333 		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2334 		prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2335 	} else
2336 		prefix = 0;
2337 
2338 	return rt6_fill_node(arg->net,
2339 		     arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2340 		     NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2341 		     prefix, 0, NLM_F_MULTI);
2342 }
2343 
2344 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2345 {
2346 	struct net *net = sock_net(in_skb->sk);
2347 	struct nlattr *tb[RTA_MAX+1];
2348 	struct rt6_info *rt;
2349 	struct sk_buff *skb;
2350 	struct rtmsg *rtm;
2351 	struct flowi fl;
2352 	int err, iif = 0;
2353 
2354 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2355 	if (err < 0)
2356 		goto errout;
2357 
2358 	err = -EINVAL;
2359 	memset(&fl, 0, sizeof(fl));
2360 
2361 	if (tb[RTA_SRC]) {
2362 		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2363 			goto errout;
2364 
2365 		ipv6_addr_copy(&fl.fl6_src, nla_data(tb[RTA_SRC]));
2366 	}
2367 
2368 	if (tb[RTA_DST]) {
2369 		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2370 			goto errout;
2371 
2372 		ipv6_addr_copy(&fl.fl6_dst, nla_data(tb[RTA_DST]));
2373 	}
2374 
2375 	if (tb[RTA_IIF])
2376 		iif = nla_get_u32(tb[RTA_IIF]);
2377 
2378 	if (tb[RTA_OIF])
2379 		fl.oif = nla_get_u32(tb[RTA_OIF]);
2380 
2381 	if (iif) {
2382 		struct net_device *dev;
2383 		dev = __dev_get_by_index(net, iif);
2384 		if (!dev) {
2385 			err = -ENODEV;
2386 			goto errout;
2387 		}
2388 	}
2389 
2390 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2391 	if (skb == NULL) {
2392 		err = -ENOBUFS;
2393 		goto errout;
2394 	}
2395 
2396 	/* Reserve room for dummy headers, this skb can pass
2397 	   through good chunk of routing engine.
2398 	 */
2399 	skb_reset_mac_header(skb);
2400 	skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2401 
2402 	rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl);
2403 	skb_dst_set(skb, &rt->dst);
2404 
2405 	err = rt6_fill_node(net, skb, rt, &fl.fl6_dst, &fl.fl6_src, iif,
2406 			    RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2407 			    nlh->nlmsg_seq, 0, 0, 0);
2408 	if (err < 0) {
2409 		kfree_skb(skb);
2410 		goto errout;
2411 	}
2412 
2413 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2414 errout:
2415 	return err;
2416 }
2417 
2418 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2419 {
2420 	struct sk_buff *skb;
2421 	struct net *net = info->nl_net;
2422 	u32 seq;
2423 	int err;
2424 
2425 	err = -ENOBUFS;
2426 	seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2427 
2428 	skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2429 	if (skb == NULL)
2430 		goto errout;
2431 
2432 	err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2433 				event, info->pid, seq, 0, 0, 0);
2434 	if (err < 0) {
2435 		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2436 		WARN_ON(err == -EMSGSIZE);
2437 		kfree_skb(skb);
2438 		goto errout;
2439 	}
2440 	rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2441 		    info->nlh, gfp_any());
2442 	return;
2443 errout:
2444 	if (err < 0)
2445 		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2446 }
2447 
2448 static int ip6_route_dev_notify(struct notifier_block *this,
2449 				unsigned long event, void *data)
2450 {
2451 	struct net_device *dev = (struct net_device *)data;
2452 	struct net *net = dev_net(dev);
2453 
2454 	if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2455 		net->ipv6.ip6_null_entry->dst.dev = dev;
2456 		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2457 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2458 		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2459 		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2460 		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2461 		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2462 #endif
2463 	}
2464 
2465 	return NOTIFY_OK;
2466 }
2467 
2468 /*
2469  *	/proc
2470  */
2471 
2472 #ifdef CONFIG_PROC_FS
2473 
/*
 * Buffer bookkeeping for /proc output.
 * NOTE(review): nothing in the visible part of this file uses this struct;
 * the seq_file based handlers below do not reference it - confirm before
 * relying on it.
 */
struct rt6_proc_arg {
	char *buffer;
	int offset;
	int length;
	int skip;
	int len;
};
2482 
2483 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2484 {
2485 	struct seq_file *m = p_arg;
2486 
2487 	seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2488 
2489 #ifdef CONFIG_IPV6_SUBTREES
2490 	seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2491 #else
2492 	seq_puts(m, "00000000000000000000000000000000 00 ");
2493 #endif
2494 
2495 	if (rt->rt6i_nexthop) {
2496 		seq_printf(m, "%pi6", rt->rt6i_nexthop->primary_key);
2497 	} else {
2498 		seq_puts(m, "00000000000000000000000000000000");
2499 	}
2500 	seq_printf(m, " %08x %08x %08x %08x %8s\n",
2501 		   rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2502 		   rt->dst.__use, rt->rt6i_flags,
2503 		   rt->rt6i_dev ? rt->rt6i_dev->name : "");
2504 	return 0;
2505 }
2506 
2507 static int ipv6_route_show(struct seq_file *m, void *v)
2508 {
2509 	struct net *net = (struct net *)m->private;
2510 	fib6_clean_all(net, rt6_info_route, 0, m);
2511 	return 0;
2512 }
2513 
2514 static int ipv6_route_open(struct inode *inode, struct file *file)
2515 {
2516 	return single_open_net(inode, file, ipv6_route_show);
2517 }
2518 
2519 static const struct file_operations ipv6_route_proc_fops = {
2520 	.owner		= THIS_MODULE,
2521 	.open		= ipv6_route_open,
2522 	.read		= seq_read,
2523 	.llseek		= seq_lseek,
2524 	.release	= single_release_net,
2525 };
2526 
2527 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2528 {
2529 	struct net *net = (struct net *)seq->private;
2530 	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2531 		   net->ipv6.rt6_stats->fib_nodes,
2532 		   net->ipv6.rt6_stats->fib_route_nodes,
2533 		   net->ipv6.rt6_stats->fib_rt_alloc,
2534 		   net->ipv6.rt6_stats->fib_rt_entries,
2535 		   net->ipv6.rt6_stats->fib_rt_cache,
2536 		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2537 		   net->ipv6.rt6_stats->fib_discarded_routes);
2538 
2539 	return 0;
2540 }
2541 
2542 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2543 {
2544 	return single_open_net(inode, file, rt6_stats_seq_show);
2545 }
2546 
2547 static const struct file_operations rt6_stats_seq_fops = {
2548 	.owner	 = THIS_MODULE,
2549 	.open	 = rt6_stats_seq_open,
2550 	.read	 = seq_read,
2551 	.llseek	 = seq_lseek,
2552 	.release = single_release_net,
2553 };
2554 #endif	/* CONFIG_PROC_FS */
2555 
2556 #ifdef CONFIG_SYSCTL
2557 
2558 static
2559 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2560 			      void __user *buffer, size_t *lenp, loff_t *ppos)
2561 {
2562 	struct net *net;
2563 	int delay;
2564 	if (!write)
2565 		return -EINVAL;
2566 
2567 	net = (struct net *)ctl->extra1;
2568 	delay = net->ipv6.sysctl.flush_delay;
2569 	proc_dointvec(ctl, write, buffer, lenp, ppos);
2570 	fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2571 	return 0;
2572 }
2573 
2574 ctl_table ipv6_route_table_template[] = {
2575 	{
2576 		.procname	=	"flush",
2577 		.data		=	&init_net.ipv6.sysctl.flush_delay,
2578 		.maxlen		=	sizeof(int),
2579 		.mode		=	0200,
2580 		.proc_handler	=	ipv6_sysctl_rtcache_flush
2581 	},
2582 	{
2583 		.procname	=	"gc_thresh",
2584 		.data		=	&ip6_dst_ops_template.gc_thresh,
2585 		.maxlen		=	sizeof(int),
2586 		.mode		=	0644,
2587 		.proc_handler	=	proc_dointvec,
2588 	},
2589 	{
2590 		.procname	=	"max_size",
2591 		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
2592 		.maxlen		=	sizeof(int),
2593 		.mode		=	0644,
2594 		.proc_handler	=	proc_dointvec,
2595 	},
2596 	{
2597 		.procname	=	"gc_min_interval",
2598 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2599 		.maxlen		=	sizeof(int),
2600 		.mode		=	0644,
2601 		.proc_handler	=	proc_dointvec_jiffies,
2602 	},
2603 	{
2604 		.procname	=	"gc_timeout",
2605 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2606 		.maxlen		=	sizeof(int),
2607 		.mode		=	0644,
2608 		.proc_handler	=	proc_dointvec_jiffies,
2609 	},
2610 	{
2611 		.procname	=	"gc_interval",
2612 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
2613 		.maxlen		=	sizeof(int),
2614 		.mode		=	0644,
2615 		.proc_handler	=	proc_dointvec_jiffies,
2616 	},
2617 	{
2618 		.procname	=	"gc_elasticity",
2619 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2620 		.maxlen		=	sizeof(int),
2621 		.mode		=	0644,
2622 		.proc_handler	=	proc_dointvec,
2623 	},
2624 	{
2625 		.procname	=	"mtu_expires",
2626 		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2627 		.maxlen		=	sizeof(int),
2628 		.mode		=	0644,
2629 		.proc_handler	=	proc_dointvec_jiffies,
2630 	},
2631 	{
2632 		.procname	=	"min_adv_mss",
2633 		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
2634 		.maxlen		=	sizeof(int),
2635 		.mode		=	0644,
2636 		.proc_handler	=	proc_dointvec,
2637 	},
2638 	{
2639 		.procname	=	"gc_min_interval_ms",
2640 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2641 		.maxlen		=	sizeof(int),
2642 		.mode		=	0644,
2643 		.proc_handler	=	proc_dointvec_ms_jiffies,
2644 	},
2645 	{ }
2646 };
2647 
2648 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2649 {
2650 	struct ctl_table *table;
2651 
2652 	table = kmemdup(ipv6_route_table_template,
2653 			sizeof(ipv6_route_table_template),
2654 			GFP_KERNEL);
2655 
2656 	if (table) {
2657 		table[0].data = &net->ipv6.sysctl.flush_delay;
2658 		table[0].extra1 = net;
2659 		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2660 		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2661 		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2662 		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2663 		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2664 		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2665 		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2666 		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2667 		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2668 	}
2669 
2670 	return table;
2671 }
2672 #endif
2673 
2674 static int __net_init ip6_route_net_init(struct net *net)
2675 {
2676 	int ret = -ENOMEM;
2677 
2678 	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2679 	       sizeof(net->ipv6.ip6_dst_ops));
2680 
2681 	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2682 		goto out_ip6_dst_ops;
2683 
2684 	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2685 					   sizeof(*net->ipv6.ip6_null_entry),
2686 					   GFP_KERNEL);
2687 	if (!net->ipv6.ip6_null_entry)
2688 		goto out_ip6_dst_entries;
2689 	net->ipv6.ip6_null_entry->dst.path =
2690 		(struct dst_entry *)net->ipv6.ip6_null_entry;
2691 	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2692 	dst_metric_set(&net->ipv6.ip6_null_entry->dst, RTAX_HOPLIMIT, 255);
2693 
2694 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2695 	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2696 					       sizeof(*net->ipv6.ip6_prohibit_entry),
2697 					       GFP_KERNEL);
2698 	if (!net->ipv6.ip6_prohibit_entry)
2699 		goto out_ip6_null_entry;
2700 	net->ipv6.ip6_prohibit_entry->dst.path =
2701 		(struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2702 	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2703 	dst_metric_set(&net->ipv6.ip6_prohibit_entry->dst, RTAX_HOPLIMIT, 255);
2704 
2705 	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2706 					       sizeof(*net->ipv6.ip6_blk_hole_entry),
2707 					       GFP_KERNEL);
2708 	if (!net->ipv6.ip6_blk_hole_entry)
2709 		goto out_ip6_prohibit_entry;
2710 	net->ipv6.ip6_blk_hole_entry->dst.path =
2711 		(struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2712 	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2713 	dst_metric_set(&net->ipv6.ip6_blk_hole_entry->dst, RTAX_HOPLIMIT, 255);
2714 #endif
2715 
2716 	net->ipv6.sysctl.flush_delay = 0;
2717 	net->ipv6.sysctl.ip6_rt_max_size = 4096;
2718 	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2719 	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2720 	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2721 	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2722 	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2723 	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2724 
2725 #ifdef CONFIG_PROC_FS
2726 	proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2727 	proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2728 #endif
2729 	net->ipv6.ip6_rt_gc_expire = 30*HZ;
2730 
2731 	ret = 0;
2732 out:
2733 	return ret;
2734 
2735 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2736 out_ip6_prohibit_entry:
2737 	kfree(net->ipv6.ip6_prohibit_entry);
2738 out_ip6_null_entry:
2739 	kfree(net->ipv6.ip6_null_entry);
2740 #endif
2741 out_ip6_dst_entries:
2742 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2743 out_ip6_dst_ops:
2744 	goto out;
2745 }
2746 
2747 static void __net_exit ip6_route_net_exit(struct net *net)
2748 {
2749 #ifdef CONFIG_PROC_FS
2750 	proc_net_remove(net, "ipv6_route");
2751 	proc_net_remove(net, "rt6_stats");
2752 #endif
2753 	kfree(net->ipv6.ip6_null_entry);
2754 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2755 	kfree(net->ipv6.ip6_prohibit_entry);
2756 	kfree(net->ipv6.ip6_blk_hole_entry);
2757 #endif
2758 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2759 }
2760 
2761 static struct pernet_operations ip6_route_net_ops = {
2762 	.init = ip6_route_net_init,
2763 	.exit = ip6_route_net_exit,
2764 };
2765 
2766 static struct notifier_block ip6_route_dev_notifier = {
2767 	.notifier_call = ip6_route_dev_notify,
2768 	.priority = 0,
2769 };
2770 
2771 int __init ip6_route_init(void)
2772 {
2773 	int ret;
2774 
2775 	ret = -ENOMEM;
2776 	ip6_dst_ops_template.kmem_cachep =
2777 		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2778 				  SLAB_HWCACHE_ALIGN, NULL);
2779 	if (!ip6_dst_ops_template.kmem_cachep)
2780 		goto out;
2781 
2782 	ret = dst_entries_init(&ip6_dst_blackhole_ops);
2783 	if (ret)
2784 		goto out_kmem_cache;
2785 
2786 	ret = register_pernet_subsys(&ip6_route_net_ops);
2787 	if (ret)
2788 		goto out_dst_entries;
2789 
2790 	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2791 
2792 	/* Registering of the loopback is done before this portion of code,
2793 	 * the loopback reference in rt6_info will not be taken, do it
2794 	 * manually for init_net */
2795 	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
2796 	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2797   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2798 	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
2799 	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2800 	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
2801 	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2802   #endif
2803 	ret = fib6_init();
2804 	if (ret)
2805 		goto out_register_subsys;
2806 
2807 	ret = xfrm6_init();
2808 	if (ret)
2809 		goto out_fib6_init;
2810 
2811 	ret = fib6_rules_init();
2812 	if (ret)
2813 		goto xfrm6_init;
2814 
2815 	ret = -ENOBUFS;
2816 	if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL) ||
2817 	    __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL) ||
2818 	    __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL))
2819 		goto fib6_rules_init;
2820 
2821 	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
2822 	if (ret)
2823 		goto fib6_rules_init;
2824 
2825 out:
2826 	return ret;
2827 
2828 fib6_rules_init:
2829 	fib6_rules_cleanup();
2830 xfrm6_init:
2831 	xfrm6_fini();
2832 out_fib6_init:
2833 	fib6_gc_cleanup();
2834 out_register_subsys:
2835 	unregister_pernet_subsys(&ip6_route_net_ops);
2836 out_dst_entries:
2837 	dst_entries_destroy(&ip6_dst_blackhole_ops);
2838 out_kmem_cache:
2839 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2840 	goto out;
2841 }
2842 
2843 void ip6_route_cleanup(void)
2844 {
2845 	unregister_netdevice_notifier(&ip6_route_dev_notifier);
2846 	fib6_rules_cleanup();
2847 	xfrm6_fini();
2848 	fib6_gc_cleanup();
2849 	unregister_pernet_subsys(&ip6_route_net_ops);
2850 	dst_entries_destroy(&ip6_dst_blackhole_ops);
2851 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2852 }
2853