xref: /openbmc/linux/net/ipv6/route.c (revision df2634f43f5106947f3735a0b61a6527a4b278cd)
1 /*
2  *	Linux INET6 implementation
3  *	FIB front-end.
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13 
14 /*	Changes:
15  *
16  *	YOSHIFUJI Hideaki @USAGI
17  *		reworked default router selection.
18  *		- respect outgoing interface
19  *		- select from (probably) reachable routers (i.e.
20  *		routers in REACHABLE, STALE, DELAY or PROBE states).
21  *		- always select the same router if it is (probably)
22  *		reachable; otherwise, round-robin the list.
23  *	Ville Nuorvala
24  *		Fixed routing subtrees.
25  */
26 
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/mroute6.h>
38 #include <linux/init.h>
39 #include <linux/if_arp.h>
40 #include <linux/proc_fs.h>
41 #include <linux/seq_file.h>
42 #include <linux/nsproxy.h>
43 #include <linux/slab.h>
44 #include <net/net_namespace.h>
45 #include <net/snmp.h>
46 #include <net/ipv6.h>
47 #include <net/ip6_fib.h>
48 #include <net/ip6_route.h>
49 #include <net/ndisc.h>
50 #include <net/addrconf.h>
51 #include <net/tcp.h>
52 #include <linux/rtnetlink.h>
53 #include <net/dst.h>
54 #include <net/xfrm.h>
55 #include <net/netevent.h>
56 #include <net/netlink.h>
57 
58 #include <asm/uaccess.h>
59 
60 #ifdef CONFIG_SYSCTL
61 #include <linux/sysctl.h>
62 #endif
63 
64 /* Set to 3 to get tracing. */
65 #define RT6_DEBUG 2
66 
67 #if RT6_DEBUG >= 3
68 #define RDBG(x) printk x
69 #define RT6_TRACE(x...) printk(KERN_DEBUG x)
70 #else
71 #define RDBG(x)
72 #define RT6_TRACE(x...) do { ; } while (0)
73 #endif
74 
75 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
76 static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
77 static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
78 static unsigned int	 ip6_default_mtu(const struct dst_entry *dst);
79 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
80 static void		ip6_dst_destroy(struct dst_entry *);
81 static void		ip6_dst_ifdown(struct dst_entry *,
82 				       struct net_device *dev, int how);
83 static int		 ip6_dst_gc(struct dst_ops *ops);
84 
85 static int		ip6_pkt_discard(struct sk_buff *skb);
86 static int		ip6_pkt_discard_out(struct sk_buff *skb);
87 static void		ip6_link_failure(struct sk_buff *skb);
88 static void		ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
89 
90 #ifdef CONFIG_IPV6_ROUTE_INFO
91 static struct rt6_info *rt6_add_route_info(struct net *net,
92 					   struct in6_addr *prefix, int prefixlen,
93 					   struct in6_addr *gwaddr, int ifindex,
94 					   unsigned pref);
95 static struct rt6_info *rt6_get_route_info(struct net *net,
96 					   struct in6_addr *prefix, int prefixlen,
97 					   struct in6_addr *gwaddr, int ifindex);
98 #endif
99 
100 static struct dst_ops ip6_dst_ops_template = {
101 	.family			=	AF_INET6,
102 	.protocol		=	cpu_to_be16(ETH_P_IPV6),
103 	.gc			=	ip6_dst_gc,
104 	.gc_thresh		=	1024,
105 	.check			=	ip6_dst_check,
106 	.default_advmss		=	ip6_default_advmss,
107 	.default_mtu		=	ip6_default_mtu,
108 	.destroy		=	ip6_dst_destroy,
109 	.ifdown			=	ip6_dst_ifdown,
110 	.negative_advice	=	ip6_negative_advice,
111 	.link_failure		=	ip6_link_failure,
112 	.update_pmtu		=	ip6_rt_update_pmtu,
113 	.local_out		=	__ip6_local_out,
114 };
115 
116 static unsigned int ip6_blackhole_default_mtu(const struct dst_entry *dst)
117 {
118 	return 0;
119 }
120 
121 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
122 {
123 }
124 
125 static struct dst_ops ip6_dst_blackhole_ops = {
126 	.family			=	AF_INET6,
127 	.protocol		=	cpu_to_be16(ETH_P_IPV6),
128 	.destroy		=	ip6_dst_destroy,
129 	.check			=	ip6_dst_check,
130 	.default_mtu		=	ip6_blackhole_default_mtu,
131 	.default_advmss		=	ip6_default_advmss,
132 	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
133 };
134 
135 static struct rt6_info ip6_null_entry_template = {
136 	.dst = {
137 		.__refcnt	= ATOMIC_INIT(1),
138 		.__use		= 1,
139 		.obsolete	= -1,
140 		.error		= -ENETUNREACH,
141 		.input		= ip6_pkt_discard,
142 		.output		= ip6_pkt_discard_out,
143 	},
144 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
145 	.rt6i_protocol  = RTPROT_KERNEL,
146 	.rt6i_metric	= ~(u32) 0,
147 	.rt6i_ref	= ATOMIC_INIT(1),
148 };
149 
150 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
151 
152 static int ip6_pkt_prohibit(struct sk_buff *skb);
153 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
154 
155 static struct rt6_info ip6_prohibit_entry_template = {
156 	.dst = {
157 		.__refcnt	= ATOMIC_INIT(1),
158 		.__use		= 1,
159 		.obsolete	= -1,
160 		.error		= -EACCES,
161 		.input		= ip6_pkt_prohibit,
162 		.output		= ip6_pkt_prohibit_out,
163 	},
164 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
165 	.rt6i_protocol  = RTPROT_KERNEL,
166 	.rt6i_metric	= ~(u32) 0,
167 	.rt6i_ref	= ATOMIC_INIT(1),
168 };
169 
170 static struct rt6_info ip6_blk_hole_entry_template = {
171 	.dst = {
172 		.__refcnt	= ATOMIC_INIT(1),
173 		.__use		= 1,
174 		.obsolete	= -1,
175 		.error		= -EINVAL,
176 		.input		= dst_discard,
177 		.output		= dst_discard,
178 	},
179 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
180 	.rt6i_protocol  = RTPROT_KERNEL,
181 	.rt6i_metric	= ~(u32) 0,
182 	.rt6i_ref	= ATOMIC_INIT(1),
183 };
184 
185 #endif
186 
187 /* allocate dst with ip6_dst_ops */
188 static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops)
189 {
190 	return (struct rt6_info *)dst_alloc(ops);
191 }
192 
193 static void ip6_dst_destroy(struct dst_entry *dst)
194 {
195 	struct rt6_info *rt = (struct rt6_info *)dst;
196 	struct inet6_dev *idev = rt->rt6i_idev;
197 	struct inet_peer *peer = rt->rt6i_peer;
198 
199 	if (idev != NULL) {
200 		rt->rt6i_idev = NULL;
201 		in6_dev_put(idev);
202 	}
203 	if (peer) {
204 		rt->rt6i_peer = NULL;
205 		inet_putpeer(peer);
206 	}
207 }
208 
209 void rt6_bind_peer(struct rt6_info *rt, int create)
210 {
211 	struct inet_peer *peer;
212 
213 	peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
214 	if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
215 		inet_putpeer(peer);
216 }
217 
218 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
219 			   int how)
220 {
221 	struct rt6_info *rt = (struct rt6_info *)dst;
222 	struct inet6_dev *idev = rt->rt6i_idev;
223 	struct net_device *loopback_dev =
224 		dev_net(dev)->loopback_dev;
225 
226 	if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
227 		struct inet6_dev *loopback_idev =
228 			in6_dev_get(loopback_dev);
229 		if (loopback_idev != NULL) {
230 			rt->rt6i_idev = loopback_idev;
231 			in6_dev_put(idev);
232 		}
233 	}
234 }
235 
236 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
237 {
238 	return (rt->rt6i_flags & RTF_EXPIRES) &&
239 		time_after(jiffies, rt->rt6i_expires);
240 }
241 
242 static inline int rt6_need_strict(struct in6_addr *daddr)
243 {
244 	return ipv6_addr_type(daddr) &
245 		(IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
246 }
247 
248 /*
249  *	Route lookup. Any table->tb6_lock is implied.
250  */
251 
252 static inline struct rt6_info *rt6_device_match(struct net *net,
253 						    struct rt6_info *rt,
254 						    struct in6_addr *saddr,
255 						    int oif,
256 						    int flags)
257 {
258 	struct rt6_info *local = NULL;
259 	struct rt6_info *sprt;
260 
261 	if (!oif && ipv6_addr_any(saddr))
262 		goto out;
263 
264 	for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
265 		struct net_device *dev = sprt->rt6i_dev;
266 
267 		if (oif) {
268 			if (dev->ifindex == oif)
269 				return sprt;
270 			if (dev->flags & IFF_LOOPBACK) {
271 				if (sprt->rt6i_idev == NULL ||
272 				    sprt->rt6i_idev->dev->ifindex != oif) {
273 					if (flags & RT6_LOOKUP_F_IFACE && oif)
274 						continue;
275 					if (local && (!oif ||
276 						      local->rt6i_idev->dev->ifindex == oif))
277 						continue;
278 				}
279 				local = sprt;
280 			}
281 		} else {
282 			if (ipv6_chk_addr(net, saddr, dev,
283 					  flags & RT6_LOOKUP_F_IFACE))
284 				return sprt;
285 		}
286 	}
287 
288 	if (oif) {
289 		if (local)
290 			return local;
291 
292 		if (flags & RT6_LOOKUP_F_IFACE)
293 			return net->ipv6.ip6_null_entry;
294 	}
295 out:
296 	return rt;
297 }
298 
299 #ifdef CONFIG_IPV6_ROUTER_PREF
300 static void rt6_probe(struct rt6_info *rt)
301 {
302 	struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
303 	/*
304 	 * Okay, this does not seem to be appropriate for now;
305 	 * however, we need to check whether it really is,
306 	 * a.k.a. Router Reachability Probing.
307 	 *
308 	 * Router Reachability Probe MUST be rate-limited
309 	 * to no more than one per minute.
310 	 */
311 	if (!neigh || (neigh->nud_state & NUD_VALID))
312 		return;
313 	read_lock_bh(&neigh->lock);
314 	if (!(neigh->nud_state & NUD_VALID) &&
315 	    time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
316 		struct in6_addr mcaddr;
317 		struct in6_addr *target;
318 
319 		neigh->updated = jiffies;
320 		read_unlock_bh(&neigh->lock);
321 
322 		target = (struct in6_addr *)&neigh->primary_key;
323 		addrconf_addr_solict_mult(target, &mcaddr);
324 		ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
325 	} else
326 		read_unlock_bh(&neigh->lock);
327 }
328 #else
329 static inline void rt6_probe(struct rt6_info *rt)
330 {
331 }
332 #endif
333 
334 /*
335  * Default Router Selection (RFC 2461 6.3.6)
336  */
337 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
338 {
339 	struct net_device *dev = rt->rt6i_dev;
340 	if (!oif || dev->ifindex == oif)
341 		return 2;
342 	if ((dev->flags & IFF_LOOPBACK) &&
343 	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
344 		return 1;
345 	return 0;
346 }
347 
348 static inline int rt6_check_neigh(struct rt6_info *rt)
349 {
350 	struct neighbour *neigh = rt->rt6i_nexthop;
351 	int m;
352 	if (rt->rt6i_flags & RTF_NONEXTHOP ||
353 	    !(rt->rt6i_flags & RTF_GATEWAY))
354 		m = 1;
355 	else if (neigh) {
356 		read_lock_bh(&neigh->lock);
357 		if (neigh->nud_state & NUD_VALID)
358 			m = 2;
359 #ifdef CONFIG_IPV6_ROUTER_PREF
360 		else if (neigh->nud_state & NUD_FAILED)
361 			m = 0;
362 #endif
363 		else
364 			m = 1;
365 		read_unlock_bh(&neigh->lock);
366 	} else
367 		m = 0;
368 	return m;
369 }
370 
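/*
 * Score a route for default router selection: combine the interface
 * match from rt6_check_dev() with the neighbour reachability state
 * from rt6_check_neigh() (and, with CONFIG_IPV6_ROUTER_PREF, the
 * router preference encoded in rt6i_flags).  Returns -1 when a strict
 * requirement (RT6_LOOKUP_F_IFACE / RT6_LOOKUP_F_REACHABLE) is not
 * met; otherwise a higher value means a better route.
 */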
371 static int rt6_score_route(struct rt6_info *rt, int oif,
372 			   int strict)
373 {
374 	int m, n;
375 
376 	m = rt6_check_dev(rt, oif);
377 	if (!m && (strict & RT6_LOOKUP_F_IFACE))
378 		return -1;
379 #ifdef CONFIG_IPV6_ROUTER_PREF
380 	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
381 #endif
382 	n = rt6_check_neigh(rt);
383 	if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
384 		return -1;
385 	return m;
386 }
387 
388 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
389 				   int *mpri, struct rt6_info *match)
390 {
391 	int m;
392 
393 	if (rt6_check_expired(rt))
394 		goto out;
395 
396 	m = rt6_score_route(rt, oif, strict);
397 	if (m < 0)
398 		goto out;
399 
400 	if (m > *mpri) {
401 		if (strict & RT6_LOOKUP_F_REACHABLE)
402 			rt6_probe(match);
403 		*mpri = m;
404 		match = rt;
405 	} else if (strict & RT6_LOOKUP_F_REACHABLE) {
406 		rt6_probe(rt);
407 	}
408 
409 out:
410 	return match;
411 }
412 
413 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
414 				     struct rt6_info *rr_head,
415 				     u32 metric, int oif, int strict)
416 {
417 	struct rt6_info *rt, *match;
418 	int mpri = -1;
419 
420 	match = NULL;
421 	for (rt = rr_head; rt && rt->rt6i_metric == metric;
422 	     rt = rt->dst.rt6_next)
423 		match = find_match(rt, oif, strict, &mpri, match);
424 	for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
425 	     rt = rt->dst.rt6_next)
426 		match = find_match(rt, oif, strict, &mpri, match);
427 
428 	return match;
429 }
430 
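/*
 * Pick the best route of the given metric under this fib6 node.  The
 * scan starts at fn->rr_ptr so that equally good routers are used
 * round-robin (see the changelog at the top of this file); when
 * nothing reachable matches under RT6_LOOKUP_F_REACHABLE, rr_ptr is
 * advanced and the null entry is returned.
 */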
431 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
432 {
433 	struct rt6_info *match, *rt0;
434 	struct net *net;
435 
436 	RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
437 		  __func__, fn->leaf, oif);
438 
439 	rt0 = fn->rr_ptr;
440 	if (!rt0)
441 		fn->rr_ptr = rt0 = fn->leaf;
442 
443 	match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
444 
445 	if (!match &&
446 	    (strict & RT6_LOOKUP_F_REACHABLE)) {
447 		struct rt6_info *next = rt0->dst.rt6_next;
448 
449 		/* no entries matched; do round-robin */
450 		if (!next || next->rt6i_metric != rt0->rt6i_metric)
451 			next = fn->leaf;
452 
453 		if (next != rt0)
454 			fn->rr_ptr = next;
455 	}
456 
457 	RT6_TRACE("%s() => %p\n",
458 		  __func__, match);
459 
460 	net = dev_net(rt0->rt6i_dev);
461 	return match ? match : net->ipv6.ip6_null_entry;
462 }
463 
464 #ifdef CONFIG_IPV6_ROUTE_INFO
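/*
 * Process a Route Information option received in a Router
 * Advertisement (RFC 4191): validate the option length, prefix length
 * and preference, then add, refresh or delete the corresponding
 * RTF_ROUTEINFO route depending on the advertised lifetime.
 */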
465 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
466 		  struct in6_addr *gwaddr)
467 {
468 	struct net *net = dev_net(dev);
469 	struct route_info *rinfo = (struct route_info *) opt;
470 	struct in6_addr prefix_buf, *prefix;
471 	unsigned int pref;
472 	unsigned long lifetime;
473 	struct rt6_info *rt;
474 
475 	if (len < sizeof(struct route_info)) {
476 		return -EINVAL;
477 	}
478 
479 	/* Sanity check for prefix_len and length */
480 	if (rinfo->length > 3) {
481 		return -EINVAL;
482 	} else if (rinfo->prefix_len > 128) {
483 		return -EINVAL;
484 	} else if (rinfo->prefix_len > 64) {
485 		if (rinfo->length < 2) {
486 			return -EINVAL;
487 		}
488 	} else if (rinfo->prefix_len > 0) {
489 		if (rinfo->length < 1) {
490 			return -EINVAL;
491 		}
492 	}
493 
494 	pref = rinfo->route_pref;
495 	if (pref == ICMPV6_ROUTER_PREF_INVALID)
496 		return -EINVAL;
497 
498 	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
499 
500 	if (rinfo->length == 3)
501 		prefix = (struct in6_addr *)rinfo->prefix;
502 	else {
503 		/* ipv6_addr_prefix() reads only prefix_len bits, so this is safe */
504 		ipv6_addr_prefix(&prefix_buf,
505 				 (struct in6_addr *)rinfo->prefix,
506 				 rinfo->prefix_len);
507 		prefix = &prefix_buf;
508 	}
509 
510 	rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
511 				dev->ifindex);
512 
513 	if (rt && !lifetime) {
514 		ip6_del_rt(rt);
515 		rt = NULL;
516 	}
517 
518 	if (!rt && lifetime)
519 		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
520 					pref);
521 	else if (rt)
522 		rt->rt6i_flags = RTF_ROUTEINFO |
523 				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
524 
525 	if (rt) {
526 		if (!addrconf_finite_timeout(lifetime)) {
527 			rt->rt6i_flags &= ~RTF_EXPIRES;
528 		} else {
529 			rt->rt6i_expires = jiffies + HZ * lifetime;
530 			rt->rt6i_flags |= RTF_EXPIRES;
531 		}
532 		dst_release(&rt->dst);
533 	}
534 	return 0;
535 }
536 #endif
537 
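/*
 * If the lookup landed on the null entry, walk back up the tree
 * (consulting source-routed subtrees via FIB6_SUBTREE() where
 * present) and retry at the first ancestor node that carries routing
 * info; give up once the tree root is reached.
 */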
538 #define BACKTRACK(__net, saddr)			\
539 do { \
540 	if (rt == __net->ipv6.ip6_null_entry) {	\
541 		struct fib6_node *pn; \
542 		while (1) { \
543 			if (fn->fn_flags & RTN_TL_ROOT) \
544 				goto out; \
545 			pn = fn->parent; \
546 			if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
547 				fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
548 			else \
549 				fn = pn; \
550 			if (fn->fn_flags & RTN_RTINFO) \
551 				goto restart; \
552 		} \
553 	} \
554 } while(0)
555 
556 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
557 					     struct fib6_table *table,
558 					     struct flowi *fl, int flags)
559 {
560 	struct fib6_node *fn;
561 	struct rt6_info *rt;
562 
563 	read_lock_bh(&table->tb6_lock);
564 	fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
565 restart:
566 	rt = fn->leaf;
567 	rt = rt6_device_match(net, rt, &fl->fl6_src, fl->oif, flags);
568 	BACKTRACK(net, &fl->fl6_src);
569 out:
570 	dst_use(&rt->dst, jiffies);
571 	read_unlock_bh(&table->tb6_lock);
572 	return rt;
573 
574 }
575 
576 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
577 			    const struct in6_addr *saddr, int oif, int strict)
578 {
579 	struct flowi fl = {
580 		.oif = oif,
581 		.fl6_dst = *daddr,
582 	};
583 	struct dst_entry *dst;
584 	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
585 
586 	if (saddr) {
587 		memcpy(&fl.fl6_src, saddr, sizeof(*saddr));
588 		flags |= RT6_LOOKUP_F_HAS_SADDR;
589 	}
590 
591 	dst = fib6_rule_lookup(net, &fl, flags, ip6_pol_route_lookup);
592 	if (dst->error == 0)
593 		return (struct rt6_info *) dst;
594 
595 	dst_release(dst);
596 
597 	return NULL;
598 }
599 
600 EXPORT_SYMBOL(rt6_lookup);
601 
602 /* ip6_ins_rt is called without holding table->tb6_lock.
603    It takes a new route entry; if the addition fails for any reason
604    the route is freed. In any case, if the caller does not hold a
605    reference, the route may be destroyed.
606  */
607 
608 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
609 {
610 	int err;
611 	struct fib6_table *table;
612 
613 	table = rt->rt6i_table;
614 	write_lock_bh(&table->tb6_lock);
615 	err = fib6_add(&table->tb6_root, rt, info);
616 	write_unlock_bh(&table->tb6_lock);
617 
618 	return err;
619 }
620 
621 int ip6_ins_rt(struct rt6_info *rt)
622 {
623 	struct nl_info info = {
624 		.nl_net = dev_net(rt->rt6i_dev),
625 	};
626 	return __ip6_ins_rt(rt, &info);
627 }
628 
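/*
 * Copy-on-write helper: clone the matched route into a /128 RTF_CACHE
 * entry for this destination and bind it to a neighbour entry.  If the
 * neighbour lookup fails because the neighbour table is full, force a
 * round of dst garbage collection (with temporarily relaxed gc
 * sysctls) and retry once, but only outside softirq context.
 */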
629 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
630 				      struct in6_addr *saddr)
631 {
632 	struct rt6_info *rt;
633 
634 	/*
635 	 *	Clone the route.
636 	 */
637 
638 	rt = ip6_rt_copy(ort);
639 
640 	if (rt) {
641 		struct neighbour *neigh;
642 		int attempts = !in_softirq();
643 
644 		if (!(rt->rt6i_flags&RTF_GATEWAY)) {
645 			if (rt->rt6i_dst.plen != 128 &&
646 			    ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
647 				rt->rt6i_flags |= RTF_ANYCAST;
648 			ipv6_addr_copy(&rt->rt6i_gateway, daddr);
649 		}
650 
651 		ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
652 		rt->rt6i_dst.plen = 128;
653 		rt->rt6i_flags |= RTF_CACHE;
654 		rt->dst.flags |= DST_HOST;
655 
656 #ifdef CONFIG_IPV6_SUBTREES
657 		if (rt->rt6i_src.plen && saddr) {
658 			ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
659 			rt->rt6i_src.plen = 128;
660 		}
661 #endif
662 
663 	retry:
664 		neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
665 		if (IS_ERR(neigh)) {
666 			struct net *net = dev_net(rt->rt6i_dev);
667 			int saved_rt_min_interval =
668 				net->ipv6.sysctl.ip6_rt_gc_min_interval;
669 			int saved_rt_elasticity =
670 				net->ipv6.sysctl.ip6_rt_gc_elasticity;
671 
672 			if (attempts-- > 0) {
673 				net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
674 				net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
675 
676 				ip6_dst_gc(&net->ipv6.ip6_dst_ops);
677 
678 				net->ipv6.sysctl.ip6_rt_gc_elasticity =
679 					saved_rt_elasticity;
680 				net->ipv6.sysctl.ip6_rt_gc_min_interval =
681 					saved_rt_min_interval;
682 				goto retry;
683 			}
684 
685 			if (net_ratelimit())
686 				printk(KERN_WARNING
687 				       "ipv6: Neighbour table overflow.\n");
688 			dst_free(&rt->dst);
689 			return NULL;
690 		}
691 		rt->rt6i_nexthop = neigh;
692 
693 	}
694 
695 	return rt;
696 }
697 
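/*
 * Like rt6_alloc_cow(), but for routes that already carry a usable
 * nexthop (or need none): clone into a /128 RTF_CACHE entry and reuse
 * the original route's neighbour instead of resolving a new one.
 */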
698 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
699 {
700 	struct rt6_info *rt = ip6_rt_copy(ort);
701 	if (rt) {
702 		ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
703 		rt->rt6i_dst.plen = 128;
704 		rt->rt6i_flags |= RTF_CACHE;
705 		rt->dst.flags |= DST_HOST;
706 		rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
707 	}
708 	return rt;
709 }
710 
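/*
 * Core policy lookup used for both input and output: select the best
 * route under the fib6 node (backtracking as needed) and, for
 * non-cached entries, create a per-destination COW/clone copy and try
 * to insert it into the table, relooking up a few times if a
 * concurrent insertion wins the race.
 */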
711 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
712 				      struct flowi *fl, int flags)
713 {
714 	struct fib6_node *fn;
715 	struct rt6_info *rt, *nrt;
716 	int strict = 0;
717 	int attempts = 3;
718 	int err;
719 	int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
720 
721 	strict |= flags & RT6_LOOKUP_F_IFACE;
722 
723 relookup:
724 	read_lock_bh(&table->tb6_lock);
725 
726 restart_2:
727 	fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
728 
729 restart:
730 	rt = rt6_select(fn, oif, strict | reachable);
731 
732 	BACKTRACK(net, &fl->fl6_src);
733 	if (rt == net->ipv6.ip6_null_entry ||
734 	    rt->rt6i_flags & RTF_CACHE)
735 		goto out;
736 
737 	dst_hold(&rt->dst);
738 	read_unlock_bh(&table->tb6_lock);
739 
740 	if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
741 		nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
742 	else
743 		nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
744 
745 	dst_release(&rt->dst);
746 	rt = nrt ? : net->ipv6.ip6_null_entry;
747 
748 	dst_hold(&rt->dst);
749 	if (nrt) {
750 		err = ip6_ins_rt(nrt);
751 		if (!err)
752 			goto out2;
753 	}
754 
755 	if (--attempts <= 0)
756 		goto out2;
757 
758 	/*
759 	 * Race condition! In the gap, when table->tb6_lock was
760 	 * released someone could insert this route.  Relookup.
761 	 */
762 	dst_release(&rt->dst);
763 	goto relookup;
764 
765 out:
766 	if (reachable) {
767 		reachable = 0;
768 		goto restart_2;
769 	}
770 	dst_hold(&rt->dst);
771 	read_unlock_bh(&table->tb6_lock);
772 out2:
773 	rt->dst.lastuse = jiffies;
774 	rt->dst.__use++;
775 
776 	return rt;
777 }
778 
779 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
780 					    struct flowi *fl, int flags)
781 {
782 	return ip6_pol_route(net, table, fl->iif, fl, flags);
783 }
784 
785 void ip6_route_input(struct sk_buff *skb)
786 {
787 	struct ipv6hdr *iph = ipv6_hdr(skb);
788 	struct net *net = dev_net(skb->dev);
789 	int flags = RT6_LOOKUP_F_HAS_SADDR;
790 	struct flowi fl = {
791 		.iif = skb->dev->ifindex,
792 		.fl6_dst = iph->daddr,
793 		.fl6_src = iph->saddr,
794 		.fl6_flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
795 		.mark = skb->mark,
796 		.proto = iph->nexthdr,
797 	};
798 
799 	if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
800 		flags |= RT6_LOOKUP_F_IFACE;
801 
802 	skb_dst_set(skb, fib6_rule_lookup(net, &fl, flags, ip6_pol_route_input));
803 }
804 
805 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
806 					     struct flowi *fl, int flags)
807 {
808 	return ip6_pol_route(net, table, fl->oif, fl, flags);
809 }
810 
811 struct dst_entry * ip6_route_output(struct net *net, struct sock *sk,
812 				    struct flowi *fl)
813 {
814 	int flags = 0;
815 
816 	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl->fl6_dst))
817 		flags |= RT6_LOOKUP_F_IFACE;
818 
819 	if (!ipv6_addr_any(&fl->fl6_src))
820 		flags |= RT6_LOOKUP_F_HAS_SADDR;
821 	else if (sk)
822 		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
823 
824 	return fib6_rule_lookup(net, fl, flags, ip6_pol_route_output);
825 }
826 
827 EXPORT_SYMBOL(ip6_route_output);
828 
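/*
 * Replace *dstp with a copy backed by ip6_dst_blackhole_ops: the copy
 * keeps the metrics, device and addresses of the original route but
 * silently discards any packet sent through it.
 */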
829 int ip6_dst_blackhole(struct sock *sk, struct dst_entry **dstp, struct flowi *fl)
830 {
831 	struct rt6_info *ort = (struct rt6_info *) *dstp;
832 	struct rt6_info *rt = (struct rt6_info *)
833 		dst_alloc(&ip6_dst_blackhole_ops);
834 	struct dst_entry *new = NULL;
835 
836 	if (rt) {
837 		new = &rt->dst;
838 
839 		atomic_set(&new->__refcnt, 1);
840 		new->__use = 1;
841 		new->input = dst_discard;
842 		new->output = dst_discard;
843 
844 		dst_copy_metrics(new, &ort->dst);
845 		new->dev = ort->dst.dev;
846 		if (new->dev)
847 			dev_hold(new->dev);
848 		rt->rt6i_idev = ort->rt6i_idev;
849 		if (rt->rt6i_idev)
850 			in6_dev_hold(rt->rt6i_idev);
851 		rt->rt6i_expires = 0;
852 
853 		ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
854 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
855 		rt->rt6i_metric = 0;
856 
857 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
858 #ifdef CONFIG_IPV6_SUBTREES
859 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
860 #endif
861 
862 		dst_free(new);
863 	}
864 
865 	dst_release(*dstp);
866 	*dstp = new;
867 	return new ? 0 : -ENOMEM;
868 }
869 EXPORT_SYMBOL_GPL(ip6_dst_blackhole);
870 
871 /*
872  *	Destination cache support functions
873  */
874 
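/*
 * A cached route stays valid only while the fib6 node it hangs off has
 * not been changed since the cookie (fn_sernum) was taken.
 */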
875 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
876 {
877 	struct rt6_info *rt;
878 
879 	rt = (struct rt6_info *) dst;
880 
881 	if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
882 		return dst;
883 
884 	return NULL;
885 }
886 
887 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
888 {
889 	struct rt6_info *rt = (struct rt6_info *) dst;
890 
891 	if (rt) {
892 		if (rt->rt6i_flags & RTF_CACHE) {
893 			if (rt6_check_expired(rt)) {
894 				ip6_del_rt(rt);
895 				dst = NULL;
896 			}
897 		} else {
898 			dst_release(dst);
899 			dst = NULL;
900 		}
901 	}
902 	return dst;
903 }
904 
905 static void ip6_link_failure(struct sk_buff *skb)
906 {
907 	struct rt6_info *rt;
908 
909 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
910 
911 	rt = (struct rt6_info *) skb_dst(skb);
912 	if (rt) {
913 		if (rt->rt6i_flags&RTF_CACHE) {
914 			dst_set_expires(&rt->dst, 0);
915 			rt->rt6i_flags |= RTF_EXPIRES;
916 		} else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
917 			rt->rt6i_node->fn_sernum = -1;
918 	}
919 }
920 
921 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
922 {
923 	struct rt6_info *rt6 = (struct rt6_info*)dst;
924 
925 	if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
926 		rt6->rt6i_flags |= RTF_MODIFIED;
927 		if (mtu < IPV6_MIN_MTU) {
928 			u32 features = dst_metric(dst, RTAX_FEATURES);
929 			mtu = IPV6_MIN_MTU;
930 			features |= RTAX_FEATURE_ALLFRAG;
931 			dst_metric_set(dst, RTAX_FEATURES, features);
932 		}
933 		dst_metric_set(dst, RTAX_MTU, mtu);
934 		call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
935 	}
936 }
937 
938 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
939 {
940 	struct net_device *dev = dst->dev;
941 	unsigned int mtu = dst_mtu(dst);
942 	struct net *net = dev_net(dev);
943 
944 	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
945 
946 	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
947 		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
948 
949 	/*
950 	 * The maximal non-jumbo IPv6 payload is IPV6_MAXPLEN, and the
951 	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
952 	 * IPV6_MAXPLEN is also a valid value and means: "any MSS,
953 	 * rely only on PMTU discovery".
954 	 */
955 	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
956 		mtu = IPV6_MAXPLEN;
957 	return mtu;
958 }
959 
960 static unsigned int ip6_default_mtu(const struct dst_entry *dst)
961 {
962 	unsigned int mtu = IPV6_MIN_MTU;
963 	struct inet6_dev *idev;
964 
965 	rcu_read_lock();
966 	idev = __in6_dev_get(dst->dev);
967 	if (idev)
968 		mtu = idev->cnf.mtu6;
969 	rcu_read_unlock();
970 
971 	return mtu;
972 }
973 
974 static struct dst_entry *icmp6_dst_gc_list;
975 static DEFINE_SPINLOCK(icmp6_dst_lock);
976 
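/*
 * Allocate a dst entry for ndisc/ICMPv6 replies.  These entries are
 * not inserted into the FIB; they are chained on icmp6_dst_gc_list
 * and reclaimed by icmp6_dst_gc() once their refcount drops to zero.
 */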
977 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
978 				  struct neighbour *neigh,
979 				  const struct in6_addr *addr)
980 {
981 	struct rt6_info *rt;
982 	struct inet6_dev *idev = in6_dev_get(dev);
983 	struct net *net = dev_net(dev);
984 
985 	if (unlikely(idev == NULL))
986 		return NULL;
987 
988 	rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
989 	if (unlikely(rt == NULL)) {
990 		in6_dev_put(idev);
991 		goto out;
992 	}
993 
994 	dev_hold(dev);
995 	if (neigh)
996 		neigh_hold(neigh);
997 	else {
998 		neigh = ndisc_get_neigh(dev, addr);
999 		if (IS_ERR(neigh))
1000 			neigh = NULL;
1001 	}
1002 
1003 	rt->rt6i_dev	  = dev;
1004 	rt->rt6i_idev     = idev;
1005 	rt->rt6i_nexthop  = neigh;
1006 	atomic_set(&rt->dst.__refcnt, 1);
1007 	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
1008 	rt->dst.output  = ip6_output;
1009 
1010 #if 0	/* there's no chance to use these for ndisc */
1011 	rt->dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
1012 				? DST_HOST
1013 				: 0;
1014 	ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1015 	rt->rt6i_dst.plen = 128;
1016 #endif
1017 
1018 	spin_lock_bh(&icmp6_dst_lock);
1019 	rt->dst.next = icmp6_dst_gc_list;
1020 	icmp6_dst_gc_list = &rt->dst;
1021 	spin_unlock_bh(&icmp6_dst_lock);
1022 
1023 	fib6_force_start_gc(net);
1024 
1025 out:
1026 	return &rt->dst;
1027 }
1028 
1029 int icmp6_dst_gc(void)
1030 {
1031 	struct dst_entry *dst, *next, **pprev;
1032 	int more = 0;
1033 
1034 	next = NULL;
1035 
1036 	spin_lock_bh(&icmp6_dst_lock);
1037 	pprev = &icmp6_dst_gc_list;
1038 
1039 	while ((dst = *pprev) != NULL) {
1040 		if (!atomic_read(&dst->__refcnt)) {
1041 			*pprev = dst->next;
1042 			dst_free(dst);
1043 		} else {
1044 			pprev = &dst->next;
1045 			++more;
1046 		}
1047 	}
1048 
1049 	spin_unlock_bh(&icmp6_dst_lock);
1050 
1051 	return more;
1052 }
1053 
1054 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1055 			    void *arg)
1056 {
1057 	struct dst_entry *dst, **pprev;
1058 
1059 	spin_lock_bh(&icmp6_dst_lock);
1060 	pprev = &icmp6_dst_gc_list;
1061 	while ((dst = *pprev) != NULL) {
1062 		struct rt6_info *rt = (struct rt6_info *) dst;
1063 		if (func(rt, arg)) {
1064 			*pprev = dst->next;
1065 			dst_free(dst);
1066 		} else {
1067 			pprev = &dst->next;
1068 		}
1069 	}
1070 	spin_unlock_bh(&icmp6_dst_lock);
1071 }
1072 
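/*
 * dst_ops garbage collector: skip the run when the last GC was recent
 * and the number of entries is still within ip6_rt_max_size; otherwise
 * let fib6_run_gc() age out unused cached routes.  Returns nonzero
 * when the table is still over ip6_rt_max_size afterwards.
 */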
1073 static int ip6_dst_gc(struct dst_ops *ops)
1074 {
1075 	unsigned long now = jiffies;
1076 	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1077 	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1078 	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1079 	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1080 	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1081 	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1082 	int entries;
1083 
1084 	entries = dst_entries_get_fast(ops);
1085 	if (time_after(rt_last_gc + rt_min_interval, now) &&
1086 	    entries <= rt_max_size)
1087 		goto out;
1088 
1089 	net->ipv6.ip6_rt_gc_expire++;
1090 	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1091 	net->ipv6.ip6_rt_last_gc = now;
1092 	entries = dst_entries_get_slow(ops);
1093 	if (entries < ops->gc_thresh)
1094 		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1095 out:
1096 	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1097 	return entries > rt_max_size;
1098 }
1099 
1100 /* Clean the host part of a prefix. Not necessary for the radix tree,
1101    but it results in cleaner routing tables.
1102 
1103    Remove this only when everything else works!
1104  */
1105 
1106 int ip6_dst_hoplimit(struct dst_entry *dst)
1107 {
1108 	int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1109 	if (hoplimit == 0) {
1110 		struct net_device *dev = dst->dev;
1111 		struct inet6_dev *idev;
1112 
1113 		rcu_read_lock();
1114 		idev = __in6_dev_get(dev);
1115 		if (idev)
1116 			hoplimit = idev->cnf.hop_limit;
1117 		else
1118 			hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1119 		rcu_read_unlock();
1120 	}
1121 	return hoplimit;
1122 }
1123 EXPORT_SYMBOL(ip6_dst_hoplimit);
1124 
1125 /*
1126  *	Route addition and removal
1127  */
1128 
1129 int ip6_route_add(struct fib6_config *cfg)
1130 {
1131 	int err;
1132 	struct net *net = cfg->fc_nlinfo.nl_net;
1133 	struct rt6_info *rt = NULL;
1134 	struct net_device *dev = NULL;
1135 	struct inet6_dev *idev = NULL;
1136 	struct fib6_table *table;
1137 	int addr_type;
1138 
1139 	if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1140 		return -EINVAL;
1141 #ifndef CONFIG_IPV6_SUBTREES
1142 	if (cfg->fc_src_len)
1143 		return -EINVAL;
1144 #endif
1145 	if (cfg->fc_ifindex) {
1146 		err = -ENODEV;
1147 		dev = dev_get_by_index(net, cfg->fc_ifindex);
1148 		if (!dev)
1149 			goto out;
1150 		idev = in6_dev_get(dev);
1151 		if (!idev)
1152 			goto out;
1153 	}
1154 
1155 	if (cfg->fc_metric == 0)
1156 		cfg->fc_metric = IP6_RT_PRIO_USER;
1157 
1158 	table = fib6_new_table(net, cfg->fc_table);
1159 	if (table == NULL) {
1160 		err = -ENOBUFS;
1161 		goto out;
1162 	}
1163 
1164 	rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1165 
1166 	if (rt == NULL) {
1167 		err = -ENOMEM;
1168 		goto out;
1169 	}
1170 
1171 	rt->dst.obsolete = -1;
1172 	rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
1173 				jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1174 				0;
1175 
1176 	if (cfg->fc_protocol == RTPROT_UNSPEC)
1177 		cfg->fc_protocol = RTPROT_BOOT;
1178 	rt->rt6i_protocol = cfg->fc_protocol;
1179 
1180 	addr_type = ipv6_addr_type(&cfg->fc_dst);
1181 
1182 	if (addr_type & IPV6_ADDR_MULTICAST)
1183 		rt->dst.input = ip6_mc_input;
1184 	else if (cfg->fc_flags & RTF_LOCAL)
1185 		rt->dst.input = ip6_input;
1186 	else
1187 		rt->dst.input = ip6_forward;
1188 
1189 	rt->dst.output = ip6_output;
1190 
1191 	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1192 	rt->rt6i_dst.plen = cfg->fc_dst_len;
1193 	if (rt->rt6i_dst.plen == 128)
1194 	       rt->dst.flags = DST_HOST;
1195 
1196 #ifdef CONFIG_IPV6_SUBTREES
1197 	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1198 	rt->rt6i_src.plen = cfg->fc_src_len;
1199 #endif
1200 
1201 	rt->rt6i_metric = cfg->fc_metric;
1202 
1203 	/* We cannot add true routes via loopback here,
1204 	   they would result in kernel looping; promote them to reject routes
1205 	 */
1206 	if ((cfg->fc_flags & RTF_REJECT) ||
1207 	    (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK)
1208 					      && !(cfg->fc_flags&RTF_LOCAL))) {
1209 		/* hold loopback dev/idev if we haven't done so. */
1210 		if (dev != net->loopback_dev) {
1211 			if (dev) {
1212 				dev_put(dev);
1213 				in6_dev_put(idev);
1214 			}
1215 			dev = net->loopback_dev;
1216 			dev_hold(dev);
1217 			idev = in6_dev_get(dev);
1218 			if (!idev) {
1219 				err = -ENODEV;
1220 				goto out;
1221 			}
1222 		}
1223 		rt->dst.output = ip6_pkt_discard_out;
1224 		rt->dst.input = ip6_pkt_discard;
1225 		rt->dst.error = -ENETUNREACH;
1226 		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1227 		goto install_route;
1228 	}
1229 
1230 	if (cfg->fc_flags & RTF_GATEWAY) {
1231 		struct in6_addr *gw_addr;
1232 		int gwa_type;
1233 
1234 		gw_addr = &cfg->fc_gateway;
1235 		ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1236 		gwa_type = ipv6_addr_type(gw_addr);
1237 
1238 		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1239 			struct rt6_info *grt;
1240 
1241 			/* IPv6 strictly forbids using non-link-local
1242 			   addresses as a nexthop address; otherwise the
1243 			   router would not be able to send redirects.
1244 			   That is a good rule, but in some (rare!) circumstances
1245 			   (SIT, PtP, NBMA NOARP links) it is handy to allow
1246 			   some exceptions. --ANK
1247 			 */
1248 			err = -EINVAL;
1249 			if (!(gwa_type&IPV6_ADDR_UNICAST))
1250 				goto out;
1251 
1252 			grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1253 
1254 			err = -EHOSTUNREACH;
1255 			if (grt == NULL)
1256 				goto out;
1257 			if (dev) {
1258 				if (dev != grt->rt6i_dev) {
1259 					dst_release(&grt->dst);
1260 					goto out;
1261 				}
1262 			} else {
1263 				dev = grt->rt6i_dev;
1264 				idev = grt->rt6i_idev;
1265 				dev_hold(dev);
1266 				in6_dev_hold(grt->rt6i_idev);
1267 			}
1268 			if (!(grt->rt6i_flags&RTF_GATEWAY))
1269 				err = 0;
1270 			dst_release(&grt->dst);
1271 
1272 			if (err)
1273 				goto out;
1274 		}
1275 		err = -EINVAL;
1276 		if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1277 			goto out;
1278 	}
1279 
1280 	err = -ENODEV;
1281 	if (dev == NULL)
1282 		goto out;
1283 
1284 	if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1285 		rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1286 		if (IS_ERR(rt->rt6i_nexthop)) {
1287 			err = PTR_ERR(rt->rt6i_nexthop);
1288 			rt->rt6i_nexthop = NULL;
1289 			goto out;
1290 		}
1291 	}
1292 
1293 	rt->rt6i_flags = cfg->fc_flags;
1294 
1295 install_route:
1296 	if (cfg->fc_mx) {
1297 		struct nlattr *nla;
1298 		int remaining;
1299 
1300 		nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1301 			int type = nla_type(nla);
1302 
1303 			if (type) {
1304 				if (type > RTAX_MAX) {
1305 					err = -EINVAL;
1306 					goto out;
1307 				}
1308 
1309 				dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1310 			}
1311 		}
1312 	}
1313 
1314 	rt->dst.dev = dev;
1315 	rt->rt6i_idev = idev;
1316 	rt->rt6i_table = table;
1317 
1318 	cfg->fc_nlinfo.nl_net = dev_net(dev);
1319 
1320 	return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1321 
1322 out:
1323 	if (dev)
1324 		dev_put(dev);
1325 	if (idev)
1326 		in6_dev_put(idev);
1327 	if (rt)
1328 		dst_free(&rt->dst);
1329 	return err;
1330 }
1331 
1332 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1333 {
1334 	int err;
1335 	struct fib6_table *table;
1336 	struct net *net = dev_net(rt->rt6i_dev);
1337 
1338 	if (rt == net->ipv6.ip6_null_entry)
1339 		return -ENOENT;
1340 
1341 	table = rt->rt6i_table;
1342 	write_lock_bh(&table->tb6_lock);
1343 
1344 	err = fib6_del(rt, info);
1345 	dst_release(&rt->dst);
1346 
1347 	write_unlock_bh(&table->tb6_lock);
1348 
1349 	return err;
1350 }
1351 
1352 int ip6_del_rt(struct rt6_info *rt)
1353 {
1354 	struct nl_info info = {
1355 		.nl_net = dev_net(rt->rt6i_dev),
1356 	};
1357 	return __ip6_del_rt(rt, &info);
1358 }
1359 
1360 static int ip6_route_del(struct fib6_config *cfg)
1361 {
1362 	struct fib6_table *table;
1363 	struct fib6_node *fn;
1364 	struct rt6_info *rt;
1365 	int err = -ESRCH;
1366 
1367 	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1368 	if (table == NULL)
1369 		return err;
1370 
1371 	read_lock_bh(&table->tb6_lock);
1372 
1373 	fn = fib6_locate(&table->tb6_root,
1374 			 &cfg->fc_dst, cfg->fc_dst_len,
1375 			 &cfg->fc_src, cfg->fc_src_len);
1376 
1377 	if (fn) {
1378 		for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1379 			if (cfg->fc_ifindex &&
1380 			    (rt->rt6i_dev == NULL ||
1381 			     rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1382 				continue;
1383 			if (cfg->fc_flags & RTF_GATEWAY &&
1384 			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1385 				continue;
1386 			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1387 				continue;
1388 			dst_hold(&rt->dst);
1389 			read_unlock_bh(&table->tb6_lock);
1390 
1391 			return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1392 		}
1393 	}
1394 	read_unlock_bh(&table->tb6_lock);
1395 
1396 	return err;
1397 }
1398 
1399 /*
1400  *	Handle redirects
1401  */
1402 struct ip6rd_flowi {
1403 	struct flowi fl;
1404 	struct in6_addr gateway;
1405 };
1406 
1407 static struct rt6_info *__ip6_route_redirect(struct net *net,
1408 					     struct fib6_table *table,
1409 					     struct flowi *fl,
1410 					     int flags)
1411 {
1412 	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl;
1413 	struct rt6_info *rt;
1414 	struct fib6_node *fn;
1415 
1416 	/*
1417 	 * Get the "current" route for this destination and
1418 	 * check if the redirect has come from an appropriate router.
1419 	 *
1420 	 * RFC 2461 specifies that redirects should only be
1421 	 * accepted if they come from the nexthop to the target.
1422 	 * Due to the way the routes are chosen, this notion
1423 	 * is a bit fuzzy and one might need to check all possible
1424 	 * routes.
1425 	 */
1426 
1427 	read_lock_bh(&table->tb6_lock);
1428 	fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
1429 restart:
1430 	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1431 		/*
1432 		 * If the current route is on-link, a redirect would normally be invalid.
1433 		 *
1434 		 * However, that is not always true: another node may regard us
1435 		 * as on-link (e.g. proxy ndisc), and the router serving it might
1436 		 * then decide that we should know the truth 8)8)
1437 		 * --ANK (980726).
1438 		 */
1439 		if (rt6_check_expired(rt))
1440 			continue;
1441 		if (!(rt->rt6i_flags & RTF_GATEWAY))
1442 			continue;
1443 		if (fl->oif != rt->rt6i_dev->ifindex)
1444 			continue;
1445 		if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1446 			continue;
1447 		break;
1448 	}
1449 
1450 	if (!rt)
1451 		rt = net->ipv6.ip6_null_entry;
1452 	BACKTRACK(net, &fl->fl6_src);
1453 out:
1454 	dst_hold(&rt->dst);
1455 
1456 	read_unlock_bh(&table->tb6_lock);
1457 
1458 	return rt;
1459 };
1460 
1461 static struct rt6_info *ip6_route_redirect(struct in6_addr *dest,
1462 					   struct in6_addr *src,
1463 					   struct in6_addr *gateway,
1464 					   struct net_device *dev)
1465 {
1466 	int flags = RT6_LOOKUP_F_HAS_SADDR;
1467 	struct net *net = dev_net(dev);
1468 	struct ip6rd_flowi rdfl = {
1469 		.fl = {
1470 			.oif = dev->ifindex,
1471 			.fl6_dst = *dest,
1472 			.fl6_src = *src,
1473 		},
1474 	};
1475 
1476 	ipv6_addr_copy(&rdfl.gateway, gateway);
1477 
1478 	if (rt6_need_strict(dest))
1479 		flags |= RT6_LOOKUP_F_IFACE;
1480 
1481 	return (struct rt6_info *)fib6_rule_lookup(net, (struct flowi *)&rdfl,
1482 						   flags, __ip6_route_redirect);
1483 }
1484 
1485 void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
1486 		  struct in6_addr *saddr,
1487 		  struct neighbour *neigh, u8 *lladdr, int on_link)
1488 {
1489 	struct rt6_info *rt, *nrt = NULL;
1490 	struct netevent_redirect netevent;
1491 	struct net *net = dev_net(neigh->dev);
1492 
1493 	rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1494 
1495 	if (rt == net->ipv6.ip6_null_entry) {
1496 		if (net_ratelimit())
1497 			printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1498 			       "for redirect target\n");
1499 		goto out;
1500 	}
1501 
1502 	/*
1503 	 *	We have finally decided to accept it.
1504 	 */
1505 
1506 	neigh_update(neigh, lladdr, NUD_STALE,
1507 		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
1508 		     NEIGH_UPDATE_F_OVERRIDE|
1509 		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1510 				     NEIGH_UPDATE_F_ISROUTER))
1511 		     );
1512 
1513 	/*
1514 	 * Redirect received -> path was valid.
1515 	 * Redirects are sent only in response to data packets,
1516 	 * so this nexthop is apparently reachable. --ANK
1517 	 */
1518 	dst_confirm(&rt->dst);
1519 
1520 	/* Duplicate redirect: silently ignore. */
1521 	if (neigh == rt->dst.neighbour)
1522 		goto out;
1523 
1524 	nrt = ip6_rt_copy(rt);
1525 	if (nrt == NULL)
1526 		goto out;
1527 
1528 	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1529 	if (on_link)
1530 		nrt->rt6i_flags &= ~RTF_GATEWAY;
1531 
1532 	ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1533 	nrt->rt6i_dst.plen = 128;
1534 	nrt->dst.flags |= DST_HOST;
1535 
1536 	ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1537 	nrt->rt6i_nexthop = neigh_clone(neigh);
1538 
1539 	if (ip6_ins_rt(nrt))
1540 		goto out;
1541 
1542 	netevent.old = &rt->dst;
1543 	netevent.new = &nrt->dst;
1544 	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1545 
1546 	if (rt->rt6i_flags&RTF_CACHE) {
1547 		ip6_del_rt(rt);
1548 		return;
1549 	}
1550 
1551 out:
1552 	dst_release(&rt->dst);
1553 }
1554 
1555 /*
1556  *	Handle ICMP "packet too big" messages
1557  *	i.e. Path MTU discovery
1558  */
1559 
1560 static void rt6_do_pmtu_disc(struct in6_addr *daddr, struct in6_addr *saddr,
1561 			     struct net *net, u32 pmtu, int ifindex)
1562 {
1563 	struct rt6_info *rt, *nrt;
1564 	int allfrag = 0;
1565 again:
1566 	rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1567 	if (rt == NULL)
1568 		return;
1569 
1570 	if (rt6_check_expired(rt)) {
1571 		ip6_del_rt(rt);
1572 		goto again;
1573 	}
1574 
1575 	if (pmtu >= dst_mtu(&rt->dst))
1576 		goto out;
1577 
1578 	if (pmtu < IPV6_MIN_MTU) {
1579 		/*
1580 		 * According to RFC 2460, when a node receives a Packet Too Big
1581 		 * message reporting a PMTU less than the IPv6 minimum link MTU
1582 		 * (1280), it sets the PMTU to that minimum and must include a
1583 		 * fragment header in all subsequent packets on the path.
1584 		 */
1585 		pmtu = IPV6_MIN_MTU;
1586 		allfrag = 1;
1587 	}
1588 
1589 	/* New MTU received -> path was valid.
1590 	   Too Big messages are sent only in response to data packets,
1591 	   so this nexthop is apparently reachable. --ANK
1592 	 */
1593 	dst_confirm(&rt->dst);
1594 
1595 	/* Host route. If it is static, it is better not to
1596 	   override it but to add a new one, so that when the
1597 	   cache entry expires the old PMTU is restored
1598 	   automatically.
1599 	 */
1600 	if (rt->rt6i_flags & RTF_CACHE) {
1601 		dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
1602 		if (allfrag) {
1603 			u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1604 			features |= RTAX_FEATURE_ALLFRAG;
1605 			dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1606 		}
1607 		dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1608 		rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1609 		goto out;
1610 	}
1611 
1612 	/* Network route.
1613 	   Two cases are possible:
1614 	   1. It is a connected route. Action: COW it.
1615 	   2. It is a gatewayed or NONEXTHOP route. Action: clone it.
1616 	 */
1617 	if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1618 		nrt = rt6_alloc_cow(rt, daddr, saddr);
1619 	else
1620 		nrt = rt6_alloc_clone(rt, daddr);
1621 
1622 	if (nrt) {
1623 		dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1624 		if (allfrag) {
1625 			u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1626 			features |= RTAX_FEATURE_ALLFRAG;
1627 			dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
1628 		}
1629 
1630 		/* According to RFC 1981, a PMTU increase should not be probed
1631 		 * for within 5 minutes; the recommended timer is 10 minutes.
1632 		 * Here the route expiration time is set to ip6_rt_mtu_expires,
1633 		 * which is 10 minutes. After 10 minutes the decreased PMTU
1634 		 * expires and PMTU increase detection happens automatically.
1635 		 */
1636 		dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1637 		nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1638 
1639 		ip6_ins_rt(nrt);
1640 	}
1641 out:
1642 	dst_release(&rt->dst);
1643 }
1644 
1645 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1646 			struct net_device *dev, u32 pmtu)
1647 {
1648 	struct net *net = dev_net(dev);
1649 
1650 	/*
1651 	 * RFC 1981 states that a node "MUST reduce the size of the packets it
1652 	 * is sending along the path" that caused the Packet Too Big message.
1653 	 * Since it's not possible in the general case to determine which
1654 	 * interface was used to send the original packet, we update the MTU
1655 	 * on the interface that will be used to send future packets. We also
1656 	 * update the MTU on the interface that received the Packet Too Big in
1657 	 * case the original packet was forced out that interface with
1658 	 * SO_BINDTODEVICE or similar. This is the next best thing to the
1659 	 * correct behaviour, which would be to update the MTU on all
1660 	 * interfaces.
1661 	 */
1662 	rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1663 	rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1664 }
1665 
1666 /*
1667  *	Misc support functions
1668  */
1669 
1670 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1671 {
1672 	struct net *net = dev_net(ort->rt6i_dev);
1673 	struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1674 
1675 	if (rt) {
1676 		rt->dst.input = ort->dst.input;
1677 		rt->dst.output = ort->dst.output;
1678 
1679 		dst_copy_metrics(&rt->dst, &ort->dst);
1680 		rt->dst.error = ort->dst.error;
1681 		rt->dst.dev = ort->dst.dev;
1682 		if (rt->dst.dev)
1683 			dev_hold(rt->dst.dev);
1684 		rt->rt6i_idev = ort->rt6i_idev;
1685 		if (rt->rt6i_idev)
1686 			in6_dev_hold(rt->rt6i_idev);
1687 		rt->dst.lastuse = jiffies;
1688 		rt->rt6i_expires = 0;
1689 
1690 		ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1691 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1692 		rt->rt6i_metric = 0;
1693 
1694 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1695 #ifdef CONFIG_IPV6_SUBTREES
1696 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1697 #endif
1698 		rt->rt6i_table = ort->rt6i_table;
1699 	}
1700 	return rt;
1701 }
1702 
1703 #ifdef CONFIG_IPV6_ROUTE_INFO
1704 static struct rt6_info *rt6_get_route_info(struct net *net,
1705 					   struct in6_addr *prefix, int prefixlen,
1706 					   struct in6_addr *gwaddr, int ifindex)
1707 {
1708 	struct fib6_node *fn;
1709 	struct rt6_info *rt = NULL;
1710 	struct fib6_table *table;
1711 
1712 	table = fib6_get_table(net, RT6_TABLE_INFO);
1713 	if (table == NULL)
1714 		return NULL;
1715 
1716 	write_lock_bh(&table->tb6_lock);
1717 	fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1718 	if (!fn)
1719 		goto out;
1720 
1721 	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1722 		if (rt->rt6i_dev->ifindex != ifindex)
1723 			continue;
1724 		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1725 			continue;
1726 		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1727 			continue;
1728 		dst_hold(&rt->dst);
1729 		break;
1730 	}
1731 out:
1732 	write_unlock_bh(&table->tb6_lock);
1733 	return rt;
1734 }
1735 
1736 static struct rt6_info *rt6_add_route_info(struct net *net,
1737 					   struct in6_addr *prefix, int prefixlen,
1738 					   struct in6_addr *gwaddr, int ifindex,
1739 					   unsigned pref)
1740 {
1741 	struct fib6_config cfg = {
1742 		.fc_table	= RT6_TABLE_INFO,
1743 		.fc_metric	= IP6_RT_PRIO_USER,
1744 		.fc_ifindex	= ifindex,
1745 		.fc_dst_len	= prefixlen,
1746 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1747 				  RTF_UP | RTF_PREF(pref),
1748 		.fc_nlinfo.pid = 0,
1749 		.fc_nlinfo.nlh = NULL,
1750 		.fc_nlinfo.nl_net = net,
1751 	};
1752 
1753 	ipv6_addr_copy(&cfg.fc_dst, prefix);
1754 	ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1755 
1756 	/* We should treat it as a default route if prefix length is 0. */
1757 	if (!prefixlen)
1758 		cfg.fc_flags |= RTF_DEFAULT;
1759 
1760 	ip6_route_add(&cfg);
1761 
1762 	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1763 }
1764 #endif
1765 
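/*
 * Default router list management: rt6_get_dflt_router() looks up the
 * RTF_ADDRCONF|RTF_DEFAULT entry for a given gateway and device in the
 * default table, rt6_add_dflt_router() installs one, and
 * rt6_purge_dflt_routers() removes them all.
 */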
1766 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1767 {
1768 	struct rt6_info *rt;
1769 	struct fib6_table *table;
1770 
1771 	table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1772 	if (table == NULL)
1773 		return NULL;
1774 
1775 	write_lock_bh(&table->tb6_lock);
1776 	for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1777 		if (dev == rt->rt6i_dev &&
1778 		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1779 		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
1780 			break;
1781 	}
1782 	if (rt)
1783 		dst_hold(&rt->dst);
1784 	write_unlock_bh(&table->tb6_lock);
1785 	return rt;
1786 }
1787 
1788 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1789 				     struct net_device *dev,
1790 				     unsigned int pref)
1791 {
1792 	struct fib6_config cfg = {
1793 		.fc_table	= RT6_TABLE_DFLT,
1794 		.fc_metric	= IP6_RT_PRIO_USER,
1795 		.fc_ifindex	= dev->ifindex,
1796 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1797 				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1798 		.fc_nlinfo.pid = 0,
1799 		.fc_nlinfo.nlh = NULL,
1800 		.fc_nlinfo.nl_net = dev_net(dev),
1801 	};
1802 
1803 	ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1804 
1805 	ip6_route_add(&cfg);
1806 
1807 	return rt6_get_dflt_router(gwaddr, dev);
1808 }
1809 
1810 void rt6_purge_dflt_routers(struct net *net)
1811 {
1812 	struct rt6_info *rt;
1813 	struct fib6_table *table;
1814 
1815 	/* NOTE: Keep consistent with rt6_get_dflt_router */
1816 	table = fib6_get_table(net, RT6_TABLE_DFLT);
1817 	if (table == NULL)
1818 		return;
1819 
1820 restart:
1821 	read_lock_bh(&table->tb6_lock);
1822 	for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1823 		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1824 			dst_hold(&rt->dst);
1825 			read_unlock_bh(&table->tb6_lock);
1826 			ip6_del_rt(rt);
1827 			goto restart;
1828 		}
1829 	}
1830 	read_unlock_bh(&table->tb6_lock);
1831 }
1832 
1833 static void rtmsg_to_fib6_config(struct net *net,
1834 				 struct in6_rtmsg *rtmsg,
1835 				 struct fib6_config *cfg)
1836 {
1837 	memset(cfg, 0, sizeof(*cfg));
1838 
1839 	cfg->fc_table = RT6_TABLE_MAIN;
1840 	cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1841 	cfg->fc_metric = rtmsg->rtmsg_metric;
1842 	cfg->fc_expires = rtmsg->rtmsg_info;
1843 	cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1844 	cfg->fc_src_len = rtmsg->rtmsg_src_len;
1845 	cfg->fc_flags = rtmsg->rtmsg_flags;
1846 
1847 	cfg->fc_nlinfo.nl_net = net;
1848 
1849 	ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1850 	ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1851 	ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1852 }
1853 
1854 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1855 {
1856 	struct fib6_config cfg;
1857 	struct in6_rtmsg rtmsg;
1858 	int err;
1859 
1860 	switch(cmd) {
1861 	case SIOCADDRT:		/* Add a route */
1862 	case SIOCDELRT:		/* Delete a route */
1863 		if (!capable(CAP_NET_ADMIN))
1864 			return -EPERM;
1865 		err = copy_from_user(&rtmsg, arg,
1866 				     sizeof(struct in6_rtmsg));
1867 		if (err)
1868 			return -EFAULT;
1869 
1870 		rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1871 
1872 		rtnl_lock();
1873 		switch (cmd) {
1874 		case SIOCADDRT:
1875 			err = ip6_route_add(&cfg);
1876 			break;
1877 		case SIOCDELRT:
1878 			err = ip6_route_del(&cfg);
1879 			break;
1880 		default:
1881 			err = -EINVAL;
1882 		}
1883 		rtnl_unlock();
1884 
1885 		return err;
1886 	}
1887 
1888 	return -EINVAL;
1889 }
1890 
1891 /*
1892  *	Drop the packet on the floor
1893  */
1894 
1895 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
1896 {
1897 	int type;
1898 	struct dst_entry *dst = skb_dst(skb);
1899 	switch (ipstats_mib_noroutes) {
1900 	case IPSTATS_MIB_INNOROUTES:
1901 		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1902 		if (type == IPV6_ADDR_ANY) {
1903 			IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1904 				      IPSTATS_MIB_INADDRERRORS);
1905 			break;
1906 		}
1907 		/* FALLTHROUGH */
1908 	case IPSTATS_MIB_OUTNOROUTES:
1909 		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1910 			      ipstats_mib_noroutes);
1911 		break;
1912 	}
1913 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
1914 	kfree_skb(skb);
1915 	return 0;
1916 }
1917 
1918 static int ip6_pkt_discard(struct sk_buff *skb)
1919 {
1920 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
1921 }
1922 
1923 static int ip6_pkt_discard_out(struct sk_buff *skb)
1924 {
1925 	skb->dev = skb_dst(skb)->dev;
1926 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
1927 }
1928 
1929 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1930 
1931 static int ip6_pkt_prohibit(struct sk_buff *skb)
1932 {
1933 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
1934 }
1935 
1936 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
1937 {
1938 	skb->dev = skb_dst(skb)->dev;
1939 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
1940 }
1941 
1942 #endif
1943 
1944 /*
1945  *	Allocate a dst for local (unicast / anycast) address.
1946  */
1947 
1948 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1949 				    const struct in6_addr *addr,
1950 				    int anycast)
1951 {
1952 	struct net *net = dev_net(idev->dev);
1953 	struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1954 	struct neighbour *neigh;
1955 
1956 	if (rt == NULL) {
1957 		if (net_ratelimit())
1958 			pr_warning("IPv6:  Maximum number of routes reached,"
1959 				   " consider increasing route/max_size.\n");
1960 		return ERR_PTR(-ENOMEM);
1961 	}
1962 
1963 	dev_hold(net->loopback_dev);
1964 	in6_dev_hold(idev);
1965 
1966 	rt->dst.flags = DST_HOST;
1967 	rt->dst.input = ip6_input;
1968 	rt->dst.output = ip6_output;
1969 	rt->rt6i_dev = net->loopback_dev;
1970 	rt->rt6i_idev = idev;
1971 	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, -1);
1972 	rt->dst.obsolete = -1;
1973 
1974 	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1975 	if (anycast)
1976 		rt->rt6i_flags |= RTF_ANYCAST;
1977 	else
1978 		rt->rt6i_flags |= RTF_LOCAL;
1979 	neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1980 	if (IS_ERR(neigh)) {
1981 		dst_free(&rt->dst);
1982 
1983 		/* We are casting this because that is the return
1984 		 * value type.  But an errno encoded pointer is the
1985 		 * same regardless of the underlying pointer type,
1986 		 * and that's what we are returning.  So this is OK.
1987 		 */
1988 		return (struct rt6_info *) neigh;
1989 	}
1990 	rt->rt6i_nexthop = neigh;
1991 
1992 	ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1993 	rt->rt6i_dst.plen = 128;
1994 	rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
1995 
1996 	atomic_set(&rt->dst.__refcnt, 1);
1997 
1998 	return rt;
1999 }
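
/*
 * Illustrative usage sketch (not part of the original source): callers such
 * as addrconf check the return value of addrconf_dst_alloc() with IS_ERR(),
 * which is what makes the errno-encoded cast above safe.  The helper name
 * below is hypothetical.
 */
static inline int example_add_local_route(struct inet6_dev *idev,
					  const struct in6_addr *addr)
{
	struct rt6_info *rt = addrconf_dst_alloc(idev, addr, 0);

	if (IS_ERR(rt))
		return PTR_ERR(rt);	/* -ENOMEM or a neighbour lookup error */

	/* Insert the route into the IPv6 FIB (local table). */
	return ip6_ins_rt(rt);
}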
2000 
2001 struct arg_dev_net {
2002 	struct net_device *dev;
2003 	struct net *net;
2004 };
2005 
2006 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2007 {
2008 	const struct arg_dev_net *adn = arg;
2009 	const struct net_device *dev = adn->dev;
2010 
2011 	if ((rt->rt6i_dev == dev || dev == NULL) &&
2012 	    rt != adn->net->ipv6.ip6_null_entry) {
2013 		RT6_TRACE("deleted by ifdown %p\n", rt);
2014 		return -1;
2015 	}
2016 	return 0;
2017 }
2018 
2019 void rt6_ifdown(struct net *net, struct net_device *dev)
2020 {
2021 	struct arg_dev_net adn = {
2022 		.dev = dev,
2023 		.net = net,
2024 	};
2025 
2026 	fib6_clean_all(net, fib6_ifdown, 0, &adn);
2027 	icmp6_clean_all(fib6_ifdown, &adn);
2028 }
2029 
2030 struct rt6_mtu_change_arg
2031 {
2032 	struct net_device *dev;
2033 	unsigned mtu;
2034 };
2035 
2036 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2037 {
2038 	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2039 	struct inet6_dev *idev;
2040 
2041 	/* In IPv6, PMTU discovery is not optional,
2042 	   so the RTAX_MTU lock cannot disable it.
2043 	   We still use this lock to block changes
2044 	   caused by addrconf/ndisc.
2045 	*/
2046 
2047 	idev = __in6_dev_get(arg->dev);
2048 	if (idev == NULL)
2049 		return 0;
2050 
2051 	/* For an administrative MTU increase there is no way to discover
2052 	   an IPv6 PMTU increase, so the PMTU must be updated here.
2053 	   Since RFC 1981 does not cover administrative MTU increases,
2054 	   updating the PMTU on increase is a MUST (e.g. jumbo frames).
2055 	 */
2056 	/*
2057 	   If the new MTU is less than the route PMTU, the new MTU is now
2058 	   the lowest MTU in the path; update the route PMTU to reflect
2059 	   the decrease.  If the new MTU is greater than the route PMTU
2060 	   and the old MTU was the lowest MTU in the path, update the
2061 	   route PMTU to reflect the increase.  If another node on the
2062 	   path still has the lowest MTU, its Packet Too Big message
2063 	   will trigger PMTU discovery again (see the sketch below).
2064 	 */
2065 	if (rt->rt6i_dev == arg->dev &&
2066 	    !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2067 	    (dst_mtu(&rt->dst) >= arg->mtu ||
2068 	     (dst_mtu(&rt->dst) < arg->mtu &&
2069 	      dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2070 		dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2071 	}
2072 	return 0;
2073 }
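
/*
 * Illustrative sketch (not part of the original source): the update rule in
 * rt6_mtu_change_route() reduces to the predicate below; the real code
 * additionally requires the route to sit on the changed device and its
 * RTAX_MTU metric to be unlocked.  The helper name is hypothetical.
 */
static inline bool rt6_mtu_update_applies(u32 route_mtu, u32 new_dev_mtu,
					  u32 old_dev_mtu6)
{
	/* Decrease: the new device MTU is now the lowest MTU in the path. */
	if (route_mtu >= new_dev_mtu)
		return true;
	/* Increase: only if the old device MTU was the path minimum. */
	return route_mtu == old_dev_mtu6;
}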
2074 
2075 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2076 {
2077 	struct rt6_mtu_change_arg arg = {
2078 		.dev = dev,
2079 		.mtu = mtu,
2080 	};
2081 
2082 	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2083 }
2084 
2085 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2086 	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2087 	[RTA_OIF]               = { .type = NLA_U32 },
2088 	[RTA_IIF]		= { .type = NLA_U32 },
2089 	[RTA_PRIORITY]          = { .type = NLA_U32 },
2090 	[RTA_METRICS]           = { .type = NLA_NESTED },
2091 };
2092 
2093 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2094 			      struct fib6_config *cfg)
2095 {
2096 	struct rtmsg *rtm;
2097 	struct nlattr *tb[RTA_MAX+1];
2098 	int err;
2099 
2100 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2101 	if (err < 0)
2102 		goto errout;
2103 
2104 	err = -EINVAL;
2105 	rtm = nlmsg_data(nlh);
2106 	memset(cfg, 0, sizeof(*cfg));
2107 
2108 	cfg->fc_table = rtm->rtm_table;
2109 	cfg->fc_dst_len = rtm->rtm_dst_len;
2110 	cfg->fc_src_len = rtm->rtm_src_len;
2111 	cfg->fc_flags = RTF_UP;
2112 	cfg->fc_protocol = rtm->rtm_protocol;
2113 
2114 	if (rtm->rtm_type == RTN_UNREACHABLE)
2115 		cfg->fc_flags |= RTF_REJECT;
2116 
2117 	if (rtm->rtm_type == RTN_LOCAL)
2118 		cfg->fc_flags |= RTF_LOCAL;
2119 
2120 	cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2121 	cfg->fc_nlinfo.nlh = nlh;
2122 	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2123 
2124 	if (tb[RTA_GATEWAY]) {
2125 		nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2126 		cfg->fc_flags |= RTF_GATEWAY;
2127 	}
2128 
2129 	if (tb[RTA_DST]) {
2130 		int plen = (rtm->rtm_dst_len + 7) >> 3;
2131 
2132 		if (nla_len(tb[RTA_DST]) < plen)
2133 			goto errout;
2134 
2135 		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2136 	}
2137 
2138 	if (tb[RTA_SRC]) {
2139 		int plen = (rtm->rtm_src_len + 7) >> 3;
2140 
2141 		if (nla_len(tb[RTA_SRC]) < plen)
2142 			goto errout;
2143 
2144 		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2145 	}
2146 
2147 	if (tb[RTA_OIF])
2148 		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2149 
2150 	if (tb[RTA_PRIORITY])
2151 		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2152 
2153 	if (tb[RTA_METRICS]) {
2154 		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2155 		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2156 	}
2157 
2158 	if (tb[RTA_TABLE])
2159 		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2160 
2161 	err = 0;
2162 errout:
2163 	return err;
2164 }
2165 
2166 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
2167 {
2168 	struct fib6_config cfg;
2169 	int err;
2170 
2171 	err = rtm_to_fib6_config(skb, nlh, &cfg);
2172 	if (err < 0)
2173 		return err;
2174 
2175 	return ip6_route_del(&cfg);
2176 }
2177 
2178 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
2179 {
2180 	struct fib6_config cfg;
2181 	int err;
2182 
2183 	err = rtm_to_fib6_config(skb, nlh, &cfg);
2184 	if (err < 0)
2185 		return err;
2186 
2187 	return ip6_route_add(&cfg);
2188 }
2189 
2190 static inline size_t rt6_nlmsg_size(void)
2191 {
2192 	return NLMSG_ALIGN(sizeof(struct rtmsg))
2193 	       + nla_total_size(16) /* RTA_SRC */
2194 	       + nla_total_size(16) /* RTA_DST */
2195 	       + nla_total_size(16) /* RTA_GATEWAY */
2196 	       + nla_total_size(16) /* RTA_PREFSRC */
2197 	       + nla_total_size(4) /* RTA_TABLE */
2198 	       + nla_total_size(4) /* RTA_IIF */
2199 	       + nla_total_size(4) /* RTA_OIF */
2200 	       + nla_total_size(4) /* RTA_PRIORITY */
2201 	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2202 	       + nla_total_size(sizeof(struct rta_cacheinfo));
2203 }
2204 
2205 static int rt6_fill_node(struct net *net,
2206 			 struct sk_buff *skb, struct rt6_info *rt,
2207 			 struct in6_addr *dst, struct in6_addr *src,
2208 			 int iif, int type, u32 pid, u32 seq,
2209 			 int prefix, int nowait, unsigned int flags)
2210 {
2211 	struct rtmsg *rtm;
2212 	struct nlmsghdr *nlh;
2213 	long expires;
2214 	u32 table;
2215 
2216 	if (prefix) {	/* user wants prefix routes only */
2217 		if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2218 			/* success since this is not a prefix route */
2219 			return 1;
2220 		}
2221 	}
2222 
2223 	nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2224 	if (nlh == NULL)
2225 		return -EMSGSIZE;
2226 
2227 	rtm = nlmsg_data(nlh);
2228 	rtm->rtm_family = AF_INET6;
2229 	rtm->rtm_dst_len = rt->rt6i_dst.plen;
2230 	rtm->rtm_src_len = rt->rt6i_src.plen;
2231 	rtm->rtm_tos = 0;
2232 	if (rt->rt6i_table)
2233 		table = rt->rt6i_table->tb6_id;
2234 	else
2235 		table = RT6_TABLE_UNSPEC;
2236 	rtm->rtm_table = table;
2237 	NLA_PUT_U32(skb, RTA_TABLE, table);
2238 	if (rt->rt6i_flags&RTF_REJECT)
2239 		rtm->rtm_type = RTN_UNREACHABLE;
2240 	else if (rt->rt6i_flags&RTF_LOCAL)
2241 		rtm->rtm_type = RTN_LOCAL;
2242 	else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2243 		rtm->rtm_type = RTN_LOCAL;
2244 	else
2245 		rtm->rtm_type = RTN_UNICAST;
2246 	rtm->rtm_flags = 0;
2247 	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2248 	rtm->rtm_protocol = rt->rt6i_protocol;
2249 	if (rt->rt6i_flags&RTF_DYNAMIC)
2250 		rtm->rtm_protocol = RTPROT_REDIRECT;
2251 	else if (rt->rt6i_flags & RTF_ADDRCONF)
2252 		rtm->rtm_protocol = RTPROT_KERNEL;
2253 	else if (rt->rt6i_flags&RTF_DEFAULT)
2254 		rtm->rtm_protocol = RTPROT_RA;
2255 
2256 	if (rt->rt6i_flags&RTF_CACHE)
2257 		rtm->rtm_flags |= RTM_F_CLONED;
2258 
2259 	if (dst) {
2260 		NLA_PUT(skb, RTA_DST, 16, dst);
2261 		rtm->rtm_dst_len = 128;
2262 	} else if (rtm->rtm_dst_len)
2263 		NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2264 #ifdef CONFIG_IPV6_SUBTREES
2265 	if (src) {
2266 		NLA_PUT(skb, RTA_SRC, 16, src);
2267 		rtm->rtm_src_len = 128;
2268 	} else if (rtm->rtm_src_len)
2269 		NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2270 #endif
2271 	if (iif) {
2272 #ifdef CONFIG_IPV6_MROUTE
2273 		if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2274 			int err = ip6mr_get_route(net, skb, rtm, nowait);
2275 			if (err <= 0) {
2276 				if (!nowait) {
2277 					if (err == 0)
2278 						return 0;
2279 					goto nla_put_failure;
2280 				} else {
2281 					if (err == -EMSGSIZE)
2282 						goto nla_put_failure;
2283 				}
2284 			}
2285 		} else
2286 #endif
2287 			NLA_PUT_U32(skb, RTA_IIF, iif);
2288 	} else if (dst) {
2289 		struct inet6_dev *idev = ip6_dst_idev(&rt->dst);
2290 		struct in6_addr saddr_buf;
2291 		if (ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2292 				       dst, 0, &saddr_buf) == 0)
2293 			NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2294 	}
2295 
2296 	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2297 		goto nla_put_failure;
2298 
2299 	if (rt->dst.neighbour)
2300 		NLA_PUT(skb, RTA_GATEWAY, 16, &rt->dst.neighbour->primary_key);
2301 
2302 	if (rt->dst.dev)
2303 		NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2304 
2305 	NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2306 
2307 	if (!(rt->rt6i_flags & RTF_EXPIRES))
2308 		expires = 0;
2309 	else if (rt->rt6i_expires - jiffies < INT_MAX)
2310 		expires = rt->rt6i_expires - jiffies;
2311 	else
2312 		expires = INT_MAX;
2313 
2314 	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
2315 			       expires, rt->dst.error) < 0)
2316 		goto nla_put_failure;
2317 
2318 	return nlmsg_end(skb, nlh);
2319 
2320 nla_put_failure:
2321 	nlmsg_cancel(skb, nlh);
2322 	return -EMSGSIZE;
2323 }
2324 
2325 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2326 {
2327 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2328 	int prefix;
2329 
2330 	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2331 		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2332 		prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2333 	} else
2334 		prefix = 0;
2335 
2336 	return rt6_fill_node(arg->net,
2337 		     arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2338 		     NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2339 		     prefix, 0, NLM_F_MULTI);
2340 }
2341 
2342 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
2343 {
2344 	struct net *net = sock_net(in_skb->sk);
2345 	struct nlattr *tb[RTA_MAX+1];
2346 	struct rt6_info *rt;
2347 	struct sk_buff *skb;
2348 	struct rtmsg *rtm;
2349 	struct flowi fl;
2350 	int err, iif = 0;
2351 
2352 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2353 	if (err < 0)
2354 		goto errout;
2355 
2356 	err = -EINVAL;
2357 	memset(&fl, 0, sizeof(fl));
2358 
2359 	if (tb[RTA_SRC]) {
2360 		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2361 			goto errout;
2362 
2363 		ipv6_addr_copy(&fl.fl6_src, nla_data(tb[RTA_SRC]));
2364 	}
2365 
2366 	if (tb[RTA_DST]) {
2367 		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2368 			goto errout;
2369 
2370 		ipv6_addr_copy(&fl.fl6_dst, nla_data(tb[RTA_DST]));
2371 	}
2372 
2373 	if (tb[RTA_IIF])
2374 		iif = nla_get_u32(tb[RTA_IIF]);
2375 
2376 	if (tb[RTA_OIF])
2377 		fl.oif = nla_get_u32(tb[RTA_OIF]);
2378 
2379 	if (iif) {
2380 		struct net_device *dev;
2381 		dev = __dev_get_by_index(net, iif);
2382 		if (!dev) {
2383 			err = -ENODEV;
2384 			goto errout;
2385 		}
2386 	}
2387 
2388 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2389 	if (skb == NULL) {
2390 		err = -ENOBUFS;
2391 		goto errout;
2392 	}
2393 
2394 	/* Reserve room for dummy headers; this skb can pass
2395 	   through a good chunk of the routing engine.
2396 	 */
2397 	skb_reset_mac_header(skb);
2398 	skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2399 
2400 	rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl);
2401 	skb_dst_set(skb, &rt->dst);
2402 
2403 	err = rt6_fill_node(net, skb, rt, &fl.fl6_dst, &fl.fl6_src, iif,
2404 			    RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2405 			    nlh->nlmsg_seq, 0, 0, 0);
2406 	if (err < 0) {
2407 		kfree_skb(skb);
2408 		goto errout;
2409 	}
2410 
2411 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2412 errout:
2413 	return err;
2414 }
2415 
2416 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2417 {
2418 	struct sk_buff *skb;
2419 	struct net *net = info->nl_net;
2420 	u32 seq;
2421 	int err;
2422 
2423 	err = -ENOBUFS;
2424 	seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2425 
2426 	skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2427 	if (skb == NULL)
2428 		goto errout;
2429 
2430 	err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2431 				event, info->pid, seq, 0, 0, 0);
2432 	if (err < 0) {
2433 		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2434 		WARN_ON(err == -EMSGSIZE);
2435 		kfree_skb(skb);
2436 		goto errout;
2437 	}
2438 	rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2439 		    info->nlh, gfp_any());
2440 	return;
2441 errout:
2442 	if (err < 0)
2443 		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2444 }
2445 
2446 static int ip6_route_dev_notify(struct notifier_block *this,
2447 				unsigned long event, void *data)
2448 {
2449 	struct net_device *dev = (struct net_device *)data;
2450 	struct net *net = dev_net(dev);
2451 
2452 	if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2453 		net->ipv6.ip6_null_entry->dst.dev = dev;
2454 		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2455 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2456 		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2457 		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2458 		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2459 		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2460 #endif
2461 	}
2462 
2463 	return NOTIFY_OK;
2464 }
2465 
2466 /*
2467  *	/proc
2468  */
2469 
2470 #ifdef CONFIG_PROC_FS
2471 
2472 struct rt6_proc_arg
2473 {
2474 	char *buffer;
2475 	int offset;
2476 	int length;
2477 	int skip;
2478 	int len;
2479 };
2480 
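/*
 * Emit one /proc/net/ipv6_route line per route:
 *   <dst addr> <dst plen> <src addr> <src plen> <next hop>
 *   <metric> <refcnt> <use> <flags> <device>
 * Addresses are printed as 32 hex digits (no colons), the numeric
 * fields as hex words.
 */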
2481 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2482 {
2483 	struct seq_file *m = p_arg;
2484 
2485 	seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2486 
2487 #ifdef CONFIG_IPV6_SUBTREES
2488 	seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2489 #else
2490 	seq_puts(m, "00000000000000000000000000000000 00 ");
2491 #endif
2492 
2493 	if (rt->rt6i_nexthop) {
2494 		seq_printf(m, "%pi6", rt->rt6i_nexthop->primary_key);
2495 	} else {
2496 		seq_puts(m, "00000000000000000000000000000000");
2497 	}
2498 	seq_printf(m, " %08x %08x %08x %08x %8s\n",
2499 		   rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2500 		   rt->dst.__use, rt->rt6i_flags,
2501 		   rt->rt6i_dev ? rt->rt6i_dev->name : "");
2502 	return 0;
2503 }
2504 
2505 static int ipv6_route_show(struct seq_file *m, void *v)
2506 {
2507 	struct net *net = (struct net *)m->private;
2508 	fib6_clean_all(net, rt6_info_route, 0, m);
2509 	return 0;
2510 }
2511 
2512 static int ipv6_route_open(struct inode *inode, struct file *file)
2513 {
2514 	return single_open_net(inode, file, ipv6_route_show);
2515 }
2516 
2517 static const struct file_operations ipv6_route_proc_fops = {
2518 	.owner		= THIS_MODULE,
2519 	.open		= ipv6_route_open,
2520 	.read		= seq_read,
2521 	.llseek		= seq_lseek,
2522 	.release	= single_release_net,
2523 };
2524 
2525 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2526 {
2527 	struct net *net = (struct net *)seq->private;
2528 	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2529 		   net->ipv6.rt6_stats->fib_nodes,
2530 		   net->ipv6.rt6_stats->fib_route_nodes,
2531 		   net->ipv6.rt6_stats->fib_rt_alloc,
2532 		   net->ipv6.rt6_stats->fib_rt_entries,
2533 		   net->ipv6.rt6_stats->fib_rt_cache,
2534 		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2535 		   net->ipv6.rt6_stats->fib_discarded_routes);
2536 
2537 	return 0;
2538 }
2539 
2540 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2541 {
2542 	return single_open_net(inode, file, rt6_stats_seq_show);
2543 }
2544 
2545 static const struct file_operations rt6_stats_seq_fops = {
2546 	.owner	 = THIS_MODULE,
2547 	.open	 = rt6_stats_seq_open,
2548 	.read	 = seq_read,
2549 	.llseek	 = seq_lseek,
2550 	.release = single_release_net,
2551 };
2552 #endif	/* CONFIG_PROC_FS */
2553 
2554 #ifdef CONFIG_SYSCTL
2555 
2556 static
2557 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2558 			      void __user *buffer, size_t *lenp, loff_t *ppos)
2559 {
2560 	struct net *net = current->nsproxy->net_ns;
2561 	int delay = net->ipv6.sysctl.flush_delay;
2562 	if (write) {
2563 		proc_dointvec(ctl, write, buffer, lenp, ppos);
2564 		fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2565 		return 0;
2566 	} else
2567 		return -EINVAL;
2568 }
2569 
2570 ctl_table ipv6_route_table_template[] = {
2571 	{
2572 		.procname	=	"flush",
2573 		.data		=	&init_net.ipv6.sysctl.flush_delay,
2574 		.maxlen		=	sizeof(int),
2575 		.mode		=	0200,
2576 		.proc_handler	=	ipv6_sysctl_rtcache_flush
2577 	},
2578 	{
2579 		.procname	=	"gc_thresh",
2580 		.data		=	&ip6_dst_ops_template.gc_thresh,
2581 		.maxlen		=	sizeof(int),
2582 		.mode		=	0644,
2583 		.proc_handler	=	proc_dointvec,
2584 	},
2585 	{
2586 		.procname	=	"max_size",
2587 		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
2588 		.maxlen		=	sizeof(int),
2589 		.mode		=	0644,
2590 		.proc_handler	=	proc_dointvec,
2591 	},
2592 	{
2593 		.procname	=	"gc_min_interval",
2594 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2595 		.maxlen		=	sizeof(int),
2596 		.mode		=	0644,
2597 		.proc_handler	=	proc_dointvec_jiffies,
2598 	},
2599 	{
2600 		.procname	=	"gc_timeout",
2601 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2602 		.maxlen		=	sizeof(int),
2603 		.mode		=	0644,
2604 		.proc_handler	=	proc_dointvec_jiffies,
2605 	},
2606 	{
2607 		.procname	=	"gc_interval",
2608 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
2609 		.maxlen		=	sizeof(int),
2610 		.mode		=	0644,
2611 		.proc_handler	=	proc_dointvec_jiffies,
2612 	},
2613 	{
2614 		.procname	=	"gc_elasticity",
2615 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2616 		.maxlen		=	sizeof(int),
2617 		.mode		=	0644,
2618 		.proc_handler	=	proc_dointvec,
2619 	},
2620 	{
2621 		.procname	=	"mtu_expires",
2622 		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2623 		.maxlen		=	sizeof(int),
2624 		.mode		=	0644,
2625 		.proc_handler	=	proc_dointvec_jiffies,
2626 	},
2627 	{
2628 		.procname	=	"min_adv_mss",
2629 		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
2630 		.maxlen		=	sizeof(int),
2631 		.mode		=	0644,
2632 		.proc_handler	=	proc_dointvec,
2633 	},
2634 	{
2635 		.procname	=	"gc_min_interval_ms",
2636 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2637 		.maxlen		=	sizeof(int),
2638 		.mode		=	0644,
2639 		.proc_handler	=	proc_dointvec_ms_jiffies,
2640 	},
2641 	{ }
2642 };
2643 
2644 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2645 {
2646 	struct ctl_table *table;
2647 
2648 	table = kmemdup(ipv6_route_table_template,
2649 			sizeof(ipv6_route_table_template),
2650 			GFP_KERNEL);
2651 
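	/*
	 * The table[N].data fixups below are positional and must stay in
	 * sync with the entry order of ipv6_route_table_template[] above.
	 */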
2652 	if (table) {
2653 		table[0].data = &net->ipv6.sysctl.flush_delay;
2654 		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2655 		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2656 		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2657 		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2658 		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2659 		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2660 		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2661 		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2662 		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2663 	}
2664 
2665 	return table;
2666 }
2667 #endif
2668 
2669 static int __net_init ip6_route_net_init(struct net *net)
2670 {
2671 	int ret = -ENOMEM;
2672 
2673 	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2674 	       sizeof(net->ipv6.ip6_dst_ops));
2675 
2676 	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2677 		goto out_ip6_dst_ops;
2678 
2679 	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2680 					   sizeof(*net->ipv6.ip6_null_entry),
2681 					   GFP_KERNEL);
2682 	if (!net->ipv6.ip6_null_entry)
2683 		goto out_ip6_dst_entries;
2684 	net->ipv6.ip6_null_entry->dst.path =
2685 		(struct dst_entry *)net->ipv6.ip6_null_entry;
2686 	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2687 	dst_metric_set(&net->ipv6.ip6_null_entry->dst, RTAX_HOPLIMIT, 255);
2688 
2689 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2690 	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2691 					       sizeof(*net->ipv6.ip6_prohibit_entry),
2692 					       GFP_KERNEL);
2693 	if (!net->ipv6.ip6_prohibit_entry)
2694 		goto out_ip6_null_entry;
2695 	net->ipv6.ip6_prohibit_entry->dst.path =
2696 		(struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2697 	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2698 	dst_metric_set(&net->ipv6.ip6_prohibit_entry->dst, RTAX_HOPLIMIT, 255);
2699 
2700 	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2701 					       sizeof(*net->ipv6.ip6_blk_hole_entry),
2702 					       GFP_KERNEL);
2703 	if (!net->ipv6.ip6_blk_hole_entry)
2704 		goto out_ip6_prohibit_entry;
2705 	net->ipv6.ip6_blk_hole_entry->dst.path =
2706 		(struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2707 	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2708 	dst_metric_set(&net->ipv6.ip6_blk_hole_entry->dst, RTAX_HOPLIMIT, 255);
2709 #endif
2710 
2711 	net->ipv6.sysctl.flush_delay = 0;
2712 	net->ipv6.sysctl.ip6_rt_max_size = 4096;
2713 	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2714 	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2715 	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2716 	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2717 	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2718 	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2719 
2720 #ifdef CONFIG_PROC_FS
2721 	proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2722 	proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2723 #endif
2724 	net->ipv6.ip6_rt_gc_expire = 30*HZ;
2725 
2726 	ret = 0;
2727 out:
2728 	return ret;
2729 
2730 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2731 out_ip6_prohibit_entry:
2732 	kfree(net->ipv6.ip6_prohibit_entry);
2733 out_ip6_null_entry:
2734 	kfree(net->ipv6.ip6_null_entry);
2735 #endif
2736 out_ip6_dst_entries:
2737 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2738 out_ip6_dst_ops:
2739 	goto out;
2740 }
2741 
2742 static void __net_exit ip6_route_net_exit(struct net *net)
2743 {
2744 #ifdef CONFIG_PROC_FS
2745 	proc_net_remove(net, "ipv6_route");
2746 	proc_net_remove(net, "rt6_stats");
2747 #endif
2748 	kfree(net->ipv6.ip6_null_entry);
2749 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2750 	kfree(net->ipv6.ip6_prohibit_entry);
2751 	kfree(net->ipv6.ip6_blk_hole_entry);
2752 #endif
2753 	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2754 }
2755 
2756 static struct pernet_operations ip6_route_net_ops = {
2757 	.init = ip6_route_net_init,
2758 	.exit = ip6_route_net_exit,
2759 };
2760 
2761 static struct notifier_block ip6_route_dev_notifier = {
2762 	.notifier_call = ip6_route_dev_notify,
2763 	.priority = 0,
2764 };
2765 
2766 int __init ip6_route_init(void)
2767 {
2768 	int ret;
2769 
2770 	ret = -ENOMEM;
2771 	ip6_dst_ops_template.kmem_cachep =
2772 		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2773 				  SLAB_HWCACHE_ALIGN, NULL);
2774 	if (!ip6_dst_ops_template.kmem_cachep)
2775 		goto out;
2776 
2777 	ret = dst_entries_init(&ip6_dst_blackhole_ops);
2778 	if (ret)
2779 		goto out_kmem_cache;
2780 
2781 	ret = register_pernet_subsys(&ip6_route_net_ops);
2782 	if (ret)
2783 		goto out_dst_entries;
2784 
2785 	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2786 
2787 	/* The loopback device is registered before this code runs, so the
2788 	 * loopback reference in rt6_info is not taken there; take it
2789 	 * manually for init_net. */
2790 	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
2791 	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2792 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2793 	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
2794 	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2795 	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
2796 	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2797 #endif
2798 	ret = fib6_init();
2799 	if (ret)
2800 		goto out_register_subsys;
2801 
2802 	ret = xfrm6_init();
2803 	if (ret)
2804 		goto out_fib6_init;
2805 
2806 	ret = fib6_rules_init();
2807 	if (ret)
2808 		goto xfrm6_init;
2809 
2810 	ret = -ENOBUFS;
2811 	if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL) ||
2812 	    __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL) ||
2813 	    __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL))
2814 		goto fib6_rules_init;
2815 
2816 	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
2817 	if (ret)
2818 		goto fib6_rules_init;
2819 
2820 out:
2821 	return ret;
2822 
2823 fib6_rules_init:
2824 	fib6_rules_cleanup();
2825 xfrm6_init:
2826 	xfrm6_fini();
2827 out_fib6_init:
2828 	fib6_gc_cleanup();
2829 out_register_subsys:
2830 	unregister_pernet_subsys(&ip6_route_net_ops);
2831 out_dst_entries:
2832 	dst_entries_destroy(&ip6_dst_blackhole_ops);
2833 out_kmem_cache:
2834 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2835 	goto out;
2836 }
2837 
2838 void ip6_route_cleanup(void)
2839 {
2840 	unregister_netdevice_notifier(&ip6_route_dev_notifier);
2841 	fib6_rules_cleanup();
2842 	xfrm6_fini();
2843 	fib6_gc_cleanup();
2844 	unregister_pernet_subsys(&ip6_route_net_ops);
2845 	dst_entries_destroy(&ip6_dst_blackhole_ops);
2846 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2847 }
2848