xref: /openbmc/linux/net/ipv6/route.c (revision b8bb76713ec50df2f11efee386e16f93d51e1076)
1 /*
2  *	Linux INET6 implementation
3  *	FIB front-end.
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13 
14 /*	Changes:
15  *
16  *	YOSHIFUJI Hideaki @USAGI
17  *		reworked default router selection.
18  *		- respect outgoing interface
19  *		- select from (probably) reachable routers (i.e.
20  *		routers in REACHABLE, STALE, DELAY or PROBE states).
21  *		- always select the same router if it is (probably)
22  *		reachable.  otherwise, round-robin the list.
23  *	Ville Nuorvala
24  *		Fixed routing subtrees.
25  */
26 
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/mroute6.h>
38 #include <linux/init.h>
39 #include <linux/if_arp.h>
40 #include <linux/proc_fs.h>
41 #include <linux/seq_file.h>
42 #include <linux/nsproxy.h>
43 #include <net/net_namespace.h>
44 #include <net/snmp.h>
45 #include <net/ipv6.h>
46 #include <net/ip6_fib.h>
47 #include <net/ip6_route.h>
48 #include <net/ndisc.h>
49 #include <net/addrconf.h>
50 #include <net/tcp.h>
51 #include <linux/rtnetlink.h>
52 #include <net/dst.h>
53 #include <net/xfrm.h>
54 #include <net/netevent.h>
55 #include <net/netlink.h>
56 
57 #include <asm/uaccess.h>
58 
59 #ifdef CONFIG_SYSCTL
60 #include <linux/sysctl.h>
61 #endif
62 
63 /* Set to 3 to get tracing. */
64 #define RT6_DEBUG 2
65 
66 #if RT6_DEBUG >= 3
67 #define RDBG(x) printk x
68 #define RT6_TRACE(x...) printk(KERN_DEBUG x)
69 #else
70 #define RDBG(x)
71 #define RT6_TRACE(x...) do { ; } while (0)
72 #endif
73 
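/*
 * Note: when CLONE_OFFLINK_ROUTE below is set to 1, ip6_pol_route() also
 * clones routes that already have a nexthop neighbour (gatewayed or
 * RTF_NONEXTHOP routes) into per-destination RTF_CACHE entries via
 * rt6_alloc_clone(); with 0, such routes are used as-is.
 */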
74 #define CLONE_OFFLINK_ROUTE 0
75 
76 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
77 static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
78 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
79 static void		ip6_dst_destroy(struct dst_entry *);
80 static void		ip6_dst_ifdown(struct dst_entry *,
81 				       struct net_device *dev, int how);
82 static int		 ip6_dst_gc(struct dst_ops *ops);
83 
84 static int		ip6_pkt_discard(struct sk_buff *skb);
85 static int		ip6_pkt_discard_out(struct sk_buff *skb);
86 static void		ip6_link_failure(struct sk_buff *skb);
87 static void		ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
88 
89 #ifdef CONFIG_IPV6_ROUTE_INFO
90 static struct rt6_info *rt6_add_route_info(struct net *net,
91 					   struct in6_addr *prefix, int prefixlen,
92 					   struct in6_addr *gwaddr, int ifindex,
93 					   unsigned pref);
94 static struct rt6_info *rt6_get_route_info(struct net *net,
95 					   struct in6_addr *prefix, int prefixlen,
96 					   struct in6_addr *gwaddr, int ifindex);
97 #endif
98 
99 static struct dst_ops ip6_dst_ops_template = {
100 	.family			=	AF_INET6,
101 	.protocol		=	cpu_to_be16(ETH_P_IPV6),
102 	.gc			=	ip6_dst_gc,
103 	.gc_thresh		=	1024,
104 	.check			=	ip6_dst_check,
105 	.destroy		=	ip6_dst_destroy,
106 	.ifdown			=	ip6_dst_ifdown,
107 	.negative_advice	=	ip6_negative_advice,
108 	.link_failure		=	ip6_link_failure,
109 	.update_pmtu		=	ip6_rt_update_pmtu,
110 	.local_out		=	__ip6_local_out,
111 	.entries		=	ATOMIC_INIT(0),
112 };
113 
114 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
115 {
116 }
117 
118 static struct dst_ops ip6_dst_blackhole_ops = {
119 	.family			=	AF_INET6,
120 	.protocol		=	cpu_to_be16(ETH_P_IPV6),
121 	.destroy		=	ip6_dst_destroy,
122 	.check			=	ip6_dst_check,
123 	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
124 	.entries		=	ATOMIC_INIT(0),
125 };
126 
127 static struct rt6_info ip6_null_entry_template = {
128 	.u = {
129 		.dst = {
130 			.__refcnt	= ATOMIC_INIT(1),
131 			.__use		= 1,
132 			.obsolete	= -1,
133 			.error		= -ENETUNREACH,
134 			.metrics	= { [RTAX_HOPLIMIT - 1] = 255, },
135 			.input		= ip6_pkt_discard,
136 			.output		= ip6_pkt_discard_out,
137 		}
138 	},
139 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
140 	.rt6i_metric	= ~(u32) 0,
141 	.rt6i_ref	= ATOMIC_INIT(1),
142 };
143 
144 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
145 
146 static int ip6_pkt_prohibit(struct sk_buff *skb);
147 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
148 
149 static struct rt6_info ip6_prohibit_entry_template = {
150 	.u = {
151 		.dst = {
152 			.__refcnt	= ATOMIC_INIT(1),
153 			.__use		= 1,
154 			.obsolete	= -1,
155 			.error		= -EACCES,
156 			.metrics	= { [RTAX_HOPLIMIT - 1] = 255, },
157 			.input		= ip6_pkt_prohibit,
158 			.output		= ip6_pkt_prohibit_out,
159 		}
160 	},
161 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
162 	.rt6i_metric	= ~(u32) 0,
163 	.rt6i_ref	= ATOMIC_INIT(1),
164 };
165 
166 static struct rt6_info ip6_blk_hole_entry_template = {
167 	.u = {
168 		.dst = {
169 			.__refcnt	= ATOMIC_INIT(1),
170 			.__use		= 1,
171 			.obsolete	= -1,
172 			.error		= -EINVAL,
173 			.metrics	= { [RTAX_HOPLIMIT - 1] = 255, },
174 			.input		= dst_discard,
175 			.output		= dst_discard,
176 		}
177 	},
178 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
179 	.rt6i_metric	= ~(u32) 0,
180 	.rt6i_ref	= ATOMIC_INIT(1),
181 };
182 
183 #endif
184 
185 /* allocate dst with ip6_dst_ops */
186 static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops)
187 {
188 	return (struct rt6_info *)dst_alloc(ops);
189 }
190 
191 static void ip6_dst_destroy(struct dst_entry *dst)
192 {
193 	struct rt6_info *rt = (struct rt6_info *)dst;
194 	struct inet6_dev *idev = rt->rt6i_idev;
195 
196 	if (idev != NULL) {
197 		rt->rt6i_idev = NULL;
198 		in6_dev_put(idev);
199 	}
200 }
201 
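/*
 * Called when a device goes down: if this route's inet6_dev belongs to the
 * dying device, re-parent it to the loopback device's inet6_dev so the dst
 * can outlive the device without holding a stale idev reference.
 */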
202 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
203 			   int how)
204 {
205 	struct rt6_info *rt = (struct rt6_info *)dst;
206 	struct inet6_dev *idev = rt->rt6i_idev;
207 	struct net_device *loopback_dev =
208 		dev_net(dev)->loopback_dev;
209 
210 	if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
211 		struct inet6_dev *loopback_idev =
212 			in6_dev_get(loopback_dev);
213 		if (loopback_idev != NULL) {
214 			rt->rt6i_idev = loopback_idev;
215 			in6_dev_put(idev);
216 		}
217 	}
218 }
219 
220 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
221 {
222 	return (rt->rt6i_flags & RTF_EXPIRES &&
223 		time_after(jiffies, rt->rt6i_expires));
224 }
225 
226 static inline int rt6_need_strict(struct in6_addr *daddr)
227 {
228 	return (ipv6_addr_type(daddr) &
229 		(IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK));
230 }
231 
232 /*
233  *	Route lookup. Any table->tb6_lock is implied.
234  */
235 
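/*
 * rt6_device_match() walks the routes sharing a fib6_node leaf and picks one
 * that matches the requested device and/or source address: with a non-zero
 * oif it prefers an exact device match, remembering a loopback route bound to
 * that interface as a fallback; without an oif it returns the route whose
 * device owns the given source address.  With neither an oif nor a source
 * address the head of the list is returned unchanged, and if strict interface
 * matching (RT6_LOOKUP_F_IFACE) fails, the null entry is returned.
 */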
236 static inline struct rt6_info *rt6_device_match(struct net *net,
237 						    struct rt6_info *rt,
238 						    struct in6_addr *saddr,
239 						    int oif,
240 						    int flags)
241 {
242 	struct rt6_info *local = NULL;
243 	struct rt6_info *sprt;
244 
245 	if (!oif && ipv6_addr_any(saddr))
246 		goto out;
247 
248 	for (sprt = rt; sprt; sprt = sprt->u.dst.rt6_next) {
249 		struct net_device *dev = sprt->rt6i_dev;
250 
251 		if (oif) {
252 			if (dev->ifindex == oif)
253 				return sprt;
254 			if (dev->flags & IFF_LOOPBACK) {
255 				if (sprt->rt6i_idev == NULL ||
256 				    sprt->rt6i_idev->dev->ifindex != oif) {
257 					if (flags & RT6_LOOKUP_F_IFACE && oif)
258 						continue;
259 					if (local && (!oif ||
260 						      local->rt6i_idev->dev->ifindex == oif))
261 						continue;
262 				}
263 				local = sprt;
264 			}
265 		} else {
266 			if (ipv6_chk_addr(net, saddr, dev,
267 					  flags & RT6_LOOKUP_F_IFACE))
268 				return sprt;
269 		}
270 	}
271 
272 	if (oif) {
273 		if (local)
274 			return local;
275 
276 		if (flags & RT6_LOOKUP_F_IFACE)
277 			return net->ipv6.ip6_null_entry;
278 	}
279 out:
280 	return rt;
281 }
282 
283 #ifdef CONFIG_IPV6_ROUTER_PREF
284 static void rt6_probe(struct rt6_info *rt)
285 {
286 	struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
287 	/*
288 	 * Okay, this does not seem to be appropriate
289 	 * for now; however, we need to check whether it
290 	 * really is, aka Router Reachability Probing.
291 	 *
292 	 * Router Reachability Probe MUST be rate-limited
293 	 * to no more than one per minute.
294 	 */
295 	if (!neigh || (neigh->nud_state & NUD_VALID))
296 		return;
297 	read_lock_bh(&neigh->lock);
298 	if (!(neigh->nud_state & NUD_VALID) &&
299 	    time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
300 		struct in6_addr mcaddr;
301 		struct in6_addr *target;
302 
303 		neigh->updated = jiffies;
304 		read_unlock_bh(&neigh->lock);
305 
306 		target = (struct in6_addr *)&neigh->primary_key;
307 		addrconf_addr_solict_mult(target, &mcaddr);
308 		ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
309 	} else
310 		read_unlock_bh(&neigh->lock);
311 }
312 #else
313 static inline void rt6_probe(struct rt6_info *rt)
314 {
315 	return;
316 }
317 #endif
318 
319 /*
320  * Default Router Selection (RFC 2461 6.3.6)
321  */
322 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
323 {
324 	struct net_device *dev = rt->rt6i_dev;
325 	if (!oif || dev->ifindex == oif)
326 		return 2;
327 	if ((dev->flags & IFF_LOOPBACK) &&
328 	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
329 		return 1;
330 	return 0;
331 }
332 
333 static inline int rt6_check_neigh(struct rt6_info *rt)
334 {
335 	struct neighbour *neigh = rt->rt6i_nexthop;
336 	int m;
337 	if (rt->rt6i_flags & RTF_NONEXTHOP ||
338 	    !(rt->rt6i_flags & RTF_GATEWAY))
339 		m = 1;
340 	else if (neigh) {
341 		read_lock_bh(&neigh->lock);
342 		if (neigh->nud_state & NUD_VALID)
343 			m = 2;
344 #ifdef CONFIG_IPV6_ROUTER_PREF
345 		else if (neigh->nud_state & NUD_FAILED)
346 			m = 0;
347 #endif
348 		else
349 			m = 1;
350 		read_unlock_bh(&neigh->lock);
351 	} else
352 		m = 0;
353 	return m;
354 }
355 
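/*
 * rt6_score_route() builds a score from two parts: bits 0-1 hold the
 * interface match from rt6_check_dev() (2 when the device matches the
 * requested oif, or no oif was given; 1 for a loopback route bound to that
 * interface; 0 otherwise).  When CONFIG_IPV6_ROUTER_PREF is enabled, the
 * decoded RFC 4191 router preference is OR-ed in from bit 2 upwards, so
 * preference dominates among routes that satisfy the interface check.
 * Reachability from rt6_check_neigh() is never added to the score; it only
 * disqualifies a route (-1) when RT6_LOOKUP_F_REACHABLE is requested.
 */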
356 static int rt6_score_route(struct rt6_info *rt, int oif,
357 			   int strict)
358 {
359 	int m, n;
360 
361 	m = rt6_check_dev(rt, oif);
362 	if (!m && (strict & RT6_LOOKUP_F_IFACE))
363 		return -1;
364 #ifdef CONFIG_IPV6_ROUTER_PREF
365 	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
366 #endif
367 	n = rt6_check_neigh(rt);
368 	if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
369 		return -1;
370 	return m;
371 }
372 
373 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
374 				   int *mpri, struct rt6_info *match)
375 {
376 	int m;
377 
378 	if (rt6_check_expired(rt))
379 		goto out;
380 
381 	m = rt6_score_route(rt, oif, strict);
382 	if (m < 0)
383 		goto out;
384 
385 	if (m > *mpri) {
386 		if (strict & RT6_LOOKUP_F_REACHABLE)
387 			rt6_probe(match);
388 		*mpri = m;
389 		match = rt;
390 	} else if (strict & RT6_LOOKUP_F_REACHABLE) {
391 		rt6_probe(rt);
392 	}
393 
394 out:
395 	return match;
396 }
397 
398 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
399 				     struct rt6_info *rr_head,
400 				     u32 metric, int oif, int strict)
401 {
402 	struct rt6_info *rt, *match;
403 	int mpri = -1;
404 
405 	match = NULL;
406 	for (rt = rr_head; rt && rt->rt6i_metric == metric;
407 	     rt = rt->u.dst.rt6_next)
408 		match = find_match(rt, oif, strict, &mpri, match);
409 	for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
410 	     rt = rt->u.dst.rt6_next)
411 		match = find_match(rt, oif, strict, &mpri, match);
412 
413 	return match;
414 }
415 
416 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
417 {
418 	struct rt6_info *match, *rt0;
419 	struct net *net;
420 
421 	RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
422 		  __func__, fn->leaf, oif);
423 
424 	rt0 = fn->rr_ptr;
425 	if (!rt0)
426 		fn->rr_ptr = rt0 = fn->leaf;
427 
428 	match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
429 
430 	if (!match &&
431 	    (strict & RT6_LOOKUP_F_REACHABLE)) {
432 		struct rt6_info *next = rt0->u.dst.rt6_next;
433 
434 		/* no entries matched; do round-robin */
435 		if (!next || next->rt6i_metric != rt0->rt6i_metric)
436 			next = fn->leaf;
437 
438 		if (next != rt0)
439 			fn->rr_ptr = next;
440 	}
441 
442 	RT6_TRACE("%s() => %p\n",
443 		  __func__, match);
444 
445 	net = dev_net(rt0->rt6i_dev);
446 	return (match ? match : net->ipv6.ip6_null_entry);
447 }
448 
449 #ifdef CONFIG_IPV6_ROUTE_INFO
450 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
451 		  struct in6_addr *gwaddr)
452 {
453 	struct net *net = dev_net(dev);
454 	struct route_info *rinfo = (struct route_info *) opt;
455 	struct in6_addr prefix_buf, *prefix;
456 	unsigned int pref;
457 	unsigned long lifetime;
458 	struct rt6_info *rt;
459 
460 	if (len < sizeof(struct route_info)) {
461 		return -EINVAL;
462 	}
463 
464 	/* Sanity check for prefix_len and length (length is in units of 8 octets) */
465 	if (rinfo->length > 3) {
466 		return -EINVAL;
467 	} else if (rinfo->prefix_len > 128) {
468 		return -EINVAL;
469 	} else if (rinfo->prefix_len > 64) {
470 		if (rinfo->length < 2) {
471 			return -EINVAL;
472 		}
473 	} else if (rinfo->prefix_len > 0) {
474 		if (rinfo->length < 1) {
475 			return -EINVAL;
476 		}
477 	}
478 
479 	pref = rinfo->route_pref;
480 	if (pref == ICMPV6_ROUTER_PREF_INVALID)
481 		pref = ICMPV6_ROUTER_PREF_MEDIUM;
482 
483 	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
484 
485 	if (rinfo->length == 3)
486 		prefix = (struct in6_addr *)rinfo->prefix;
487 	else {
488 		/* ipv6_addr_prefix() copies only prefix_len bits, so this is safe */
489 		ipv6_addr_prefix(&prefix_buf,
490 				 (struct in6_addr *)rinfo->prefix,
491 				 rinfo->prefix_len);
492 		prefix = &prefix_buf;
493 	}
494 
495 	rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
496 				dev->ifindex);
497 
498 	if (rt && !lifetime) {
499 		ip6_del_rt(rt);
500 		rt = NULL;
501 	}
502 
503 	if (!rt && lifetime)
504 		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
505 					pref);
506 	else if (rt)
507 		rt->rt6i_flags = RTF_ROUTEINFO |
508 				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
509 
510 	if (rt) {
511 		if (!addrconf_finite_timeout(lifetime)) {
512 			rt->rt6i_flags &= ~RTF_EXPIRES;
513 		} else {
514 			rt->rt6i_expires = jiffies + HZ * lifetime;
515 			rt->rt6i_flags |= RTF_EXPIRES;
516 		}
517 		dst_release(&rt->u.dst);
518 	}
519 	return 0;
520 }
521 #endif
522 
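/*
 * BACKTRACK() is used by the lookup routines below after a leaf has been
 * matched.  The caller must provide 'fn' (the current fib6_node), 'rt' (the
 * matched route) and the labels 'restart' and 'out'.  If the match came back
 * as the null entry, the macro walks back up the tree, diving into a
 * source-routed subtree (FIB6_SUBTREE) where one exists, and jumps to
 * 'restart' at the first ancestor that carries route info, or to 'out' once
 * the tree root is reached.
 */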
523 #define BACKTRACK(__net, saddr)			\
524 do { \
525 	if (rt == __net->ipv6.ip6_null_entry) {	\
526 		struct fib6_node *pn; \
527 		while (1) { \
528 			if (fn->fn_flags & RTN_TL_ROOT) \
529 				goto out; \
530 			pn = fn->parent; \
531 			if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
532 				fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
533 			else \
534 				fn = pn; \
535 			if (fn->fn_flags & RTN_RTINFO) \
536 				goto restart; \
537 		} \
538 	} \
539 } while(0)
540 
541 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
542 					     struct fib6_table *table,
543 					     struct flowi *fl, int flags)
544 {
545 	struct fib6_node *fn;
546 	struct rt6_info *rt;
547 
548 	read_lock_bh(&table->tb6_lock);
549 	fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
550 restart:
551 	rt = fn->leaf;
552 	rt = rt6_device_match(net, rt, &fl->fl6_src, fl->oif, flags);
553 	BACKTRACK(net, &fl->fl6_src);
554 out:
555 	dst_use(&rt->u.dst, jiffies);
556 	read_unlock_bh(&table->tb6_lock);
557 	return rt;
558 
559 }
560 
561 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
562 			    const struct in6_addr *saddr, int oif, int strict)
563 {
564 	struct flowi fl = {
565 		.oif = oif,
566 		.nl_u = {
567 			.ip6_u = {
568 				.daddr = *daddr,
569 			},
570 		},
571 	};
572 	struct dst_entry *dst;
573 	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
574 
575 	if (saddr) {
576 		memcpy(&fl.fl6_src, saddr, sizeof(*saddr));
577 		flags |= RT6_LOOKUP_F_HAS_SADDR;
578 	}
579 
580 	dst = fib6_rule_lookup(net, &fl, flags, ip6_pol_route_lookup);
581 	if (dst->error == 0)
582 		return (struct rt6_info *) dst;
583 
584 	dst_release(dst);
585 
586 	return NULL;
587 }
588 
589 EXPORT_SYMBOL(rt6_lookup);
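
/*
 * Typical rt6_lookup() usage (see e.g. rt6_pmtu_discovery() below): the
 * returned rt6_info is referenced and must be released with dst_release():
 *
 *	rt = rt6_lookup(net, daddr, saddr, dev->ifindex, 0);
 *	if (rt) {
 *		... use rt ...
 *		dst_release(&rt->u.dst);
 *	}
 */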
590 
591 /* ip6_ins_rt is called with table->tb6_lock NOT held.
592    It takes a new route entry; if the addition fails for any reason,
593    the route is freed. In any case, if the caller does not hold a
594    reference, the route may be destroyed.
595  */
596 
597 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
598 {
599 	int err;
600 	struct fib6_table *table;
601 
602 	table = rt->rt6i_table;
603 	write_lock_bh(&table->tb6_lock);
604 	err = fib6_add(&table->tb6_root, rt, info);
605 	write_unlock_bh(&table->tb6_lock);
606 
607 	return err;
608 }
609 
610 int ip6_ins_rt(struct rt6_info *rt)
611 {
612 	struct nl_info info = {
613 		.nl_net = dev_net(rt->rt6i_dev),
614 	};
615 	return __ip6_ins_rt(rt, &info);
616 }
617 
618 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
619 				      struct in6_addr *saddr)
620 {
621 	struct rt6_info *rt;
622 
623 	/*
624 	 *	Clone the route.
625 	 */
626 
627 	rt = ip6_rt_copy(ort);
628 
629 	if (rt) {
630 		struct neighbour *neigh;
631 		int attempts = !in_softirq();
632 
633 		if (!(rt->rt6i_flags&RTF_GATEWAY)) {
634 			if (rt->rt6i_dst.plen != 128 &&
635 			    ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
636 				rt->rt6i_flags |= RTF_ANYCAST;
637 			ipv6_addr_copy(&rt->rt6i_gateway, daddr);
638 		}
639 
640 		ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
641 		rt->rt6i_dst.plen = 128;
642 		rt->rt6i_flags |= RTF_CACHE;
643 		rt->u.dst.flags |= DST_HOST;
644 
645 #ifdef CONFIG_IPV6_SUBTREES
646 		if (rt->rt6i_src.plen && saddr) {
647 			ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
648 			rt->rt6i_src.plen = 128;
649 		}
650 #endif
651 
652 	retry:
653 		neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
654 		if (IS_ERR(neigh)) {
655 			struct net *net = dev_net(rt->rt6i_dev);
656 			int saved_rt_min_interval =
657 				net->ipv6.sysctl.ip6_rt_gc_min_interval;
658 			int saved_rt_elasticity =
659 				net->ipv6.sysctl.ip6_rt_gc_elasticity;
660 
661 			if (attempts-- > 0) {
662 				net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
663 				net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
664 
665 				ip6_dst_gc(net->ipv6.ip6_dst_ops);
666 
667 				net->ipv6.sysctl.ip6_rt_gc_elasticity =
668 					saved_rt_elasticity;
669 				net->ipv6.sysctl.ip6_rt_gc_min_interval =
670 					saved_rt_min_interval;
671 				goto retry;
672 			}
673 
674 			if (net_ratelimit())
675 				printk(KERN_WARNING
676 				       "Neighbour table overflow.\n");
677 			dst_free(&rt->u.dst);
678 			return NULL;
679 		}
680 		rt->rt6i_nexthop = neigh;
681 
682 	}
683 
684 	return rt;
685 }
686 
687 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
688 {
689 	struct rt6_info *rt = ip6_rt_copy(ort);
690 	if (rt) {
691 		ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
692 		rt->rt6i_dst.plen = 128;
693 		rt->rt6i_flags |= RTF_CACHE;
694 		rt->u.dst.flags |= DST_HOST;
695 		rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
696 	}
697 	return rt;
698 }
699 
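/*
 * Common slow path for input and output route lookups.  rt6_select() picks
 * the best route for the flow (on a host, the first pass insists on a
 * probably-reachable router and a second pass drops that requirement).
 * Unless the result is already an RTF_CACHE entry or the null entry, a
 * per-destination /128 clone is created: rt6_alloc_cow() when the nexthop
 * neighbour still has to be resolved, otherwise the route is used as-is
 * (or cloned, see CLONE_OFFLINK_ROUTE above).  The clone is then inserted
 * into the table; if that races with another insertion, the lookup is
 * retried (attempts starts at 3).
 */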
700 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
701 				      struct flowi *fl, int flags)
702 {
703 	struct fib6_node *fn;
704 	struct rt6_info *rt, *nrt;
705 	int strict = 0;
706 	int attempts = 3;
707 	int err;
708 	int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
709 
710 	strict |= flags & RT6_LOOKUP_F_IFACE;
711 
712 relookup:
713 	read_lock_bh(&table->tb6_lock);
714 
715 restart_2:
716 	fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
717 
718 restart:
719 	rt = rt6_select(fn, oif, strict | reachable);
720 
721 	BACKTRACK(net, &fl->fl6_src);
722 	if (rt == net->ipv6.ip6_null_entry ||
723 	    rt->rt6i_flags & RTF_CACHE)
724 		goto out;
725 
726 	dst_hold(&rt->u.dst);
727 	read_unlock_bh(&table->tb6_lock);
728 
729 	if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
730 		nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
731 	else {
732 #if CLONE_OFFLINK_ROUTE
733 		nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
734 #else
735 		goto out2;
736 #endif
737 	}
738 
739 	dst_release(&rt->u.dst);
740 	rt = nrt ? : net->ipv6.ip6_null_entry;
741 
742 	dst_hold(&rt->u.dst);
743 	if (nrt) {
744 		err = ip6_ins_rt(nrt);
745 		if (!err)
746 			goto out2;
747 	}
748 
749 	if (--attempts <= 0)
750 		goto out2;
751 
752 	/*
753 	 * Race condition! In the gap while table->tb6_lock was
754 	 * released, someone else could have inserted this route.  Relookup.
755 	 */
756 	dst_release(&rt->u.dst);
757 	goto relookup;
758 
759 out:
760 	if (reachable) {
761 		reachable = 0;
762 		goto restart_2;
763 	}
764 	dst_hold(&rt->u.dst);
765 	read_unlock_bh(&table->tb6_lock);
766 out2:
767 	rt->u.dst.lastuse = jiffies;
768 	rt->u.dst.__use++;
769 
770 	return rt;
771 }
772 
773 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
774 					    struct flowi *fl, int flags)
775 {
776 	return ip6_pol_route(net, table, fl->iif, fl, flags);
777 }
778 
779 void ip6_route_input(struct sk_buff *skb)
780 {
781 	struct ipv6hdr *iph = ipv6_hdr(skb);
782 	struct net *net = dev_net(skb->dev);
783 	int flags = RT6_LOOKUP_F_HAS_SADDR;
784 	struct flowi fl = {
785 		.iif = skb->dev->ifindex,
786 		.nl_u = {
787 			.ip6_u = {
788 				.daddr = iph->daddr,
789 				.saddr = iph->saddr,
790 				.flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
791 			},
792 		},
793 		.mark = skb->mark,
794 		.proto = iph->nexthdr,
795 	};
796 
797 	if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
798 		flags |= RT6_LOOKUP_F_IFACE;
799 
800 	skb->dst = fib6_rule_lookup(net, &fl, flags, ip6_pol_route_input);
801 }
802 
803 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
804 					     struct flowi *fl, int flags)
805 {
806 	return ip6_pol_route(net, table, fl->oif, fl, flags);
807 }
808 
809 struct dst_entry * ip6_route_output(struct net *net, struct sock *sk,
810 				    struct flowi *fl)
811 {
812 	int flags = 0;
813 
814 	if (rt6_need_strict(&fl->fl6_dst))
815 		flags |= RT6_LOOKUP_F_IFACE;
816 
817 	if (!ipv6_addr_any(&fl->fl6_src))
818 		flags |= RT6_LOOKUP_F_HAS_SADDR;
819 	else if (sk) {
820 		unsigned int prefs = inet6_sk(sk)->srcprefs;
821 		if (prefs & IPV6_PREFER_SRC_TMP)
822 			flags |= RT6_LOOKUP_F_SRCPREF_TMP;
823 		if (prefs & IPV6_PREFER_SRC_PUBLIC)
824 			flags |= RT6_LOOKUP_F_SRCPREF_PUBLIC;
825 		if (prefs & IPV6_PREFER_SRC_COA)
826 			flags |= RT6_LOOKUP_F_SRCPREF_COA;
827 	}
828 
829 	return fib6_rule_lookup(net, fl, flags, ip6_pol_route_output);
830 }
831 
832 EXPORT_SYMBOL(ip6_route_output);
833 
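/*
 * ip6_dst_blackhole() replaces *dstp with a standalone copy of the route
 * that keeps the device, metrics and routing keys but has its input and
 * output hooks set to dst_discard, so anything sent through it is silently
 * dropped.  The copy is never linked into a routing table.
 */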
834 int ip6_dst_blackhole(struct sock *sk, struct dst_entry **dstp, struct flowi *fl)
835 {
836 	struct rt6_info *ort = (struct rt6_info *) *dstp;
837 	struct rt6_info *rt = (struct rt6_info *)
838 		dst_alloc(&ip6_dst_blackhole_ops);
839 	struct dst_entry *new = NULL;
840 
841 	if (rt) {
842 		new = &rt->u.dst;
843 
844 		atomic_set(&new->__refcnt, 1);
845 		new->__use = 1;
846 		new->input = dst_discard;
847 		new->output = dst_discard;
848 
849 		memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
850 		new->dev = ort->u.dst.dev;
851 		if (new->dev)
852 			dev_hold(new->dev);
853 		rt->rt6i_idev = ort->rt6i_idev;
854 		if (rt->rt6i_idev)
855 			in6_dev_hold(rt->rt6i_idev);
856 		rt->rt6i_expires = 0;
857 
858 		ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
859 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
860 		rt->rt6i_metric = 0;
861 
862 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
863 #ifdef CONFIG_IPV6_SUBTREES
864 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
865 #endif
866 
867 		dst_free(new);
868 	}
869 
870 	dst_release(*dstp);
871 	*dstp = new;
872 	return (new ? 0 : -ENOMEM);
873 }
874 EXPORT_SYMBOL_GPL(ip6_dst_blackhole);
875 
876 /*
877  *	Destination cache support functions
878  */
879 
880 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
881 {
882 	struct rt6_info *rt;
883 
884 	rt = (struct rt6_info *) dst;
885 
886 	if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
887 		return dst;
888 
889 	return NULL;
890 }
891 
892 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
893 {
894 	struct rt6_info *rt = (struct rt6_info *) dst;
895 
896 	if (rt) {
897 		if (rt->rt6i_flags & RTF_CACHE)
898 			ip6_del_rt(rt);
899 		else
900 			dst_release(dst);
901 	}
902 	return NULL;
903 }
904 
905 static void ip6_link_failure(struct sk_buff *skb)
906 {
907 	struct rt6_info *rt;
908 
909 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
910 
911 	rt = (struct rt6_info *) skb->dst;
912 	if (rt) {
913 		if (rt->rt6i_flags&RTF_CACHE) {
914 			dst_set_expires(&rt->u.dst, 0);
915 			rt->rt6i_flags |= RTF_EXPIRES;
916 		} else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
917 			rt->rt6i_node->fn_sernum = -1;
918 	}
919 }
920 
921 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
922 {
923 	struct rt6_info *rt6 = (struct rt6_info*)dst;
924 
925 	if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
926 		rt6->rt6i_flags |= RTF_MODIFIED;
927 		if (mtu < IPV6_MIN_MTU) {
928 			mtu = IPV6_MIN_MTU;
929 			dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
930 		}
931 		dst->metrics[RTAX_MTU-1] = mtu;
932 		call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
933 	}
934 }
935 
936 static int ipv6_get_mtu(struct net_device *dev);
937 
938 static inline unsigned int ipv6_advmss(struct net *net, unsigned int mtu)
939 {
940 	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
941 
942 	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
943 		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
944 
945 	/*
946 	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
947 	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
948 	 * IPV6_MAXPLEN is also valid and means: "any MSS,
949 	 * rely only on pmtu discovery"
950 	 */
951 	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
952 		mtu = IPV6_MAXPLEN;
953 	return mtu;
954 }
955 
956 static struct dst_entry *icmp6_dst_gc_list;
957 static DEFINE_SPINLOCK(icmp6_dst_lock);
958 
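/*
 * dst entries allocated here (used by ndisc for its solicitations and
 * advertisements) are not inserted into the FIB.  Instead they are chained
 * on icmp6_dst_gc_list and reclaimed by icmp6_dst_gc() once their refcount
 * drops to zero; fib6_force_start_gc() makes sure the garbage collector
 * actually runs.
 */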
959 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
960 				  struct neighbour *neigh,
961 				  const struct in6_addr *addr)
962 {
963 	struct rt6_info *rt;
964 	struct inet6_dev *idev = in6_dev_get(dev);
965 	struct net *net = dev_net(dev);
966 
967 	if (unlikely(idev == NULL))
968 		return NULL;
969 
970 	rt = ip6_dst_alloc(net->ipv6.ip6_dst_ops);
971 	if (unlikely(rt == NULL)) {
972 		in6_dev_put(idev);
973 		goto out;
974 	}
975 
976 	dev_hold(dev);
977 	if (neigh)
978 		neigh_hold(neigh);
979 	else {
980 		neigh = ndisc_get_neigh(dev, addr);
981 		if (IS_ERR(neigh))
982 			neigh = NULL;
983 	}
984 
985 	rt->rt6i_dev	  = dev;
986 	rt->rt6i_idev     = idev;
987 	rt->rt6i_nexthop  = neigh;
988 	atomic_set(&rt->u.dst.__refcnt, 1);
989 	rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
990 	rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
991 	rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->u.dst));
992 	rt->u.dst.output  = ip6_output;
993 
994 #if 0	/* there's no chance to use these for ndisc */
995 	rt->u.dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
996 				? DST_HOST
997 				: 0;
998 	ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
999 	rt->rt6i_dst.plen = 128;
1000 #endif
1001 
1002 	spin_lock_bh(&icmp6_dst_lock);
1003 	rt->u.dst.next = icmp6_dst_gc_list;
1004 	icmp6_dst_gc_list = &rt->u.dst;
1005 	spin_unlock_bh(&icmp6_dst_lock);
1006 
1007 	fib6_force_start_gc(net);
1008 
1009 out:
1010 	return &rt->u.dst;
1011 }
1012 
1013 int icmp6_dst_gc(void)
1014 {
1015 	struct dst_entry *dst, *next, **pprev;
1016 	int more = 0;
1017 
1018 	next = NULL;
1019 
1020 	spin_lock_bh(&icmp6_dst_lock);
1021 	pprev = &icmp6_dst_gc_list;
1022 
1023 	while ((dst = *pprev) != NULL) {
1024 		if (!atomic_read(&dst->__refcnt)) {
1025 			*pprev = dst->next;
1026 			dst_free(dst);
1027 		} else {
1028 			pprev = &dst->next;
1029 			++more;
1030 		}
1031 	}
1032 
1033 	spin_unlock_bh(&icmp6_dst_lock);
1034 
1035 	return more;
1036 }
1037 
1038 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1039 			    void *arg)
1040 {
1041 	struct dst_entry *dst, **pprev;
1042 
1043 	spin_lock_bh(&icmp6_dst_lock);
1044 	pprev = &icmp6_dst_gc_list;
1045 	while ((dst = *pprev) != NULL) {
1046 		struct rt6_info *rt = (struct rt6_info *) dst;
1047 		if (func(rt, arg)) {
1048 			*pprev = dst->next;
1049 			dst_free(dst);
1050 		} else {
1051 			pprev = &dst->next;
1052 		}
1053 	}
1054 	spin_unlock_bh(&icmp6_dst_lock);
1055 }
1056 
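/*
 * dst_ops garbage collector.  A run is skipped while the previous one was
 * less than ip6_rt_gc_min_interval ago and the entry count is still below
 * ip6_rt_max_size.  The timeout handed to fib6_run_gc() (ip6_rt_gc_expire)
 * is reset to half of ip6_rt_gc_timeout whenever the table drops below
 * gc_thresh, and otherwise decays by ip6_rt_gc_elasticity on each call, so
 * sustained pressure collects progressively younger cache entries.
 * Returns nonzero while the table is still over ip6_rt_max_size.
 */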
1057 static int ip6_dst_gc(struct dst_ops *ops)
1058 {
1059 	unsigned long now = jiffies;
1060 	struct net *net = ops->dst_net;
1061 	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1062 	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1063 	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1064 	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1065 	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1066 
1067 	if (time_after(rt_last_gc + rt_min_interval, now) &&
1068 	    atomic_read(&ops->entries) <= rt_max_size)
1069 		goto out;
1070 
1071 	net->ipv6.ip6_rt_gc_expire++;
1072 	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1073 	net->ipv6.ip6_rt_last_gc = now;
1074 	if (atomic_read(&ops->entries) < ops->gc_thresh)
1075 		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1076 out:
1077 	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1078 	return (atomic_read(&ops->entries) > rt_max_size);
1079 }
1080 
1081 /* Clean the host part of a prefix. Not necessary in a radix tree,
1082    but it results in cleaner routing tables.
1083 
1084    Remove this only when everything is known to work!
1085  */
1086 
1087 static int ipv6_get_mtu(struct net_device *dev)
1088 {
1089 	int mtu = IPV6_MIN_MTU;
1090 	struct inet6_dev *idev;
1091 
1092 	idev = in6_dev_get(dev);
1093 	if (idev) {
1094 		mtu = idev->cnf.mtu6;
1095 		in6_dev_put(idev);
1096 	}
1097 	return mtu;
1098 }
1099 
1100 int ip6_dst_hoplimit(struct dst_entry *dst)
1101 {
1102 	int hoplimit = dst_metric(dst, RTAX_HOPLIMIT);
1103 	if (hoplimit < 0) {
1104 		struct net_device *dev = dst->dev;
1105 		struct inet6_dev *idev = in6_dev_get(dev);
1106 		if (idev) {
1107 			hoplimit = idev->cnf.hop_limit;
1108 			in6_dev_put(idev);
1109 		} else
1110 			hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1111 	}
1112 	return hoplimit;
1113 }
1114 
1115 /*
1116  *	Route addition and deletion
1117  */
1118 
1119 int ip6_route_add(struct fib6_config *cfg)
1120 {
1121 	int err;
1122 	struct net *net = cfg->fc_nlinfo.nl_net;
1123 	struct rt6_info *rt = NULL;
1124 	struct net_device *dev = NULL;
1125 	struct inet6_dev *idev = NULL;
1126 	struct fib6_table *table;
1127 	int addr_type;
1128 
1129 	if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1130 		return -EINVAL;
1131 #ifndef CONFIG_IPV6_SUBTREES
1132 	if (cfg->fc_src_len)
1133 		return -EINVAL;
1134 #endif
1135 	if (cfg->fc_ifindex) {
1136 		err = -ENODEV;
1137 		dev = dev_get_by_index(net, cfg->fc_ifindex);
1138 		if (!dev)
1139 			goto out;
1140 		idev = in6_dev_get(dev);
1141 		if (!idev)
1142 			goto out;
1143 	}
1144 
1145 	if (cfg->fc_metric == 0)
1146 		cfg->fc_metric = IP6_RT_PRIO_USER;
1147 
1148 	table = fib6_new_table(net, cfg->fc_table);
1149 	if (table == NULL) {
1150 		err = -ENOBUFS;
1151 		goto out;
1152 	}
1153 
1154 	rt = ip6_dst_alloc(net->ipv6.ip6_dst_ops);
1155 
1156 	if (rt == NULL) {
1157 		err = -ENOMEM;
1158 		goto out;
1159 	}
1160 
1161 	rt->u.dst.obsolete = -1;
1162 	rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
1163 				jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1164 				0;
1165 
1166 	if (cfg->fc_protocol == RTPROT_UNSPEC)
1167 		cfg->fc_protocol = RTPROT_BOOT;
1168 	rt->rt6i_protocol = cfg->fc_protocol;
1169 
1170 	addr_type = ipv6_addr_type(&cfg->fc_dst);
1171 
1172 	if (addr_type & IPV6_ADDR_MULTICAST)
1173 		rt->u.dst.input = ip6_mc_input;
1174 	else
1175 		rt->u.dst.input = ip6_forward;
1176 
1177 	rt->u.dst.output = ip6_output;
1178 
1179 	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1180 	rt->rt6i_dst.plen = cfg->fc_dst_len;
1181 	if (rt->rt6i_dst.plen == 128)
1182 	       rt->u.dst.flags = DST_HOST;
1183 
1184 #ifdef CONFIG_IPV6_SUBTREES
1185 	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1186 	rt->rt6i_src.plen = cfg->fc_src_len;
1187 #endif
1188 
1189 	rt->rt6i_metric = cfg->fc_metric;
1190 
1191 	/* We cannot add true routes via loopback here;
1192 	   they would result in kernel looping. Promote them to reject routes.
1193 	 */
1194 	if ((cfg->fc_flags & RTF_REJECT) ||
1195 	    (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
1196 		/* hold loopback dev/idev if we haven't done so. */
1197 		if (dev != net->loopback_dev) {
1198 			if (dev) {
1199 				dev_put(dev);
1200 				in6_dev_put(idev);
1201 			}
1202 			dev = net->loopback_dev;
1203 			dev_hold(dev);
1204 			idev = in6_dev_get(dev);
1205 			if (!idev) {
1206 				err = -ENODEV;
1207 				goto out;
1208 			}
1209 		}
1210 		rt->u.dst.output = ip6_pkt_discard_out;
1211 		rt->u.dst.input = ip6_pkt_discard;
1212 		rt->u.dst.error = -ENETUNREACH;
1213 		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1214 		goto install_route;
1215 	}
1216 
1217 	if (cfg->fc_flags & RTF_GATEWAY) {
1218 		struct in6_addr *gw_addr;
1219 		int gwa_type;
1220 
1221 		gw_addr = &cfg->fc_gateway;
1222 		ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1223 		gwa_type = ipv6_addr_type(gw_addr);
1224 
1225 		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1226 			struct rt6_info *grt;
1227 
1228 			/* IPv6 strictly prohibits using non-link-local
1229 			   addresses as nexthop addresses.
1230 			   Otherwise, the router will not be able to send redirects.
1231 			   That is generally good, but in some (rare!) circumstances
1232 			   (SIT, PtP, NBMA NOARP links) it is handy to allow
1233 			   some exceptions. --ANK
1234 			 */
1235 			err = -EINVAL;
1236 			if (!(gwa_type&IPV6_ADDR_UNICAST))
1237 				goto out;
1238 
1239 			grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1240 
1241 			err = -EHOSTUNREACH;
1242 			if (grt == NULL)
1243 				goto out;
1244 			if (dev) {
1245 				if (dev != grt->rt6i_dev) {
1246 					dst_release(&grt->u.dst);
1247 					goto out;
1248 				}
1249 			} else {
1250 				dev = grt->rt6i_dev;
1251 				idev = grt->rt6i_idev;
1252 				dev_hold(dev);
1253 				in6_dev_hold(grt->rt6i_idev);
1254 			}
1255 			if (!(grt->rt6i_flags&RTF_GATEWAY))
1256 				err = 0;
1257 			dst_release(&grt->u.dst);
1258 
1259 			if (err)
1260 				goto out;
1261 		}
1262 		err = -EINVAL;
1263 		if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1264 			goto out;
1265 	}
1266 
1267 	err = -ENODEV;
1268 	if (dev == NULL)
1269 		goto out;
1270 
1271 	if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1272 		rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1273 		if (IS_ERR(rt->rt6i_nexthop)) {
1274 			err = PTR_ERR(rt->rt6i_nexthop);
1275 			rt->rt6i_nexthop = NULL;
1276 			goto out;
1277 		}
1278 	}
1279 
1280 	rt->rt6i_flags = cfg->fc_flags;
1281 
1282 install_route:
1283 	if (cfg->fc_mx) {
1284 		struct nlattr *nla;
1285 		int remaining;
1286 
1287 		nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1288 			int type = nla_type(nla);
1289 
1290 			if (type) {
1291 				if (type > RTAX_MAX) {
1292 					err = -EINVAL;
1293 					goto out;
1294 				}
1295 
1296 				rt->u.dst.metrics[type - 1] = nla_get_u32(nla);
1297 			}
1298 		}
1299 	}
1300 
1301 	if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
1302 		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1303 	if (!dst_mtu(&rt->u.dst))
1304 		rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1305 	if (!dst_metric(&rt->u.dst, RTAX_ADVMSS))
1306 		rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->u.dst));
1307 	rt->u.dst.dev = dev;
1308 	rt->rt6i_idev = idev;
1309 	rt->rt6i_table = table;
1310 
1311 	cfg->fc_nlinfo.nl_net = dev_net(dev);
1312 
1313 	return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1314 
1315 out:
1316 	if (dev)
1317 		dev_put(dev);
1318 	if (idev)
1319 		in6_dev_put(idev);
1320 	if (rt)
1321 		dst_free(&rt->u.dst);
1322 	return err;
1323 }
1324 
1325 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1326 {
1327 	int err;
1328 	struct fib6_table *table;
1329 	struct net *net = dev_net(rt->rt6i_dev);
1330 
1331 	if (rt == net->ipv6.ip6_null_entry)
1332 		return -ENOENT;
1333 
1334 	table = rt->rt6i_table;
1335 	write_lock_bh(&table->tb6_lock);
1336 
1337 	err = fib6_del(rt, info);
1338 	dst_release(&rt->u.dst);
1339 
1340 	write_unlock_bh(&table->tb6_lock);
1341 
1342 	return err;
1343 }
1344 
1345 int ip6_del_rt(struct rt6_info *rt)
1346 {
1347 	struct nl_info info = {
1348 		.nl_net = dev_net(rt->rt6i_dev),
1349 	};
1350 	return __ip6_del_rt(rt, &info);
1351 }
1352 
1353 static int ip6_route_del(struct fib6_config *cfg)
1354 {
1355 	struct fib6_table *table;
1356 	struct fib6_node *fn;
1357 	struct rt6_info *rt;
1358 	int err = -ESRCH;
1359 
1360 	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1361 	if (table == NULL)
1362 		return err;
1363 
1364 	read_lock_bh(&table->tb6_lock);
1365 
1366 	fn = fib6_locate(&table->tb6_root,
1367 			 &cfg->fc_dst, cfg->fc_dst_len,
1368 			 &cfg->fc_src, cfg->fc_src_len);
1369 
1370 	if (fn) {
1371 		for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1372 			if (cfg->fc_ifindex &&
1373 			    (rt->rt6i_dev == NULL ||
1374 			     rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1375 				continue;
1376 			if (cfg->fc_flags & RTF_GATEWAY &&
1377 			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1378 				continue;
1379 			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1380 				continue;
1381 			dst_hold(&rt->u.dst);
1382 			read_unlock_bh(&table->tb6_lock);
1383 
1384 			return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1385 		}
1386 	}
1387 	read_unlock_bh(&table->tb6_lock);
1388 
1389 	return err;
1390 }
1391 
1392 /*
1393  *	Handle redirects
1394  */
1395 struct ip6rd_flowi {
1396 	struct flowi fl;
1397 	struct in6_addr gateway;
1398 };
1399 
1400 static struct rt6_info *__ip6_route_redirect(struct net *net,
1401 					     struct fib6_table *table,
1402 					     struct flowi *fl,
1403 					     int flags)
1404 {
1405 	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl;
1406 	struct rt6_info *rt;
1407 	struct fib6_node *fn;
1408 
1409 	/*
1410 	 * Get the "current" route for this destination and
1411 	 * check if the redirect has come from the appropriate router.
1412 	 *
1413 	 * RFC 2461 specifies that redirects should only be
1414 	 * accepted if they come from the nexthop to the target.
1415 	 * Due to the way the routes are chosen, this notion
1416 	 * is a bit fuzzy and one might need to check all possible
1417 	 * routes.
1418 	 */
1419 
1420 	read_lock_bh(&table->tb6_lock);
1421 	fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
1422 restart:
1423 	for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1424 		/*
1425 		 * Current route is on-link; redirect is always invalid.
1426 		 *
1427 	 * Seems the previous statement is not true. It could
1428 	 * be a node which regards us as on-link (e.g. proxy ndisc),
1429 	 * but then the router serving it might decide that we should
1430 	 * know the truth 8)8) --ANK (980726).
1431 		 */
1432 		if (rt6_check_expired(rt))
1433 			continue;
1434 		if (!(rt->rt6i_flags & RTF_GATEWAY))
1435 			continue;
1436 		if (fl->oif != rt->rt6i_dev->ifindex)
1437 			continue;
1438 		if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1439 			continue;
1440 		break;
1441 	}
1442 
1443 	if (!rt)
1444 		rt = net->ipv6.ip6_null_entry;
1445 	BACKTRACK(net, &fl->fl6_src);
1446 out:
1447 	dst_hold(&rt->u.dst);
1448 
1449 	read_unlock_bh(&table->tb6_lock);
1450 
1451 	return rt;
1452 };
1453 
1454 static struct rt6_info *ip6_route_redirect(struct in6_addr *dest,
1455 					   struct in6_addr *src,
1456 					   struct in6_addr *gateway,
1457 					   struct net_device *dev)
1458 {
1459 	int flags = RT6_LOOKUP_F_HAS_SADDR;
1460 	struct net *net = dev_net(dev);
1461 	struct ip6rd_flowi rdfl = {
1462 		.fl = {
1463 			.oif = dev->ifindex,
1464 			.nl_u = {
1465 				.ip6_u = {
1466 					.daddr = *dest,
1467 					.saddr = *src,
1468 				},
1469 			},
1470 		},
1471 		.gateway = *gateway,
1472 	};
1473 
1474 	if (rt6_need_strict(dest))
1475 		flags |= RT6_LOOKUP_F_IFACE;
1476 
1477 	return (struct rt6_info *)fib6_rule_lookup(net, (struct flowi *)&rdfl,
1478 						   flags, __ip6_route_redirect);
1479 }
1480 
1481 void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
1482 		  struct in6_addr *saddr,
1483 		  struct neighbour *neigh, u8 *lladdr, int on_link)
1484 {
1485 	struct rt6_info *rt, *nrt = NULL;
1486 	struct netevent_redirect netevent;
1487 	struct net *net = dev_net(neigh->dev);
1488 
1489 	rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1490 
1491 	if (rt == net->ipv6.ip6_null_entry) {
1492 		if (net_ratelimit())
1493 			printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1494 			       "for redirect target\n");
1495 		goto out;
1496 	}
1497 
1498 	/*
1499 	 *	We have finally decided to accept it.
1500 	 */
1501 
1502 	neigh_update(neigh, lladdr, NUD_STALE,
1503 		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
1504 		     NEIGH_UPDATE_F_OVERRIDE|
1505 		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1506 				     NEIGH_UPDATE_F_ISROUTER))
1507 		     );
1508 
1509 	/*
1510 	 * Redirect received -> path was valid.
1511 	 * Look, redirects are sent only in response to data packets,
1512 	 * so this nexthop is apparently reachable. --ANK
1513 	 */
1514 	dst_confirm(&rt->u.dst);
1515 
1516 	/* Duplicate redirect: silently ignore. */
1517 	if (neigh == rt->u.dst.neighbour)
1518 		goto out;
1519 
1520 	nrt = ip6_rt_copy(rt);
1521 	if (nrt == NULL)
1522 		goto out;
1523 
1524 	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1525 	if (on_link)
1526 		nrt->rt6i_flags &= ~RTF_GATEWAY;
1527 
1528 	ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1529 	nrt->rt6i_dst.plen = 128;
1530 	nrt->u.dst.flags |= DST_HOST;
1531 
1532 	ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1533 	nrt->rt6i_nexthop = neigh_clone(neigh);
1534 	/* Reset pmtu, it may be better */
1535 	nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1536 	nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dev_net(neigh->dev),
1537 							dst_mtu(&nrt->u.dst));
1538 
1539 	if (ip6_ins_rt(nrt))
1540 		goto out;
1541 
1542 	netevent.old = &rt->u.dst;
1543 	netevent.new = &nrt->u.dst;
1544 	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1545 
1546 	if (rt->rt6i_flags&RTF_CACHE) {
1547 		ip6_del_rt(rt);
1548 		return;
1549 	}
1550 
1551 out:
1552 	dst_release(&rt->u.dst);
1553 	return;
1554 }
1555 
1556 /*
1557  *	Handle ICMP "packet too big" messages
1558  *	i.e. Path MTU discovery
1559  */
1560 
1561 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1562 			struct net_device *dev, u32 pmtu)
1563 {
1564 	struct rt6_info *rt, *nrt;
1565 	struct net *net = dev_net(dev);
1566 	int allfrag = 0;
1567 
1568 	rt = rt6_lookup(net, daddr, saddr, dev->ifindex, 0);
1569 	if (rt == NULL)
1570 		return;
1571 
1572 	if (pmtu >= dst_mtu(&rt->u.dst))
1573 		goto out;
1574 
1575 	if (pmtu < IPV6_MIN_MTU) {
1576 		/*
1577 		 * According to RFC 2460, the PMTU is set to the IPv6 Minimum
1578 		 * Link MTU (1280), and a Fragment header should always be
1579 		 * included after a node receives a Packet Too Big message
1580 		 * reporting a PMTU less than the IPv6 Minimum Link MTU.
1581 		 */
1582 		pmtu = IPV6_MIN_MTU;
1583 		allfrag = 1;
1584 	}
1585 
1586 	/* New MTU received -> the path was valid.
1587 	   Packet Too Big messages are sent only in response to data packets,
1588 	   so this nexthop is apparently reachable. --ANK
1589 	 */
1590 	dst_confirm(&rt->u.dst);
1591 
1592 	/* Host route. If it is static, it would be better
1593 	   not to override it but to add a new one, so that
1594 	   when the cache entry expires the old PMTU is
1595 	   restored automatically.
1596 	 */
1597 	if (rt->rt6i_flags & RTF_CACHE) {
1598 		rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1599 		if (allfrag)
1600 			rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1601 		dst_set_expires(&rt->u.dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1602 		rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1603 		goto out;
1604 	}
1605 
1606 	/* Network route.
1607 	   Two cases are possible:
1608 	   1. It is a connected route. Action: COW it.
1609 	   2. It is a gatewayed or NONEXTHOP route. Action: clone it.
1610 	 */
1611 	if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1612 		nrt = rt6_alloc_cow(rt, daddr, saddr);
1613 	else
1614 		nrt = rt6_alloc_clone(rt, daddr);
1615 
1616 	if (nrt) {
1617 		nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1618 		if (allfrag)
1619 			nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1620 
1621 		/* According to RFC 1981, a PMTU increase should not be probed
1622 		 * for within 5 minutes; the recommended timer is 10 minutes.
1623 		 * Here the route expiration time is set to ip6_rt_mtu_expires,
1624 		 * which defaults to 10 minutes. After that the decreased PMTU
1625 		 * expires and PMTU increase detection happens automatically.
1626 		 */
1627 		dst_set_expires(&nrt->u.dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1628 		nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1629 
1630 		ip6_ins_rt(nrt);
1631 	}
1632 out:
1633 	dst_release(&rt->u.dst);
1634 }
1635 
1636 /*
1637  *	Misc support functions
1638  */
1639 
1640 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1641 {
1642 	struct net *net = dev_net(ort->rt6i_dev);
1643 	struct rt6_info *rt = ip6_dst_alloc(net->ipv6.ip6_dst_ops);
1644 
1645 	if (rt) {
1646 		rt->u.dst.input = ort->u.dst.input;
1647 		rt->u.dst.output = ort->u.dst.output;
1648 
1649 		memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1650 		rt->u.dst.error = ort->u.dst.error;
1651 		rt->u.dst.dev = ort->u.dst.dev;
1652 		if (rt->u.dst.dev)
1653 			dev_hold(rt->u.dst.dev);
1654 		rt->rt6i_idev = ort->rt6i_idev;
1655 		if (rt->rt6i_idev)
1656 			in6_dev_hold(rt->rt6i_idev);
1657 		rt->u.dst.lastuse = jiffies;
1658 		rt->rt6i_expires = 0;
1659 
1660 		ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1661 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1662 		rt->rt6i_metric = 0;
1663 
1664 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1665 #ifdef CONFIG_IPV6_SUBTREES
1666 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1667 #endif
1668 		rt->rt6i_table = ort->rt6i_table;
1669 	}
1670 	return rt;
1671 }
1672 
1673 #ifdef CONFIG_IPV6_ROUTE_INFO
1674 static struct rt6_info *rt6_get_route_info(struct net *net,
1675 					   struct in6_addr *prefix, int prefixlen,
1676 					   struct in6_addr *gwaddr, int ifindex)
1677 {
1678 	struct fib6_node *fn;
1679 	struct rt6_info *rt = NULL;
1680 	struct fib6_table *table;
1681 
1682 	table = fib6_get_table(net, RT6_TABLE_INFO);
1683 	if (table == NULL)
1684 		return NULL;
1685 
1686 	write_lock_bh(&table->tb6_lock);
1687 	fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1688 	if (!fn)
1689 		goto out;
1690 
1691 	for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1692 		if (rt->rt6i_dev->ifindex != ifindex)
1693 			continue;
1694 		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1695 			continue;
1696 		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1697 			continue;
1698 		dst_hold(&rt->u.dst);
1699 		break;
1700 	}
1701 out:
1702 	write_unlock_bh(&table->tb6_lock);
1703 	return rt;
1704 }
1705 
1706 static struct rt6_info *rt6_add_route_info(struct net *net,
1707 					   struct in6_addr *prefix, int prefixlen,
1708 					   struct in6_addr *gwaddr, int ifindex,
1709 					   unsigned pref)
1710 {
1711 	struct fib6_config cfg = {
1712 		.fc_table	= RT6_TABLE_INFO,
1713 		.fc_metric	= IP6_RT_PRIO_USER,
1714 		.fc_ifindex	= ifindex,
1715 		.fc_dst_len	= prefixlen,
1716 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1717 				  RTF_UP | RTF_PREF(pref),
1718 		.fc_nlinfo.pid = 0,
1719 		.fc_nlinfo.nlh = NULL,
1720 		.fc_nlinfo.nl_net = net,
1721 	};
1722 
1723 	ipv6_addr_copy(&cfg.fc_dst, prefix);
1724 	ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1725 
1726 	/* We should treat it as a default route if prefix length is 0. */
1727 	if (!prefixlen)
1728 		cfg.fc_flags |= RTF_DEFAULT;
1729 
1730 	ip6_route_add(&cfg);
1731 
1732 	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1733 }
1734 #endif
1735 
1736 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1737 {
1738 	struct rt6_info *rt;
1739 	struct fib6_table *table;
1740 
1741 	table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1742 	if (table == NULL)
1743 		return NULL;
1744 
1745 	write_lock_bh(&table->tb6_lock);
1746 	for (rt = table->tb6_root.leaf; rt; rt=rt->u.dst.rt6_next) {
1747 		if (dev == rt->rt6i_dev &&
1748 		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1749 		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
1750 			break;
1751 	}
1752 	if (rt)
1753 		dst_hold(&rt->u.dst);
1754 	write_unlock_bh(&table->tb6_lock);
1755 	return rt;
1756 }
1757 
1758 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1759 				     struct net_device *dev,
1760 				     unsigned int pref)
1761 {
1762 	struct fib6_config cfg = {
1763 		.fc_table	= RT6_TABLE_DFLT,
1764 		.fc_metric	= IP6_RT_PRIO_USER,
1765 		.fc_ifindex	= dev->ifindex,
1766 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1767 				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1768 		.fc_nlinfo.pid = 0,
1769 		.fc_nlinfo.nlh = NULL,
1770 		.fc_nlinfo.nl_net = dev_net(dev),
1771 	};
1772 
1773 	ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1774 
1775 	ip6_route_add(&cfg);
1776 
1777 	return rt6_get_dflt_router(gwaddr, dev);
1778 }
1779 
1780 void rt6_purge_dflt_routers(struct net *net)
1781 {
1782 	struct rt6_info *rt;
1783 	struct fib6_table *table;
1784 
1785 	/* NOTE: Keep consistent with rt6_get_dflt_router */
1786 	table = fib6_get_table(net, RT6_TABLE_DFLT);
1787 	if (table == NULL)
1788 		return;
1789 
1790 restart:
1791 	read_lock_bh(&table->tb6_lock);
1792 	for (rt = table->tb6_root.leaf; rt; rt = rt->u.dst.rt6_next) {
1793 		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1794 			dst_hold(&rt->u.dst);
1795 			read_unlock_bh(&table->tb6_lock);
1796 			ip6_del_rt(rt);
1797 			goto restart;
1798 		}
1799 	}
1800 	read_unlock_bh(&table->tb6_lock);
1801 }
1802 
1803 static void rtmsg_to_fib6_config(struct net *net,
1804 				 struct in6_rtmsg *rtmsg,
1805 				 struct fib6_config *cfg)
1806 {
1807 	memset(cfg, 0, sizeof(*cfg));
1808 
1809 	cfg->fc_table = RT6_TABLE_MAIN;
1810 	cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1811 	cfg->fc_metric = rtmsg->rtmsg_metric;
1812 	cfg->fc_expires = rtmsg->rtmsg_info;
1813 	cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1814 	cfg->fc_src_len = rtmsg->rtmsg_src_len;
1815 	cfg->fc_flags = rtmsg->rtmsg_flags;
1816 
1817 	cfg->fc_nlinfo.nl_net = net;
1818 
1819 	ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1820 	ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1821 	ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1822 }
1823 
1824 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1825 {
1826 	struct fib6_config cfg;
1827 	struct in6_rtmsg rtmsg;
1828 	int err;
1829 
1830 	switch(cmd) {
1831 	case SIOCADDRT:		/* Add a route */
1832 	case SIOCDELRT:		/* Delete a route */
1833 		if (!capable(CAP_NET_ADMIN))
1834 			return -EPERM;
1835 		err = copy_from_user(&rtmsg, arg,
1836 				     sizeof(struct in6_rtmsg));
1837 		if (err)
1838 			return -EFAULT;
1839 
1840 		rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1841 
1842 		rtnl_lock();
1843 		switch (cmd) {
1844 		case SIOCADDRT:
1845 			err = ip6_route_add(&cfg);
1846 			break;
1847 		case SIOCDELRT:
1848 			err = ip6_route_del(&cfg);
1849 			break;
1850 		default:
1851 			err = -EINVAL;
1852 		}
1853 		rtnl_unlock();
1854 
1855 		return err;
1856 	}
1857 
1858 	return -EINVAL;
1859 }
1860 
1861 /*
1862  *	Drop the packet on the floor
1863  */
1864 
1865 static int ip6_pkt_drop(struct sk_buff *skb, int code, int ipstats_mib_noroutes)
1866 {
1867 	int type;
1868 	struct dst_entry *dst = skb->dst;
1869 	switch (ipstats_mib_noroutes) {
1870 	case IPSTATS_MIB_INNOROUTES:
1871 		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1872 		if (type == IPV6_ADDR_ANY || type == IPV6_ADDR_RESERVED) {
1873 			IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1874 				      IPSTATS_MIB_INADDRERRORS);
1875 			break;
1876 		}
1877 		/* FALLTHROUGH */
1878 	case IPSTATS_MIB_OUTNOROUTES:
1879 		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1880 			      ipstats_mib_noroutes);
1881 		break;
1882 	}
1883 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0, skb->dev);
1884 	kfree_skb(skb);
1885 	return 0;
1886 }
1887 
1888 static int ip6_pkt_discard(struct sk_buff *skb)
1889 {
1890 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
1891 }
1892 
1893 static int ip6_pkt_discard_out(struct sk_buff *skb)
1894 {
1895 	skb->dev = skb->dst->dev;
1896 	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
1897 }
1898 
1899 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1900 
1901 static int ip6_pkt_prohibit(struct sk_buff *skb)
1902 {
1903 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
1904 }
1905 
1906 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
1907 {
1908 	skb->dev = skb->dst->dev;
1909 	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
1910 }
1911 
1912 #endif
1913 
1914 /*
1915  *	Allocate a dst for local (unicast / anycast) address.
1916  */
1917 
1918 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1919 				    const struct in6_addr *addr,
1920 				    int anycast)
1921 {
1922 	struct net *net = dev_net(idev->dev);
1923 	struct rt6_info *rt = ip6_dst_alloc(net->ipv6.ip6_dst_ops);
1924 	struct neighbour *neigh;
1925 
1926 	if (rt == NULL)
1927 		return ERR_PTR(-ENOMEM);
1928 
1929 	dev_hold(net->loopback_dev);
1930 	in6_dev_hold(idev);
1931 
1932 	rt->u.dst.flags = DST_HOST;
1933 	rt->u.dst.input = ip6_input;
1934 	rt->u.dst.output = ip6_output;
1935 	rt->rt6i_dev = net->loopback_dev;
1936 	rt->rt6i_idev = idev;
1937 	rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1938 	rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->u.dst));
1939 	rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1940 	rt->u.dst.obsolete = -1;
1941 
1942 	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1943 	if (anycast)
1944 		rt->rt6i_flags |= RTF_ANYCAST;
1945 	else
1946 		rt->rt6i_flags |= RTF_LOCAL;
1947 	neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1948 	if (IS_ERR(neigh)) {
1949 		dst_free(&rt->u.dst);
1950 
1951 		/* We are casting this because that is the return
1952 		 * value type.  But an errno encoded pointer is the
1953 		 * same regardless of the underlying pointer type,
1954 		 * and that's what we are returning.  So this is OK.
1955 		 */
1956 		return (struct rt6_info *) neigh;
1957 	}
1958 	rt->rt6i_nexthop = neigh;
1959 
1960 	ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1961 	rt->rt6i_dst.plen = 128;
1962 	rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
1963 
1964 	atomic_set(&rt->u.dst.__refcnt, 1);
1965 
1966 	return rt;
1967 }
1968 
1969 struct arg_dev_net {
1970 	struct net_device *dev;
1971 	struct net *net;
1972 };
1973 
1974 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1975 {
1976 	struct net_device *dev = ((struct arg_dev_net *)arg)->dev;
1977 	struct net *net = ((struct arg_dev_net *)arg)->net;
1978 
1979 	if (((void *)rt->rt6i_dev == dev || dev == NULL) &&
1980 	    rt != net->ipv6.ip6_null_entry) {
1981 		RT6_TRACE("deleted by ifdown %p\n", rt);
1982 		return -1;
1983 	}
1984 	return 0;
1985 }
1986 
1987 void rt6_ifdown(struct net *net, struct net_device *dev)
1988 {
1989 	struct arg_dev_net adn = {
1990 		.dev = dev,
1991 		.net = net,
1992 	};
1993 
1994 	fib6_clean_all(net, fib6_ifdown, 0, &adn);
1995 	icmp6_clean_all(fib6_ifdown, &adn);
1996 }
1997 
1998 struct rt6_mtu_change_arg
1999 {
2000 	struct net_device *dev;
2001 	unsigned mtu;
2002 };
2003 
2004 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2005 {
2006 	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2007 	struct inet6_dev *idev;
2008 	struct net *net = dev_net(arg->dev);
2009 
2010 	/* In IPv6, PMTU discovery is not optional,
2011 	   so the RTAX_MTU lock cannot disable it.
2012 	   We still use this lock to block changes
2013 	   caused by addrconf/ndisc.
2014 	*/
2015 
2016 	idev = __in6_dev_get(arg->dev);
2017 	if (idev == NULL)
2018 		return 0;
2019 
2020 	/* An administrative MTU increase cannot be discovered by IPv6
2021 	   PMTU discovery, so the increased PMTU must be applied here.
2022 	   RFC 1981 does not cover administrative MTU increases, which
2023 	   makes updating on increase a MUST (e.g. for jumbo frames).
2024 	 */
2025 	/*
2026 	   If the new MTU is less than the route PMTU, the new MTU will be
2027 	   the lowest MTU in the path; update the route PMTU to reflect the
2028 	   decrease.  If the new MTU is greater than the route PMTU, and the
2029 	   old MTU was the lowest MTU in the path, update the route PMTU to
2030 	   reflect the increase.  In that case, if another node in the path
2031 	   still has a lower MTU, a Packet Too Big message will trigger PMTU
2032 	   discovery again.
2033 	 */
2034 	if (rt->rt6i_dev == arg->dev &&
2035 	    !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
2036 	    (dst_mtu(&rt->u.dst) >= arg->mtu ||
2037 	     (dst_mtu(&rt->u.dst) < arg->mtu &&
2038 	      dst_mtu(&rt->u.dst) == idev->cnf.mtu6))) {
2039 		rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
2040 		rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, arg->mtu);
2041 	}
2042 	return 0;
2043 }
2044 
2045 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2046 {
2047 	struct rt6_mtu_change_arg arg = {
2048 		.dev = dev,
2049 		.mtu = mtu,
2050 	};
2051 
2052 	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2053 }
2054 
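/*
 *	Netlink attribute policy for RTM_{NEW,DEL,GET}ROUTE requests.
 *	RTA_DST / RTA_SRC are not listed here; their lengths are checked
 *	by hand against the prefix lengths in the rtmsg header below.
 */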
2055 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2056 	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2057 	[RTA_OIF]               = { .type = NLA_U32 },
2058 	[RTA_IIF]		= { .type = NLA_U32 },
2059 	[RTA_PRIORITY]          = { .type = NLA_U32 },
2060 	[RTA_METRICS]           = { .type = NLA_NESTED },
2061 };
2062 
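/*
 *	Translate an RTM_NEWROUTE / RTM_DELROUTE request (rtmsg header
 *	plus attributes) into the struct fib6_config consumed by
 *	ip6_route_add() / ip6_route_del().
 */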
2063 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2064 			      struct fib6_config *cfg)
2065 {
2066 	struct rtmsg *rtm;
2067 	struct nlattr *tb[RTA_MAX+1];
2068 	int err;
2069 
2070 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2071 	if (err < 0)
2072 		goto errout;
2073 
2074 	err = -EINVAL;
2075 	rtm = nlmsg_data(nlh);
2076 	memset(cfg, 0, sizeof(*cfg));
2077 
2078 	cfg->fc_table = rtm->rtm_table;
2079 	cfg->fc_dst_len = rtm->rtm_dst_len;
2080 	cfg->fc_src_len = rtm->rtm_src_len;
2081 	cfg->fc_flags = RTF_UP;
2082 	cfg->fc_protocol = rtm->rtm_protocol;
2083 
2084 	if (rtm->rtm_type == RTN_UNREACHABLE)
2085 		cfg->fc_flags |= RTF_REJECT;
2086 
2087 	cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2088 	cfg->fc_nlinfo.nlh = nlh;
2089 	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2090 
2091 	if (tb[RTA_GATEWAY]) {
2092 		nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2093 		cfg->fc_flags |= RTF_GATEWAY;
2094 	}
2095 
2096 	if (tb[RTA_DST]) {
2097 		int plen = (rtm->rtm_dst_len + 7) >> 3;
2098 
2099 		if (nla_len(tb[RTA_DST]) < plen)
2100 			goto errout;
2101 
2102 		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2103 	}
2104 
2105 	if (tb[RTA_SRC]) {
2106 		int plen = (rtm->rtm_src_len + 7) >> 3;
2107 
2108 		if (nla_len(tb[RTA_SRC]) < plen)
2109 			goto errout;
2110 
2111 		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2112 	}
2113 
2114 	if (tb[RTA_OIF])
2115 		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2116 
2117 	if (tb[RTA_PRIORITY])
2118 		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2119 
2120 	if (tb[RTA_METRICS]) {
2121 		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2122 		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2123 	}
2124 
2125 	if (tb[RTA_TABLE])
2126 		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2127 
2128 	err = 0;
2129 errout:
2130 	return err;
2131 }
2132 
2133 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2134 {
2135 	struct fib6_config cfg;
2136 	int err;
2137 
2138 	err = rtm_to_fib6_config(skb, nlh, &cfg);
2139 	if (err < 0)
2140 		return err;
2141 
2142 	return ip6_route_del(&cfg);
2143 }
2144 
2145 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2146 {
2147 	struct fib6_config cfg;
2148 	int err;
2149 
2150 	err = rtm_to_fib6_config(skb, nlh, &cfg);
2151 	if (err < 0)
2152 		return err;
2153 
2154 	return ip6_route_add(&cfg);
2155 }
2156 
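/*
 *	Worst-case payload size of a single route message, so that
 *	inet6_rt_notify() can allocate an skb that rt6_fill_node() will
 *	never overflow (hence the WARN_ON(-EMSGSIZE) there).
 */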
2157 static inline size_t rt6_nlmsg_size(void)
2158 {
2159 	return NLMSG_ALIGN(sizeof(struct rtmsg))
2160 	       + nla_total_size(16) /* RTA_SRC */
2161 	       + nla_total_size(16) /* RTA_DST */
2162 	       + nla_total_size(16) /* RTA_GATEWAY */
2163 	       + nla_total_size(16) /* RTA_PREFSRC */
2164 	       + nla_total_size(4) /* RTA_TABLE */
2165 	       + nla_total_size(4) /* RTA_IIF */
2166 	       + nla_total_size(4) /* RTA_OIF */
2167 	       + nla_total_size(4) /* RTA_PRIORITY */
2168 	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2169 	       + nla_total_size(sizeof(struct rta_cacheinfo));
2170 }
2171 
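/*
 *	Serialize one rt6_info into a netlink route message: the rtmsg
 *	header followed by RTA_TABLE, RTA_DST / RTA_SRC, RTA_IIF or
 *	RTA_PREFSRC, the metrics, RTA_GATEWAY, RTA_OIF, RTA_PRIORITY and
 *	the cache info.  Returns 1 to skip non-prefix routes when the
 *	caller asked for prefix routes only, -EMSGSIZE if the skb is full.
 */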
2172 static int rt6_fill_node(struct net *net,
2173 			 struct sk_buff *skb, struct rt6_info *rt,
2174 			 struct in6_addr *dst, struct in6_addr *src,
2175 			 int iif, int type, u32 pid, u32 seq,
2176 			 int prefix, int nowait, unsigned int flags)
2177 {
2178 	struct rtmsg *rtm;
2179 	struct nlmsghdr *nlh;
2180 	long expires;
2181 	u32 table;
2182 
2183 	if (prefix) {	/* user wants prefix routes only */
2184 		if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2185 			/* success since this is not a prefix route */
2186 			return 1;
2187 		}
2188 	}
2189 
2190 	nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2191 	if (nlh == NULL)
2192 		return -EMSGSIZE;
2193 
2194 	rtm = nlmsg_data(nlh);
2195 	rtm->rtm_family = AF_INET6;
2196 	rtm->rtm_dst_len = rt->rt6i_dst.plen;
2197 	rtm->rtm_src_len = rt->rt6i_src.plen;
2198 	rtm->rtm_tos = 0;
2199 	if (rt->rt6i_table)
2200 		table = rt->rt6i_table->tb6_id;
2201 	else
2202 		table = RT6_TABLE_UNSPEC;
2203 	rtm->rtm_table = table;
2204 	NLA_PUT_U32(skb, RTA_TABLE, table);
2205 	if (rt->rt6i_flags&RTF_REJECT)
2206 		rtm->rtm_type = RTN_UNREACHABLE;
2207 	else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2208 		rtm->rtm_type = RTN_LOCAL;
2209 	else
2210 		rtm->rtm_type = RTN_UNICAST;
2211 	rtm->rtm_flags = 0;
2212 	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2213 	rtm->rtm_protocol = rt->rt6i_protocol;
2214 	if (rt->rt6i_flags&RTF_DYNAMIC)
2215 		rtm->rtm_protocol = RTPROT_REDIRECT;
2216 	else if (rt->rt6i_flags & RTF_ADDRCONF)
2217 		rtm->rtm_protocol = RTPROT_KERNEL;
2218 	else if (rt->rt6i_flags&RTF_DEFAULT)
2219 		rtm->rtm_protocol = RTPROT_RA;
2220 
2221 	if (rt->rt6i_flags&RTF_CACHE)
2222 		rtm->rtm_flags |= RTM_F_CLONED;
2223 
2224 	if (dst) {
2225 		NLA_PUT(skb, RTA_DST, 16, dst);
2226 		rtm->rtm_dst_len = 128;
2227 	} else if (rtm->rtm_dst_len)
2228 		NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2229 #ifdef CONFIG_IPV6_SUBTREES
2230 	if (src) {
2231 		NLA_PUT(skb, RTA_SRC, 16, src);
2232 		rtm->rtm_src_len = 128;
2233 	} else if (rtm->rtm_src_len)
2234 		NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2235 #endif
2236 	if (iif) {
2237 #ifdef CONFIG_IPV6_MROUTE
2238 		if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2239 			int err = ip6mr_get_route(net, skb, rtm, nowait);
2240 			if (err <= 0) {
2241 				if (!nowait) {
2242 					if (err == 0)
2243 						return 0;
2244 					goto nla_put_failure;
2245 				} else {
2246 					if (err == -EMSGSIZE)
2247 						goto nla_put_failure;
2248 				}
2249 			}
2250 		} else
2251 #endif
2252 			NLA_PUT_U32(skb, RTA_IIF, iif);
2253 	} else if (dst) {
2254 		struct inet6_dev *idev = ip6_dst_idev(&rt->u.dst);
2255 		struct in6_addr saddr_buf;
2256 		if (ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2257 				       dst, 0, &saddr_buf) == 0)
2258 			NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2259 	}
2260 
2261 	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2262 		goto nla_put_failure;
2263 
2264 	if (rt->u.dst.neighbour)
2265 		NLA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
2266 
2267 	if (rt->u.dst.dev)
2268 		NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2269 
2270 	NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2271 
2272 	if (!(rt->rt6i_flags & RTF_EXPIRES))
2273 		expires = 0;
2274 	else if (rt->rt6i_expires - jiffies < INT_MAX)
2275 		expires = rt->rt6i_expires - jiffies;
2276 	else
2277 		expires = INT_MAX;
2278 
2279 	if (rtnl_put_cacheinfo(skb, &rt->u.dst, 0, 0, 0,
2280 			       expires, rt->u.dst.error) < 0)
2281 		goto nla_put_failure;
2282 
2283 	return nlmsg_end(skb, nlh);
2284 
2285 nla_put_failure:
2286 	nlmsg_cancel(skb, nlh);
2287 	return -EMSGSIZE;
2288 }
2289 
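/*
 *	Per-route callback for RTM_GETROUTE dumps; honours the
 *	RTM_F_PREFIX filter when the request carried a full rtmsg header.
 */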
2290 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2291 {
2292 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2293 	int prefix;
2294 
2295 	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2296 		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2297 		prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2298 	} else
2299 		prefix = 0;
2300 
2301 	return rt6_fill_node(arg->net,
2302 		     arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2303 		     NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2304 		     prefix, 0, NLM_F_MULTI);
2305 }
2306 
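/*
 *	Non-dump RTM_GETROUTE handler: build a flow from the request
 *	attributes, resolve it through ip6_route_output() and unicast the
 *	resulting route back to the requester.
 */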
2307 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2308 {
2309 	struct net *net = sock_net(in_skb->sk);
2310 	struct nlattr *tb[RTA_MAX+1];
2311 	struct rt6_info *rt;
2312 	struct sk_buff *skb;
2313 	struct rtmsg *rtm;
2314 	struct flowi fl;
2315 	int err, iif = 0;
2316 
2317 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2318 	if (err < 0)
2319 		goto errout;
2320 
2321 	err = -EINVAL;
2322 	memset(&fl, 0, sizeof(fl));
2323 
2324 	if (tb[RTA_SRC]) {
2325 		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2326 			goto errout;
2327 
2328 		ipv6_addr_copy(&fl.fl6_src, nla_data(tb[RTA_SRC]));
2329 	}
2330 
2331 	if (tb[RTA_DST]) {
2332 		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2333 			goto errout;
2334 
2335 		ipv6_addr_copy(&fl.fl6_dst, nla_data(tb[RTA_DST]));
2336 	}
2337 
2338 	if (tb[RTA_IIF])
2339 		iif = nla_get_u32(tb[RTA_IIF]);
2340 
2341 	if (tb[RTA_OIF])
2342 		fl.oif = nla_get_u32(tb[RTA_OIF]);
2343 
2344 	if (iif) {
2345 		struct net_device *dev;
2346 		dev = __dev_get_by_index(net, iif);
2347 		if (!dev) {
2348 			err = -ENODEV;
2349 			goto errout;
2350 		}
2351 	}
2352 
2353 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2354 	if (skb == NULL) {
2355 		err = -ENOBUFS;
2356 		goto errout;
2357 	}
2358 
2359 	/* Reserve room for dummy headers; this skb can pass
2360 	   through a good chunk of the routing engine.
2361 	 */
2362 	skb_reset_mac_header(skb);
2363 	skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2364 
2365 	rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl);
2366 	skb->dst = &rt->u.dst;
2367 
2368 	err = rt6_fill_node(net, skb, rt, &fl.fl6_dst, &fl.fl6_src, iif,
2369 			    RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2370 			    nlh->nlmsg_seq, 0, 0, 0);
2371 	if (err < 0) {
2372 		kfree_skb(skb);
2373 		goto errout;
2374 	}
2375 
2376 	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2377 errout:
2378 	return err;
2379 }
2380 
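/*
 *	Notify RTNLGRP_IPV6_ROUTE listeners that a route has been added
 *	or deleted.
 */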
2381 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2382 {
2383 	struct sk_buff *skb;
2384 	struct net *net = info->nl_net;
2385 	u32 seq;
2386 	int err;
2387 
2388 	err = -ENOBUFS;
2389 	seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2390 
2391 	skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2392 	if (skb == NULL)
2393 		goto errout;
2394 
2395 	err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2396 				event, info->pid, seq, 0, 0, 0);
2397 	if (err < 0) {
2398 		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2399 		WARN_ON(err == -EMSGSIZE);
2400 		kfree_skb(skb);
2401 		goto errout;
2402 	}
2403 	rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2404 		    info->nlh, gfp_any());
2405 	return;
2406 errout:
2407 	if (err < 0)
2408 		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2409 }
2410 
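/*
 *	Netdevice notifier: once the loopback device of a namespace is
 *	registered, point the special null (and, with policy routing,
 *	prohibit / blackhole) entries at it.
 */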
2411 static int ip6_route_dev_notify(struct notifier_block *this,
2412 				unsigned long event, void *data)
2413 {
2414 	struct net_device *dev = (struct net_device *)data;
2415 	struct net *net = dev_net(dev);
2416 
2417 	if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2418 		net->ipv6.ip6_null_entry->u.dst.dev = dev;
2419 		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2420 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2421 		net->ipv6.ip6_prohibit_entry->u.dst.dev = dev;
2422 		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2423 		net->ipv6.ip6_blk_hole_entry->u.dst.dev = dev;
2424 		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2425 #endif
2426 	}
2427 
2428 	return NOTIFY_OK;
2429 }
2430 
2431 /*
2432  *	/proc
2433  */
2434 
2435 #ifdef CONFIG_PROC_FS
2436 
2437 #define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)
2438 
2439 struct rt6_proc_arg
2440 {
2441 	char *buffer;
2442 	int offset;
2443 	int length;
2444 	int skip;
2445 	int len;
2446 };
2447 
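/*
 *	Emit one line of /proc/net/ipv6_route per route: destination and
 *	prefix length, source and prefix length, next hop, metric,
 *	reference count, use count, flags and device name.
 */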
2448 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2449 {
2450 	struct seq_file *m = p_arg;
2451 
2452 	seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2453 
2454 #ifdef CONFIG_IPV6_SUBTREES
2455 	seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2456 #else
2457 	seq_puts(m, "00000000000000000000000000000000 00 ");
2458 #endif
2459 
2460 	if (rt->rt6i_nexthop) {
2461 		seq_printf(m, "%pi6", rt->rt6i_nexthop->primary_key);
2462 	} else {
2463 		seq_puts(m, "00000000000000000000000000000000");
2464 	}
2465 	seq_printf(m, " %08x %08x %08x %08x %8s\n",
2466 		   rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
2467 		   rt->u.dst.__use, rt->rt6i_flags,
2468 		   rt->rt6i_dev ? rt->rt6i_dev->name : "");
2469 	return 0;
2470 }
2471 
2472 static int ipv6_route_show(struct seq_file *m, void *v)
2473 {
2474 	struct net *net = (struct net *)m->private;
2475 	fib6_clean_all(net, rt6_info_route, 0, m);
2476 	return 0;
2477 }
2478 
2479 static int ipv6_route_open(struct inode *inode, struct file *file)
2480 {
2481 	return single_open_net(inode, file, ipv6_route_show);
2482 }
2483 
2484 static const struct file_operations ipv6_route_proc_fops = {
2485 	.owner		= THIS_MODULE,
2486 	.open		= ipv6_route_open,
2487 	.read		= seq_read,
2488 	.llseek		= seq_lseek,
2489 	.release	= single_release_net,
2490 };
2491 
2492 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2493 {
2494 	struct net *net = (struct net *)seq->private;
2495 	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2496 		   net->ipv6.rt6_stats->fib_nodes,
2497 		   net->ipv6.rt6_stats->fib_route_nodes,
2498 		   net->ipv6.rt6_stats->fib_rt_alloc,
2499 		   net->ipv6.rt6_stats->fib_rt_entries,
2500 		   net->ipv6.rt6_stats->fib_rt_cache,
2501 		   atomic_read(&net->ipv6.ip6_dst_ops->entries),
2502 		   net->ipv6.rt6_stats->fib_discarded_routes);
2503 
2504 	return 0;
2505 }
2506 
2507 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2508 {
2509 	return single_open_net(inode, file, rt6_stats_seq_show);
2510 }
2511 
2512 static const struct file_operations rt6_stats_seq_fops = {
2513 	.owner	 = THIS_MODULE,
2514 	.open	 = rt6_stats_seq_open,
2515 	.read	 = seq_read,
2516 	.llseek	 = seq_lseek,
2517 	.release = single_release_net,
2518 };
2519 #endif	/* CONFIG_PROC_FS */
2520 
2521 #ifdef CONFIG_SYSCTL
2522 
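/*
 *	Handler for the write-only "flush" sysctl: a write triggers a run
 *	of the FIB garbage collector via fib6_run_gc(), using the stored
 *	flush_delay as the timeout hint; reads fail with -EINVAL.
 */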
2523 static
2524 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
2525 			      void __user *buffer, size_t *lenp, loff_t *ppos)
2526 {
2527 	struct net *net = current->nsproxy->net_ns;
2528 	int delay = net->ipv6.sysctl.flush_delay;
2529 	if (write) {
2530 		proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2531 		fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2532 		return 0;
2533 	} else
2534 		return -EINVAL;
2535 }
2536 
2537 ctl_table ipv6_route_table_template[] = {
2538 	{
2539 		.procname	=	"flush",
2540 		.data		=	&init_net.ipv6.sysctl.flush_delay,
2541 		.maxlen		=	sizeof(int),
2542 		.mode		=	0200,
2543 		.proc_handler	=	ipv6_sysctl_rtcache_flush
2544 	},
2545 	{
2546 		.ctl_name	=	NET_IPV6_ROUTE_GC_THRESH,
2547 		.procname	=	"gc_thresh",
2548 		.data		=	&ip6_dst_ops_template.gc_thresh,
2549 		.maxlen		=	sizeof(int),
2550 		.mode		=	0644,
2551 		.proc_handler	=	proc_dointvec,
2552 	},
2553 	{
2554 		.ctl_name	=	NET_IPV6_ROUTE_MAX_SIZE,
2555 		.procname	=	"max_size",
2556 		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
2557 		.maxlen		=	sizeof(int),
2558 		.mode		=	0644,
2559 		.proc_handler	=	proc_dointvec,
2560 	},
2561 	{
2562 		.ctl_name	=	NET_IPV6_ROUTE_GC_MIN_INTERVAL,
2563 		.procname	=	"gc_min_interval",
2564 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2565 		.maxlen		=	sizeof(int),
2566 		.mode		=	0644,
2567 		.proc_handler	=	proc_dointvec_jiffies,
2568 		.strategy	=	sysctl_jiffies,
2569 	},
2570 	{
2571 		.ctl_name	=	NET_IPV6_ROUTE_GC_TIMEOUT,
2572 		.procname	=	"gc_timeout",
2573 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2574 		.maxlen		=	sizeof(int),
2575 		.mode		=	0644,
2576 		.proc_handler	=	proc_dointvec_jiffies,
2577 		.strategy	=	sysctl_jiffies,
2578 	},
2579 	{
2580 		.ctl_name	=	NET_IPV6_ROUTE_GC_INTERVAL,
2581 		.procname	=	"gc_interval",
2582 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
2583 		.maxlen		=	sizeof(int),
2584 		.mode		=	0644,
2585 		.proc_handler	=	proc_dointvec_jiffies,
2586 		.strategy	=	sysctl_jiffies,
2587 	},
2588 	{
2589 		.ctl_name	=	NET_IPV6_ROUTE_GC_ELASTICITY,
2590 		.procname	=	"gc_elasticity",
2591 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2592 		.maxlen		=	sizeof(int),
2593 		.mode		=	0644,
2594 		.proc_handler	=	proc_dointvec_jiffies,
2595 		.strategy	=	sysctl_jiffies,
2596 	},
2597 	{
2598 		.ctl_name	=	NET_IPV6_ROUTE_MTU_EXPIRES,
2599 		.procname	=	"mtu_expires",
2600 		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2601 		.maxlen		=	sizeof(int),
2602 		.mode		=	0644,
2603 		.proc_handler	=	proc_dointvec_jiffies,
2604 		.strategy	=	sysctl_jiffies,
2605 	},
2606 	{
2607 		.ctl_name	=	NET_IPV6_ROUTE_MIN_ADVMSS,
2608 		.procname	=	"min_adv_mss",
2609 		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
2610 		.maxlen		=	sizeof(int),
2611 		.mode		=	0644,
2612 		.proc_handler	=	proc_dointvec_jiffies,
2613 		.strategy	=	sysctl_jiffies,
2614 	},
2615 	{
2616 		.ctl_name	=	NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,
2617 		.procname	=	"gc_min_interval_ms",
2618 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2619 		.maxlen		=	sizeof(int),
2620 		.mode		=	0644,
2621 		.proc_handler	=	proc_dointvec_ms_jiffies,
2622 		.strategy	=	sysctl_ms_jiffies,
2623 	},
2624 	{ .ctl_name = 0 }
2625 };
2626 
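/*
 *	Duplicate the sysctl template for a namespace and re-point each
 *	.data field at that namespace's settings; the indices below must
 *	stay in step with the ordering of ipv6_route_table_template.
 */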
2627 struct ctl_table *ipv6_route_sysctl_init(struct net *net)
2628 {
2629 	struct ctl_table *table;
2630 
2631 	table = kmemdup(ipv6_route_table_template,
2632 			sizeof(ipv6_route_table_template),
2633 			GFP_KERNEL);
2634 
2635 	if (table) {
2636 		table[0].data = &net->ipv6.sysctl.flush_delay;
2637 		table[1].data = &net->ipv6.ip6_dst_ops->gc_thresh;
2638 		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2639 		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2640 		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2641 		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2642 		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2643 		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2644 		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2645 	}
2646 
2647 	return table;
2648 }
2649 #endif
2650 
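/*
 *	Per-namespace setup: clone the dst_ops and the special null (and,
 *	with policy routing, prohibit / blackhole) route templates, apply
 *	the default sysctl values and create the /proc entries.
 */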
2651 static int ip6_route_net_init(struct net *net)
2652 {
2653 	int ret = -ENOMEM;
2654 
2655 	net->ipv6.ip6_dst_ops = kmemdup(&ip6_dst_ops_template,
2656 					sizeof(*net->ipv6.ip6_dst_ops),
2657 					GFP_KERNEL);
2658 	if (!net->ipv6.ip6_dst_ops)
2659 		goto out;
2660 	net->ipv6.ip6_dst_ops->dst_net = hold_net(net);
2661 
2662 	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2663 					   sizeof(*net->ipv6.ip6_null_entry),
2664 					   GFP_KERNEL);
2665 	if (!net->ipv6.ip6_null_entry)
2666 		goto out_ip6_dst_ops;
2667 	net->ipv6.ip6_null_entry->u.dst.path =
2668 		(struct dst_entry *)net->ipv6.ip6_null_entry;
2669 	net->ipv6.ip6_null_entry->u.dst.ops = net->ipv6.ip6_dst_ops;
2670 
2671 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2672 	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2673 					       sizeof(*net->ipv6.ip6_prohibit_entry),
2674 					       GFP_KERNEL);
2675 	if (!net->ipv6.ip6_prohibit_entry)
2676 		goto out_ip6_null_entry;
2677 	net->ipv6.ip6_prohibit_entry->u.dst.path =
2678 		(struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2679 	net->ipv6.ip6_prohibit_entry->u.dst.ops = net->ipv6.ip6_dst_ops;
2680 
2681 	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2682 					       sizeof(*net->ipv6.ip6_blk_hole_entry),
2683 					       GFP_KERNEL);
2684 	if (!net->ipv6.ip6_blk_hole_entry)
2685 		goto out_ip6_prohibit_entry;
2686 	net->ipv6.ip6_blk_hole_entry->u.dst.path =
2687 		(struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2688 	net->ipv6.ip6_blk_hole_entry->u.dst.ops = net->ipv6.ip6_dst_ops;
2689 #endif
2690 
2691 	net->ipv6.sysctl.flush_delay = 0;
2692 	net->ipv6.sysctl.ip6_rt_max_size = 4096;
2693 	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2694 	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2695 	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2696 	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2697 	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2698 	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2699 
2700 #ifdef CONFIG_PROC_FS
2701 	proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2702 	proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2703 #endif
2704 	net->ipv6.ip6_rt_gc_expire = 30*HZ;
2705 
2706 	ret = 0;
2707 out:
2708 	return ret;
2709 
2710 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2711 out_ip6_prohibit_entry:
2712 	kfree(net->ipv6.ip6_prohibit_entry);
2713 out_ip6_null_entry:
2714 	kfree(net->ipv6.ip6_null_entry);
2715 #endif
2716 out_ip6_dst_ops:
2717 	release_net(net->ipv6.ip6_dst_ops->dst_net);
2718 	kfree(net->ipv6.ip6_dst_ops);
2719 	goto out;
2720 }
2721 
2722 static void ip6_route_net_exit(struct net *net)
2723 {
2724 #ifdef CONFIG_PROC_FS
2725 	proc_net_remove(net, "ipv6_route");
2726 	proc_net_remove(net, "rt6_stats");
2727 #endif
2728 	kfree(net->ipv6.ip6_null_entry);
2729 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2730 	kfree(net->ipv6.ip6_prohibit_entry);
2731 	kfree(net->ipv6.ip6_blk_hole_entry);
2732 #endif
2733 	release_net(net->ipv6.ip6_dst_ops->dst_net);
2734 	kfree(net->ipv6.ip6_dst_ops);
2735 }
2736 
2737 static struct pernet_operations ip6_route_net_ops = {
2738 	.init = ip6_route_net_init,
2739 	.exit = ip6_route_net_exit,
2740 };
2741 
2742 static struct notifier_block ip6_route_dev_notifier = {
2743 	.notifier_call = ip6_route_dev_notify,
2744 	.priority = 0,
2745 };
2746 
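/*
 *	Boot-time initialization: create the rt6_info slab cache, register
 *	the per-namespace operations, the fib6, xfrm6 and routing-rule
 *	subsystems, the rtnetlink message handlers and the netdevice
 *	notifier, unwinding in reverse order on failure.
 */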
2747 int __init ip6_route_init(void)
2748 {
2749 	int ret;
2750 
2751 	ret = -ENOMEM;
2752 	ip6_dst_ops_template.kmem_cachep =
2753 		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2754 				  SLAB_HWCACHE_ALIGN, NULL);
2755 	if (!ip6_dst_ops_template.kmem_cachep)
2756 		goto out;
2757 
2758 	ret = register_pernet_subsys(&ip6_route_net_ops);
2759 	if (ret)
2760 		goto out_kmem_cache;
2761 
2762 	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2763 
2764 	/* The loopback device is registered before this portion of code
2765 	 * runs, so the loopback reference in rt6_info is not taken
2766 	 * automatically; take it manually for init_net */
2767 	init_net.ipv6.ip6_null_entry->u.dst.dev = init_net.loopback_dev;
2768 	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2769   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2770 	init_net.ipv6.ip6_prohibit_entry->u.dst.dev = init_net.loopback_dev;
2771 	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2772 	init_net.ipv6.ip6_blk_hole_entry->u.dst.dev = init_net.loopback_dev;
2773 	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2774   #endif
2775 	ret = fib6_init();
2776 	if (ret)
2777 		goto out_register_subsys;
2778 
2779 	ret = xfrm6_init();
2780 	if (ret)
2781 		goto out_fib6_init;
2782 
2783 	ret = fib6_rules_init();
2784 	if (ret)
2785 		goto xfrm6_init;
2786 
2787 	ret = -ENOBUFS;
2788 	if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL) ||
2789 	    __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL) ||
2790 	    __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL))
2791 		goto fib6_rules_init;
2792 
2793 	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
2794 	if (ret)
2795 		goto fib6_rules_init;
2796 
2797 out:
2798 	return ret;
2799 
2800 fib6_rules_init:
2801 	fib6_rules_cleanup();
2802 xfrm6_init:
2803 	xfrm6_fini();
2804 out_fib6_init:
2805 	fib6_gc_cleanup();
2806 out_register_subsys:
2807 	unregister_pernet_subsys(&ip6_route_net_ops);
2808 out_kmem_cache:
2809 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2810 	goto out;
2811 }
2812 
2813 void ip6_route_cleanup(void)
2814 {
2815 	unregister_netdevice_notifier(&ip6_route_dev_notifier);
2816 	fib6_rules_cleanup();
2817 	xfrm6_fini();
2818 	fib6_gc_cleanup();
2819 	unregister_pernet_subsys(&ip6_route_net_ops);
2820 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2821 }
2822