xref: /openbmc/linux/net/ipv6/route.c (revision 2d7202bfdd28687073f5efef8d2f51bbab0af867)
1 /*
2  *	Linux INET6 implementation
3  *	FIB front-end.
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	$Id: route.c,v 1.56 2001/10/31 21:55:55 davem Exp $
9  *
10  *	This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  */
15 
16 /*	Changes:
17  *
18  *	YOSHIFUJI Hideaki @USAGI
19  *		reworked default router selection.
20  *		- respect outgoing interface
21  *		- select from (probably) reachable routers (i.e.
22  *		routers in REACHABLE, STALE, DELAY or PROBE states).
23  *		- always select the same router if it is (probably)
24  *		reachable.  otherwise, round-robin the list.
25  */
26 
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/init.h>
38 #include <linux/if_arp.h>
39 
40 #ifdef 	CONFIG_PROC_FS
41 #include <linux/proc_fs.h>
42 #include <linux/seq_file.h>
43 #endif
44 
45 #include <net/snmp.h>
46 #include <net/ipv6.h>
47 #include <net/ip6_fib.h>
48 #include <net/ip6_route.h>
49 #include <net/ndisc.h>
50 #include <net/addrconf.h>
51 #include <net/tcp.h>
52 #include <linux/rtnetlink.h>
53 #include <net/dst.h>
54 #include <net/xfrm.h>
55 #include <net/netevent.h>
56 #include <net/netlink.h>
57 
58 #include <asm/uaccess.h>
59 
60 #ifdef CONFIG_SYSCTL
61 #include <linux/sysctl.h>
62 #endif
63 
/*
 * Compile-time debug level for this file.  When RT6_DEBUG is 3 or more,
 * RDBG() and RT6_TRACE() expand to printk() calls; otherwise RDBG()
 * expands to nothing and RT6_TRACE() to an empty statement.
 */
64 /* Set to 3 to get tracing. */
65 #define RT6_DEBUG 2
66 
67 #if RT6_DEBUG >= 3
/* RDBG takes an already-parenthesised printk argument list: RDBG(("fmt", a)). */
68 #define RDBG(x) printk x
/* RT6_TRACE takes printk-style arguments and logs them at KERN_DEBUG. */
69 #define RT6_TRACE(x...) printk(KERN_DEBUG x)
70 #else
71 #define RDBG(x)
72 #define RT6_TRACE(x...) do { ; } while (0)
73 #endif
74 
/*
 * When 0, ip6_pol_route_input()/ip6_pol_route_output() do not call
 * rt6_alloc_clone() for routes that already have a next hop; they return
 * the matched route itself.
 */
75 #define CLONE_OFFLINK_ROUTE 0
76 
/*
 * Flags for the "strict" argument of rt6_select()/rt6_score_route():
 *   RT6_SELECT_F_IFACE     - reject routes whose device does not match oif.
 *   RT6_SELECT_F_REACHABLE - reject routes for which rt6_check_neigh()
 *                            returns 0.
 */
77 #define RT6_SELECT_F_IFACE	0x1
78 #define RT6_SELECT_F_REACHABLE	0x2
79 
/*
 * Routing-cache tunables.  Values written as multiples of HZ are in
 * jiffies.  NOTE(review): these are presumably exported through sysctl
 * further down the file - not visible from here.
 */
80 static int ip6_rt_max_size = 4096;		/* ip6_dst_gc() reports pressure above this many entries */
81 static int ip6_rt_gc_min_interval = HZ / 2;	/* ip6_dst_gc() skips a run if the last one was more recent */
82 static int ip6_rt_gc_timeout = 60*HZ;		/* ip6_dst_gc() resets its expiry to half of this */
83 int ip6_rt_gc_interval = 30*HZ;			/* not static; user not visible in this part of the file */
84 static int ip6_rt_gc_elasticity = 9;		/* shift used by ip6_dst_gc() to decay its expiry */
85 static int ip6_rt_mtu_expires = 10*60*HZ;	/* user not visible in this part of the file */
86 static int ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;	/* floor applied by ipv6_advmss() */
87 
/*
 * Forward declarations.  These functions are referenced by the
 * ip6_dst_ops table and the static rt6_info entries below, or are called
 * before their definitions appear.
 */
88 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
89 static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
90 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
91 static void		ip6_dst_destroy(struct dst_entry *);
92 static void		ip6_dst_ifdown(struct dst_entry *,
93 				       struct net_device *dev, int how);
94 static int		 ip6_dst_gc(void);
95 
96 static int		ip6_pkt_discard(struct sk_buff *skb);
97 static int		ip6_pkt_discard_out(struct sk_buff *skb);
98 static void		ip6_link_failure(struct sk_buff *skb);
99 static void		ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
100 
/* Route Information option helpers, used by rt6_route_rcv(). */
101 #ifdef CONFIG_IPV6_ROUTE_INFO
102 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
103 					   struct in6_addr *gwaddr, int ifindex,
104 					   unsigned pref);
105 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
106 					   struct in6_addr *gwaddr, int ifindex);
107 #endif
108 
/*
 * Destination-cache operations for IPv6 routes.  ip6_dst_alloc() passes
 * this table to dst_alloc(), and the static reject entries below point
 * their .ops at it.
 */
109 static struct dst_ops ip6_dst_ops = {
110 	.family			=	AF_INET6,
111 	.protocol		=	__constant_htons(ETH_P_IPV6),
112 	.gc			=	ip6_dst_gc,
113 	.gc_thresh		=	1024,	/* compared against the entry count in ip6_dst_gc() */
114 	.check			=	ip6_dst_check,
115 	.destroy		=	ip6_dst_destroy,
116 	.ifdown			=	ip6_dst_ifdown,
117 	.negative_advice	=	ip6_negative_advice,
118 	.link_failure		=	ip6_link_failure,
119 	.update_pmtu		=	ip6_rt_update_pmtu,
120 	.entry_size		=	sizeof(struct rt6_info),
121 };
122 
/*
 * Statically allocated "no route" entry.  rt6_select() and
 * rt6_device_match() return its address when nothing matches.  It is a
 * reject route on the loopback device: both handlers discard the packet
 * and the dst error is -ENETUNREACH.
 */
123 struct rt6_info ip6_null_entry = {
124 	.u = {
125 		.dst = {
126 			.__refcnt	= ATOMIC_INIT(1),
127 			.__use		= 1,
128 			.dev		= &loopback_dev,
129 			.obsolete	= -1,
130 			.error		= -ENETUNREACH,
131 			.metrics	= { [RTAX_HOPLIMIT - 1] = 255, },
132 			.input		= ip6_pkt_discard,
133 			.output		= ip6_pkt_discard_out,
134 			.ops		= &ip6_dst_ops,
135 			.path		= (struct dst_entry*)&ip6_null_entry,	/* points back at itself */
136 		}
137 	},
138 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
139 	.rt6i_metric	= ~(u32) 0,	/* largest possible metric */
140 	.rt6i_ref	= ATOMIC_INIT(1),
141 };
142 
143 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
144 
/*
 * Static reject entry identical to ip6_null_entry except that its dst
 * error is -EACCES.  Only built with CONFIG_IPV6_MULTIPLE_TABLES.
 * NOTE(review): presumably returned for "prohibit" policy rules - the
 * user is not visible in this part of the file.
 */
145 struct rt6_info ip6_prohibit_entry = {
146 	.u = {
147 		.dst = {
148 			.__refcnt	= ATOMIC_INIT(1),
149 			.__use		= 1,
150 			.dev		= &loopback_dev,
151 			.obsolete	= -1,
152 			.error		= -EACCES,
153 			.metrics	= { [RTAX_HOPLIMIT - 1] = 255, },
154 			.input		= ip6_pkt_discard,
155 			.output		= ip6_pkt_discard_out,
156 			.ops		= &ip6_dst_ops,
157 			.path		= (struct dst_entry*)&ip6_prohibit_entry,	/* points back at itself */
158 		}
159 	},
160 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
161 	.rt6i_metric	= ~(u32) 0,
162 	.rt6i_ref	= ATOMIC_INIT(1),
163 };
164 
/*
 * Static reject entry identical to ip6_null_entry except that its dst
 * error is -EINVAL.  Only built with CONFIG_IPV6_MULTIPLE_TABLES.
 * NOTE(review): presumably returned for "blackhole" policy rules - the
 * user is not visible in this part of the file.
 */
165 struct rt6_info ip6_blk_hole_entry = {
166 	.u = {
167 		.dst = {
168 			.__refcnt	= ATOMIC_INIT(1),
169 			.__use		= 1,
170 			.dev		= &loopback_dev,
171 			.obsolete	= -1,
172 			.error		= -EINVAL,
173 			.metrics	= { [RTAX_HOPLIMIT - 1] = 255, },
174 			.input		= ip6_pkt_discard,
175 			.output		= ip6_pkt_discard_out,
176 			.ops		= &ip6_dst_ops,
177 			.path		= (struct dst_entry*)&ip6_blk_hole_entry,	/* points back at itself */
178 		}
179 	},
180 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
181 	.rt6i_metric	= ~(u32) 0,
182 	.rt6i_ref	= ATOMIC_INIT(1),
183 };
184 
185 #endif
186 
187 /* allocate dst with ip6_dst_ops */
188 static __inline__ struct rt6_info *ip6_dst_alloc(void)
189 {
190 	return (struct rt6_info *)dst_alloc(&ip6_dst_ops);
191 }
192 
193 static void ip6_dst_destroy(struct dst_entry *dst)
194 {
195 	struct rt6_info *rt = (struct rt6_info *)dst;
196 	struct inet6_dev *idev = rt->rt6i_idev;
197 
198 	if (idev != NULL) {
199 		rt->rt6i_idev = NULL;
200 		in6_dev_put(idev);
201 	}
202 }
203 
204 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
205 			   int how)
206 {
207 	struct rt6_info *rt = (struct rt6_info *)dst;
208 	struct inet6_dev *idev = rt->rt6i_idev;
209 
210 	if (dev != &loopback_dev && idev != NULL && idev->dev == dev) {
211 		struct inet6_dev *loopback_idev = in6_dev_get(&loopback_dev);
212 		if (loopback_idev != NULL) {
213 			rt->rt6i_idev = loopback_idev;
214 			in6_dev_put(idev);
215 		}
216 	}
217 }
218 
219 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
220 {
221 	return (rt->rt6i_flags & RTF_EXPIRES &&
222 		time_after(jiffies, rt->rt6i_expires));
223 }
224 
225 static inline int rt6_need_strict(struct in6_addr *daddr)
226 {
227 	return (ipv6_addr_type(daddr) &
228 		(IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL));
229 }
230 
231 /*
232  *	Route lookup. Any table->tb6_lock is implied.
233  */
234 
/*
 * Pick, from the sibling list starting at @rt, the route that best matches
 * the requested interface.
 *
 * @rt:     first route of the list (linked through u.next)
 * @oif:    interface index to match; 0 means "no preference"
 * @strict: non-zero to refuse routes that do not match @oif
 *
 * Returns:
 *   - @rt unchanged when @oif is 0;
 *   - the first route whose device has ifindex == @oif;
 *   - otherwise a loopback-device route remembered in "local", if any;
 *   - otherwise &ip6_null_entry when @strict is set, or @rt when it is not.
 *
 * No reference is taken on the returned route.
 */
235 static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt,
236 						    int oif,
237 						    int strict)
238 {
239 	struct rt6_info *local = NULL;
240 	struct rt6_info *sprt;
241 
242 	if (oif) {
243 		for (sprt = rt; sprt; sprt = sprt->u.next) {
244 			struct net_device *dev = sprt->rt6i_dev;
			/* Exact device match wins immediately. */
245 			if (dev->ifindex == oif)
246 				return sprt;
			/*
			 * A loopback-device route is a fallback candidate.  One
			 * whose inet6_dev belongs to @oif is always remembered;
			 * one that does not is skipped in strict mode, and in
			 * non-strict mode is skipped only if the remembered
			 * candidate already belongs to @oif.
			 *
			 * The "&& oif" / "!oif" tests are redundant here: this
			 * code only runs when oif is non-zero.
			 *
			 * NOTE(review): local->rt6i_idev is dereferenced without
			 * a NULL check, yet "local" can have been set from an
			 * entry whose rt6i_idev is NULL (non-strict path) -
			 * confirm rt6i_idev is never NULL for such routes.
			 */
247 			if (dev->flags & IFF_LOOPBACK) {
248 				if (sprt->rt6i_idev == NULL ||
249 				    sprt->rt6i_idev->dev->ifindex != oif) {
250 					if (strict && oif)
251 						continue;
252 					if (local && (!oif ||
253 						      local->rt6i_idev->dev->ifindex == oif))
254 						continue;
255 				}
256 				local = sprt;
257 			}
258 		}
259 
260 		if (local)
261 			return local;
262 
263 		if (strict)
264 			return &ip6_null_entry;
265 	}
266 	return rt;
267 }
268 
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing.
 *
 * If the route's next-hop neighbour entry is not in a NUD_VALID state and
 * more than the interface's rtr_probe_interval has passed since the entry
 * was last updated, stamp the entry with the current time and send a
 * Neighbour Solicitation for the neighbour's address to its
 * solicited-node multicast address.
 *
 * Router Reachability Probe MUST be rate-limited to no more than one per
 * minute; the limit applied here is the per-interface rtr_probe_interval.
 *
 * @rt: route whose next hop should be probed; NULL is accepted and ignored.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *nexthop;
	struct in6_addr *target;
	struct in6_addr mcaddr;
	int probe_due;

	nexthop = rt ? rt->rt6i_nexthop : NULL;
	if (nexthop == NULL)
		return;

	/* Unlocked peek first; the state is checked again under the lock. */
	if (nexthop->nud_state & NUD_VALID)
		return;

	read_lock_bh(&nexthop->lock);
	probe_due = !(nexthop->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       nexthop->updated + rt->rt6i_idev->cnf.rtr_probe_interval);
	if (probe_due)
		nexthop->updated = jiffies;
	read_unlock_bh(&nexthop->lock);

	if (!probe_due)
		return;

	/* The solicitation itself is sent after the lock is released. */
	target = (struct in6_addr *)&nexthop->primary_key;
	addrconf_addr_solict_mult(target, &mcaddr);
	ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
}
#else
/* Router preference support is compiled out: probing does nothing. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
304 
305 /*
306  * Default Router Selection (RFC 2461 6.3.6)
307  */
308 static int inline rt6_check_dev(struct rt6_info *rt, int oif)
309 {
310 	struct net_device *dev = rt->rt6i_dev;
311 	if (!oif || dev->ifindex == oif)
312 		return 2;
313 	if ((dev->flags & IFF_LOOPBACK) &&
314 	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
315 		return 1;
316 	return 0;
317 }
318 
319 static int inline rt6_check_neigh(struct rt6_info *rt)
320 {
321 	struct neighbour *neigh = rt->rt6i_nexthop;
322 	int m = 0;
323 	if (rt->rt6i_flags & RTF_NONEXTHOP ||
324 	    !(rt->rt6i_flags & RTF_GATEWAY))
325 		m = 1;
326 	else if (neigh) {
327 		read_lock_bh(&neigh->lock);
328 		if (neigh->nud_state & NUD_VALID)
329 			m = 2;
330 		read_unlock_bh(&neigh->lock);
331 	}
332 	return m;
333 }
334 
335 static int rt6_score_route(struct rt6_info *rt, int oif,
336 			   int strict)
337 {
338 	int m, n;
339 
340 	m = rt6_check_dev(rt, oif);
341 	if (!m && (strict & RT6_SELECT_F_IFACE))
342 		return -1;
343 #ifdef CONFIG_IPV6_ROUTER_PREF
344 	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
345 #endif
346 	n = rt6_check_neigh(rt);
347 	if (n > 1)
348 		m |= 16;
349 	else if (!n && strict & RT6_SELECT_F_REACHABLE)
350 		return -1;
351 	return m;
352 }
353 
/*
 * Default Router Selection: choose the best route among the leading
 * entries of a fib6 node's leaf list.
 *
 * @head:   address of the list head (e.g. &fn->leaf); may be rewritten
 *          by the round-robin step below
 * @oif:    interface index to prefer, 0 for none
 * @strict: mask of RT6_SELECT_F_IFACE / RT6_SELECT_F_REACHABLE
 *
 * Only entries sharing the first entry's metric are considered, and
 * expired entries are skipped.  Each candidate is scored with
 * rt6_score_route(); the highest score wins, and on a tie the earlier
 * entry is kept.  Candidates that lose are passed to rt6_probe().
 *
 * Returns the winning route, or &ip6_null_entry if none qualified.  No
 * reference is taken.
 */
354 static struct rt6_info *rt6_select(struct rt6_info **head, int oif,
355 				   int strict)
356 {
357 	struct rt6_info *match = NULL, *last = NULL;
358 	struct rt6_info *rt, *rt0 = *head;
359 	u32 metric;
360 	int mpri = -1;
361 
362 	RT6_TRACE("%s(head=%p(*head=%p), oif=%d)\n",
363 		  __FUNCTION__, head, head ? *head : NULL, oif);
364 
	/*
	 * "last" tracks the most recent non-expired entry visited.  The
	 * (!last || rt != rt0) term ends the walk if it arrives back at
	 * rt0 after at least one entry has been accepted.
	 */
365 	for (rt = rt0, metric = rt0->rt6i_metric;
366 	     rt && rt->rt6i_metric == metric && (!last || rt != rt0);
367 	     rt = rt->u.next) {
368 		int m;
369 
370 		if (rt6_check_expired(rt))
371 			continue;
372 
373 		last = rt;
374 
375 		m = rt6_score_route(rt, oif, strict);
376 		if (m < 0)
377 			continue;
378 
379 		if (m > mpri) {
			/* Probe the previous best (NULL on the first hit is a no-op). */
380 			rt6_probe(match);
381 			match = rt;
382 			mpri = m;
383 		} else {
384 			rt6_probe(rt);
385 		}
386 	}
387 
	/*
	 * Nothing qualified while reachability was required and some
	 * non-expired entry other than rt0 was seen last: rotate the list
	 * by moving rt0 to just after "last".
	 */
388 	if (!match &&
389 	    (strict & RT6_SELECT_F_REACHABLE) &&
390 	    last && last != rt0) {
391 		/* no entries matched; do round-robin */
392 		static DEFINE_SPINLOCK(lock);
393 		spin_lock(&lock);
394 		*head = rt0->u.next;
395 		rt0->u.next = last->u.next;
396 		last->u.next = rt0;
397 		spin_unlock(&lock);
398 	}
399 
400 	RT6_TRACE("%s() => %p, score=%d\n",
401 		  __FUNCTION__, match, mpri);
402 
403 	return (match ? match : &ip6_null_entry);
404 }
405 
406 #ifdef CONFIG_IPV6_ROUTE_INFO
407 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
408 		  struct in6_addr *gwaddr)
409 {
410 	struct route_info *rinfo = (struct route_info *) opt;
411 	struct in6_addr prefix_buf, *prefix;
412 	unsigned int pref;
413 	u32 lifetime;
414 	struct rt6_info *rt;
415 
416 	if (len < sizeof(struct route_info)) {
417 		return -EINVAL;
418 	}
419 
420 	/* Sanity check for prefix_len and length */
421 	if (rinfo->length > 3) {
422 		return -EINVAL;
423 	} else if (rinfo->prefix_len > 128) {
424 		return -EINVAL;
425 	} else if (rinfo->prefix_len > 64) {
426 		if (rinfo->length < 2) {
427 			return -EINVAL;
428 		}
429 	} else if (rinfo->prefix_len > 0) {
430 		if (rinfo->length < 1) {
431 			return -EINVAL;
432 		}
433 	}
434 
435 	pref = rinfo->route_pref;
436 	if (pref == ICMPV6_ROUTER_PREF_INVALID)
437 		pref = ICMPV6_ROUTER_PREF_MEDIUM;
438 
439 	lifetime = htonl(rinfo->lifetime);
440 	if (lifetime == 0xffffffff) {
441 		/* infinity */
442 	} else if (lifetime > 0x7fffffff/HZ) {
443 		/* Avoid arithmetic overflow */
444 		lifetime = 0x7fffffff/HZ - 1;
445 	}
446 
447 	if (rinfo->length == 3)
448 		prefix = (struct in6_addr *)rinfo->prefix;
449 	else {
450 		/* this function is safe */
451 		ipv6_addr_prefix(&prefix_buf,
452 				 (struct in6_addr *)rinfo->prefix,
453 				 rinfo->prefix_len);
454 		prefix = &prefix_buf;
455 	}
456 
457 	rt = rt6_get_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex);
458 
459 	if (rt && !lifetime) {
460 		ip6_del_rt(rt);
461 		rt = NULL;
462 	}
463 
464 	if (!rt && lifetime)
465 		rt = rt6_add_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
466 					pref);
467 	else if (rt)
468 		rt->rt6i_flags = RTF_ROUTEINFO |
469 				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
470 
471 	if (rt) {
472 		if (lifetime == 0xffffffff) {
473 			rt->rt6i_flags &= ~RTF_EXPIRES;
474 		} else {
475 			rt->rt6i_expires = jiffies + HZ * lifetime;
476 			rt->rt6i_flags |= RTF_EXPIRES;
477 		}
478 		dst_release(&rt->u.dst);
479 	}
480 	return 0;
481 }
482 #endif
483 
/*
 * BACKTRACK() - walk up the fib6 tree after a strict lookup failed.
 *
 * Expands in place and relies on the enclosing function providing the
 * variables "rt", "fn" and "flags" and the labels "restart" and "out".
 *
 * When rt is &ip6_null_entry and RT6_F_STRICT is set, fn is moved to each
 * parent in turn:
 *   - on reaching a node flagged RTN_TL_ROOT, a reference is taken on rt
 *     (still the null entry) and control jumps to "out";
 *   - on reaching a node flagged RTN_RTINFO, control jumps to "restart"
 *     so selection is retried on that node.
 * If the parent chain ends without either, execution falls through.
 *
 * Because the macro already calls dst_hold() before "goto out", the code
 * at "out" must account for that reference.
 */
484 #define BACKTRACK() \
485 if (rt == &ip6_null_entry && flags & RT6_F_STRICT) { \
486 	while ((fn = fn->parent) != NULL) { \
487 		if (fn->fn_flags & RTN_TL_ROOT) { \
488 			dst_hold(&rt->u.dst); \
489 			goto out; \
490 		} \
491 		if (fn->fn_flags & RTN_RTINFO) \
492 			goto restart; \
493 	} \
494 }
495 
/*
 * Per-table lookup callback used by rt6_lookup() via fib6_rule_lookup().
 *
 * @table: routing table to search
 * @fl:    flow; fl6_dst, fl6_src and oif are used
 * @flags: RT6_F_STRICT requests a strict interface match
 *
 * Finds the fib6 node for the flow, picks a route from its leaf list with
 * rt6_device_match(), and backtracks towards the root if a strict match
 * failed.  Returns the route with a reference held and its usage
 * statistics updated; the result may be &ip6_null_entry.
 */
496 static struct rt6_info *ip6_pol_route_lookup(struct fib6_table *table,
497 					     struct flowi *fl, int flags)
498 {
499 	struct fib6_node *fn;
500 	struct rt6_info *rt;
501 
502 	read_lock_bh(&table->tb6_lock);
503 	fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
504 restart:
505 	rt = fn->leaf;
506 	rt = rt6_device_match(rt, fl->oif, flags & RT6_F_STRICT);
	/* May jump to "restart", or to "out" with the reference already taken. */
507 	BACKTRACK();
508 	dst_hold(&rt->u.dst);
509 out:
510 	read_unlock_bh(&table->tb6_lock);
511 
512 	rt->u.dst.lastuse = jiffies;
513 	rt->u.dst.__use++;
514 
515 	return rt;
516 
517 }
518 
519 struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
520 			    int oif, int strict)
521 {
522 	struct flowi fl = {
523 		.oif = oif,
524 		.nl_u = {
525 			.ip6_u = {
526 				.daddr = *daddr,
527 				/* TODO: saddr */
528 			},
529 		},
530 	};
531 	struct dst_entry *dst;
532 	int flags = strict ? RT6_F_STRICT : 0;
533 
534 	dst = fib6_rule_lookup(&fl, flags, ip6_pol_route_lookup);
535 	if (dst->error == 0)
536 		return (struct rt6_info *) dst;
537 
538 	dst_release(dst);
539 
540 	return NULL;
541 }
542 
/* ip6_ins_rt is called with table->tb6_lock NOT held; the lock is taken
   inside.  It takes a new route entry; if the addition fails for any
   reason, the route is freed.  In any case, if the caller does not hold
   a reference to the route, it may be destroyed.
 */
548 
549 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
550 {
551 	int err;
552 	struct fib6_table *table;
553 
554 	table = rt->rt6i_table;
555 	write_lock_bh(&table->tb6_lock);
556 	err = fib6_add(&table->tb6_root, rt, info);
557 	write_unlock_bh(&table->tb6_lock);
558 
559 	return err;
560 }
561 
562 int ip6_ins_rt(struct rt6_info *rt)
563 {
564 	return __ip6_ins_rt(rt, NULL);
565 }
566 
567 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
568 				      struct in6_addr *saddr)
569 {
570 	struct rt6_info *rt;
571 
572 	/*
573 	 *	Clone the route.
574 	 */
575 
576 	rt = ip6_rt_copy(ort);
577 
578 	if (rt) {
579 		if (!(rt->rt6i_flags&RTF_GATEWAY)) {
580 			if (rt->rt6i_dst.plen != 128 &&
581 			    ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
582 				rt->rt6i_flags |= RTF_ANYCAST;
583 			ipv6_addr_copy(&rt->rt6i_gateway, daddr);
584 		}
585 
586 		ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
587 		rt->rt6i_dst.plen = 128;
588 		rt->rt6i_flags |= RTF_CACHE;
589 		rt->u.dst.flags |= DST_HOST;
590 
591 #ifdef CONFIG_IPV6_SUBTREES
592 		if (rt->rt6i_src.plen && saddr) {
593 			ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
594 			rt->rt6i_src.plen = 128;
595 		}
596 #endif
597 
598 		rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
599 
600 	}
601 
602 	return rt;
603 }
604 
605 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
606 {
607 	struct rt6_info *rt = ip6_rt_copy(ort);
608 	if (rt) {
609 		ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
610 		rt->rt6i_dst.plen = 128;
611 		rt->rt6i_flags |= RTF_CACHE;
612 		if (rt->rt6i_flags & RTF_REJECT)
613 			rt->u.dst.error = ort->u.dst.error;
614 		rt->u.dst.flags |= DST_HOST;
615 		rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
616 	}
617 	return rt;
618 }
619 
/*
 * Per-table lookup callback for the input path; ip6_route_input() passes
 * it to fib6_rule_lookup().
 *
 * @table: routing table to search
 * @fl:    flow; fl6_dst, fl6_src and iif are used
 * @flags: RT6_F_STRICT requests a strict interface match
 *
 * Selection runs first with RT6_SELECT_F_REACHABLE.  If that pass ends at
 * "out" (null entry or an RTF_CACHE route), it is repeated once without
 * the reachability requirement, and the second result is returned.
 *
 * A selected route that is neither the null entry nor RTF_CACHE is
 * handled with the table lock released:
 *   - no next hop and not RTF_NONEXTHOP: a host clone is made with
 *     rt6_alloc_cow() and inserted; if insertion fails the lookup is
 *     repeated, at most 3 attempts in total;
 *   - otherwise, with CLONE_OFFLINK_ROUTE == 0, the route itself is
 *     returned.
 *
 * Returns a route with a reference held and its usage statistics
 * updated.  The result may be &ip6_null_entry.
 */
620 static struct rt6_info *ip6_pol_route_input(struct fib6_table *table,
621 					    struct flowi *fl, int flags)
622 {
623 	struct fib6_node *fn;
624 	struct rt6_info *rt, *nrt;
625 	int strict = 0;
626 	int attempts = 3;
627 	int err;
628 	int reachable = RT6_SELECT_F_REACHABLE;
629 
630 	if (flags & RT6_F_STRICT)
631 		strict = RT6_SELECT_F_IFACE;
632 
633 relookup:
634 	read_lock_bh(&table->tb6_lock);
635 
636 restart_2:
637 	fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
638 
639 restart:
640 	rt = rt6_select(&fn->leaf, fl->iif, strict | reachable);
	/* May jump to "restart", or take a reference on the null entry and jump to "out". */
641 	BACKTRACK();
642 	if (rt == &ip6_null_entry ||
643 	    rt->rt6i_flags & RTF_CACHE)
644 		goto out;
645 
	/* Pin the route, then drop the table lock for the clone/insert work. */
646 	dst_hold(&rt->u.dst);
647 	read_unlock_bh(&table->tb6_lock);
648 
649 	if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
650 		nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
651 	else {
652 #if CLONE_OFFLINK_ROUTE
653 		nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
654 #else
655 		goto out2;
656 #endif
657 	}
658 
	/* Swap the reference from the original to the clone (or the null entry). */
659 	dst_release(&rt->u.dst);
660 	rt = nrt ? : &ip6_null_entry;
661 
662 	dst_hold(&rt->u.dst);
663 	if (nrt) {
664 		err = ip6_ins_rt(nrt);
665 		if (!err)
666 			goto out2;
667 	}
668 
	/* Out of retries: return what we hold (the uninserted clone or the null entry). */
669 	if (--attempts <= 0)
670 		goto out2;
671 
672 	/*
673 	 * Race condition! In the gap, when table->tb6_lock was
674 	 * released someone could insert this route.  Relookup.
675 	 */
676 	dst_release(&rt->u.dst);
677 	goto relookup;
678 
679 out:
	/* First pass ended here: retry once without requiring reachability. */
680 	if (reachable) {
681 		reachable = 0;
682 		goto restart_2;
683 	}
	/*
	 * NOTE(review): when "out" is reached through BACKTRACK(), the macro
	 * has already called dst_hold() on ip6_null_entry, so this hold (and
	 * the retry above) leaves an extra reference on that static entry -
	 * confirm this is intended.
	 */
684 	dst_hold(&rt->u.dst);
685 	read_unlock_bh(&table->tb6_lock);
686 out2:
687 	rt->u.dst.lastuse = jiffies;
688 	rt->u.dst.__use++;
689 
690 	return rt;
691 }
692 
693 void ip6_route_input(struct sk_buff *skb)
694 {
695 	struct ipv6hdr *iph = skb->nh.ipv6h;
696 	struct flowi fl = {
697 		.iif = skb->dev->ifindex,
698 		.nl_u = {
699 			.ip6_u = {
700 				.daddr = iph->daddr,
701 				.saddr = iph->saddr,
702 				.flowlabel = (* (u32 *) iph)&IPV6_FLOWINFO_MASK,
703 			},
704 		},
705 		.proto = iph->nexthdr,
706 	};
707 	int flags = 0;
708 
709 	if (rt6_need_strict(&iph->daddr))
710 		flags |= RT6_F_STRICT;
711 
712 	skb->dst = fib6_rule_lookup(&fl, flags, ip6_pol_route_input);
713 }
714 
/*
 * Per-table lookup callback for the output path; ip6_route_output()
 * passes it to fib6_rule_lookup().
 *
 * @table: routing table to search
 * @fl:    flow; fl6_dst, fl6_src and oif are used
 * @flags: RT6_F_STRICT requests a strict interface match
 *
 * The logic mirrors ip6_pol_route_input() except that the interface
 * handed to rt6_select() is fl->oif.  Selection runs first with
 * RT6_SELECT_F_REACHABLE and, if that pass ends at "out", once more
 * without it.  A selected route with no next hop and without
 * RTF_NONEXTHOP is cloned with rt6_alloc_cow() and inserted (at most 3
 * attempts); other routes are returned as they are while
 * CLONE_OFFLINK_ROUTE is 0.
 *
 * Returns a route with a reference held and its usage statistics
 * updated.  The result may be &ip6_null_entry.
 */
715 static struct rt6_info *ip6_pol_route_output(struct fib6_table *table,
716 					     struct flowi *fl, int flags)
717 {
718 	struct fib6_node *fn;
719 	struct rt6_info *rt, *nrt;
720 	int strict = 0;
721 	int attempts = 3;
722 	int err;
723 	int reachable = RT6_SELECT_F_REACHABLE;
724 
725 	if (flags & RT6_F_STRICT)
726 		strict = RT6_SELECT_F_IFACE;
727 
728 relookup:
729 	read_lock_bh(&table->tb6_lock);
730 
731 restart_2:
732 	fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
733 
734 restart:
735 	rt = rt6_select(&fn->leaf, fl->oif, strict | reachable);
	/* May jump to "restart", or take a reference on the null entry and jump to "out". */
736 	BACKTRACK();
737 	if (rt == &ip6_null_entry ||
738 	    rt->rt6i_flags & RTF_CACHE)
739 		goto out;
740 
	/* Pin the route, then drop the table lock for the clone/insert work. */
741 	dst_hold(&rt->u.dst);
742 	read_unlock_bh(&table->tb6_lock);
743 
744 	if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
745 		nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
746 	else {
747 #if CLONE_OFFLINK_ROUTE
748 		nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
749 #else
750 		goto out2;
751 #endif
752 	}
753 
	/* Swap the reference from the original to the clone (or the null entry). */
754 	dst_release(&rt->u.dst);
755 	rt = nrt ? : &ip6_null_entry;
756 
757 	dst_hold(&rt->u.dst);
758 	if (nrt) {
759 		err = ip6_ins_rt(nrt);
760 		if (!err)
761 			goto out2;
762 	}
763 
	/* Out of retries: return what we hold (the uninserted clone or the null entry). */
764 	if (--attempts <= 0)
765 		goto out2;
766 
767 	/*
768 	 * Race condition! In the gap, when table->tb6_lock was
769 	 * released someone could insert this route.  Relookup.
770 	 */
771 	dst_release(&rt->u.dst);
772 	goto relookup;
773 
774 out:
	/* First pass ended here: retry once without requiring reachability. */
775 	if (reachable) {
776 		reachable = 0;
777 		goto restart_2;
778 	}
	/*
	 * NOTE(review): when "out" is reached through BACKTRACK(), the macro
	 * has already called dst_hold() on ip6_null_entry, so this hold (and
	 * the retry above) leaves an extra reference on that static entry -
	 * confirm this is intended.
	 */
779 	dst_hold(&rt->u.dst);
780 	read_unlock_bh(&table->tb6_lock);
781 out2:
782 	rt->u.dst.lastuse = jiffies;
783 	rt->u.dst.__use++;
784 	return rt;
785 }
786 
787 struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl)
788 {
789 	int flags = 0;
790 
791 	if (rt6_need_strict(&fl->fl6_dst))
792 		flags |= RT6_F_STRICT;
793 
794 	return fib6_rule_lookup(fl, flags, ip6_pol_route_output);
795 }
796 
797 
798 /*
799  *	Destination cache support functions
800  */
801 
802 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
803 {
804 	struct rt6_info *rt;
805 
806 	rt = (struct rt6_info *) dst;
807 
808 	if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
809 		return dst;
810 
811 	return NULL;
812 }
813 
814 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
815 {
816 	struct rt6_info *rt = (struct rt6_info *) dst;
817 
818 	if (rt) {
819 		if (rt->rt6i_flags & RTF_CACHE)
820 			ip6_del_rt(rt);
821 		else
822 			dst_release(dst);
823 	}
824 	return NULL;
825 }
826 
827 static void ip6_link_failure(struct sk_buff *skb)
828 {
829 	struct rt6_info *rt;
830 
831 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
832 
833 	rt = (struct rt6_info *) skb->dst;
834 	if (rt) {
835 		if (rt->rt6i_flags&RTF_CACHE) {
836 			dst_set_expires(&rt->u.dst, 0);
837 			rt->rt6i_flags |= RTF_EXPIRES;
838 		} else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
839 			rt->rt6i_node->fn_sernum = -1;
840 	}
841 }
842 
843 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
844 {
845 	struct rt6_info *rt6 = (struct rt6_info*)dst;
846 
847 	if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
848 		rt6->rt6i_flags |= RTF_MODIFIED;
849 		if (mtu < IPV6_MIN_MTU) {
850 			mtu = IPV6_MIN_MTU;
851 			dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
852 		}
853 		dst->metrics[RTAX_MTU-1] = mtu;
854 		call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
855 	}
856 }
857 
858 static int ipv6_get_mtu(struct net_device *dev);
859 
860 static inline unsigned int ipv6_advmss(unsigned int mtu)
861 {
862 	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
863 
864 	if (mtu < ip6_rt_min_advmss)
865 		mtu = ip6_rt_min_advmss;
866 
867 	/*
868 	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
869 	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
870 	 * IPV6_MAXPLEN is also valid and means: "any MSS,
871 	 * rely only on pmtu discovery"
872 	 */
873 	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
874 		mtu = IPV6_MAXPLEN;
875 	return mtu;
876 }
877 
878 static struct dst_entry *ndisc_dst_gc_list;
879 static DEFINE_SPINLOCK(ndisc_lock);
880 
881 struct dst_entry *ndisc_dst_alloc(struct net_device *dev,
882 				  struct neighbour *neigh,
883 				  struct in6_addr *addr,
884 				  int (*output)(struct sk_buff *))
885 {
886 	struct rt6_info *rt;
887 	struct inet6_dev *idev = in6_dev_get(dev);
888 
889 	if (unlikely(idev == NULL))
890 		return NULL;
891 
892 	rt = ip6_dst_alloc();
893 	if (unlikely(rt == NULL)) {
894 		in6_dev_put(idev);
895 		goto out;
896 	}
897 
898 	dev_hold(dev);
899 	if (neigh)
900 		neigh_hold(neigh);
901 	else
902 		neigh = ndisc_get_neigh(dev, addr);
903 
904 	rt->rt6i_dev	  = dev;
905 	rt->rt6i_idev     = idev;
906 	rt->rt6i_nexthop  = neigh;
907 	atomic_set(&rt->u.dst.__refcnt, 1);
908 	rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
909 	rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
910 	rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
911 	rt->u.dst.output  = output;
912 
913 #if 0	/* there's no chance to use these for ndisc */
914 	rt->u.dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
915 				? DST_HOST
916 				: 0;
917 	ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
918 	rt->rt6i_dst.plen = 128;
919 #endif
920 
921 	spin_lock_bh(&ndisc_lock);
922 	rt->u.dst.next = ndisc_dst_gc_list;
923 	ndisc_dst_gc_list = &rt->u.dst;
924 	spin_unlock_bh(&ndisc_lock);
925 
926 	fib6_force_start_gc();
927 
928 out:
929 	return (struct dst_entry *)rt;
930 }
931 
932 int ndisc_dst_gc(int *more)
933 {
934 	struct dst_entry *dst, *next, **pprev;
935 	int freed;
936 
937 	next = NULL;
938  	freed = 0;
939 
940 	spin_lock_bh(&ndisc_lock);
941 	pprev = &ndisc_dst_gc_list;
942 
943 	while ((dst = *pprev) != NULL) {
944 		if (!atomic_read(&dst->__refcnt)) {
945 			*pprev = dst->next;
946 			dst_free(dst);
947 			freed++;
948 		} else {
949 			pprev = &dst->next;
950 			(*more)++;
951 		}
952 	}
953 
954 	spin_unlock_bh(&ndisc_lock);
955 
956 	return freed;
957 }
958 
959 static int ip6_dst_gc(void)
960 {
961 	static unsigned expire = 30*HZ;
962 	static unsigned long last_gc;
963 	unsigned long now = jiffies;
964 
965 	if (time_after(last_gc + ip6_rt_gc_min_interval, now) &&
966 	    atomic_read(&ip6_dst_ops.entries) <= ip6_rt_max_size)
967 		goto out;
968 
969 	expire++;
970 	fib6_run_gc(expire);
971 	last_gc = now;
972 	if (atomic_read(&ip6_dst_ops.entries) < ip6_dst_ops.gc_thresh)
973 		expire = ip6_rt_gc_timeout>>1;
974 
975 out:
976 	expire -= expire>>ip6_rt_gc_elasticity;
977 	return (atomic_read(&ip6_dst_ops.entries) > ip6_rt_max_size);
978 }
979 
980 /* Clean host part of a prefix. Not necessary in radix tree,
981    but results in cleaner routing tables.
982 
983    Remove it only when all the things will work!
984  */
985 
986 static int ipv6_get_mtu(struct net_device *dev)
987 {
988 	int mtu = IPV6_MIN_MTU;
989 	struct inet6_dev *idev;
990 
991 	idev = in6_dev_get(dev);
992 	if (idev) {
993 		mtu = idev->cnf.mtu6;
994 		in6_dev_put(idev);
995 	}
996 	return mtu;
997 }
998 
999 int ipv6_get_hoplimit(struct net_device *dev)
1000 {
1001 	int hoplimit = ipv6_devconf.hop_limit;
1002 	struct inet6_dev *idev;
1003 
1004 	idev = in6_dev_get(dev);
1005 	if (idev) {
1006 		hoplimit = idev->cnf.hop_limit;
1007 		in6_dev_put(idev);
1008 	}
1009 	return hoplimit;
1010 }
1011 
1012 /*
1013  *
1014  */
1015 
/*
 * Create a route from a fib6_config and insert it into its table.
 *
 * @cfg: route description.  Note that fc_metric and fc_protocol are
 *       rewritten in place when they hold their "unspecified" values.
 *
 * Outline:
 *   1. validate prefix lengths and resolve fc_ifindex to dev/idev
 *      (taking a reference on each);
 *   2. obtain the table and allocate the rt6_info;
 *   3. fill in protocol, handlers, destination/source prefixes and metric;
 *   4. reject routes (and non-loopback destinations routed through a
 *      loopback device) are bound to the loopback device and jump
 *      straight to installation;
 *   5. gateway routes validate the gateway address and may derive the
 *      device from a route to the gateway;
 *   6. gateway / no-nexthop routes get a neighbour entry;
 *   7. metrics are applied and defaulted, and the route is inserted.
 *
 * Returns the result of __ip6_ins_rt() on the success path.  On failure
 * returns a negative errno after dropping the dev/idev references and
 * freeing the route.
 */
1016 int ip6_route_add(struct fib6_config *cfg)
1017 {
1018 	int err;
1019 	struct rt6_info *rt = NULL;
1020 	struct net_device *dev = NULL;
1021 	struct inet6_dev *idev = NULL;
1022 	struct fib6_table *table;
1023 	int addr_type;
1024 
1025 	if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1026 		return -EINVAL;
	/* Source-specific routes need subtree support. */
1027 #ifndef CONFIG_IPV6_SUBTREES
1028 	if (cfg->fc_src_len)
1029 		return -EINVAL;
1030 #endif
1031 	if (cfg->fc_ifindex) {
1032 		err = -ENODEV;
1033 		dev = dev_get_by_index(cfg->fc_ifindex);
1034 		if (!dev)
1035 			goto out;
1036 		idev = in6_dev_get(dev);
1037 		if (!idev)
1038 			goto out;
1039 	}
1040 
1041 	if (cfg->fc_metric == 0)
1042 		cfg->fc_metric = IP6_RT_PRIO_USER;
1043 
1044 	table = fib6_new_table(cfg->fc_table);
1045 	if (table == NULL) {
1046 		err = -ENOBUFS;
1047 		goto out;
1048 	}
1049 
1050 	rt = ip6_dst_alloc();
1051 
1052 	if (rt == NULL) {
1053 		err = -ENOMEM;
1054 		goto out;
1055 	}
1056 
1057 	rt->u.dst.obsolete = -1;
	/* fc_expires is converted from clock ticks to jiffies. */
1058 	rt->rt6i_expires = jiffies + clock_t_to_jiffies(cfg->fc_expires);
1059 
1060 	if (cfg->fc_protocol == RTPROT_UNSPEC)
1061 		cfg->fc_protocol = RTPROT_BOOT;
1062 	rt->rt6i_protocol = cfg->fc_protocol;
1063 
1064 	addr_type = ipv6_addr_type(&cfg->fc_dst);
1065 
	/* Input handler: multicast delivery or forwarding. */
1066 	if (addr_type & IPV6_ADDR_MULTICAST)
1067 		rt->u.dst.input = ip6_mc_input;
1068 	else
1069 		rt->u.dst.input = ip6_forward;
1070 
1071 	rt->u.dst.output = ip6_output;
1072 
	/* Store the destination with bits beyond the prefix length cleared. */
1073 	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1074 	rt->rt6i_dst.plen = cfg->fc_dst_len;
1075 	if (rt->rt6i_dst.plen == 128)
1076 	       rt->u.dst.flags = DST_HOST;
1077 
1078 #ifdef CONFIG_IPV6_SUBTREES
1079 	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1080 	rt->rt6i_src.plen = cfg->fc_src_len;
1081 #endif
1082 
1083 	rt->rt6i_metric = cfg->fc_metric;
1084 
1085 	/* We cannot add true routes via loopback here,
1086 	   they would result in kernel looping; promote them to reject routes
1087 	 */
1088 	if ((cfg->fc_flags & RTF_REJECT) ||
1089 	    (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
1090 		/* hold loopback dev/idev if we haven't done so. */
1091 		if (dev != &loopback_dev) {
			/* Swap any references on the requested device for ones on loopback. */
1092 			if (dev) {
1093 				dev_put(dev);
1094 				in6_dev_put(idev);
1095 			}
1096 			dev = &loopback_dev;
1097 			dev_hold(dev);
1098 			idev = in6_dev_get(dev);
1099 			if (!idev) {
1100 				err = -ENODEV;
1101 				goto out;
1102 			}
1103 		}
1104 		rt->u.dst.output = ip6_pkt_discard_out;
1105 		rt->u.dst.input = ip6_pkt_discard;
1106 		rt->u.dst.error = -ENETUNREACH;
1107 		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1108 		goto install_route;
1109 	}
1110 
1111 	if (cfg->fc_flags & RTF_GATEWAY) {
1112 		struct in6_addr *gw_addr;
1113 		int gwa_type;
1114 
1115 		gw_addr = &cfg->fc_gateway;
1116 		ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1117 		gwa_type = ipv6_addr_type(gw_addr);
1118 
1119 		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1120 			struct rt6_info *grt;
1121 
1122 			/* IPv6 strictly inhibits using not link-local
1123 			   addresses as nexthop address.
1124 			   Otherwise, router will not able to send redirects.
1125 			   It is very good, but in some (rare!) circumstances
1126 			   (SIT, PtP, NBMA NOARP links) it is handy to allow
1127 			   some exceptions. --ANK
1128 			 */
1129 			err = -EINVAL;
1130 			if (!(gwa_type&IPV6_ADDR_UNICAST))
1131 				goto out;
1132 
			/* The gateway must itself be reachable by a strict lookup. */
1133 			grt = rt6_lookup(gw_addr, NULL, cfg->fc_ifindex, 1);
1134 
1135 			err = -EHOSTUNREACH;
1136 			if (grt == NULL)
1137 				goto out;
1138 			if (dev) {
				/* The route to the gateway must use the requested device. */
1139 				if (dev != grt->rt6i_dev) {
1140 					dst_release(&grt->u.dst);
1141 					goto out;
1142 				}
1143 			} else {
				/* No device given: adopt the one from the gateway's route. */
1144 				dev = grt->rt6i_dev;
1145 				idev = grt->rt6i_idev;
1146 				dev_hold(dev);
1147 				in6_dev_hold(grt->rt6i_idev);
1148 			}
			/* err stays -EHOSTUNREACH if the gateway is only reachable via another gateway. */
1149 			if (!(grt->rt6i_flags&RTF_GATEWAY))
1150 				err = 0;
1151 			dst_release(&grt->u.dst);
1152 
1153 			if (err)
1154 				goto out;
1155 		}
1156 		err = -EINVAL;
1157 		if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1158 			goto out;
1159 	}
1160 
1161 	err = -ENODEV;
1162 	if (dev == NULL)
1163 		goto out;
1164 
1165 	if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1166 		rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1167 		if (IS_ERR(rt->rt6i_nexthop)) {
1168 			err = PTR_ERR(rt->rt6i_nexthop);
			/* Do not leave an error pointer in the route that is about to be freed. */
1169 			rt->rt6i_nexthop = NULL;
1170 			goto out;
1171 		}
1172 	}
1173 
1174 	rt->rt6i_flags = cfg->fc_flags;
1175 
1176 install_route:
	/* Apply caller-supplied metrics; attribute type N goes into metrics[N - 1]. */
1177 	if (cfg->fc_mx) {
1178 		struct nlattr *nla;
1179 		int remaining;
1180 
1181 		nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1182 			int type = nla->nla_type;
1183 
1184 			if (type) {
1185 				if (type > RTAX_MAX) {
1186 					err = -EINVAL;
1187 					goto out;
1188 				}
1189 
1190 				rt->u.dst.metrics[type - 1] = nla_get_u32(nla);
1191 			}
1192 		}
1193 	}
1194 
	/* Fill in metrics the caller left at zero. */
1195 	if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1196 		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1197 	if (!rt->u.dst.metrics[RTAX_MTU-1])
1198 		rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1199 	if (!rt->u.dst.metrics[RTAX_ADVMSS-1])
1200 		rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
	/* The dev/idev references taken above are handed over to the route here. */
1201 	rt->u.dst.dev = dev;
1202 	rt->rt6i_idev = idev;
1203 	rt->rt6i_table = table;
1204 	return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1205 
1206 out:
	/* Failure: drop whatever references were taken and free the route. */
1207 	if (dev)
1208 		dev_put(dev);
1209 	if (idev)
1210 		in6_dev_put(idev);
1211 	if (rt)
1212 		dst_free((struct dst_entry *) rt);
1213 	return err;
1214 }
1215 
1216 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1217 {
1218 	int err;
1219 	struct fib6_table *table;
1220 
1221 	if (rt == &ip6_null_entry)
1222 		return -ENOENT;
1223 
1224 	table = rt->rt6i_table;
1225 	write_lock_bh(&table->tb6_lock);
1226 
1227 	err = fib6_del(rt, info);
1228 	dst_release(&rt->u.dst);
1229 
1230 	write_unlock_bh(&table->tb6_lock);
1231 
1232 	return err;
1233 }
1234 
/*
 * Delete @rt from its table without any netlink request information:
 * forwards to __ip6_del_rt() with a NULL nl_info and returns its result.
 */
int ip6_del_rt(struct rt6_info *rt)
{
	return __ip6_del_rt(rt, NULL);
}
1239 
1240 static int ip6_route_del(struct fib6_config *cfg)
1241 {
1242 	struct fib6_table *table;
1243 	struct fib6_node *fn;
1244 	struct rt6_info *rt;
1245 	int err = -ESRCH;
1246 
1247 	table = fib6_get_table(cfg->fc_table);
1248 	if (table == NULL)
1249 		return err;
1250 
1251 	read_lock_bh(&table->tb6_lock);
1252 
1253 	fn = fib6_locate(&table->tb6_root,
1254 			 &cfg->fc_dst, cfg->fc_dst_len,
1255 			 &cfg->fc_src, cfg->fc_src_len);
1256 
1257 	if (fn) {
1258 		for (rt = fn->leaf; rt; rt = rt->u.next) {
1259 			if (cfg->fc_ifindex &&
1260 			    (rt->rt6i_dev == NULL ||
1261 			     rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1262 				continue;
1263 			if (cfg->fc_flags & RTF_GATEWAY &&
1264 			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1265 				continue;
1266 			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1267 				continue;
1268 			dst_hold(&rt->u.dst);
1269 			read_unlock_bh(&table->tb6_lock);
1270 
1271 			return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1272 		}
1273 	}
1274 	read_unlock_bh(&table->tb6_lock);
1275 
1276 	return err;
1277 }
1278 
1279 /*
1280  *	Handle redirects
1281  */
/*
 * Process a redirect for destination @dest that was sent by @saddr.
 *
 * @neigh is the neighbour entry for the new next hop, @lladdr its
 * link-layer address as passed to neigh_update(), and @on_link is
 * non-zero when the target is the destination itself (the new route is
 * then installed without RTF_GATEWAY).
 *
 * Only RT6_TABLE_MAIN is consulted.  The redirect is accepted only if a
 * non-expired gateway route on neigh->dev whose gateway equals @saddr is
 * found; in that case a /128 RTF_CACHE|RTF_DYNAMIC clone pointing at
 * @neigh is inserted and a NETEVENT_REDIRECT notification is raised.
 */
void rt6_redirect(struct in6_addr *dest, struct in6_addr *saddr,
		  struct neighbour *neigh, u8 *lladdr, int on_link)
{
	struct rt6_info *rt, *nrt = NULL;
	struct fib6_node *fn;
	struct fib6_table *table;
	struct netevent_redirect netevent;

	/* TODO: Very lazy, might need to check all tables */
	table = fib6_get_table(RT6_TABLE_MAIN);
	if (table == NULL)
		return;

	/*
	 * Get the "current" route for this destination and
	 * check if the redirect has come from appropriate router.
	 *
	 * RFC 2461 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routes.
	 */

	read_lock_bh(&table->tb6_lock);
	fn = fib6_lookup(&table->tb6_root, dest, NULL);
restart:
	for (rt = fn->leaf; rt; rt = rt->u.next) {
		/*
		 * Current route is on-link; redirect is always invalid.
		 *
		 * Seems, previous statement is not true. It could
		 * be node, which looks for us as on-link (f.e. proxy ndisc)
		 * But then router serving it might decide, that we should
		 * know truth 8)8) --ANK (980726).
		 */
		if (rt6_check_expired(rt))
			continue;
		if (!(rt->rt6i_flags & RTF_GATEWAY))
			continue;
		if (neigh->dev != rt->rt6i_dev)
			continue;
		if (!ipv6_addr_equal(saddr, &rt->rt6i_gateway))
			continue;
		break;
	}
	if (rt)
		/* Keep the matched route alive after the lock is dropped. */
		dst_hold(&rt->u.dst);
	else if (rt6_need_strict(dest)) {
		/*
		 * No match at this node: climb towards the root and retry
		 * at the first ancestor that carries route information.
		 */
		while ((fn = fn->parent) != NULL) {
			if (fn->fn_flags & RTN_ROOT)
				break;
			if (fn->fn_flags & RTN_RTINFO)
				goto restart;
		}
	}
	read_unlock_bh(&table->tb6_lock);

	if (!rt) {
		if (net_ratelimit())
			printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
			       "for redirect target\n");
		return;
	}

	/*
	 *	We have finally decided to accept it.
	 */

	neigh_update(neigh, lladdr, NUD_STALE,
		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
		     NEIGH_UPDATE_F_OVERRIDE|
		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
				     NEIGH_UPDATE_F_ISROUTER))
		     );

	/*
	 * Redirect received -> path was valid.
	 * Look, redirects are sent only in response to data packets,
	 * so that this nexthop apparently is reachable. --ANK
	 */
	dst_confirm(&rt->u.dst);

	/* Duplicate redirect: silently ignore. */
	if (neigh == rt->u.dst.neighbour)
		goto out;

	nrt = ip6_rt_copy(rt);
	if (nrt == NULL)
		goto out;

	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
	if (on_link)
		nrt->rt6i_flags &= ~RTF_GATEWAY;

	/* The clone is a host route for exactly @dest. */
	ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
	nrt->rt6i_dst.plen = 128;
	nrt->u.dst.flags |= DST_HOST;

	ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
	nrt->rt6i_nexthop = neigh_clone(neigh);
	/* Reset pmtu, it may be better */
	nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
	nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst));

	if (ip6_ins_rt(nrt))
		goto out;

	netevent.old = &rt->u.dst;
	netevent.new = &nrt->u.dst;
	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);

	/*
	 * A superseded cache entry is removed; ip6_del_rt() also drops
	 * the reference taken above, hence the direct return.
	 */
	if (rt->rt6i_flags&RTF_CACHE) {
		ip6_del_rt(rt);
		return;
	}

out:
        dst_release(&rt->u.dst);
	return;
}
1403 
1404 /*
1405  *	Handle ICMP "packet too big" messages
1406  *	i.e. Path MTU discovery
1407  */
1408 
1409 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1410 			struct net_device *dev, u32 pmtu)
1411 {
1412 	struct rt6_info *rt, *nrt;
1413 	int allfrag = 0;
1414 
1415 	rt = rt6_lookup(daddr, saddr, dev->ifindex, 0);
1416 	if (rt == NULL)
1417 		return;
1418 
1419 	if (pmtu >= dst_mtu(&rt->u.dst))
1420 		goto out;
1421 
1422 	if (pmtu < IPV6_MIN_MTU) {
1423 		/*
1424 		 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1425 		 * MTU (1280) and a fragment header should always be included
1426 		 * after a node receiving Too Big message reporting PMTU is
1427 		 * less than the IPv6 Minimum Link MTU.
1428 		 */
1429 		pmtu = IPV6_MIN_MTU;
1430 		allfrag = 1;
1431 	}
1432 
1433 	/* New mtu received -> path was valid.
1434 	   They are sent only in response to data packets,
1435 	   so that this nexthop apparently is reachable. --ANK
1436 	 */
1437 	dst_confirm(&rt->u.dst);
1438 
1439 	/* Host route. If it is static, it would be better
1440 	   not to override it, but add new one, so that
1441 	   when cache entry will expire old pmtu
1442 	   would return automatically.
1443 	 */
1444 	if (rt->rt6i_flags & RTF_CACHE) {
1445 		rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1446 		if (allfrag)
1447 			rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1448 		dst_set_expires(&rt->u.dst, ip6_rt_mtu_expires);
1449 		rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1450 		goto out;
1451 	}
1452 
1453 	/* Network route.
1454 	   Two cases are possible:
1455 	   1. It is connected route. Action: COW
1456 	   2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1457 	 */
1458 	if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1459 		nrt = rt6_alloc_cow(rt, daddr, saddr);
1460 	else
1461 		nrt = rt6_alloc_clone(rt, daddr);
1462 
1463 	if (nrt) {
1464 		nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1465 		if (allfrag)
1466 			nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1467 
1468 		/* According to RFC 1981, detecting PMTU increase shouldn't be
1469 		 * happened within 5 mins, the recommended timer is 10 mins.
1470 		 * Here this route expiration time is set to ip6_rt_mtu_expires
1471 		 * which is 10 mins. After 10 mins the decreased pmtu is expired
1472 		 * and detecting PMTU increase will be automatically happened.
1473 		 */
1474 		dst_set_expires(&nrt->u.dst, ip6_rt_mtu_expires);
1475 		nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1476 
1477 		ip6_ins_rt(nrt);
1478 	}
1479 out:
1480 	dst_release(&rt->u.dst);
1481 }
1482 
1483 /*
1484  *	Misc support functions
1485  */
1486 
1487 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1488 {
1489 	struct rt6_info *rt = ip6_dst_alloc();
1490 
1491 	if (rt) {
1492 		rt->u.dst.input = ort->u.dst.input;
1493 		rt->u.dst.output = ort->u.dst.output;
1494 
1495 		memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1496 		rt->u.dst.dev = ort->u.dst.dev;
1497 		if (rt->u.dst.dev)
1498 			dev_hold(rt->u.dst.dev);
1499 		rt->rt6i_idev = ort->rt6i_idev;
1500 		if (rt->rt6i_idev)
1501 			in6_dev_hold(rt->rt6i_idev);
1502 		rt->u.dst.lastuse = jiffies;
1503 		rt->rt6i_expires = 0;
1504 
1505 		ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1506 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1507 		rt->rt6i_metric = 0;
1508 
1509 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1510 #ifdef CONFIG_IPV6_SUBTREES
1511 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1512 #endif
1513 		rt->rt6i_table = ort->rt6i_table;
1514 	}
1515 	return rt;
1516 }
1517 
1518 #ifdef CONFIG_IPV6_ROUTE_INFO
1519 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
1520 					   struct in6_addr *gwaddr, int ifindex)
1521 {
1522 	struct fib6_node *fn;
1523 	struct rt6_info *rt = NULL;
1524 	struct fib6_table *table;
1525 
1526 	table = fib6_get_table(RT6_TABLE_INFO);
1527 	if (table == NULL)
1528 		return NULL;
1529 
1530 	write_lock_bh(&table->tb6_lock);
1531 	fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1532 	if (!fn)
1533 		goto out;
1534 
1535 	for (rt = fn->leaf; rt; rt = rt->u.next) {
1536 		if (rt->rt6i_dev->ifindex != ifindex)
1537 			continue;
1538 		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1539 			continue;
1540 		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1541 			continue;
1542 		dst_hold(&rt->u.dst);
1543 		break;
1544 	}
1545 out:
1546 	write_unlock_bh(&table->tb6_lock);
1547 	return rt;
1548 }
1549 
1550 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
1551 					   struct in6_addr *gwaddr, int ifindex,
1552 					   unsigned pref)
1553 {
1554 	struct fib6_config cfg = {
1555 		.fc_table	= RT6_TABLE_INFO,
1556 		.fc_metric	= 1024,
1557 		.fc_ifindex	= ifindex,
1558 		.fc_dst_len	= prefixlen,
1559 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1560 				  RTF_UP | RTF_PREF(pref),
1561 	};
1562 
1563 	ipv6_addr_copy(&cfg.fc_dst, prefix);
1564 	ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1565 
1566 	/* We should treat it as a default route if prefix length is 0. */
1567 	if (!prefixlen)
1568 		cfg.fc_flags |= RTF_DEFAULT;
1569 
1570 	ip6_route_add(&cfg);
1571 
1572 	return rt6_get_route_info(prefix, prefixlen, gwaddr, ifindex);
1573 }
1574 #endif
1575 
1576 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1577 {
1578 	struct rt6_info *rt;
1579 	struct fib6_table *table;
1580 
1581 	table = fib6_get_table(RT6_TABLE_DFLT);
1582 	if (table == NULL)
1583 		return NULL;
1584 
1585 	write_lock_bh(&table->tb6_lock);
1586 	for (rt = table->tb6_root.leaf; rt; rt=rt->u.next) {
1587 		if (dev == rt->rt6i_dev &&
1588 		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1589 		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
1590 			break;
1591 	}
1592 	if (rt)
1593 		dst_hold(&rt->u.dst);
1594 	write_unlock_bh(&table->tb6_lock);
1595 	return rt;
1596 }
1597 
1598 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1599 				     struct net_device *dev,
1600 				     unsigned int pref)
1601 {
1602 	struct fib6_config cfg = {
1603 		.fc_table	= RT6_TABLE_DFLT,
1604 		.fc_metric	= 1024,
1605 		.fc_ifindex	= dev->ifindex,
1606 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1607 				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1608 	};
1609 
1610 	ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1611 
1612 	ip6_route_add(&cfg);
1613 
1614 	return rt6_get_dflt_router(gwaddr, dev);
1615 }
1616 
1617 void rt6_purge_dflt_routers(void)
1618 {
1619 	struct rt6_info *rt;
1620 	struct fib6_table *table;
1621 
1622 	/* NOTE: Keep consistent with rt6_get_dflt_router */
1623 	table = fib6_get_table(RT6_TABLE_DFLT);
1624 	if (table == NULL)
1625 		return;
1626 
1627 restart:
1628 	read_lock_bh(&table->tb6_lock);
1629 	for (rt = table->tb6_root.leaf; rt; rt = rt->u.next) {
1630 		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1631 			dst_hold(&rt->u.dst);
1632 			read_unlock_bh(&table->tb6_lock);
1633 			ip6_del_rt(rt);
1634 			goto restart;
1635 		}
1636 	}
1637 	read_unlock_bh(&table->tb6_lock);
1638 }
1639 
1640 static void rtmsg_to_fib6_config(struct in6_rtmsg *rtmsg,
1641 				 struct fib6_config *cfg)
1642 {
1643 	memset(cfg, 0, sizeof(*cfg));
1644 
1645 	cfg->fc_table = RT6_TABLE_MAIN;
1646 	cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1647 	cfg->fc_metric = rtmsg->rtmsg_metric;
1648 	cfg->fc_expires = rtmsg->rtmsg_info;
1649 	cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1650 	cfg->fc_src_len = rtmsg->rtmsg_src_len;
1651 	cfg->fc_flags = rtmsg->rtmsg_flags;
1652 
1653 	ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1654 	ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1655 	ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1656 }
1657 
1658 int ipv6_route_ioctl(unsigned int cmd, void __user *arg)
1659 {
1660 	struct fib6_config cfg;
1661 	struct in6_rtmsg rtmsg;
1662 	int err;
1663 
1664 	switch(cmd) {
1665 	case SIOCADDRT:		/* Add a route */
1666 	case SIOCDELRT:		/* Delete a route */
1667 		if (!capable(CAP_NET_ADMIN))
1668 			return -EPERM;
1669 		err = copy_from_user(&rtmsg, arg,
1670 				     sizeof(struct in6_rtmsg));
1671 		if (err)
1672 			return -EFAULT;
1673 
1674 		rtmsg_to_fib6_config(&rtmsg, &cfg);
1675 
1676 		rtnl_lock();
1677 		switch (cmd) {
1678 		case SIOCADDRT:
1679 			err = ip6_route_add(&cfg);
1680 			break;
1681 		case SIOCDELRT:
1682 			err = ip6_route_del(&cfg);
1683 			break;
1684 		default:
1685 			err = -EINVAL;
1686 		}
1687 		rtnl_unlock();
1688 
1689 		return err;
1690 	};
1691 
1692 	return -EINVAL;
1693 }
1694 
1695 /*
1696  *	Drop the packet on the floor
1697  */
1698 
1699 static int ip6_pkt_discard(struct sk_buff *skb)
1700 {
1701 	int type = ipv6_addr_type(&skb->nh.ipv6h->daddr);
1702 	if (type == IPV6_ADDR_ANY || type == IPV6_ADDR_RESERVED)
1703 		IP6_INC_STATS(IPSTATS_MIB_INADDRERRORS);
1704 
1705 	IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
1706 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_NOROUTE, 0, skb->dev);
1707 	kfree_skb(skb);
1708 	return 0;
1709 }
1710 
/*
 * Output-path variant of ip6_pkt_discard(): first points skb->dev at
 * the device of the attached dst entry, then discards the packet.
 */
static int ip6_pkt_discard_out(struct sk_buff *skb)
{
	skb->dev = skb->dst->dev;
	return ip6_pkt_discard(skb);
}
1716 
1717 /*
1718  *	Allocate a dst for local (unicast / anycast) address.
1719  */
1720 
1721 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1722 				    const struct in6_addr *addr,
1723 				    int anycast)
1724 {
1725 	struct rt6_info *rt = ip6_dst_alloc();
1726 
1727 	if (rt == NULL)
1728 		return ERR_PTR(-ENOMEM);
1729 
1730 	dev_hold(&loopback_dev);
1731 	in6_dev_hold(idev);
1732 
1733 	rt->u.dst.flags = DST_HOST;
1734 	rt->u.dst.input = ip6_input;
1735 	rt->u.dst.output = ip6_output;
1736 	rt->rt6i_dev = &loopback_dev;
1737 	rt->rt6i_idev = idev;
1738 	rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1739 	rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1740 	rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1741 	rt->u.dst.obsolete = -1;
1742 
1743 	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1744 	if (anycast)
1745 		rt->rt6i_flags |= RTF_ANYCAST;
1746 	else
1747 		rt->rt6i_flags |= RTF_LOCAL;
1748 	rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1749 	if (rt->rt6i_nexthop == NULL) {
1750 		dst_free((struct dst_entry *) rt);
1751 		return ERR_PTR(-ENOMEM);
1752 	}
1753 
1754 	ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1755 	rt->rt6i_dst.plen = 128;
1756 	rt->rt6i_table = fib6_get_table(RT6_TABLE_LOCAL);
1757 
1758 	atomic_set(&rt->u.dst.__refcnt, 1);
1759 
1760 	return rt;
1761 }
1762 
1763 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1764 {
1765 	if (((void*)rt->rt6i_dev == arg || arg == NULL) &&
1766 	    rt != &ip6_null_entry) {
1767 		RT6_TRACE("deleted by ifdown %p\n", rt);
1768 		return -1;
1769 	}
1770 	return 0;
1771 }
1772 
/*
 * Run fib6_ifdown() over all routes via fib6_clean_all(), passing @dev
 * as the match argument (a NULL @dev makes fib6_ifdown() match every
 * device).
 */
void rt6_ifdown(struct net_device *dev)
{
	fib6_clean_all(fib6_ifdown, 0, dev);
}
1777 
/* Argument bundle handed to rt6_mtu_change_route() by rt6_mtu_change(). */
struct rt6_mtu_change_arg
{
	struct net_device *dev;	/* device whose routes are examined */
	unsigned mtu;		/* new MTU to apply to those routes */
};
1783 
1784 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
1785 {
1786 	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
1787 	struct inet6_dev *idev;
1788 
1789 	/* In IPv6 pmtu discovery is not optional,
1790 	   so that RTAX_MTU lock cannot disable it.
1791 	   We still use this lock to block changes
1792 	   caused by addrconf/ndisc.
1793 	*/
1794 
1795 	idev = __in6_dev_get(arg->dev);
1796 	if (idev == NULL)
1797 		return 0;
1798 
1799 	/* For administrative MTU increase, there is no way to discover
1800 	   IPv6 PMTU increase, so PMTU increase should be updated here.
1801 	   Since RFC 1981 doesn't include administrative MTU increase
1802 	   update PMTU increase is a MUST. (i.e. jumbo frame)
1803 	 */
1804 	/*
1805 	   If new MTU is less than route PMTU, this new MTU will be the
1806 	   lowest MTU in the path, update the route PMTU to reflect PMTU
1807 	   decreases; if new MTU is greater than route PMTU, and the
1808 	   old MTU is the lowest MTU in the path, update the route PMTU
1809 	   to reflect the increase. In this case if the other nodes' MTU
1810 	   also have the lowest MTU, TOO BIG MESSAGE will be lead to
1811 	   PMTU discouvery.
1812 	 */
1813 	if (rt->rt6i_dev == arg->dev &&
1814 	    !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1815             (dst_mtu(&rt->u.dst) > arg->mtu ||
1816              (dst_mtu(&rt->u.dst) < arg->mtu &&
1817 	      dst_mtu(&rt->u.dst) == idev->cnf.mtu6)))
1818 		rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
1819 	rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(arg->mtu);
1820 	return 0;
1821 }
1822 
/*
 * Apply a new MTU @mtu of @dev to the routing tables by running
 * rt6_mtu_change_route() over all routes via fib6_clean_all().
 */
void rt6_mtu_change(struct net_device *dev, unsigned mtu)
{
	struct rt6_mtu_change_arg arg = {
		.dev = dev,
		.mtu = mtu,
	};

	fib6_clean_all(rt6_mtu_change_route, 0, &arg);
}
1832 
1833 static struct nla_policy rtm_ipv6_policy[RTA_MAX+1] __read_mostly = {
1834 	[RTA_GATEWAY]           = { .minlen = sizeof(struct in6_addr) },
1835 	[RTA_OIF]               = { .type = NLA_U32 },
1836 	[RTA_PRIORITY]          = { .type = NLA_U32 },
1837 	[RTA_METRICS]           = { .type = NLA_NESTED },
1838 };
1839 
1840 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
1841 			      struct fib6_config *cfg)
1842 {
1843 	struct rtmsg *rtm;
1844 	struct nlattr *tb[RTA_MAX+1];
1845 	int err;
1846 
1847 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
1848 	if (err < 0)
1849 		goto errout;
1850 
1851 	err = -EINVAL;
1852 	rtm = nlmsg_data(nlh);
1853 	memset(cfg, 0, sizeof(*cfg));
1854 
1855 	cfg->fc_table = rtm->rtm_table;
1856 	cfg->fc_dst_len = rtm->rtm_dst_len;
1857 	cfg->fc_src_len = rtm->rtm_src_len;
1858 	cfg->fc_flags = RTF_UP;
1859 	cfg->fc_protocol = rtm->rtm_protocol;
1860 
1861 	if (rtm->rtm_type == RTN_UNREACHABLE)
1862 		cfg->fc_flags |= RTF_REJECT;
1863 
1864 	cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
1865 	cfg->fc_nlinfo.nlh = nlh;
1866 
1867 	if (tb[RTA_GATEWAY]) {
1868 		nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
1869 		cfg->fc_flags |= RTF_GATEWAY;
1870 	}
1871 
1872 	if (tb[RTA_DST]) {
1873 		int plen = (rtm->rtm_dst_len + 7) >> 3;
1874 
1875 		if (nla_len(tb[RTA_DST]) < plen)
1876 			goto errout;
1877 
1878 		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
1879 	}
1880 
1881 	if (tb[RTA_SRC]) {
1882 		int plen = (rtm->rtm_src_len + 7) >> 3;
1883 
1884 		if (nla_len(tb[RTA_SRC]) < plen)
1885 			goto errout;
1886 
1887 		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
1888 	}
1889 
1890 	if (tb[RTA_OIF])
1891 		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
1892 
1893 	if (tb[RTA_PRIORITY])
1894 		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
1895 
1896 	if (tb[RTA_METRICS]) {
1897 		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
1898 		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
1899 	}
1900 
1901 	if (tb[RTA_TABLE])
1902 		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
1903 
1904 	err = 0;
1905 errout:
1906 	return err;
1907 }
1908 
/*
 * rtnetlink handler for route deletion: converts the request into a
 * fib6_config and passes it to ip6_route_del().  @arg is unused here.
 * Returns the conversion error if negative, else ip6_route_del()'s
 * result.
 */
int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
{
	struct fib6_config cfg;
	int err;

	err = rtm_to_fib6_config(skb, nlh, &cfg);
	if (err < 0)
		return err;

	return ip6_route_del(&cfg);
}
1920 
/*
 * rtnetlink handler for route creation: converts the request into a
 * fib6_config and passes it to ip6_route_add().  @arg is unused here.
 * Returns the conversion error if negative, else ip6_route_add()'s
 * result.
 */
int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
{
	struct fib6_config cfg;
	int err;

	err = rtm_to_fib6_config(skb, nlh, &cfg);
	if (err < 0)
		return err;

	return ip6_route_add(&cfg);
}
1932 
/*
 * Append one route message describing @rt to @skb.
 *
 * @dst / @src:  when non-NULL they are emitted as RTA_DST / RTA_SRC with
 *               a prefix length of 128 instead of the route's own keys
 *               (RTA_SRC only with CONFIG_IPV6_SUBTREES).
 * @iif:         when non-zero emitted as RTA_IIF; otherwise, if @dst is
 *               set, a preferred source address is looked up and
 *               emitted as RTA_PREFSRC.
 * @type, @pid, @seq, @flags: netlink header fields for nlmsg_put().
 * @prefix:      when non-zero only routes flagged RTF_PREFIX_RT are
 *               emitted.
 *
 * Returns 1 when the route is skipped because of @prefix, -ENOBUFS when
 * the header does not fit, the result of nlmsg_end() on success, and
 * the result of nlmsg_cancel() when an attribute does not fit.
 */
static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
			 struct in6_addr *dst, struct in6_addr *src,
			 int iif, int type, u32 pid, u32 seq,
			 int prefix, unsigned int flags)
{
	struct rtmsg *rtm;
	struct nlmsghdr *nlh;
	struct rta_cacheinfo ci;
	u32 table;

	if (prefix) {	/* user wants prefix routes only */
		if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
			/* success since this is not a prefix route */
			return 1;
		}
	}

	nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
	if (nlh == NULL)
		return -ENOBUFS;

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt->rt6i_dst.plen;
	rtm->rtm_src_len = rt->rt6i_src.plen;
	rtm->rtm_tos = 0;
	if (rt->rt6i_table)
		table = rt->rt6i_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	/* The table id goes both into the header and into RTA_TABLE. */
	rtm->rtm_table = table;
	NLA_PUT_U32(skb, RTA_TABLE, table);
	if (rt->rt6i_flags&RTF_REJECT)
		rtm->rtm_type = RTN_UNREACHABLE;
	else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
		rtm->rtm_type = RTN_LOCAL;
	else
		rtm->rtm_type = RTN_UNICAST;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	/* Start from the stored protocol; route flags may override it. */
	rtm->rtm_protocol = rt->rt6i_protocol;
	if (rt->rt6i_flags&RTF_DYNAMIC)
		rtm->rtm_protocol = RTPROT_REDIRECT;
	else if (rt->rt6i_flags & RTF_ADDRCONF)
		rtm->rtm_protocol = RTPROT_KERNEL;
	else if (rt->rt6i_flags&RTF_DEFAULT)
		rtm->rtm_protocol = RTPROT_RA;

	if (rt->rt6i_flags&RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	if (dst) {
		NLA_PUT(skb, RTA_DST, 16, dst);
	        rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		NLA_PUT(skb, RTA_SRC, 16, src);
	        rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len)
		NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
#endif
	if (iif)
		NLA_PUT_U32(skb, RTA_IIF, iif);
	else if (dst) {
		struct in6_addr saddr_buf;
		if (ipv6_get_saddr(&rt->u.dst, dst, &saddr_buf) == 0)
			NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
	}

	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
		goto nla_put_failure;

	/* The gateway is reported from the attached neighbour entry. */
	if (rt->u.dst.neighbour)
		NLA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);

	if (rt->u.dst.dev)
		NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);

	NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
	/* Cache statistics; times are converted from jiffies to clock_t. */
	ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
	if (rt->rt6i_expires)
		ci.rta_expires = jiffies_to_clock_t(rt->rt6i_expires - jiffies);
	else
		ci.rta_expires = 0;
	ci.rta_used = rt->u.dst.__use;
	ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
	ci.rta_error = rt->u.dst.error;
	ci.rta_id = 0;
	ci.rta_ts = 0;
	ci.rta_tsage = 0;
	NLA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);

	return nlmsg_end(skb, nlh);

	/* Target of the failure jump used by this function's attribute puts. */
nla_put_failure:
	return nlmsg_cancel(skb, nlh);
}
2032 
2033 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2034 {
2035 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2036 	int prefix;
2037 
2038 	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2039 		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2040 		prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2041 	} else
2042 		prefix = 0;
2043 
2044 	return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2045 		     NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2046 		     prefix, NLM_F_MULTI);
2047 }
2048 
2049 int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2050 {
2051 	struct rtattr **rta = arg;
2052 	int iif = 0;
2053 	int err = -ENOBUFS;
2054 	struct sk_buff *skb;
2055 	struct flowi fl;
2056 	struct rt6_info *rt;
2057 
2058 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2059 	if (skb == NULL)
2060 		goto out;
2061 
2062 	/* Reserve room for dummy headers, this skb can pass
2063 	   through good chunk of routing engine.
2064 	 */
2065 	skb->mac.raw = skb->data;
2066 	skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2067 
2068 	memset(&fl, 0, sizeof(fl));
2069 	if (rta[RTA_SRC-1])
2070 		ipv6_addr_copy(&fl.fl6_src,
2071 			       (struct in6_addr*)RTA_DATA(rta[RTA_SRC-1]));
2072 	if (rta[RTA_DST-1])
2073 		ipv6_addr_copy(&fl.fl6_dst,
2074 			       (struct in6_addr*)RTA_DATA(rta[RTA_DST-1]));
2075 
2076 	if (rta[RTA_IIF-1])
2077 		memcpy(&iif, RTA_DATA(rta[RTA_IIF-1]), sizeof(int));
2078 
2079 	if (iif) {
2080 		struct net_device *dev;
2081 		dev = __dev_get_by_index(iif);
2082 		if (!dev) {
2083 			err = -ENODEV;
2084 			goto out_free;
2085 		}
2086 	}
2087 
2088 	fl.oif = 0;
2089 	if (rta[RTA_OIF-1])
2090 		memcpy(&fl.oif, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
2091 
2092 	rt = (struct rt6_info*)ip6_route_output(NULL, &fl);
2093 
2094 	skb->dst = &rt->u.dst;
2095 
2096 	NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
2097 	err = rt6_fill_node(skb, rt,
2098 			    &fl.fl6_dst, &fl.fl6_src,
2099 			    iif,
2100 			    RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2101 			    nlh->nlmsg_seq, 0, 0);
2102 	if (err < 0) {
2103 		err = -EMSGSIZE;
2104 		goto out_free;
2105 	}
2106 
2107 	err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
2108 out:
2109 	return err;
2110 out_free:
2111 	kfree_skb(skb);
2112 	goto out;
2113 }
2114 
2115 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2116 {
2117 	struct sk_buff *skb;
2118 	u32 pid = 0, seq = 0;
2119 	struct nlmsghdr *nlh = NULL;
2120 	int payload = sizeof(struct rtmsg) + 256;
2121 	int err = -ENOBUFS;
2122 
2123 	if (info) {
2124 		pid = info->pid;
2125 		nlh = info->nlh;
2126 		if (nlh)
2127 			seq = nlh->nlmsg_seq;
2128 	}
2129 
2130 	skb = nlmsg_new(nlmsg_total_size(payload), gfp_any());
2131 	if (skb == NULL)
2132 		goto errout;
2133 
2134 	err = rt6_fill_node(skb, rt, NULL, NULL, 0, event, pid, seq, 0, 0);
2135 	if (err < 0) {
2136 		kfree_skb(skb);
2137 		goto errout;
2138 	}
2139 
2140 	err = rtnl_notify(skb, pid, RTNLGRP_IPV6_ROUTE, nlh, gfp_any());
2141 errout:
2142 	if (err < 0)
2143 		rtnl_set_sk_err(RTNLGRP_IPV6_ROUTE, err);
2144 }
2145 
2146 /*
2147  *	/proc
2148  */
2149 
2150 #ifdef CONFIG_PROC_FS
2151 
/*
 * Nominal length in bytes of one route line produced by
 * rt6_info_route(); used to convert a read offset into a number of
 * lines to skip.
 */
#define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)

/* State threaded through rt6_info_route() while filling a proc read. */
struct rt6_proc_arg
{
	char *buffer;	/* output buffer */
	int offset;	/* read offset requested by the caller */
	int length;	/* number of bytes requested */
	int skip;	/* route lines skipped so far */
	int len;	/* bytes written to buffer so far */
};
2162 
/*
 * fib6_clean_all() callback that formats one route as a text line into
 * the buffer described by @p_arg (a struct rt6_proc_arg).
 *
 * Whole lines lying before the requested offset are skipped, and
 * nothing more is written once arg->len has reached arg->length.
 * Line layout: destination address and prefix length, source address
 * and prefix length (zeros without CONFIG_IPV6_SUBTREES), next-hop
 * address (zeros when there is none), then metric, reference count,
 * use count, flags and device name.  Always returns 0.
 *
 * NOTE(review): the only bound checked is arg->len >= arg->length
 * before a line is started, so a line may extend past arg->length;
 * presumably the caller's buffer has slack for that -- confirm.
 */
static int rt6_info_route(struct rt6_info *rt, void *p_arg)
{
	struct rt6_proc_arg *arg = (struct rt6_proc_arg *) p_arg;
	int i;

	/* Skip the lines that precede the requested offset. */
	if (arg->skip < arg->offset / RT6_INFO_LEN) {
		arg->skip++;
		return 0;
	}

	if (arg->len >= arg->length)
		return 0;

	/* Destination address as 32 hex digits, then its prefix length. */
	for (i=0; i<16; i++) {
		sprintf(arg->buffer + arg->len, "%02x",
			rt->rt6i_dst.addr.s6_addr[i]);
		arg->len += 2;
	}
	arg->len += sprintf(arg->buffer + arg->len, " %02x ",
			    rt->rt6i_dst.plen);

#ifdef CONFIG_IPV6_SUBTREES
	/* Source address and prefix length. */
	for (i=0; i<16; i++) {
		sprintf(arg->buffer + arg->len, "%02x",
			rt->rt6i_src.addr.s6_addr[i]);
		arg->len += 2;
	}
	arg->len += sprintf(arg->buffer + arg->len, " %02x ",
			    rt->rt6i_src.plen);
#else
	/* No source routing: emit an all-zero source column (36 chars). */
	sprintf(arg->buffer + arg->len,
		"00000000000000000000000000000000 00 ");
	arg->len += 36;
#endif

	/* Next hop, taken from the neighbour entry's key when present. */
	if (rt->rt6i_nexthop) {
		for (i=0; i<16; i++) {
			sprintf(arg->buffer + arg->len, "%02x",
				rt->rt6i_nexthop->primary_key[i]);
			arg->len += 2;
		}
	} else {
		sprintf(arg->buffer + arg->len,
			"00000000000000000000000000000000");
		arg->len += 32;
	}
	arg->len += sprintf(arg->buffer + arg->len,
			    " %08x %08x %08x %08x %8s\n",
			    rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
			    rt->u.dst.__use, rt->rt6i_flags,
			    rt->rt6i_dev ? rt->rt6i_dev->name : "");
	return 0;
}
2216 
2217 static int rt6_proc_info(char *buffer, char **start, off_t offset, int length)
2218 {
2219 	struct rt6_proc_arg arg = {
2220 		.buffer = buffer,
2221 		.offset = offset,
2222 		.length = length,
2223 	};
2224 
2225 	fib6_clean_all(rt6_info_route, 0, &arg);
2226 
2227 	*start = buffer;
2228 	if (offset)
2229 		*start += offset % RT6_INFO_LEN;
2230 
2231 	arg.len -= offset % RT6_INFO_LEN;
2232 
2233 	if (arg.len > length)
2234 		arg.len = length;
2235 	if (arg.len < 0)
2236 		arg.len = 0;
2237 
2238 	return arg.len;
2239 }
2240 
/*
 * seq_file show routine: prints one line of seven hexadecimal counters
 * -- the rt6_stats fields fib_nodes, fib_route_nodes, fib_rt_alloc,
 * fib_rt_entries and fib_rt_cache, the current ip6_dst_ops entry count,
 * and fib_discarded_routes.  @v is unused.  Always returns 0.
 */
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
		      rt6_stats.fib_nodes, rt6_stats.fib_route_nodes,
		      rt6_stats.fib_rt_alloc, rt6_stats.fib_rt_entries,
		      rt6_stats.fib_rt_cache,
		      atomic_read(&ip6_dst_ops.entries),
		      rt6_stats.fib_discarded_routes);

	return 0;
}
2252 
/*
 * open() handler for the statistics file: binds rt6_stats_seq_show()
 * as a single-record seq_file with no private data.
 */
static int rt6_stats_seq_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt6_stats_seq_show, NULL);
}
2257 
/*
 * File operations for the statistics file: opened through
 * rt6_stats_seq_open(), read and seeked with the generic seq_file
 * helpers, released with single_release().
 */
static struct file_operations rt6_stats_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt6_stats_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = single_release,
};
2265 #endif	/* CONFIG_PROC_FS */
2266 
2267 #ifdef CONFIG_SYSCTL
2268 
2269 static int flush_delay;
2270 
2271 static
2272 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
2273 			      void __user *buffer, size_t *lenp, loff_t *ppos)
2274 {
2275 	if (write) {
2276 		proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2277 		fib6_run_gc(flush_delay <= 0 ? ~0UL : (unsigned long)flush_delay);
2278 		return 0;
2279 	} else
2280 		return -EINVAL;
2281 }
2282 
2283 ctl_table ipv6_route_table[] = {
2284         {
2285 		.ctl_name	=	NET_IPV6_ROUTE_FLUSH,
2286 		.procname	=	"flush",
2287          	.data		=	&flush_delay,
2288 		.maxlen		=	sizeof(int),
2289 		.mode		=	0200,
2290          	.proc_handler	=	&ipv6_sysctl_rtcache_flush
2291 	},
2292 	{
2293 		.ctl_name	=	NET_IPV6_ROUTE_GC_THRESH,
2294 		.procname	=	"gc_thresh",
2295          	.data		=	&ip6_dst_ops.gc_thresh,
2296 		.maxlen		=	sizeof(int),
2297 		.mode		=	0644,
2298          	.proc_handler	=	&proc_dointvec,
2299 	},
2300 	{
2301 		.ctl_name	=	NET_IPV6_ROUTE_MAX_SIZE,
2302 		.procname	=	"max_size",
2303          	.data		=	&ip6_rt_max_size,
2304 		.maxlen		=	sizeof(int),
2305 		.mode		=	0644,
2306          	.proc_handler	=	&proc_dointvec,
2307 	},
2308 	{
2309 		.ctl_name	=	NET_IPV6_ROUTE_GC_MIN_INTERVAL,
2310 		.procname	=	"gc_min_interval",
2311          	.data		=	&ip6_rt_gc_min_interval,
2312 		.maxlen		=	sizeof(int),
2313 		.mode		=	0644,
2314          	.proc_handler	=	&proc_dointvec_jiffies,
2315 		.strategy	=	&sysctl_jiffies,
2316 	},
2317 	{
2318 		.ctl_name	=	NET_IPV6_ROUTE_GC_TIMEOUT,
2319 		.procname	=	"gc_timeout",
2320          	.data		=	&ip6_rt_gc_timeout,
2321 		.maxlen		=	sizeof(int),
2322 		.mode		=	0644,
2323          	.proc_handler	=	&proc_dointvec_jiffies,
2324 		.strategy	=	&sysctl_jiffies,
2325 	},
2326 	{
2327 		.ctl_name	=	NET_IPV6_ROUTE_GC_INTERVAL,
2328 		.procname	=	"gc_interval",
2329          	.data		=	&ip6_rt_gc_interval,
2330 		.maxlen		=	sizeof(int),
2331 		.mode		=	0644,
2332          	.proc_handler	=	&proc_dointvec_jiffies,
2333 		.strategy	=	&sysctl_jiffies,
2334 	},
2335 	{
2336 		.ctl_name	=	NET_IPV6_ROUTE_GC_ELASTICITY,
2337 		.procname	=	"gc_elasticity",
2338          	.data		=	&ip6_rt_gc_elasticity,
2339 		.maxlen		=	sizeof(int),
2340 		.mode		=	0644,
2341          	.proc_handler	=	&proc_dointvec_jiffies,
2342 		.strategy	=	&sysctl_jiffies,
2343 	},
2344 	{
2345 		.ctl_name	=	NET_IPV6_ROUTE_MTU_EXPIRES,
2346 		.procname	=	"mtu_expires",
2347          	.data		=	&ip6_rt_mtu_expires,
2348 		.maxlen		=	sizeof(int),
2349 		.mode		=	0644,
2350          	.proc_handler	=	&proc_dointvec_jiffies,
2351 		.strategy	=	&sysctl_jiffies,
2352 	},
2353 	{
2354 		.ctl_name	=	NET_IPV6_ROUTE_MIN_ADVMSS,
2355 		.procname	=	"min_adv_mss",
2356          	.data		=	&ip6_rt_min_advmss,
2357 		.maxlen		=	sizeof(int),
2358 		.mode		=	0644,
2359          	.proc_handler	=	&proc_dointvec_jiffies,
2360 		.strategy	=	&sysctl_jiffies,
2361 	},
2362 	{
2363 		.ctl_name	=	NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,
2364 		.procname	=	"gc_min_interval_ms",
2365          	.data		=	&ip6_rt_gc_min_interval,
2366 		.maxlen		=	sizeof(int),
2367 		.mode		=	0644,
2368          	.proc_handler	=	&proc_dointvec_ms_jiffies,
2369 		.strategy	=	&sysctl_ms_jiffies,
2370 	},
2371 	{ .ctl_name = 0 }
2372 };
2373 
2374 #endif
2375 
2376 void __init ip6_route_init(void)
2377 {
2378 	struct proc_dir_entry *p;
2379 
2380 	ip6_dst_ops.kmem_cachep = kmem_cache_create("ip6_dst_cache",
2381 						     sizeof(struct rt6_info),
2382 						     0, SLAB_HWCACHE_ALIGN,
2383 						     NULL, NULL);
2384 	if (!ip6_dst_ops.kmem_cachep)
2385 		panic("cannot create ip6_dst_cache");
2386 
2387 	fib6_init();
2388 #ifdef 	CONFIG_PROC_FS
2389 	p = proc_net_create("ipv6_route", 0, rt6_proc_info);
2390 	if (p)
2391 		p->owner = THIS_MODULE;
2392 
2393 	proc_net_fops_create("rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2394 #endif
2395 #ifdef CONFIG_XFRM
2396 	xfrm6_init();
2397 #endif
2398 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2399 	fib6_rules_init();
2400 #endif
2401 }
2402 
2403 void ip6_route_cleanup(void)
2404 {
2405 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2406 	fib6_rules_cleanup();
2407 #endif
2408 #ifdef CONFIG_PROC_FS
2409 	proc_net_remove("ipv6_route");
2410 	proc_net_remove("rt6_stats");
2411 #endif
2412 #ifdef CONFIG_XFRM
2413 	xfrm6_fini();
2414 #endif
2415 	rt6_ifdown(NULL);
2416 	fib6_gc_cleanup();
2417 	kmem_cache_destroy(ip6_dst_ops.kmem_cachep);
2418 }
2419