xref: /openbmc/linux/net/ipv6/route.c (revision 5d0bbeeb144f631150881712607345c532e38e7e)
1 /*
2  *	Linux INET6 implementation
3  *	FIB front-end.
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	$Id: route.c,v 1.56 2001/10/31 21:55:55 davem Exp $
9  *
10  *	This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  */
15 
16 /*	Changes:
17  *
18  *	YOSHIFUJI Hideaki @USAGI
19  *		reworked default router selection.
20  *		- respect outgoing interface
21  *		- select from (probably) reachable routers (i.e.
22  *		routers in REACHABLE, STALE, DELAY or PROBE states).
23  *		- always select the same router if it is (probably)
24  *		reachable.  otherwise, round-robin the list.
25  */
26 
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/init.h>
38 #include <linux/netlink.h>
39 #include <linux/if_arp.h>
40 
41 #ifdef 	CONFIG_PROC_FS
42 #include <linux/proc_fs.h>
43 #include <linux/seq_file.h>
44 #endif
45 
46 #include <net/snmp.h>
47 #include <net/ipv6.h>
48 #include <net/ip6_fib.h>
49 #include <net/ip6_route.h>
50 #include <net/ndisc.h>
51 #include <net/addrconf.h>
52 #include <net/tcp.h>
53 #include <linux/rtnetlink.h>
54 #include <net/dst.h>
55 #include <net/xfrm.h>
56 #include <net/netevent.h>
57 
58 #include <asm/uaccess.h>
59 
60 #ifdef CONFIG_SYSCTL
61 #include <linux/sysctl.h>
62 #endif
63 
/*
 * Debug verbosity for this file.  With RT6_DEBUG at 3 or above, RDBG()
 * and RT6_TRACE() turn into printk() calls; otherwise both are no-ops.
 */
#define RT6_DEBUG 2

#if RT6_DEBUG < 3
#define RDBG(x)
#define RT6_TRACE(x...) do { } while (0)
#else
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#endif

/* Compile-time switch: non-zero selects the rt6_alloc_clone() code paths. */
#define CLONE_OFFLINK_ROUTE 0

/* Flag bits carried in the "strict" argument of the route selection code. */
#define RT6_SELECT_F_IFACE	0x1
#define RT6_SELECT_F_REACHABLE	0x2
79 
80 static int ip6_rt_max_size = 4096;
81 static int ip6_rt_gc_min_interval = HZ / 2;
82 static int ip6_rt_gc_timeout = 60*HZ;
83 int ip6_rt_gc_interval = 30*HZ;
84 static int ip6_rt_gc_elasticity = 9;
85 static int ip6_rt_mtu_expires = 10*60*HZ;
86 static int ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
87 
88 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
89 static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
90 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
91 static void		ip6_dst_destroy(struct dst_entry *);
92 static void		ip6_dst_ifdown(struct dst_entry *,
93 				       struct net_device *dev, int how);
94 static int		 ip6_dst_gc(void);
95 
96 static int		ip6_pkt_discard(struct sk_buff *skb);
97 static int		ip6_pkt_discard_out(struct sk_buff *skb);
98 static void		ip6_link_failure(struct sk_buff *skb);
99 static void		ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
100 
101 #ifdef CONFIG_IPV6_ROUTE_INFO
102 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
103 					   struct in6_addr *gwaddr, int ifindex,
104 					   unsigned pref);
105 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
106 					   struct in6_addr *gwaddr, int ifindex);
107 #endif
108 
109 static struct dst_ops ip6_dst_ops = {
110 	.family			=	AF_INET6,
111 	.protocol		=	__constant_htons(ETH_P_IPV6),
112 	.gc			=	ip6_dst_gc,
113 	.gc_thresh		=	1024,
114 	.check			=	ip6_dst_check,
115 	.destroy		=	ip6_dst_destroy,
116 	.ifdown			=	ip6_dst_ifdown,
117 	.negative_advice	=	ip6_negative_advice,
118 	.link_failure		=	ip6_link_failure,
119 	.update_pmtu		=	ip6_rt_update_pmtu,
120 	.entry_size		=	sizeof(struct rt6_info),
121 };
122 
123 struct rt6_info ip6_null_entry = {
124 	.u = {
125 		.dst = {
126 			.__refcnt	= ATOMIC_INIT(1),
127 			.__use		= 1,
128 			.dev		= &loopback_dev,
129 			.obsolete	= -1,
130 			.error		= -ENETUNREACH,
131 			.metrics	= { [RTAX_HOPLIMIT - 1] = 255, },
132 			.input		= ip6_pkt_discard,
133 			.output		= ip6_pkt_discard_out,
134 			.ops		= &ip6_dst_ops,
135 			.path		= (struct dst_entry*)&ip6_null_entry,
136 		}
137 	},
138 	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
139 	.rt6i_metric	= ~(u32) 0,
140 	.rt6i_ref	= ATOMIC_INIT(1),
141 };
142 
143 struct fib6_node ip6_routing_table = {
144 	.leaf		= &ip6_null_entry,
145 	.fn_flags	= RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO,
146 };
147 
148 /* Protects all the ip6 fib */
149 
150 DEFINE_RWLOCK(rt6_lock);
151 
152 
153 /* allocate dst with ip6_dst_ops */
154 static __inline__ struct rt6_info *ip6_dst_alloc(void)
155 {
156 	return (struct rt6_info *)dst_alloc(&ip6_dst_ops);
157 }
158 
159 static void ip6_dst_destroy(struct dst_entry *dst)
160 {
161 	struct rt6_info *rt = (struct rt6_info *)dst;
162 	struct inet6_dev *idev = rt->rt6i_idev;
163 
164 	if (idev != NULL) {
165 		rt->rt6i_idev = NULL;
166 		in6_dev_put(idev);
167 	}
168 }
169 
170 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
171 			   int how)
172 {
173 	struct rt6_info *rt = (struct rt6_info *)dst;
174 	struct inet6_dev *idev = rt->rt6i_idev;
175 
176 	if (dev != &loopback_dev && idev != NULL && idev->dev == dev) {
177 		struct inet6_dev *loopback_idev = in6_dev_get(&loopback_dev);
178 		if (loopback_idev != NULL) {
179 			rt->rt6i_idev = loopback_idev;
180 			in6_dev_put(idev);
181 		}
182 	}
183 }
184 
185 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
186 {
187 	return (rt->rt6i_flags & RTF_EXPIRES &&
188 		time_after(jiffies, rt->rt6i_expires));
189 }
190 
191 /*
192  *	Route lookup. Any rt6_lock is implied.
193  */
194 
195 static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt,
196 						    int oif,
197 						    int strict)
198 {
199 	struct rt6_info *local = NULL;
200 	struct rt6_info *sprt;
201 
202 	if (oif) {
203 		for (sprt = rt; sprt; sprt = sprt->u.next) {
204 			struct net_device *dev = sprt->rt6i_dev;
205 			if (dev->ifindex == oif)
206 				return sprt;
207 			if (dev->flags & IFF_LOOPBACK) {
208 				if (sprt->rt6i_idev == NULL ||
209 				    sprt->rt6i_idev->dev->ifindex != oif) {
210 					if (strict && oif)
211 						continue;
212 					if (local && (!oif ||
213 						      local->rt6i_idev->dev->ifindex == oif))
214 						continue;
215 				}
216 				local = sprt;
217 			}
218 		}
219 
220 		if (local)
221 			return local;
222 
223 		if (strict)
224 			return &ip6_null_entry;
225 	}
226 	return rt;
227 }
228 
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * rt6_probe - Router Reachability Probing for the next hop of @rt.
 *
 * Does nothing when @rt is NULL, has no next-hop neighbour, or the
 * neighbour is already in a NUD_VALID state.  Otherwise, if more than
 * rtr_probe_interval jiffies have passed since neigh->updated, stamp
 * neigh->updated and send a Neighbour Solicitation for the neighbour's
 * address to its solicited-node multicast address.
 *
 * Router Reachability Probe MUST be rate-limited to no more than one
 * per minute.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
	struct in6_addr mcaddr;
	struct in6_addr *target;
	int send_probe = 0;

	/* cheap unlocked test first; repeated under the lock below */
	if (!neigh || (neigh->nud_state & NUD_VALID))
		return;

	/*
	 * neigh->updated is modified here, so the check-and-stamp must run
	 * under the write side of the lock.  With only the read side held,
	 * concurrent callers could all pass the time test and each send a
	 * probe, defeating the rate limit.
	 */
	write_lock_bh(&neigh->lock);
	if (!(neigh->nud_state & NUD_VALID) &&
	    time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
		neigh->updated = jiffies;
		send_probe = 1;
	}
	write_unlock_bh(&neigh->lock);

	if (!send_probe)
		return;

	/* the solicitation itself is sent without holding neigh->lock */
	target = (struct in6_addr *)&neigh->primary_key;
	addrconf_addr_solict_mult(target, &mcaddr);
	ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
}
#else
/* Probing is compiled out without CONFIG_IPV6_ROUTER_PREF. */
static inline void rt6_probe(struct rt6_info *rt)
{
	return;
}
#endif
264 
265 /*
266  * Default Router Selection (RFC 2461 6.3.6)
267  */
268 static int inline rt6_check_dev(struct rt6_info *rt, int oif)
269 {
270 	struct net_device *dev = rt->rt6i_dev;
271 	if (!oif || dev->ifindex == oif)
272 		return 2;
273 	if ((dev->flags & IFF_LOOPBACK) &&
274 	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
275 		return 1;
276 	return 0;
277 }
278 
279 static int inline rt6_check_neigh(struct rt6_info *rt)
280 {
281 	struct neighbour *neigh = rt->rt6i_nexthop;
282 	int m = 0;
283 	if (rt->rt6i_flags & RTF_NONEXTHOP ||
284 	    !(rt->rt6i_flags & RTF_GATEWAY))
285 		m = 1;
286 	else if (neigh) {
287 		read_lock_bh(&neigh->lock);
288 		if (neigh->nud_state & NUD_VALID)
289 			m = 2;
290 		read_unlock_bh(&neigh->lock);
291 	}
292 	return m;
293 }
294 
295 static int rt6_score_route(struct rt6_info *rt, int oif,
296 			   int strict)
297 {
298 	int m, n;
299 
300 	m = rt6_check_dev(rt, oif);
301 	if (!m && (strict & RT6_SELECT_F_IFACE))
302 		return -1;
303 #ifdef CONFIG_IPV6_ROUTER_PREF
304 	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
305 #endif
306 	n = rt6_check_neigh(rt);
307 	if (n > 1)
308 		m |= 16;
309 	else if (!n && strict & RT6_SELECT_F_REACHABLE)
310 		return -1;
311 	return m;
312 }
313 
314 static struct rt6_info *rt6_select(struct rt6_info **head, int oif,
315 				   int strict)
316 {
317 	struct rt6_info *match = NULL, *last = NULL;
318 	struct rt6_info *rt, *rt0 = *head;
319 	u32 metric;
320 	int mpri = -1;
321 
322 	RT6_TRACE("%s(head=%p(*head=%p), oif=%d)\n",
323 		  __FUNCTION__, head, head ? *head : NULL, oif);
324 
325 	for (rt = rt0, metric = rt0->rt6i_metric;
326 	     rt && rt->rt6i_metric == metric && (!last || rt != rt0);
327 	     rt = rt->u.next) {
328 		int m;
329 
330 		if (rt6_check_expired(rt))
331 			continue;
332 
333 		last = rt;
334 
335 		m = rt6_score_route(rt, oif, strict);
336 		if (m < 0)
337 			continue;
338 
339 		if (m > mpri) {
340 			rt6_probe(match);
341 			match = rt;
342 			mpri = m;
343 		} else {
344 			rt6_probe(rt);
345 		}
346 	}
347 
348 	if (!match &&
349 	    (strict & RT6_SELECT_F_REACHABLE) &&
350 	    last && last != rt0) {
351 		/* no entries matched; do round-robin */
352 		static DEFINE_SPINLOCK(lock);
353 		spin_lock(&lock);
354 		*head = rt0->u.next;
355 		rt0->u.next = last->u.next;
356 		last->u.next = rt0;
357 		spin_unlock(&lock);
358 	}
359 
360 	RT6_TRACE("%s() => %p, score=%d\n",
361 		  __FUNCTION__, match, mpri);
362 
363 	return (match ? match : &ip6_null_entry);
364 }
365 
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * rt6_route_rcv - process one Route Information option.
 * @dev:    device the option was received on; its ifindex keys the route
 * @opt:    start of the option, interpreted as struct route_info
 * @len:    number of bytes available at @opt
 * @gwaddr: gateway address for the route
 *
 * Looks up the matching route-info route.  An advertised lifetime of
 * zero deletes an existing route; otherwise the route is created (if
 * missing) or its preference is refreshed, and its expiry is set from
 * the lifetime (0xffffffff means "never expires").
 *
 * Returns 0 on success, -EINVAL if the option is malformed.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  struct in6_addr *gwaddr)
{
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int min_len;
	unsigned int pref;
	u32 lifetime;
	struct rt6_info *rt;

	/* a negative len must not slip through the unsigned comparison */
	if (len < 0 || (size_t)len < sizeof(struct route_info))
		return -EINVAL;

	/*
	 * Sanity check for prefix_len and length.  Per RFC 4191 section
	 * 2.3 the length field counts 8-octet units of the whole option:
	 * it must be 3 for a prefix longer than 64 bits, at least 2 for
	 * any non-empty prefix, and 1..3 for an empty one.  Anything
	 * shorter would make the prefix reads below run past the option.
	 */
	if (rinfo->prefix_len > 128)
		return -EINVAL;
	if (rinfo->prefix_len > 64)
		min_len = 3;
	else if (rinfo->prefix_len > 0)
		min_len = 2;
	else
		min_len = 1;
	if (rinfo->length < min_len || rinfo->length > 3)
		return -EINVAL;

	/* the option must not claim more bytes than the caller supplied */
	if ((rinfo->length << 3) > len)
		return -EINVAL;

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		pref = ICMPV6_ROUTER_PREF_MEDIUM;

	/* wire (network) order to host order */
	lifetime = ntohl(rinfo->lifetime);
	if (lifetime == 0xffffffff) {
		/* infinity */
	} else if (lifetime > 0x7fffffff/HZ) {
		/* Avoid arithmetic overflow */
		lifetime = 0x7fffffff/HZ - 1;
	}

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* copies only prefix_len bits, which the checks above bound */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	rt = rt6_get_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex);

	/* zero lifetime withdraws an existing route */
	if (rt && !lifetime) {
		ip6_del_rt(rt, NULL, NULL, NULL);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
					pref);
	else if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (lifetime == 0xffffffff) {
			rt->rt6i_flags &= ~RTF_EXPIRES;
		} else {
			rt->rt6i_expires = jiffies + HZ * lifetime;
			rt->rt6i_flags |= RTF_EXPIRES;
		}
		dst_release(&rt->u.dst);
	}
	return 0;
}
#endif
443 
444 struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
445 			    int oif, int strict)
446 {
447 	struct fib6_node *fn;
448 	struct rt6_info *rt;
449 
450 	read_lock_bh(&rt6_lock);
451 	fn = fib6_lookup(&ip6_routing_table, daddr, saddr);
452 	rt = rt6_device_match(fn->leaf, oif, strict);
453 	dst_hold(&rt->u.dst);
454 	rt->u.dst.__use++;
455 	read_unlock_bh(&rt6_lock);
456 
457 	rt->u.dst.lastuse = jiffies;
458 	if (rt->u.dst.error == 0)
459 		return rt;
460 	dst_release(&rt->u.dst);
461 	return NULL;
462 }
463 
464 /* ip6_ins_rt is called with FREE rt6_lock.
465    It takes new route entry, the addition fails by any reason the
466    route is freed. In any case, if caller does not hold it, it may
467    be destroyed.
468  */
469 
470 int ip6_ins_rt(struct rt6_info *rt, struct nlmsghdr *nlh,
471 		void *_rtattr, struct netlink_skb_parms *req)
472 {
473 	int err;
474 
475 	write_lock_bh(&rt6_lock);
476 	err = fib6_add(&ip6_routing_table, rt, nlh, _rtattr, req);
477 	write_unlock_bh(&rt6_lock);
478 
479 	return err;
480 }
481 
482 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
483 				      struct in6_addr *saddr)
484 {
485 	struct rt6_info *rt;
486 
487 	/*
488 	 *	Clone the route.
489 	 */
490 
491 	rt = ip6_rt_copy(ort);
492 
493 	if (rt) {
494 		if (!(rt->rt6i_flags&RTF_GATEWAY)) {
495 			if (rt->rt6i_dst.plen != 128 &&
496 			    ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
497 				rt->rt6i_flags |= RTF_ANYCAST;
498 			ipv6_addr_copy(&rt->rt6i_gateway, daddr);
499 		}
500 
501 		ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
502 		rt->rt6i_dst.plen = 128;
503 		rt->rt6i_flags |= RTF_CACHE;
504 		rt->u.dst.flags |= DST_HOST;
505 
506 #ifdef CONFIG_IPV6_SUBTREES
507 		if (rt->rt6i_src.plen && saddr) {
508 			ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
509 			rt->rt6i_src.plen = 128;
510 		}
511 #endif
512 
513 		rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
514 
515 	}
516 
517 	return rt;
518 }
519 
520 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
521 {
522 	struct rt6_info *rt = ip6_rt_copy(ort);
523 	if (rt) {
524 		ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
525 		rt->rt6i_dst.plen = 128;
526 		rt->rt6i_flags |= RTF_CACHE;
527 		if (rt->rt6i_flags & RTF_REJECT)
528 			rt->u.dst.error = ort->u.dst.error;
529 		rt->u.dst.flags |= DST_HOST;
530 		rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
531 	}
532 	return rt;
533 }
534 
/*
 * BACKTRACK - climb the FIB after a failed selection.
 *
 * Expects the enclosing function to provide variables "rt" and "fn" and
 * labels "out" and "restart".  When rt is ip6_null_entry, walk fn up
 * through its parents: reaching a root node jumps to "out", reaching a
 * node with route info jumps to "restart".  If the walk runs out of
 * parents, fn ends up NULL and execution falls through.
 */
#define BACKTRACK() \
if (rt == &ip6_null_entry) { \
	for (fn = fn->parent; fn != NULL; fn = fn->parent) { \
		if (fn->fn_flags & RTN_ROOT) \
			goto out; \
		if (fn->fn_flags & RTN_RTINFO) \
			goto restart; \
	} \
}
545 
546 
547 void ip6_route_input(struct sk_buff *skb)
548 {
549 	struct fib6_node *fn;
550 	struct rt6_info *rt, *nrt;
551 	int strict;
552 	int attempts = 3;
553 	int err;
554 	int reachable = RT6_SELECT_F_REACHABLE;
555 
556 	strict = ipv6_addr_type(&skb->nh.ipv6h->daddr) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL) ? RT6_SELECT_F_IFACE : 0;
557 
558 relookup:
559 	read_lock_bh(&rt6_lock);
560 
561 restart_2:
562 	fn = fib6_lookup(&ip6_routing_table, &skb->nh.ipv6h->daddr,
563 			 &skb->nh.ipv6h->saddr);
564 
565 restart:
566 	rt = rt6_select(&fn->leaf, skb->dev->ifindex, strict | reachable);
567 	BACKTRACK();
568 	if (rt == &ip6_null_entry ||
569 	    rt->rt6i_flags & RTF_CACHE)
570 		goto out;
571 
572 	dst_hold(&rt->u.dst);
573 	read_unlock_bh(&rt6_lock);
574 
575 	if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
576 		nrt = rt6_alloc_cow(rt, &skb->nh.ipv6h->daddr, &skb->nh.ipv6h->saddr);
577 	else {
578 #if CLONE_OFFLINK_ROUTE
579 		nrt = rt6_alloc_clone(rt, &skb->nh.ipv6h->daddr);
580 #else
581 		goto out2;
582 #endif
583 	}
584 
585 	dst_release(&rt->u.dst);
586 	rt = nrt ? : &ip6_null_entry;
587 
588 	dst_hold(&rt->u.dst);
589 	if (nrt) {
590 		err = ip6_ins_rt(nrt, NULL, NULL, &NETLINK_CB(skb));
591 		if (!err)
592 			goto out2;
593 	}
594 
595 	if (--attempts <= 0)
596 		goto out2;
597 
598 	/*
599 	 * Race condition! In the gap, when rt6_lock was
600 	 * released someone could insert this route.  Relookup.
601 	 */
602 	dst_release(&rt->u.dst);
603 	goto relookup;
604 
605 out:
606 	if (reachable) {
607 		reachable = 0;
608 		goto restart_2;
609 	}
610 	dst_hold(&rt->u.dst);
611 	read_unlock_bh(&rt6_lock);
612 out2:
613 	rt->u.dst.lastuse = jiffies;
614 	rt->u.dst.__use++;
615 	skb->dst = (struct dst_entry *) rt;
616 	return;
617 }
618 
619 struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl)
620 {
621 	struct fib6_node *fn;
622 	struct rt6_info *rt, *nrt;
623 	int strict;
624 	int attempts = 3;
625 	int err;
626 	int reachable = RT6_SELECT_F_REACHABLE;
627 
628 	strict = ipv6_addr_type(&fl->fl6_dst) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL) ? RT6_SELECT_F_IFACE : 0;
629 
630 relookup:
631 	read_lock_bh(&rt6_lock);
632 
633 restart_2:
634 	fn = fib6_lookup(&ip6_routing_table, &fl->fl6_dst, &fl->fl6_src);
635 
636 restart:
637 	rt = rt6_select(&fn->leaf, fl->oif, strict | reachable);
638 	BACKTRACK();
639 	if (rt == &ip6_null_entry ||
640 	    rt->rt6i_flags & RTF_CACHE)
641 		goto out;
642 
643 	dst_hold(&rt->u.dst);
644 	read_unlock_bh(&rt6_lock);
645 
646 	if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
647 		nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
648 	else {
649 #if CLONE_OFFLINK_ROUTE
650 		nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
651 #else
652 		goto out2;
653 #endif
654 	}
655 
656 	dst_release(&rt->u.dst);
657 	rt = nrt ? : &ip6_null_entry;
658 
659 	dst_hold(&rt->u.dst);
660 	if (nrt) {
661 		err = ip6_ins_rt(nrt, NULL, NULL, NULL);
662 		if (!err)
663 			goto out2;
664 	}
665 
666 	if (--attempts <= 0)
667 		goto out2;
668 
669 	/*
670 	 * Race condition! In the gap, when rt6_lock was
671 	 * released someone could insert this route.  Relookup.
672 	 */
673 	dst_release(&rt->u.dst);
674 	goto relookup;
675 
676 out:
677 	if (reachable) {
678 		reachable = 0;
679 		goto restart_2;
680 	}
681 	dst_hold(&rt->u.dst);
682 	read_unlock_bh(&rt6_lock);
683 out2:
684 	rt->u.dst.lastuse = jiffies;
685 	rt->u.dst.__use++;
686 	return &rt->u.dst;
687 }
688 
689 
690 /*
691  *	Destination cache support functions
692  */
693 
694 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
695 {
696 	struct rt6_info *rt;
697 
698 	rt = (struct rt6_info *) dst;
699 
700 	if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
701 		return dst;
702 
703 	return NULL;
704 }
705 
706 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
707 {
708 	struct rt6_info *rt = (struct rt6_info *) dst;
709 
710 	if (rt) {
711 		if (rt->rt6i_flags & RTF_CACHE)
712 			ip6_del_rt(rt, NULL, NULL, NULL);
713 		else
714 			dst_release(dst);
715 	}
716 	return NULL;
717 }
718 
719 static void ip6_link_failure(struct sk_buff *skb)
720 {
721 	struct rt6_info *rt;
722 
723 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
724 
725 	rt = (struct rt6_info *) skb->dst;
726 	if (rt) {
727 		if (rt->rt6i_flags&RTF_CACHE) {
728 			dst_set_expires(&rt->u.dst, 0);
729 			rt->rt6i_flags |= RTF_EXPIRES;
730 		} else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
731 			rt->rt6i_node->fn_sernum = -1;
732 	}
733 }
734 
735 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
736 {
737 	struct rt6_info *rt6 = (struct rt6_info*)dst;
738 
739 	if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
740 		rt6->rt6i_flags |= RTF_MODIFIED;
741 		if (mtu < IPV6_MIN_MTU) {
742 			mtu = IPV6_MIN_MTU;
743 			dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
744 		}
745 		dst->metrics[RTAX_MTU-1] = mtu;
746 		call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
747 	}
748 }
749 
750 static int ipv6_get_mtu(struct net_device *dev);
751 
752 static inline unsigned int ipv6_advmss(unsigned int mtu)
753 {
754 	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
755 
756 	if (mtu < ip6_rt_min_advmss)
757 		mtu = ip6_rt_min_advmss;
758 
759 	/*
760 	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
761 	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
762 	 * IPV6_MAXPLEN is also valid and means: "any MSS,
763 	 * rely only on pmtu discovery"
764 	 */
765 	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
766 		mtu = IPV6_MAXPLEN;
767 	return mtu;
768 }
769 
/*
 * Singly linked list (through dst->next) of the entries created by
 * ndisc_dst_alloc(); ndisc_dst_gc() frees the unreferenced ones.
 * ndisc_lock guards the list.
 */
static struct dst_entry *ndisc_dst_gc_list;
DEFINE_SPINLOCK(ndisc_lock);
772 
773 struct dst_entry *ndisc_dst_alloc(struct net_device *dev,
774 				  struct neighbour *neigh,
775 				  struct in6_addr *addr,
776 				  int (*output)(struct sk_buff *))
777 {
778 	struct rt6_info *rt;
779 	struct inet6_dev *idev = in6_dev_get(dev);
780 
781 	if (unlikely(idev == NULL))
782 		return NULL;
783 
784 	rt = ip6_dst_alloc();
785 	if (unlikely(rt == NULL)) {
786 		in6_dev_put(idev);
787 		goto out;
788 	}
789 
790 	dev_hold(dev);
791 	if (neigh)
792 		neigh_hold(neigh);
793 	else
794 		neigh = ndisc_get_neigh(dev, addr);
795 
796 	rt->rt6i_dev	  = dev;
797 	rt->rt6i_idev     = idev;
798 	rt->rt6i_nexthop  = neigh;
799 	atomic_set(&rt->u.dst.__refcnt, 1);
800 	rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
801 	rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
802 	rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
803 	rt->u.dst.output  = output;
804 
805 #if 0	/* there's no chance to use these for ndisc */
806 	rt->u.dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
807 				? DST_HOST
808 				: 0;
809 	ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
810 	rt->rt6i_dst.plen = 128;
811 #endif
812 
813 	spin_lock_bh(&ndisc_lock);
814 	rt->u.dst.next = ndisc_dst_gc_list;
815 	ndisc_dst_gc_list = &rt->u.dst;
816 	spin_unlock_bh(&ndisc_lock);
817 
818 	fib6_force_start_gc();
819 
820 out:
821 	return (struct dst_entry *)rt;
822 }
823 
824 int ndisc_dst_gc(int *more)
825 {
826 	struct dst_entry *dst, *next, **pprev;
827 	int freed;
828 
829 	next = NULL;
830  	freed = 0;
831 
832 	spin_lock_bh(&ndisc_lock);
833 	pprev = &ndisc_dst_gc_list;
834 
835 	while ((dst = *pprev) != NULL) {
836 		if (!atomic_read(&dst->__refcnt)) {
837 			*pprev = dst->next;
838 			dst_free(dst);
839 			freed++;
840 		} else {
841 			pprev = &dst->next;
842 			(*more)++;
843 		}
844 	}
845 
846 	spin_unlock_bh(&ndisc_lock);
847 
848 	return freed;
849 }
850 
851 static int ip6_dst_gc(void)
852 {
853 	static unsigned expire = 30*HZ;
854 	static unsigned long last_gc;
855 	unsigned long now = jiffies;
856 
857 	if (time_after(last_gc + ip6_rt_gc_min_interval, now) &&
858 	    atomic_read(&ip6_dst_ops.entries) <= ip6_rt_max_size)
859 		goto out;
860 
861 	expire++;
862 	fib6_run_gc(expire);
863 	last_gc = now;
864 	if (atomic_read(&ip6_dst_ops.entries) < ip6_dst_ops.gc_thresh)
865 		expire = ip6_rt_gc_timeout>>1;
866 
867 out:
868 	expire -= expire>>ip6_rt_gc_elasticity;
869 	return (atomic_read(&ip6_dst_ops.entries) > ip6_rt_max_size);
870 }
871 
872 /* Clean host part of a prefix. Not necessary in radix tree,
873    but results in cleaner routing tables.
874 
875    Remove it only when all the things will work!
876  */
877 
878 static int ipv6_get_mtu(struct net_device *dev)
879 {
880 	int mtu = IPV6_MIN_MTU;
881 	struct inet6_dev *idev;
882 
883 	idev = in6_dev_get(dev);
884 	if (idev) {
885 		mtu = idev->cnf.mtu6;
886 		in6_dev_put(idev);
887 	}
888 	return mtu;
889 }
890 
891 int ipv6_get_hoplimit(struct net_device *dev)
892 {
893 	int hoplimit = ipv6_devconf.hop_limit;
894 	struct inet6_dev *idev;
895 
896 	idev = in6_dev_get(dev);
897 	if (idev) {
898 		hoplimit = idev->cnf.hop_limit;
899 		in6_dev_put(idev);
900 	}
901 	return hoplimit;
902 }
903 
904 /*
905  *
906  */
907 
908 int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh,
909 		void *_rtattr, struct netlink_skb_parms *req)
910 {
911 	int err;
912 	struct rtmsg *r;
913 	struct rtattr **rta;
914 	struct rt6_info *rt = NULL;
915 	struct net_device *dev = NULL;
916 	struct inet6_dev *idev = NULL;
917 	int addr_type;
918 
919 	rta = (struct rtattr **) _rtattr;
920 
921 	if (rtmsg->rtmsg_dst_len > 128 || rtmsg->rtmsg_src_len > 128)
922 		return -EINVAL;
923 #ifndef CONFIG_IPV6_SUBTREES
924 	if (rtmsg->rtmsg_src_len)
925 		return -EINVAL;
926 #endif
927 	if (rtmsg->rtmsg_ifindex) {
928 		err = -ENODEV;
929 		dev = dev_get_by_index(rtmsg->rtmsg_ifindex);
930 		if (!dev)
931 			goto out;
932 		idev = in6_dev_get(dev);
933 		if (!idev)
934 			goto out;
935 	}
936 
937 	if (rtmsg->rtmsg_metric == 0)
938 		rtmsg->rtmsg_metric = IP6_RT_PRIO_USER;
939 
940 	rt = ip6_dst_alloc();
941 
942 	if (rt == NULL) {
943 		err = -ENOMEM;
944 		goto out;
945 	}
946 
947 	rt->u.dst.obsolete = -1;
948 	rt->rt6i_expires = jiffies + clock_t_to_jiffies(rtmsg->rtmsg_info);
949 	if (nlh && (r = NLMSG_DATA(nlh))) {
950 		rt->rt6i_protocol = r->rtm_protocol;
951 	} else {
952 		rt->rt6i_protocol = RTPROT_BOOT;
953 	}
954 
955 	addr_type = ipv6_addr_type(&rtmsg->rtmsg_dst);
956 
957 	if (addr_type & IPV6_ADDR_MULTICAST)
958 		rt->u.dst.input = ip6_mc_input;
959 	else
960 		rt->u.dst.input = ip6_forward;
961 
962 	rt->u.dst.output = ip6_output;
963 
964 	ipv6_addr_prefix(&rt->rt6i_dst.addr,
965 			 &rtmsg->rtmsg_dst, rtmsg->rtmsg_dst_len);
966 	rt->rt6i_dst.plen = rtmsg->rtmsg_dst_len;
967 	if (rt->rt6i_dst.plen == 128)
968 	       rt->u.dst.flags = DST_HOST;
969 
970 #ifdef CONFIG_IPV6_SUBTREES
971 	ipv6_addr_prefix(&rt->rt6i_src.addr,
972 			 &rtmsg->rtmsg_src, rtmsg->rtmsg_src_len);
973 	rt->rt6i_src.plen = rtmsg->rtmsg_src_len;
974 #endif
975 
976 	rt->rt6i_metric = rtmsg->rtmsg_metric;
977 
978 	/* We cannot add true routes via loopback here,
979 	   they would result in kernel looping; promote them to reject routes
980 	 */
981 	if ((rtmsg->rtmsg_flags&RTF_REJECT) ||
982 	    (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
983 		/* hold loopback dev/idev if we haven't done so. */
984 		if (dev != &loopback_dev) {
985 			if (dev) {
986 				dev_put(dev);
987 				in6_dev_put(idev);
988 			}
989 			dev = &loopback_dev;
990 			dev_hold(dev);
991 			idev = in6_dev_get(dev);
992 			if (!idev) {
993 				err = -ENODEV;
994 				goto out;
995 			}
996 		}
997 		rt->u.dst.output = ip6_pkt_discard_out;
998 		rt->u.dst.input = ip6_pkt_discard;
999 		rt->u.dst.error = -ENETUNREACH;
1000 		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1001 		goto install_route;
1002 	}
1003 
1004 	if (rtmsg->rtmsg_flags & RTF_GATEWAY) {
1005 		struct in6_addr *gw_addr;
1006 		int gwa_type;
1007 
1008 		gw_addr = &rtmsg->rtmsg_gateway;
1009 		ipv6_addr_copy(&rt->rt6i_gateway, &rtmsg->rtmsg_gateway);
1010 		gwa_type = ipv6_addr_type(gw_addr);
1011 
1012 		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1013 			struct rt6_info *grt;
1014 
1015 			/* IPv6 strictly inhibits using not link-local
1016 			   addresses as nexthop address.
1017 			   Otherwise, router will not able to send redirects.
1018 			   It is very good, but in some (rare!) circumstances
1019 			   (SIT, PtP, NBMA NOARP links) it is handy to allow
1020 			   some exceptions. --ANK
1021 			 */
1022 			err = -EINVAL;
1023 			if (!(gwa_type&IPV6_ADDR_UNICAST))
1024 				goto out;
1025 
1026 			grt = rt6_lookup(gw_addr, NULL, rtmsg->rtmsg_ifindex, 1);
1027 
1028 			err = -EHOSTUNREACH;
1029 			if (grt == NULL)
1030 				goto out;
1031 			if (dev) {
1032 				if (dev != grt->rt6i_dev) {
1033 					dst_release(&grt->u.dst);
1034 					goto out;
1035 				}
1036 			} else {
1037 				dev = grt->rt6i_dev;
1038 				idev = grt->rt6i_idev;
1039 				dev_hold(dev);
1040 				in6_dev_hold(grt->rt6i_idev);
1041 			}
1042 			if (!(grt->rt6i_flags&RTF_GATEWAY))
1043 				err = 0;
1044 			dst_release(&grt->u.dst);
1045 
1046 			if (err)
1047 				goto out;
1048 		}
1049 		err = -EINVAL;
1050 		if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1051 			goto out;
1052 	}
1053 
1054 	err = -ENODEV;
1055 	if (dev == NULL)
1056 		goto out;
1057 
1058 	if (rtmsg->rtmsg_flags & (RTF_GATEWAY|RTF_NONEXTHOP)) {
1059 		rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1060 		if (IS_ERR(rt->rt6i_nexthop)) {
1061 			err = PTR_ERR(rt->rt6i_nexthop);
1062 			rt->rt6i_nexthop = NULL;
1063 			goto out;
1064 		}
1065 	}
1066 
1067 	rt->rt6i_flags = rtmsg->rtmsg_flags;
1068 
1069 install_route:
1070 	if (rta && rta[RTA_METRICS-1]) {
1071 		int attrlen = RTA_PAYLOAD(rta[RTA_METRICS-1]);
1072 		struct rtattr *attr = RTA_DATA(rta[RTA_METRICS-1]);
1073 
1074 		while (RTA_OK(attr, attrlen)) {
1075 			unsigned flavor = attr->rta_type;
1076 			if (flavor) {
1077 				if (flavor > RTAX_MAX) {
1078 					err = -EINVAL;
1079 					goto out;
1080 				}
1081 				rt->u.dst.metrics[flavor-1] =
1082 					*(u32 *)RTA_DATA(attr);
1083 			}
1084 			attr = RTA_NEXT(attr, attrlen);
1085 		}
1086 	}
1087 
1088 	if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1089 		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1090 	if (!rt->u.dst.metrics[RTAX_MTU-1])
1091 		rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1092 	if (!rt->u.dst.metrics[RTAX_ADVMSS-1])
1093 		rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1094 	rt->u.dst.dev = dev;
1095 	rt->rt6i_idev = idev;
1096 	return ip6_ins_rt(rt, nlh, _rtattr, req);
1097 
1098 out:
1099 	if (dev)
1100 		dev_put(dev);
1101 	if (idev)
1102 		in6_dev_put(idev);
1103 	if (rt)
1104 		dst_free((struct dst_entry *) rt);
1105 	return err;
1106 }
1107 
1108 int ip6_del_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
1109 {
1110 	int err;
1111 
1112 	write_lock_bh(&rt6_lock);
1113 
1114 	err = fib6_del(rt, nlh, _rtattr, req);
1115 	dst_release(&rt->u.dst);
1116 
1117 	write_unlock_bh(&rt6_lock);
1118 
1119 	return err;
1120 }
1121 
1122 static int ip6_route_del(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
1123 {
1124 	struct fib6_node *fn;
1125 	struct rt6_info *rt;
1126 	int err = -ESRCH;
1127 
1128 	read_lock_bh(&rt6_lock);
1129 
1130 	fn = fib6_locate(&ip6_routing_table,
1131 			 &rtmsg->rtmsg_dst, rtmsg->rtmsg_dst_len,
1132 			 &rtmsg->rtmsg_src, rtmsg->rtmsg_src_len);
1133 
1134 	if (fn) {
1135 		for (rt = fn->leaf; rt; rt = rt->u.next) {
1136 			if (rtmsg->rtmsg_ifindex &&
1137 			    (rt->rt6i_dev == NULL ||
1138 			     rt->rt6i_dev->ifindex != rtmsg->rtmsg_ifindex))
1139 				continue;
1140 			if (rtmsg->rtmsg_flags&RTF_GATEWAY &&
1141 			    !ipv6_addr_equal(&rtmsg->rtmsg_gateway, &rt->rt6i_gateway))
1142 				continue;
1143 			if (rtmsg->rtmsg_metric &&
1144 			    rtmsg->rtmsg_metric != rt->rt6i_metric)
1145 				continue;
1146 			dst_hold(&rt->u.dst);
1147 			read_unlock_bh(&rt6_lock);
1148 
1149 			return ip6_del_rt(rt, nlh, _rtattr, req);
1150 		}
1151 	}
1152 	read_unlock_bh(&rt6_lock);
1153 
1154 	return err;
1155 }
1156 
1157 /*
1158  *	Handle redirects
1159  */
1160 void rt6_redirect(struct in6_addr *dest, struct in6_addr *saddr,
1161 		  struct neighbour *neigh, u8 *lladdr, int on_link)
1162 {
1163 	struct rt6_info *rt, *nrt = NULL;
1164 	int strict;
1165 	struct fib6_node *fn;
1166 	struct netevent_redirect netevent;
1167 
1168 	/*
1169 	 * Get the "current" route for this destination and
1170 	 * check if the redirect has come from approriate router.
1171 	 *
1172 	 * RFC 2461 specifies that redirects should only be
1173 	 * accepted if they come from the nexthop to the target.
1174 	 * Due to the way the routes are chosen, this notion
1175 	 * is a bit fuzzy and one might need to check all possible
1176 	 * routes.
1177 	 */
1178 	strict = ipv6_addr_type(dest) & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL);
1179 
1180 	read_lock_bh(&rt6_lock);
1181 	fn = fib6_lookup(&ip6_routing_table, dest, NULL);
1182 restart:
1183 	for (rt = fn->leaf; rt; rt = rt->u.next) {
1184 		/*
1185 		 * Current route is on-link; redirect is always invalid.
1186 		 *
1187 		 * Seems, previous statement is not true. It could
1188 		 * be node, which looks for us as on-link (f.e. proxy ndisc)
1189 		 * But then router serving it might decide, that we should
1190 		 * know truth 8)8) --ANK (980726).
1191 		 */
1192 		if (rt6_check_expired(rt))
1193 			continue;
1194 		if (!(rt->rt6i_flags & RTF_GATEWAY))
1195 			continue;
1196 		if (neigh->dev != rt->rt6i_dev)
1197 			continue;
1198 		if (!ipv6_addr_equal(saddr, &rt->rt6i_gateway))
1199 			continue;
1200 		break;
1201 	}
1202 	if (rt)
1203 		dst_hold(&rt->u.dst);
1204 	else if (strict) {
1205 		while ((fn = fn->parent) != NULL) {
1206 			if (fn->fn_flags & RTN_ROOT)
1207 				break;
1208 			if (fn->fn_flags & RTN_RTINFO)
1209 				goto restart;
1210 		}
1211 	}
1212 	read_unlock_bh(&rt6_lock);
1213 
1214 	if (!rt) {
1215 		if (net_ratelimit())
1216 			printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1217 			       "for redirect target\n");
1218 		return;
1219 	}
1220 
1221 	/*
1222 	 *	We have finally decided to accept it.
1223 	 */
1224 
1225 	neigh_update(neigh, lladdr, NUD_STALE,
1226 		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
1227 		     NEIGH_UPDATE_F_OVERRIDE|
1228 		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1229 				     NEIGH_UPDATE_F_ISROUTER))
1230 		     );
1231 
1232 	/*
1233 	 * Redirect received -> path was valid.
1234 	 * Look, redirects are sent only in response to data packets,
1235 	 * so that this nexthop apparently is reachable. --ANK
1236 	 */
1237 	dst_confirm(&rt->u.dst);
1238 
1239 	/* Duplicate redirect: silently ignore. */
1240 	if (neigh == rt->u.dst.neighbour)
1241 		goto out;
1242 
1243 	nrt = ip6_rt_copy(rt);
1244 	if (nrt == NULL)
1245 		goto out;
1246 
1247 	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1248 	if (on_link)
1249 		nrt->rt6i_flags &= ~RTF_GATEWAY;
1250 
1251 	ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1252 	nrt->rt6i_dst.plen = 128;
1253 	nrt->u.dst.flags |= DST_HOST;
1254 
1255 	ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1256 	nrt->rt6i_nexthop = neigh_clone(neigh);
1257 	/* Reset pmtu, it may be better */
1258 	nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1259 	nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst));
1260 
1261 	if (ip6_ins_rt(nrt, NULL, NULL, NULL))
1262 		goto out;
1263 
1264 	netevent.old = &rt->u.dst;
1265 	netevent.new = &nrt->u.dst;
1266 	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1267 
1268 	if (rt->rt6i_flags&RTF_CACHE) {
1269 		ip6_del_rt(rt, NULL, NULL, NULL);
1270 		return;
1271 	}
1272 
1273 out:
1274         dst_release(&rt->u.dst);
1275 	return;
1276 }
1277 
1278 /*
1279  *	Handle ICMP "packet too big" messages
1280  *	i.e. Path MTU discovery
1281  */
1282 
1283 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1284 			struct net_device *dev, u32 pmtu)
1285 {
1286 	struct rt6_info *rt, *nrt;
1287 	int allfrag = 0;
1288 
1289 	rt = rt6_lookup(daddr, saddr, dev->ifindex, 0);
1290 	if (rt == NULL)
1291 		return;
1292 
1293 	if (pmtu >= dst_mtu(&rt->u.dst))
1294 		goto out;
1295 
1296 	if (pmtu < IPV6_MIN_MTU) {
1297 		/*
1298 		 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1299 		 * MTU (1280) and a fragment header should always be included
1300 		 * after a node receiving Too Big message reporting PMTU is
1301 		 * less than the IPv6 Minimum Link MTU.
1302 		 */
1303 		pmtu = IPV6_MIN_MTU;
1304 		allfrag = 1;
1305 	}
1306 
1307 	/* New mtu received -> path was valid.
1308 	   They are sent only in response to data packets,
1309 	   so that this nexthop apparently is reachable. --ANK
1310 	 */
1311 	dst_confirm(&rt->u.dst);
1312 
1313 	/* Host route. If it is static, it would be better
1314 	   not to override it, but add new one, so that
1315 	   when cache entry will expire old pmtu
1316 	   would return automatically.
1317 	 */
1318 	if (rt->rt6i_flags & RTF_CACHE) {
1319 		rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1320 		if (allfrag)
1321 			rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1322 		dst_set_expires(&rt->u.dst, ip6_rt_mtu_expires);
1323 		rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1324 		goto out;
1325 	}
1326 
1327 	/* Network route.
1328 	   Two cases are possible:
1329 	   1. It is connected route. Action: COW
1330 	   2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1331 	 */
1332 	if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1333 		nrt = rt6_alloc_cow(rt, daddr, saddr);
1334 	else
1335 		nrt = rt6_alloc_clone(rt, daddr);
1336 
1337 	if (nrt) {
1338 		nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1339 		if (allfrag)
1340 			nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1341 
1342 		/* According to RFC 1981, detecting PMTU increase shouldn't be
1343 		 * happened within 5 mins, the recommended timer is 10 mins.
1344 		 * Here this route expiration time is set to ip6_rt_mtu_expires
1345 		 * which is 10 mins. After 10 mins the decreased pmtu is expired
1346 		 * and detecting PMTU increase will be automatically happened.
1347 		 */
1348 		dst_set_expires(&nrt->u.dst, ip6_rt_mtu_expires);
1349 		nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1350 
1351 		ip6_ins_rt(nrt, NULL, NULL, NULL);
1352 	}
1353 out:
1354 	dst_release(&rt->u.dst);
1355 }
1356 
1357 /*
1358  *	Misc support functions
1359  */
1360 
1361 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1362 {
1363 	struct rt6_info *rt = ip6_dst_alloc();
1364 
1365 	if (rt) {
1366 		rt->u.dst.input = ort->u.dst.input;
1367 		rt->u.dst.output = ort->u.dst.output;
1368 
1369 		memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1370 		rt->u.dst.dev = ort->u.dst.dev;
1371 		if (rt->u.dst.dev)
1372 			dev_hold(rt->u.dst.dev);
1373 		rt->rt6i_idev = ort->rt6i_idev;
1374 		if (rt->rt6i_idev)
1375 			in6_dev_hold(rt->rt6i_idev);
1376 		rt->u.dst.lastuse = jiffies;
1377 		rt->rt6i_expires = 0;
1378 
1379 		ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1380 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1381 		rt->rt6i_metric = 0;
1382 
1383 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1384 #ifdef CONFIG_IPV6_SUBTREES
1385 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1386 #endif
1387 	}
1388 	return rt;
1389 }
1390 
1391 #ifdef CONFIG_IPV6_ROUTE_INFO
1392 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
1393 					   struct in6_addr *gwaddr, int ifindex)
1394 {
1395 	struct fib6_node *fn;
1396 	struct rt6_info *rt = NULL;
1397 
1398 	write_lock_bh(&rt6_lock);
1399 	fn = fib6_locate(&ip6_routing_table, prefix ,prefixlen, NULL, 0);
1400 	if (!fn)
1401 		goto out;
1402 
1403 	for (rt = fn->leaf; rt; rt = rt->u.next) {
1404 		if (rt->rt6i_dev->ifindex != ifindex)
1405 			continue;
1406 		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1407 			continue;
1408 		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1409 			continue;
1410 		dst_hold(&rt->u.dst);
1411 		break;
1412 	}
1413 out:
1414 	write_unlock_bh(&rt6_lock);
1415 	return rt;
1416 }
1417 
1418 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
1419 					   struct in6_addr *gwaddr, int ifindex,
1420 					   unsigned pref)
1421 {
1422 	struct in6_rtmsg rtmsg;
1423 
1424 	memset(&rtmsg, 0, sizeof(rtmsg));
1425 	rtmsg.rtmsg_type = RTMSG_NEWROUTE;
1426 	ipv6_addr_copy(&rtmsg.rtmsg_dst, prefix);
1427 	rtmsg.rtmsg_dst_len = prefixlen;
1428 	ipv6_addr_copy(&rtmsg.rtmsg_gateway, gwaddr);
1429 	rtmsg.rtmsg_metric = 1024;
1430 	rtmsg.rtmsg_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | RTF_UP | RTF_PREF(pref);
1431 	/* We should treat it as a default route if prefix length is 0. */
1432 	if (!prefixlen)
1433 		rtmsg.rtmsg_flags |= RTF_DEFAULT;
1434 	rtmsg.rtmsg_ifindex = ifindex;
1435 
1436 	ip6_route_add(&rtmsg, NULL, NULL, NULL);
1437 
1438 	return rt6_get_route_info(prefix, prefixlen, gwaddr, ifindex);
1439 }
1440 #endif
1441 
1442 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1443 {
1444 	struct rt6_info *rt;
1445 	struct fib6_node *fn;
1446 
1447 	fn = &ip6_routing_table;
1448 
1449 	write_lock_bh(&rt6_lock);
1450 	for (rt = fn->leaf; rt; rt=rt->u.next) {
1451 		if (dev == rt->rt6i_dev &&
1452 		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1453 		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
1454 			break;
1455 	}
1456 	if (rt)
1457 		dst_hold(&rt->u.dst);
1458 	write_unlock_bh(&rt6_lock);
1459 	return rt;
1460 }
1461 
1462 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1463 				     struct net_device *dev,
1464 				     unsigned int pref)
1465 {
1466 	struct in6_rtmsg rtmsg;
1467 
1468 	memset(&rtmsg, 0, sizeof(struct in6_rtmsg));
1469 	rtmsg.rtmsg_type = RTMSG_NEWROUTE;
1470 	ipv6_addr_copy(&rtmsg.rtmsg_gateway, gwaddr);
1471 	rtmsg.rtmsg_metric = 1024;
1472 	rtmsg.rtmsg_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | RTF_UP | RTF_EXPIRES |
1473 			    RTF_PREF(pref);
1474 
1475 	rtmsg.rtmsg_ifindex = dev->ifindex;
1476 
1477 	ip6_route_add(&rtmsg, NULL, NULL, NULL);
1478 	return rt6_get_dflt_router(gwaddr, dev);
1479 }
1480 
1481 void rt6_purge_dflt_routers(void)
1482 {
1483 	struct rt6_info *rt;
1484 
1485 restart:
1486 	read_lock_bh(&rt6_lock);
1487 	for (rt = ip6_routing_table.leaf; rt; rt = rt->u.next) {
1488 		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1489 			dst_hold(&rt->u.dst);
1490 
1491 			read_unlock_bh(&rt6_lock);
1492 
1493 			ip6_del_rt(rt, NULL, NULL, NULL);
1494 
1495 			goto restart;
1496 		}
1497 	}
1498 	read_unlock_bh(&rt6_lock);
1499 }
1500 
1501 int ipv6_route_ioctl(unsigned int cmd, void __user *arg)
1502 {
1503 	struct in6_rtmsg rtmsg;
1504 	int err;
1505 
1506 	switch(cmd) {
1507 	case SIOCADDRT:		/* Add a route */
1508 	case SIOCDELRT:		/* Delete a route */
1509 		if (!capable(CAP_NET_ADMIN))
1510 			return -EPERM;
1511 		err = copy_from_user(&rtmsg, arg,
1512 				     sizeof(struct in6_rtmsg));
1513 		if (err)
1514 			return -EFAULT;
1515 
1516 		rtnl_lock();
1517 		switch (cmd) {
1518 		case SIOCADDRT:
1519 			err = ip6_route_add(&rtmsg, NULL, NULL, NULL);
1520 			break;
1521 		case SIOCDELRT:
1522 			err = ip6_route_del(&rtmsg, NULL, NULL, NULL);
1523 			break;
1524 		default:
1525 			err = -EINVAL;
1526 		}
1527 		rtnl_unlock();
1528 
1529 		return err;
1530 	};
1531 
1532 	return -EINVAL;
1533 }
1534 
1535 /*
1536  *	Drop the packet on the floor
1537  */
1538 
1539 static int ip6_pkt_discard(struct sk_buff *skb)
1540 {
1541 	int type = ipv6_addr_type(&skb->nh.ipv6h->daddr);
1542 	if (type == IPV6_ADDR_ANY || type == IPV6_ADDR_RESERVED)
1543 		IP6_INC_STATS(IPSTATS_MIB_INADDRERRORS);
1544 
1545 	IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
1546 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_NOROUTE, 0, skb->dev);
1547 	kfree_skb(skb);
1548 	return 0;
1549 }
1550 
1551 static int ip6_pkt_discard_out(struct sk_buff *skb)
1552 {
1553 	skb->dev = skb->dst->dev;
1554 	return ip6_pkt_discard(skb);
1555 }
1556 
1557 /*
1558  *	Allocate a dst for local (unicast / anycast) address.
1559  */
1560 
1561 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1562 				    const struct in6_addr *addr,
1563 				    int anycast)
1564 {
1565 	struct rt6_info *rt = ip6_dst_alloc();
1566 
1567 	if (rt == NULL)
1568 		return ERR_PTR(-ENOMEM);
1569 
1570 	dev_hold(&loopback_dev);
1571 	in6_dev_hold(idev);
1572 
1573 	rt->u.dst.flags = DST_HOST;
1574 	rt->u.dst.input = ip6_input;
1575 	rt->u.dst.output = ip6_output;
1576 	rt->rt6i_dev = &loopback_dev;
1577 	rt->rt6i_idev = idev;
1578 	rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1579 	rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1580 	rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1581 	rt->u.dst.obsolete = -1;
1582 
1583 	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1584 	if (anycast)
1585 		rt->rt6i_flags |= RTF_ANYCAST;
1586 	else
1587 		rt->rt6i_flags |= RTF_LOCAL;
1588 	rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1589 	if (rt->rt6i_nexthop == NULL) {
1590 		dst_free((struct dst_entry *) rt);
1591 		return ERR_PTR(-ENOMEM);
1592 	}
1593 
1594 	ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1595 	rt->rt6i_dst.plen = 128;
1596 
1597 	atomic_set(&rt->u.dst.__refcnt, 1);
1598 
1599 	return rt;
1600 }
1601 
1602 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1603 {
1604 	if (((void*)rt->rt6i_dev == arg || arg == NULL) &&
1605 	    rt != &ip6_null_entry) {
1606 		RT6_TRACE("deleted by ifdown %p\n", rt);
1607 		return -1;
1608 	}
1609 	return 0;
1610 }
1611 
1612 void rt6_ifdown(struct net_device *dev)
1613 {
1614 	write_lock_bh(&rt6_lock);
1615 	fib6_clean_tree(&ip6_routing_table, fib6_ifdown, 0, dev);
1616 	write_unlock_bh(&rt6_lock);
1617 }
1618 
/* Argument block handed to rt6_mtu_change_route() by rt6_mtu_change(). */
struct rt6_mtu_change_arg {
	struct net_device	*dev;	/* device whose MTU changed */
	unsigned int		mtu;	/* its new MTU */
};
1624 
1625 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
1626 {
1627 	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
1628 	struct inet6_dev *idev;
1629 
1630 	/* In IPv6 pmtu discovery is not optional,
1631 	   so that RTAX_MTU lock cannot disable it.
1632 	   We still use this lock to block changes
1633 	   caused by addrconf/ndisc.
1634 	*/
1635 
1636 	idev = __in6_dev_get(arg->dev);
1637 	if (idev == NULL)
1638 		return 0;
1639 
1640 	/* For administrative MTU increase, there is no way to discover
1641 	   IPv6 PMTU increase, so PMTU increase should be updated here.
1642 	   Since RFC 1981 doesn't include administrative MTU increase
1643 	   update PMTU increase is a MUST. (i.e. jumbo frame)
1644 	 */
1645 	/*
1646 	   If new MTU is less than route PMTU, this new MTU will be the
1647 	   lowest MTU in the path, update the route PMTU to reflect PMTU
1648 	   decreases; if new MTU is greater than route PMTU, and the
1649 	   old MTU is the lowest MTU in the path, update the route PMTU
1650 	   to reflect the increase. In this case if the other nodes' MTU
1651 	   also have the lowest MTU, TOO BIG MESSAGE will be lead to
1652 	   PMTU discouvery.
1653 	 */
1654 	if (rt->rt6i_dev == arg->dev &&
1655 	    !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1656             (dst_mtu(&rt->u.dst) > arg->mtu ||
1657              (dst_mtu(&rt->u.dst) < arg->mtu &&
1658 	      dst_mtu(&rt->u.dst) == idev->cnf.mtu6)))
1659 		rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
1660 	rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(arg->mtu);
1661 	return 0;
1662 }
1663 
1664 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
1665 {
1666 	struct rt6_mtu_change_arg arg;
1667 
1668 	arg.dev = dev;
1669 	arg.mtu = mtu;
1670 	read_lock_bh(&rt6_lock);
1671 	fib6_clean_tree(&ip6_routing_table, rt6_mtu_change_route, 0, &arg);
1672 	read_unlock_bh(&rt6_lock);
1673 }
1674 
1675 static int inet6_rtm_to_rtmsg(struct rtmsg *r, struct rtattr **rta,
1676 			      struct in6_rtmsg *rtmsg)
1677 {
1678 	memset(rtmsg, 0, sizeof(*rtmsg));
1679 
1680 	rtmsg->rtmsg_dst_len = r->rtm_dst_len;
1681 	rtmsg->rtmsg_src_len = r->rtm_src_len;
1682 	rtmsg->rtmsg_flags = RTF_UP;
1683 	if (r->rtm_type == RTN_UNREACHABLE)
1684 		rtmsg->rtmsg_flags |= RTF_REJECT;
1685 
1686 	if (rta[RTA_GATEWAY-1]) {
1687 		if (rta[RTA_GATEWAY-1]->rta_len != RTA_LENGTH(16))
1688 			return -EINVAL;
1689 		memcpy(&rtmsg->rtmsg_gateway, RTA_DATA(rta[RTA_GATEWAY-1]), 16);
1690 		rtmsg->rtmsg_flags |= RTF_GATEWAY;
1691 	}
1692 	if (rta[RTA_DST-1]) {
1693 		if (RTA_PAYLOAD(rta[RTA_DST-1]) < ((r->rtm_dst_len+7)>>3))
1694 			return -EINVAL;
1695 		memcpy(&rtmsg->rtmsg_dst, RTA_DATA(rta[RTA_DST-1]), ((r->rtm_dst_len+7)>>3));
1696 	}
1697 	if (rta[RTA_SRC-1]) {
1698 		if (RTA_PAYLOAD(rta[RTA_SRC-1]) < ((r->rtm_src_len+7)>>3))
1699 			return -EINVAL;
1700 		memcpy(&rtmsg->rtmsg_src, RTA_DATA(rta[RTA_SRC-1]), ((r->rtm_src_len+7)>>3));
1701 	}
1702 	if (rta[RTA_OIF-1]) {
1703 		if (rta[RTA_OIF-1]->rta_len != RTA_LENGTH(sizeof(int)))
1704 			return -EINVAL;
1705 		memcpy(&rtmsg->rtmsg_ifindex, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
1706 	}
1707 	if (rta[RTA_PRIORITY-1]) {
1708 		if (rta[RTA_PRIORITY-1]->rta_len != RTA_LENGTH(4))
1709 			return -EINVAL;
1710 		memcpy(&rtmsg->rtmsg_metric, RTA_DATA(rta[RTA_PRIORITY-1]), 4);
1711 	}
1712 	return 0;
1713 }
1714 
1715 int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1716 {
1717 	struct rtmsg *r = NLMSG_DATA(nlh);
1718 	struct in6_rtmsg rtmsg;
1719 
1720 	if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
1721 		return -EINVAL;
1722 	return ip6_route_del(&rtmsg, nlh, arg, &NETLINK_CB(skb));
1723 }
1724 
1725 int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1726 {
1727 	struct rtmsg *r = NLMSG_DATA(nlh);
1728 	struct in6_rtmsg rtmsg;
1729 
1730 	if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
1731 		return -EINVAL;
1732 	return ip6_route_add(&rtmsg, nlh, arg, &NETLINK_CB(skb));
1733 }
1734 
/* Per-call state shared between inet6_dump_fib() and rt6_dump_route(). */
struct rt6_rtnl_dump_arg {
	struct sk_buff		*skb;	/* buffer the dump is written into */
	struct netlink_callback	*cb;	/* netlink dump callback state */
};
1740 
1741 static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
1742 			 struct in6_addr *dst, struct in6_addr *src,
1743 			 int iif, int type, u32 pid, u32 seq,
1744 			 int prefix, unsigned int flags)
1745 {
1746 	struct rtmsg *rtm;
1747 	struct nlmsghdr  *nlh;
1748 	unsigned char	 *b = skb->tail;
1749 	struct rta_cacheinfo ci;
1750 
1751 	if (prefix) {	/* user wants prefix routes only */
1752 		if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
1753 			/* success since this is not a prefix route */
1754 			return 1;
1755 		}
1756 	}
1757 
1758 	nlh = NLMSG_NEW(skb, pid, seq, type, sizeof(*rtm), flags);
1759 	rtm = NLMSG_DATA(nlh);
1760 	rtm->rtm_family = AF_INET6;
1761 	rtm->rtm_dst_len = rt->rt6i_dst.plen;
1762 	rtm->rtm_src_len = rt->rt6i_src.plen;
1763 	rtm->rtm_tos = 0;
1764 	rtm->rtm_table = RT_TABLE_MAIN;
1765 	if (rt->rt6i_flags&RTF_REJECT)
1766 		rtm->rtm_type = RTN_UNREACHABLE;
1767 	else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
1768 		rtm->rtm_type = RTN_LOCAL;
1769 	else
1770 		rtm->rtm_type = RTN_UNICAST;
1771 	rtm->rtm_flags = 0;
1772 	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
1773 	rtm->rtm_protocol = rt->rt6i_protocol;
1774 	if (rt->rt6i_flags&RTF_DYNAMIC)
1775 		rtm->rtm_protocol = RTPROT_REDIRECT;
1776 	else if (rt->rt6i_flags & RTF_ADDRCONF)
1777 		rtm->rtm_protocol = RTPROT_KERNEL;
1778 	else if (rt->rt6i_flags&RTF_DEFAULT)
1779 		rtm->rtm_protocol = RTPROT_RA;
1780 
1781 	if (rt->rt6i_flags&RTF_CACHE)
1782 		rtm->rtm_flags |= RTM_F_CLONED;
1783 
1784 	if (dst) {
1785 		RTA_PUT(skb, RTA_DST, 16, dst);
1786 	        rtm->rtm_dst_len = 128;
1787 	} else if (rtm->rtm_dst_len)
1788 		RTA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
1789 #ifdef CONFIG_IPV6_SUBTREES
1790 	if (src) {
1791 		RTA_PUT(skb, RTA_SRC, 16, src);
1792 	        rtm->rtm_src_len = 128;
1793 	} else if (rtm->rtm_src_len)
1794 		RTA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
1795 #endif
1796 	if (iif)
1797 		RTA_PUT(skb, RTA_IIF, 4, &iif);
1798 	else if (dst) {
1799 		struct in6_addr saddr_buf;
1800 		if (ipv6_get_saddr(&rt->u.dst, dst, &saddr_buf) == 0)
1801 			RTA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
1802 	}
1803 	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
1804 		goto rtattr_failure;
1805 	if (rt->u.dst.neighbour)
1806 		RTA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
1807 	if (rt->u.dst.dev)
1808 		RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->rt6i_dev->ifindex);
1809 	RTA_PUT(skb, RTA_PRIORITY, 4, &rt->rt6i_metric);
1810 	ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
1811 	if (rt->rt6i_expires)
1812 		ci.rta_expires = jiffies_to_clock_t(rt->rt6i_expires - jiffies);
1813 	else
1814 		ci.rta_expires = 0;
1815 	ci.rta_used = rt->u.dst.__use;
1816 	ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
1817 	ci.rta_error = rt->u.dst.error;
1818 	ci.rta_id = 0;
1819 	ci.rta_ts = 0;
1820 	ci.rta_tsage = 0;
1821 	RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
1822 	nlh->nlmsg_len = skb->tail - b;
1823 	return skb->len;
1824 
1825 nlmsg_failure:
1826 rtattr_failure:
1827 	skb_trim(skb, b - skb->data);
1828 	return -1;
1829 }
1830 
1831 static int rt6_dump_route(struct rt6_info *rt, void *p_arg)
1832 {
1833 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
1834 	int prefix;
1835 
1836 	if (arg->cb->nlh->nlmsg_len >= NLMSG_LENGTH(sizeof(struct rtmsg))) {
1837 		struct rtmsg *rtm = NLMSG_DATA(arg->cb->nlh);
1838 		prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
1839 	} else
1840 		prefix = 0;
1841 
1842 	return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
1843 		     NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
1844 		     prefix, NLM_F_MULTI);
1845 }
1846 
1847 static int fib6_dump_node(struct fib6_walker_t *w)
1848 {
1849 	int res;
1850 	struct rt6_info *rt;
1851 
1852 	for (rt = w->leaf; rt; rt = rt->u.next) {
1853 		res = rt6_dump_route(rt, w->args);
1854 		if (res < 0) {
1855 			/* Frame is full, suspend walking */
1856 			w->leaf = rt;
1857 			return 1;
1858 		}
1859 		BUG_TRAP(res!=0);
1860 	}
1861 	w->leaf = NULL;
1862 	return 0;
1863 }
1864 
1865 static void fib6_dump_end(struct netlink_callback *cb)
1866 {
1867 	struct fib6_walker_t *w = (void*)cb->args[0];
1868 
1869 	if (w) {
1870 		cb->args[0] = 0;
1871 		fib6_walker_unlink(w);
1872 		kfree(w);
1873 	}
1874 	cb->done = (void*)cb->args[1];
1875 	cb->args[1] = 0;
1876 }
1877 
1878 static int fib6_dump_done(struct netlink_callback *cb)
1879 {
1880 	fib6_dump_end(cb);
1881 	return cb->done ? cb->done(cb) : 0;
1882 }
1883 
1884 int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
1885 {
1886 	struct rt6_rtnl_dump_arg arg;
1887 	struct fib6_walker_t *w;
1888 	int res;
1889 
1890 	arg.skb = skb;
1891 	arg.cb = cb;
1892 
1893 	w = (void*)cb->args[0];
1894 	if (w == NULL) {
1895 		/* New dump:
1896 		 *
1897 		 * 1. hook callback destructor.
1898 		 */
1899 		cb->args[1] = (long)cb->done;
1900 		cb->done = fib6_dump_done;
1901 
1902 		/*
1903 		 * 2. allocate and initialize walker.
1904 		 */
1905 		w = kzalloc(sizeof(*w), GFP_ATOMIC);
1906 		if (w == NULL)
1907 			return -ENOMEM;
1908 		RT6_TRACE("dump<%p", w);
1909 		w->root = &ip6_routing_table;
1910 		w->func = fib6_dump_node;
1911 		w->args = &arg;
1912 		cb->args[0] = (long)w;
1913 		read_lock_bh(&rt6_lock);
1914 		res = fib6_walk(w);
1915 		read_unlock_bh(&rt6_lock);
1916 	} else {
1917 		w->args = &arg;
1918 		read_lock_bh(&rt6_lock);
1919 		res = fib6_walk_continue(w);
1920 		read_unlock_bh(&rt6_lock);
1921 	}
1922 #if RT6_DEBUG >= 3
1923 	if (res <= 0 && skb->len == 0)
1924 		RT6_TRACE("%p>dump end\n", w);
1925 #endif
1926 	res = res < 0 ? res : skb->len;
1927 	/* res < 0 is an error. (really, impossible)
1928 	   res == 0 means that dump is complete, but skb still can contain data.
1929 	   res > 0 dump is not complete, but frame is full.
1930 	 */
1931 	/* Destroy walker, if dump of this table is complete. */
1932 	if (res <= 0)
1933 		fib6_dump_end(cb);
1934 	return res;
1935 }
1936 
1937 int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
1938 {
1939 	struct rtattr **rta = arg;
1940 	int iif = 0;
1941 	int err = -ENOBUFS;
1942 	struct sk_buff *skb;
1943 	struct flowi fl;
1944 	struct rt6_info *rt;
1945 
1946 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1947 	if (skb == NULL)
1948 		goto out;
1949 
1950 	/* Reserve room for dummy headers, this skb can pass
1951 	   through good chunk of routing engine.
1952 	 */
1953 	skb->mac.raw = skb->data;
1954 	skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
1955 
1956 	memset(&fl, 0, sizeof(fl));
1957 	if (rta[RTA_SRC-1])
1958 		ipv6_addr_copy(&fl.fl6_src,
1959 			       (struct in6_addr*)RTA_DATA(rta[RTA_SRC-1]));
1960 	if (rta[RTA_DST-1])
1961 		ipv6_addr_copy(&fl.fl6_dst,
1962 			       (struct in6_addr*)RTA_DATA(rta[RTA_DST-1]));
1963 
1964 	if (rta[RTA_IIF-1])
1965 		memcpy(&iif, RTA_DATA(rta[RTA_IIF-1]), sizeof(int));
1966 
1967 	if (iif) {
1968 		struct net_device *dev;
1969 		dev = __dev_get_by_index(iif);
1970 		if (!dev) {
1971 			err = -ENODEV;
1972 			goto out_free;
1973 		}
1974 	}
1975 
1976 	fl.oif = 0;
1977 	if (rta[RTA_OIF-1])
1978 		memcpy(&fl.oif, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
1979 
1980 	rt = (struct rt6_info*)ip6_route_output(NULL, &fl);
1981 
1982 	skb->dst = &rt->u.dst;
1983 
1984 	NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
1985 	err = rt6_fill_node(skb, rt,
1986 			    &fl.fl6_dst, &fl.fl6_src,
1987 			    iif,
1988 			    RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
1989 			    nlh->nlmsg_seq, 0, 0);
1990 	if (err < 0) {
1991 		err = -EMSGSIZE;
1992 		goto out_free;
1993 	}
1994 
1995 	err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
1996 	if (err > 0)
1997 		err = 0;
1998 out:
1999 	return err;
2000 out_free:
2001 	kfree_skb(skb);
2002 	goto out;
2003 }
2004 
2005 void inet6_rt_notify(int event, struct rt6_info *rt, struct nlmsghdr *nlh,
2006 			struct netlink_skb_parms *req)
2007 {
2008 	struct sk_buff *skb;
2009 	int size = NLMSG_SPACE(sizeof(struct rtmsg)+256);
2010 	u32 pid = current->pid;
2011 	u32 seq = 0;
2012 
2013 	if (req)
2014 		pid = req->pid;
2015 	if (nlh)
2016 		seq = nlh->nlmsg_seq;
2017 
2018 	skb = alloc_skb(size, gfp_any());
2019 	if (!skb) {
2020 		netlink_set_err(rtnl, 0, RTNLGRP_IPV6_ROUTE, ENOBUFS);
2021 		return;
2022 	}
2023 	if (rt6_fill_node(skb, rt, NULL, NULL, 0, event, pid, seq, 0, 0) < 0) {
2024 		kfree_skb(skb);
2025 		netlink_set_err(rtnl, 0, RTNLGRP_IPV6_ROUTE, EINVAL);
2026 		return;
2027 	}
2028 	NETLINK_CB(skb).dst_group = RTNLGRP_IPV6_ROUTE;
2029 	netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV6_ROUTE, gfp_any());
2030 }
2031 
2032 /*
2033  *	/proc
2034  */
2035 
2036 #ifdef CONFIG_PROC_FS
2037 
/* Length in bytes of one line of the ipv6_route proc file. */
#define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)

/* Cursor state for one read of the ipv6_route proc file. */
struct rt6_proc_arg {
	char	*buffer;	/* output buffer */
	int	offset;		/* file offset requested by the reader */
	int	length;		/* number of bytes the reader asked for */
	int	skip;		/* routes skipped so far to reach offset */
	int	len;		/* bytes written into buffer so far */
};
2048 
2049 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2050 {
2051 	struct rt6_proc_arg *arg = (struct rt6_proc_arg *) p_arg;
2052 	int i;
2053 
2054 	if (arg->skip < arg->offset / RT6_INFO_LEN) {
2055 		arg->skip++;
2056 		return 0;
2057 	}
2058 
2059 	if (arg->len >= arg->length)
2060 		return 0;
2061 
2062 	for (i=0; i<16; i++) {
2063 		sprintf(arg->buffer + arg->len, "%02x",
2064 			rt->rt6i_dst.addr.s6_addr[i]);
2065 		arg->len += 2;
2066 	}
2067 	arg->len += sprintf(arg->buffer + arg->len, " %02x ",
2068 			    rt->rt6i_dst.plen);
2069 
2070 #ifdef CONFIG_IPV6_SUBTREES
2071 	for (i=0; i<16; i++) {
2072 		sprintf(arg->buffer + arg->len, "%02x",
2073 			rt->rt6i_src.addr.s6_addr[i]);
2074 		arg->len += 2;
2075 	}
2076 	arg->len += sprintf(arg->buffer + arg->len, " %02x ",
2077 			    rt->rt6i_src.plen);
2078 #else
2079 	sprintf(arg->buffer + arg->len,
2080 		"00000000000000000000000000000000 00 ");
2081 	arg->len += 36;
2082 #endif
2083 
2084 	if (rt->rt6i_nexthop) {
2085 		for (i=0; i<16; i++) {
2086 			sprintf(arg->buffer + arg->len, "%02x",
2087 				rt->rt6i_nexthop->primary_key[i]);
2088 			arg->len += 2;
2089 		}
2090 	} else {
2091 		sprintf(arg->buffer + arg->len,
2092 			"00000000000000000000000000000000");
2093 		arg->len += 32;
2094 	}
2095 	arg->len += sprintf(arg->buffer + arg->len,
2096 			    " %08x %08x %08x %08x %8s\n",
2097 			    rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
2098 			    rt->u.dst.__use, rt->rt6i_flags,
2099 			    rt->rt6i_dev ? rt->rt6i_dev->name : "");
2100 	return 0;
2101 }
2102 
2103 static int rt6_proc_info(char *buffer, char **start, off_t offset, int length)
2104 {
2105 	struct rt6_proc_arg arg;
2106 	arg.buffer = buffer;
2107 	arg.offset = offset;
2108 	arg.length = length;
2109 	arg.skip = 0;
2110 	arg.len = 0;
2111 
2112 	read_lock_bh(&rt6_lock);
2113 	fib6_clean_tree(&ip6_routing_table, rt6_info_route, 0, &arg);
2114 	read_unlock_bh(&rt6_lock);
2115 
2116 	*start = buffer;
2117 	if (offset)
2118 		*start += offset % RT6_INFO_LEN;
2119 
2120 	arg.len -= offset % RT6_INFO_LEN;
2121 
2122 	if (arg.len > length)
2123 		arg.len = length;
2124 	if (arg.len < 0)
2125 		arg.len = 0;
2126 
2127 	return arg.len;
2128 }
2129 
2130 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2131 {
2132 	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2133 		      rt6_stats.fib_nodes, rt6_stats.fib_route_nodes,
2134 		      rt6_stats.fib_rt_alloc, rt6_stats.fib_rt_entries,
2135 		      rt6_stats.fib_rt_cache,
2136 		      atomic_read(&ip6_dst_ops.entries),
2137 		      rt6_stats.fib_discarded_routes);
2138 
2139 	return 0;
2140 }
2141 
2142 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2143 {
2144 	return single_open(file, rt6_stats_seq_show, NULL);
2145 }
2146 
2147 static struct file_operations rt6_stats_seq_fops = {
2148 	.owner	 = THIS_MODULE,
2149 	.open	 = rt6_stats_seq_open,
2150 	.read	 = seq_read,
2151 	.llseek	 = seq_lseek,
2152 	.release = single_release,
2153 };
2154 #endif	/* CONFIG_PROC_FS */
2155 
2156 #ifdef CONFIG_SYSCTL
2157 
2158 static int flush_delay;
2159 
2160 static
2161 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
2162 			      void __user *buffer, size_t *lenp, loff_t *ppos)
2163 {
2164 	if (write) {
2165 		proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2166 		fib6_run_gc(flush_delay <= 0 ? ~0UL : (unsigned long)flush_delay);
2167 		return 0;
2168 	} else
2169 		return -EINVAL;
2170 }
2171 
2172 ctl_table ipv6_route_table[] = {
2173         {
2174 		.ctl_name	=	NET_IPV6_ROUTE_FLUSH,
2175 		.procname	=	"flush",
2176          	.data		=	&flush_delay,
2177 		.maxlen		=	sizeof(int),
2178 		.mode		=	0200,
2179          	.proc_handler	=	&ipv6_sysctl_rtcache_flush
2180 	},
2181 	{
2182 		.ctl_name	=	NET_IPV6_ROUTE_GC_THRESH,
2183 		.procname	=	"gc_thresh",
2184          	.data		=	&ip6_dst_ops.gc_thresh,
2185 		.maxlen		=	sizeof(int),
2186 		.mode		=	0644,
2187          	.proc_handler	=	&proc_dointvec,
2188 	},
2189 	{
2190 		.ctl_name	=	NET_IPV6_ROUTE_MAX_SIZE,
2191 		.procname	=	"max_size",
2192          	.data		=	&ip6_rt_max_size,
2193 		.maxlen		=	sizeof(int),
2194 		.mode		=	0644,
2195          	.proc_handler	=	&proc_dointvec,
2196 	},
2197 	{
2198 		.ctl_name	=	NET_IPV6_ROUTE_GC_MIN_INTERVAL,
2199 		.procname	=	"gc_min_interval",
2200          	.data		=	&ip6_rt_gc_min_interval,
2201 		.maxlen		=	sizeof(int),
2202 		.mode		=	0644,
2203          	.proc_handler	=	&proc_dointvec_jiffies,
2204 		.strategy	=	&sysctl_jiffies,
2205 	},
2206 	{
2207 		.ctl_name	=	NET_IPV6_ROUTE_GC_TIMEOUT,
2208 		.procname	=	"gc_timeout",
2209          	.data		=	&ip6_rt_gc_timeout,
2210 		.maxlen		=	sizeof(int),
2211 		.mode		=	0644,
2212          	.proc_handler	=	&proc_dointvec_jiffies,
2213 		.strategy	=	&sysctl_jiffies,
2214 	},
2215 	{
2216 		.ctl_name	=	NET_IPV6_ROUTE_GC_INTERVAL,
2217 		.procname	=	"gc_interval",
2218          	.data		=	&ip6_rt_gc_interval,
2219 		.maxlen		=	sizeof(int),
2220 		.mode		=	0644,
2221          	.proc_handler	=	&proc_dointvec_jiffies,
2222 		.strategy	=	&sysctl_jiffies,
2223 	},
2224 	{
2225 		.ctl_name	=	NET_IPV6_ROUTE_GC_ELASTICITY,
2226 		.procname	=	"gc_elasticity",
2227          	.data		=	&ip6_rt_gc_elasticity,
2228 		.maxlen		=	sizeof(int),
2229 		.mode		=	0644,
2230          	.proc_handler	=	&proc_dointvec_jiffies,
2231 		.strategy	=	&sysctl_jiffies,
2232 	},
2233 	{
2234 		.ctl_name	=	NET_IPV6_ROUTE_MTU_EXPIRES,
2235 		.procname	=	"mtu_expires",
2236          	.data		=	&ip6_rt_mtu_expires,
2237 		.maxlen		=	sizeof(int),
2238 		.mode		=	0644,
2239          	.proc_handler	=	&proc_dointvec_jiffies,
2240 		.strategy	=	&sysctl_jiffies,
2241 	},
2242 	{
2243 		.ctl_name	=	NET_IPV6_ROUTE_MIN_ADVMSS,
2244 		.procname	=	"min_adv_mss",
2245          	.data		=	&ip6_rt_min_advmss,
2246 		.maxlen		=	sizeof(int),
2247 		.mode		=	0644,
2248          	.proc_handler	=	&proc_dointvec_jiffies,
2249 		.strategy	=	&sysctl_jiffies,
2250 	},
2251 	{
2252 		.ctl_name	=	NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,
2253 		.procname	=	"gc_min_interval_ms",
2254          	.data		=	&ip6_rt_gc_min_interval,
2255 		.maxlen		=	sizeof(int),
2256 		.mode		=	0644,
2257          	.proc_handler	=	&proc_dointvec_ms_jiffies,
2258 		.strategy	=	&sysctl_ms_jiffies,
2259 	},
2260 	{ .ctl_name = 0 }
2261 };
2262 
2263 #endif
2264 
2265 void __init ip6_route_init(void)
2266 {
2267 	struct proc_dir_entry *p;
2268 
2269 	ip6_dst_ops.kmem_cachep = kmem_cache_create("ip6_dst_cache",
2270 						     sizeof(struct rt6_info),
2271 						     0, SLAB_HWCACHE_ALIGN,
2272 						     NULL, NULL);
2273 	if (!ip6_dst_ops.kmem_cachep)
2274 		panic("cannot create ip6_dst_cache");
2275 
2276 	fib6_init();
2277 #ifdef 	CONFIG_PROC_FS
2278 	p = proc_net_create("ipv6_route", 0, rt6_proc_info);
2279 	if (p)
2280 		p->owner = THIS_MODULE;
2281 
2282 	proc_net_fops_create("rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2283 #endif
2284 #ifdef CONFIG_XFRM
2285 	xfrm6_init();
2286 #endif
2287 }
2288 
2289 void ip6_route_cleanup(void)
2290 {
2291 #ifdef CONFIG_PROC_FS
2292 	proc_net_remove("ipv6_route");
2293 	proc_net_remove("rt6_stats");
2294 #endif
2295 #ifdef CONFIG_XFRM
2296 	xfrm6_fini();
2297 #endif
2298 	rt6_ifdown(NULL);
2299 	fib6_gc_cleanup();
2300 	kmem_cache_destroy(ip6_dst_ops.kmem_cachep);
2301 }
2302