xref: /openbmc/linux/net/ipv6/route.c (revision ab364a6f96bad9625bdb97b5688c76c44eb1e96e)
1 /*
2  *	Linux INET6 implementation
3  *	FIB front-end.
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	$Id: route.c,v 1.56 2001/10/31 21:55:55 davem Exp $
9  *
10  *	This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  */
15 
16 /*	Changes:
17  *
18  *	YOSHIFUJI Hideaki @USAGI
19  *		reworked default router selection.
20  *		- respect outgoing interface
21  *		- select from (probably) reachable routers (i.e.
22  *		routers in REACHABLE, STALE, DELAY or PROBE states).
23  *		- always select the same router if it is (probably)
24  *		reachable.  otherwise, round-robin the list.
25  */
26 
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/init.h>
38 #include <linux/if_arp.h>
39 
40 #ifdef 	CONFIG_PROC_FS
41 #include <linux/proc_fs.h>
42 #include <linux/seq_file.h>
43 #endif
44 
45 #include <net/snmp.h>
46 #include <net/ipv6.h>
47 #include <net/ip6_fib.h>
48 #include <net/ip6_route.h>
49 #include <net/ndisc.h>
50 #include <net/addrconf.h>
51 #include <net/tcp.h>
52 #include <linux/rtnetlink.h>
53 #include <net/dst.h>
54 #include <net/xfrm.h>
55 #include <net/netevent.h>
56 #include <net/netlink.h>
57 
58 #include <asm/uaccess.h>
59 
60 #ifdef CONFIG_SYSCTL
61 #include <linux/sysctl.h>
62 #endif
63 
/*
 * Debug level for this file.  Set to 3 to get tracing: at level 3 or
 * above RDBG() and RT6_TRACE() expand to printk() calls, below that
 * they compile to nothing.
 */
#define RT6_DEBUG 2

#if RT6_DEBUG >= 3
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#else
#define RDBG(x)
#define RT6_TRACE(x...) do { ; } while (0)
#endif

/*
 * When non-zero, ip6_pol_route_input()/ip6_pol_route_output() create a
 * host clone via rt6_alloc_clone() for routes that already have a nexthop
 * (or are RTF_NONEXTHOP); when zero such routes are returned unchanged.
 */
#define CLONE_OFFLINK_ROUTE 0

/*
 * Flag bits for the "strict" argument of rt6_select()/rt6_score_route():
 *   RT6_SELECT_F_IFACE     - reject routes whose device does not match oif
 *   RT6_SELECT_F_REACHABLE - reject routes for which rt6_check_neigh()
 *                            returns 0
 */
#define RT6_SELECT_F_IFACE	0x1
#define RT6_SELECT_F_REACHABLE	0x2
79 
/* Tunables for route garbage collection and MTU handling (times in jiffies). */
/* ip6_dst_gc() reports pressure while the entry count exceeds this. */
static int ip6_rt_max_size = 4096;
/* Minimum spacing between GC runs while below ip6_rt_max_size. */
static int ip6_rt_gc_min_interval = HZ / 2;
/* ip6_dst_gc() resets its expiry to half of this once below gc_thresh. */
static int ip6_rt_gc_timeout = 60*HZ;
/* Not static; not referenced in this part of the file (presumably used by the fib6 GC timer — TODO confirm). */
int ip6_rt_gc_interval = 30*HZ;
/* Shift count used by ip6_dst_gc() to decay its expiry on every call. */
static int ip6_rt_gc_elasticity = 9;
/* Not referenced in this part of the file (presumably PMTU entry lifetime — TODO confirm). */
static int ip6_rt_mtu_expires = 10*60*HZ;
/* Lower bound applied by ipv6_advmss(). */
static int ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
87 
/*
 * Forward declarations: these functions are referenced (by ip6_dst_ops,
 * the static route entries and the lookup code) before they are defined.
 */
static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void		ip6_dst_destroy(struct dst_entry *);
static void		ip6_dst_ifdown(struct dst_entry *,
				       struct net_device *dev, int how);
static int		 ip6_dst_gc(void);

static int		ip6_pkt_discard(struct sk_buff *skb);
static int		ip6_pkt_discard_out(struct sk_buff *skb);
static void		ip6_link_failure(struct sk_buff *skb);
static void		ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);

/* Helpers used by rt6_route_rcv(); only built with route-info support. */
#ifdef CONFIG_IPV6_ROUTE_INFO
static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
					   struct in6_addr *gwaddr, int ifindex,
					   unsigned pref);
static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
					   struct in6_addr *gwaddr, int ifindex);
#endif
108 
/*
 * Destination-cache operations for IPv6 routes.  Connects the dst
 * callbacks to the handlers defined in this file; ip6_dst_alloc()
 * allocates every rt6_info against these ops.
 */
static struct dst_ops ip6_dst_ops = {
	.family			=	AF_INET6,
	.protocol		=	__constant_htons(ETH_P_IPV6),
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,	/* compared against the entry count in ip6_dst_gc() */
	.check			=	ip6_dst_check,
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
	.entry_size		=	sizeof(struct rt6_info),
};
122 
/*
 * Statically allocated reject route.  The lookup helpers in this file
 * (rt6_device_match(), rt6_select()) return it when nothing matches.
 * Packets routed through it are discarded and dst.error is -ENETUNREACH.
 */
struct rt6_info ip6_null_entry = {
	.u = {
		.dst = {
			.__refcnt	= ATOMIC_INIT(1),
			.__use		= 1,
			.dev		= &loopback_dev,
			.obsolete	= -1,
			.error		= -ENETUNREACH,
			.metrics	= { [RTAX_HOPLIMIT - 1] = 255, },
			.input		= ip6_pkt_discard,
			.output		= ip6_pkt_discard_out,
			.ops		= &ip6_dst_ops,
			/* the entry is its own path */
			.path		= (struct dst_entry*)&ip6_null_entry,
		}
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};
142 
143 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
144 
/*
 * Static reject route with dst.error = -EACCES; otherwise identical in
 * layout to ip6_null_entry.  Only built with CONFIG_IPV6_MULTIPLE_TABLES;
 * not referenced in this part of the file (presumably returned by
 * policy-routing rules — TODO confirm).
 */
struct rt6_info ip6_prohibit_entry = {
	.u = {
		.dst = {
			.__refcnt	= ATOMIC_INIT(1),
			.__use		= 1,
			.dev		= &loopback_dev,
			.obsolete	= -1,
			.error		= -EACCES,
			.metrics	= { [RTAX_HOPLIMIT - 1] = 255, },
			.input		= ip6_pkt_discard,
			.output		= ip6_pkt_discard_out,
			.ops		= &ip6_dst_ops,
			.path		= (struct dst_entry*)&ip6_prohibit_entry,
		}
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};
164 
/*
 * Static reject route with dst.error = -EINVAL; otherwise identical in
 * layout to ip6_null_entry.  Only built with CONFIG_IPV6_MULTIPLE_TABLES;
 * not referenced in this part of the file (presumably the "blackhole"
 * policy-rule target — TODO confirm).
 */
struct rt6_info ip6_blk_hole_entry = {
	.u = {
		.dst = {
			.__refcnt	= ATOMIC_INIT(1),
			.__use		= 1,
			.dev		= &loopback_dev,
			.obsolete	= -1,
			.error		= -EINVAL,
			.metrics	= { [RTAX_HOPLIMIT - 1] = 255, },
			.input		= ip6_pkt_discard,
			.output		= ip6_pkt_discard_out,
			.ops		= &ip6_dst_ops,
			.path		= (struct dst_entry*)&ip6_blk_hole_entry,
		}
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};
184 
185 #endif
186 
187 /* allocate dst with ip6_dst_ops */
188 static __inline__ struct rt6_info *ip6_dst_alloc(void)
189 {
190 	return (struct rt6_info *)dst_alloc(&ip6_dst_ops);
191 }
192 
193 static void ip6_dst_destroy(struct dst_entry *dst)
194 {
195 	struct rt6_info *rt = (struct rt6_info *)dst;
196 	struct inet6_dev *idev = rt->rt6i_idev;
197 
198 	if (idev != NULL) {
199 		rt->rt6i_idev = NULL;
200 		in6_dev_put(idev);
201 	}
202 }
203 
204 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
205 			   int how)
206 {
207 	struct rt6_info *rt = (struct rt6_info *)dst;
208 	struct inet6_dev *idev = rt->rt6i_idev;
209 
210 	if (dev != &loopback_dev && idev != NULL && idev->dev == dev) {
211 		struct inet6_dev *loopback_idev = in6_dev_get(&loopback_dev);
212 		if (loopback_idev != NULL) {
213 			rt->rt6i_idev = loopback_idev;
214 			in6_dev_put(idev);
215 		}
216 	}
217 }
218 
219 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
220 {
221 	return (rt->rt6i_flags & RTF_EXPIRES &&
222 		time_after(jiffies, rt->rt6i_expires));
223 }
224 
225 static inline int rt6_need_strict(struct in6_addr *daddr)
226 {
227 	return (ipv6_addr_type(daddr) &
228 		(IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL));
229 }
230 
231 /*
232  *	Route lookup. Any table->tb6_lock is implied.
233  */
234 
235 static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt,
236 						    int oif,
237 						    int strict)
238 {
239 	struct rt6_info *local = NULL;
240 	struct rt6_info *sprt;
241 
242 	if (oif) {
243 		for (sprt = rt; sprt; sprt = sprt->u.next) {
244 			struct net_device *dev = sprt->rt6i_dev;
245 			if (dev->ifindex == oif)
246 				return sprt;
247 			if (dev->flags & IFF_LOOPBACK) {
248 				if (sprt->rt6i_idev == NULL ||
249 				    sprt->rt6i_idev->dev->ifindex != oif) {
250 					if (strict && oif)
251 						continue;
252 					if (local && (!oif ||
253 						      local->rt6i_idev->dev->ifindex == oif))
254 						continue;
255 				}
256 				local = sprt;
257 			}
258 		}
259 
260 		if (local)
261 			return local;
262 
263 		if (strict)
264 			return &ip6_null_entry;
265 	}
266 	return rt;
267 }
268 
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Send a Neighbour Solicitation towards the nexthop of @rt when that
 * neighbour is not in a NUD_VALID state and more than
 * cnf.rtr_probe_interval jiffies have passed since neigh->updated.
 * A NULL @rt or a route without a nexthop is ignored.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	/* Unlocked early exit: no neighbour, or it is already valid. */
	if (!neigh || (neigh->nud_state & NUD_VALID))
		return;
	/* Re-check the state and the rate limit under the neighbour lock. */
	read_lock_bh(&neigh->lock);
	if (!(neigh->nud_state & NUD_VALID) &&
	    time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
		struct in6_addr mcaddr;
		struct in6_addr *target;

		/*
		 * NOTE(review): neigh->updated is written while only the
		 * read side of neigh->lock is held — confirm this is intended.
		 */
		neigh->updated = jiffies;
		read_unlock_bh(&neigh->lock);

		/* The lock is dropped before the solicitation is sent. */
		target = (struct in6_addr *)&neigh->primary_key;
		addrconf_addr_solict_mult(target, &mcaddr);
		ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
	} else
		read_unlock_bh(&neigh->lock);
}
#else
/* Router preference support disabled: probing is a no-op. */
static inline void rt6_probe(struct rt6_info *rt)
{
	return;
}
#endif
304 
305 /*
306  * Default Router Selection (RFC 2461 6.3.6)
307  */
308 static int inline rt6_check_dev(struct rt6_info *rt, int oif)
309 {
310 	struct net_device *dev = rt->rt6i_dev;
311 	if (!oif || dev->ifindex == oif)
312 		return 2;
313 	if ((dev->flags & IFF_LOOPBACK) &&
314 	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
315 		return 1;
316 	return 0;
317 }
318 
319 static int inline rt6_check_neigh(struct rt6_info *rt)
320 {
321 	struct neighbour *neigh = rt->rt6i_nexthop;
322 	int m = 0;
323 	if (rt->rt6i_flags & RTF_NONEXTHOP ||
324 	    !(rt->rt6i_flags & RTF_GATEWAY))
325 		m = 1;
326 	else if (neigh) {
327 		read_lock_bh(&neigh->lock);
328 		if (neigh->nud_state & NUD_VALID)
329 			m = 2;
330 		read_unlock_bh(&neigh->lock);
331 	}
332 	return m;
333 }
334 
335 static int rt6_score_route(struct rt6_info *rt, int oif,
336 			   int strict)
337 {
338 	int m, n;
339 
340 	m = rt6_check_dev(rt, oif);
341 	if (!m && (strict & RT6_SELECT_F_IFACE))
342 		return -1;
343 #ifdef CONFIG_IPV6_ROUTER_PREF
344 	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
345 #endif
346 	n = rt6_check_neigh(rt);
347 	if (n > 1)
348 		m |= 16;
349 	else if (!n && strict & RT6_SELECT_F_REACHABLE)
350 		return -1;
351 	return m;
352 }
353 
/*
 * Select the best route among the entries at *@head that share the
 * metric of the first entry.
 *
 * Every non-expired candidate is scored with rt6_score_route(); the
 * highest score wins.  Candidates that lose (including a previous best
 * that gets replaced) are passed to rt6_probe().  If nothing matched and
 * RT6_SELECT_F_REACHABLE is set in @strict, the first entry is moved
 * behind the last examined one so a later call starts with a different
 * entry.
 *
 * Returns the selected route, or &ip6_null_entry when none qualified.
 * No reference is taken here.
 */
static struct rt6_info *rt6_select(struct rt6_info **head, int oif,
				   int strict)
{
	struct rt6_info *match = NULL, *last = NULL;
	struct rt6_info *rt, *rt0 = *head;
	u32 metric;
	int mpri = -1;	/* best score so far; -1 means "none yet" */

	RT6_TRACE("%s(head=%p(*head=%p), oif=%d)\n",
		  __FUNCTION__, head, head ? *head : NULL, oif);

	/*
	 * Walk only entries with the same metric as the first one; stop
	 * if the walk gets back to rt0 after at least one candidate was
	 * examined.
	 */
	for (rt = rt0, metric = rt0->rt6i_metric;
	     rt && rt->rt6i_metric == metric && (!last || rt != rt0);
	     rt = rt->u.next) {
		int m;

		if (rt6_check_expired(rt))
			continue;

		/* last non-expired entry seen, used for the rotation below */
		last = rt;

		m = rt6_score_route(rt, oif, strict);
		if (m < 0)
			continue;

		if (m > mpri) {
			/* new best: probe the one it displaces (NULL is ignored) */
			rt6_probe(match);
			match = rt;
			mpri = m;
		} else {
			rt6_probe(rt);
		}
	}

	if (!match &&
	    (strict & RT6_SELECT_F_REACHABLE) &&
	    last && last != rt0) {
		/* no entries matched; do round-robin */
		/*
		 * NOTE(review): the list is relinked under this local
		 * spinlock only; confirm which table lock callers hold.
		 */
		static DEFINE_SPINLOCK(lock);
		spin_lock(&lock);
		*head = rt0->u.next;
		rt0->u.next = last->u.next;
		last->u.next = rt0;
		spin_unlock(&lock);
	}

	RT6_TRACE("%s() => %p, score=%d\n",
		  __FUNCTION__, match, mpri);

	return (match ? match : &ip6_null_entry);
}
405 
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * rt6_route_rcv - process a route information option.
 * @dev:    device whose ifindex keys the route lookup / addition
 * @opt:    start of the option, interpreted as struct route_info
 * @len:    number of bytes available at @opt
 * @gwaddr: gateway address for the route
 *
 * Looks up the matching route-info route.  A zero lifetime deletes an
 * existing route; a non-zero lifetime creates the route when it is
 * missing, otherwise its preference bits are refreshed.  The expiry is
 * then set from the lifetime (0xffffffff means "never expires").
 *
 * Returns 0 on success, -EINVAL when the option is malformed.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  struct in6_addr *gwaddr)
{
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	u32 lifetime;
	struct rt6_info *rt;

	/* A negative len must not slip through the unsigned comparison. */
	if (len < 0 || (size_t)len < sizeof(struct route_info))
		return -EINVAL;

	/*
	 * Sanity check for prefix_len and length.  length is in units of
	 * 8 octets and includes the 8-octet fixed part, so (RFC 4191, 2.3)
	 * a prefix of up to 64 bits needs length >= 2 and a longer prefix
	 * needs length == 3.  Anything shorter would make the prefix copy
	 * below read past the end of the option.
	 */
	if (rinfo->length > 3 || rinfo->prefix_len > 128)
		return -EINVAL;
	if (rinfo->prefix_len > 64 && rinfo->length < 3)
		return -EINVAL;
	if (rinfo->prefix_len > 0 && rinfo->length < 2)
		return -EINVAL;
	/* The option must not claim more data than the caller supplied. */
	if ((rinfo->length << 3) > len)
		return -EINVAL;

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		pref = ICMPV6_ROUTER_PREF_MEDIUM;

	/* lifetime arrives in network byte order */
	lifetime = ntohl(rinfo->lifetime);
	if (lifetime == 0xffffffff) {
		/* infinity */
	} else if (lifetime > 0x7fffffff/HZ) {
		/* Avoid arithmetic overflow */
		lifetime = 0x7fffffff/HZ - 1;
	}

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* copies only prefix_len bits, which the checks above bound */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	rt = rt6_get_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex);

	/* Zero lifetime withdraws an existing route. */
	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
					pref);
	else if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (lifetime == 0xffffffff) {
			rt->rt6i_flags &= ~RTF_EXPIRES;
		} else {
			rt->rt6i_expires = jiffies + HZ * lifetime;
			rt->rt6i_flags |= RTF_EXPIRES;
		}
		/* drop the reference obtained from the get/add helper */
		dst_release(&rt->u.dst);
	}
	return 0;
}
#endif
483 
484 #define BACKTRACK() \
485 if (rt == &ip6_null_entry && flags & RT6_F_STRICT) { \
486 	while ((fn = fn->parent) != NULL) { \
487 		if (fn->fn_flags & RTN_TL_ROOT) { \
488 			dst_hold(&rt->u.dst); \
489 			goto out; \
490 		} \
491 		if (fn->fn_flags & RTN_RTINFO) \
492 			goto restart; \
493 	} \
494 }
495 
496 static struct rt6_info *ip6_pol_route_lookup(struct fib6_table *table,
497 					     struct flowi *fl, int flags)
498 {
499 	struct fib6_node *fn;
500 	struct rt6_info *rt;
501 
502 	read_lock_bh(&table->tb6_lock);
503 	fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
504 restart:
505 	rt = fn->leaf;
506 	rt = rt6_device_match(rt, fl->oif, flags & RT6_F_STRICT);
507 	BACKTRACK();
508 	dst_hold(&rt->u.dst);
509 out:
510 	read_unlock_bh(&table->tb6_lock);
511 
512 	rt->u.dst.lastuse = jiffies;
513 	rt->u.dst.__use++;
514 
515 	return rt;
516 
517 }
518 
519 struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
520 			    int oif, int strict)
521 {
522 	struct flowi fl = {
523 		.oif = oif,
524 		.nl_u = {
525 			.ip6_u = {
526 				.daddr = *daddr,
527 				/* TODO: saddr */
528 			},
529 		},
530 	};
531 	struct dst_entry *dst;
532 	int flags = strict ? RT6_F_STRICT : 0;
533 
534 	dst = fib6_rule_lookup(&fl, flags, ip6_pol_route_lookup);
535 	if (dst->error == 0)
536 		return (struct rt6_info *) dst;
537 
538 	dst_release(dst);
539 
540 	return NULL;
541 }
542 
/* ip6_ins_rt is called with table->tb6_lock NOT held.
   It takes a new route entry; if the addition fails for any reason,
   the route is freed. In any case, if the caller does not hold a
   reference to it, the route may be destroyed.
 */
548 
549 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
550 {
551 	int err;
552 	struct fib6_table *table;
553 
554 	table = rt->rt6i_table;
555 	write_lock_bh(&table->tb6_lock);
556 	err = fib6_add(&table->tb6_root, rt, info);
557 	write_unlock_bh(&table->tb6_lock);
558 
559 	return err;
560 }
561 
562 int ip6_ins_rt(struct rt6_info *rt)
563 {
564 	return __ip6_ins_rt(rt, NULL);
565 }
566 
567 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
568 				      struct in6_addr *saddr)
569 {
570 	struct rt6_info *rt;
571 
572 	/*
573 	 *	Clone the route.
574 	 */
575 
576 	rt = ip6_rt_copy(ort);
577 
578 	if (rt) {
579 		if (!(rt->rt6i_flags&RTF_GATEWAY)) {
580 			if (rt->rt6i_dst.plen != 128 &&
581 			    ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
582 				rt->rt6i_flags |= RTF_ANYCAST;
583 			ipv6_addr_copy(&rt->rt6i_gateway, daddr);
584 		}
585 
586 		ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
587 		rt->rt6i_dst.plen = 128;
588 		rt->rt6i_flags |= RTF_CACHE;
589 		rt->u.dst.flags |= DST_HOST;
590 
591 #ifdef CONFIG_IPV6_SUBTREES
592 		if (rt->rt6i_src.plen && saddr) {
593 			ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
594 			rt->rt6i_src.plen = 128;
595 		}
596 #endif
597 
598 		rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
599 
600 	}
601 
602 	return rt;
603 }
604 
605 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
606 {
607 	struct rt6_info *rt = ip6_rt_copy(ort);
608 	if (rt) {
609 		ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
610 		rt->rt6i_dst.plen = 128;
611 		rt->rt6i_flags |= RTF_CACHE;
612 		if (rt->rt6i_flags & RTF_REJECT)
613 			rt->u.dst.error = ort->u.dst.error;
614 		rt->u.dst.flags |= DST_HOST;
615 		rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
616 	}
617 	return rt;
618 }
619 
/*
 * Lookup callback passed to fib6_rule_lookup() by ip6_route_input().
 * Resolves flow @fl in @table, selecting routes against the incoming
 * interface fl->iif.
 *
 * The first pass asks rt6_select() for reachable routers only; the
 * `out` path retries once without that requirement.  A selected route
 * that has no nexthop yet (and is not RTF_NONEXTHOP) is copied into a
 * host cache entry with rt6_alloc_cow() and inserted; if the insertion
 * fails the lookup is repeated, up to three attempts in total.
 *
 * Returns a route (possibly &ip6_null_entry) after dst_hold() has been
 * called on it and its usage counters were updated.
 */
static struct rt6_info *ip6_pol_route_input(struct fib6_table *table,
					    struct flowi *fl, int flags)
{
	struct fib6_node *fn;
	struct rt6_info *rt, *nrt;
	int strict = 0;
	int attempts = 3;	/* bound on relookups after a failed insert */
	int err;
	int reachable = RT6_SELECT_F_REACHABLE;

	if (flags & RT6_F_STRICT)
		strict = RT6_SELECT_F_IFACE;

relookup:
	read_lock_bh(&table->tb6_lock);

restart_2:
	fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);

restart:
	rt = rt6_select(&fn->leaf, fl->iif, strict | reachable);
	/* may jump to `restart` (parent node with routes) or `out` (tree root) */
	BACKTRACK();
	/*
	 * NOTE(review): an RTF_CACHE hit also goes through `out`, so on the
	 * first pass it triggers the non-reachable retry — confirm intended.
	 */
	if (rt == &ip6_null_entry ||
	    rt->rt6i_flags & RTF_CACHE)
		goto out;

	/* Pin the route before dropping the table lock. */
	dst_hold(&rt->u.dst);
	read_unlock_bh(&table->tb6_lock);

	if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
		nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
	else {
#if CLONE_OFFLINK_ROUTE
		nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
#else
		/* use the route as-is; the reference taken above is returned */
		goto out2;
#endif
	}

	/* Swap the original for the copy (or the null entry on failure). */
	dst_release(&rt->u.dst);
	rt = nrt ? : &ip6_null_entry;

	dst_hold(&rt->u.dst);
	if (nrt) {
		err = ip6_ins_rt(nrt);
		if (!err)
			goto out2;
	}

	if (--attempts <= 0)
		goto out2;

	/*
	 * Race condition! In the gap, when table->tb6_lock was
	 * released someone could insert this route.  Relookup.
	 */
	dst_release(&rt->u.dst);
	goto relookup;

out:
	/* Retry once without insisting on a reachable router. */
	if (reachable) {
		reachable = 0;
		goto restart_2;
	}
	dst_hold(&rt->u.dst);
	read_unlock_bh(&table->tb6_lock);
out2:
	rt->u.dst.lastuse = jiffies;
	rt->u.dst.__use++;

	return rt;
}
692 
693 void ip6_route_input(struct sk_buff *skb)
694 {
695 	struct ipv6hdr *iph = skb->nh.ipv6h;
696 	struct flowi fl = {
697 		.iif = skb->dev->ifindex,
698 		.nl_u = {
699 			.ip6_u = {
700 				.daddr = iph->daddr,
701 				.saddr = iph->saddr,
702 				.flowlabel = (* (u32 *) iph)&IPV6_FLOWINFO_MASK,
703 			},
704 		},
705 		.proto = iph->nexthdr,
706 	};
707 	int flags = 0;
708 
709 	if (rt6_need_strict(&iph->daddr))
710 		flags |= RT6_F_STRICT;
711 
712 	skb->dst = fib6_rule_lookup(&fl, flags, ip6_pol_route_input);
713 }
714 
/*
 * Lookup callback passed to fib6_rule_lookup() by ip6_route_output().
 * Resolves flow @fl in @table, selecting routes against the outgoing
 * interface fl->oif.
 *
 * The first pass asks rt6_select() for reachable routers only; the
 * `out` path retries once without that requirement.  A selected route
 * that has no nexthop yet (and is not RTF_NONEXTHOP) is copied into a
 * host cache entry with rt6_alloc_cow() and inserted; if the insertion
 * fails the lookup is repeated, up to three attempts in total.
 *
 * Returns a route (possibly &ip6_null_entry) after dst_hold() has been
 * called on it and its usage counters were updated.
 */
static struct rt6_info *ip6_pol_route_output(struct fib6_table *table,
					     struct flowi *fl, int flags)
{
	struct fib6_node *fn;
	struct rt6_info *rt, *nrt;
	int strict = 0;
	int attempts = 3;	/* bound on relookups after a failed insert */
	int err;
	int reachable = RT6_SELECT_F_REACHABLE;

	if (flags & RT6_F_STRICT)
		strict = RT6_SELECT_F_IFACE;

relookup:
	read_lock_bh(&table->tb6_lock);

restart_2:
	fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);

restart:
	rt = rt6_select(&fn->leaf, fl->oif, strict | reachable);
	/* may jump to `restart` (parent node with routes) or `out` (tree root) */
	BACKTRACK();
	/*
	 * NOTE(review): an RTF_CACHE hit also goes through `out`, so on the
	 * first pass it triggers the non-reachable retry — confirm intended.
	 */
	if (rt == &ip6_null_entry ||
	    rt->rt6i_flags & RTF_CACHE)
		goto out;

	/* Pin the route before dropping the table lock. */
	dst_hold(&rt->u.dst);
	read_unlock_bh(&table->tb6_lock);

	if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
		nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
	else {
#if CLONE_OFFLINK_ROUTE
		nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
#else
		/* use the route as-is; the reference taken above is returned */
		goto out2;
#endif
	}

	/* Swap the original for the copy (or the null entry on failure). */
	dst_release(&rt->u.dst);
	rt = nrt ? : &ip6_null_entry;

	dst_hold(&rt->u.dst);
	if (nrt) {
		err = ip6_ins_rt(nrt);
		if (!err)
			goto out2;
	}

	if (--attempts <= 0)
		goto out2;

	/*
	 * Race condition! In the gap, when table->tb6_lock was
	 * released someone could insert this route.  Relookup.
	 */
	dst_release(&rt->u.dst);
	goto relookup;

out:
	/* Retry once without insisting on a reachable router. */
	if (reachable) {
		reachable = 0;
		goto restart_2;
	}
	dst_hold(&rt->u.dst);
	read_unlock_bh(&table->tb6_lock);
out2:
	rt->u.dst.lastuse = jiffies;
	rt->u.dst.__use++;
	return rt;
}
786 
787 struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl)
788 {
789 	int flags = 0;
790 
791 	if (rt6_need_strict(&fl->fl6_dst))
792 		flags |= RT6_F_STRICT;
793 
794 	return fib6_rule_lookup(fl, flags, ip6_pol_route_output);
795 }
796 
797 
798 /*
799  *	Destination cache support functions
800  */
801 
802 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
803 {
804 	struct rt6_info *rt;
805 
806 	rt = (struct rt6_info *) dst;
807 
808 	if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
809 		return dst;
810 
811 	return NULL;
812 }
813 
814 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
815 {
816 	struct rt6_info *rt = (struct rt6_info *) dst;
817 
818 	if (rt) {
819 		if (rt->rt6i_flags & RTF_CACHE)
820 			ip6_del_rt(rt);
821 		else
822 			dst_release(dst);
823 	}
824 	return NULL;
825 }
826 
827 static void ip6_link_failure(struct sk_buff *skb)
828 {
829 	struct rt6_info *rt;
830 
831 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
832 
833 	rt = (struct rt6_info *) skb->dst;
834 	if (rt) {
835 		if (rt->rt6i_flags&RTF_CACHE) {
836 			dst_set_expires(&rt->u.dst, 0);
837 			rt->rt6i_flags |= RTF_EXPIRES;
838 		} else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
839 			rt->rt6i_node->fn_sernum = -1;
840 	}
841 }
842 
843 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
844 {
845 	struct rt6_info *rt6 = (struct rt6_info*)dst;
846 
847 	if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
848 		rt6->rt6i_flags |= RTF_MODIFIED;
849 		if (mtu < IPV6_MIN_MTU) {
850 			mtu = IPV6_MIN_MTU;
851 			dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
852 		}
853 		dst->metrics[RTAX_MTU-1] = mtu;
854 		call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
855 	}
856 }
857 
858 static int ipv6_get_mtu(struct net_device *dev);
859 
860 static inline unsigned int ipv6_advmss(unsigned int mtu)
861 {
862 	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
863 
864 	if (mtu < ip6_rt_min_advmss)
865 		mtu = ip6_rt_min_advmss;
866 
867 	/*
868 	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
869 	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
870 	 * IPV6_MAXPLEN is also valid and means: "any MSS,
871 	 * rely only on pmtu discovery"
872 	 */
873 	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
874 		mtu = IPV6_MAXPLEN;
875 	return mtu;
876 }
877 
/*
 * Singly linked list (through dst->next) of the entries created by
 * ndisc_dst_alloc(); ndisc_dst_gc() frees those whose refcount reached
 * zero.  ndisc_lock protects the list.
 */
static struct dst_entry *ndisc_dst_gc_list;
static DEFINE_SPINLOCK(ndisc_lock);
880 
881 struct dst_entry *ndisc_dst_alloc(struct net_device *dev,
882 				  struct neighbour *neigh,
883 				  struct in6_addr *addr,
884 				  int (*output)(struct sk_buff *))
885 {
886 	struct rt6_info *rt;
887 	struct inet6_dev *idev = in6_dev_get(dev);
888 
889 	if (unlikely(idev == NULL))
890 		return NULL;
891 
892 	rt = ip6_dst_alloc();
893 	if (unlikely(rt == NULL)) {
894 		in6_dev_put(idev);
895 		goto out;
896 	}
897 
898 	dev_hold(dev);
899 	if (neigh)
900 		neigh_hold(neigh);
901 	else
902 		neigh = ndisc_get_neigh(dev, addr);
903 
904 	rt->rt6i_dev	  = dev;
905 	rt->rt6i_idev     = idev;
906 	rt->rt6i_nexthop  = neigh;
907 	atomic_set(&rt->u.dst.__refcnt, 1);
908 	rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
909 	rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
910 	rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
911 	rt->u.dst.output  = output;
912 
913 #if 0	/* there's no chance to use these for ndisc */
914 	rt->u.dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
915 				? DST_HOST
916 				: 0;
917 	ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
918 	rt->rt6i_dst.plen = 128;
919 #endif
920 
921 	spin_lock_bh(&ndisc_lock);
922 	rt->u.dst.next = ndisc_dst_gc_list;
923 	ndisc_dst_gc_list = &rt->u.dst;
924 	spin_unlock_bh(&ndisc_lock);
925 
926 	fib6_force_start_gc();
927 
928 out:
929 	return (struct dst_entry *)rt;
930 }
931 
932 int ndisc_dst_gc(int *more)
933 {
934 	struct dst_entry *dst, *next, **pprev;
935 	int freed;
936 
937 	next = NULL;
938  	freed = 0;
939 
940 	spin_lock_bh(&ndisc_lock);
941 	pprev = &ndisc_dst_gc_list;
942 
943 	while ((dst = *pprev) != NULL) {
944 		if (!atomic_read(&dst->__refcnt)) {
945 			*pprev = dst->next;
946 			dst_free(dst);
947 			freed++;
948 		} else {
949 			pprev = &dst->next;
950 			(*more)++;
951 		}
952 	}
953 
954 	spin_unlock_bh(&ndisc_lock);
955 
956 	return freed;
957 }
958 
959 static int ip6_dst_gc(void)
960 {
961 	static unsigned expire = 30*HZ;
962 	static unsigned long last_gc;
963 	unsigned long now = jiffies;
964 
965 	if (time_after(last_gc + ip6_rt_gc_min_interval, now) &&
966 	    atomic_read(&ip6_dst_ops.entries) <= ip6_rt_max_size)
967 		goto out;
968 
969 	expire++;
970 	fib6_run_gc(expire);
971 	last_gc = now;
972 	if (atomic_read(&ip6_dst_ops.entries) < ip6_dst_ops.gc_thresh)
973 		expire = ip6_rt_gc_timeout>>1;
974 
975 out:
976 	expire -= expire>>ip6_rt_gc_elasticity;
977 	return (atomic_read(&ip6_dst_ops.entries) > ip6_rt_max_size);
978 }
979 
980 /* Clean host part of a prefix. Not necessary in radix tree,
981    but results in cleaner routing tables.
982 
983    Remove it only when all the things will work!
984  */
985 
986 static int ipv6_get_mtu(struct net_device *dev)
987 {
988 	int mtu = IPV6_MIN_MTU;
989 	struct inet6_dev *idev;
990 
991 	idev = in6_dev_get(dev);
992 	if (idev) {
993 		mtu = idev->cnf.mtu6;
994 		in6_dev_put(idev);
995 	}
996 	return mtu;
997 }
998 
999 int ipv6_get_hoplimit(struct net_device *dev)
1000 {
1001 	int hoplimit = ipv6_devconf.hop_limit;
1002 	struct inet6_dev *idev;
1003 
1004 	idev = in6_dev_get(dev);
1005 	if (idev) {
1006 		hoplimit = idev->cnf.hop_limit;
1007 		in6_dev_put(idev);
1008 	}
1009 	return hoplimit;
1010 }
1011 
1012 /*
1013  *
1014  */
1015 
/*
 * ip6_route_add - create a route from @cfg and insert it into its table.
 *
 * Validates the prefix lengths, resolves the output device (from
 * fc_ifindex or, for gateway routes, from the route towards the
 * gateway), fills in a freshly allocated rt6_info and hands it to
 * __ip6_ins_rt().  Reject routes, and routes via a loopback device to a
 * non-loopback destination, are installed as discard routes bound to
 * loopback_dev.
 *
 * @cfg is modified: a zero fc_metric becomes IP6_RT_PRIO_USER and an
 * unspecified fc_protocol becomes RTPROT_BOOT.
 *
 * Returns the result of __ip6_ins_rt() on the success path, otherwise a
 * negative errno (-EINVAL, -ENODEV, -ENOBUFS, -ENOMEM, -EHOSTUNREACH or
 * an error from the neighbour lookup).  On the error path every
 * reference taken here is dropped and the route is freed.
 */
int ip6_route_add(struct fib6_config *cfg)
{
	int err;
	struct rt6_info *rt = NULL;
	struct net_device *dev = NULL;
	struct inet6_dev *idev = NULL;
	struct fib6_table *table;
	int addr_type;

	if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
		return -EINVAL;
#ifndef CONFIG_IPV6_SUBTREES
	/* source-specific routes need subtree support */
	if (cfg->fc_src_len)
		return -EINVAL;
#endif
	if (cfg->fc_ifindex) {
		err = -ENODEV;
		dev = dev_get_by_index(cfg->fc_ifindex);
		if (!dev)
			goto out;
		idev = in6_dev_get(dev);
		if (!idev)
			goto out;
	}

	if (cfg->fc_metric == 0)
		cfg->fc_metric = IP6_RT_PRIO_USER;

	table = fib6_new_table(cfg->fc_table);
	if (table == NULL) {
		err = -ENOBUFS;
		goto out;
	}

	rt = ip6_dst_alloc();

	if (rt == NULL) {
		err = -ENOMEM;
		goto out;
	}

	rt->u.dst.obsolete = -1;
	rt->rt6i_expires = jiffies + clock_t_to_jiffies(cfg->fc_expires);

	if (cfg->fc_protocol == RTPROT_UNSPEC)
		cfg->fc_protocol = RTPROT_BOOT;
	rt->rt6i_protocol = cfg->fc_protocol;

	addr_type = ipv6_addr_type(&cfg->fc_dst);

	/* Input handler depends on whether the destination is multicast. */
	if (addr_type & IPV6_ADDR_MULTICAST)
		rt->u.dst.input = ip6_mc_input;
	else
		rt->u.dst.input = ip6_forward;

	rt->u.dst.output = ip6_output;

	/* Store the prefix with the bits beyond fc_dst_len cleared. */
	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
	rt->rt6i_dst.plen = cfg->fc_dst_len;
	if (rt->rt6i_dst.plen == 128)
	       rt->u.dst.flags = DST_HOST;

#ifdef CONFIG_IPV6_SUBTREES
	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
	rt->rt6i_src.plen = cfg->fc_src_len;
#endif

	rt->rt6i_metric = cfg->fc_metric;

	/* We cannot add true routes via loopback here,
	   they would result in kernel looping; promote them to reject routes
	 */
	if ((cfg->fc_flags & RTF_REJECT) ||
	    (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
		/* hold loopback dev/idev if we haven't done so. */
		if (dev != &loopback_dev) {
			if (dev) {
				dev_put(dev);
				in6_dev_put(idev);
			}
			dev = &loopback_dev;
			dev_hold(dev);
			idev = in6_dev_get(dev);
			if (!idev) {
				err = -ENODEV;
				goto out;
			}
		}
		rt->u.dst.output = ip6_pkt_discard_out;
		rt->u.dst.input = ip6_pkt_discard;
		rt->u.dst.error = -ENETUNREACH;
		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
		goto install_route;
	}

	if (cfg->fc_flags & RTF_GATEWAY) {
		struct in6_addr *gw_addr;
		int gwa_type;

		gw_addr = &cfg->fc_gateway;
		ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
		gwa_type = ipv6_addr_type(gw_addr);

		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
			struct rt6_info *grt;

			/* IPv6 strictly inhibits using non-link-local
			   addresses as the nexthop address.
			   Otherwise, a router will not be able to send redirects.
			   It is very good, but in some (rare!) circumstances
			   (SIT, PtP, NBMA NOARP links) it is handy to allow
			   some exceptions. --ANK
			 */
			err = -EINVAL;
			if (!(gwa_type&IPV6_ADDR_UNICAST))
				goto out;

			/* Find the route that reaches the gateway itself. */
			grt = rt6_lookup(gw_addr, NULL, cfg->fc_ifindex, 1);

			err = -EHOSTUNREACH;
			if (grt == NULL)
				goto out;
			if (dev) {
				/* requested device must be the one reaching the gateway */
				if (dev != grt->rt6i_dev) {
					dst_release(&grt->u.dst);
					goto out;
				}
			} else {
				/* no device given: inherit it from the gateway route */
				dev = grt->rt6i_dev;
				idev = grt->rt6i_idev;
				dev_hold(dev);
				in6_dev_hold(grt->rt6i_idev);
			}
			/* the gateway must be reachable without another gateway */
			if (!(grt->rt6i_flags&RTF_GATEWAY))
				err = 0;
			dst_release(&grt->u.dst);

			if (err)
				goto out;
		}
		err = -EINVAL;
		if (dev == NULL || (dev->flags&IFF_LOOPBACK))
			goto out;
	}

	err = -ENODEV;
	if (dev == NULL)
		goto out;

	if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
		rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
		if (IS_ERR(rt->rt6i_nexthop)) {
			err = PTR_ERR(rt->rt6i_nexthop);
			/* do not leave an error pointer behind for dst_free() */
			rt->rt6i_nexthop = NULL;
			goto out;
		}
	}

	rt->rt6i_flags = cfg->fc_flags;

install_route:
	/* Copy user supplied metrics; attribute type N lands in metrics[N - 1]. */
	if (cfg->fc_mx) {
		struct nlattr *nla;
		int remaining;

		nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
			int type = nla->nla_type;

			if (type) {
				if (type > RTAX_MAX) {
					err = -EINVAL;
					goto out;
				}

				rt->u.dst.metrics[type - 1] = nla_get_u32(nla);
			}
		}
	}

	/* Defaults for the metrics the caller did not set. */
	if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
	if (!rt->u.dst.metrics[RTAX_MTU-1])
		rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
	if (!rt->u.dst.metrics[RTAX_ADVMSS-1])
		rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
	/* The dev/idev references taken above now belong to the route. */
	rt->u.dst.dev = dev;
	rt->rt6i_idev = idev;
	rt->rt6i_table = table;
	return __ip6_ins_rt(rt, &cfg->fc_nlinfo);

out:
	/* Error path: dev/idev were never attached to rt, release them here. */
	if (dev)
		dev_put(dev);
	if (idev)
		in6_dev_put(idev);
	if (rt)
		dst_free((struct dst_entry *) rt);
	return err;
}
1215 
1216 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1217 {
1218 	int err;
1219 	struct fib6_table *table;
1220 
1221 	if (rt == &ip6_null_entry)
1222 		return -ENOENT;
1223 
1224 	table = rt->rt6i_table;
1225 	write_lock_bh(&table->tb6_lock);
1226 
1227 	err = fib6_del(rt, info);
1228 	dst_release(&rt->u.dst);
1229 
1230 	write_unlock_bh(&table->tb6_lock);
1231 
1232 	return err;
1233 }
1234 
1235 int ip6_del_rt(struct rt6_info *rt)
1236 {
1237 	return __ip6_del_rt(rt, NULL);
1238 }
1239 
1240 static int ip6_route_del(struct fib6_config *cfg)
1241 {
1242 	struct fib6_table *table;
1243 	struct fib6_node *fn;
1244 	struct rt6_info *rt;
1245 	int err = -ESRCH;
1246 
1247 	table = fib6_get_table(cfg->fc_table);
1248 	if (table == NULL)
1249 		return err;
1250 
1251 	read_lock_bh(&table->tb6_lock);
1252 
1253 	fn = fib6_locate(&table->tb6_root,
1254 			 &cfg->fc_dst, cfg->fc_dst_len,
1255 			 &cfg->fc_src, cfg->fc_src_len);
1256 
1257 	if (fn) {
1258 		for (rt = fn->leaf; rt; rt = rt->u.next) {
1259 			if (cfg->fc_ifindex &&
1260 			    (rt->rt6i_dev == NULL ||
1261 			     rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1262 				continue;
1263 			if (cfg->fc_flags & RTF_GATEWAY &&
1264 			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1265 				continue;
1266 			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1267 				continue;
1268 			dst_hold(&rt->u.dst);
1269 			read_unlock_bh(&table->tb6_lock);
1270 
1271 			return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1272 		}
1273 	}
1274 	read_unlock_bh(&table->tb6_lock);
1275 
1276 	return err;
1277 }
1278 
1279 /*
1280  *	Handle redirects
1281  */
1282 void rt6_redirect(struct in6_addr *dest, struct in6_addr *saddr,
1283 		  struct neighbour *neigh, u8 *lladdr, int on_link)
1284 {
1285 	struct rt6_info *rt, *nrt = NULL;
1286 	struct fib6_node *fn;
1287 	struct fib6_table *table;
1288 	struct netevent_redirect netevent;
1289 
1290 	/* TODO: Very lazy, might need to check all tables */
1291 	table = fib6_get_table(RT6_TABLE_MAIN);
1292 	if (table == NULL)
1293 		return;
1294 
1295 	/*
1296 	 * Get the "current" route for this destination and
1297 	 * check if the redirect has come from approriate router.
1298 	 *
1299 	 * RFC 2461 specifies that redirects should only be
1300 	 * accepted if they come from the nexthop to the target.
1301 	 * Due to the way the routes are chosen, this notion
1302 	 * is a bit fuzzy and one might need to check all possible
1303 	 * routes.
1304 	 */
1305 
1306 	read_lock_bh(&table->tb6_lock);
1307 	fn = fib6_lookup(&table->tb6_root, dest, NULL);
1308 restart:
1309 	for (rt = fn->leaf; rt; rt = rt->u.next) {
1310 		/*
1311 		 * Current route is on-link; redirect is always invalid.
1312 		 *
1313 		 * Seems, previous statement is not true. It could
1314 		 * be node, which looks for us as on-link (f.e. proxy ndisc)
1315 		 * But then router serving it might decide, that we should
1316 		 * know truth 8)8) --ANK (980726).
1317 		 */
1318 		if (rt6_check_expired(rt))
1319 			continue;
1320 		if (!(rt->rt6i_flags & RTF_GATEWAY))
1321 			continue;
1322 		if (neigh->dev != rt->rt6i_dev)
1323 			continue;
1324 		if (!ipv6_addr_equal(saddr, &rt->rt6i_gateway))
1325 			continue;
1326 		break;
1327 	}
1328 	if (rt)
1329 		dst_hold(&rt->u.dst);
1330 	else if (rt6_need_strict(dest)) {
1331 		while ((fn = fn->parent) != NULL) {
1332 			if (fn->fn_flags & RTN_ROOT)
1333 				break;
1334 			if (fn->fn_flags & RTN_RTINFO)
1335 				goto restart;
1336 		}
1337 	}
1338 	read_unlock_bh(&table->tb6_lock);
1339 
1340 	if (!rt) {
1341 		if (net_ratelimit())
1342 			printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1343 			       "for redirect target\n");
1344 		return;
1345 	}
1346 
1347 	/*
1348 	 *	We have finally decided to accept it.
1349 	 */
1350 
1351 	neigh_update(neigh, lladdr, NUD_STALE,
1352 		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
1353 		     NEIGH_UPDATE_F_OVERRIDE|
1354 		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1355 				     NEIGH_UPDATE_F_ISROUTER))
1356 		     );
1357 
1358 	/*
1359 	 * Redirect received -> path was valid.
1360 	 * Look, redirects are sent only in response to data packets,
1361 	 * so that this nexthop apparently is reachable. --ANK
1362 	 */
1363 	dst_confirm(&rt->u.dst);
1364 
1365 	/* Duplicate redirect: silently ignore. */
1366 	if (neigh == rt->u.dst.neighbour)
1367 		goto out;
1368 
1369 	nrt = ip6_rt_copy(rt);
1370 	if (nrt == NULL)
1371 		goto out;
1372 
1373 	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1374 	if (on_link)
1375 		nrt->rt6i_flags &= ~RTF_GATEWAY;
1376 
1377 	ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1378 	nrt->rt6i_dst.plen = 128;
1379 	nrt->u.dst.flags |= DST_HOST;
1380 
1381 	ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1382 	nrt->rt6i_nexthop = neigh_clone(neigh);
1383 	/* Reset pmtu, it may be better */
1384 	nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1385 	nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst));
1386 
1387 	if (ip6_ins_rt(nrt))
1388 		goto out;
1389 
1390 	netevent.old = &rt->u.dst;
1391 	netevent.new = &nrt->u.dst;
1392 	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1393 
1394 	if (rt->rt6i_flags&RTF_CACHE) {
1395 		ip6_del_rt(rt);
1396 		return;
1397 	}
1398 
1399 out:
1400         dst_release(&rt->u.dst);
1401 	return;
1402 }
1403 
1404 /*
1405  *	Handle ICMP "packet too big" messages
1406  *	i.e. Path MTU discovery
1407  */
1408 
1409 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1410 			struct net_device *dev, u32 pmtu)
1411 {
1412 	struct rt6_info *rt, *nrt;
1413 	int allfrag = 0;
1414 
1415 	rt = rt6_lookup(daddr, saddr, dev->ifindex, 0);
1416 	if (rt == NULL)
1417 		return;
1418 
1419 	if (pmtu >= dst_mtu(&rt->u.dst))
1420 		goto out;
1421 
1422 	if (pmtu < IPV6_MIN_MTU) {
1423 		/*
1424 		 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1425 		 * MTU (1280) and a fragment header should always be included
1426 		 * after a node receiving Too Big message reporting PMTU is
1427 		 * less than the IPv6 Minimum Link MTU.
1428 		 */
1429 		pmtu = IPV6_MIN_MTU;
1430 		allfrag = 1;
1431 	}
1432 
1433 	/* New mtu received -> path was valid.
1434 	   They are sent only in response to data packets,
1435 	   so that this nexthop apparently is reachable. --ANK
1436 	 */
1437 	dst_confirm(&rt->u.dst);
1438 
1439 	/* Host route. If it is static, it would be better
1440 	   not to override it, but add new one, so that
1441 	   when cache entry will expire old pmtu
1442 	   would return automatically.
1443 	 */
1444 	if (rt->rt6i_flags & RTF_CACHE) {
1445 		rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1446 		if (allfrag)
1447 			rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1448 		dst_set_expires(&rt->u.dst, ip6_rt_mtu_expires);
1449 		rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1450 		goto out;
1451 	}
1452 
1453 	/* Network route.
1454 	   Two cases are possible:
1455 	   1. It is connected route. Action: COW
1456 	   2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1457 	 */
1458 	if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1459 		nrt = rt6_alloc_cow(rt, daddr, saddr);
1460 	else
1461 		nrt = rt6_alloc_clone(rt, daddr);
1462 
1463 	if (nrt) {
1464 		nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1465 		if (allfrag)
1466 			nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1467 
1468 		/* According to RFC 1981, detecting PMTU increase shouldn't be
1469 		 * happened within 5 mins, the recommended timer is 10 mins.
1470 		 * Here this route expiration time is set to ip6_rt_mtu_expires
1471 		 * which is 10 mins. After 10 mins the decreased pmtu is expired
1472 		 * and detecting PMTU increase will be automatically happened.
1473 		 */
1474 		dst_set_expires(&nrt->u.dst, ip6_rt_mtu_expires);
1475 		nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1476 
1477 		ip6_ins_rt(nrt);
1478 	}
1479 out:
1480 	dst_release(&rt->u.dst);
1481 }
1482 
1483 /*
1484  *	Misc support functions
1485  */
1486 
1487 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1488 {
1489 	struct rt6_info *rt = ip6_dst_alloc();
1490 
1491 	if (rt) {
1492 		rt->u.dst.input = ort->u.dst.input;
1493 		rt->u.dst.output = ort->u.dst.output;
1494 
1495 		memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1496 		rt->u.dst.dev = ort->u.dst.dev;
1497 		if (rt->u.dst.dev)
1498 			dev_hold(rt->u.dst.dev);
1499 		rt->rt6i_idev = ort->rt6i_idev;
1500 		if (rt->rt6i_idev)
1501 			in6_dev_hold(rt->rt6i_idev);
1502 		rt->u.dst.lastuse = jiffies;
1503 		rt->rt6i_expires = 0;
1504 
1505 		ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1506 		rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1507 		rt->rt6i_metric = 0;
1508 
1509 		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1510 #ifdef CONFIG_IPV6_SUBTREES
1511 		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1512 #endif
1513 		rt->rt6i_table = ort->rt6i_table;
1514 	}
1515 	return rt;
1516 }
1517 
1518 #ifdef CONFIG_IPV6_ROUTE_INFO
1519 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
1520 					   struct in6_addr *gwaddr, int ifindex)
1521 {
1522 	struct fib6_node *fn;
1523 	struct rt6_info *rt = NULL;
1524 	struct fib6_table *table;
1525 
1526 	table = fib6_get_table(RT6_TABLE_INFO);
1527 	if (table == NULL)
1528 		return NULL;
1529 
1530 	write_lock_bh(&table->tb6_lock);
1531 	fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1532 	if (!fn)
1533 		goto out;
1534 
1535 	for (rt = fn->leaf; rt; rt = rt->u.next) {
1536 		if (rt->rt6i_dev->ifindex != ifindex)
1537 			continue;
1538 		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1539 			continue;
1540 		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1541 			continue;
1542 		dst_hold(&rt->u.dst);
1543 		break;
1544 	}
1545 out:
1546 	write_unlock_bh(&table->tb6_lock);
1547 	return rt;
1548 }
1549 
1550 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
1551 					   struct in6_addr *gwaddr, int ifindex,
1552 					   unsigned pref)
1553 {
1554 	struct fib6_config cfg = {
1555 		.fc_table	= RT6_TABLE_INFO,
1556 		.fc_metric	= 1024,
1557 		.fc_ifindex	= ifindex,
1558 		.fc_dst_len	= prefixlen,
1559 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1560 				  RTF_UP | RTF_PREF(pref),
1561 	};
1562 
1563 	ipv6_addr_copy(&cfg.fc_dst, prefix);
1564 	ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1565 
1566 	/* We should treat it as a default route if prefix length is 0. */
1567 	if (!prefixlen)
1568 		cfg.fc_flags |= RTF_DEFAULT;
1569 
1570 	ip6_route_add(&cfg);
1571 
1572 	return rt6_get_route_info(prefix, prefixlen, gwaddr, ifindex);
1573 }
1574 #endif
1575 
1576 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1577 {
1578 	struct rt6_info *rt;
1579 	struct fib6_table *table;
1580 
1581 	table = fib6_get_table(RT6_TABLE_DFLT);
1582 	if (table == NULL)
1583 		return NULL;
1584 
1585 	write_lock_bh(&table->tb6_lock);
1586 	for (rt = table->tb6_root.leaf; rt; rt=rt->u.next) {
1587 		if (dev == rt->rt6i_dev &&
1588 		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1589 		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
1590 			break;
1591 	}
1592 	if (rt)
1593 		dst_hold(&rt->u.dst);
1594 	write_unlock_bh(&table->tb6_lock);
1595 	return rt;
1596 }
1597 
1598 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1599 				     struct net_device *dev,
1600 				     unsigned int pref)
1601 {
1602 	struct fib6_config cfg = {
1603 		.fc_table	= RT6_TABLE_DFLT,
1604 		.fc_metric	= 1024,
1605 		.fc_ifindex	= dev->ifindex,
1606 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1607 				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1608 	};
1609 
1610 	ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1611 
1612 	ip6_route_add(&cfg);
1613 
1614 	return rt6_get_dflt_router(gwaddr, dev);
1615 }
1616 
1617 void rt6_purge_dflt_routers(void)
1618 {
1619 	struct rt6_info *rt;
1620 	struct fib6_table *table;
1621 
1622 	/* NOTE: Keep consistent with rt6_get_dflt_router */
1623 	table = fib6_get_table(RT6_TABLE_DFLT);
1624 	if (table == NULL)
1625 		return;
1626 
1627 restart:
1628 	read_lock_bh(&table->tb6_lock);
1629 	for (rt = table->tb6_root.leaf; rt; rt = rt->u.next) {
1630 		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1631 			dst_hold(&rt->u.dst);
1632 			read_unlock_bh(&table->tb6_lock);
1633 			ip6_del_rt(rt);
1634 			goto restart;
1635 		}
1636 	}
1637 	read_unlock_bh(&table->tb6_lock);
1638 }
1639 
1640 static void rtmsg_to_fib6_config(struct in6_rtmsg *rtmsg,
1641 				 struct fib6_config *cfg)
1642 {
1643 	memset(cfg, 0, sizeof(*cfg));
1644 
1645 	cfg->fc_table = RT6_TABLE_MAIN;
1646 	cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1647 	cfg->fc_metric = rtmsg->rtmsg_metric;
1648 	cfg->fc_expires = rtmsg->rtmsg_info;
1649 	cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1650 	cfg->fc_src_len = rtmsg->rtmsg_src_len;
1651 	cfg->fc_flags = rtmsg->rtmsg_flags;
1652 
1653 	ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1654 	ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1655 	ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1656 }
1657 
1658 int ipv6_route_ioctl(unsigned int cmd, void __user *arg)
1659 {
1660 	struct fib6_config cfg;
1661 	struct in6_rtmsg rtmsg;
1662 	int err;
1663 
1664 	switch(cmd) {
1665 	case SIOCADDRT:		/* Add a route */
1666 	case SIOCDELRT:		/* Delete a route */
1667 		if (!capable(CAP_NET_ADMIN))
1668 			return -EPERM;
1669 		err = copy_from_user(&rtmsg, arg,
1670 				     sizeof(struct in6_rtmsg));
1671 		if (err)
1672 			return -EFAULT;
1673 
1674 		rtmsg_to_fib6_config(&rtmsg, &cfg);
1675 
1676 		rtnl_lock();
1677 		switch (cmd) {
1678 		case SIOCADDRT:
1679 			err = ip6_route_add(&cfg);
1680 			break;
1681 		case SIOCDELRT:
1682 			err = ip6_route_del(&cfg);
1683 			break;
1684 		default:
1685 			err = -EINVAL;
1686 		}
1687 		rtnl_unlock();
1688 
1689 		return err;
1690 	};
1691 
1692 	return -EINVAL;
1693 }
1694 
1695 /*
1696  *	Drop the packet on the floor
1697  */
1698 
1699 static int ip6_pkt_discard(struct sk_buff *skb)
1700 {
1701 	int type = ipv6_addr_type(&skb->nh.ipv6h->daddr);
1702 	if (type == IPV6_ADDR_ANY || type == IPV6_ADDR_RESERVED)
1703 		IP6_INC_STATS(IPSTATS_MIB_INADDRERRORS);
1704 
1705 	IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
1706 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_NOROUTE, 0, skb->dev);
1707 	kfree_skb(skb);
1708 	return 0;
1709 }
1710 
1711 static int ip6_pkt_discard_out(struct sk_buff *skb)
1712 {
1713 	skb->dev = skb->dst->dev;
1714 	return ip6_pkt_discard(skb);
1715 }
1716 
1717 /*
1718  *	Allocate a dst for local (unicast / anycast) address.
1719  */
1720 
1721 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1722 				    const struct in6_addr *addr,
1723 				    int anycast)
1724 {
1725 	struct rt6_info *rt = ip6_dst_alloc();
1726 
1727 	if (rt == NULL)
1728 		return ERR_PTR(-ENOMEM);
1729 
1730 	dev_hold(&loopback_dev);
1731 	in6_dev_hold(idev);
1732 
1733 	rt->u.dst.flags = DST_HOST;
1734 	rt->u.dst.input = ip6_input;
1735 	rt->u.dst.output = ip6_output;
1736 	rt->rt6i_dev = &loopback_dev;
1737 	rt->rt6i_idev = idev;
1738 	rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1739 	rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1740 	rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1741 	rt->u.dst.obsolete = -1;
1742 
1743 	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1744 	if (anycast)
1745 		rt->rt6i_flags |= RTF_ANYCAST;
1746 	else
1747 		rt->rt6i_flags |= RTF_LOCAL;
1748 	rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1749 	if (rt->rt6i_nexthop == NULL) {
1750 		dst_free((struct dst_entry *) rt);
1751 		return ERR_PTR(-ENOMEM);
1752 	}
1753 
1754 	ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1755 	rt->rt6i_dst.plen = 128;
1756 	rt->rt6i_table = fib6_get_table(RT6_TABLE_LOCAL);
1757 
1758 	atomic_set(&rt->u.dst.__refcnt, 1);
1759 
1760 	return rt;
1761 }
1762 
1763 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1764 {
1765 	if (((void*)rt->rt6i_dev == arg || arg == NULL) &&
1766 	    rt != &ip6_null_entry) {
1767 		RT6_TRACE("deleted by ifdown %p\n", rt);
1768 		return -1;
1769 	}
1770 	return 0;
1771 }
1772 
1773 void rt6_ifdown(struct net_device *dev)
1774 {
1775 	fib6_clean_all(fib6_ifdown, 0, dev);
1776 }
1777 
/* Argument bundle handed to rt6_mtu_change_route() by rt6_mtu_change(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;		/* device whose MTU changed */
	unsigned mtu;			/* the new MTU value */
};
1783 
1784 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
1785 {
1786 	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
1787 	struct inet6_dev *idev;
1788 
1789 	/* In IPv6 pmtu discovery is not optional,
1790 	   so that RTAX_MTU lock cannot disable it.
1791 	   We still use this lock to block changes
1792 	   caused by addrconf/ndisc.
1793 	*/
1794 
1795 	idev = __in6_dev_get(arg->dev);
1796 	if (idev == NULL)
1797 		return 0;
1798 
1799 	/* For administrative MTU increase, there is no way to discover
1800 	   IPv6 PMTU increase, so PMTU increase should be updated here.
1801 	   Since RFC 1981 doesn't include administrative MTU increase
1802 	   update PMTU increase is a MUST. (i.e. jumbo frame)
1803 	 */
1804 	/*
1805 	   If new MTU is less than route PMTU, this new MTU will be the
1806 	   lowest MTU in the path, update the route PMTU to reflect PMTU
1807 	   decreases; if new MTU is greater than route PMTU, and the
1808 	   old MTU is the lowest MTU in the path, update the route PMTU
1809 	   to reflect the increase. In this case if the other nodes' MTU
1810 	   also have the lowest MTU, TOO BIG MESSAGE will be lead to
1811 	   PMTU discouvery.
1812 	 */
1813 	if (rt->rt6i_dev == arg->dev &&
1814 	    !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1815             (dst_mtu(&rt->u.dst) > arg->mtu ||
1816              (dst_mtu(&rt->u.dst) < arg->mtu &&
1817 	      dst_mtu(&rt->u.dst) == idev->cnf.mtu6)))
1818 		rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
1819 	rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(arg->mtu);
1820 	return 0;
1821 }
1822 
1823 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
1824 {
1825 	struct rt6_mtu_change_arg arg = {
1826 		.dev = dev,
1827 		.mtu = mtu,
1828 	};
1829 
1830 	fib6_clean_all(rt6_mtu_change_route, 0, &arg);
1831 }
1832 
1833 static struct nla_policy rtm_ipv6_policy[RTA_MAX+1] __read_mostly = {
1834 	[RTA_GATEWAY]           = { .minlen = sizeof(struct in6_addr) },
1835 	[RTA_OIF]               = { .type = NLA_U32 },
1836 	[RTA_IIF]		= { .type = NLA_U32 },
1837 	[RTA_PRIORITY]          = { .type = NLA_U32 },
1838 	[RTA_METRICS]           = { .type = NLA_NESTED },
1839 };
1840 
1841 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
1842 			      struct fib6_config *cfg)
1843 {
1844 	struct rtmsg *rtm;
1845 	struct nlattr *tb[RTA_MAX+1];
1846 	int err;
1847 
1848 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
1849 	if (err < 0)
1850 		goto errout;
1851 
1852 	err = -EINVAL;
1853 	rtm = nlmsg_data(nlh);
1854 	memset(cfg, 0, sizeof(*cfg));
1855 
1856 	cfg->fc_table = rtm->rtm_table;
1857 	cfg->fc_dst_len = rtm->rtm_dst_len;
1858 	cfg->fc_src_len = rtm->rtm_src_len;
1859 	cfg->fc_flags = RTF_UP;
1860 	cfg->fc_protocol = rtm->rtm_protocol;
1861 
1862 	if (rtm->rtm_type == RTN_UNREACHABLE)
1863 		cfg->fc_flags |= RTF_REJECT;
1864 
1865 	cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
1866 	cfg->fc_nlinfo.nlh = nlh;
1867 
1868 	if (tb[RTA_GATEWAY]) {
1869 		nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
1870 		cfg->fc_flags |= RTF_GATEWAY;
1871 	}
1872 
1873 	if (tb[RTA_DST]) {
1874 		int plen = (rtm->rtm_dst_len + 7) >> 3;
1875 
1876 		if (nla_len(tb[RTA_DST]) < plen)
1877 			goto errout;
1878 
1879 		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
1880 	}
1881 
1882 	if (tb[RTA_SRC]) {
1883 		int plen = (rtm->rtm_src_len + 7) >> 3;
1884 
1885 		if (nla_len(tb[RTA_SRC]) < plen)
1886 			goto errout;
1887 
1888 		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
1889 	}
1890 
1891 	if (tb[RTA_OIF])
1892 		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
1893 
1894 	if (tb[RTA_PRIORITY])
1895 		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
1896 
1897 	if (tb[RTA_METRICS]) {
1898 		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
1899 		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
1900 	}
1901 
1902 	if (tb[RTA_TABLE])
1903 		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
1904 
1905 	err = 0;
1906 errout:
1907 	return err;
1908 }
1909 
1910 int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1911 {
1912 	struct fib6_config cfg;
1913 	int err;
1914 
1915 	err = rtm_to_fib6_config(skb, nlh, &cfg);
1916 	if (err < 0)
1917 		return err;
1918 
1919 	return ip6_route_del(&cfg);
1920 }
1921 
1922 int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1923 {
1924 	struct fib6_config cfg;
1925 	int err;
1926 
1927 	err = rtm_to_fib6_config(skb, nlh, &cfg);
1928 	if (err < 0)
1929 		return err;
1930 
1931 	return ip6_route_add(&cfg);
1932 }
1933 
1934 static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
1935 			 struct in6_addr *dst, struct in6_addr *src,
1936 			 int iif, int type, u32 pid, u32 seq,
1937 			 int prefix, unsigned int flags)
1938 {
1939 	struct rtmsg *rtm;
1940 	struct nlmsghdr *nlh;
1941 	struct rta_cacheinfo ci;
1942 	u32 table;
1943 
1944 	if (prefix) {	/* user wants prefix routes only */
1945 		if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
1946 			/* success since this is not a prefix route */
1947 			return 1;
1948 		}
1949 	}
1950 
1951 	nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
1952 	if (nlh == NULL)
1953 		return -ENOBUFS;
1954 
1955 	rtm = nlmsg_data(nlh);
1956 	rtm->rtm_family = AF_INET6;
1957 	rtm->rtm_dst_len = rt->rt6i_dst.plen;
1958 	rtm->rtm_src_len = rt->rt6i_src.plen;
1959 	rtm->rtm_tos = 0;
1960 	if (rt->rt6i_table)
1961 		table = rt->rt6i_table->tb6_id;
1962 	else
1963 		table = RT6_TABLE_UNSPEC;
1964 	rtm->rtm_table = table;
1965 	NLA_PUT_U32(skb, RTA_TABLE, table);
1966 	if (rt->rt6i_flags&RTF_REJECT)
1967 		rtm->rtm_type = RTN_UNREACHABLE;
1968 	else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
1969 		rtm->rtm_type = RTN_LOCAL;
1970 	else
1971 		rtm->rtm_type = RTN_UNICAST;
1972 	rtm->rtm_flags = 0;
1973 	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
1974 	rtm->rtm_protocol = rt->rt6i_protocol;
1975 	if (rt->rt6i_flags&RTF_DYNAMIC)
1976 		rtm->rtm_protocol = RTPROT_REDIRECT;
1977 	else if (rt->rt6i_flags & RTF_ADDRCONF)
1978 		rtm->rtm_protocol = RTPROT_KERNEL;
1979 	else if (rt->rt6i_flags&RTF_DEFAULT)
1980 		rtm->rtm_protocol = RTPROT_RA;
1981 
1982 	if (rt->rt6i_flags&RTF_CACHE)
1983 		rtm->rtm_flags |= RTM_F_CLONED;
1984 
1985 	if (dst) {
1986 		NLA_PUT(skb, RTA_DST, 16, dst);
1987 	        rtm->rtm_dst_len = 128;
1988 	} else if (rtm->rtm_dst_len)
1989 		NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
1990 #ifdef CONFIG_IPV6_SUBTREES
1991 	if (src) {
1992 		NLA_PUT(skb, RTA_SRC, 16, src);
1993 	        rtm->rtm_src_len = 128;
1994 	} else if (rtm->rtm_src_len)
1995 		NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
1996 #endif
1997 	if (iif)
1998 		NLA_PUT_U32(skb, RTA_IIF, iif);
1999 	else if (dst) {
2000 		struct in6_addr saddr_buf;
2001 		if (ipv6_get_saddr(&rt->u.dst, dst, &saddr_buf) == 0)
2002 			NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2003 	}
2004 
2005 	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2006 		goto nla_put_failure;
2007 
2008 	if (rt->u.dst.neighbour)
2009 		NLA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
2010 
2011 	if (rt->u.dst.dev)
2012 		NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2013 
2014 	NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2015 	ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
2016 	if (rt->rt6i_expires)
2017 		ci.rta_expires = jiffies_to_clock_t(rt->rt6i_expires - jiffies);
2018 	else
2019 		ci.rta_expires = 0;
2020 	ci.rta_used = rt->u.dst.__use;
2021 	ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
2022 	ci.rta_error = rt->u.dst.error;
2023 	ci.rta_id = 0;
2024 	ci.rta_ts = 0;
2025 	ci.rta_tsage = 0;
2026 	NLA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
2027 
2028 	return nlmsg_end(skb, nlh);
2029 
2030 nla_put_failure:
2031 	return nlmsg_cancel(skb, nlh);
2032 }
2033 
2034 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2035 {
2036 	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2037 	int prefix;
2038 
2039 	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2040 		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2041 		prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2042 	} else
2043 		prefix = 0;
2044 
2045 	return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2046 		     NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2047 		     prefix, NLM_F_MULTI);
2048 }
2049 
2050 int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2051 {
2052 	struct nlattr *tb[RTA_MAX+1];
2053 	struct rt6_info *rt;
2054 	struct sk_buff *skb;
2055 	struct rtmsg *rtm;
2056 	struct flowi fl;
2057 	int err, iif = 0;
2058 
2059 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2060 	if (err < 0)
2061 		goto errout;
2062 
2063 	err = -EINVAL;
2064 	memset(&fl, 0, sizeof(fl));
2065 
2066 	if (tb[RTA_SRC]) {
2067 		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2068 			goto errout;
2069 
2070 		ipv6_addr_copy(&fl.fl6_src, nla_data(tb[RTA_SRC]));
2071 	}
2072 
2073 	if (tb[RTA_DST]) {
2074 		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2075 			goto errout;
2076 
2077 		ipv6_addr_copy(&fl.fl6_dst, nla_data(tb[RTA_DST]));
2078 	}
2079 
2080 	if (tb[RTA_IIF])
2081 		iif = nla_get_u32(tb[RTA_IIF]);
2082 
2083 	if (tb[RTA_OIF])
2084 		fl.oif = nla_get_u32(tb[RTA_OIF]);
2085 
2086 	if (iif) {
2087 		struct net_device *dev;
2088 		dev = __dev_get_by_index(iif);
2089 		if (!dev) {
2090 			err = -ENODEV;
2091 			goto errout;
2092 		}
2093 	}
2094 
2095 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2096 	if (skb == NULL) {
2097 		err = -ENOBUFS;
2098 		goto errout;
2099 	}
2100 
2101 	/* Reserve room for dummy headers, this skb can pass
2102 	   through good chunk of routing engine.
2103 	 */
2104 	skb->mac.raw = skb->data;
2105 	skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2106 
2107 	rt = (struct rt6_info*) ip6_route_output(NULL, &fl);
2108 	skb->dst = &rt->u.dst;
2109 
2110 	err = rt6_fill_node(skb, rt, &fl.fl6_dst, &fl.fl6_src, iif,
2111 			    RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2112 			    nlh->nlmsg_seq, 0, 0);
2113 	if (err < 0) {
2114 		kfree_skb(skb);
2115 		goto errout;
2116 	}
2117 
2118 	err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
2119 errout:
2120 	return err;
2121 }
2122 
2123 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2124 {
2125 	struct sk_buff *skb;
2126 	u32 pid = 0, seq = 0;
2127 	struct nlmsghdr *nlh = NULL;
2128 	int payload = sizeof(struct rtmsg) + 256;
2129 	int err = -ENOBUFS;
2130 
2131 	if (info) {
2132 		pid = info->pid;
2133 		nlh = info->nlh;
2134 		if (nlh)
2135 			seq = nlh->nlmsg_seq;
2136 	}
2137 
2138 	skb = nlmsg_new(nlmsg_total_size(payload), gfp_any());
2139 	if (skb == NULL)
2140 		goto errout;
2141 
2142 	err = rt6_fill_node(skb, rt, NULL, NULL, 0, event, pid, seq, 0, 0);
2143 	if (err < 0) {
2144 		kfree_skb(skb);
2145 		goto errout;
2146 	}
2147 
2148 	err = rtnl_notify(skb, pid, RTNLGRP_IPV6_ROUTE, nlh, gfp_any());
2149 errout:
2150 	if (err < 0)
2151 		rtnl_set_sk_err(RTNLGRP_IPV6_ROUTE, err);
2152 }
2153 
2154 /*
2155  *	/proc
2156  */
2157 
2158 #ifdef CONFIG_PROC_FS
2159 
/* Length used as the per-route record stride by the /proc readers below. */
#define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)

/* Walker state shared between rt6_proc_info() and rt6_info_route(). */
struct rt6_proc_arg {
	char *buffer;	/* output buffer */
	int offset;	/* read offset requested by the caller */
	int length;	/* output is no longer started once len reaches this */
	int skip;	/* routes skipped so far to reach offset */
	int len;	/* bytes written to buffer so far */
};
2170 
2171 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2172 {
2173 	struct rt6_proc_arg *arg = (struct rt6_proc_arg *) p_arg;
2174 	int i;
2175 
2176 	if (arg->skip < arg->offset / RT6_INFO_LEN) {
2177 		arg->skip++;
2178 		return 0;
2179 	}
2180 
2181 	if (arg->len >= arg->length)
2182 		return 0;
2183 
2184 	for (i=0; i<16; i++) {
2185 		sprintf(arg->buffer + arg->len, "%02x",
2186 			rt->rt6i_dst.addr.s6_addr[i]);
2187 		arg->len += 2;
2188 	}
2189 	arg->len += sprintf(arg->buffer + arg->len, " %02x ",
2190 			    rt->rt6i_dst.plen);
2191 
2192 #ifdef CONFIG_IPV6_SUBTREES
2193 	for (i=0; i<16; i++) {
2194 		sprintf(arg->buffer + arg->len, "%02x",
2195 			rt->rt6i_src.addr.s6_addr[i]);
2196 		arg->len += 2;
2197 	}
2198 	arg->len += sprintf(arg->buffer + arg->len, " %02x ",
2199 			    rt->rt6i_src.plen);
2200 #else
2201 	sprintf(arg->buffer + arg->len,
2202 		"00000000000000000000000000000000 00 ");
2203 	arg->len += 36;
2204 #endif
2205 
2206 	if (rt->rt6i_nexthop) {
2207 		for (i=0; i<16; i++) {
2208 			sprintf(arg->buffer + arg->len, "%02x",
2209 				rt->rt6i_nexthop->primary_key[i]);
2210 			arg->len += 2;
2211 		}
2212 	} else {
2213 		sprintf(arg->buffer + arg->len,
2214 			"00000000000000000000000000000000");
2215 		arg->len += 32;
2216 	}
2217 	arg->len += sprintf(arg->buffer + arg->len,
2218 			    " %08x %08x %08x %08x %8s\n",
2219 			    rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
2220 			    rt->u.dst.__use, rt->rt6i_flags,
2221 			    rt->rt6i_dev ? rt->rt6i_dev->name : "");
2222 	return 0;
2223 }
2224 
2225 static int rt6_proc_info(char *buffer, char **start, off_t offset, int length)
2226 {
2227 	struct rt6_proc_arg arg = {
2228 		.buffer = buffer,
2229 		.offset = offset,
2230 		.length = length,
2231 	};
2232 
2233 	fib6_clean_all(rt6_info_route, 0, &arg);
2234 
2235 	*start = buffer;
2236 	if (offset)
2237 		*start += offset % RT6_INFO_LEN;
2238 
2239 	arg.len -= offset % RT6_INFO_LEN;
2240 
2241 	if (arg.len > length)
2242 		arg.len = length;
2243 	if (arg.len < 0)
2244 		arg.len = 0;
2245 
2246 	return arg.len;
2247 }
2248 
2249 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2250 {
2251 	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2252 		      rt6_stats.fib_nodes, rt6_stats.fib_route_nodes,
2253 		      rt6_stats.fib_rt_alloc, rt6_stats.fib_rt_entries,
2254 		      rt6_stats.fib_rt_cache,
2255 		      atomic_read(&ip6_dst_ops.entries),
2256 		      rt6_stats.fib_discarded_routes);
2257 
2258 	return 0;
2259 }
2260 
2261 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2262 {
2263 	return single_open(file, rt6_stats_seq_show, NULL);
2264 }
2265 
2266 static struct file_operations rt6_stats_seq_fops = {
2267 	.owner	 = THIS_MODULE,
2268 	.open	 = rt6_stats_seq_open,
2269 	.read	 = seq_read,
2270 	.llseek	 = seq_lseek,
2271 	.release = single_release,
2272 };
2273 #endif	/* CONFIG_PROC_FS */
2274 
2275 #ifdef CONFIG_SYSCTL
2276 
2277 static int flush_delay;
2278 
2279 static
2280 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
2281 			      void __user *buffer, size_t *lenp, loff_t *ppos)
2282 {
2283 	if (write) {
2284 		proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2285 		fib6_run_gc(flush_delay <= 0 ? ~0UL : (unsigned long)flush_delay);
2286 		return 0;
2287 	} else
2288 		return -EINVAL;
2289 }
2290 
2291 ctl_table ipv6_route_table[] = {
2292         {
2293 		.ctl_name	=	NET_IPV6_ROUTE_FLUSH,
2294 		.procname	=	"flush",
2295          	.data		=	&flush_delay,
2296 		.maxlen		=	sizeof(int),
2297 		.mode		=	0200,
2298          	.proc_handler	=	&ipv6_sysctl_rtcache_flush
2299 	},
2300 	{
2301 		.ctl_name	=	NET_IPV6_ROUTE_GC_THRESH,
2302 		.procname	=	"gc_thresh",
2303          	.data		=	&ip6_dst_ops.gc_thresh,
2304 		.maxlen		=	sizeof(int),
2305 		.mode		=	0644,
2306          	.proc_handler	=	&proc_dointvec,
2307 	},
2308 	{
2309 		.ctl_name	=	NET_IPV6_ROUTE_MAX_SIZE,
2310 		.procname	=	"max_size",
2311          	.data		=	&ip6_rt_max_size,
2312 		.maxlen		=	sizeof(int),
2313 		.mode		=	0644,
2314          	.proc_handler	=	&proc_dointvec,
2315 	},
2316 	{
2317 		.ctl_name	=	NET_IPV6_ROUTE_GC_MIN_INTERVAL,
2318 		.procname	=	"gc_min_interval",
2319          	.data		=	&ip6_rt_gc_min_interval,
2320 		.maxlen		=	sizeof(int),
2321 		.mode		=	0644,
2322          	.proc_handler	=	&proc_dointvec_jiffies,
2323 		.strategy	=	&sysctl_jiffies,
2324 	},
2325 	{
2326 		.ctl_name	=	NET_IPV6_ROUTE_GC_TIMEOUT,
2327 		.procname	=	"gc_timeout",
2328          	.data		=	&ip6_rt_gc_timeout,
2329 		.maxlen		=	sizeof(int),
2330 		.mode		=	0644,
2331          	.proc_handler	=	&proc_dointvec_jiffies,
2332 		.strategy	=	&sysctl_jiffies,
2333 	},
2334 	{
2335 		.ctl_name	=	NET_IPV6_ROUTE_GC_INTERVAL,
2336 		.procname	=	"gc_interval",
2337          	.data		=	&ip6_rt_gc_interval,
2338 		.maxlen		=	sizeof(int),
2339 		.mode		=	0644,
2340          	.proc_handler	=	&proc_dointvec_jiffies,
2341 		.strategy	=	&sysctl_jiffies,
2342 	},
2343 	{
2344 		.ctl_name	=	NET_IPV6_ROUTE_GC_ELASTICITY,
2345 		.procname	=	"gc_elasticity",
2346          	.data		=	&ip6_rt_gc_elasticity,
2347 		.maxlen		=	sizeof(int),
2348 		.mode		=	0644,
2349          	.proc_handler	=	&proc_dointvec_jiffies,
2350 		.strategy	=	&sysctl_jiffies,
2351 	},
2352 	{
2353 		.ctl_name	=	NET_IPV6_ROUTE_MTU_EXPIRES,
2354 		.procname	=	"mtu_expires",
2355          	.data		=	&ip6_rt_mtu_expires,
2356 		.maxlen		=	sizeof(int),
2357 		.mode		=	0644,
2358          	.proc_handler	=	&proc_dointvec_jiffies,
2359 		.strategy	=	&sysctl_jiffies,
2360 	},
2361 	{
2362 		.ctl_name	=	NET_IPV6_ROUTE_MIN_ADVMSS,
2363 		.procname	=	"min_adv_mss",
2364          	.data		=	&ip6_rt_min_advmss,
2365 		.maxlen		=	sizeof(int),
2366 		.mode		=	0644,
2367          	.proc_handler	=	&proc_dointvec_jiffies,
2368 		.strategy	=	&sysctl_jiffies,
2369 	},
2370 	{
2371 		.ctl_name	=	NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,
2372 		.procname	=	"gc_min_interval_ms",
2373          	.data		=	&ip6_rt_gc_min_interval,
2374 		.maxlen		=	sizeof(int),
2375 		.mode		=	0644,
2376          	.proc_handler	=	&proc_dointvec_ms_jiffies,
2377 		.strategy	=	&sysctl_ms_jiffies,
2378 	},
2379 	{ .ctl_name = 0 }
2380 };
2381 
2382 #endif
2383 
2384 void __init ip6_route_init(void)
2385 {
2386 	struct proc_dir_entry *p;
2387 
2388 	ip6_dst_ops.kmem_cachep = kmem_cache_create("ip6_dst_cache",
2389 						     sizeof(struct rt6_info),
2390 						     0, SLAB_HWCACHE_ALIGN,
2391 						     NULL, NULL);
2392 	if (!ip6_dst_ops.kmem_cachep)
2393 		panic("cannot create ip6_dst_cache");
2394 
2395 	fib6_init();
2396 #ifdef 	CONFIG_PROC_FS
2397 	p = proc_net_create("ipv6_route", 0, rt6_proc_info);
2398 	if (p)
2399 		p->owner = THIS_MODULE;
2400 
2401 	proc_net_fops_create("rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2402 #endif
2403 #ifdef CONFIG_XFRM
2404 	xfrm6_init();
2405 #endif
2406 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2407 	fib6_rules_init();
2408 #endif
2409 }
2410 
/*
 * Tear down what ip6_route_init() set up.
 *
 * The optional pieces (policy rules, proc entries, xfrm6) are removed
 * first, each under the same config option that guarded its setup.  Then
 * rt6_ifdown(NULL) and fib6_gc_cleanup() are called, and finally the dst
 * slab cache is destroyed.
 */
void ip6_route_cleanup(void)
{
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	fib6_rules_cleanup();
#endif
#ifdef CONFIG_PROC_FS
	/* Same names that ip6_route_init() registered. */
	proc_net_remove("ipv6_route");
	proc_net_remove("rt6_stats");
#endif
#ifdef CONFIG_XFRM
	xfrm6_fini();
#endif
	/* NOTE(review): NULL presumably means "all devices" -- confirm in rt6_ifdown(). */
	rt6_ifdown(NULL);
	fib6_gc_cleanup();
	/* Last step: the cache is only destroyed after the calls above. */
	kmem_cache_destroy(ip6_dst_ops.kmem_cachep);
}
2427