xref: /openbmc/linux/net/ipv4/fib_semantics.c (revision 64c70b1c)
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IPv4 Forwarding Information Base: semantics.
 *
 * Version:	$Id: fib_semantics.c,v 1.19 2002/01/12 07:54:56 davem Exp $
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/inetdevice.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/skbuff.h>
#include <linux/init.h>

#include <net/arp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/ip_mp_alg.h>
#include <net/netlink.h>
#include <net/nexthop.h>

#include "fib_lookup.h"

#define FSprintk(a...)

static DEFINE_SPINLOCK(fib_info_lock);
static struct hlist_head *fib_info_hash;
static struct hlist_head *fib_info_laddrhash;
static unsigned int fib_hash_size;
static unsigned int fib_info_cnt;

#define DEVINDEX_HASHBITS 8
#define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];

#ifdef CONFIG_IP_ROUTE_MULTIPATH

static DEFINE_SPINLOCK(fib_multipath_lock);

#define for_nexthops(fi) { int nhsel; const struct fib_nh * nh; \
for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++)

#define change_nexthops(fi) { int nhsel; struct fib_nh * nh; \
for (nhsel=0, nh = (struct fib_nh*)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nh++, nhsel++)

#else /* CONFIG_IP_ROUTE_MULTIPATH */

/* Hope that gcc will optimize out the dummy loop */

#define for_nexthops(fi) { int nhsel=0; const struct fib_nh * nh = (fi)->fib_nh; \
for (nhsel=0; nhsel < 1; nhsel++)

#define change_nexthops(fi) { int nhsel=0; struct fib_nh * nh = (struct fib_nh*)((fi)->fib_nh); \
for (nhsel=0; nhsel < 1; nhsel++)

#endif /* CONFIG_IP_ROUTE_MULTIPATH */

#define endfor_nexthops(fi) }

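/*
 * Usage sketch (illustrative only, guarded out): for_nexthops() opens a
 * brace and declares "nhsel" and "nh" itself, so every walk must be
 * closed with endfor_nexthops(), which supplies the matching brace.
 * The helper below is hypothetical; it only assumes a fib_info "fi".
 */
#if 0
static int count_live_nexthops(const struct fib_info *fi)
{
	int live = 0;

	for_nexthops(fi) {
		if (!(nh->nh_flags & RTNH_F_DEAD))
			live++;
	} endfor_nexthops(fi);

	return live;
}
#endif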

static const struct
{
	int	error;
	u8	scope;
} fib_props[RTN_MAX + 1] = {
	{
		.error	= 0,
		.scope	= RT_SCOPE_NOWHERE,
	},	/* RTN_UNSPEC */
	{
		.error	= 0,
		.scope	= RT_SCOPE_UNIVERSE,
	},	/* RTN_UNICAST */
	{
		.error	= 0,
		.scope	= RT_SCOPE_HOST,
	},	/* RTN_LOCAL */
	{
		.error	= 0,
		.scope	= RT_SCOPE_LINK,
	},	/* RTN_BROADCAST */
	{
		.error	= 0,
		.scope	= RT_SCOPE_LINK,
	},	/* RTN_ANYCAST */
	{
		.error	= 0,
		.scope	= RT_SCOPE_UNIVERSE,
	},	/* RTN_MULTICAST */
	{
		.error	= -EINVAL,
		.scope	= RT_SCOPE_UNIVERSE,
	},	/* RTN_BLACKHOLE */
	{
		.error	= -EHOSTUNREACH,
		.scope	= RT_SCOPE_UNIVERSE,
	},	/* RTN_UNREACHABLE */
	{
		.error	= -EACCES,
		.scope	= RT_SCOPE_UNIVERSE,
	},	/* RTN_PROHIBIT */
	{
		.error	= -EAGAIN,
		.scope	= RT_SCOPE_UNIVERSE,
	},	/* RTN_THROW */
	{
		.error	= -EINVAL,
		.scope	= RT_SCOPE_NOWHERE,
	},	/* RTN_NAT */
	{
		.error	= -EINVAL,
		.scope	= RT_SCOPE_NOWHERE,
	},	/* RTN_XRESOLVE */
};


/* Release a nexthop info record */

void free_fib_info(struct fib_info *fi)
{
	if (fi->fib_dead == 0) {
		printk("Freeing alive fib_info %p\n", fi);
		return;
	}
	change_nexthops(fi) {
		if (nh->nh_dev)
			dev_put(nh->nh_dev);
		nh->nh_dev = NULL;
	} endfor_nexthops(fi);
	fib_info_cnt--;
	kfree(fi);
}

void fib_release_info(struct fib_info *fi)
{
	spin_lock_bh(&fib_info_lock);
	if (fi && --fi->fib_treeref == 0) {
		hlist_del(&fi->fib_hash);
		if (fi->fib_prefsrc)
			hlist_del(&fi->fib_lhash);
		change_nexthops(fi) {
			if (!nh->nh_dev)
				continue;
			hlist_del(&nh->nh_hash);
		} endfor_nexthops(fi)
		fi->fib_dead = 1;
		fib_info_put(fi);
	}
	spin_unlock_bh(&fib_info_lock);
}

static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
{
	const struct fib_nh *onh = ofi->fib_nh;

	for_nexthops(fi) {
		if (nh->nh_oif != onh->nh_oif ||
		    nh->nh_gw  != onh->nh_gw ||
		    nh->nh_scope != onh->nh_scope ||
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		    nh->nh_weight != onh->nh_weight ||
#endif
#ifdef CONFIG_NET_CLS_ROUTE
		    nh->nh_tclassid != onh->nh_tclassid ||
#endif
		    ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD))
			return -1;
		onh++;
	} endfor_nexthops(fi);
	return 0;
}

static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
{
	unsigned int mask = (fib_hash_size - 1);
	unsigned int val = fi->fib_nhs;

	val ^= fi->fib_protocol;
	val ^= (__force u32)fi->fib_prefsrc;
	val ^= fi->fib_priority;

	return (val ^ (val >> 7) ^ (val >> 12)) & mask;
}

static struct fib_info *fib_find_info(const struct fib_info *nfi)
{
	struct hlist_head *head;
	struct hlist_node *node;
	struct fib_info *fi;
	unsigned int hash;

	hash = fib_info_hashfn(nfi);
	head = &fib_info_hash[hash];

	hlist_for_each_entry(fi, node, head, fib_hash) {
		if (fi->fib_nhs != nfi->fib_nhs)
			continue;
		if (nfi->fib_protocol == fi->fib_protocol &&
		    nfi->fib_prefsrc == fi->fib_prefsrc &&
		    nfi->fib_priority == fi->fib_priority &&
		    memcmp(nfi->fib_metrics, fi->fib_metrics,
			   sizeof(fi->fib_metrics)) == 0 &&
		    ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 &&
		    (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
			return fi;
	}

	return NULL;
}
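
/*
 * Illustrative sketch (guarded out): this lookup is what lets identical
 * route semantics share one fib_info.  fib_create_info() further below
 * does, in essence, the following with the result:
 */
#if 0
	ofi = fib_find_info(fi);
	if (ofi) {			/* equivalent record already exists */
		fi->fib_dead = 1;
		free_fib_info(fi);	/* drop the freshly built duplicate */
		ofi->fib_treeref++;	/* ... and share the old record */
		return ofi;
	}
#endif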

static inline unsigned int fib_devindex_hashfn(unsigned int val)
{
	unsigned int mask = DEVINDEX_HASHSIZE - 1;

	return (val ^
		(val >> DEVINDEX_HASHBITS) ^
		(val >> (DEVINDEX_HASHBITS * 2))) & mask;
}

/* Check that the gateway is already configured.
   Used only by the redirect acceptance routine.
 */

int ip_fib_check_default(__be32 gw, struct net_device *dev)
{
	struct hlist_head *head;
	struct hlist_node *node;
	struct fib_nh *nh;
	unsigned int hash;

	spin_lock(&fib_info_lock);

	hash = fib_devindex_hashfn(dev->ifindex);
	head = &fib_info_devhash[hash];
	hlist_for_each_entry(nh, node, head, nh_hash) {
		if (nh->nh_dev == dev &&
		    nh->nh_gw == gw &&
		    !(nh->nh_flags&RTNH_F_DEAD)) {
			spin_unlock(&fib_info_lock);
			return 0;
		}
	}

	spin_unlock(&fib_info_lock);

	return -1;
}
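
/*
 * Hedged caller sketch (guarded out): per the comment above, the one
 * user is the ICMP redirect acceptance path in route.c, which honours a
 * redirect towards "new_gw" on "dev" only if that gateway is already a
 * configured default gateway there.  Roughly (the label and surrounding
 * control flow here are illustrative, not a verbatim copy):
 */
#if 0
	if (ip_fib_check_default(new_gw, dev))
		goto reject_redirect;
#endif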

static inline size_t fib_nlmsg_size(struct fib_info *fi)
{
	size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg))
			 + nla_total_size(4) /* RTA_TABLE */
			 + nla_total_size(4) /* RTA_DST */
			 + nla_total_size(4) /* RTA_PRIORITY */
			 + nla_total_size(4); /* RTA_PREFSRC */

	/* space for nested metrics */
	payload += nla_total_size((RTAX_MAX * nla_total_size(4)));

	if (fi->fib_nhs) {
		/* Also handles the special case fib_nhs == 1 */

		/* each nexthop is packed in an attribute */
		size_t nhsize = nla_total_size(sizeof(struct rtnexthop));

		/* may contain flow and gateway attribute */
		nhsize += 2 * nla_total_size(4);

		/* all nexthops are packed in a nested attribute */
		payload += nla_total_size(fi->fib_nhs * nhsize);
	}

	return payload;
}

void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
	       int dst_len, u32 tb_id, struct nl_info *info,
	       unsigned int nlm_flags)
{
	struct sk_buff *skb;
	u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
	int err = -ENOBUFS;

	skb = nlmsg_new(fib_nlmsg_size(fa->fa_info), GFP_KERNEL);
	if (skb == NULL)
		goto errout;

	err = fib_dump_info(skb, info->pid, seq, event, tb_id,
			    fa->fa_type, fa->fa_scope, key, dst_len,
			    fa->fa_tos, fa->fa_info, nlm_flags);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in fib_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	err = rtnl_notify(skb, info->pid, RTNLGRP_IPV4_ROUTE,
			  info->nlh, GFP_KERNEL);
errout:
	if (err < 0)
		rtnl_set_sk_err(RTNLGRP_IPV4_ROUTE, err);
}

/* Return the first fib alias matching TOS with
 * priority less than or equal to PRIO.
 */
struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)
{
	if (fah) {
		struct fib_alias *fa;
		list_for_each_entry(fa, fah, fa_list) {
			if (fa->fa_tos > tos)
				continue;
			if (fa->fa_info->fib_priority >= prio ||
			    fa->fa_tos < tos)
				return fa;
		}
	}
	return NULL;
}

int fib_detect_death(struct fib_info *fi, int order,
		     struct fib_info **last_resort, int *last_idx, int *dflt)
{
	struct neighbour *n;
	int state = NUD_NONE;

	n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev);
	if (n) {
		state = n->nud_state;
		neigh_release(n);
	}
	if (state==NUD_REACHABLE)
		return 0;
	if ((state&NUD_VALID) && order != *dflt)
		return 0;
	if ((state&NUD_VALID) ||
	    (*last_idx<0 && order > *dflt)) {
		*last_resort = fi;
		*last_idx = order;
	}
	return 1;
}

#ifdef CONFIG_IP_ROUTE_MULTIPATH

static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining)
{
	int nhs = 0;

	while (rtnh_ok(rtnh, remaining)) {
		nhs++;
		rtnh = rtnh_next(rtnh, &remaining);
	}

	/* leftover implies invalid nexthop configuration, discard it */
	return remaining > 0 ? 0 : nhs;
}

static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
		       int remaining, struct fib_config *cfg)
{
	change_nexthops(fi) {
		int attrlen;

		if (!rtnh_ok(rtnh, remaining))
			return -EINVAL;

		nh->nh_flags = (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
		nh->nh_oif = rtnh->rtnh_ifindex;
		nh->nh_weight = rtnh->rtnh_hops + 1;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			nh->nh_gw = nla ? nla_get_be32(nla) : 0;
#ifdef CONFIG_NET_CLS_ROUTE
			nla = nla_find(attrs, attrlen, RTA_FLOW);
			nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
#endif
		}

		rtnh = rtnh_next(rtnh, &remaining);
	} endfor_nexthops(fi);

	return 0;
}

#endif

int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	struct rtnexthop *rtnh;
	int remaining;
#endif

	if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority)
		return 1;

	if (cfg->fc_oif || cfg->fc_gw) {
		if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) &&
		    (!cfg->fc_gw  || cfg->fc_gw == fi->fib_nh->nh_gw))
			return 0;
		return 1;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (cfg->fc_mp == NULL)
		return 0;

	rtnh = cfg->fc_mp;
	remaining = cfg->fc_mp_len;

	for_nexthops(fi) {
		int attrlen;

		if (!rtnh_ok(rtnh, remaining))
			return -EINVAL;

		if (rtnh->rtnh_ifindex && rtnh->rtnh_ifindex != nh->nh_oif)
			return 1;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla && nla_get_be32(nla) != nh->nh_gw)
				return 1;
#ifdef CONFIG_NET_CLS_ROUTE
			nla = nla_find(attrs, attrlen, RTA_FLOW);
			if (nla && nla_get_u32(nla) != nh->nh_tclassid)
				return 1;
#endif
		}

		rtnh = rtnh_next(rtnh, &remaining);
	} endfor_nexthops(fi);
#endif
	return 0;
}


/*
   Picture
   -------

   The semantics of nexthops is very messy for historical reasons.
   We have to take into account that:
   a) the gateway can actually be a local interface address,
      so that a gatewayed route is direct.
   b) the gateway must be an on-link address, possibly
      described not by an ifaddr, but also by a direct route.
   c) if both gateway and interface are specified, they should not
      contradict each other.
   d) if we use tunnel routes, the gateway could be off-link.

   Attempting to reconcile all of these (alas, self-contradictory)
   conditions results in pretty ugly and hairy code with obscure logic.

   I chose to generalize it instead, so that the size
   of the code does not increase practically, but it becomes
   much more general.
   Every prefix is assigned a "scope" value: "host" is a local address,
   "link" is a direct route,
   [ ... "site" ... "interior" ... ]
   and "universe" is a true gateway route with global meaning.

   Every prefix refers to a set of "nexthop"s (gw, oif),
   where gw must have narrower scope. This recursion stops
   when gw has LOCAL scope or if "nexthop" is declared ONLINK,
   which means that gw is forced to be on link.

   The code is still hairy, but now it is apparently logically
   consistent and very flexible. E.g. as a by-product it allows
   independent exterior and interior routing processes to
   coexist in peace.

   Normally it looks like the following.

   {universe prefix}  -> (gw, oif) [scope link]
			  |
			  |-> {link prefix} -> (gw, oif) [scope local]
						|
						|-> {local prefix} (terminal node)
 */
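
/*
 * A minimal sketch of the scope rule above (illustrative, guarded out;
 * the helper name is hypothetical).  RT_SCOPE_* values grow numerically
 * as the scope narrows: UNIVERSE(0) < SITE(200) < LINK(253) < HOST(254)
 * < NOWHERE(255).  fib_check_nh() below applies the rule by looking up
 * the gateway at "cfg->fc_scope + 1", i.e. one step narrower than the
 * route being added, which is what makes the recursion terminate.
 */
#if 0
static int nh_scope_is_narrower(unsigned char route_scope,
				unsigned char gw_scope)
{
	return gw_scope > route_scope;
}
#endif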

static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
			struct fib_nh *nh)
{
	int err;

	if (nh->nh_gw) {
		struct fib_result res;

#ifdef CONFIG_IP_ROUTE_PERVASIVE
		if (nh->nh_flags&RTNH_F_PERVASIVE)
			return 0;
#endif
		if (nh->nh_flags&RTNH_F_ONLINK) {
			struct net_device *dev;

			if (cfg->fc_scope >= RT_SCOPE_LINK)
				return -EINVAL;
			if (inet_addr_type(nh->nh_gw) != RTN_UNICAST)
				return -EINVAL;
			if ((dev = __dev_get_by_index(nh->nh_oif)) == NULL)
				return -ENODEV;
			if (!(dev->flags&IFF_UP))
				return -ENETDOWN;
			nh->nh_dev = dev;
			dev_hold(dev);
			nh->nh_scope = RT_SCOPE_LINK;
			return 0;
		}
		{
			struct flowi fl = {
				.nl_u = {
					.ip4_u = {
						.daddr = nh->nh_gw,
						.scope = cfg->fc_scope + 1,
					},
				},
				.oif = nh->nh_oif,
			};

			/* It is not necessary, but requires a bit of thinking */
			if (fl.fl4_scope < RT_SCOPE_LINK)
				fl.fl4_scope = RT_SCOPE_LINK;
			if ((err = fib_lookup(&fl, &res)) != 0)
				return err;
		}
		err = -EINVAL;
		if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
			goto out;
		nh->nh_scope = res.scope;
		nh->nh_oif = FIB_RES_OIF(res);
		if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL)
			goto out;
		dev_hold(nh->nh_dev);
		err = -ENETDOWN;
		if (!(nh->nh_dev->flags & IFF_UP))
			goto out;
		err = 0;
out:
		fib_res_put(&res);
		return err;
	} else {
		struct in_device *in_dev;

		if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK))
			return -EINVAL;

		in_dev = inetdev_by_index(nh->nh_oif);
		if (in_dev == NULL)
			return -ENODEV;
		if (!(in_dev->dev->flags&IFF_UP)) {
			in_dev_put(in_dev);
			return -ENETDOWN;
		}
		nh->nh_dev = in_dev->dev;
		dev_hold(nh->nh_dev);
		nh->nh_scope = RT_SCOPE_HOST;
		in_dev_put(in_dev);
	}
	return 0;
}

static inline unsigned int fib_laddr_hashfn(__be32 val)
{
	unsigned int mask = (fib_hash_size - 1);

	return ((__force u32)val ^ ((__force u32)val >> 7) ^ ((__force u32)val >> 14)) & mask;
}

static struct hlist_head *fib_hash_alloc(int bytes)
{
	if (bytes <= PAGE_SIZE)
		return kmalloc(bytes, GFP_KERNEL);
	else
		return (struct hlist_head *)
			__get_free_pages(GFP_KERNEL, get_order(bytes));
}

static void fib_hash_free(struct hlist_head *hash, int bytes)
{
	if (!hash)
		return;

	if (bytes <= PAGE_SIZE)
		kfree(hash);
	else
		free_pages((unsigned long) hash, get_order(bytes));
}

static void fib_hash_move(struct hlist_head *new_info_hash,
			  struct hlist_head *new_laddrhash,
			  unsigned int new_size)
{
	struct hlist_head *old_info_hash, *old_laddrhash;
	unsigned int old_size = fib_hash_size;
	unsigned int i, bytes;

	spin_lock_bh(&fib_info_lock);
	old_info_hash = fib_info_hash;
	old_laddrhash = fib_info_laddrhash;
	fib_hash_size = new_size;

	for (i = 0; i < old_size; i++) {
		struct hlist_head *head = &fib_info_hash[i];
		struct hlist_node *node, *n;
		struct fib_info *fi;

		hlist_for_each_entry_safe(fi, node, n, head, fib_hash) {
			struct hlist_head *dest;
			unsigned int new_hash;

			hlist_del(&fi->fib_hash);

			new_hash = fib_info_hashfn(fi);
			dest = &new_info_hash[new_hash];
			hlist_add_head(&fi->fib_hash, dest);
		}
	}
	fib_info_hash = new_info_hash;

	for (i = 0; i < old_size; i++) {
		struct hlist_head *lhead = &fib_info_laddrhash[i];
		struct hlist_node *node, *n;
		struct fib_info *fi;

		hlist_for_each_entry_safe(fi, node, n, lhead, fib_lhash) {
			struct hlist_head *ldest;
			unsigned int new_hash;

			hlist_del(&fi->fib_lhash);

			new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
			ldest = &new_laddrhash[new_hash];
			hlist_add_head(&fi->fib_lhash, ldest);
		}
	}
	fib_info_laddrhash = new_laddrhash;

	spin_unlock_bh(&fib_info_lock);

	bytes = old_size * sizeof(struct hlist_head *);
	fib_hash_free(old_info_hash, bytes);
	fib_hash_free(old_laddrhash, bytes);
}

struct fib_info *fib_create_info(struct fib_config *cfg)
{
	int err;
	struct fib_info *fi = NULL;
	struct fib_info *ofi;
	int nhs = 1;

	/* Fast check to catch the weirdest cases */
	if (fib_props[cfg->fc_type].scope > cfg->fc_scope)
		goto err_inval;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (cfg->fc_mp) {
		nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len);
		if (nhs == 0)
			goto err_inval;
	}
#endif
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
	if (cfg->fc_mp_alg) {
		if (cfg->fc_mp_alg < IP_MP_ALG_NONE ||
		    cfg->fc_mp_alg > IP_MP_ALG_MAX)
			goto err_inval;
	}
#endif

	err = -ENOBUFS;
	if (fib_info_cnt >= fib_hash_size) {
		unsigned int new_size = fib_hash_size << 1;
		struct hlist_head *new_info_hash;
		struct hlist_head *new_laddrhash;
		unsigned int bytes;

		if (!new_size)
			new_size = 1;
		bytes = new_size * sizeof(struct hlist_head *);
		new_info_hash = fib_hash_alloc(bytes);
		new_laddrhash = fib_hash_alloc(bytes);
		if (!new_info_hash || !new_laddrhash) {
			fib_hash_free(new_info_hash, bytes);
			fib_hash_free(new_laddrhash, bytes);
		} else {
			memset(new_info_hash, 0, bytes);
			memset(new_laddrhash, 0, bytes);

			fib_hash_move(new_info_hash, new_laddrhash, new_size);
		}

		if (!fib_hash_size)
			goto failure;
	}

	fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
	if (fi == NULL)
		goto failure;
	fib_info_cnt++;

	fi->fib_protocol = cfg->fc_protocol;
	fi->fib_flags = cfg->fc_flags;
	fi->fib_priority = cfg->fc_priority;
	fi->fib_prefsrc = cfg->fc_prefsrc;

	fi->fib_nhs = nhs;
	change_nexthops(fi) {
		nh->nh_parent = fi;
	} endfor_nexthops(fi)

	if (cfg->fc_mx) {
		struct nlattr *nla;
		int remaining;

		nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
			int type = nla->nla_type;

			if (type) {
				if (type > RTAX_MAX)
					goto err_inval;
				fi->fib_metrics[type - 1] = nla_get_u32(nla);
			}
		}
	}

	if (cfg->fc_mp) {
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg);
		if (err != 0)
			goto failure;
		if (cfg->fc_oif && fi->fib_nh->nh_oif != cfg->fc_oif)
			goto err_inval;
		if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw)
			goto err_inval;
#ifdef CONFIG_NET_CLS_ROUTE
		if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow)
			goto err_inval;
#endif
#else
		goto err_inval;
#endif
	} else {
		struct fib_nh *nh = fi->fib_nh;

		nh->nh_oif = cfg->fc_oif;
		nh->nh_gw = cfg->fc_gw;
		nh->nh_flags = cfg->fc_flags;
#ifdef CONFIG_NET_CLS_ROUTE
		nh->nh_tclassid = cfg->fc_flow;
#endif
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		nh->nh_weight = 1;
#endif
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
	fi->fib_mp_alg = cfg->fc_mp_alg;
#endif

	if (fib_props[cfg->fc_type].error) {
		if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp)
			goto err_inval;
		goto link_it;
	}

	if (cfg->fc_scope > RT_SCOPE_HOST)
		goto err_inval;

	if (cfg->fc_scope == RT_SCOPE_HOST) {
		struct fib_nh *nh = fi->fib_nh;

		/* Local address is added. */
		if (nhs != 1 || nh->nh_gw)
			goto err_inval;
		nh->nh_scope = RT_SCOPE_NOWHERE;
		nh->nh_dev = dev_get_by_index(fi->fib_nh->nh_oif);
		err = -ENODEV;
		if (nh->nh_dev == NULL)
			goto failure;
	} else {
		change_nexthops(fi) {
			if ((err = fib_check_nh(cfg, fi, nh)) != 0)
				goto failure;
		} endfor_nexthops(fi)
	}

	if (fi->fib_prefsrc) {
		if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst ||
		    fi->fib_prefsrc != cfg->fc_dst)
			if (inet_addr_type(fi->fib_prefsrc) != RTN_LOCAL)
				goto err_inval;
	}

link_it:
	if ((ofi = fib_find_info(fi)) != NULL) {
		fi->fib_dead = 1;
		free_fib_info(fi);
		ofi->fib_treeref++;
		return ofi;
	}

	fi->fib_treeref++;
	atomic_inc(&fi->fib_clntref);
	spin_lock_bh(&fib_info_lock);
	hlist_add_head(&fi->fib_hash,
		       &fib_info_hash[fib_info_hashfn(fi)]);
	if (fi->fib_prefsrc) {
		struct hlist_head *head;

		head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
		hlist_add_head(&fi->fib_lhash, head);
	}
	change_nexthops(fi) {
		struct hlist_head *head;
		unsigned int hash;

		if (!nh->nh_dev)
			continue;
		hash = fib_devindex_hashfn(nh->nh_dev->ifindex);
		head = &fib_info_devhash[hash];
		hlist_add_head(&nh->nh_hash, head);
	} endfor_nexthops(fi)
	spin_unlock_bh(&fib_info_lock);
	return fi;

err_inval:
	err = -EINVAL;

failure:
	if (fi) {
		fi->fib_dead = 1;
		free_fib_info(fi);
	}

	return ERR_PTR(err);
}

/* Note! fib_semantic_match intentionally uses RCU list functions. */
int fib_semantic_match(struct list_head *head, const struct flowi *flp,
		       struct fib_result *res, __be32 zone, __be32 mask,
		       int prefixlen)
{
	struct fib_alias *fa;
	int nh_sel = 0;

	list_for_each_entry_rcu(fa, head, fa_list) {
		int err;

		if (fa->fa_tos &&
		    fa->fa_tos != flp->fl4_tos)
			continue;

		if (fa->fa_scope < flp->fl4_scope)
			continue;

		fa->fa_state |= FA_S_ACCESSED;

		err = fib_props[fa->fa_type].error;
		if (err == 0) {
			struct fib_info *fi = fa->fa_info;

			if (fi->fib_flags & RTNH_F_DEAD)
				continue;

			switch (fa->fa_type) {
			case RTN_UNICAST:
			case RTN_LOCAL:
			case RTN_BROADCAST:
			case RTN_ANYCAST:
			case RTN_MULTICAST:
				for_nexthops(fi) {
					if (nh->nh_flags&RTNH_F_DEAD)
						continue;
					if (!flp->oif || flp->oif == nh->nh_oif)
						break;
				}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
				if (nhsel < fi->fib_nhs) {
					nh_sel = nhsel;
					goto out_fill_res;
				}
#else
				if (nhsel < 1) {
					goto out_fill_res;
				}
#endif
				endfor_nexthops(fi);
				continue;

			default:
				printk(KERN_DEBUG "impossible 102\n");
				return -EINVAL;
			}
		}
		return err;
	}
	return 1;

out_fill_res:
	res->prefixlen = prefixlen;
	res->nh_sel = nh_sel;
	res->type = fa->fa_type;
	res->scope = fa->fa_scope;
	res->fi = fa->fa_info;
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
	res->netmask = mask;
	res->network = zone & inet_make_mask(prefixlen);
#endif
	atomic_inc(&res->fi->fib_clntref);
	return 0;
}

/* Find an appropriate source address for this destination */

__be32 __fib_res_prefsrc(struct fib_result *res)
{
	return inet_select_addr(FIB_RES_DEV(*res), FIB_RES_GW(*res), res->scope);
}

int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
		  u32 tb_id, u8 type, u8 scope, __be32 dst, int dst_len, u8 tos,
		  struct fib_info *fi, unsigned int flags)
{
	struct nlmsghdr *nlh;
	struct rtmsg *rtm;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*rtm), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET;
	rtm->rtm_dst_len = dst_len;
	rtm->rtm_src_len = 0;
	rtm->rtm_tos = tos;
	rtm->rtm_table = tb_id;
	NLA_PUT_U32(skb, RTA_TABLE, tb_id);
	rtm->rtm_type = type;
	rtm->rtm_flags = fi->fib_flags;
	rtm->rtm_scope = scope;
	rtm->rtm_protocol = fi->fib_protocol;

	if (rtm->rtm_dst_len)
		NLA_PUT_BE32(skb, RTA_DST, dst);

	if (fi->fib_priority)
		NLA_PUT_U32(skb, RTA_PRIORITY, fi->fib_priority);

	if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
		goto nla_put_failure;

	if (fi->fib_prefsrc)
		NLA_PUT_BE32(skb, RTA_PREFSRC, fi->fib_prefsrc);

	if (fi->fib_nhs == 1) {
		if (fi->fib_nh->nh_gw)
			NLA_PUT_BE32(skb, RTA_GATEWAY, fi->fib_nh->nh_gw);

		if (fi->fib_nh->nh_oif)
			NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif);
#ifdef CONFIG_NET_CLS_ROUTE
		if (fi->fib_nh[0].nh_tclassid)
			NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid);
#endif
	}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (fi->fib_nhs > 1) {
		struct rtnexthop *rtnh;
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (mp == NULL)
			goto nla_put_failure;

		for_nexthops(fi) {
			rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
			if (rtnh == NULL)
				goto nla_put_failure;

			rtnh->rtnh_flags = nh->nh_flags & 0xFF;
			rtnh->rtnh_hops = nh->nh_weight - 1;
			rtnh->rtnh_ifindex = nh->nh_oif;

			if (nh->nh_gw)
				NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw);
#ifdef CONFIG_NET_CLS_ROUTE
			if (nh->nh_tclassid)
				NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid);
#endif
			/* length of rtnetlink header + attributes */
			rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
		} endfor_nexthops(fi);

		nla_nest_end(skb, mp);
	}
#endif
	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

/*
   Update the FIB if:
   - a local address disappeared -> we must delete all the entries
     referring to it.
   - a device went down -> we must shut down all nexthops going via it.
 */

int fib_sync_down(__be32 local, struct net_device *dev, int force)
{
	int ret = 0;
	int scope = RT_SCOPE_NOWHERE;

	if (force)
		scope = -1;

	if (local && fib_info_laddrhash) {
		unsigned int hash = fib_laddr_hashfn(local);
		struct hlist_head *head = &fib_info_laddrhash[hash];
		struct hlist_node *node;
		struct fib_info *fi;

		hlist_for_each_entry(fi, node, head, fib_lhash) {
			if (fi->fib_prefsrc == local) {
				fi->fib_flags |= RTNH_F_DEAD;
				ret++;
			}
		}
	}

	if (dev) {
		struct fib_info *prev_fi = NULL;
		unsigned int hash = fib_devindex_hashfn(dev->ifindex);
		struct hlist_head *head = &fib_info_devhash[hash];
		struct hlist_node *node;
		struct fib_nh *nh;

		hlist_for_each_entry(nh, node, head, nh_hash) {
			struct fib_info *fi = nh->nh_parent;
			int dead;

			BUG_ON(!fi->fib_nhs);
			if (nh->nh_dev != dev || fi == prev_fi)
				continue;
			prev_fi = fi;
			dead = 0;
			change_nexthops(fi) {
				if (nh->nh_flags&RTNH_F_DEAD)
					dead++;
				else if (nh->nh_dev == dev &&
					 nh->nh_scope != scope) {
					nh->nh_flags |= RTNH_F_DEAD;
#ifdef CONFIG_IP_ROUTE_MULTIPATH
					spin_lock_bh(&fib_multipath_lock);
					fi->fib_power -= nh->nh_power;
					nh->nh_power = 0;
					spin_unlock_bh(&fib_multipath_lock);
#endif
					dead++;
				}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
				if (force > 1 && nh->nh_dev == dev) {
					dead = fi->fib_nhs;
					break;
				}
#endif
			} endfor_nexthops(fi)
			if (dead == fi->fib_nhs) {
				fi->fib_flags |= RTNH_F_DEAD;
				ret++;
			}
		}
	}

	return ret;
}
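
/*
 * Hedged usage sketch (guarded out): the real callers live elsewhere
 * (e.g. in fib_frontend.c).  Conceptually, the two cases named in the
 * comment above map onto calls like the following; the surrounding
 * control flow here is illustrative, not a verbatim copy.
 */
#if 0
	/* a local address went away: */
	if (fib_sync_down(ifa->ifa_local, NULL, 0))
		fib_flush();

	/* a device went down: */
	if (fib_sync_down(0, dev, force))
		fib_flush();
#endif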

#ifdef CONFIG_IP_ROUTE_MULTIPATH

/*
   A dead device goes up. We wake up dead nexthops.
   This makes sense only for multipath routes.
 */

int fib_sync_up(struct net_device *dev)
{
	struct fib_info *prev_fi;
	unsigned int hash;
	struct hlist_head *head;
	struct hlist_node *node;
	struct fib_nh *nh;
	int ret;

	if (!(dev->flags&IFF_UP))
		return 0;

	prev_fi = NULL;
	hash = fib_devindex_hashfn(dev->ifindex);
	head = &fib_info_devhash[hash];
	ret = 0;

	hlist_for_each_entry(nh, node, head, nh_hash) {
		struct fib_info *fi = nh->nh_parent;
		int alive;

		BUG_ON(!fi->fib_nhs);
		if (nh->nh_dev != dev || fi == prev_fi)
			continue;

		prev_fi = fi;
		alive = 0;
		change_nexthops(fi) {
			if (!(nh->nh_flags&RTNH_F_DEAD)) {
				alive++;
				continue;
			}
			if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP))
				continue;
			if (nh->nh_dev != dev || !__in_dev_get_rtnl(dev))
				continue;
			alive++;
			spin_lock_bh(&fib_multipath_lock);
			nh->nh_power = 0;
			nh->nh_flags &= ~RTNH_F_DEAD;
			spin_unlock_bh(&fib_multipath_lock);
		} endfor_nexthops(fi)

		if (alive > 0) {
			fi->fib_flags &= ~RTNH_F_DEAD;
			ret++;
		}
	}

	return ret;
}

/*
   The algorithm is suboptimal, but it provides really
   fair weighted route distribution.
 */

void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
{
	struct fib_info *fi = res->fi;
	int w;

	spin_lock_bh(&fib_multipath_lock);
	if (fi->fib_power <= 0) {
		int power = 0;
		change_nexthops(fi) {
			if (!(nh->nh_flags&RTNH_F_DEAD)) {
				power += nh->nh_weight;
				nh->nh_power = nh->nh_weight;
			}
		} endfor_nexthops(fi);
		fi->fib_power = power;
		if (power <= 0) {
			spin_unlock_bh(&fib_multipath_lock);
			/* Race condition: route has just become dead. */
			res->nh_sel = 0;
			return;
		}
	}

	/* w should be a random number in [0..fi->fib_power-1];
	   jiffies is a pretty bad approximation.
	 */

	w = jiffies % fi->fib_power;

	change_nexthops(fi) {
		if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) {
			if ((w -= nh->nh_power) <= 0) {
				nh->nh_power--;
				fi->fib_power--;
				res->nh_sel = nhsel;
				spin_unlock_bh(&fib_multipath_lock);
				return;
			}
		}
	} endfor_nexthops(fi);

	/* Race condition: route has just become dead. */
	res->nh_sel = 0;
	spin_unlock_bh(&fib_multipath_lock);
}
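
/*
 * User-space simulation of the weighted draw above (illustrative,
 * guarded out).  Each refill round hands out fib_power draws in total,
 * nh_power of them per nexthop, so over time traffic follows the
 * configured weights; with weights 2:1 the counts below come out
 * roughly 2:1.  All names here are local to the sketch, not kernel API.
 */
#if 0
#include <stdio.h>
#include <stdlib.h>

struct sim_nh { int weight, power, dead; };

static int sim_select(struct sim_nh *nh, int n, int *fib_power,
		      unsigned int rnd)
{
	int i, w;

	if (*fib_power <= 0) {		/* refill round: power := weight */
		int power = 0;

		for (i = 0; i < n; i++) {
			if (!nh[i].dead) {
				power += nh[i].weight;
				nh[i].power = nh[i].weight;
			}
		}
		*fib_power = power;
		if (power <= 0)
			return 0;	/* all nexthops dead */
	}

	w = rnd % *fib_power;		/* the kernel uses jiffies here */
	for (i = 0; i < n; i++) {
		if (!nh[i].dead && nh[i].power) {
			if ((w -= nh[i].power) <= 0) {
				nh[i].power--;
				(*fib_power)--;
				return i;
			}
		}
	}
	return 0;
}

int main(void)
{
	struct sim_nh nh[2] = { { .weight = 2 }, { .weight = 1 } };
	int fib_power = 0, hits[2] = { 0, 0 }, i;

	for (i = 0; i < 9000; i++)
		hits[sim_select(nh, 2, &fib_power, (unsigned int)rand())]++;
	printf("nh0=%d nh1=%d (expect ~2:1)\n", hits[0], hits[1]);
	return 0;
}
#endif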
#endif
1229