xref: /openbmc/linux/net/ipv4/fib_semantics.c (revision 96de0e252cedffad61b3cb5e05662c591898e69a)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		IPv4 Forwarding Information Base: semantics.
7  *
8  * Version:	$Id: fib_semantics.c,v 1.19 2002/01/12 07:54:56 davem Exp $
9  *
10  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
11  *
12  *		This program is free software; you can redistribute it and/or
13  *		modify it under the terms of the GNU General Public License
14  *		as published by the Free Software Foundation; either version
15  *		2 of the License, or (at your option) any later version.
16  */
17 
18 #include <asm/uaccess.h>
19 #include <asm/system.h>
20 #include <linux/bitops.h>
21 #include <linux/types.h>
22 #include <linux/kernel.h>
23 #include <linux/jiffies.h>
24 #include <linux/mm.h>
25 #include <linux/string.h>
26 #include <linux/socket.h>
27 #include <linux/sockios.h>
28 #include <linux/errno.h>
29 #include <linux/in.h>
30 #include <linux/inet.h>
31 #include <linux/inetdevice.h>
32 #include <linux/netdevice.h>
33 #include <linux/if_arp.h>
34 #include <linux/proc_fs.h>
35 #include <linux/skbuff.h>
36 #include <linux/init.h>
37 
38 #include <net/arp.h>
39 #include <net/ip.h>
40 #include <net/protocol.h>
41 #include <net/route.h>
42 #include <net/tcp.h>
43 #include <net/sock.h>
44 #include <net/ip_fib.h>
45 #include <net/netlink.h>
46 #include <net/nexthop.h>
47 
48 #include "fib_lookup.h"
49 
50 #define FSprintk(a...)
51 
52 static DEFINE_SPINLOCK(fib_info_lock);
53 static struct hlist_head *fib_info_hash;
54 static struct hlist_head *fib_info_laddrhash;
55 static unsigned int fib_hash_size;
56 static unsigned int fib_info_cnt;
57 
58 #define DEVINDEX_HASHBITS 8
59 #define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
60 static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
61 
62 #ifdef CONFIG_IP_ROUTE_MULTIPATH
63 
64 static DEFINE_SPINLOCK(fib_multipath_lock);
65 
66 #define for_nexthops(fi) { int nhsel; const struct fib_nh * nh; \
67 for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++)
68 
69 #define change_nexthops(fi) { int nhsel; struct fib_nh * nh; \
70 for (nhsel=0, nh = (struct fib_nh*)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nh++, nhsel++)
71 
72 #else /* CONFIG_IP_ROUTE_MULTIPATH */
73 
74 /* Hope, that gcc will optimize it to get rid of dummy loop */
75 
76 #define for_nexthops(fi) { int nhsel=0; const struct fib_nh * nh = (fi)->fib_nh; \
77 for (nhsel=0; nhsel < 1; nhsel++)
78 
79 #define change_nexthops(fi) { int nhsel=0; struct fib_nh * nh = (struct fib_nh*)((fi)->fib_nh); \
80 for (nhsel=0; nhsel < 1; nhsel++)
81 
82 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
83 
84 #define endfor_nexthops(fi) }
85 
86 
87 static const struct
88 {
89 	int	error;
90 	u8	scope;
91 } fib_props[RTN_MAX + 1] = {
92 	{
93 		.error	= 0,
94 		.scope	= RT_SCOPE_NOWHERE,
95 	},	/* RTN_UNSPEC */
96 	{
97 		.error	= 0,
98 		.scope	= RT_SCOPE_UNIVERSE,
99 	},	/* RTN_UNICAST */
100 	{
101 		.error	= 0,
102 		.scope	= RT_SCOPE_HOST,
103 	},	/* RTN_LOCAL */
104 	{
105 		.error	= 0,
106 		.scope	= RT_SCOPE_LINK,
107 	},	/* RTN_BROADCAST */
108 	{
109 		.error	= 0,
110 		.scope	= RT_SCOPE_LINK,
111 	},	/* RTN_ANYCAST */
112 	{
113 		.error	= 0,
114 		.scope	= RT_SCOPE_UNIVERSE,
115 	},	/* RTN_MULTICAST */
116 	{
117 		.error	= -EINVAL,
118 		.scope	= RT_SCOPE_UNIVERSE,
119 	},	/* RTN_BLACKHOLE */
120 	{
121 		.error	= -EHOSTUNREACH,
122 		.scope	= RT_SCOPE_UNIVERSE,
123 	},	/* RTN_UNREACHABLE */
124 	{
125 		.error	= -EACCES,
126 		.scope	= RT_SCOPE_UNIVERSE,
127 	},	/* RTN_PROHIBIT */
128 	{
129 		.error	= -EAGAIN,
130 		.scope	= RT_SCOPE_UNIVERSE,
131 	},	/* RTN_THROW */
132 	{
133 		.error	= -EINVAL,
134 		.scope	= RT_SCOPE_NOWHERE,
135 	},	/* RTN_NAT */
136 	{
137 		.error	= -EINVAL,
138 		.scope	= RT_SCOPE_NOWHERE,
139 	},	/* RTN_XRESOLVE */
140 };
141 
142 
143 /* Release a nexthop info record */
144 
void free_fib_info(struct fib_info *fi)
{
	/* Only entries already marked dead (by fib_release_info() or the
	 * fib_create_info() error path) may be freed; complain and bail
	 * if someone tries to free a live one. */
	if (fi->fib_dead == 0) {
		printk("Freeing alive fib_info %p\n", fi);
		return;
	}
	/* Drop the device reference each nexthop holds (taken via
	 * dev_hold() in fib_check_nh()/fib_create_info()). */
	change_nexthops(fi) {
		if (nh->nh_dev)
			dev_put(nh->nh_dev);
		nh->nh_dev = NULL;
	} endfor_nexthops(fi);
	fib_info_cnt--;
	kfree(fi);
}
159 
/* Drop one tree reference on @fi.  When the last tree reference goes
 * away, unlink the entry from the info/laddr/dev hash tables, mark it
 * dead and drop the refcount taken in fib_create_info(). */
void fib_release_info(struct fib_info *fi)
{
	spin_lock_bh(&fib_info_lock);
	if (fi && --fi->fib_treeref == 0) {
		hlist_del(&fi->fib_hash);
		/* only prefsrc-bearing entries were added to laddrhash */
		if (fi->fib_prefsrc)
			hlist_del(&fi->fib_lhash);
		/* nexthops without a device were never hashed into
		 * fib_info_devhash (see fib_create_info()) */
		change_nexthops(fi) {
			if (!nh->nh_dev)
				continue;
			hlist_del(&nh->nh_hash);
		} endfor_nexthops(fi)
		fi->fib_dead = 1;
		fib_info_put(fi);
	}
	spin_unlock_bh(&fib_info_lock);
}
177 
/* Compare the nexthop arrays of two fib_infos (caller has already
 * checked they have the same fib_nhs).  Returns 0 when equivalent,
 * -1 as soon as any nexthop differs.  RTNH_F_DEAD is masked out of
 * the flags comparison so liveness does not affect equality. */
static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
{
	const struct fib_nh *onh = ofi->fib_nh;

	for_nexthops(fi) {
		if (nh->nh_oif != onh->nh_oif ||
		    nh->nh_gw  != onh->nh_gw ||
		    nh->nh_scope != onh->nh_scope ||
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		    nh->nh_weight != onh->nh_weight ||
#endif
#ifdef CONFIG_NET_CLS_ROUTE
		    nh->nh_tclassid != onh->nh_tclassid ||
#endif
		    ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD))
			return -1;
		onh++;
	} endfor_nexthops(fi);
	return 0;
}
198 
199 static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
200 {
201 	unsigned int mask = (fib_hash_size - 1);
202 	unsigned int val = fi->fib_nhs;
203 
204 	val ^= fi->fib_protocol;
205 	val ^= (__force u32)fi->fib_prefsrc;
206 	val ^= fi->fib_priority;
207 
208 	return (val ^ (val >> 7) ^ (val >> 12)) & mask;
209 }
210 
/* Look for an existing fib_info equivalent to @nfi in fib_info_hash,
 * so routes with identical semantics can share one record.
 * RTNH_F_DEAD is ignored in the flags comparison; nexthop arrays are
 * compared via nh_comp().  Returns the match or NULL. */
static struct fib_info *fib_find_info(const struct fib_info *nfi)
{
	struct hlist_head *head;
	struct hlist_node *node;
	struct fib_info *fi;
	unsigned int hash;

	hash = fib_info_hashfn(nfi);
	head = &fib_info_hash[hash];

	hlist_for_each_entry(fi, node, head, fib_hash) {
		/* nexthop count must match before anything else */
		if (fi->fib_nhs != nfi->fib_nhs)
			continue;
		if (nfi->fib_protocol == fi->fib_protocol &&
		    nfi->fib_prefsrc == fi->fib_prefsrc &&
		    nfi->fib_priority == fi->fib_priority &&
		    memcmp(nfi->fib_metrics, fi->fib_metrics,
			   sizeof(fi->fib_metrics)) == 0 &&
		    ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 &&
		    (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
			return fi;
	}

	return NULL;
}
236 
237 static inline unsigned int fib_devindex_hashfn(unsigned int val)
238 {
239 	unsigned int mask = DEVINDEX_HASHSIZE - 1;
240 
241 	return (val ^
242 		(val >> DEVINDEX_HASHBITS) ^
243 		(val >> (DEVINDEX_HASHBITS * 2))) & mask;
244 }
245 
246 /* Check, that the gateway is already configured.
247    Used only by redirect accept routine.
248  */
249 
250 int ip_fib_check_default(__be32 gw, struct net_device *dev)
251 {
252 	struct hlist_head *head;
253 	struct hlist_node *node;
254 	struct fib_nh *nh;
255 	unsigned int hash;
256 
257 	spin_lock(&fib_info_lock);
258 
259 	hash = fib_devindex_hashfn(dev->ifindex);
260 	head = &fib_info_devhash[hash];
261 	hlist_for_each_entry(nh, node, head, nh_hash) {
262 		if (nh->nh_dev == dev &&
263 		    nh->nh_gw == gw &&
264 		    !(nh->nh_flags&RTNH_F_DEAD)) {
265 			spin_unlock(&fib_info_lock);
266 			return 0;
267 		}
268 	}
269 
270 	spin_unlock(&fib_info_lock);
271 
272 	return -1;
273 }
274 
275 static inline size_t fib_nlmsg_size(struct fib_info *fi)
276 {
277 	size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg))
278 			 + nla_total_size(4) /* RTA_TABLE */
279 			 + nla_total_size(4) /* RTA_DST */
280 			 + nla_total_size(4) /* RTA_PRIORITY */
281 			 + nla_total_size(4); /* RTA_PREFSRC */
282 
283 	/* space for nested metrics */
284 	payload += nla_total_size((RTAX_MAX * nla_total_size(4)));
285 
286 	if (fi->fib_nhs) {
287 		/* Also handles the special case fib_nhs == 1 */
288 
289 		/* each nexthop is packed in an attribute */
290 		size_t nhsize = nla_total_size(sizeof(struct rtnexthop));
291 
292 		/* may contain flow and gateway attribute */
293 		nhsize += 2 * nla_total_size(4);
294 
295 		/* all nexthops are packed in a nested attribute */
296 		payload += nla_total_size(fi->fib_nhs * nhsize);
297 	}
298 
299 	return payload;
300 }
301 
/* Broadcast an rtnetlink notification (@event) for alias @fa of route
 * @key/@dst_len in table @tb_id.  Errors are not returned to the
 * caller; they are recorded on the RTNLGRP_IPV4_ROUTE group via
 * rtnl_set_sk_err() so listeners see the loss. */
void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
	       int dst_len, u32 tb_id, struct nl_info *info,
	       unsigned int nlm_flags)
{
	struct sk_buff *skb;
	u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
	int err = -ENOBUFS;

	skb = nlmsg_new(fib_nlmsg_size(fa->fa_info), GFP_KERNEL);
	if (skb == NULL)
		goto errout;

	err = fib_dump_info(skb, info->pid, seq, event, tb_id,
			    fa->fa_type, fa->fa_scope, key, dst_len,
			    fa->fa_tos, fa->fa_info, nlm_flags);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in fib_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	err = rtnl_notify(skb, info->pid, RTNLGRP_IPV4_ROUTE,
			  info->nlh, GFP_KERNEL);
errout:
	if (err < 0)
		rtnl_set_sk_err(RTNLGRP_IPV4_ROUTE, err);
}
329 
330 /* Return the first fib alias matching TOS with
331  * priority less than or equal to PRIO.
332  */
333 struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)
334 {
335 	if (fah) {
336 		struct fib_alias *fa;
337 		list_for_each_entry(fa, fah, fa_list) {
338 			if (fa->fa_tos > tos)
339 				continue;
340 			if (fa->fa_info->fib_priority >= prio ||
341 			    fa->fa_tos < tos)
342 				return fa;
343 		}
344 	}
345 	return NULL;
346 }
347 
/* Judge whether default-route candidate @fi (at position @order among
 * the candidates) looks dead, based on the ARP state of its first
 * nexthop's gateway.  Returns 0 when the neighbour looks usable,
 * 1 otherwise; a dead-but-plausible candidate may be remembered in
 * *last_resort/*last_idx as a fallback for the caller. */
int fib_detect_death(struct fib_info *fi, int order,
		     struct fib_info **last_resort, int *last_idx, int *dflt)
{
	struct neighbour *n;
	int state = NUD_NONE;

	n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev);
	if (n) {
		state = n->nud_state;
		neigh_release(n);
	}
	if (state==NUD_REACHABLE)
		return 0;
	/* valid-but-unconfirmed neighbour is accepted unless this entry
	 * is the currently selected default */
	if ((state&NUD_VALID) && order != *dflt)
		return 0;
	if ((state&NUD_VALID) ||
	    (*last_idx<0 && order > *dflt)) {
		*last_resort = fi;
		*last_idx = order;
	}
	return 1;
}
370 
371 #ifdef CONFIG_IP_ROUTE_MULTIPATH
372 
/* Count the rtnexthop records in an RTA_MULTIPATH payload of
 * @remaining bytes.  Returns 0 when trailing bytes are left over,
 * which indicates a malformed nexthop list. */
static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining)
{
	int count;

	for (count = 0; rtnh_ok(rtnh, remaining); count++)
		rtnh = rtnh_next(rtnh, &remaining);

	return remaining > 0 ? 0 : count;
}
385 
/* Fill @fi's nexthop array from the RTA_MULTIPATH stream of rtnexthop
 * records starting at @rtnh (@remaining bytes).  Returns 0 on success
 * or -EINVAL when the stream holds fewer records than fi->fib_nhs. */
static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
		       int remaining, struct fib_config *cfg)
{
	change_nexthops(fi) {
		int attrlen;

		if (!rtnh_ok(rtnh, remaining))
			return -EINVAL;

		/* low flag byte comes from the per-nexthop record, the
		 * rest from the route-wide config */
		nh->nh_flags = (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
		nh->nh_oif = rtnh->rtnh_ifindex;
		nh->nh_weight = rtnh->rtnh_hops + 1;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			/* per-nexthop attributes follow the rtnexthop */
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			nh->nh_gw = nla ? nla_get_be32(nla) : 0;
#ifdef CONFIG_NET_CLS_ROUTE
			nla = nla_find(attrs, attrlen, RTA_FLOW);
			nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
#endif
		}

		rtnh = rtnh_next(rtnh, &remaining);
	} endfor_nexthops(fi);

	return 0;
}
416 
417 #endif
418 
419 int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
420 {
421 #ifdef CONFIG_IP_ROUTE_MULTIPATH
422 	struct rtnexthop *rtnh;
423 	int remaining;
424 #endif
425 
426 	if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority)
427 		return 1;
428 
429 	if (cfg->fc_oif || cfg->fc_gw) {
430 		if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) &&
431 		    (!cfg->fc_gw  || cfg->fc_gw == fi->fib_nh->nh_gw))
432 			return 0;
433 		return 1;
434 	}
435 
436 #ifdef CONFIG_IP_ROUTE_MULTIPATH
437 	if (cfg->fc_mp == NULL)
438 		return 0;
439 
440 	rtnh = cfg->fc_mp;
441 	remaining = cfg->fc_mp_len;
442 
443 	for_nexthops(fi) {
444 		int attrlen;
445 
446 		if (!rtnh_ok(rtnh, remaining))
447 			return -EINVAL;
448 
449 		if (rtnh->rtnh_ifindex && rtnh->rtnh_ifindex != nh->nh_oif)
450 			return 1;
451 
452 		attrlen = rtnh_attrlen(rtnh);
453 		if (attrlen < 0) {
454 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
455 
456 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
457 			if (nla && nla_get_be32(nla) != nh->nh_gw)
458 				return 1;
459 #ifdef CONFIG_NET_CLS_ROUTE
460 			nla = nla_find(attrs, attrlen, RTA_FLOW);
461 			if (nla && nla_get_u32(nla) != nh->nh_tclassid)
462 				return 1;
463 #endif
464 		}
465 
466 		rtnh = rtnh_next(rtnh, &remaining);
467 	} endfor_nexthops(fi);
468 #endif
469 	return 0;
470 }
471 
472 
473 /*
474    Picture
475    -------
476 
477    Semantics of nexthop is very messy by historical reasons.
478    We have to take into account, that:
479    a) gateway can be actually local interface address,
480       so that gatewayed route is direct.
481    b) gateway must be on-link address, possibly
482       described not by an ifaddr, but also by a direct route.
483    c) If both gateway and interface are specified, they should not
484       contradict.
485    d) If we use tunnel routes, gateway could be not on-link.
486 
487    Attempt to reconcile all of these (alas, self-contradictory) conditions
488    results in pretty ugly and hairy code with obscure logic.
489 
490    I chose to generalized it instead, so that the size
491    of code does not increase practically, but it becomes
492    much more general.
493    Every prefix is assigned a "scope" value: "host" is local address,
494    "link" is direct route,
495    [ ... "site" ... "interior" ... ]
496    and "universe" is true gateway route with global meaning.
497 
498    Every prefix refers to a set of "nexthop"s (gw, oif),
499    where gw must have narrower scope. This recursion stops
500    when gw has LOCAL scope or if "nexthop" is declared ONLINK,
501    which means that gw is forced to be on link.
502 
503    Code is still hairy, but now it is apparently logically
504    consistent and very flexible. F.e. as by-product it allows
   to co-exist in peace independent exterior and interior
506    routing processes.
507 
508    Normally it looks as following.
509 
510    {universe prefix}  -> (gw, oif) [scope link]
511 			  |
512 			  |-> {link prefix} -> (gw, oif) [scope local]
513 						|
514 						|-> {local prefix} (terminal node)
515  */
516 
/* Resolve and validate one nexthop of a route being created (see the
 * "Picture" comment above for the scope rules being enforced).
 *
 * On success nh->nh_dev (with a device reference held), nh->nh_scope
 * and possibly nh->nh_oif are filled in.  Returns 0 or -errno. */
static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
			struct fib_nh *nh)
{
	int err;

	if (nh->nh_gw) {
		struct fib_result res;

#ifdef CONFIG_IP_ROUTE_PERVASIVE
		/* pervasive nexthops skip all gateway validation */
		if (nh->nh_flags&RTNH_F_PERVASIVE)
			return 0;
#endif
		if (nh->nh_flags&RTNH_F_ONLINK) {
			/* Gateway forced on-link: need a unicast gateway
			 * address, an explicitly named UP device, and a
			 * route scope narrower than link. */
			struct net_device *dev;

			if (cfg->fc_scope >= RT_SCOPE_LINK)
				return -EINVAL;
			if (inet_addr_type(nh->nh_gw) != RTN_UNICAST)
				return -EINVAL;
			if ((dev = __dev_get_by_index(&init_net, nh->nh_oif)) == NULL)
				return -ENODEV;
			if (!(dev->flags&IFF_UP))
				return -ENETDOWN;
			nh->nh_dev = dev;
			dev_hold(dev);
			nh->nh_scope = RT_SCOPE_LINK;
			return 0;
		}
		{
			/* Otherwise resolve the gateway through the FIB;
			 * it must resolve with a scope strictly narrower
			 * than this route's own scope. */
			struct flowi fl = {
				.nl_u = {
					.ip4_u = {
						.daddr = nh->nh_gw,
						.scope = cfg->fc_scope + 1,
					},
				},
				.oif = nh->nh_oif,
			};

			/* It is not necessary, but requires a bit of thinking */
			if (fl.fl4_scope < RT_SCOPE_LINK)
				fl.fl4_scope = RT_SCOPE_LINK;
			if ((err = fib_lookup(&fl, &res)) != 0)
				return err;
		}
		err = -EINVAL;
		if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
			goto out;
		/* inherit device/scope from the resolved gateway route */
		nh->nh_scope = res.scope;
		nh->nh_oif = FIB_RES_OIF(res);
		if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL)
			goto out;
		dev_hold(nh->nh_dev);
		err = -ENETDOWN;
		if (!(nh->nh_dev->flags & IFF_UP))
			goto out;
		err = 0;
out:
		fib_res_put(&res);
		return err;
	} else {
		/* No gateway: direct route.  The output interface just
		 * has to exist, carry IPv4 and be UP. */
		struct in_device *in_dev;

		if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK))
			return -EINVAL;

		in_dev = inetdev_by_index(nh->nh_oif);
		if (in_dev == NULL)
			return -ENODEV;
		if (!(in_dev->dev->flags&IFF_UP)) {
			in_dev_put(in_dev);
			return -ENETDOWN;
		}
		nh->nh_dev = in_dev->dev;
		dev_hold(nh->nh_dev);
		nh->nh_scope = RT_SCOPE_HOST;
		in_dev_put(in_dev);
	}
	return 0;
}
597 
598 static inline unsigned int fib_laddr_hashfn(__be32 val)
599 {
600 	unsigned int mask = (fib_hash_size - 1);
601 
602 	return ((__force u32)val ^ ((__force u32)val >> 7) ^ ((__force u32)val >> 14)) & mask;
603 }
604 
605 static struct hlist_head *fib_hash_alloc(int bytes)
606 {
607 	if (bytes <= PAGE_SIZE)
608 		return kmalloc(bytes, GFP_KERNEL);
609 	else
610 		return (struct hlist_head *)
611 			__get_free_pages(GFP_KERNEL, get_order(bytes));
612 }
613 
614 static void fib_hash_free(struct hlist_head *hash, int bytes)
615 {
616 	if (!hash)
617 		return;
618 
619 	if (bytes <= PAGE_SIZE)
620 		kfree(hash);
621 	else
622 		free_pages((unsigned long) hash, get_order(bytes));
623 }
624 
/* Re-hash every fib_info from the current tables into the freshly
 * allocated (and zeroed) @new_info_hash/@new_laddrhash of @new_size
 * buckets, then free the old tables.  The swap runs under
 * fib_info_lock so lookups never see a half-moved table. */
static void fib_hash_move(struct hlist_head *new_info_hash,
			  struct hlist_head *new_laddrhash,
			  unsigned int new_size)
{
	struct hlist_head *old_info_hash, *old_laddrhash;
	unsigned int old_size = fib_hash_size;
	unsigned int i, bytes;

	spin_lock_bh(&fib_info_lock);
	old_info_hash = fib_info_hash;
	old_laddrhash = fib_info_laddrhash;
	/* must be set before re-hashing: fib_info_hashfn() and
	 * fib_laddr_hashfn() below mask with the new size */
	fib_hash_size = new_size;

	for (i = 0; i < old_size; i++) {
		struct hlist_head *head = &fib_info_hash[i];
		struct hlist_node *node, *n;
		struct fib_info *fi;

		hlist_for_each_entry_safe(fi, node, n, head, fib_hash) {
			struct hlist_head *dest;
			unsigned int new_hash;

			hlist_del(&fi->fib_hash);

			new_hash = fib_info_hashfn(fi);
			dest = &new_info_hash[new_hash];
			hlist_add_head(&fi->fib_hash, dest);
		}
	}
	fib_info_hash = new_info_hash;

	for (i = 0; i < old_size; i++) {
		struct hlist_head *lhead = &fib_info_laddrhash[i];
		struct hlist_node *node, *n;
		struct fib_info *fi;

		hlist_for_each_entry_safe(fi, node, n, lhead, fib_lhash) {
			struct hlist_head *ldest;
			unsigned int new_hash;

			hlist_del(&fi->fib_lhash);

			new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
			ldest = &new_laddrhash[new_hash];
			hlist_add_head(&fi->fib_lhash, ldest);
		}
	}
	fib_info_laddrhash = new_laddrhash;

	spin_unlock_bh(&fib_info_lock);

	/* same size formula as the allocation in fib_create_info() */
	bytes = old_size * sizeof(struct hlist_head *);
	fib_hash_free(old_info_hash, bytes);
	fib_hash_free(old_laddrhash, bytes);
}
680 
/* Build a fib_info from a parsed route request @cfg.
 *
 * Returns either an existing equivalent fib_info (tree refcount
 * bumped, so routes with identical semantics share one record), a new
 * one hashed into all the tables, or ERR_PTR(-errno). */
struct fib_info *fib_create_info(struct fib_config *cfg)
{
	int err;
	struct fib_info *fi = NULL;
	struct fib_info *ofi;
	int nhs = 1;

	/* Fast check to catch the most weird cases */
	if (fib_props[cfg->fc_type].scope > cfg->fc_scope)
		goto err_inval;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (cfg->fc_mp) {
		nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len);
		if (nhs == 0)
			goto err_inval;
	}
#endif

	err = -ENOBUFS;
	/* grow (or create) the info/laddr hash tables when the number
	 * of fib_infos reaches the bucket count */
	if (fib_info_cnt >= fib_hash_size) {
		unsigned int new_size = fib_hash_size << 1;
		struct hlist_head *new_info_hash;
		struct hlist_head *new_laddrhash;
		unsigned int bytes;

		if (!new_size)
			new_size = 1;
		bytes = new_size * sizeof(struct hlist_head *);
		new_info_hash = fib_hash_alloc(bytes);
		new_laddrhash = fib_hash_alloc(bytes);
		if (!new_info_hash || !new_laddrhash) {
			/* growth failed; keep using the old tables */
			fib_hash_free(new_info_hash, bytes);
			fib_hash_free(new_laddrhash, bytes);
		} else {
			memset(new_info_hash, 0, bytes);
			memset(new_laddrhash, 0, bytes);

			fib_hash_move(new_info_hash, new_laddrhash, new_size);
		}

		/* still zero means even the very first table allocation
		 * failed -> nowhere to hash anything */
		if (!fib_hash_size)
			goto failure;
	}

	/* the fib_nh array lives inline right after the fib_info */
	fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
	if (fi == NULL)
		goto failure;
	fib_info_cnt++;

	fi->fib_protocol = cfg->fc_protocol;
	fi->fib_flags = cfg->fc_flags;
	fi->fib_priority = cfg->fc_priority;
	fi->fib_prefsrc = cfg->fc_prefsrc;

	fi->fib_nhs = nhs;
	change_nexthops(fi) {
		nh->nh_parent = fi;
	} endfor_nexthops(fi)

	/* copy route metrics (RTA_METRICS payload) into fib_metrics;
	 * attribute type N maps to fib_metrics[N - 1] */
	if (cfg->fc_mx) {
		struct nlattr *nla;
		int remaining;

		nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
			int type = nla_type(nla);

			if (type) {
				if (type > RTAX_MAX)
					goto err_inval;
				fi->fib_metrics[type - 1] = nla_get_u32(nla);
			}
		}
	}

	if (cfg->fc_mp) {
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg);
		if (err != 0)
			goto failure;
		/* top-level oif/gw/flow, when given, must agree with the
		 * first nexthop of the multipath spec */
		if (cfg->fc_oif && fi->fib_nh->nh_oif != cfg->fc_oif)
			goto err_inval;
		if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw)
			goto err_inval;
#ifdef CONFIG_NET_CLS_ROUTE
		if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow)
			goto err_inval;
#endif
#else
		/* multipath spec given but kernel built without it */
		goto err_inval;
#endif
	} else {
		/* single nexthop comes straight from the request */
		struct fib_nh *nh = fi->fib_nh;

		nh->nh_oif = cfg->fc_oif;
		nh->nh_gw = cfg->fc_gw;
		nh->nh_flags = cfg->fc_flags;
#ifdef CONFIG_NET_CLS_ROUTE
		nh->nh_tclassid = cfg->fc_flow;
#endif
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		nh->nh_weight = 1;
#endif
	}

	/* error-type routes (blackhole, unreachable, ...) never have a
	 * gateway or device; skip nexthop validation entirely */
	if (fib_props[cfg->fc_type].error) {
		if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp)
			goto err_inval;
		goto link_it;
	}

	if (cfg->fc_scope > RT_SCOPE_HOST)
		goto err_inval;

	if (cfg->fc_scope == RT_SCOPE_HOST) {
		struct fib_nh *nh = fi->fib_nh;

		/* Local address is added. */
		if (nhs != 1 || nh->nh_gw)
			goto err_inval;
		nh->nh_scope = RT_SCOPE_NOWHERE;
		nh->nh_dev = dev_get_by_index(&init_net, fi->fib_nh->nh_oif);
		err = -ENODEV;
		if (nh->nh_dev == NULL)
			goto failure;
	} else {
		/* resolve and validate every nexthop */
		change_nexthops(fi) {
			if ((err = fib_check_nh(cfg, fi, nh)) != 0)
				goto failure;
		} endfor_nexthops(fi)
	}

	/* preferred source must be one of our local addresses, except
	 * when it is exactly the destination of a local route */
	if (fi->fib_prefsrc) {
		if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst ||
		    fi->fib_prefsrc != cfg->fc_dst)
			if (inet_addr_type(fi->fib_prefsrc) != RTN_LOCAL)
				goto err_inval;
	}

link_it:
	/* reuse an equivalent existing record when there is one */
	if ((ofi = fib_find_info(fi)) != NULL) {
		fi->fib_dead = 1;
		free_fib_info(fi);
		ofi->fib_treeref++;
		return ofi;
	}

	fi->fib_treeref++;
	atomic_inc(&fi->fib_clntref);
	spin_lock_bh(&fib_info_lock);
	hlist_add_head(&fi->fib_hash,
		       &fib_info_hash[fib_info_hashfn(fi)]);
	if (fi->fib_prefsrc) {
		struct hlist_head *head;

		head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
		hlist_add_head(&fi->fib_lhash, head);
	}
	change_nexthops(fi) {
		struct hlist_head *head;
		unsigned int hash;

		/* device-less nexthops are not tracked in devhash */
		if (!nh->nh_dev)
			continue;
		hash = fib_devindex_hashfn(nh->nh_dev->ifindex);
		head = &fib_info_devhash[hash];
		hlist_add_head(&nh->nh_hash, head);
	} endfor_nexthops(fi)
	spin_unlock_bh(&fib_info_lock);
	return fi;

err_inval:
	err = -EINVAL;

failure:
	if (fi) {
		fi->fib_dead = 1;
		free_fib_info(fi);
	}

	return ERR_PTR(err);
}
863 
/* Note! fib_semantic_match intentionally uses  RCU list functions. */
/* Check the alias list of one FIB node against flow @flp and, on a
 * usable match, fill in @res (taking a clntref on the fib_info).
 *
 * Returns 0 on success, 1 when nothing matched, a negative fib_props
 * error (e.g. -EHOSTUNREACH for RTN_UNREACHABLE) for error-type
 * aliases, or -EINVAL on an impossible alias type. */
int fib_semantic_match(struct list_head *head, const struct flowi *flp,
		       struct fib_result *res, __be32 zone, __be32 mask,
			int prefixlen)
{
	struct fib_alias *fa;
	int nh_sel = 0;

	list_for_each_entry_rcu(fa, head, fa_list) {
		int err;

		/* TOS 0 on the alias acts as a wildcard */
		if (fa->fa_tos &&
		    fa->fa_tos != flp->fl4_tos)
			continue;

		if (fa->fa_scope < flp->fl4_scope)
			continue;

		fa->fa_state |= FA_S_ACCESSED;

		err = fib_props[fa->fa_type].error;
		if (err == 0) {
			struct fib_info *fi = fa->fa_info;

			if (fi->fib_flags & RTNH_F_DEAD)
				continue;

			switch (fa->fa_type) {
			case RTN_UNICAST:
			case RTN_LOCAL:
			case RTN_BROADCAST:
			case RTN_ANYCAST:
			case RTN_MULTICAST:
				/* pick the first live nexthop compatible
				 * with the flow's output interface */
				for_nexthops(fi) {
					if (nh->nh_flags&RTNH_F_DEAD)
						continue;
					if (!flp->oif || flp->oif == nh->nh_oif)
						break;
				}
				/* nhsel is still in scope here: it is
				 * declared by the for_nexthops() macro
				 * and closed by endfor_nexthops() */
#ifdef CONFIG_IP_ROUTE_MULTIPATH
				if (nhsel < fi->fib_nhs) {
					nh_sel = nhsel;
					goto out_fill_res;
				}
#else
				if (nhsel < 1) {
					goto out_fill_res;
				}
#endif
				endfor_nexthops(fi);
				continue;

			default:
				printk(KERN_DEBUG "impossible 102\n");
				return -EINVAL;
			}
		}
		return err;
	}
	return 1;

out_fill_res:
	res->prefixlen = prefixlen;
	res->nh_sel = nh_sel;
	res->type = fa->fa_type;
	res->scope = fa->fa_scope;
	res->fi = fa->fa_info;
	/* caller owns this reference; released via fib_res_put() */
	atomic_inc(&res->fi->fib_clntref);
	return 0;
}
934 
935 /* Find appropriate source address to this destination */
936 
937 __be32 __fib_res_prefsrc(struct fib_result *res)
938 {
939 	return inet_select_addr(FIB_RES_DEV(*res), FIB_RES_GW(*res), res->scope);
940 }
941 
/* Fill one RTM route message describing @fi into @skb.
 *
 * Returns the length from nlmsg_end() on success or -EMSGSIZE when the
 * skb runs out of tailroom (the NLA_PUT* macros below jump to
 * nla_put_failure in that case and the partial message is cancelled). */
int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
		  u32 tb_id, u8 type, u8 scope, __be32 dst, int dst_len, u8 tos,
		  struct fib_info *fi, unsigned int flags)
{
	struct nlmsghdr *nlh;
	struct rtmsg *rtm;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*rtm), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET;
	rtm->rtm_dst_len = dst_len;
	rtm->rtm_src_len = 0;
	rtm->rtm_tos = tos;
	/* table id goes both in the legacy header byte and in the
	 * full-width RTA_TABLE attribute */
	rtm->rtm_table = tb_id;
	NLA_PUT_U32(skb, RTA_TABLE, tb_id);
	rtm->rtm_type = type;
	rtm->rtm_flags = fi->fib_flags;
	rtm->rtm_scope = scope;
	rtm->rtm_protocol = fi->fib_protocol;

	if (rtm->rtm_dst_len)
		NLA_PUT_BE32(skb, RTA_DST, dst);

	if (fi->fib_priority)
		NLA_PUT_U32(skb, RTA_PRIORITY, fi->fib_priority);

	if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
		goto nla_put_failure;

	if (fi->fib_prefsrc)
		NLA_PUT_BE32(skb, RTA_PREFSRC, fi->fib_prefsrc);

	/* single nexthop: attributes at top level ... */
	if (fi->fib_nhs == 1) {
		if (fi->fib_nh->nh_gw)
			NLA_PUT_BE32(skb, RTA_GATEWAY, fi->fib_nh->nh_gw);

		if (fi->fib_nh->nh_oif)
			NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif);
#ifdef CONFIG_NET_CLS_ROUTE
		if (fi->fib_nh[0].nh_tclassid)
			NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid);
#endif
	}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	/* ... multiple nexthops: nested RTA_MULTIPATH of rtnexthops */
	if (fi->fib_nhs > 1) {
		struct rtnexthop *rtnh;
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (mp == NULL)
			goto nla_put_failure;

		for_nexthops(fi) {
			rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
			if (rtnh == NULL)
				goto nla_put_failure;

			rtnh->rtnh_flags = nh->nh_flags & 0xFF;
			rtnh->rtnh_hops = nh->nh_weight - 1;
			rtnh->rtnh_ifindex = nh->nh_oif;

			if (nh->nh_gw)
				NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw);
#ifdef CONFIG_NET_CLS_ROUTE
			if (nh->nh_tclassid)
				NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid);
#endif
			/* length of rtnetlink header + attributes */
			rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
		} endfor_nexthops(fi);

		nla_nest_end(skb, mp);
	}
#endif
	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
1025 
1026 /*
1027    Update FIB if:
1028    - local address disappeared -> we must delete all the entries
1029      referring to it.
1030    - device went down -> we must shutdown all nexthops going via it.
1031  */
1032 
/* Mark routes dead after a local address or device went away.
 *
 * @local: kill fib_infos using it as preferred source, if set.
 * @dev:   kill nexthops through that device; with @force every nexthop
 *         on the device dies, otherwise only those whose scope differs
 *         from RT_SCOPE_NOWHERE; @force > 1 kills the whole fib_info
 *         as soon as one of its nexthops uses the device.
 * Returns the number of fib_infos newly marked RTNH_F_DEAD. */
int fib_sync_down(__be32 local, struct net_device *dev, int force)
{
	int ret = 0;
	int scope = RT_SCOPE_NOWHERE;

	if (force)
		scope = -1;

	if (local && fib_info_laddrhash) {
		unsigned int hash = fib_laddr_hashfn(local);
		struct hlist_head *head = &fib_info_laddrhash[hash];
		struct hlist_node *node;
		struct fib_info *fi;

		hlist_for_each_entry(fi, node, head, fib_lhash) {
			if (fi->fib_prefsrc == local) {
				fi->fib_flags |= RTNH_F_DEAD;
				ret++;
			}
		}
	}

	if (dev) {
		struct fib_info *prev_fi = NULL;
		unsigned int hash = fib_devindex_hashfn(dev->ifindex);
		struct hlist_head *head = &fib_info_devhash[hash];
		struct hlist_node *node;
		struct fib_nh *nh;

		hlist_for_each_entry(nh, node, head, nh_hash) {
			struct fib_info *fi = nh->nh_parent;
			int dead;

			BUG_ON(!fi->fib_nhs);
			if (nh->nh_dev != dev || fi == prev_fi)
				continue;
			/* a fib_info appears once per nexthop in this
			 * chain; prev_fi avoids reprocessing it */
			prev_fi = fi;
			dead = 0;
			/* NB: change_nexthops() opens a new scope whose
			 * 'nh' shadows the hlist cursor above */
			change_nexthops(fi) {
				if (nh->nh_flags&RTNH_F_DEAD)
					dead++;
				else if (nh->nh_dev == dev &&
					 nh->nh_scope != scope) {
					nh->nh_flags |= RTNH_F_DEAD;
#ifdef CONFIG_IP_ROUTE_MULTIPATH
					/* withdraw its round-robin tokens */
					spin_lock_bh(&fib_multipath_lock);
					fi->fib_power -= nh->nh_power;
					nh->nh_power = 0;
					spin_unlock_bh(&fib_multipath_lock);
#endif
					dead++;
				}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
				if (force > 1 && nh->nh_dev == dev) {
					dead = fi->fib_nhs;
					break;
				}
#endif
			} endfor_nexthops(fi)
			/* all nexthops gone -> whole fib_info is dead */
			if (dead == fi->fib_nhs) {
				fi->fib_flags |= RTNH_F_DEAD;
				ret++;
			}
		}
	}

	return ret;
}
1101 
1102 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1103 
1104 /*
1105    Dead device goes up. We wake up dead nexthops.
   It makes sense only on multipath routes.
1107  */
1108 
/* Revive dead nexthops that go through @dev after it came back up.
 * Returns the number of fib_infos whose RTNH_F_DEAD flag was cleared. */
int fib_sync_up(struct net_device *dev)
{
	struct fib_info *prev_fi;
	unsigned int hash;
	struct hlist_head *head;
	struct hlist_node *node;
	struct fib_nh *nh;
	int ret;

	if (!(dev->flags&IFF_UP))
		return 0;

	prev_fi = NULL;
	hash = fib_devindex_hashfn(dev->ifindex);
	head = &fib_info_devhash[hash];
	ret = 0;

	hlist_for_each_entry(nh, node, head, nh_hash) {
		struct fib_info *fi = nh->nh_parent;
		int alive;

		BUG_ON(!fi->fib_nhs);
		/* each fib_info appears once per nexthop in this chain;
		 * prev_fi keeps us from processing it twice in a row */
		if (nh->nh_dev != dev || fi == prev_fi)
			continue;

		prev_fi = fi;
		alive = 0;
		/* NB: change_nexthops() opens a new scope whose 'nh'
		 * shadows the hlist cursor above */
		change_nexthops(fi) {
			if (!(nh->nh_flags&RTNH_F_DEAD)) {
				alive++;
				continue;
			}
			if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP))
				continue;
			/* only revive hops on @dev, and only while it
			 * still has an IPv4 in_device */
			if (nh->nh_dev != dev || !__in_dev_get_rtnl(dev))
				continue;
			alive++;
			spin_lock_bh(&fib_multipath_lock);
			nh->nh_power = 0;
			nh->nh_flags &= ~RTNH_F_DEAD;
			spin_unlock_bh(&fib_multipath_lock);
		} endfor_nexthops(fi)

		if (alive > 0) {
			fi->fib_flags &= ~RTNH_F_DEAD;
			ret++;
		}
	}

	return ret;
}
1160 
1161 /*
1162    The algorithm is suboptimal, but it provides really
1163    fair weighted route distribution.
1164  */
1165 
/* Select one live nexthop of the multipath route in @res and record it
 * in res->nh_sel.
 *
 * Weighted round-robin: each live nexthop is granted nh_weight "power"
 * tokens; a selection consumes one token from the chosen hop, and when
 * the shared pool (fib_power) drains it is refilled.  All token state
 * is protected by fib_multipath_lock. */
void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
{
	struct fib_info *fi = res->fi;
	int w;

	spin_lock_bh(&fib_multipath_lock);
	if (fi->fib_power <= 0) {
		/* refill: every live nexthop gets nh_weight tokens */
		int power = 0;
		change_nexthops(fi) {
			if (!(nh->nh_flags&RTNH_F_DEAD)) {
				power += nh->nh_weight;
				nh->nh_power = nh->nh_weight;
			}
		} endfor_nexthops(fi);
		fi->fib_power = power;
		if (power <= 0) {
			spin_unlock_bh(&fib_multipath_lock);
			/* Race condition: route has just become dead. */
			res->nh_sel = 0;
			return;
		}
	}


	/* w should be random number [0..fi->fib_power-1],
	   it is pretty bad approximation.
	 */

	w = jiffies % fi->fib_power;

	/* walk the hops, subtracting each one's tokens from w; the hop
	 * that drives w to zero or below is selected and pays a token */
	change_nexthops(fi) {
		if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) {
			if ((w -= nh->nh_power) <= 0) {
				nh->nh_power--;
				fi->fib_power--;
				res->nh_sel = nhsel;
				spin_unlock_bh(&fib_multipath_lock);
				return;
			}
		}
	} endfor_nexthops(fi);

	/* Race condition: route has just become dead. */
	res->nh_sel = 0;
	spin_unlock_bh(&fib_multipath_lock);
}
1212 #endif
1213