xref: /openbmc/linux/net/ipv4/fib_semantics.c (revision a1e58bbd)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		IPv4 Forwarding Information Base: semantics.
7  *
8  * Version:	$Id: fib_semantics.c,v 1.19 2002/01/12 07:54:56 davem Exp $
9  *
10  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
11  *
12  *		This program is free software; you can redistribute it and/or
13  *		modify it under the terms of the GNU General Public License
14  *		as published by the Free Software Foundation; either version
15  *		2 of the License, or (at your option) any later version.
16  */
17 
18 #include <asm/uaccess.h>
19 #include <asm/system.h>
20 #include <linux/bitops.h>
21 #include <linux/types.h>
22 #include <linux/kernel.h>
23 #include <linux/jiffies.h>
24 #include <linux/mm.h>
25 #include <linux/string.h>
26 #include <linux/socket.h>
27 #include <linux/sockios.h>
28 #include <linux/errno.h>
29 #include <linux/in.h>
30 #include <linux/inet.h>
31 #include <linux/inetdevice.h>
32 #include <linux/netdevice.h>
33 #include <linux/if_arp.h>
34 #include <linux/proc_fs.h>
35 #include <linux/skbuff.h>
36 #include <linux/init.h>
37 
38 #include <net/arp.h>
39 #include <net/ip.h>
40 #include <net/protocol.h>
41 #include <net/route.h>
42 #include <net/tcp.h>
43 #include <net/sock.h>
44 #include <net/ip_fib.h>
45 #include <net/netlink.h>
46 #include <net/nexthop.h>
47 
48 #include "fib_lookup.h"
49 
/* Protects the fib_info hash tables and the nexthop device hash below. */
static DEFINE_SPINLOCK(fib_info_lock);
/* Hash of all fib_info structs, keyed by fib_info_hashfn(). */
static struct hlist_head *fib_info_hash;
/* Hash of fib_infos carrying a preferred source address (fib_prefsrc). */
static struct hlist_head *fib_info_laddrhash;
/* Current bucket count of the two hashes above (power of two, or 0). */
static unsigned int fib_hash_size;
/* Number of live fib_info structs; reaching fib_hash_size triggers growth. */
static unsigned int fib_info_cnt;

#define DEVINDEX_HASHBITS 8
#define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
/* Fixed-size hash of nexthops keyed by their device ifindex. */
static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];

#ifdef CONFIG_IP_ROUTE_MULTIPATH

/* Serializes updates to the per-nexthop power/weight accounting. */
static DEFINE_SPINLOCK(fib_multipath_lock);

/*
 * Iterate over every nexthop of a fib_info.  Each use opens a scope
 * (note the unmatched '{') that MUST be closed with endfor_nexthops().
 * Inside the loop body 'nh' points at the current nexthop and 'nhsel'
 * is its index; for_nexthops() gives a const pointer, change_nexthops()
 * a writable one.
 */
#define for_nexthops(fi) { int nhsel; const struct fib_nh * nh; \
for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++)

#define change_nexthops(fi) { int nhsel; struct fib_nh * nh; \
for (nhsel=0, nh = (struct fib_nh*)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nh++, nhsel++)

#else /* CONFIG_IP_ROUTE_MULTIPATH */

/* Without multipath support there is exactly one nexthop; the dummy
 * single-iteration loop keeps call sites identical and is expected to
 * be optimized away by the compiler.
 */

#define for_nexthops(fi) { int nhsel=0; const struct fib_nh * nh = (fi)->fib_nh; \
for (nhsel=0; nhsel < 1; nhsel++)

#define change_nexthops(fi) { int nhsel=0; struct fib_nh * nh = (struct fib_nh*)((fi)->fib_nh); \
for (nhsel=0; nhsel < 1; nhsel++)

#endif /* CONFIG_IP_ROUTE_MULTIPATH */

/* Closes the scope opened by for_nexthops()/change_nexthops(). */
#define endfor_nexthops(fi) }
83 
84 
/* Per route-type properties, indexed by RTN_*: the error a lookup that
 * hits a route of this type returns (0 means the route is usable and
 * carries real nexthops) and the narrowest scope the route may have.
 */
static const struct
{
	int	error;
	u8	scope;
} fib_props[RTN_MAX + 1] = {
	{
		.error	= 0,
		.scope	= RT_SCOPE_NOWHERE,
	},	/* RTN_UNSPEC */
	{
		.error	= 0,
		.scope	= RT_SCOPE_UNIVERSE,
	},	/* RTN_UNICAST */
	{
		.error	= 0,
		.scope	= RT_SCOPE_HOST,
	},	/* RTN_LOCAL */
	{
		.error	= 0,
		.scope	= RT_SCOPE_LINK,
	},	/* RTN_BROADCAST */
	{
		.error	= 0,
		.scope	= RT_SCOPE_LINK,
	},	/* RTN_ANYCAST */
	{
		.error	= 0,
		.scope	= RT_SCOPE_UNIVERSE,
	},	/* RTN_MULTICAST */
	{
		.error	= -EINVAL,
		.scope	= RT_SCOPE_UNIVERSE,
	},	/* RTN_BLACKHOLE */
	{
		.error	= -EHOSTUNREACH,
		.scope	= RT_SCOPE_UNIVERSE,
	},	/* RTN_UNREACHABLE */
	{
		.error	= -EACCES,
		.scope	= RT_SCOPE_UNIVERSE,
	},	/* RTN_PROHIBIT */
	{
		.error	= -EAGAIN,
		.scope	= RT_SCOPE_UNIVERSE,
	},	/* RTN_THROW */
	{
		.error	= -EINVAL,
		.scope	= RT_SCOPE_NOWHERE,
	},	/* RTN_NAT */
	{
		.error	= -EINVAL,
		.scope	= RT_SCOPE_NOWHERE,
	},	/* RTN_XRESOLVE */
};
139 
140 
/* Release a nexthop info record.
 *
 * Must only be called once fib_dead has been set (i.e. the fib_info is
 * already unlinked from every hash); drops the device reference each
 * nexthop holds and frees the structure.
 */
void free_fib_info(struct fib_info *fi)
{
	if (fi->fib_dead == 0) {
		printk(KERN_WARNING "Freeing alive fib_info %p\n", fi);
		return;
	}
	change_nexthops(fi) {
		/* reference taken via dev_hold() when the nexthop was set up */
		if (nh->nh_dev)
			dev_put(nh->nh_dev);
		nh->nh_dev = NULL;
	} endfor_nexthops(fi);
	fib_info_cnt--;
	kfree(fi);
}
157 
/* Drop one tree reference on @fi.  On the last one, unlink it from the
 * info, prefsrc and per-device hashes under fib_info_lock, mark it dead
 * and drop the clntref taken at creation time (normally freeing it).
 */
void fib_release_info(struct fib_info *fi)
{
	spin_lock_bh(&fib_info_lock);
	if (fi && --fi->fib_treeref == 0) {
		hlist_del(&fi->fib_hash);
		if (fi->fib_prefsrc)
			hlist_del(&fi->fib_lhash);
		change_nexthops(fi) {
			/* nexthops without a device were never hashed */
			if (!nh->nh_dev)
				continue;
			hlist_del(&nh->nh_hash);
		} endfor_nexthops(fi)
		fi->fib_dead = 1;
		fib_info_put(fi);
	}
	spin_unlock_bh(&fib_info_lock);
}
175 
/* Compare the nexthop lists of @fi and @ofi (the caller has already
 * checked they have the same fib_nhs).  Returns 0 when equivalent,
 * -1 otherwise; RTNH_F_DEAD is ignored as it only reflects transient
 * device state.
 */
static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
{
	const struct fib_nh *onh = ofi->fib_nh;

	for_nexthops(fi) {
		if (nh->nh_oif != onh->nh_oif ||
		    nh->nh_gw  != onh->nh_gw ||
		    nh->nh_scope != onh->nh_scope ||
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		    nh->nh_weight != onh->nh_weight ||
#endif
#ifdef CONFIG_NET_CLS_ROUTE
		    nh->nh_tclassid != onh->nh_tclassid ||
#endif
		    ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD))
			return -1;
		onh++;
	} endfor_nexthops(fi);
	return 0;
}
196 
197 static inline unsigned int fib_devindex_hashfn(unsigned int val)
198 {
199 	unsigned int mask = DEVINDEX_HASHSIZE - 1;
200 
201 	return (val ^
202 		(val >> DEVINDEX_HASHBITS) ^
203 		(val >> (DEVINDEX_HASHBITS * 2))) & mask;
204 }
205 
/* Hash a fib_info by protocol, preferred source, priority, nexthop
 * count and the output interface of every nexthop, masked to the
 * current fib_info_hash size.
 */
static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
{
	unsigned int mask = (fib_hash_size - 1);
	unsigned int val = fi->fib_nhs;

	val ^= fi->fib_protocol;
	val ^= (__force u32)fi->fib_prefsrc;
	val ^= fi->fib_priority;
	for_nexthops(fi) {
		val ^= fib_devindex_hashfn(nh->nh_oif);
	} endfor_nexthops(fi)

	return (val ^ (val >> 7) ^ (val >> 12)) & mask;
}
220 
/* Look for an existing fib_info equivalent to @nfi so several routes
 * can share one record.  Returns the match or NULL.  The caller must
 * keep the hash table stable (e.g. hold fib_info_lock or RTNL).
 */
static struct fib_info *fib_find_info(const struct fib_info *nfi)
{
	struct hlist_head *head;
	struct hlist_node *node;
	struct fib_info *fi;
	unsigned int hash;

	hash = fib_info_hashfn(nfi);
	head = &fib_info_hash[hash];

	hlist_for_each_entry(fi, node, head, fib_hash) {
		if (fi->fib_net != nfi->fib_net)
			continue;
		if (fi->fib_nhs != nfi->fib_nhs)
			continue;
		/* RTNH_F_DEAD is masked out: device state must not
		 * prevent sharing an otherwise identical record
		 */
		if (nfi->fib_protocol == fi->fib_protocol &&
		    nfi->fib_prefsrc == fi->fib_prefsrc &&
		    nfi->fib_priority == fi->fib_priority &&
		    memcmp(nfi->fib_metrics, fi->fib_metrics,
			   sizeof(fi->fib_metrics)) == 0 &&
		    ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 &&
		    (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
			return fi;
	}

	return NULL;
}
248 
/* Check that the gateway is already configured.
 * Used only by the redirect acceptance routine.
 */
252 
253 int ip_fib_check_default(__be32 gw, struct net_device *dev)
254 {
255 	struct hlist_head *head;
256 	struct hlist_node *node;
257 	struct fib_nh *nh;
258 	unsigned int hash;
259 
260 	spin_lock(&fib_info_lock);
261 
262 	hash = fib_devindex_hashfn(dev->ifindex);
263 	head = &fib_info_devhash[hash];
264 	hlist_for_each_entry(nh, node, head, nh_hash) {
265 		if (nh->nh_dev == dev &&
266 		    nh->nh_gw == gw &&
267 		    !(nh->nh_flags&RTNH_F_DEAD)) {
268 			spin_unlock(&fib_info_lock);
269 			return 0;
270 		}
271 	}
272 
273 	spin_unlock(&fib_info_lock);
274 
275 	return -1;
276 }
277 
/* Upper bound on the netlink message size needed to dump a route built
 * from @fi; used to size notification skbs in rtmsg_fib().
 */
static inline size_t fib_nlmsg_size(struct fib_info *fi)
{
	size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg))
			 + nla_total_size(4) /* RTA_TABLE */
			 + nla_total_size(4) /* RTA_DST */
			 + nla_total_size(4) /* RTA_PRIORITY */
			 + nla_total_size(4); /* RTA_PREFSRC */

	/* space for nested metrics */
	payload += nla_total_size((RTAX_MAX * nla_total_size(4)));

	if (fi->fib_nhs) {
		/* Also handles the special case fib_nhs == 1 */

		/* each nexthop is packed in an attribute */
		size_t nhsize = nla_total_size(sizeof(struct rtnexthop));

		/* may contain flow and gateway attribute */
		nhsize += 2 * nla_total_size(4);

		/* all nexthops are packed in a nested attribute */
		payload += nla_total_size(fi->fib_nhs * nhsize);
	}

	return payload;
}
304 
/* Notify RTNLGRP_IPV4_ROUTE listeners about @event (add/del) of the
 * route @fa for prefix @key/@dst_len in table @tb_id.  On failure the
 * error is parked on the rtnl socket via rtnl_set_sk_err() so that
 * listeners notice a lost notification.
 */
void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
	       int dst_len, u32 tb_id, struct nl_info *info,
	       unsigned int nlm_flags)
{
	struct sk_buff *skb;
	u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
	int err = -ENOBUFS;

	skb = nlmsg_new(fib_nlmsg_size(fa->fa_info), GFP_KERNEL);
	if (skb == NULL)
		goto errout;

	err = fib_dump_info(skb, info->pid, seq, event, tb_id,
			    fa->fa_type, fa->fa_scope, key, dst_len,
			    fa->fa_tos, fa->fa_info, nlm_flags);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in fib_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	err = rtnl_notify(skb, info->nl_net, info->pid, RTNLGRP_IPV4_ROUTE,
			  info->nlh, GFP_KERNEL);
errout:
	if (err < 0)
		rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err);
}
332 
333 /* Return the first fib alias matching TOS with
334  * priority less than or equal to PRIO.
335  */
336 struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)
337 {
338 	if (fah) {
339 		struct fib_alias *fa;
340 		list_for_each_entry(fa, fah, fa_list) {
341 			if (fa->fa_tos > tos)
342 				continue;
343 			if (fa->fa_info->fib_priority >= prio ||
344 			    fa->fa_tos < tos)
345 				return fa;
346 		}
347 	}
348 	return NULL;
349 }
350 
/* Heuristically decide whether default route @fi (position @order in
 * the default list, @dflt being the currently preferred position)
 * looks dead, judged by the ARP state of its first gateway.
 * Returns 0 when the route looks usable, 1 otherwise; a route with a
 * valid-but-unconfirmed neighbour may be remembered via *last_resort /
 * *last_idx as a fallback for the caller.
 */
int fib_detect_death(struct fib_info *fi, int order,
		     struct fib_info **last_resort, int *last_idx, int dflt)
{
	struct neighbour *n;
	int state = NUD_NONE;

	n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev);
	if (n) {
		state = n->nud_state;
		neigh_release(n);
	}
	if (state==NUD_REACHABLE)
		return 0;
	if ((state&NUD_VALID) && order != dflt)
		return 0;
	if ((state&NUD_VALID) ||
	    (*last_idx<0 && order > dflt)) {
		*last_resort = fi;
		*last_idx = order;
	}
	return 1;
}
373 
374 #ifdef CONFIG_IP_ROUTE_MULTIPATH
375 
/* Count the rtnexthop entries in a @remaining-byte RTA_MULTIPATH
 * payload starting at @rtnh.  Returns 0 for a malformed list.
 */
static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining)
{
	int count;

	for (count = 0; rtnh_ok(rtnh, remaining); count++)
		rtnh = rtnh_next(rtnh, &remaining);

	/* trailing bytes mean an invalid nexthop configuration: reject */
	if (remaining > 0)
		return 0;

	return count;
}
388 
/* Parse the RTA_MULTIPATH nexthop list at @rtnh (@remaining bytes) into
 * the nexthop array of @fi.  fi->fib_nhs must already hold the count
 * returned by fib_count_nexthops().  Returns 0 or -EINVAL.
 */
static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
		       int remaining, struct fib_config *cfg)
{
	change_nexthops(fi) {
		int attrlen;

		if (!rtnh_ok(rtnh, remaining))
			return -EINVAL;

		/* low flag byte comes from the rtnexthop entry itself */
		nh->nh_flags = (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
		nh->nh_oif = rtnh->rtnh_ifindex;
		nh->nh_weight = rtnh->rtnh_hops + 1;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			nh->nh_gw = nla ? nla_get_be32(nla) : 0;
#ifdef CONFIG_NET_CLS_ROUTE
			nla = nla_find(attrs, attrlen, RTA_FLOW);
			nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
#endif
		}

		rtnh = rtnh_next(rtnh, &remaining);
	} endfor_nexthops(fi);

	return 0;
}
419 
420 #endif
421 
422 int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
423 {
424 #ifdef CONFIG_IP_ROUTE_MULTIPATH
425 	struct rtnexthop *rtnh;
426 	int remaining;
427 #endif
428 
429 	if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority)
430 		return 1;
431 
432 	if (cfg->fc_oif || cfg->fc_gw) {
433 		if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) &&
434 		    (!cfg->fc_gw  || cfg->fc_gw == fi->fib_nh->nh_gw))
435 			return 0;
436 		return 1;
437 	}
438 
439 #ifdef CONFIG_IP_ROUTE_MULTIPATH
440 	if (cfg->fc_mp == NULL)
441 		return 0;
442 
443 	rtnh = cfg->fc_mp;
444 	remaining = cfg->fc_mp_len;
445 
446 	for_nexthops(fi) {
447 		int attrlen;
448 
449 		if (!rtnh_ok(rtnh, remaining))
450 			return -EINVAL;
451 
452 		if (rtnh->rtnh_ifindex && rtnh->rtnh_ifindex != nh->nh_oif)
453 			return 1;
454 
455 		attrlen = rtnh_attrlen(rtnh);
456 		if (attrlen < 0) {
457 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
458 
459 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
460 			if (nla && nla_get_be32(nla) != nh->nh_gw)
461 				return 1;
462 #ifdef CONFIG_NET_CLS_ROUTE
463 			nla = nla_find(attrs, attrlen, RTA_FLOW);
464 			if (nla && nla_get_u32(nla) != nh->nh_tclassid)
465 				return 1;
466 #endif
467 		}
468 
469 		rtnh = rtnh_next(rtnh, &remaining);
470 	} endfor_nexthops(fi);
471 #endif
472 	return 0;
473 }
474 
475 
476 /*
477    Picture
478    -------
479 
480    Semantics of nexthop is very messy by historical reasons.
481    We have to take into account, that:
482    a) gateway can be actually local interface address,
483       so that gatewayed route is direct.
484    b) gateway must be on-link address, possibly
485       described not by an ifaddr, but also by a direct route.
486    c) If both gateway and interface are specified, they should not
487       contradict.
488    d) If we use tunnel routes, gateway could be not on-link.
489 
490    Attempt to reconcile all of these (alas, self-contradictory) conditions
491    results in pretty ugly and hairy code with obscure logic.
492 
   I chose to generalize it instead, so that the size
494    of code does not increase practically, but it becomes
495    much more general.
496    Every prefix is assigned a "scope" value: "host" is local address,
497    "link" is direct route,
498    [ ... "site" ... "interior" ... ]
499    and "universe" is true gateway route with global meaning.
500 
501    Every prefix refers to a set of "nexthop"s (gw, oif),
502    where gw must have narrower scope. This recursion stops
503    when gw has LOCAL scope or if "nexthop" is declared ONLINK,
504    which means that gw is forced to be on link.
505 
506    Code is still hairy, but now it is apparently logically
   consistent and very flexible.  E.g. as a by-product it allows
   independent exterior and interior routing processes to
   co-exist in peace.
510 
511    Normally it looks as following.
512 
513    {universe prefix}  -> (gw, oif) [scope link]
514 			  |
515 			  |-> {link prefix} -> (gw, oif) [scope local]
516 						|
517 						|-> {local prefix} (terminal node)
518  */
519 
/* Resolve and validate one nexthop of a route being added.
 *
 * Gatewayed nexthops: the gateway itself is looked up in the FIB and
 * must resolve to a unicast/local route of narrower scope, unless
 * RTNH_F_ONLINK forces it to be treated as directly reachable via
 * nh_oif.  Device-only nexthops: the interface must exist and be up.
 * On success nh_dev (with a reference held), nh_scope and nh_oif are
 * filled in.  Returns 0 or a negative errno.
 */
static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
			struct fib_nh *nh)
{
	int err;
	struct net *net;

	net = cfg->fc_nlinfo.nl_net;
	if (nh->nh_gw) {
		struct fib_result res;

#ifdef CONFIG_IP_ROUTE_PERVASIVE
		/* pervasive nexthops skip all validation */
		if (nh->nh_flags&RTNH_F_PERVASIVE)
			return 0;
#endif
		if (nh->nh_flags&RTNH_F_ONLINK) {
			struct net_device *dev;

			/* ONLINK only makes sense for gatewayed scopes */
			if (cfg->fc_scope >= RT_SCOPE_LINK)
				return -EINVAL;
			if (inet_addr_type(net, nh->nh_gw) != RTN_UNICAST)
				return -EINVAL;
			if ((dev = __dev_get_by_index(net, nh->nh_oif)) == NULL)
				return -ENODEV;
			if (!(dev->flags&IFF_UP))
				return -ENETDOWN;
			nh->nh_dev = dev;
			dev_hold(dev);
			nh->nh_scope = RT_SCOPE_LINK;
			return 0;
		}
		{
			/* Look the gateway itself up in the FIB; requiring
			 * a narrower scope than the route being added is
			 * what terminates the nexthop recursion.
			 */
			struct flowi fl = {
				.nl_u = {
					.ip4_u = {
						.daddr = nh->nh_gw,
						.scope = cfg->fc_scope + 1,
					},
				},
				.oif = nh->nh_oif,
			};

			/* It is not necessary, but requires a bit of thinking */
			if (fl.fl4_scope < RT_SCOPE_LINK)
				fl.fl4_scope = RT_SCOPE_LINK;
			if ((err = fib_lookup(net, &fl, &res)) != 0)
				return err;
		}
		err = -EINVAL;
		if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
			goto out;
		nh->nh_scope = res.scope;
		nh->nh_oif = FIB_RES_OIF(res);
		if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL)
			goto out;
		dev_hold(nh->nh_dev);
		err = -ENETDOWN;
		if (!(nh->nh_dev->flags & IFF_UP))
			goto out;
		err = 0;
out:
		fib_res_put(&res);
		return err;
	} else {
		struct in_device *in_dev;

		/* flags below require a gateway, which this nexthop lacks */
		if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK))
			return -EINVAL;

		in_dev = inetdev_by_index(net, nh->nh_oif);
		if (in_dev == NULL)
			return -ENODEV;
		if (!(in_dev->dev->flags&IFF_UP)) {
			in_dev_put(in_dev);
			return -ENETDOWN;
		}
		nh->nh_dev = in_dev->dev;
		dev_hold(nh->nh_dev);
		nh->nh_scope = RT_SCOPE_HOST;
		in_dev_put(in_dev);
	}
	return 0;
}
602 
603 static inline unsigned int fib_laddr_hashfn(__be32 val)
604 {
605 	unsigned int mask = (fib_hash_size - 1);
606 
607 	return ((__force u32)val ^ ((__force u32)val >> 7) ^ ((__force u32)val >> 14)) & mask;
608 }
609 
610 static struct hlist_head *fib_hash_alloc(int bytes)
611 {
612 	if (bytes <= PAGE_SIZE)
613 		return kzalloc(bytes, GFP_KERNEL);
614 	else
615 		return (struct hlist_head *)
616 			__get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(bytes));
617 }
618 
619 static void fib_hash_free(struct hlist_head *hash, int bytes)
620 {
621 	if (!hash)
622 		return;
623 
624 	if (bytes <= PAGE_SIZE)
625 		kfree(hash);
626 	else
627 		free_pages((unsigned long) hash, get_order(bytes));
628 }
629 
/* Rehash every fib_info into the freshly allocated, larger tables and
 * publish them.
 *
 * Note that fib_hash_size is updated first, under fib_info_lock, so
 * fib_info_hashfn()/fib_laddr_hashfn() already mask with the NEW size
 * while entries are being moved out of the old buckets.  The old
 * tables are freed after the new ones are visible.
 */
static void fib_hash_move(struct hlist_head *new_info_hash,
			  struct hlist_head *new_laddrhash,
			  unsigned int new_size)
{
	struct hlist_head *old_info_hash, *old_laddrhash;
	unsigned int old_size = fib_hash_size;
	unsigned int i, bytes;

	spin_lock_bh(&fib_info_lock);
	old_info_hash = fib_info_hash;
	old_laddrhash = fib_info_laddrhash;
	fib_hash_size = new_size;

	for (i = 0; i < old_size; i++) {
		struct hlist_head *head = &fib_info_hash[i];
		struct hlist_node *node, *n;
		struct fib_info *fi;

		hlist_for_each_entry_safe(fi, node, n, head, fib_hash) {
			struct hlist_head *dest;
			unsigned int new_hash;

			hlist_del(&fi->fib_hash);

			new_hash = fib_info_hashfn(fi);
			dest = &new_info_hash[new_hash];
			hlist_add_head(&fi->fib_hash, dest);
		}
	}
	fib_info_hash = new_info_hash;

	for (i = 0; i < old_size; i++) {
		struct hlist_head *lhead = &fib_info_laddrhash[i];
		struct hlist_node *node, *n;
		struct fib_info *fi;

		hlist_for_each_entry_safe(fi, node, n, lhead, fib_lhash) {
			struct hlist_head *ldest;
			unsigned int new_hash;

			hlist_del(&fi->fib_lhash);

			new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
			ldest = &new_laddrhash[new_hash];
			hlist_add_head(&fi->fib_lhash, ldest);
		}
	}
	fib_info_laddrhash = new_laddrhash;

	spin_unlock_bh(&fib_info_lock);

	/* sizeof(struct hlist_head *): matches the size the callers pass
	 * to fib_hash_alloc() (an hlist_head is itself a single pointer)
	 */
	bytes = old_size * sizeof(struct hlist_head *);
	fib_hash_free(old_info_hash, bytes);
	fib_hash_free(old_laddrhash, bytes);
}
685 
/* Build (or find and share) a fib_info for the route described by @cfg.
 *
 * On success the returned fib_info carries one fib_treeref and one
 * fib_clntref and is linked into the info/prefsrc/device hashes.  If an
 * equivalent record already exists it is reused and the freshly built
 * one is discarded.  Returns ERR_PTR(-errno) on failure.
 */
struct fib_info *fib_create_info(struct fib_config *cfg)
{
	int err;
	struct fib_info *fi = NULL;
	struct fib_info *ofi;
	int nhs = 1;
	struct net *net = cfg->fc_nlinfo.nl_net;

	/* Fast check to catch the most weird cases */
	if (fib_props[cfg->fc_type].scope > cfg->fc_scope)
		goto err_inval;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (cfg->fc_mp) {
		nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len);
		if (nhs == 0)
			goto err_inval;
	}
#endif

	err = -ENOBUFS;
	/* grow the hash tables once the load factor reaches 1 */
	if (fib_info_cnt >= fib_hash_size) {
		unsigned int new_size = fib_hash_size << 1;
		struct hlist_head *new_info_hash;
		struct hlist_head *new_laddrhash;
		unsigned int bytes;

		if (!new_size)
			new_size = 1;
		bytes = new_size * sizeof(struct hlist_head *);
		new_info_hash = fib_hash_alloc(bytes);
		new_laddrhash = fib_hash_alloc(bytes);
		if (!new_info_hash || !new_laddrhash) {
			fib_hash_free(new_info_hash, bytes);
			fib_hash_free(new_laddrhash, bytes);
		} else
			fib_hash_move(new_info_hash, new_laddrhash, new_size);

		/* still zero: the very first allocation failed above */
		if (!fib_hash_size)
			goto failure;
	}

	/* nexthop array is allocated inline, right behind the fib_info */
	fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
	if (fi == NULL)
		goto failure;
	fib_info_cnt++;

	fi->fib_net = net;
	fi->fib_protocol = cfg->fc_protocol;
	fi->fib_flags = cfg->fc_flags;
	fi->fib_priority = cfg->fc_priority;
	fi->fib_prefsrc = cfg->fc_prefsrc;

	fi->fib_nhs = nhs;
	change_nexthops(fi) {
		nh->nh_parent = fi;
	} endfor_nexthops(fi)

	/* copy the RTAX_* metrics out of the netlink request */
	if (cfg->fc_mx) {
		struct nlattr *nla;
		int remaining;

		nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
			int type = nla_type(nla);

			if (type) {
				if (type > RTAX_MAX)
					goto err_inval;
				fi->fib_metrics[type - 1] = nla_get_u32(nla);
			}
		}
	}

	if (cfg->fc_mp) {
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg);
		if (err != 0)
			goto failure;
		/* top-level oif/gw/flow, when given, must agree with the
		 * first multipath nexthop
		 */
		if (cfg->fc_oif && fi->fib_nh->nh_oif != cfg->fc_oif)
			goto err_inval;
		if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw)
			goto err_inval;
#ifdef CONFIG_NET_CLS_ROUTE
		if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow)
			goto err_inval;
#endif
#else
		goto err_inval;
#endif
	} else {
		struct fib_nh *nh = fi->fib_nh;

		nh->nh_oif = cfg->fc_oif;
		nh->nh_gw = cfg->fc_gw;
		nh->nh_flags = cfg->fc_flags;
#ifdef CONFIG_NET_CLS_ROUTE
		nh->nh_tclassid = cfg->fc_flow;
#endif
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		nh->nh_weight = 1;
#endif
	}

	/* error routes (blackhole etc.) carry no nexthop information */
	if (fib_props[cfg->fc_type].error) {
		if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp)
			goto err_inval;
		goto link_it;
	}

	if (cfg->fc_scope > RT_SCOPE_HOST)
		goto err_inval;

	if (cfg->fc_scope == RT_SCOPE_HOST) {
		struct fib_nh *nh = fi->fib_nh;

		/* Local address is added. */
		if (nhs != 1 || nh->nh_gw)
			goto err_inval;
		nh->nh_scope = RT_SCOPE_NOWHERE;
		nh->nh_dev = dev_get_by_index(net, fi->fib_nh->nh_oif);
		err = -ENODEV;
		if (nh->nh_dev == NULL)
			goto failure;
	} else {
		change_nexthops(fi) {
			if ((err = fib_check_nh(cfg, fi, nh)) != 0)
				goto failure;
		} endfor_nexthops(fi)
	}

	if (fi->fib_prefsrc) {
		/* the preferred source must be one of our local addresses,
		 * except when it is the (local) destination itself
		 */
		if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst ||
		    fi->fib_prefsrc != cfg->fc_dst)
			if (inet_addr_type(net, fi->fib_prefsrc) != RTN_LOCAL)
				goto err_inval;
	}

link_it:
	/* share an already-existing equivalent record when possible */
	if ((ofi = fib_find_info(fi)) != NULL) {
		fi->fib_dead = 1;
		free_fib_info(fi);
		ofi->fib_treeref++;
		return ofi;
	}

	fi->fib_treeref++;
	atomic_inc(&fi->fib_clntref);
	spin_lock_bh(&fib_info_lock);
	hlist_add_head(&fi->fib_hash,
		       &fib_info_hash[fib_info_hashfn(fi)]);
	if (fi->fib_prefsrc) {
		struct hlist_head *head;

		head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
		hlist_add_head(&fi->fib_lhash, head);
	}
	change_nexthops(fi) {
		struct hlist_head *head;
		unsigned int hash;

		if (!nh->nh_dev)
			continue;
		hash = fib_devindex_hashfn(nh->nh_dev->ifindex);
		head = &fib_info_devhash[hash];
		hlist_add_head(&nh->nh_hash, head);
	} endfor_nexthops(fi)
	spin_unlock_bh(&fib_info_lock);
	return fi;

err_inval:
	err = -EINVAL;

failure:
	if (fi) {
		fi->fib_dead = 1;
		free_fib_info(fi);
	}

	return ERR_PTR(err);
}
866 
/* Note! fib_semantic_match intentionally uses  RCU list functions. */
/* Walk the alias list of a matched FIB node and pick the first alias
 * whose TOS and scope fit the flow and that has a usable nexthop.
 * On success fills in *res (taking a clntref on its fib_info) and
 * returns 0; returns 1 when nothing matched, or the route type's
 * error code (e.g. -EHOSTUNREACH for RTN_UNREACHABLE).
 */
int fib_semantic_match(struct list_head *head, const struct flowi *flp,
		       struct fib_result *res, __be32 zone, __be32 mask,
			int prefixlen)
{
	struct fib_alias *fa;
	int nh_sel = 0;

	list_for_each_entry_rcu(fa, head, fa_list) {
		int err;

		/* fa_tos == 0 is a wildcard; otherwise TOS must match */
		if (fa->fa_tos &&
		    fa->fa_tos != flp->fl4_tos)
			continue;

		if (fa->fa_scope < flp->fl4_scope)
			continue;

		fa->fa_state |= FA_S_ACCESSED;

		err = fib_props[fa->fa_type].error;
		if (err == 0) {
			struct fib_info *fi = fa->fa_info;

			if (fi->fib_flags & RTNH_F_DEAD)
				continue;

			switch (fa->fa_type) {
			case RTN_UNICAST:
			case RTN_LOCAL:
			case RTN_BROADCAST:
			case RTN_ANYCAST:
			case RTN_MULTICAST:
				/* find a live nexthop honouring flp->oif;
				 * after an early break, 'nhsel' (declared
				 * by for_nexthops) is still < fib_nhs
				 */
				for_nexthops(fi) {
					if (nh->nh_flags&RTNH_F_DEAD)
						continue;
					if (!flp->oif || flp->oif == nh->nh_oif)
						break;
				}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
				if (nhsel < fi->fib_nhs) {
					nh_sel = nhsel;
					goto out_fill_res;
				}
#else
				if (nhsel < 1) {
					goto out_fill_res;
				}
#endif
				endfor_nexthops(fi);
				continue;

			default:
				printk(KERN_WARNING "fib_semantic_match bad type %#x\n",
					fa->fa_type);
				return -EINVAL;
			}
		}
		return err;
	}
	return 1;

out_fill_res:
	res->prefixlen = prefixlen;
	res->nh_sel = nh_sel;
	res->type = fa->fa_type;
	res->scope = fa->fa_scope;
	res->fi = fa->fa_info;
	/* caller owns this reference; dropped via fib_res_put()/fib_info_put() */
	atomic_inc(&res->fi->fib_clntref);
	return 0;
}
938 
/* Find an appropriate source address for this destination: pick a
 * local address on the result's output device suitable for reaching
 * the gateway at the matched route's scope.
 */
__be32 __fib_res_prefsrc(struct fib_result *res)
{
	return inet_select_addr(FIB_RES_DEV(*res), FIB_RES_GW(*res), res->scope);
}
945 
/* Fill one RTM_NEWROUTE/RTM_DELROUTE message describing the given route
 * into @skb.  The NLA_PUT* macros jump to nla_put_failure when the skb
 * runs out of tailroom.  Returns the final message length, or -EMSGSIZE
 * (after cancelling the partial message) on overflow.
 */
int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
		  u32 tb_id, u8 type, u8 scope, __be32 dst, int dst_len, u8 tos,
		  struct fib_info *fi, unsigned int flags)
{
	struct nlmsghdr *nlh;
	struct rtmsg *rtm;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*rtm), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET;
	rtm->rtm_dst_len = dst_len;
	rtm->rtm_src_len = 0;
	rtm->rtm_tos = tos;
	/* rtm_table is only 8 bits wide; the full id goes in RTA_TABLE */
	rtm->rtm_table = tb_id;
	NLA_PUT_U32(skb, RTA_TABLE, tb_id);
	rtm->rtm_type = type;
	rtm->rtm_flags = fi->fib_flags;
	rtm->rtm_scope = scope;
	rtm->rtm_protocol = fi->fib_protocol;

	if (rtm->rtm_dst_len)
		NLA_PUT_BE32(skb, RTA_DST, dst);

	if (fi->fib_priority)
		NLA_PUT_U32(skb, RTA_PRIORITY, fi->fib_priority);

	if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
		goto nla_put_failure;

	if (fi->fib_prefsrc)
		NLA_PUT_BE32(skb, RTA_PREFSRC, fi->fib_prefsrc);

	/* single nexthop: emit its data as plain top-level attributes */
	if (fi->fib_nhs == 1) {
		if (fi->fib_nh->nh_gw)
			NLA_PUT_BE32(skb, RTA_GATEWAY, fi->fib_nh->nh_gw);

		if (fi->fib_nh->nh_oif)
			NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif);
#ifdef CONFIG_NET_CLS_ROUTE
		if (fi->fib_nh[0].nh_tclassid)
			NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid);
#endif
	}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	/* several nexthops: pack them into a nested RTA_MULTIPATH */
	if (fi->fib_nhs > 1) {
		struct rtnexthop *rtnh;
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (mp == NULL)
			goto nla_put_failure;

		for_nexthops(fi) {
			rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
			if (rtnh == NULL)
				goto nla_put_failure;

			rtnh->rtnh_flags = nh->nh_flags & 0xFF;
			rtnh->rtnh_hops = nh->nh_weight - 1;
			rtnh->rtnh_ifindex = nh->nh_oif;

			if (nh->nh_gw)
				NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw);
#ifdef CONFIG_NET_CLS_ROUTE
			if (nh->nh_tclassid)
				NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid);
#endif
			/* length of rtnetlink header + attributes */
			rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
		} endfor_nexthops(fi);

		nla_nest_end(skb, mp);
	}
#endif
	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
1029 
1030 /*
1031    Update FIB if:
1032    - local address disappeared -> we must delete all the entries
1033      referring to it.
1034    - device went down -> we must shutdown all nexthops going via it.
1035  */
1036 int fib_sync_down_addr(struct net *net, __be32 local)
1037 {
1038 	int ret = 0;
1039 	unsigned int hash = fib_laddr_hashfn(local);
1040 	struct hlist_head *head = &fib_info_laddrhash[hash];
1041 	struct hlist_node *node;
1042 	struct fib_info *fi;
1043 
1044 	if (fib_info_laddrhash == NULL || local == 0)
1045 		return 0;
1046 
1047 	hlist_for_each_entry(fi, node, head, fib_lhash) {
1048 		if (fi->fib_net != net)
1049 			continue;
1050 		if (fi->fib_prefsrc == local) {
1051 			fi->fib_flags |= RTNH_F_DEAD;
1052 			ret++;
1053 		}
1054 	}
1055 	return ret;
1056 }
1057 
/* Mark nexthops going through @dev dead.  @force != 0 disables the
 * scope check (any scope dies); @force > 1 (device unregistering)
 * additionally kills every route using the device outright.  Returns
 * the number of fib_infos newly marked RTNH_F_DEAD.
 */
int fib_sync_down_dev(struct net_device *dev, int force)
{
	int ret = 0;
	int scope = RT_SCOPE_NOWHERE;
	struct fib_info *prev_fi = NULL;
	unsigned int hash = fib_devindex_hashfn(dev->ifindex);
	struct hlist_head *head = &fib_info_devhash[hash];
	struct hlist_node *node;
	struct fib_nh *nh;

	if (force)
		scope = -1;

	hlist_for_each_entry(nh, node, head, nh_hash) {
		struct fib_info *fi = nh->nh_parent;
		int dead;

		BUG_ON(!fi->fib_nhs);
		/* a fib_info appears once per nexthop; handle it only once */
		if (nh->nh_dev != dev || fi == prev_fi)
			continue;
		prev_fi = fi;
		dead = 0;
		/* note: this inner 'nh' shadows the outer loop cursor */
		change_nexthops(fi) {
			if (nh->nh_flags&RTNH_F_DEAD)
				dead++;
			else if (nh->nh_dev == dev &&
					nh->nh_scope != scope) {
				nh->nh_flags |= RTNH_F_DEAD;
#ifdef CONFIG_IP_ROUTE_MULTIPATH
				/* retire its share of the multipath credit */
				spin_lock_bh(&fib_multipath_lock);
				fi->fib_power -= nh->nh_power;
				nh->nh_power = 0;
				spin_unlock_bh(&fib_multipath_lock);
#endif
				dead++;
			}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
			if (force > 1 && nh->nh_dev == dev) {
				dead = fi->fib_nhs;
				break;
			}
#endif
		} endfor_nexthops(fi)
		/* the route dies only when all of its nexthops are dead */
		if (dead == fi->fib_nhs) {
			fi->fib_flags |= RTNH_F_DEAD;
			ret++;
		}
	}

	return ret;
}
1109 
1110 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1111 
/*
   Dead device goes up. We wake up dead nexthops.
   It only makes sense for multipath routes.
 */
1116 
/* Revive nexthops that go through @dev after it has come (back) up.
 * Returns the number of fib_infos whose RTNH_F_DEAD flag was cleared.
 */
int fib_sync_up(struct net_device *dev)
{
	struct fib_info *prev_fi;
	unsigned int hash;
	struct hlist_head *head;
	struct hlist_node *node;
	struct fib_nh *nh;
	int ret;

	if (!(dev->flags&IFF_UP))
		return 0;

	prev_fi = NULL;
	hash = fib_devindex_hashfn(dev->ifindex);
	head = &fib_info_devhash[hash];
	ret = 0;

	hlist_for_each_entry(nh, node, head, nh_hash) {
		struct fib_info *fi = nh->nh_parent;
		int alive;

		BUG_ON(!fi->fib_nhs);
		/* a fib_info appears once per nexthop; handle it only once */
		if (nh->nh_dev != dev || fi == prev_fi)
			continue;

		prev_fi = fi;
		alive = 0;
		/* note: this inner 'nh' shadows the outer loop cursor */
		change_nexthops(fi) {
			if (!(nh->nh_flags&RTNH_F_DEAD)) {
				alive++;
				continue;
			}
			if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP))
				continue;
			/* only revive nexthops on @dev, and only if it
			 * still has IPv4 configuration attached
			 */
			if (nh->nh_dev != dev || !__in_dev_get_rtnl(dev))
				continue;
			alive++;
			spin_lock_bh(&fib_multipath_lock);
			nh->nh_power = 0;
			nh->nh_flags &= ~RTNH_F_DEAD;
			spin_unlock_bh(&fib_multipath_lock);
		} endfor_nexthops(fi)

		if (alive > 0) {
			fi->fib_flags &= ~RTNH_F_DEAD;
			ret++;
		}
	}

	return ret;
}
1168 
1169 /*
1170    The algorithm is suboptimal, but it provides really
1171    fair weighted route distribution.
1172  */
1173 
/* Choose one live nexthop of a multipath route and store its index in
 * res->nh_sel.  Each nexthop holds nh_power "credits" (initially its
 * nh_weight); fib_power tracks the total and is refilled once all
 * credits are spent, yielding weighted round-robin distribution with a
 * jiffies-based pseudo-random starting point.
 */
void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
{
	struct fib_info *fi = res->fi;
	int w;

	spin_lock_bh(&fib_multipath_lock);
	if (fi->fib_power <= 0) {
		/* refill: every live nexthop regains nh_weight credits */
		int power = 0;
		change_nexthops(fi) {
			if (!(nh->nh_flags&RTNH_F_DEAD)) {
				power += nh->nh_weight;
				nh->nh_power = nh->nh_weight;
			}
		} endfor_nexthops(fi);
		fi->fib_power = power;
		if (power <= 0) {
			spin_unlock_bh(&fib_multipath_lock);
			/* Race condition: route has just become dead. */
			res->nh_sel = 0;
			return;
		}
	}


	/* w should be random number [0..fi->fib_power-1],
	   it is pretty bad approximation.
	 */

	w = jiffies % fi->fib_power;

	change_nexthops(fi) {
		if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) {
			if ((w -= nh->nh_power) <= 0) {
				/* charge one credit to the chosen nexthop */
				nh->nh_power--;
				fi->fib_power--;
				res->nh_sel = nhsel;
				spin_unlock_bh(&fib_multipath_lock);
				return;
			}
		}
	} endfor_nexthops(fi);

	/* Race condition: route has just become dead. */
	res->nh_sel = 0;
	spin_unlock_bh(&fib_multipath_lock);
}
1220 #endif
1221