xref: /openbmc/linux/net/ipv4/fib_semantics.c (revision f42b3800)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		IPv4 Forwarding Information Base: semantics.
7  *
8  * Version:	$Id: fib_semantics.c,v 1.19 2002/01/12 07:54:56 davem Exp $
9  *
10  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
11  *
12  *		This program is free software; you can redistribute it and/or
13  *		modify it under the terms of the GNU General Public License
14  *		as published by the Free Software Foundation; either version
15  *		2 of the License, or (at your option) any later version.
16  */
17 
18 #include <asm/uaccess.h>
19 #include <asm/system.h>
20 #include <linux/bitops.h>
21 #include <linux/types.h>
22 #include <linux/kernel.h>
23 #include <linux/jiffies.h>
24 #include <linux/mm.h>
25 #include <linux/string.h>
26 #include <linux/socket.h>
27 #include <linux/sockios.h>
28 #include <linux/errno.h>
29 #include <linux/in.h>
30 #include <linux/inet.h>
31 #include <linux/inetdevice.h>
32 #include <linux/netdevice.h>
33 #include <linux/if_arp.h>
34 #include <linux/proc_fs.h>
35 #include <linux/skbuff.h>
36 #include <linux/init.h>
37 
38 #include <net/arp.h>
39 #include <net/ip.h>
40 #include <net/protocol.h>
41 #include <net/route.h>
42 #include <net/tcp.h>
43 #include <net/sock.h>
44 #include <net/ip_fib.h>
45 #include <net/netlink.h>
46 #include <net/nexthop.h>
47 
48 #include "fib_lookup.h"
49 
/* Serializes all writers of the fib_info hash tables below. */
static DEFINE_SPINLOCK(fib_info_lock);
/* Hash of every fib_info, keyed by fib_info_hashfn(). */
static struct hlist_head *fib_info_hash;
/* Hash of fib_infos that carry a preferred source address,
 * keyed by fib_laddr_hashfn(fib_prefsrc). */
static struct hlist_head *fib_info_laddrhash;
/* Bucket count of both tables above (grown by doubling). */
static unsigned int fib_hash_size;
/* Number of fib_info records currently allocated. */
static unsigned int fib_info_cnt;

#define DEVINDEX_HASHBITS 8
#define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
/* Nexthops hashed by the ifindex of their device, so device
 * up/down events can find affected routes quickly. */
static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
59 
#ifdef CONFIG_IP_ROUTE_MULTIPATH

/* Protects nh_power/fib_power accounting used by multipath selection. */
static DEFINE_SPINLOCK(fib_multipath_lock);

/* Iterate (read-only) over all nexthops of a fib_info.  Opens a block
 * scope (note the bare '{') that must be closed with endfor_nexthops().
 * Inside the loop 'nh' is the current nexthop and 'nhsel' its index. */
#define for_nexthops(fi) { int nhsel; const struct fib_nh * nh; \
for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++)

/* Same as for_nexthops(), but 'nh' is writable. */
#define change_nexthops(fi) { int nhsel; struct fib_nh * nh; \
for (nhsel=0, nh = (struct fib_nh*)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nh++, nhsel++)

#else /* CONFIG_IP_ROUTE_MULTIPATH */

/* Single-nexthop build: the "loop" runs exactly once.
 * Hope, that gcc will optimize it to get rid of dummy loop */

#define for_nexthops(fi) { int nhsel=0; const struct fib_nh * nh = (fi)->fib_nh; \
for (nhsel=0; nhsel < 1; nhsel++)

#define change_nexthops(fi) { int nhsel=0; struct fib_nh * nh = (struct fib_nh*)((fi)->fib_nh); \
for (nhsel=0; nhsel < 1; nhsel++)

#endif /* CONFIG_IP_ROUTE_MULTIPATH */

/* Closes the scope opened by for_nexthops()/change_nexthops(). */
#define endfor_nexthops(fi) }
83 
84 
/* Per-route-type properties, indexed by RTN_* value:
 * .error — error returned to a lookup that hits a route of this type
 *          (0 means the packet is deliverable);
 * .scope — minimum scope the route is allowed to have. */
static const struct
{
	int	error;
	u8	scope;
} fib_props[RTN_MAX + 1] = {
	{
		.error	= 0,
		.scope	= RT_SCOPE_NOWHERE,
	},	/* RTN_UNSPEC */
	{
		.error	= 0,
		.scope	= RT_SCOPE_UNIVERSE,
	},	/* RTN_UNICAST */
	{
		.error	= 0,
		.scope	= RT_SCOPE_HOST,
	},	/* RTN_LOCAL */
	{
		.error	= 0,
		.scope	= RT_SCOPE_LINK,
	},	/* RTN_BROADCAST */
	{
		.error	= 0,
		.scope	= RT_SCOPE_LINK,
	},	/* RTN_ANYCAST */
	{
		.error	= 0,
		.scope	= RT_SCOPE_UNIVERSE,
	},	/* RTN_MULTICAST */
	{
		.error	= -EINVAL,
		.scope	= RT_SCOPE_UNIVERSE,
	},	/* RTN_BLACKHOLE */
	{
		.error	= -EHOSTUNREACH,
		.scope	= RT_SCOPE_UNIVERSE,
	},	/* RTN_UNREACHABLE */
	{
		.error	= -EACCES,
		.scope	= RT_SCOPE_UNIVERSE,
	},	/* RTN_PROHIBIT */
	{
		.error	= -EAGAIN,
		.scope	= RT_SCOPE_UNIVERSE,
	},	/* RTN_THROW */
	{
		.error	= -EINVAL,
		.scope	= RT_SCOPE_NOWHERE,
	},	/* RTN_NAT */
	{
		.error	= -EINVAL,
		.scope	= RT_SCOPE_NOWHERE,
	},	/* RTN_XRESOLVE */
};
139 
140 
/* Release a nexthop info record: drop the device reference held by
 * every nexthop and free the record.  Must only be called once the
 * record has been marked dead (fib_dead set by fib_release_info() or
 * by an error path in fib_create_info()). */

void free_fib_info(struct fib_info *fi)
{
	if (fi->fib_dead == 0) {
		printk(KERN_WARNING "Freeing alive fib_info %p\n", fi);
		return;
	}
	change_nexthops(fi) {
		if (nh->nh_dev)
			dev_put(nh->nh_dev);
		nh->nh_dev = NULL;
	} endfor_nexthops(fi);
	fib_info_cnt--;
	release_net(fi->fib_net);
	kfree(fi);
}
158 
/* Drop one tree reference to @fi.  When the last tree reference goes
 * away, unlink the record from all three hash tables under
 * fib_info_lock, mark it dead and drop the table's refcount via
 * fib_info_put() (free_fib_info() runs when fib_clntref hits zero). */
void fib_release_info(struct fib_info *fi)
{
	spin_lock_bh(&fib_info_lock);
	if (fi && --fi->fib_treeref == 0) {
		hlist_del(&fi->fib_hash);
		if (fi->fib_prefsrc)
			hlist_del(&fi->fib_lhash);
		change_nexthops(fi) {
			/* Nexthops without a device were never hashed. */
			if (!nh->nh_dev)
				continue;
			hlist_del(&nh->nh_hash);
		} endfor_nexthops(fi)
		fi->fib_dead = 1;
		fib_info_put(fi);
	}
	spin_unlock_bh(&fib_info_lock);
}
176 
/* Compare the nexthop lists of @fi and @ofi pairwise (caller has
 * already checked that both have the same fib_nhs).  Returns 0 when
 * they are equivalent, -1 otherwise.  RTNH_F_DEAD is masked out of
 * the flags comparison: it reflects runtime state, not configuration. */
static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
{
	const struct fib_nh *onh = ofi->fib_nh;

	for_nexthops(fi) {
		if (nh->nh_oif != onh->nh_oif ||
		    nh->nh_gw  != onh->nh_gw ||
		    nh->nh_scope != onh->nh_scope ||
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		    nh->nh_weight != onh->nh_weight ||
#endif
#ifdef CONFIG_NET_CLS_ROUTE
		    nh->nh_tclassid != onh->nh_tclassid ||
#endif
		    ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD))
			return -1;
		onh++;
	} endfor_nexthops(fi);
	return 0;
}
197 
198 static inline unsigned int fib_devindex_hashfn(unsigned int val)
199 {
200 	unsigned int mask = DEVINDEX_HASHSIZE - 1;
201 
202 	return (val ^
203 		(val >> DEVINDEX_HASHBITS) ^
204 		(val >> (DEVINDEX_HASHBITS * 2))) & mask;
205 }
206 
/* Hash a fib_info into fib_info_hash by mixing its nexthop count,
 * protocol, preferred source, priority and the ifindex of every
 * nexthop.  Must visit the same fields nh_comp()/fib_find_info()
 * compare, so equivalent records land in the same bucket. */
static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
{
	unsigned int mask = (fib_hash_size - 1);
	unsigned int val = fi->fib_nhs;

	val ^= fi->fib_protocol;
	val ^= (__force u32)fi->fib_prefsrc;
	val ^= fi->fib_priority;
	for_nexthops(fi) {
		val ^= fib_devindex_hashfn(nh->nh_oif);
	} endfor_nexthops(fi)

	/* Mix higher-order bits down before masking to table size. */
	return (val ^ (val >> 7) ^ (val >> 12)) & mask;
}
221 
/* Look up an existing fib_info equivalent to @nfi so that routes with
 * identical semantics can share one record (see fib_create_info()).
 * Returns the matching record or NULL.  RTNH_F_DEAD is ignored when
 * comparing flags, as in nh_comp(). */
static struct fib_info *fib_find_info(const struct fib_info *nfi)
{
	struct hlist_head *head;
	struct hlist_node *node;
	struct fib_info *fi;
	unsigned int hash;

	hash = fib_info_hashfn(nfi);
	head = &fib_info_hash[hash];

	hlist_for_each_entry(fi, node, head, fib_hash) {
		if (fi->fib_net != nfi->fib_net)
			continue;
		if (fi->fib_nhs != nfi->fib_nhs)
			continue;
		if (nfi->fib_protocol == fi->fib_protocol &&
		    nfi->fib_prefsrc == fi->fib_prefsrc &&
		    nfi->fib_priority == fi->fib_priority &&
		    memcmp(nfi->fib_metrics, fi->fib_metrics,
			   sizeof(fi->fib_metrics)) == 0 &&
		    ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 &&
		    (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
			return fi;
	}

	return NULL;
}
249 
/* Check, that the gateway is already configured.
   Used only by redirect accept routine.

   Returns 0 when some live (not RTNH_F_DEAD) nexthop on @dev already
   uses @gw as its gateway, -1 otherwise.
 */

int ip_fib_check_default(__be32 gw, struct net_device *dev)
{
	struct hlist_head *head;
	struct hlist_node *node;
	struct fib_nh *nh;
	unsigned int hash;

	spin_lock(&fib_info_lock);

	hash = fib_devindex_hashfn(dev->ifindex);
	head = &fib_info_devhash[hash];
	hlist_for_each_entry(nh, node, head, nh_hash) {
		if (nh->nh_dev == dev &&
		    nh->nh_gw == gw &&
		    !(nh->nh_flags&RTNH_F_DEAD)) {
			spin_unlock(&fib_info_lock);
			return 0;
		}
	}

	spin_unlock(&fib_info_lock);

	return -1;
}
278 
/* Worst-case netlink message size needed by fib_dump_info() for @fi;
 * used to size the skb allocated in rtmsg_fib(). */
static inline size_t fib_nlmsg_size(struct fib_info *fi)
{
	size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg))
			 + nla_total_size(4) /* RTA_TABLE */
			 + nla_total_size(4) /* RTA_DST */
			 + nla_total_size(4) /* RTA_PRIORITY */
			 + nla_total_size(4); /* RTA_PREFSRC */

	/* space for nested metrics */
	payload += nla_total_size((RTAX_MAX * nla_total_size(4)));

	if (fi->fib_nhs) {
		/* Also handles the special case fib_nhs == 1 */

		/* each nexthop is packed in an attribute */
		size_t nhsize = nla_total_size(sizeof(struct rtnexthop));

		/* may contain flow and gateway attribute */
		nhsize += 2 * nla_total_size(4);

		/* all nexthops are packed in a nested attribute */
		payload += nla_total_size(fi->fib_nhs * nhsize);
	}

	return payload;
}
305 
/* Broadcast an RTM_NEWROUTE/RTM_DELROUTE notification for alias @fa
 * (prefix @key/@dst_len in table @tb_id) to RTNLGRP_IPV4_ROUTE
 * listeners.  On any failure the error is recorded on the rtnl socket
 * via rtnl_set_sk_err() so listeners learn they missed an event. */
void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
	       int dst_len, u32 tb_id, struct nl_info *info,
	       unsigned int nlm_flags)
{
	struct sk_buff *skb;
	u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
	int err = -ENOBUFS;

	skb = nlmsg_new(fib_nlmsg_size(fa->fa_info), GFP_KERNEL);
	if (skb == NULL)
		goto errout;

	err = fib_dump_info(skb, info->pid, seq, event, tb_id,
			    fa->fa_type, fa->fa_scope, key, dst_len,
			    fa->fa_tos, fa->fa_info, nlm_flags);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in fib_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	err = rtnl_notify(skb, info->nl_net, info->pid, RTNLGRP_IPV4_ROUTE,
			  info->nlh, GFP_KERNEL);
errout:
	if (err < 0)
		rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err);
}
333 
/* Return the first fib alias matching TOS with
 * priority less than or equal to PRIO.
 *
 * The list is kept sorted by descending fa_tos; once fa_tos drops
 * below @tos no exact TOS match can follow, so the first such entry
 * is returned as the insertion point.  Returns NULL if @fah is NULL
 * or no entry qualifies.
 */
struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)
{
	if (fah) {
		struct fib_alias *fa;
		list_for_each_entry(fa, fah, fa_list) {
			if (fa->fa_tos > tos)
				continue;
			if (fa->fa_info->fib_priority >= prio ||
			    fa->fa_tos < tos)
				return fa;
		}
	}
	return NULL;
}
351 
/* Probe the ARP state of @fi's first-hop gateway to judge whether a
 * default route (at position @order, current default @dflt) looks
 * dead.  Returns 0 when the route should be kept, 1 when it appears
 * dead.  A candidate that is still NUD_VALID, or the first route past
 * the current default when nothing was remembered yet, is recorded in
 * *last_resort/*last_idx as a fallback for the caller. */
int fib_detect_death(struct fib_info *fi, int order,
		     struct fib_info **last_resort, int *last_idx, int dflt)
{
	struct neighbour *n;
	int state = NUD_NONE;

	n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev);
	if (n) {
		state = n->nud_state;
		neigh_release(n);
	}
	if (state==NUD_REACHABLE)
		return 0;
	if ((state&NUD_VALID) && order != dflt)
		return 0;
	if ((state&NUD_VALID) ||
	    (*last_idx<0 && order > dflt)) {
		*last_resort = fi;
		*last_idx = order;
	}
	return 1;
}
374 
375 #ifdef CONFIG_IP_ROUTE_MULTIPATH
376 
/* Count the nexthop entries in an RTA_MULTIPATH payload of
 * @remaining bytes.  Returns the count, or 0 when trailing bytes
 * remain (a leftover implies a malformed nexthop list). */
static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining)
{
	int nhs;

	for (nhs = 0; rtnh_ok(rtnh, remaining); nhs++)
		rtnh = rtnh_next(rtnh, &remaining);

	return remaining > 0 ? 0 : nhs;
}
389 
/* Fill @fi's nexthop array from the RTA_MULTIPATH entry list at
 * @rtnh (@remaining bytes long), taking flags from @cfg.  Returns 0,
 * or -EINVAL when the list is shorter than fi->fib_nhs entries
 * (fib_count_nexthops() already sized the array). */
static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
		       int remaining, struct fib_config *cfg)
{
	change_nexthops(fi) {
		int attrlen;

		if (!rtnh_ok(rtnh, remaining))
			return -EINVAL;

		/* Upper flag byte comes from the route config, low byte
		 * from the per-nexthop entry. */
		nh->nh_flags = (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
		nh->nh_oif = rtnh->rtnh_ifindex;
		nh->nh_weight = rtnh->rtnh_hops + 1;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			nh->nh_gw = nla ? nla_get_be32(nla) : 0;
#ifdef CONFIG_NET_CLS_ROUTE
			nla = nla_find(attrs, attrlen, RTA_FLOW);
			nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
#endif
		}

		rtnh = rtnh_next(rtnh, &remaining);
	} endfor_nexthops(fi);

	return 0;
}
420 
421 #endif
422 
423 int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
424 {
425 #ifdef CONFIG_IP_ROUTE_MULTIPATH
426 	struct rtnexthop *rtnh;
427 	int remaining;
428 #endif
429 
430 	if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority)
431 		return 1;
432 
433 	if (cfg->fc_oif || cfg->fc_gw) {
434 		if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) &&
435 		    (!cfg->fc_gw  || cfg->fc_gw == fi->fib_nh->nh_gw))
436 			return 0;
437 		return 1;
438 	}
439 
440 #ifdef CONFIG_IP_ROUTE_MULTIPATH
441 	if (cfg->fc_mp == NULL)
442 		return 0;
443 
444 	rtnh = cfg->fc_mp;
445 	remaining = cfg->fc_mp_len;
446 
447 	for_nexthops(fi) {
448 		int attrlen;
449 
450 		if (!rtnh_ok(rtnh, remaining))
451 			return -EINVAL;
452 
453 		if (rtnh->rtnh_ifindex && rtnh->rtnh_ifindex != nh->nh_oif)
454 			return 1;
455 
456 		attrlen = rtnh_attrlen(rtnh);
457 		if (attrlen < 0) {
458 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
459 
460 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
461 			if (nla && nla_get_be32(nla) != nh->nh_gw)
462 				return 1;
463 #ifdef CONFIG_NET_CLS_ROUTE
464 			nla = nla_find(attrs, attrlen, RTA_FLOW);
465 			if (nla && nla_get_u32(nla) != nh->nh_tclassid)
466 				return 1;
467 #endif
468 		}
469 
470 		rtnh = rtnh_next(rtnh, &remaining);
471 	} endfor_nexthops(fi);
472 #endif
473 	return 0;
474 }
475 
476 
477 /*
478    Picture
479    -------
480 
481    Semantics of nexthop is very messy by historical reasons.
482    We have to take into account, that:
483    a) gateway can be actually local interface address,
484       so that gatewayed route is direct.
485    b) gateway must be on-link address, possibly
486       described not by an ifaddr, but also by a direct route.
487    c) If both gateway and interface are specified, they should not
488       contradict.
489    d) If we use tunnel routes, gateway could be not on-link.
490 
491    Attempt to reconcile all of these (alas, self-contradictory) conditions
492    results in pretty ugly and hairy code with obscure logic.
493 
494    I chose to generalized it instead, so that the size
495    of code does not increase practically, but it becomes
496    much more general.
497    Every prefix is assigned a "scope" value: "host" is local address,
498    "link" is direct route,
499    [ ... "site" ... "interior" ... ]
500    and "universe" is true gateway route with global meaning.
501 
502    Every prefix refers to a set of "nexthop"s (gw, oif),
503    where gw must have narrower scope. This recursion stops
504    when gw has LOCAL scope or if "nexthop" is declared ONLINK,
505    which means that gw is forced to be on link.
506 
507    Code is still hairy, but now it is apparently logically
508    consistent and very flexible. F.e. as by-product it allows
   to co-exist in peace with independent exterior and interior
   routing processes.
511 
512    Normally it looks as following.
513 
514    {universe prefix}  -> (gw, oif) [scope link]
515 			  |
516 			  |-> {link prefix} -> (gw, oif) [scope local]
517 						|
518 						|-> {local prefix} (terminal node)
519  */
520 
/* Validate and resolve one nexthop of a route being created:
 *
 *  - a gatewayed nexthop must resolve — via fib_lookup(), or directly
 *    for RTNH_F_ONLINK — to a route of narrower scope, which supplies
 *    nh_scope, nh_oif and nh_dev;
 *  - a direct (gateway-less) nexthop only needs an UP device, and
 *    gets RT_SCOPE_HOST.
 *
 * On success nh->nh_dev holds a device reference.  Returns 0 or a
 * negative errno. */
static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
			struct fib_nh *nh)
{
	int err;
	struct net *net;

	net = cfg->fc_nlinfo.nl_net;
	if (nh->nh_gw) {
		struct fib_result res;

#ifdef CONFIG_IP_ROUTE_PERVASIVE
		/* Pervasive nexthops skip all validation. */
		if (nh->nh_flags&RTNH_F_PERVASIVE)
			return 0;
#endif
		if (nh->nh_flags&RTNH_F_ONLINK) {
			struct net_device *dev;

			/* ONLINK forces the gateway to be on-link, which only
			 * makes sense for routes of scope wider than link. */
			if (cfg->fc_scope >= RT_SCOPE_LINK)
				return -EINVAL;
			if (inet_addr_type(net, nh->nh_gw) != RTN_UNICAST)
				return -EINVAL;
			if ((dev = __dev_get_by_index(net, nh->nh_oif)) == NULL)
				return -ENODEV;
			if (!(dev->flags&IFF_UP))
				return -ENETDOWN;
			nh->nh_dev = dev;
			dev_hold(dev);
			nh->nh_scope = RT_SCOPE_LINK;
			return 0;
		}
		{
			/* Look the gateway up, demanding a route of strictly
			 * narrower scope than this one. */
			struct flowi fl = {
				.nl_u = {
					.ip4_u = {
						.daddr = nh->nh_gw,
						.scope = cfg->fc_scope + 1,
					},
				},
				.oif = nh->nh_oif,
			};

			/* It is not necessary, but requires a bit of thinking */
			if (fl.fl4_scope < RT_SCOPE_LINK)
				fl.fl4_scope = RT_SCOPE_LINK;
			if ((err = fib_lookup(net, &fl, &res)) != 0)
				return err;
		}
		err = -EINVAL;
		if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
			goto out;
		nh->nh_scope = res.scope;
		nh->nh_oif = FIB_RES_OIF(res);
		if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL)
			goto out;
		dev_hold(nh->nh_dev);
		err = -ENETDOWN;
		if (!(nh->nh_dev->flags & IFF_UP))
			goto out;
		err = 0;
out:
		fib_res_put(&res);
		return err;
	} else {
		struct in_device *in_dev;

		/* Without a gateway these flags are meaningless. */
		if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK))
			return -EINVAL;

		in_dev = inetdev_by_index(net, nh->nh_oif);
		if (in_dev == NULL)
			return -ENODEV;
		if (!(in_dev->dev->flags&IFF_UP)) {
			in_dev_put(in_dev);
			return -ENETDOWN;
		}
		nh->nh_dev = in_dev->dev;
		dev_hold(nh->nh_dev);
		nh->nh_scope = RT_SCOPE_HOST;
		in_dev_put(in_dev);
	}
	return 0;
}
603 
604 static inline unsigned int fib_laddr_hashfn(__be32 val)
605 {
606 	unsigned int mask = (fib_hash_size - 1);
607 
608 	return ((__force u32)val ^ ((__force u32)val >> 7) ^ ((__force u32)val >> 14)) & mask;
609 }
610 
611 static struct hlist_head *fib_hash_alloc(int bytes)
612 {
613 	if (bytes <= PAGE_SIZE)
614 		return kzalloc(bytes, GFP_KERNEL);
615 	else
616 		return (struct hlist_head *)
617 			__get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(bytes));
618 }
619 
620 static void fib_hash_free(struct hlist_head *hash, int bytes)
621 {
622 	if (!hash)
623 		return;
624 
625 	if (bytes <= PAGE_SIZE)
626 		kfree(hash);
627 	else
628 		free_pages((unsigned long) hash, get_order(bytes));
629 }
630 
/* Rehash every fib_info from the current tables into the freshly
 * allocated @new_info_hash/@new_laddrhash of @new_size buckets, then
 * publish the new tables and free the old ones.  Runs under
 * fib_info_lock so no concurrent insert/remove can race the move. */
static void fib_hash_move(struct hlist_head *new_info_hash,
			  struct hlist_head *new_laddrhash,
			  unsigned int new_size)
{
	struct hlist_head *old_info_hash, *old_laddrhash;
	unsigned int old_size = fib_hash_size;
	unsigned int i, bytes;

	spin_lock_bh(&fib_info_lock);
	old_info_hash = fib_info_hash;
	old_laddrhash = fib_info_laddrhash;
	/* fib_hash_size must be updated first: the hash functions below
	 * mask with (fib_hash_size - 1) for the NEW tables. */
	fib_hash_size = new_size;

	for (i = 0; i < old_size; i++) {
		struct hlist_head *head = &fib_info_hash[i];
		struct hlist_node *node, *n;
		struct fib_info *fi;

		hlist_for_each_entry_safe(fi, node, n, head, fib_hash) {
			struct hlist_head *dest;
			unsigned int new_hash;

			hlist_del(&fi->fib_hash);

			new_hash = fib_info_hashfn(fi);
			dest = &new_info_hash[new_hash];
			hlist_add_head(&fi->fib_hash, dest);
		}
	}
	fib_info_hash = new_info_hash;

	for (i = 0; i < old_size; i++) {
		struct hlist_head *lhead = &fib_info_laddrhash[i];
		struct hlist_node *node, *n;
		struct fib_info *fi;

		hlist_for_each_entry_safe(fi, node, n, lhead, fib_lhash) {
			struct hlist_head *ldest;
			unsigned int new_hash;

			hlist_del(&fi->fib_lhash);

			new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
			ldest = &new_laddrhash[new_hash];
			hlist_add_head(&fi->fib_lhash, ldest);
		}
	}
	fib_info_laddrhash = new_laddrhash;

	spin_unlock_bh(&fib_info_lock);

	/* NOTE(review): sizeof(struct hlist_head *) only matches the
	 * element size because hlist_head holds a single pointer;
	 * sizeof(struct hlist_head) would be the clearer spelling. */
	bytes = old_size * sizeof(struct hlist_head *);
	fib_hash_free(old_info_hash, bytes);
	fib_hash_free(old_laddrhash, bytes);
}
686 
/* Build (or reuse) a fib_info from a route configuration.
 *
 * Validates type/scope, grows the hash tables if needed, parses
 * metrics and nexthops, resolves every nexthop (fib_check_nh), then
 * either returns an existing identical record (shared via
 * fib_treeref) or links the new one into all hash tables.
 *
 * Returns the fib_info on success, ERR_PTR(-errno) on failure.
 */
struct fib_info *fib_create_info(struct fib_config *cfg)
{
	int err;
	struct fib_info *fi = NULL;
	struct fib_info *ofi;
	int nhs = 1;
	struct net *net = cfg->fc_nlinfo.nl_net;

	/* Fast check to catch the most weird cases */
	if (fib_props[cfg->fc_type].scope > cfg->fc_scope)
		goto err_inval;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (cfg->fc_mp) {
		nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len);
		if (nhs == 0)
			goto err_inval;
	}
#endif

	err = -ENOBUFS;
	/* Double the hash tables once the load factor reaches 1. */
	if (fib_info_cnt >= fib_hash_size) {
		unsigned int new_size = fib_hash_size << 1;
		struct hlist_head *new_info_hash;
		struct hlist_head *new_laddrhash;
		unsigned int bytes;

		if (!new_size)
			new_size = 1;
		bytes = new_size * sizeof(struct hlist_head *);
		new_info_hash = fib_hash_alloc(bytes);
		new_laddrhash = fib_hash_alloc(bytes);
		if (!new_info_hash || !new_laddrhash) {
			fib_hash_free(new_info_hash, bytes);
			fib_hash_free(new_laddrhash, bytes);
		} else
			fib_hash_move(new_info_hash, new_laddrhash, new_size);

		/* Still zero -> even the very first allocation failed. */
		if (!fib_hash_size)
			goto failure;
	}

	fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
	if (fi == NULL)
		goto failure;
	fib_info_cnt++;

	fi->fib_net = hold_net(net);
	fi->fib_protocol = cfg->fc_protocol;
	fi->fib_flags = cfg->fc_flags;
	fi->fib_priority = cfg->fc_priority;
	fi->fib_prefsrc = cfg->fc_prefsrc;

	fi->fib_nhs = nhs;
	change_nexthops(fi) {
		nh->nh_parent = fi;
	} endfor_nexthops(fi)

	if (cfg->fc_mx) {
		struct nlattr *nla;
		int remaining;

		nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
			int type = nla_type(nla);

			if (type) {
				if (type > RTAX_MAX)
					goto err_inval;
				fi->fib_metrics[type - 1] = nla_get_u32(nla);
			}
		}
	}

	if (cfg->fc_mp) {
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg);
		if (err != 0)
			goto failure;
		/* If oif/gw/flow were also given directly, they must agree
		 * with the first nexthop of the multipath list. */
		if (cfg->fc_oif && fi->fib_nh->nh_oif != cfg->fc_oif)
			goto err_inval;
		if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw)
			goto err_inval;
#ifdef CONFIG_NET_CLS_ROUTE
		if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow)
			goto err_inval;
#endif
#else
		goto err_inval;
#endif
	} else {
		struct fib_nh *nh = fi->fib_nh;

		nh->nh_oif = cfg->fc_oif;
		nh->nh_gw = cfg->fc_gw;
		nh->nh_flags = cfg->fc_flags;
#ifdef CONFIG_NET_CLS_ROUTE
		nh->nh_tclassid = cfg->fc_flow;
#endif
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		nh->nh_weight = 1;
#endif
	}

	/* Error routes (blackhole, unreachable, ...) take no nexthops. */
	if (fib_props[cfg->fc_type].error) {
		if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp)
			goto err_inval;
		goto link_it;
	}

	if (cfg->fc_scope > RT_SCOPE_HOST)
		goto err_inval;

	if (cfg->fc_scope == RT_SCOPE_HOST) {
		struct fib_nh *nh = fi->fib_nh;

		/* Local address is added. */
		if (nhs != 1 || nh->nh_gw)
			goto err_inval;
		nh->nh_scope = RT_SCOPE_NOWHERE;
		nh->nh_dev = dev_get_by_index(net, fi->fib_nh->nh_oif);
		err = -ENODEV;
		if (nh->nh_dev == NULL)
			goto failure;
	} else {
		change_nexthops(fi) {
			if ((err = fib_check_nh(cfg, fi, nh)) != 0)
				goto failure;
		} endfor_nexthops(fi)
	}

	if (fi->fib_prefsrc) {
		/* The preferred source must be one of our local addresses,
		 * except when it is the destination of a local route. */
		if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst ||
		    fi->fib_prefsrc != cfg->fc_dst)
			if (inet_addr_type(net, fi->fib_prefsrc) != RTN_LOCAL)
				goto err_inval;
	}

link_it:
	/* Share an existing equivalent record if one exists. */
	if ((ofi = fib_find_info(fi)) != NULL) {
		fi->fib_dead = 1;
		free_fib_info(fi);
		ofi->fib_treeref++;
		return ofi;
	}

	fi->fib_treeref++;
	atomic_inc(&fi->fib_clntref);
	spin_lock_bh(&fib_info_lock);
	hlist_add_head(&fi->fib_hash,
		       &fib_info_hash[fib_info_hashfn(fi)]);
	if (fi->fib_prefsrc) {
		struct hlist_head *head;

		head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
		hlist_add_head(&fi->fib_lhash, head);
	}
	change_nexthops(fi) {
		struct hlist_head *head;
		unsigned int hash;

		if (!nh->nh_dev)
			continue;
		hash = fib_devindex_hashfn(nh->nh_dev->ifindex);
		head = &fib_info_devhash[hash];
		hlist_add_head(&nh->nh_hash, head);
	} endfor_nexthops(fi)
	spin_unlock_bh(&fib_info_lock);
	return fi;

err_inval:
	err = -EINVAL;

failure:
	if (fi) {
		fi->fib_dead = 1;
		free_fib_info(fi);
	}

	return ERR_PTR(err);
}
867 
/* Note! fib_semantic_match intentionally uses  RCU list functions. */
/* Walk the alias list of a matched prefix and pick the first alias
 * whose TOS/scope fit the flow and that has a usable (non-dead)
 * nexthop honouring flp->oif.  Fills *res (taking a fib_clntref) and
 * returns 0; returns 1 when nothing matched, or the fib_props error
 * for error-type routes (blackhole etc.), or -EINVAL on a bad type. */
int fib_semantic_match(struct list_head *head, const struct flowi *flp,
		       struct fib_result *res, __be32 zone, __be32 mask,
			int prefixlen)
{
	struct fib_alias *fa;
	int nh_sel = 0;

	list_for_each_entry_rcu(fa, head, fa_list) {
		int err;

		if (fa->fa_tos &&
		    fa->fa_tos != flp->fl4_tos)
			continue;

		if (fa->fa_scope < flp->fl4_scope)
			continue;

		fa->fa_state |= FA_S_ACCESSED;

		err = fib_props[fa->fa_type].error;
		if (err == 0) {
			struct fib_info *fi = fa->fa_info;

			if (fi->fib_flags & RTNH_F_DEAD)
				continue;

			switch (fa->fa_type) {
			case RTN_UNICAST:
			case RTN_LOCAL:
			case RTN_BROADCAST:
			case RTN_ANYCAST:
			case RTN_MULTICAST:
				/* The break below leaves the loop with nhsel
				 * still in scope (endfor_nexthops() has not
				 * closed the macro's block yet): nhsel equals
				 * the selected index, or fib_nhs when no
				 * nexthop qualified. */
				for_nexthops(fi) {
					if (nh->nh_flags&RTNH_F_DEAD)
						continue;
					if (!flp->oif || flp->oif == nh->nh_oif)
						break;
				}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
				if (nhsel < fi->fib_nhs) {
					nh_sel = nhsel;
					goto out_fill_res;
				}
#else
				if (nhsel < 1) {
					goto out_fill_res;
				}
#endif
				endfor_nexthops(fi);
				continue;

			default:
				printk(KERN_WARNING "fib_semantic_match bad type %#x\n",
					fa->fa_type);
				return -EINVAL;
			}
		}
		return err;
	}
	return 1;

out_fill_res:
	res->prefixlen = prefixlen;
	res->nh_sel = nh_sel;
	res->type = fa->fa_type;
	res->scope = fa->fa_scope;
	res->fi = fa->fa_info;
	/* Caller owns a reference on the returned fib_info. */
	atomic_inc(&res->fi->fib_clntref);
	return 0;
}
939 
/* Find appropriate source address to this destination:
 * pick a local address on the result's output device, preferring one
 * in the same subnet as the gateway, at the result's scope. */

__be32 __fib_res_prefsrc(struct fib_result *res)
{
	return inet_select_addr(FIB_RES_DEV(*res), FIB_RES_GW(*res), res->scope);
}
946 
/* Serialize one route (prefix @dst/@dst_len with info @fi) into @skb
 * as an rtmsg netlink message.  A single nexthop is emitted as flat
 * RTA_GATEWAY/RTA_OIF/RTA_FLOW attributes; multiple nexthops go into
 * a nested RTA_MULTIPATH attribute.  Returns the message length or
 * -EMSGSIZE when @skb ran out of room (message is cancelled). */
int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
		  u32 tb_id, u8 type, u8 scope, __be32 dst, int dst_len, u8 tos,
		  struct fib_info *fi, unsigned int flags)
{
	struct nlmsghdr *nlh;
	struct rtmsg *rtm;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*rtm), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET;
	rtm->rtm_dst_len = dst_len;
	rtm->rtm_src_len = 0;
	rtm->rtm_tos = tos;
	/* rtm_table is u8; the full id also goes in RTA_TABLE. */
	rtm->rtm_table = tb_id;
	NLA_PUT_U32(skb, RTA_TABLE, tb_id);
	rtm->rtm_type = type;
	rtm->rtm_flags = fi->fib_flags;
	rtm->rtm_scope = scope;
	rtm->rtm_protocol = fi->fib_protocol;

	if (rtm->rtm_dst_len)
		NLA_PUT_BE32(skb, RTA_DST, dst);

	if (fi->fib_priority)
		NLA_PUT_U32(skb, RTA_PRIORITY, fi->fib_priority);

	if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
		goto nla_put_failure;

	if (fi->fib_prefsrc)
		NLA_PUT_BE32(skb, RTA_PREFSRC, fi->fib_prefsrc);

	if (fi->fib_nhs == 1) {
		if (fi->fib_nh->nh_gw)
			NLA_PUT_BE32(skb, RTA_GATEWAY, fi->fib_nh->nh_gw);

		if (fi->fib_nh->nh_oif)
			NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif);
#ifdef CONFIG_NET_CLS_ROUTE
		if (fi->fib_nh[0].nh_tclassid)
			NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid);
#endif
	}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (fi->fib_nhs > 1) {
		struct rtnexthop *rtnh;
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (mp == NULL)
			goto nla_put_failure;

		for_nexthops(fi) {
			rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
			if (rtnh == NULL)
				goto nla_put_failure;

			rtnh->rtnh_flags = nh->nh_flags & 0xFF;
			rtnh->rtnh_hops = nh->nh_weight - 1;
			rtnh->rtnh_ifindex = nh->nh_oif;

			if (nh->nh_gw)
				NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw);
#ifdef CONFIG_NET_CLS_ROUTE
			if (nh->nh_tclassid)
				NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid);
#endif
			/* length of rtnetlink header + attributes */
			rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
		} endfor_nexthops(fi);

		nla_nest_end(skb, mp);
	}
#endif
	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
1030 
1031 /*
1032    Update FIB if:
1033    - local address disappeared -> we must delete all the entries
1034      referring to it.
1035    - device went down -> we must shutdown all nexthops going via it.
1036  */
1037 int fib_sync_down_addr(struct net *net, __be32 local)
1038 {
1039 	int ret = 0;
1040 	unsigned int hash = fib_laddr_hashfn(local);
1041 	struct hlist_head *head = &fib_info_laddrhash[hash];
1042 	struct hlist_node *node;
1043 	struct fib_info *fi;
1044 
1045 	if (fib_info_laddrhash == NULL || local == 0)
1046 		return 0;
1047 
1048 	hlist_for_each_entry(fi, node, head, fib_lhash) {
1049 		if (fi->fib_net != net)
1050 			continue;
1051 		if (fi->fib_prefsrc == local) {
1052 			fi->fib_flags |= RTNH_F_DEAD;
1053 			ret++;
1054 		}
1055 	}
1056 	return ret;
1057 }
1058 
/* Device @dev went down: mark as dead every nexthop using it (except,
 * unless @force, those already at RT_SCOPE_NOWHERE), and mark whole
 * fib_infos dead when all their nexthops are.  With force > 1 the
 * entire route dies as soon as any nexthop uses @dev.  Returns the
 * number of fib_infos marked dead. */
int fib_sync_down_dev(struct net_device *dev, int force)
{
	int ret = 0;
	int scope = RT_SCOPE_NOWHERE;
	struct fib_info *prev_fi = NULL;
	unsigned int hash = fib_devindex_hashfn(dev->ifindex);
	struct hlist_head *head = &fib_info_devhash[hash];
	struct hlist_node *node;
	struct fib_nh *nh;

	if (force)
		scope = -1;

	hlist_for_each_entry(nh, node, head, nh_hash) {
		struct fib_info *fi = nh->nh_parent;
		int dead;

		BUG_ON(!fi->fib_nhs);
		/* prev_fi skips duplicate work when several nexthops of the
		 * same fib_info hash to this bucket. */
		if (nh->nh_dev != dev || fi == prev_fi)
			continue;
		prev_fi = fi;
		dead = 0;
		/* Note: inside change_nexthops() 'nh' deliberately shadows
		 * the outer iteration variable. */
		change_nexthops(fi) {
			if (nh->nh_flags&RTNH_F_DEAD)
				dead++;
			else if (nh->nh_dev == dev &&
					nh->nh_scope != scope) {
				nh->nh_flags |= RTNH_F_DEAD;
#ifdef CONFIG_IP_ROUTE_MULTIPATH
				/* Withdraw this nexthop's share of the
				 * multipath weight accounting. */
				spin_lock_bh(&fib_multipath_lock);
				fi->fib_power -= nh->nh_power;
				nh->nh_power = 0;
				spin_unlock_bh(&fib_multipath_lock);
#endif
				dead++;
			}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
			if (force > 1 && nh->nh_dev == dev) {
				dead = fi->fib_nhs;
				break;
			}
#endif
		} endfor_nexthops(fi)
		if (dead == fi->fib_nhs) {
			fi->fib_flags |= RTNH_F_DEAD;
			ret++;
		}
	}

	return ret;
}
1110 
1111 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1112 
/*
   Dead device goes up. We wake up dead nexthops.
   It makes sense only on multipath routes.

   Revive every dead nexthop whose device is @dev (provided the device
   is IFF_UP and has IPv4 configured), and clear RTNH_F_DEAD on any
   fib_info that regains a live nexthop.  Returns the number of
   fib_infos revived.
 */

int fib_sync_up(struct net_device *dev)
{
	struct fib_info *prev_fi;
	unsigned int hash;
	struct hlist_head *head;
	struct hlist_node *node;
	struct fib_nh *nh;
	int ret;

	if (!(dev->flags&IFF_UP))
		return 0;

	prev_fi = NULL;
	hash = fib_devindex_hashfn(dev->ifindex);
	head = &fib_info_devhash[hash];
	ret = 0;

	hlist_for_each_entry(nh, node, head, nh_hash) {
		struct fib_info *fi = nh->nh_parent;
		int alive;

		BUG_ON(!fi->fib_nhs);
		/* prev_fi skips duplicate work when several nexthops of the
		 * same fib_info hash to this bucket. */
		if (nh->nh_dev != dev || fi == prev_fi)
			continue;

		prev_fi = fi;
		alive = 0;
		/* Note: inside change_nexthops() 'nh' deliberately shadows
		 * the outer iteration variable. */
		change_nexthops(fi) {
			if (!(nh->nh_flags&RTNH_F_DEAD)) {
				alive++;
				continue;
			}
			if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP))
				continue;
			if (nh->nh_dev != dev || !__in_dev_get_rtnl(dev))
				continue;
			alive++;
			spin_lock_bh(&fib_multipath_lock);
			nh->nh_power = 0;
			nh->nh_flags &= ~RTNH_F_DEAD;
			spin_unlock_bh(&fib_multipath_lock);
		} endfor_nexthops(fi)

		if (alive > 0) {
			fi->fib_flags &= ~RTNH_F_DEAD;
			ret++;
		}
	}

	return ret;
}
1169 
/*
   The algorithm is suboptimal, but it provides really
   fair weighted route distribution.

   Weighted round-robin nexthop selection: each live nexthop gets
   nh_weight "tokens" (nh_power); a pseudo-random threshold w picks a
   nexthop proportionally to its remaining tokens and consumes one.
   When the pool (fib_power) is exhausted it is refilled from the
   weights.  Result is written to res->nh_sel; on the race where the
   route just died, nexthop 0 is reported.
 */

void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
{
	struct fib_info *fi = res->fi;
	int w;

	spin_lock_bh(&fib_multipath_lock);
	if (fi->fib_power <= 0) {
		/* Refill the token pool from the live nexthops' weights. */
		int power = 0;
		change_nexthops(fi) {
			if (!(nh->nh_flags&RTNH_F_DEAD)) {
				power += nh->nh_weight;
				nh->nh_power = nh->nh_weight;
			}
		} endfor_nexthops(fi);
		fi->fib_power = power;
		if (power <= 0) {
			spin_unlock_bh(&fib_multipath_lock);
			/* Race condition: route has just become dead. */
			res->nh_sel = 0;
			return;
		}
	}


	/* w should be random number [0..fi->fib_power-1],
	   it is pretty bad approximation.
	 */

	w = jiffies % fi->fib_power;

	change_nexthops(fi) {
		if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) {
			if ((w -= nh->nh_power) <= 0) {
				nh->nh_power--;
				fi->fib_power--;
				res->nh_sel = nhsel;
				spin_unlock_bh(&fib_multipath_lock);
				return;
			}
		}
	} endfor_nexthops(fi);

	/* Race condition: route has just become dead. */
	res->nh_sel = 0;
	spin_unlock_bh(&fib_multipath_lock);
}
1221 #endif
1222