xref: /openbmc/linux/net/ipv4/fib_semantics.c (revision 9ac8d3fb)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		IPv4 Forwarding Information Base: semantics.
7  *
8  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
9  *
10  *		This program is free software; you can redistribute it and/or
11  *		modify it under the terms of the GNU General Public License
12  *		as published by the Free Software Foundation; either version
13  *		2 of the License, or (at your option) any later version.
14  */
15 
16 #include <asm/uaccess.h>
17 #include <asm/system.h>
18 #include <linux/bitops.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/jiffies.h>
22 #include <linux/mm.h>
23 #include <linux/string.h>
24 #include <linux/socket.h>
25 #include <linux/sockios.h>
26 #include <linux/errno.h>
27 #include <linux/in.h>
28 #include <linux/inet.h>
29 #include <linux/inetdevice.h>
30 #include <linux/netdevice.h>
31 #include <linux/if_arp.h>
32 #include <linux/proc_fs.h>
33 #include <linux/skbuff.h>
34 #include <linux/init.h>
35 
36 #include <net/arp.h>
37 #include <net/ip.h>
38 #include <net/protocol.h>
39 #include <net/route.h>
40 #include <net/tcp.h>
41 #include <net/sock.h>
42 #include <net/ip_fib.h>
43 #include <net/netlink.h>
44 #include <net/nexthop.h>
45 
46 #include "fib_lookup.h"
47 
/*
 * Global fib_info bookkeeping.
 *
 * fib_info_lock is taken in this file around insertions into and removals
 * from the three hash tables below (see fib_create_info(),
 * fib_release_info() and fib_hash_move()).
 */
static DEFINE_SPINLOCK(fib_info_lock);
/* All linked fib_info records, bucket chosen by fib_info_hashfn(). */
static struct hlist_head *fib_info_hash;
/* fib_info records with a non-zero fib_prefsrc, bucket chosen by fib_laddr_hashfn(). */
static struct hlist_head *fib_info_laddrhash;
/* Bucket count of the two tables above; 0 until fib_create_info() first allocates them. */
static unsigned int fib_hash_size;
/* Number of allocated fib_info records: ++ in fib_create_info(), -- in free_fib_info(). */
static unsigned int fib_info_cnt;

/* Nexthops that have a device attached, bucket chosen by fib_devindex_hashfn(ifindex). */
#define DEVINDEX_HASHBITS 8
#define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
57 
/*
 * Nexthop iteration helpers.
 *
 * for_nexthops(fi) / change_nexthops(fi) open a new scope that declares
 *	int nhsel;		- index of the current nexthop
 *	[const] struct fib_nh *nh;	- the current nexthop
 * followed by a for statement; the caller supplies the loop body.
 * The scope MUST be closed with endfor_nexthops(fi), which expands to the
 * matching '}'.  for_nexthops() gives read-only access (const nh),
 * change_nexthops() gives a writable nh.
 *
 * Because nhsel and nh live in the scope opened by the macro, they are still
 * visible between the end of the loop body and endfor_nexthops().
 */
#ifdef CONFIG_IP_ROUTE_MULTIPATH

/* Taken in this file around updates of fi->fib_power and nh->nh_power. */
static DEFINE_SPINLOCK(fib_multipath_lock);

/* Multipath build: walk all fi->fib_nhs nexthops. */
#define for_nexthops(fi) { int nhsel; const struct fib_nh * nh; \
for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++)

#define change_nexthops(fi) { int nhsel; struct fib_nh * nh; \
for (nhsel=0, nh = (struct fib_nh*)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nh++, nhsel++)

#else /* CONFIG_IP_ROUTE_MULTIPATH */

/*
 * Single-path build: the body runs exactly once, for fib_nh[0].
 * Hopefully gcc optimizes the dummy loop away.
 */

#define for_nexthops(fi) { int nhsel=0; const struct fib_nh * nh = (fi)->fib_nh; \
for (nhsel=0; nhsel < 1; nhsel++)

#define change_nexthops(fi) { int nhsel=0; struct fib_nh * nh = (struct fib_nh*)((fi)->fib_nh); \
for (nhsel=0; nhsel < 1; nhsel++)

#endif /* CONFIG_IP_ROUTE_MULTIPATH */

/* Closes the scope opened by for_nexthops() / change_nexthops(). */
#define endfor_nexthops(fi) }
81 
82 
83 static const struct
84 {
85 	int	error;
86 	u8	scope;
87 } fib_props[RTN_MAX + 1] = {
88 	{
89 		.error	= 0,
90 		.scope	= RT_SCOPE_NOWHERE,
91 	},	/* RTN_UNSPEC */
92 	{
93 		.error	= 0,
94 		.scope	= RT_SCOPE_UNIVERSE,
95 	},	/* RTN_UNICAST */
96 	{
97 		.error	= 0,
98 		.scope	= RT_SCOPE_HOST,
99 	},	/* RTN_LOCAL */
100 	{
101 		.error	= 0,
102 		.scope	= RT_SCOPE_LINK,
103 	},	/* RTN_BROADCAST */
104 	{
105 		.error	= 0,
106 		.scope	= RT_SCOPE_LINK,
107 	},	/* RTN_ANYCAST */
108 	{
109 		.error	= 0,
110 		.scope	= RT_SCOPE_UNIVERSE,
111 	},	/* RTN_MULTICAST */
112 	{
113 		.error	= -EINVAL,
114 		.scope	= RT_SCOPE_UNIVERSE,
115 	},	/* RTN_BLACKHOLE */
116 	{
117 		.error	= -EHOSTUNREACH,
118 		.scope	= RT_SCOPE_UNIVERSE,
119 	},	/* RTN_UNREACHABLE */
120 	{
121 		.error	= -EACCES,
122 		.scope	= RT_SCOPE_UNIVERSE,
123 	},	/* RTN_PROHIBIT */
124 	{
125 		.error	= -EAGAIN,
126 		.scope	= RT_SCOPE_UNIVERSE,
127 	},	/* RTN_THROW */
128 	{
129 		.error	= -EINVAL,
130 		.scope	= RT_SCOPE_NOWHERE,
131 	},	/* RTN_NAT */
132 	{
133 		.error	= -EINVAL,
134 		.scope	= RT_SCOPE_NOWHERE,
135 	},	/* RTN_XRESOLVE */
136 };
137 
138 
139 /* Release a nexthop info record */
140 
141 void free_fib_info(struct fib_info *fi)
142 {
143 	if (fi->fib_dead == 0) {
144 		printk(KERN_WARNING "Freeing alive fib_info %p\n", fi);
145 		return;
146 	}
147 	change_nexthops(fi) {
148 		if (nh->nh_dev)
149 			dev_put(nh->nh_dev);
150 		nh->nh_dev = NULL;
151 	} endfor_nexthops(fi);
152 	fib_info_cnt--;
153 	release_net(fi->fib_net);
154 	kfree(fi);
155 }
156 
157 void fib_release_info(struct fib_info *fi)
158 {
159 	spin_lock_bh(&fib_info_lock);
160 	if (fi && --fi->fib_treeref == 0) {
161 		hlist_del(&fi->fib_hash);
162 		if (fi->fib_prefsrc)
163 			hlist_del(&fi->fib_lhash);
164 		change_nexthops(fi) {
165 			if (!nh->nh_dev)
166 				continue;
167 			hlist_del(&nh->nh_hash);
168 		} endfor_nexthops(fi)
169 		fi->fib_dead = 1;
170 		fib_info_put(fi);
171 	}
172 	spin_unlock_bh(&fib_info_lock);
173 }
174 
175 static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
176 {
177 	const struct fib_nh *onh = ofi->fib_nh;
178 
179 	for_nexthops(fi) {
180 		if (nh->nh_oif != onh->nh_oif ||
181 		    nh->nh_gw  != onh->nh_gw ||
182 		    nh->nh_scope != onh->nh_scope ||
183 #ifdef CONFIG_IP_ROUTE_MULTIPATH
184 		    nh->nh_weight != onh->nh_weight ||
185 #endif
186 #ifdef CONFIG_NET_CLS_ROUTE
187 		    nh->nh_tclassid != onh->nh_tclassid ||
188 #endif
189 		    ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD))
190 			return -1;
191 		onh++;
192 	} endfor_nexthops(fi);
193 	return 0;
194 }
195 
196 static inline unsigned int fib_devindex_hashfn(unsigned int val)
197 {
198 	unsigned int mask = DEVINDEX_HASHSIZE - 1;
199 
200 	return (val ^
201 		(val >> DEVINDEX_HASHBITS) ^
202 		(val >> (DEVINDEX_HASHBITS * 2))) & mask;
203 }
204 
205 static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
206 {
207 	unsigned int mask = (fib_hash_size - 1);
208 	unsigned int val = fi->fib_nhs;
209 
210 	val ^= fi->fib_protocol;
211 	val ^= (__force u32)fi->fib_prefsrc;
212 	val ^= fi->fib_priority;
213 	for_nexthops(fi) {
214 		val ^= fib_devindex_hashfn(nh->nh_oif);
215 	} endfor_nexthops(fi)
216 
217 	return (val ^ (val >> 7) ^ (val >> 12)) & mask;
218 }
219 
220 static struct fib_info *fib_find_info(const struct fib_info *nfi)
221 {
222 	struct hlist_head *head;
223 	struct hlist_node *node;
224 	struct fib_info *fi;
225 	unsigned int hash;
226 
227 	hash = fib_info_hashfn(nfi);
228 	head = &fib_info_hash[hash];
229 
230 	hlist_for_each_entry(fi, node, head, fib_hash) {
231 		if (fi->fib_net != nfi->fib_net)
232 			continue;
233 		if (fi->fib_nhs != nfi->fib_nhs)
234 			continue;
235 		if (nfi->fib_protocol == fi->fib_protocol &&
236 		    nfi->fib_prefsrc == fi->fib_prefsrc &&
237 		    nfi->fib_priority == fi->fib_priority &&
238 		    memcmp(nfi->fib_metrics, fi->fib_metrics,
239 			   sizeof(fi->fib_metrics)) == 0 &&
240 		    ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 &&
241 		    (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
242 			return fi;
243 	}
244 
245 	return NULL;
246 }
247 
248 /* Check, that the gateway is already configured.
249    Used only by redirect accept routine.
250  */
251 
252 int ip_fib_check_default(__be32 gw, struct net_device *dev)
253 {
254 	struct hlist_head *head;
255 	struct hlist_node *node;
256 	struct fib_nh *nh;
257 	unsigned int hash;
258 
259 	spin_lock(&fib_info_lock);
260 
261 	hash = fib_devindex_hashfn(dev->ifindex);
262 	head = &fib_info_devhash[hash];
263 	hlist_for_each_entry(nh, node, head, nh_hash) {
264 		if (nh->nh_dev == dev &&
265 		    nh->nh_gw == gw &&
266 		    !(nh->nh_flags&RTNH_F_DEAD)) {
267 			spin_unlock(&fib_info_lock);
268 			return 0;
269 		}
270 	}
271 
272 	spin_unlock(&fib_info_lock);
273 
274 	return -1;
275 }
276 
277 static inline size_t fib_nlmsg_size(struct fib_info *fi)
278 {
279 	size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg))
280 			 + nla_total_size(4) /* RTA_TABLE */
281 			 + nla_total_size(4) /* RTA_DST */
282 			 + nla_total_size(4) /* RTA_PRIORITY */
283 			 + nla_total_size(4); /* RTA_PREFSRC */
284 
285 	/* space for nested metrics */
286 	payload += nla_total_size((RTAX_MAX * nla_total_size(4)));
287 
288 	if (fi->fib_nhs) {
289 		/* Also handles the special case fib_nhs == 1 */
290 
291 		/* each nexthop is packed in an attribute */
292 		size_t nhsize = nla_total_size(sizeof(struct rtnexthop));
293 
294 		/* may contain flow and gateway attribute */
295 		nhsize += 2 * nla_total_size(4);
296 
297 		/* all nexthops are packed in a nested attribute */
298 		payload += nla_total_size(fi->fib_nhs * nhsize);
299 	}
300 
301 	return payload;
302 }
303 
304 void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
305 	       int dst_len, u32 tb_id, struct nl_info *info,
306 	       unsigned int nlm_flags)
307 {
308 	struct sk_buff *skb;
309 	u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
310 	int err = -ENOBUFS;
311 
312 	skb = nlmsg_new(fib_nlmsg_size(fa->fa_info), GFP_KERNEL);
313 	if (skb == NULL)
314 		goto errout;
315 
316 	err = fib_dump_info(skb, info->pid, seq, event, tb_id,
317 			    fa->fa_type, fa->fa_scope, key, dst_len,
318 			    fa->fa_tos, fa->fa_info, nlm_flags);
319 	if (err < 0) {
320 		/* -EMSGSIZE implies BUG in fib_nlmsg_size() */
321 		WARN_ON(err == -EMSGSIZE);
322 		kfree_skb(skb);
323 		goto errout;
324 	}
325 	err = rtnl_notify(skb, info->nl_net, info->pid, RTNLGRP_IPV4_ROUTE,
326 			  info->nlh, GFP_KERNEL);
327 errout:
328 	if (err < 0)
329 		rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err);
330 }
331 
332 /* Return the first fib alias matching TOS with
333  * priority less than or equal to PRIO.
334  */
335 struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)
336 {
337 	if (fah) {
338 		struct fib_alias *fa;
339 		list_for_each_entry(fa, fah, fa_list) {
340 			if (fa->fa_tos > tos)
341 				continue;
342 			if (fa->fa_info->fib_priority >= prio ||
343 			    fa->fa_tos < tos)
344 				return fa;
345 		}
346 	}
347 	return NULL;
348 }
349 
350 int fib_detect_death(struct fib_info *fi, int order,
351 		     struct fib_info **last_resort, int *last_idx, int dflt)
352 {
353 	struct neighbour *n;
354 	int state = NUD_NONE;
355 
356 	n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev);
357 	if (n) {
358 		state = n->nud_state;
359 		neigh_release(n);
360 	}
361 	if (state==NUD_REACHABLE)
362 		return 0;
363 	if ((state&NUD_VALID) && order != dflt)
364 		return 0;
365 	if ((state&NUD_VALID) ||
366 	    (*last_idx<0 && order > dflt)) {
367 		*last_resort = fi;
368 		*last_idx = order;
369 	}
370 	return 1;
371 }
372 
373 #ifdef CONFIG_IP_ROUTE_MULTIPATH
374 
/*
 * Count the rtnexthop entries in a multipath attribute payload of
 * @remaining bytes starting at @rtnh.
 *
 * Returns the number of entries, or 0 when bytes are left over after the
 * last well-formed entry (invalid nexthop configuration).
 */
static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining)
{
	int count;

	for (count = 0; rtnh_ok(rtnh, remaining); count++)
		rtnh = rtnh_next(rtnh, &remaining);

	/* leftover implies invalid nexthop configuration, discard it */
	if (remaining > 0)
		return 0;

	return count;
}
387 
388 static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
389 		       int remaining, struct fib_config *cfg)
390 {
391 	change_nexthops(fi) {
392 		int attrlen;
393 
394 		if (!rtnh_ok(rtnh, remaining))
395 			return -EINVAL;
396 
397 		nh->nh_flags = (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
398 		nh->nh_oif = rtnh->rtnh_ifindex;
399 		nh->nh_weight = rtnh->rtnh_hops + 1;
400 
401 		attrlen = rtnh_attrlen(rtnh);
402 		if (attrlen > 0) {
403 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
404 
405 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
406 			nh->nh_gw = nla ? nla_get_be32(nla) : 0;
407 #ifdef CONFIG_NET_CLS_ROUTE
408 			nla = nla_find(attrs, attrlen, RTA_FLOW);
409 			nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
410 #endif
411 		}
412 
413 		rtnh = rtnh_next(rtnh, &remaining);
414 	} endfor_nexthops(fi);
415 
416 	return 0;
417 }
418 
419 #endif
420 
421 int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
422 {
423 #ifdef CONFIG_IP_ROUTE_MULTIPATH
424 	struct rtnexthop *rtnh;
425 	int remaining;
426 #endif
427 
428 	if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority)
429 		return 1;
430 
431 	if (cfg->fc_oif || cfg->fc_gw) {
432 		if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) &&
433 		    (!cfg->fc_gw  || cfg->fc_gw == fi->fib_nh->nh_gw))
434 			return 0;
435 		return 1;
436 	}
437 
438 #ifdef CONFIG_IP_ROUTE_MULTIPATH
439 	if (cfg->fc_mp == NULL)
440 		return 0;
441 
442 	rtnh = cfg->fc_mp;
443 	remaining = cfg->fc_mp_len;
444 
445 	for_nexthops(fi) {
446 		int attrlen;
447 
448 		if (!rtnh_ok(rtnh, remaining))
449 			return -EINVAL;
450 
451 		if (rtnh->rtnh_ifindex && rtnh->rtnh_ifindex != nh->nh_oif)
452 			return 1;
453 
454 		attrlen = rtnh_attrlen(rtnh);
455 		if (attrlen < 0) {
456 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
457 
458 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
459 			if (nla && nla_get_be32(nla) != nh->nh_gw)
460 				return 1;
461 #ifdef CONFIG_NET_CLS_ROUTE
462 			nla = nla_find(attrs, attrlen, RTA_FLOW);
463 			if (nla && nla_get_u32(nla) != nh->nh_tclassid)
464 				return 1;
465 #endif
466 		}
467 
468 		rtnh = rtnh_next(rtnh, &remaining);
469 	} endfor_nexthops(fi);
470 #endif
471 	return 0;
472 }
473 
474 
475 /*
476    Picture
477    -------
478 
   The semantics of a nexthop are very messy for historical reasons.
   We have to take into account that:
481    a) gateway can be actually local interface address,
482       so that gatewayed route is direct.
483    b) gateway must be on-link address, possibly
484       described not by an ifaddr, but also by a direct route.
485    c) If both gateway and interface are specified, they should not
486       contradict.
487    d) If we use tunnel routes, gateway could be not on-link.
488 
489    Attempt to reconcile all of these (alas, self-contradictory) conditions
490    results in pretty ugly and hairy code with obscure logic.
491 
   I chose to generalize it instead, so that the size
   of the code practically does not increase, but it becomes
   much more general.
495    Every prefix is assigned a "scope" value: "host" is local address,
496    "link" is direct route,
497    [ ... "site" ... "interior" ... ]
498    and "universe" is true gateway route with global meaning.
499 
500    Every prefix refers to a set of "nexthop"s (gw, oif),
501    where gw must have narrower scope. This recursion stops
502    when gw has LOCAL scope or if "nexthop" is declared ONLINK,
503    which means that gw is forced to be on link.
504 
   The code is still hairy, but now it is apparently logically
   consistent and very flexible. E.g., as a by-product it allows
   independent exterior and interior routing processes
   to co-exist in peace.
509 
510    Normally it looks as following.
511 
512    {universe prefix}  -> (gw, oif) [scope link]
513 			  |
514 			  |-> {link prefix} -> (gw, oif) [scope local]
515 						|
516 						|-> {local prefix} (terminal node)
517  */
518 
519 static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
520 			struct fib_nh *nh)
521 {
522 	int err;
523 	struct net *net;
524 
525 	net = cfg->fc_nlinfo.nl_net;
526 	if (nh->nh_gw) {
527 		struct fib_result res;
528 
529 #ifdef CONFIG_IP_ROUTE_PERVASIVE
530 		if (nh->nh_flags&RTNH_F_PERVASIVE)
531 			return 0;
532 #endif
533 		if (nh->nh_flags&RTNH_F_ONLINK) {
534 			struct net_device *dev;
535 
536 			if (cfg->fc_scope >= RT_SCOPE_LINK)
537 				return -EINVAL;
538 			if (inet_addr_type(net, nh->nh_gw) != RTN_UNICAST)
539 				return -EINVAL;
540 			if ((dev = __dev_get_by_index(net, nh->nh_oif)) == NULL)
541 				return -ENODEV;
542 			if (!(dev->flags&IFF_UP))
543 				return -ENETDOWN;
544 			nh->nh_dev = dev;
545 			dev_hold(dev);
546 			nh->nh_scope = RT_SCOPE_LINK;
547 			return 0;
548 		}
549 		{
550 			struct flowi fl = {
551 				.nl_u = {
552 					.ip4_u = {
553 						.daddr = nh->nh_gw,
554 						.scope = cfg->fc_scope + 1,
555 					},
556 				},
557 				.oif = nh->nh_oif,
558 			};
559 
560 			/* It is not necessary, but requires a bit of thinking */
561 			if (fl.fl4_scope < RT_SCOPE_LINK)
562 				fl.fl4_scope = RT_SCOPE_LINK;
563 			if ((err = fib_lookup(net, &fl, &res)) != 0)
564 				return err;
565 		}
566 		err = -EINVAL;
567 		if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
568 			goto out;
569 		nh->nh_scope = res.scope;
570 		nh->nh_oif = FIB_RES_OIF(res);
571 		if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL)
572 			goto out;
573 		dev_hold(nh->nh_dev);
574 		err = -ENETDOWN;
575 		if (!(nh->nh_dev->flags & IFF_UP))
576 			goto out;
577 		err = 0;
578 out:
579 		fib_res_put(&res);
580 		return err;
581 	} else {
582 		struct in_device *in_dev;
583 
584 		if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK))
585 			return -EINVAL;
586 
587 		in_dev = inetdev_by_index(net, nh->nh_oif);
588 		if (in_dev == NULL)
589 			return -ENODEV;
590 		if (!(in_dev->dev->flags&IFF_UP)) {
591 			in_dev_put(in_dev);
592 			return -ENETDOWN;
593 		}
594 		nh->nh_dev = in_dev->dev;
595 		dev_hold(nh->nh_dev);
596 		nh->nh_scope = RT_SCOPE_HOST;
597 		in_dev_put(in_dev);
598 	}
599 	return 0;
600 }
601 
602 static inline unsigned int fib_laddr_hashfn(__be32 val)
603 {
604 	unsigned int mask = (fib_hash_size - 1);
605 
606 	return ((__force u32)val ^ ((__force u32)val >> 7) ^ ((__force u32)val >> 14)) & mask;
607 }
608 
609 static struct hlist_head *fib_hash_alloc(int bytes)
610 {
611 	if (bytes <= PAGE_SIZE)
612 		return kzalloc(bytes, GFP_KERNEL);
613 	else
614 		return (struct hlist_head *)
615 			__get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(bytes));
616 }
617 
618 static void fib_hash_free(struct hlist_head *hash, int bytes)
619 {
620 	if (!hash)
621 		return;
622 
623 	if (bytes <= PAGE_SIZE)
624 		kfree(hash);
625 	else
626 		free_pages((unsigned long) hash, get_order(bytes));
627 }
628 
629 static void fib_hash_move(struct hlist_head *new_info_hash,
630 			  struct hlist_head *new_laddrhash,
631 			  unsigned int new_size)
632 {
633 	struct hlist_head *old_info_hash, *old_laddrhash;
634 	unsigned int old_size = fib_hash_size;
635 	unsigned int i, bytes;
636 
637 	spin_lock_bh(&fib_info_lock);
638 	old_info_hash = fib_info_hash;
639 	old_laddrhash = fib_info_laddrhash;
640 	fib_hash_size = new_size;
641 
642 	for (i = 0; i < old_size; i++) {
643 		struct hlist_head *head = &fib_info_hash[i];
644 		struct hlist_node *node, *n;
645 		struct fib_info *fi;
646 
647 		hlist_for_each_entry_safe(fi, node, n, head, fib_hash) {
648 			struct hlist_head *dest;
649 			unsigned int new_hash;
650 
651 			hlist_del(&fi->fib_hash);
652 
653 			new_hash = fib_info_hashfn(fi);
654 			dest = &new_info_hash[new_hash];
655 			hlist_add_head(&fi->fib_hash, dest);
656 		}
657 	}
658 	fib_info_hash = new_info_hash;
659 
660 	for (i = 0; i < old_size; i++) {
661 		struct hlist_head *lhead = &fib_info_laddrhash[i];
662 		struct hlist_node *node, *n;
663 		struct fib_info *fi;
664 
665 		hlist_for_each_entry_safe(fi, node, n, lhead, fib_lhash) {
666 			struct hlist_head *ldest;
667 			unsigned int new_hash;
668 
669 			hlist_del(&fi->fib_lhash);
670 
671 			new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
672 			ldest = &new_laddrhash[new_hash];
673 			hlist_add_head(&fi->fib_lhash, ldest);
674 		}
675 	}
676 	fib_info_laddrhash = new_laddrhash;
677 
678 	spin_unlock_bh(&fib_info_lock);
679 
680 	bytes = old_size * sizeof(struct hlist_head *);
681 	fib_hash_free(old_info_hash, bytes);
682 	fib_hash_free(old_laddrhash, bytes);
683 }
684 
685 struct fib_info *fib_create_info(struct fib_config *cfg)
686 {
687 	int err;
688 	struct fib_info *fi = NULL;
689 	struct fib_info *ofi;
690 	int nhs = 1;
691 	struct net *net = cfg->fc_nlinfo.nl_net;
692 
693 	/* Fast check to catch the most weird cases */
694 	if (fib_props[cfg->fc_type].scope > cfg->fc_scope)
695 		goto err_inval;
696 
697 #ifdef CONFIG_IP_ROUTE_MULTIPATH
698 	if (cfg->fc_mp) {
699 		nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len);
700 		if (nhs == 0)
701 			goto err_inval;
702 	}
703 #endif
704 
705 	err = -ENOBUFS;
706 	if (fib_info_cnt >= fib_hash_size) {
707 		unsigned int new_size = fib_hash_size << 1;
708 		struct hlist_head *new_info_hash;
709 		struct hlist_head *new_laddrhash;
710 		unsigned int bytes;
711 
712 		if (!new_size)
713 			new_size = 1;
714 		bytes = new_size * sizeof(struct hlist_head *);
715 		new_info_hash = fib_hash_alloc(bytes);
716 		new_laddrhash = fib_hash_alloc(bytes);
717 		if (!new_info_hash || !new_laddrhash) {
718 			fib_hash_free(new_info_hash, bytes);
719 			fib_hash_free(new_laddrhash, bytes);
720 		} else
721 			fib_hash_move(new_info_hash, new_laddrhash, new_size);
722 
723 		if (!fib_hash_size)
724 			goto failure;
725 	}
726 
727 	fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
728 	if (fi == NULL)
729 		goto failure;
730 	fib_info_cnt++;
731 
732 	fi->fib_net = hold_net(net);
733 	fi->fib_protocol = cfg->fc_protocol;
734 	fi->fib_flags = cfg->fc_flags;
735 	fi->fib_priority = cfg->fc_priority;
736 	fi->fib_prefsrc = cfg->fc_prefsrc;
737 
738 	fi->fib_nhs = nhs;
739 	change_nexthops(fi) {
740 		nh->nh_parent = fi;
741 	} endfor_nexthops(fi)
742 
743 	if (cfg->fc_mx) {
744 		struct nlattr *nla;
745 		int remaining;
746 
747 		nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
748 			int type = nla_type(nla);
749 
750 			if (type) {
751 				if (type > RTAX_MAX)
752 					goto err_inval;
753 				fi->fib_metrics[type - 1] = nla_get_u32(nla);
754 			}
755 		}
756 	}
757 
758 	if (cfg->fc_mp) {
759 #ifdef CONFIG_IP_ROUTE_MULTIPATH
760 		err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg);
761 		if (err != 0)
762 			goto failure;
763 		if (cfg->fc_oif && fi->fib_nh->nh_oif != cfg->fc_oif)
764 			goto err_inval;
765 		if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw)
766 			goto err_inval;
767 #ifdef CONFIG_NET_CLS_ROUTE
768 		if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow)
769 			goto err_inval;
770 #endif
771 #else
772 		goto err_inval;
773 #endif
774 	} else {
775 		struct fib_nh *nh = fi->fib_nh;
776 
777 		nh->nh_oif = cfg->fc_oif;
778 		nh->nh_gw = cfg->fc_gw;
779 		nh->nh_flags = cfg->fc_flags;
780 #ifdef CONFIG_NET_CLS_ROUTE
781 		nh->nh_tclassid = cfg->fc_flow;
782 #endif
783 #ifdef CONFIG_IP_ROUTE_MULTIPATH
784 		nh->nh_weight = 1;
785 #endif
786 	}
787 
788 	if (fib_props[cfg->fc_type].error) {
789 		if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp)
790 			goto err_inval;
791 		goto link_it;
792 	}
793 
794 	if (cfg->fc_scope > RT_SCOPE_HOST)
795 		goto err_inval;
796 
797 	if (cfg->fc_scope == RT_SCOPE_HOST) {
798 		struct fib_nh *nh = fi->fib_nh;
799 
800 		/* Local address is added. */
801 		if (nhs != 1 || nh->nh_gw)
802 			goto err_inval;
803 		nh->nh_scope = RT_SCOPE_NOWHERE;
804 		nh->nh_dev = dev_get_by_index(net, fi->fib_nh->nh_oif);
805 		err = -ENODEV;
806 		if (nh->nh_dev == NULL)
807 			goto failure;
808 	} else {
809 		change_nexthops(fi) {
810 			if ((err = fib_check_nh(cfg, fi, nh)) != 0)
811 				goto failure;
812 		} endfor_nexthops(fi)
813 	}
814 
815 	if (fi->fib_prefsrc) {
816 		if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst ||
817 		    fi->fib_prefsrc != cfg->fc_dst)
818 			if (inet_addr_type(net, fi->fib_prefsrc) != RTN_LOCAL)
819 				goto err_inval;
820 	}
821 
822 link_it:
823 	if ((ofi = fib_find_info(fi)) != NULL) {
824 		fi->fib_dead = 1;
825 		free_fib_info(fi);
826 		ofi->fib_treeref++;
827 		return ofi;
828 	}
829 
830 	fi->fib_treeref++;
831 	atomic_inc(&fi->fib_clntref);
832 	spin_lock_bh(&fib_info_lock);
833 	hlist_add_head(&fi->fib_hash,
834 		       &fib_info_hash[fib_info_hashfn(fi)]);
835 	if (fi->fib_prefsrc) {
836 		struct hlist_head *head;
837 
838 		head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
839 		hlist_add_head(&fi->fib_lhash, head);
840 	}
841 	change_nexthops(fi) {
842 		struct hlist_head *head;
843 		unsigned int hash;
844 
845 		if (!nh->nh_dev)
846 			continue;
847 		hash = fib_devindex_hashfn(nh->nh_dev->ifindex);
848 		head = &fib_info_devhash[hash];
849 		hlist_add_head(&nh->nh_hash, head);
850 	} endfor_nexthops(fi)
851 	spin_unlock_bh(&fib_info_lock);
852 	return fi;
853 
854 err_inval:
855 	err = -EINVAL;
856 
857 failure:
858 	if (fi) {
859 		fi->fib_dead = 1;
860 		free_fib_info(fi);
861 	}
862 
863 	return ERR_PTR(err);
864 }
865 
866 /* Note! fib_semantic_match intentionally uses  RCU list functions. */
867 int fib_semantic_match(struct list_head *head, const struct flowi *flp,
868 		       struct fib_result *res, __be32 zone, __be32 mask,
869 			int prefixlen)
870 {
871 	struct fib_alias *fa;
872 	int nh_sel = 0;
873 
874 	list_for_each_entry_rcu(fa, head, fa_list) {
875 		int err;
876 
877 		if (fa->fa_tos &&
878 		    fa->fa_tos != flp->fl4_tos)
879 			continue;
880 
881 		if (fa->fa_scope < flp->fl4_scope)
882 			continue;
883 
884 		fa->fa_state |= FA_S_ACCESSED;
885 
886 		err = fib_props[fa->fa_type].error;
887 		if (err == 0) {
888 			struct fib_info *fi = fa->fa_info;
889 
890 			if (fi->fib_flags & RTNH_F_DEAD)
891 				continue;
892 
893 			switch (fa->fa_type) {
894 			case RTN_UNICAST:
895 			case RTN_LOCAL:
896 			case RTN_BROADCAST:
897 			case RTN_ANYCAST:
898 			case RTN_MULTICAST:
899 				for_nexthops(fi) {
900 					if (nh->nh_flags&RTNH_F_DEAD)
901 						continue;
902 					if (!flp->oif || flp->oif == nh->nh_oif)
903 						break;
904 				}
905 #ifdef CONFIG_IP_ROUTE_MULTIPATH
906 				if (nhsel < fi->fib_nhs) {
907 					nh_sel = nhsel;
908 					goto out_fill_res;
909 				}
910 #else
911 				if (nhsel < 1) {
912 					goto out_fill_res;
913 				}
914 #endif
915 				endfor_nexthops(fi);
916 				continue;
917 
918 			default:
919 				printk(KERN_WARNING "fib_semantic_match bad type %#x\n",
920 					fa->fa_type);
921 				return -EINVAL;
922 			}
923 		}
924 		return err;
925 	}
926 	return 1;
927 
928 out_fill_res:
929 	res->prefixlen = prefixlen;
930 	res->nh_sel = nh_sel;
931 	res->type = fa->fa_type;
932 	res->scope = fa->fa_scope;
933 	res->fi = fa->fa_info;
934 	atomic_inc(&res->fi->fib_clntref);
935 	return 0;
936 }
937 
938 /* Find appropriate source address to this destination */
939 
940 __be32 __fib_res_prefsrc(struct fib_result *res)
941 {
942 	return inet_select_addr(FIB_RES_DEV(*res), FIB_RES_GW(*res), res->scope);
943 }
944 
945 int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
946 		  u32 tb_id, u8 type, u8 scope, __be32 dst, int dst_len, u8 tos,
947 		  struct fib_info *fi, unsigned int flags)
948 {
949 	struct nlmsghdr *nlh;
950 	struct rtmsg *rtm;
951 
952 	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*rtm), flags);
953 	if (nlh == NULL)
954 		return -EMSGSIZE;
955 
956 	rtm = nlmsg_data(nlh);
957 	rtm->rtm_family = AF_INET;
958 	rtm->rtm_dst_len = dst_len;
959 	rtm->rtm_src_len = 0;
960 	rtm->rtm_tos = tos;
961 	if (tb_id < 256)
962 		rtm->rtm_table = tb_id;
963 	else
964 		rtm->rtm_table = RT_TABLE_COMPAT;
965 	NLA_PUT_U32(skb, RTA_TABLE, tb_id);
966 	rtm->rtm_type = type;
967 	rtm->rtm_flags = fi->fib_flags;
968 	rtm->rtm_scope = scope;
969 	rtm->rtm_protocol = fi->fib_protocol;
970 
971 	if (rtm->rtm_dst_len)
972 		NLA_PUT_BE32(skb, RTA_DST, dst);
973 
974 	if (fi->fib_priority)
975 		NLA_PUT_U32(skb, RTA_PRIORITY, fi->fib_priority);
976 
977 	if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
978 		goto nla_put_failure;
979 
980 	if (fi->fib_prefsrc)
981 		NLA_PUT_BE32(skb, RTA_PREFSRC, fi->fib_prefsrc);
982 
983 	if (fi->fib_nhs == 1) {
984 		if (fi->fib_nh->nh_gw)
985 			NLA_PUT_BE32(skb, RTA_GATEWAY, fi->fib_nh->nh_gw);
986 
987 		if (fi->fib_nh->nh_oif)
988 			NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif);
989 #ifdef CONFIG_NET_CLS_ROUTE
990 		if (fi->fib_nh[0].nh_tclassid)
991 			NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid);
992 #endif
993 	}
994 #ifdef CONFIG_IP_ROUTE_MULTIPATH
995 	if (fi->fib_nhs > 1) {
996 		struct rtnexthop *rtnh;
997 		struct nlattr *mp;
998 
999 		mp = nla_nest_start(skb, RTA_MULTIPATH);
1000 		if (mp == NULL)
1001 			goto nla_put_failure;
1002 
1003 		for_nexthops(fi) {
1004 			rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
1005 			if (rtnh == NULL)
1006 				goto nla_put_failure;
1007 
1008 			rtnh->rtnh_flags = nh->nh_flags & 0xFF;
1009 			rtnh->rtnh_hops = nh->nh_weight - 1;
1010 			rtnh->rtnh_ifindex = nh->nh_oif;
1011 
1012 			if (nh->nh_gw)
1013 				NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw);
1014 #ifdef CONFIG_NET_CLS_ROUTE
1015 			if (nh->nh_tclassid)
1016 				NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid);
1017 #endif
1018 			/* length of rtnetlink header + attributes */
1019 			rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
1020 		} endfor_nexthops(fi);
1021 
1022 		nla_nest_end(skb, mp);
1023 	}
1024 #endif
1025 	return nlmsg_end(skb, nlh);
1026 
1027 nla_put_failure:
1028 	nlmsg_cancel(skb, nlh);
1029 	return -EMSGSIZE;
1030 }
1031 
1032 /*
1033    Update FIB if:
1034    - local address disappeared -> we must delete all the entries
1035      referring to it.
1036    - device went down -> we must shutdown all nexthops going via it.
1037  */
1038 int fib_sync_down_addr(struct net *net, __be32 local)
1039 {
1040 	int ret = 0;
1041 	unsigned int hash = fib_laddr_hashfn(local);
1042 	struct hlist_head *head = &fib_info_laddrhash[hash];
1043 	struct hlist_node *node;
1044 	struct fib_info *fi;
1045 
1046 	if (fib_info_laddrhash == NULL || local == 0)
1047 		return 0;
1048 
1049 	hlist_for_each_entry(fi, node, head, fib_lhash) {
1050 		if (fi->fib_net != net)
1051 			continue;
1052 		if (fi->fib_prefsrc == local) {
1053 			fi->fib_flags |= RTNH_F_DEAD;
1054 			ret++;
1055 		}
1056 	}
1057 	return ret;
1058 }
1059 
1060 int fib_sync_down_dev(struct net_device *dev, int force)
1061 {
1062 	int ret = 0;
1063 	int scope = RT_SCOPE_NOWHERE;
1064 	struct fib_info *prev_fi = NULL;
1065 	unsigned int hash = fib_devindex_hashfn(dev->ifindex);
1066 	struct hlist_head *head = &fib_info_devhash[hash];
1067 	struct hlist_node *node;
1068 	struct fib_nh *nh;
1069 
1070 	if (force)
1071 		scope = -1;
1072 
1073 	hlist_for_each_entry(nh, node, head, nh_hash) {
1074 		struct fib_info *fi = nh->nh_parent;
1075 		int dead;
1076 
1077 		BUG_ON(!fi->fib_nhs);
1078 		if (nh->nh_dev != dev || fi == prev_fi)
1079 			continue;
1080 		prev_fi = fi;
1081 		dead = 0;
1082 		change_nexthops(fi) {
1083 			if (nh->nh_flags&RTNH_F_DEAD)
1084 				dead++;
1085 			else if (nh->nh_dev == dev &&
1086 					nh->nh_scope != scope) {
1087 				nh->nh_flags |= RTNH_F_DEAD;
1088 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1089 				spin_lock_bh(&fib_multipath_lock);
1090 				fi->fib_power -= nh->nh_power;
1091 				nh->nh_power = 0;
1092 				spin_unlock_bh(&fib_multipath_lock);
1093 #endif
1094 				dead++;
1095 			}
1096 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1097 			if (force > 1 && nh->nh_dev == dev) {
1098 				dead = fi->fib_nhs;
1099 				break;
1100 			}
1101 #endif
1102 		} endfor_nexthops(fi)
1103 		if (dead == fi->fib_nhs) {
1104 			fi->fib_flags |= RTNH_F_DEAD;
1105 			ret++;
1106 		}
1107 	}
1108 
1109 	return ret;
1110 }
1111 
1112 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1113 
1114 /*
1115    Dead device goes up. We wake up dead nexthops.
1116    It takes sense only on multipath routes.
1117  */
1118 
1119 int fib_sync_up(struct net_device *dev)
1120 {
1121 	struct fib_info *prev_fi;
1122 	unsigned int hash;
1123 	struct hlist_head *head;
1124 	struct hlist_node *node;
1125 	struct fib_nh *nh;
1126 	int ret;
1127 
1128 	if (!(dev->flags&IFF_UP))
1129 		return 0;
1130 
1131 	prev_fi = NULL;
1132 	hash = fib_devindex_hashfn(dev->ifindex);
1133 	head = &fib_info_devhash[hash];
1134 	ret = 0;
1135 
1136 	hlist_for_each_entry(nh, node, head, nh_hash) {
1137 		struct fib_info *fi = nh->nh_parent;
1138 		int alive;
1139 
1140 		BUG_ON(!fi->fib_nhs);
1141 		if (nh->nh_dev != dev || fi == prev_fi)
1142 			continue;
1143 
1144 		prev_fi = fi;
1145 		alive = 0;
1146 		change_nexthops(fi) {
1147 			if (!(nh->nh_flags&RTNH_F_DEAD)) {
1148 				alive++;
1149 				continue;
1150 			}
1151 			if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP))
1152 				continue;
1153 			if (nh->nh_dev != dev || !__in_dev_get_rtnl(dev))
1154 				continue;
1155 			alive++;
1156 			spin_lock_bh(&fib_multipath_lock);
1157 			nh->nh_power = 0;
1158 			nh->nh_flags &= ~RTNH_F_DEAD;
1159 			spin_unlock_bh(&fib_multipath_lock);
1160 		} endfor_nexthops(fi)
1161 
1162 		if (alive > 0) {
1163 			fi->fib_flags &= ~RTNH_F_DEAD;
1164 			ret++;
1165 		}
1166 	}
1167 
1168 	return ret;
1169 }
1170 
1171 /*
1172    The algorithm is suboptimal, but it provides really
1173    fair weighted route distribution.
1174  */
1175 
1176 void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
1177 {
1178 	struct fib_info *fi = res->fi;
1179 	int w;
1180 
1181 	spin_lock_bh(&fib_multipath_lock);
1182 	if (fi->fib_power <= 0) {
1183 		int power = 0;
1184 		change_nexthops(fi) {
1185 			if (!(nh->nh_flags&RTNH_F_DEAD)) {
1186 				power += nh->nh_weight;
1187 				nh->nh_power = nh->nh_weight;
1188 			}
1189 		} endfor_nexthops(fi);
1190 		fi->fib_power = power;
1191 		if (power <= 0) {
1192 			spin_unlock_bh(&fib_multipath_lock);
1193 			/* Race condition: route has just become dead. */
1194 			res->nh_sel = 0;
1195 			return;
1196 		}
1197 	}
1198 
1199 
1200 	/* w should be random number [0..fi->fib_power-1],
1201 	   it is pretty bad approximation.
1202 	 */
1203 
1204 	w = jiffies % fi->fib_power;
1205 
1206 	change_nexthops(fi) {
1207 		if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) {
1208 			if ((w -= nh->nh_power) <= 0) {
1209 				nh->nh_power--;
1210 				fi->fib_power--;
1211 				res->nh_sel = nhsel;
1212 				spin_unlock_bh(&fib_multipath_lock);
1213 				return;
1214 			}
1215 		}
1216 	} endfor_nexthops(fi);
1217 
1218 	/* Race condition: route has just become dead. */
1219 	res->nh_sel = 0;
1220 	spin_unlock_bh(&fib_multipath_lock);
1221 }
1222 #endif
1223