xref: /openbmc/linux/net/ipv4/fib_semantics.c (revision b6dcefde)
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IPv4 Forwarding Information Base: semantics.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/inetdevice.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/skbuff.h>
#include <linux/init.h>

#include <net/arp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/netlink.h>
#include <net/nexthop.h>

#include "fib_lookup.h"

static DEFINE_SPINLOCK(fib_info_lock);
static struct hlist_head *fib_info_hash;
static struct hlist_head *fib_info_laddrhash;
static unsigned int fib_hash_size;
static unsigned int fib_info_cnt;

#define DEVINDEX_HASHBITS 8
#define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];

#ifdef CONFIG_IP_ROUTE_MULTIPATH

static DEFINE_SPINLOCK(fib_multipath_lock);

#define for_nexthops(fi) { int nhsel; const struct fib_nh * nh; \
for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++)

#define change_nexthops(fi) { int nhsel; struct fib_nh * nh; \
for (nhsel=0, nh = (struct fib_nh *)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nh++, nhsel++)

#else /* CONFIG_IP_ROUTE_MULTIPATH */

/* Hope that gcc will optimize away the dummy loop. */

#define for_nexthops(fi) { int nhsel = 0; const struct fib_nh * nh = (fi)->fib_nh; \
for (nhsel=0; nhsel < 1; nhsel++)

#define change_nexthops(fi) { int nhsel = 0; struct fib_nh * nh = (struct fib_nh *)((fi)->fib_nh); \
for (nhsel=0; nhsel < 1; nhsel++)

#endif /* CONFIG_IP_ROUTE_MULTIPATH */

#define endfor_nexthops(fi) }
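
/*
 * A sketch of how these macros are meant to be read (hypothetical
 * caller; "fi" is any initialized struct fib_info):
 *
 *	for_nexthops(fi) {
 *		if (nh->nh_flags & RTNH_F_DEAD)
 *			continue;
 *		use(nhsel, nh);
 *	} endfor_nexthops(fi);
 *
 * The opening macro starts a block that declares nhsel and nh, and
 * endfor_nexthops() supplies the matching closing brace, so every use
 * must be paired with it.  use() above is a placeholder, not a real
 * helper in this file.
 */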


static const struct
{
	int	error;
	u8	scope;
} fib_props[RTN_MAX + 1] = {
	{
		.error	= 0,
		.scope	= RT_SCOPE_NOWHERE,
	},	/* RTN_UNSPEC */
	{
		.error	= 0,
		.scope	= RT_SCOPE_UNIVERSE,
	},	/* RTN_UNICAST */
	{
		.error	= 0,
		.scope	= RT_SCOPE_HOST,
	},	/* RTN_LOCAL */
	{
		.error	= 0,
		.scope	= RT_SCOPE_LINK,
	},	/* RTN_BROADCAST */
	{
		.error	= 0,
		.scope	= RT_SCOPE_LINK,
	},	/* RTN_ANYCAST */
	{
		.error	= 0,
		.scope	= RT_SCOPE_UNIVERSE,
	},	/* RTN_MULTICAST */
	{
		.error	= -EINVAL,
		.scope	= RT_SCOPE_UNIVERSE,
	},	/* RTN_BLACKHOLE */
	{
		.error	= -EHOSTUNREACH,
		.scope	= RT_SCOPE_UNIVERSE,
	},	/* RTN_UNREACHABLE */
	{
		.error	= -EACCES,
		.scope	= RT_SCOPE_UNIVERSE,
	},	/* RTN_PROHIBIT */
	{
		.error	= -EAGAIN,
		.scope	= RT_SCOPE_UNIVERSE,
	},	/* RTN_THROW */
	{
		.error	= -EINVAL,
		.scope	= RT_SCOPE_NOWHERE,
	},	/* RTN_NAT */
	{
		.error	= -EINVAL,
		.scope	= RT_SCOPE_NOWHERE,
	},	/* RTN_XRESOLVE */
};
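
/*
 * A sketch of how this table is consumed in the rest of this file:
 * the route type indexes the table, a non-zero .error makes lookups
 * fail with that code, and .scope is the narrowest scope a route of
 * that type may be created with.
 *
 *	err = fib_props[fa->fa_type].error;	    (fib_semantic_match)
 *	if (fib_props[cfg->fc_type].scope > cfg->fc_scope)
 *		goto err_inval;			    (fib_create_info)
 */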


/* Release a nexthop info record */

void free_fib_info(struct fib_info *fi)
{
	if (fi->fib_dead == 0) {
		printk(KERN_WARNING "Freeing alive fib_info %p\n", fi);
		return;
	}
	change_nexthops(fi) {
		if (nh->nh_dev)
			dev_put(nh->nh_dev);
		nh->nh_dev = NULL;
	} endfor_nexthops(fi);
	fib_info_cnt--;
	release_net(fi->fib_net);
	kfree(fi);
}

void fib_release_info(struct fib_info *fi)
{
	spin_lock_bh(&fib_info_lock);
	if (fi && --fi->fib_treeref == 0) {
		hlist_del(&fi->fib_hash);
		if (fi->fib_prefsrc)
			hlist_del(&fi->fib_lhash);
		change_nexthops(fi) {
			if (!nh->nh_dev)
				continue;
			hlist_del(&nh->nh_hash);
		} endfor_nexthops(fi)
		fi->fib_dead = 1;
		fib_info_put(fi);
	}
	spin_unlock_bh(&fib_info_lock);
}

static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
{
	const struct fib_nh *onh = ofi->fib_nh;

	for_nexthops(fi) {
		if (nh->nh_oif != onh->nh_oif ||
		    nh->nh_gw  != onh->nh_gw ||
		    nh->nh_scope != onh->nh_scope ||
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		    nh->nh_weight != onh->nh_weight ||
#endif
#ifdef CONFIG_NET_CLS_ROUTE
		    nh->nh_tclassid != onh->nh_tclassid ||
#endif
		    ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD))
			return -1;
		onh++;
	} endfor_nexthops(fi);
	return 0;
}

static inline unsigned int fib_devindex_hashfn(unsigned int val)
{
	unsigned int mask = DEVINDEX_HASHSIZE - 1;

	return (val ^
		(val >> DEVINDEX_HASHBITS) ^
		(val >> (DEVINDEX_HASHBITS * 2))) & mask;
}
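
/*
 * A worked example of the fold above: with DEVINDEX_HASHBITS == 8,
 * an ifindex of 0x00030201 hashes to
 * (0x00030201 ^ 0x00000302 ^ 0x00000003) & 0xff == 0x00,
 * i.e. all three low-order bytes are mixed into the bucket index.
 */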

static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
{
	unsigned int mask = (fib_hash_size - 1);
	unsigned int val = fi->fib_nhs;

	val ^= fi->fib_protocol;
	val ^= (__force u32)fi->fib_prefsrc;
	val ^= fi->fib_priority;
	for_nexthops(fi) {
		val ^= fib_devindex_hashfn(nh->nh_oif);
	} endfor_nexthops(fi)

	return (val ^ (val >> 7) ^ (val >> 12)) & mask;
}

static struct fib_info *fib_find_info(const struct fib_info *nfi)
{
	struct hlist_head *head;
	struct hlist_node *node;
	struct fib_info *fi;
	unsigned int hash;

	hash = fib_info_hashfn(nfi);
	head = &fib_info_hash[hash];

	hlist_for_each_entry(fi, node, head, fib_hash) {
		if (!net_eq(fi->fib_net, nfi->fib_net))
			continue;
		if (fi->fib_nhs != nfi->fib_nhs)
			continue;
		if (nfi->fib_protocol == fi->fib_protocol &&
		    nfi->fib_prefsrc == fi->fib_prefsrc &&
		    nfi->fib_priority == fi->fib_priority &&
		    memcmp(nfi->fib_metrics, fi->fib_metrics,
			   sizeof(fi->fib_metrics)) == 0 &&
		    ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 &&
		    (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
			return fi;
	}

	return NULL;
}

/* Check that the gateway is already configured.
   Used only by the redirect acceptance routine.
 */

int ip_fib_check_default(__be32 gw, struct net_device *dev)
{
	struct hlist_head *head;
	struct hlist_node *node;
	struct fib_nh *nh;
	unsigned int hash;

	spin_lock(&fib_info_lock);

	hash = fib_devindex_hashfn(dev->ifindex);
	head = &fib_info_devhash[hash];
	hlist_for_each_entry(nh, node, head, nh_hash) {
		if (nh->nh_dev == dev &&
		    nh->nh_gw == gw &&
		    !(nh->nh_flags&RTNH_F_DEAD)) {
			spin_unlock(&fib_info_lock);
			return 0;
		}
	}

	spin_unlock(&fib_info_lock);

	return -1;
}

static inline size_t fib_nlmsg_size(struct fib_info *fi)
{
	size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg))
			 + nla_total_size(4) /* RTA_TABLE */
			 + nla_total_size(4) /* RTA_DST */
			 + nla_total_size(4) /* RTA_PRIORITY */
			 + nla_total_size(4); /* RTA_PREFSRC */

	/* space for nested metrics */
	payload += nla_total_size((RTAX_MAX * nla_total_size(4)));

	if (fi->fib_nhs) {
		/* Also handles the special case fib_nhs == 1 */

		/* each nexthop is packed in an attribute */
		size_t nhsize = nla_total_size(sizeof(struct rtnexthop));

		/* may contain flow and gateway attribute */
		nhsize += 2 * nla_total_size(4);

		/* all nexthops are packed in a nested attribute */
		payload += nla_total_size(fi->fib_nhs * nhsize);
	}

	return payload;
}

void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
	       int dst_len, u32 tb_id, struct nl_info *info,
	       unsigned int nlm_flags)
{
	struct sk_buff *skb;
	u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
	int err = -ENOBUFS;

	skb = nlmsg_new(fib_nlmsg_size(fa->fa_info), GFP_KERNEL);
	if (skb == NULL)
		goto errout;

	err = fib_dump_info(skb, info->pid, seq, event, tb_id,
			    fa->fa_type, fa->fa_scope, key, dst_len,
			    fa->fa_tos, fa->fa_info, nlm_flags);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in fib_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	rtnl_notify(skb, info->nl_net, info->pid, RTNLGRP_IPV4_ROUTE,
		    info->nlh, GFP_KERNEL);
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err);
}

/* Return the first fib alias matching TOS with
 * priority less than or equal to PRIO.
 */
struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)
{
	if (fah) {
		struct fib_alias *fa;
		list_for_each_entry(fa, fah, fa_list) {
			if (fa->fa_tos > tos)
				continue;
			if (fa->fa_info->fib_priority >= prio ||
			    fa->fa_tos < tos)
				return fa;
		}
	}
	return NULL;
}
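
/*
 * A worked example, assuming the usual invariant that fah is kept
 * sorted by fa_tos in descending order: for aliases with
 * (fa_tos, fib_priority) of (8, 1), (8, 3), (0, 1),
 * fib_find_alias(fah, 8, 2) skips the first entry (priority 1 < 2 at
 * the same TOS) and returns the second (priority 3 >= 2).
 */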

int fib_detect_death(struct fib_info *fi, int order,
		     struct fib_info **last_resort, int *last_idx, int dflt)
{
	struct neighbour *n;
	int state = NUD_NONE;

	n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev);
	if (n) {
		state = n->nud_state;
		neigh_release(n);
	}
	if (state == NUD_REACHABLE)
		return 0;
	if ((state&NUD_VALID) && order != dflt)
		return 0;
	if ((state&NUD_VALID) ||
	    (*last_idx<0 && order > dflt)) {
		*last_resort = fi;
		*last_idx = order;
	}
	return 1;
}

#ifdef CONFIG_IP_ROUTE_MULTIPATH

static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining)
{
	int nhs = 0;

	while (rtnh_ok(rtnh, remaining)) {
		nhs++;
		rtnh = rtnh_next(rtnh, &remaining);
	}

	/* leftover implies invalid nexthop configuration, discard it */
	return remaining > 0 ? 0 : nhs;
}

static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
		       int remaining, struct fib_config *cfg)
{
	change_nexthops(fi) {
		int attrlen;

		if (!rtnh_ok(rtnh, remaining))
			return -EINVAL;

		nh->nh_flags = (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
		nh->nh_oif = rtnh->rtnh_ifindex;
		nh->nh_weight = rtnh->rtnh_hops + 1;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			nh->nh_gw = nla ? nla_get_be32(nla) : 0;
#ifdef CONFIG_NET_CLS_ROUTE
			nla = nla_find(attrs, attrlen, RTA_FLOW);
			nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
#endif
		}

		rtnh = rtnh_next(rtnh, &remaining);
	} endfor_nexthops(fi);

	return 0;
}
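
/*
 * For reference, the netlink layout the two helpers above parse; a
 * multipath route carries one RTA_MULTIPATH attribute whose payload
 * is a sequence of rtnexthop headers, each optionally followed by
 * per-nexthop attributes:
 *
 *	RTA_MULTIPATH
 *	    struct rtnexthop  [RTA_GATEWAY] [RTA_FLOW]
 *	    struct rtnexthop  [RTA_GATEWAY] [RTA_FLOW]
 *	    ...
 *
 * fib_count_nexthops() walks the headers to size the fib_info, and
 * fib_get_nhs() then fills one fib_nh from each entry.
 */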

#endif

int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	struct rtnexthop *rtnh;
	int remaining;
#endif

	if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority)
		return 1;

	if (cfg->fc_oif || cfg->fc_gw) {
		if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) &&
		    (!cfg->fc_gw  || cfg->fc_gw == fi->fib_nh->nh_gw))
			return 0;
		return 1;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (cfg->fc_mp == NULL)
		return 0;

	rtnh = cfg->fc_mp;
	remaining = cfg->fc_mp_len;

	for_nexthops(fi) {
		int attrlen;

		if (!rtnh_ok(rtnh, remaining))
			return -EINVAL;

		if (rtnh->rtnh_ifindex && rtnh->rtnh_ifindex != nh->nh_oif)
			return 1;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla && nla_get_be32(nla) != nh->nh_gw)
				return 1;
#ifdef CONFIG_NET_CLS_ROUTE
			nla = nla_find(attrs, attrlen, RTA_FLOW);
			if (nla && nla_get_u32(nla) != nh->nh_tclassid)
				return 1;
#endif
		}

		rtnh = rtnh_next(rtnh, &remaining);
	} endfor_nexthops(fi);
#endif
	return 0;
}


/*
   Picture
   -------

   The semantics of nexthops are very messy for historical reasons.
   We have to take into account that:
   a) the gateway can actually be a local interface address,
      so that a gatewayed route is in fact direct.
   b) the gateway must be an on-link address, possibly
      described not by an ifaddr but by a direct route.
   c) if both gateway and interface are specified, they must not
      contradict each other.
   d) if we use tunnel routes, the gateway may not be on-link.

   Attempting to reconcile all of these (alas, self-contradictory)
   conditions results in pretty ugly and hairy code with obscure logic.

   I chose to generalize it instead, so that the size of the code
   barely increases, but the result is much more general.
   Every prefix is assigned a "scope" value: "host" is a local address,
   "link" is a direct route,
   [ ... "site" ... "interior" ... ]
   and "universe" is a true gateway route with global meaning.

   Every prefix refers to a set of "nexthop"s (gw, oif),
   where gw must have narrower scope. This recursion stops
   when gw has LOCAL scope or when the "nexthop" is declared ONLINK,
   which means that gw is forced to be on-link.

   The code is still hairy, but now it is apparently logically
   consistent and very flexible. E.g. as a by-product it allows
   independent exterior and interior routing processes to
   coexist in peace.

   Normally it looks like the following.

   {universe prefix}  -> (gw, oif) [scope link]
			  |
			  |-> {link prefix} -> (gw, oif) [scope local]
						|
						|-> {local prefix} (terminal node)
 */
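
/*
 * A concrete example of the scope rule above, in terms of the ip(8)
 * commands that would generate such requests (illustrative only):
 *
 *	ip addr add 192.168.1.2/24 dev eth0        - local prefix (host)
 *	ip route add 192.168.1.0/24 dev eth0       - link prefix
 *	ip route add 10.0.0.0/8 via 192.168.1.1    - universe prefix; the
 *	                                             gateway must resolve
 *	                                             with scope link or
 *	                                             narrower
 *
 * fib_check_nh() below implements the recursion: it looks up the
 * gateway with scope greater than the route's own scope and inherits
 * the resulting device and scope into the nexthop.
 */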

static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
			struct fib_nh *nh)
{
	int err;
	struct net *net;

	net = cfg->fc_nlinfo.nl_net;
	if (nh->nh_gw) {
		struct fib_result res;

#ifdef CONFIG_IP_ROUTE_PERVASIVE
		if (nh->nh_flags&RTNH_F_PERVASIVE)
			return 0;
#endif
		if (nh->nh_flags&RTNH_F_ONLINK) {
			struct net_device *dev;

			if (cfg->fc_scope >= RT_SCOPE_LINK)
				return -EINVAL;
			if (inet_addr_type(net, nh->nh_gw) != RTN_UNICAST)
				return -EINVAL;
			if ((dev = __dev_get_by_index(net, nh->nh_oif)) == NULL)
				return -ENODEV;
			if (!(dev->flags&IFF_UP))
				return -ENETDOWN;
			nh->nh_dev = dev;
			dev_hold(dev);
			nh->nh_scope = RT_SCOPE_LINK;
			return 0;
		}
		{
			struct flowi fl = {
				.nl_u = {
					.ip4_u = {
						.daddr = nh->nh_gw,
						.scope = cfg->fc_scope + 1,
					},
				},
				.oif = nh->nh_oif,
			};

			/* This clamp is not strictly necessary, but
			   seeing why requires a bit of thinking. */
			if (fl.fl4_scope < RT_SCOPE_LINK)
				fl.fl4_scope = RT_SCOPE_LINK;
			if ((err = fib_lookup(net, &fl, &res)) != 0)
				return err;
		}
		err = -EINVAL;
		if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
			goto out;
		nh->nh_scope = res.scope;
		nh->nh_oif = FIB_RES_OIF(res);
		if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL)
			goto out;
		dev_hold(nh->nh_dev);
		err = -ENETDOWN;
		if (!(nh->nh_dev->flags & IFF_UP))
			goto out;
		err = 0;
out:
		fib_res_put(&res);
		return err;
	} else {
		struct in_device *in_dev;

		if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK))
			return -EINVAL;

		in_dev = inetdev_by_index(net, nh->nh_oif);
		if (in_dev == NULL)
			return -ENODEV;
		if (!(in_dev->dev->flags&IFF_UP)) {
			in_dev_put(in_dev);
			return -ENETDOWN;
		}
		nh->nh_dev = in_dev->dev;
		dev_hold(nh->nh_dev);
		nh->nh_scope = RT_SCOPE_HOST;
		in_dev_put(in_dev);
	}
	return 0;
}

static inline unsigned int fib_laddr_hashfn(__be32 val)
{
	unsigned int mask = (fib_hash_size - 1);

	return ((__force u32)val ^ ((__force u32)val >> 7) ^ ((__force u32)val >> 14)) & mask;
}

static struct hlist_head *fib_hash_alloc(int bytes)
{
	if (bytes <= PAGE_SIZE)
		return kzalloc(bytes, GFP_KERNEL);
	else
		return (struct hlist_head *)
			__get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(bytes));
}

static void fib_hash_free(struct hlist_head *hash, int bytes)
{
	if (!hash)
		return;

	if (bytes <= PAGE_SIZE)
		kfree(hash);
	else
		free_pages((unsigned long) hash, get_order(bytes));
}

static void fib_hash_move(struct hlist_head *new_info_hash,
			  struct hlist_head *new_laddrhash,
			  unsigned int new_size)
{
	struct hlist_head *old_info_hash, *old_laddrhash;
	unsigned int old_size = fib_hash_size;
	unsigned int i, bytes;

	spin_lock_bh(&fib_info_lock);
	old_info_hash = fib_info_hash;
	old_laddrhash = fib_info_laddrhash;
	fib_hash_size = new_size;

	for (i = 0; i < old_size; i++) {
		struct hlist_head *head = &fib_info_hash[i];
		struct hlist_node *node, *n;
		struct fib_info *fi;

		hlist_for_each_entry_safe(fi, node, n, head, fib_hash) {
			struct hlist_head *dest;
			unsigned int new_hash;

			hlist_del(&fi->fib_hash);

			new_hash = fib_info_hashfn(fi);
			dest = &new_info_hash[new_hash];
			hlist_add_head(&fi->fib_hash, dest);
		}
	}
	fib_info_hash = new_info_hash;

	for (i = 0; i < old_size; i++) {
		struct hlist_head *lhead = &fib_info_laddrhash[i];
		struct hlist_node *node, *n;
		struct fib_info *fi;

		hlist_for_each_entry_safe(fi, node, n, lhead, fib_lhash) {
			struct hlist_head *ldest;
			unsigned int new_hash;

			hlist_del(&fi->fib_lhash);

			new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
			ldest = &new_laddrhash[new_hash];
			hlist_add_head(&fi->fib_lhash, ldest);
		}
	}
	fib_info_laddrhash = new_laddrhash;

	spin_unlock_bh(&fib_info_lock);

	bytes = old_size * sizeof(struct hlist_head *);
	fib_hash_free(old_info_hash, bytes);
	fib_hash_free(old_laddrhash, bytes);
}

struct fib_info *fib_create_info(struct fib_config *cfg)
{
	int err;
	struct fib_info *fi = NULL;
	struct fib_info *ofi;
	int nhs = 1;
	struct net *net = cfg->fc_nlinfo.nl_net;

	/* Fast check to catch the weirdest cases */
	if (fib_props[cfg->fc_type].scope > cfg->fc_scope)
		goto err_inval;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (cfg->fc_mp) {
		nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len);
		if (nhs == 0)
			goto err_inval;
	}
#endif

	err = -ENOBUFS;
	if (fib_info_cnt >= fib_hash_size) {
		unsigned int new_size = fib_hash_size << 1;
		struct hlist_head *new_info_hash;
		struct hlist_head *new_laddrhash;
		unsigned int bytes;

		if (!new_size)
			new_size = 1;
		bytes = new_size * sizeof(struct hlist_head *);
		new_info_hash = fib_hash_alloc(bytes);
		new_laddrhash = fib_hash_alloc(bytes);
		if (!new_info_hash || !new_laddrhash) {
			fib_hash_free(new_info_hash, bytes);
			fib_hash_free(new_laddrhash, bytes);
		} else
			fib_hash_move(new_info_hash, new_laddrhash, new_size);

		if (!fib_hash_size)
			goto failure;
	}

	fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
	if (fi == NULL)
		goto failure;
	fib_info_cnt++;

	fi->fib_net = hold_net(net);
	fi->fib_protocol = cfg->fc_protocol;
	fi->fib_flags = cfg->fc_flags;
	fi->fib_priority = cfg->fc_priority;
	fi->fib_prefsrc = cfg->fc_prefsrc;

	fi->fib_nhs = nhs;
	change_nexthops(fi) {
		nh->nh_parent = fi;
	} endfor_nexthops(fi)

	if (cfg->fc_mx) {
		struct nlattr *nla;
		int remaining;

		nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
			int type = nla_type(nla);

			if (type) {
				if (type > RTAX_MAX)
					goto err_inval;
				fi->fib_metrics[type - 1] = nla_get_u32(nla);
			}
		}
	}

	if (cfg->fc_mp) {
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg);
		if (err != 0)
			goto failure;
		if (cfg->fc_oif && fi->fib_nh->nh_oif != cfg->fc_oif)
			goto err_inval;
		if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw)
			goto err_inval;
#ifdef CONFIG_NET_CLS_ROUTE
		if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow)
			goto err_inval;
#endif
#else
		goto err_inval;
#endif
	} else {
		struct fib_nh *nh = fi->fib_nh;

		nh->nh_oif = cfg->fc_oif;
		nh->nh_gw = cfg->fc_gw;
		nh->nh_flags = cfg->fc_flags;
#ifdef CONFIG_NET_CLS_ROUTE
		nh->nh_tclassid = cfg->fc_flow;
#endif
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		nh->nh_weight = 1;
#endif
	}

	if (fib_props[cfg->fc_type].error) {
		if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp)
			goto err_inval;
		goto link_it;
	}

	if (cfg->fc_scope > RT_SCOPE_HOST)
		goto err_inval;

	if (cfg->fc_scope == RT_SCOPE_HOST) {
		struct fib_nh *nh = fi->fib_nh;

		/* Local address is added. */
		if (nhs != 1 || nh->nh_gw)
			goto err_inval;
		nh->nh_scope = RT_SCOPE_NOWHERE;
		nh->nh_dev = dev_get_by_index(net, fi->fib_nh->nh_oif);
		err = -ENODEV;
		if (nh->nh_dev == NULL)
			goto failure;
	} else {
		change_nexthops(fi) {
			if ((err = fib_check_nh(cfg, fi, nh)) != 0)
				goto failure;
		} endfor_nexthops(fi)
	}

	if (fi->fib_prefsrc) {
		if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst ||
		    fi->fib_prefsrc != cfg->fc_dst)
			if (inet_addr_type(net, fi->fib_prefsrc) != RTN_LOCAL)
				goto err_inval;
	}

link_it:
	if ((ofi = fib_find_info(fi)) != NULL) {
		fi->fib_dead = 1;
		free_fib_info(fi);
		ofi->fib_treeref++;
		return ofi;
	}

	fi->fib_treeref++;
	atomic_inc(&fi->fib_clntref);
	spin_lock_bh(&fib_info_lock);
	hlist_add_head(&fi->fib_hash,
		       &fib_info_hash[fib_info_hashfn(fi)]);
	if (fi->fib_prefsrc) {
		struct hlist_head *head;

		head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
		hlist_add_head(&fi->fib_lhash, head);
	}
	change_nexthops(fi) {
		struct hlist_head *head;
		unsigned int hash;

		if (!nh->nh_dev)
			continue;
		hash = fib_devindex_hashfn(nh->nh_dev->ifindex);
		head = &fib_info_devhash[hash];
		hlist_add_head(&nh->nh_hash, head);
	} endfor_nexthops(fi)
	spin_unlock_bh(&fib_info_lock);
	return fi;

err_inval:
	err = -EINVAL;

failure:
	if (fi) {
		fi->fib_dead = 1;
		free_fib_info(fi);
	}

	return ERR_PTR(err);
}

/* Note! fib_semantic_match intentionally uses RCU list functions. */
int fib_semantic_match(struct list_head *head, const struct flowi *flp,
		       struct fib_result *res, int prefixlen)
{
	struct fib_alias *fa;
	int nh_sel = 0;

	list_for_each_entry_rcu(fa, head, fa_list) {
		int err;

		if (fa->fa_tos &&
		    fa->fa_tos != flp->fl4_tos)
			continue;

		if (fa->fa_scope < flp->fl4_scope)
			continue;

		fa->fa_state |= FA_S_ACCESSED;

		err = fib_props[fa->fa_type].error;
		if (err == 0) {
			struct fib_info *fi = fa->fa_info;

			if (fi->fib_flags & RTNH_F_DEAD)
				continue;

			switch (fa->fa_type) {
			case RTN_UNICAST:
			case RTN_LOCAL:
			case RTN_BROADCAST:
			case RTN_ANYCAST:
			case RTN_MULTICAST:
				for_nexthops(fi) {
					if (nh->nh_flags&RTNH_F_DEAD)
						continue;
					if (!flp->oif || flp->oif == nh->nh_oif)
						break;
				}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
				if (nhsel < fi->fib_nhs) {
					nh_sel = nhsel;
					goto out_fill_res;
				}
#else
				if (nhsel < 1) {
					goto out_fill_res;
				}
#endif
				endfor_nexthops(fi);
				continue;

			default:
				printk(KERN_WARNING "fib_semantic_match bad type %#x\n",
					fa->fa_type);
				return -EINVAL;
			}
		}
		return err;
	}
	return 1;

out_fill_res:
	res->prefixlen = prefixlen;
	res->nh_sel = nh_sel;
	res->type = fa->fa_type;
	res->scope = fa->fa_scope;
	res->fi = fa->fa_info;
	atomic_inc(&res->fi->fib_clntref);
	return 0;
}

/* Find an appropriate source address for this destination. */

__be32 __fib_res_prefsrc(struct fib_result *res)
{
	return inet_select_addr(FIB_RES_DEV(*res), FIB_RES_GW(*res), res->scope);
}

int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
		  u32 tb_id, u8 type, u8 scope, __be32 dst, int dst_len, u8 tos,
		  struct fib_info *fi, unsigned int flags)
{
	struct nlmsghdr *nlh;
	struct rtmsg *rtm;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*rtm), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET;
	rtm->rtm_dst_len = dst_len;
	rtm->rtm_src_len = 0;
	rtm->rtm_tos = tos;
	if (tb_id < 256)
		rtm->rtm_table = tb_id;
	else
		rtm->rtm_table = RT_TABLE_COMPAT;
	NLA_PUT_U32(skb, RTA_TABLE, tb_id);
	rtm->rtm_type = type;
	rtm->rtm_flags = fi->fib_flags;
	rtm->rtm_scope = scope;
	rtm->rtm_protocol = fi->fib_protocol;

	if (rtm->rtm_dst_len)
		NLA_PUT_BE32(skb, RTA_DST, dst);

	if (fi->fib_priority)
		NLA_PUT_U32(skb, RTA_PRIORITY, fi->fib_priority);

	if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
		goto nla_put_failure;

	if (fi->fib_prefsrc)
		NLA_PUT_BE32(skb, RTA_PREFSRC, fi->fib_prefsrc);

	if (fi->fib_nhs == 1) {
		if (fi->fib_nh->nh_gw)
			NLA_PUT_BE32(skb, RTA_GATEWAY, fi->fib_nh->nh_gw);

		if (fi->fib_nh->nh_oif)
			NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif);
#ifdef CONFIG_NET_CLS_ROUTE
		if (fi->fib_nh[0].nh_tclassid)
			NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid);
#endif
	}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (fi->fib_nhs > 1) {
		struct rtnexthop *rtnh;
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (mp == NULL)
			goto nla_put_failure;

		for_nexthops(fi) {
			rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
			if (rtnh == NULL)
				goto nla_put_failure;

			rtnh->rtnh_flags = nh->nh_flags & 0xFF;
			rtnh->rtnh_hops = nh->nh_weight - 1;
			rtnh->rtnh_ifindex = nh->nh_oif;

			if (nh->nh_gw)
				NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw);
#ifdef CONFIG_NET_CLS_ROUTE
			if (nh->nh_tclassid)
				NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid);
#endif
			/* length of rtnetlink header + attributes */
			rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
		} endfor_nexthops(fi);

		nla_nest_end(skb, mp);
	}
#endif
	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

/*
   Update the FIB if:
   - a local address disappeared -> we must delete all the entries
     referring to it.
   - a device went down -> we must shut down all nexthops going via it.
 */
int fib_sync_down_addr(struct net *net, __be32 local)
{
	int ret = 0;
	unsigned int hash = fib_laddr_hashfn(local);
	struct hlist_head *head = &fib_info_laddrhash[hash];
	struct hlist_node *node;
	struct fib_info *fi;

	if (fib_info_laddrhash == NULL || local == 0)
		return 0;

	hlist_for_each_entry(fi, node, head, fib_lhash) {
		if (!net_eq(fi->fib_net, net))
			continue;
		if (fi->fib_prefsrc == local) {
			fi->fib_flags |= RTNH_F_DEAD;
			ret++;
		}
	}
	return ret;
}

int fib_sync_down_dev(struct net_device *dev, int force)
{
	int ret = 0;
	int scope = RT_SCOPE_NOWHERE;
	struct fib_info *prev_fi = NULL;
	unsigned int hash = fib_devindex_hashfn(dev->ifindex);
	struct hlist_head *head = &fib_info_devhash[hash];
	struct hlist_node *node;
	struct fib_nh *nh;

	if (force)
		scope = -1;

	hlist_for_each_entry(nh, node, head, nh_hash) {
		struct fib_info *fi = nh->nh_parent;
		int dead;

		BUG_ON(!fi->fib_nhs);
		if (nh->nh_dev != dev || fi == prev_fi)
			continue;
		prev_fi = fi;
		dead = 0;
		change_nexthops(fi) {
			if (nh->nh_flags&RTNH_F_DEAD)
				dead++;
			else if (nh->nh_dev == dev &&
					nh->nh_scope != scope) {
				nh->nh_flags |= RTNH_F_DEAD;
#ifdef CONFIG_IP_ROUTE_MULTIPATH
				spin_lock_bh(&fib_multipath_lock);
				fi->fib_power -= nh->nh_power;
				nh->nh_power = 0;
				spin_unlock_bh(&fib_multipath_lock);
#endif
				dead++;
			}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
			if (force > 1 && nh->nh_dev == dev) {
				dead = fi->fib_nhs;
				break;
			}
#endif
		} endfor_nexthops(fi)
		if (dead == fi->fib_nhs) {
			fi->fib_flags |= RTNH_F_DEAD;
			ret++;
		}
	}

	return ret;
}

#ifdef CONFIG_IP_ROUTE_MULTIPATH

/*
   A dead device goes up. We wake up its dead nexthops.
   This only makes sense for multipath routes.
 */

int fib_sync_up(struct net_device *dev)
{
	struct fib_info *prev_fi;
	unsigned int hash;
	struct hlist_head *head;
	struct hlist_node *node;
	struct fib_nh *nh;
	int ret;

	if (!(dev->flags&IFF_UP))
		return 0;

	prev_fi = NULL;
	hash = fib_devindex_hashfn(dev->ifindex);
	head = &fib_info_devhash[hash];
	ret = 0;

	hlist_for_each_entry(nh, node, head, nh_hash) {
		struct fib_info *fi = nh->nh_parent;
		int alive;

		BUG_ON(!fi->fib_nhs);
		if (nh->nh_dev != dev || fi == prev_fi)
			continue;

		prev_fi = fi;
		alive = 0;
		change_nexthops(fi) {
			if (!(nh->nh_flags&RTNH_F_DEAD)) {
				alive++;
				continue;
			}
			if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP))
				continue;
			if (nh->nh_dev != dev || !__in_dev_get_rtnl(dev))
				continue;
			alive++;
			spin_lock_bh(&fib_multipath_lock);
			nh->nh_power = 0;
			nh->nh_flags &= ~RTNH_F_DEAD;
			spin_unlock_bh(&fib_multipath_lock);
		} endfor_nexthops(fi)

		if (alive > 0) {
			fi->fib_flags &= ~RTNH_F_DEAD;
			ret++;
		}
	}

	return ret;
}

/*
   The algorithm is suboptimal, but it provides a genuinely
   fair weighted distribution of routes.
 */
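
/*
 * A worked example of the scheme below, assuming two live nexthops
 * with weights 2 and 1: fib_power starts at 3 and each selection
 * decrements both fib_power and the chosen nexthop's nh_power, so a
 * nexthop can be picked at most nh_power times per cycle.  Over each
 * cycle of three selections the first hop is chosen exactly twice and
 * the second exactly once, after which fib_power reaches 0 and both
 * nh_power values are refilled from nh_weight.
 */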

void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
{
	struct fib_info *fi = res->fi;
	int w;

	spin_lock_bh(&fib_multipath_lock);
	if (fi->fib_power <= 0) {
		int power = 0;
		change_nexthops(fi) {
			if (!(nh->nh_flags&RTNH_F_DEAD)) {
				power += nh->nh_weight;
				nh->nh_power = nh->nh_weight;
			}
		} endfor_nexthops(fi);
		fi->fib_power = power;
		if (power <= 0) {
			spin_unlock_bh(&fib_multipath_lock);
			/* Race condition: route has just become dead. */
			res->nh_sel = 0;
			return;
		}
	}


	/* w should be a random number in [0..fi->fib_power-1];
	   jiffies is a pretty bad approximation of one.
	 */

	w = jiffies % fi->fib_power;

	change_nexthops(fi) {
		if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) {
			if ((w -= nh->nh_power) <= 0) {
				nh->nh_power--;
				fi->fib_power--;
				res->nh_sel = nhsel;
				spin_unlock_bh(&fib_multipath_lock);
				return;
			}
		}
	} endfor_nexthops(fi);

	/* Race condition: route has just become dead. */
	res->nh_sel = 0;
	spin_unlock_bh(&fib_multipath_lock);
}
#endif