xref: /openbmc/linux/net/ipv4/fib_semantics.c (revision b04b4f78)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		IPv4 Forwarding Information Base: semantics.
7  *
8  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
9  *
10  *		This program is free software; you can redistribute it and/or
11  *		modify it under the terms of the GNU General Public License
12  *		as published by the Free Software Foundation; either version
13  *		2 of the License, or (at your option) any later version.
14  */
15 
16 #include <asm/uaccess.h>
17 #include <asm/system.h>
18 #include <linux/bitops.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/jiffies.h>
22 #include <linux/mm.h>
23 #include <linux/string.h>
24 #include <linux/socket.h>
25 #include <linux/sockios.h>
26 #include <linux/errno.h>
27 #include <linux/in.h>
28 #include <linux/inet.h>
29 #include <linux/inetdevice.h>
30 #include <linux/netdevice.h>
31 #include <linux/if_arp.h>
32 #include <linux/proc_fs.h>
33 #include <linux/skbuff.h>
34 #include <linux/init.h>
35 
36 #include <net/arp.h>
37 #include <net/ip.h>
38 #include <net/protocol.h>
39 #include <net/route.h>
40 #include <net/tcp.h>
41 #include <net/sock.h>
42 #include <net/ip_fib.h>
43 #include <net/netlink.h>
44 #include <net/nexthop.h>
45 
46 #include "fib_lookup.h"
47 
48 static DEFINE_SPINLOCK(fib_info_lock);
49 static struct hlist_head *fib_info_hash;
50 static struct hlist_head *fib_info_laddrhash;
51 static unsigned int fib_hash_size;
52 static unsigned int fib_info_cnt;
53 
54 #define DEVINDEX_HASHBITS 8
55 #define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
56 static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
57 
58 #ifdef CONFIG_IP_ROUTE_MULTIPATH
59 
60 static DEFINE_SPINLOCK(fib_multipath_lock);
61 
62 #define for_nexthops(fi) { int nhsel; const struct fib_nh * nh; \
63 for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++)
64 
65 #define change_nexthops(fi) { int nhsel; struct fib_nh * nh; \
66 for (nhsel=0, nh = (struct fib_nh *)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nh++, nhsel++)
67 
68 #else /* CONFIG_IP_ROUTE_MULTIPATH */
69 
70 /* Hope, that gcc will optimize it to get rid of dummy loop */
71 
72 #define for_nexthops(fi) { int nhsel = 0; const struct fib_nh * nh = (fi)->fib_nh; \
73 for (nhsel=0; nhsel < 1; nhsel++)
74 
75 #define change_nexthops(fi) { int nhsel = 0; struct fib_nh * nh = (struct fib_nh *)((fi)->fib_nh); \
76 for (nhsel=0; nhsel < 1; nhsel++)
77 
78 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
79 
80 #define endfor_nexthops(fi) }
81 
82 
83 static const struct
84 {
85 	int	error;
86 	u8	scope;
87 } fib_props[RTN_MAX + 1] = {
88 	{
89 		.error	= 0,
90 		.scope	= RT_SCOPE_NOWHERE,
91 	},	/* RTN_UNSPEC */
92 	{
93 		.error	= 0,
94 		.scope	= RT_SCOPE_UNIVERSE,
95 	},	/* RTN_UNICAST */
96 	{
97 		.error	= 0,
98 		.scope	= RT_SCOPE_HOST,
99 	},	/* RTN_LOCAL */
100 	{
101 		.error	= 0,
102 		.scope	= RT_SCOPE_LINK,
103 	},	/* RTN_BROADCAST */
104 	{
105 		.error	= 0,
106 		.scope	= RT_SCOPE_LINK,
107 	},	/* RTN_ANYCAST */
108 	{
109 		.error	= 0,
110 		.scope	= RT_SCOPE_UNIVERSE,
111 	},	/* RTN_MULTICAST */
112 	{
113 		.error	= -EINVAL,
114 		.scope	= RT_SCOPE_UNIVERSE,
115 	},	/* RTN_BLACKHOLE */
116 	{
117 		.error	= -EHOSTUNREACH,
118 		.scope	= RT_SCOPE_UNIVERSE,
119 	},	/* RTN_UNREACHABLE */
120 	{
121 		.error	= -EACCES,
122 		.scope	= RT_SCOPE_UNIVERSE,
123 	},	/* RTN_PROHIBIT */
124 	{
125 		.error	= -EAGAIN,
126 		.scope	= RT_SCOPE_UNIVERSE,
127 	},	/* RTN_THROW */
128 	{
129 		.error	= -EINVAL,
130 		.scope	= RT_SCOPE_NOWHERE,
131 	},	/* RTN_NAT */
132 	{
133 		.error	= -EINVAL,
134 		.scope	= RT_SCOPE_NOWHERE,
135 	},	/* RTN_XRESOLVE */
136 };
137 
138 
139 /* Release a nexthop info record */
140 
/* Finally free a fib_info whose last reference is gone.  The entry must
 * already be marked dead (fib_dead) by fib_release_info()/the error
 * path; freeing a live entry is a bug and is only logged, not done. */
void free_fib_info(struct fib_info *fi)
{
	if (fi->fib_dead == 0) {
		printk(KERN_WARNING "Freeing alive fib_info %p\n", fi);
		return;
	}
	/* Drop each nexthop's reference on its outgoing device. */
	change_nexthops(fi) {
		if (nh->nh_dev)
			dev_put(nh->nh_dev);
		nh->nh_dev = NULL;
	} endfor_nexthops(fi);
	fib_info_cnt--;
	release_net(fi->fib_net);
	kfree(fi);
}
156 
/* Drop one tree reference on @fi.  When the last tree reference goes
 * away the entry is unhashed from the info, local-address and device
 * tables under fib_info_lock, marked dead, and the final fib_info_put()
 * may then free it via free_fib_info(). */
void fib_release_info(struct fib_info *fi)
{
	spin_lock_bh(&fib_info_lock);
	if (fi && --fi->fib_treeref == 0) {
		hlist_del(&fi->fib_hash);
		if (fi->fib_prefsrc)
			hlist_del(&fi->fib_lhash);
		change_nexthops(fi) {
			/* nexthops without a device were never hashed */
			if (!nh->nh_dev)
				continue;
			hlist_del(&nh->nh_hash);
		} endfor_nexthops(fi)
		fi->fib_dead = 1;
		fib_info_put(fi);
	}
	spin_unlock_bh(&fib_info_lock);
}
174 
/* Compare the nexthop arrays of two fib_infos field by field, ignoring
 * the RTNH_F_DEAD bit of the flags.  Returns 0 when they are
 * equivalent, -1 on the first mismatch.  The caller (fib_find_info)
 * has already checked that both entries have the same fib_nhs. */
static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
{
	const struct fib_nh *onh = ofi->fib_nh;

	for_nexthops(fi) {
		if (nh->nh_oif != onh->nh_oif ||
		    nh->nh_gw  != onh->nh_gw ||
		    nh->nh_scope != onh->nh_scope ||
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		    nh->nh_weight != onh->nh_weight ||
#endif
#ifdef CONFIG_NET_CLS_ROUTE
		    nh->nh_tclassid != onh->nh_tclassid ||
#endif
		    ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD))
			return -1;
		onh++;
	} endfor_nexthops(fi);
	return 0;
}
195 
196 static inline unsigned int fib_devindex_hashfn(unsigned int val)
197 {
198 	unsigned int mask = DEVINDEX_HASHSIZE - 1;
199 
200 	return (val ^
201 		(val >> DEVINDEX_HASHBITS) ^
202 		(val >> (DEVINDEX_HASHBITS * 2))) & mask;
203 }
204 
/* Hash a fib_info into fib_info_hash by mixing nexthop count, protocol,
 * preferred source address, priority and each nexthop's device index.
 * fib_hash_size is always a power of two (it doubles starting from 1 in
 * fib_create_info), so masking with (size - 1) is a valid reduction. */
static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
{
	unsigned int mask = (fib_hash_size - 1);
	unsigned int val = fi->fib_nhs;

	val ^= fi->fib_protocol;
	val ^= (__force u32)fi->fib_prefsrc;
	val ^= fi->fib_priority;
	for_nexthops(fi) {
		val ^= fib_devindex_hashfn(nh->nh_oif);
	} endfor_nexthops(fi)

	return (val ^ (val >> 7) ^ (val >> 12)) & mask;
}
219 
/* Look up an already-cached fib_info equivalent to @nfi.  Two entries
 * match when they are in the same namespace and agree on protocol,
 * prefsrc, priority, metrics, flags (modulo RTNH_F_DEAD) and all
 * nexthops (nh_comp).  Returns the cached entry or NULL; no reference
 * is taken here — the caller does that. */
static struct fib_info *fib_find_info(const struct fib_info *nfi)
{
	struct hlist_head *head;
	struct hlist_node *node;
	struct fib_info *fi;
	unsigned int hash;

	hash = fib_info_hashfn(nfi);
	head = &fib_info_hash[hash];

	hlist_for_each_entry(fi, node, head, fib_hash) {
		if (fi->fib_net != nfi->fib_net)
			continue;
		if (fi->fib_nhs != nfi->fib_nhs)
			continue;
		if (nfi->fib_protocol == fi->fib_protocol &&
		    nfi->fib_prefsrc == fi->fib_prefsrc &&
		    nfi->fib_priority == fi->fib_priority &&
		    memcmp(nfi->fib_metrics, fi->fib_metrics,
			   sizeof(fi->fib_metrics)) == 0 &&
		    ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 &&
		    (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
			return fi;
	}

	return NULL;
}
247 
248 /* Check that the gateway is already configured.
249    Used only by redirect accept routine.
250  */
251 
252 int ip_fib_check_default(__be32 gw, struct net_device *dev)
253 {
254 	struct hlist_head *head;
255 	struct hlist_node *node;
256 	struct fib_nh *nh;
257 	unsigned int hash;
258 
259 	spin_lock(&fib_info_lock);
260 
261 	hash = fib_devindex_hashfn(dev->ifindex);
262 	head = &fib_info_devhash[hash];
263 	hlist_for_each_entry(nh, node, head, nh_hash) {
264 		if (nh->nh_dev == dev &&
265 		    nh->nh_gw == gw &&
266 		    !(nh->nh_flags&RTNH_F_DEAD)) {
267 			spin_unlock(&fib_info_lock);
268 			return 0;
269 		}
270 	}
271 
272 	spin_unlock(&fib_info_lock);
273 
274 	return -1;
275 }
276 
277 static inline size_t fib_nlmsg_size(struct fib_info *fi)
278 {
279 	size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg))
280 			 + nla_total_size(4) /* RTA_TABLE */
281 			 + nla_total_size(4) /* RTA_DST */
282 			 + nla_total_size(4) /* RTA_PRIORITY */
283 			 + nla_total_size(4); /* RTA_PREFSRC */
284 
285 	/* space for nested metrics */
286 	payload += nla_total_size((RTAX_MAX * nla_total_size(4)));
287 
288 	if (fi->fib_nhs) {
289 		/* Also handles the special case fib_nhs == 1 */
290 
291 		/* each nexthop is packed in an attribute */
292 		size_t nhsize = nla_total_size(sizeof(struct rtnexthop));
293 
294 		/* may contain flow and gateway attribute */
295 		nhsize += 2 * nla_total_size(4);
296 
297 		/* all nexthops are packed in a nested attribute */
298 		payload += nla_total_size(fi->fib_nhs * nhsize);
299 	}
300 
301 	return payload;
302 }
303 
/* Build and multicast an RTM_NEWROUTE/RTM_DELROUTE notification for
 * alias @fa covering @key/@dst_len in table @tb_id.  On allocation or
 * fill failure the error is recorded on the rtnetlink socket via
 * rtnl_set_sk_err() so listeners notice the lost event. */
void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
	       int dst_len, u32 tb_id, struct nl_info *info,
	       unsigned int nlm_flags)
{
	struct sk_buff *skb;
	u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
	int err = -ENOBUFS;

	skb = nlmsg_new(fib_nlmsg_size(fa->fa_info), GFP_KERNEL);
	if (skb == NULL)
		goto errout;

	err = fib_dump_info(skb, info->pid, seq, event, tb_id,
			    fa->fa_type, fa->fa_scope, key, dst_len,
			    fa->fa_tos, fa->fa_info, nlm_flags);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in fib_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	rtnl_notify(skb, info->nl_net, info->pid, RTNLGRP_IPV4_ROUTE,
		    info->nlh, GFP_KERNEL);
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err);
}
332 
333 /* Return the first fib alias matching TOS with
334  * priority less than or equal to PRIO.
335  */
struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)
{
	if (fah) {
		struct fib_alias *fa;
		list_for_each_entry(fa, fah, fa_list) {
			/* skip entries with a more specific tos */
			if (fa->fa_tos > tos)
				continue;
			/* first entry at this tos with priority >= prio,
			 * or the first entry past the tos group */
			if (fa->fa_info->fib_priority >= prio ||
			    fa->fa_tos < tos)
				return fa;
		}
	}
	return NULL;
}
350 
/* Judge whether default route @fi at position @order looks dead, based
 * on the ARP state of its first gateway.  Returns 0 when the neighbour
 * is REACHABLE, or VALID and @order is not the current default @dflt;
 * returns 1 otherwise.  While returning 1, remember the best fallback
 * candidate seen so far in *last_resort / *last_idx. */
int fib_detect_death(struct fib_info *fi, int order,
		     struct fib_info **last_resort, int *last_idx, int dflt)
{
	struct neighbour *n;
	int state = NUD_NONE;

	n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev);
	if (n) {
		state = n->nud_state;
		neigh_release(n);
	}
	if (state == NUD_REACHABLE)
		return 0;
	if ((state&NUD_VALID) && order != dflt)
		return 0;
	if ((state&NUD_VALID) ||
	    (*last_idx<0 && order > dflt)) {
		*last_resort = fi;
		*last_idx = order;
	}
	return 1;
}
373 
374 #ifdef CONFIG_IP_ROUTE_MULTIPATH
375 
/* Count the rtnexthop records in an RTA_MULTIPATH payload of
 * @remaining bytes.  Returns 0 when trailing bytes are left over,
 * which marks the configuration as malformed. */
static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining)
{
	int count;

	for (count = 0; rtnh_ok(rtnh, remaining); count++)
		rtnh = rtnh_next(rtnh, &remaining);

	if (remaining > 0)
		return 0;

	return count;
}
388 
/* Fill @fi's nexthop array from the RTA_MULTIPATH payload @rtnh of
 * @remaining bytes.  Each rtnexthop may carry nested RTA_GATEWAY and
 * RTA_FLOW attributes.  Returns 0, or -EINVAL when the stream is
 * shorter than fib_count_nexthops() promised. */
static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
		       int remaining, struct fib_config *cfg)
{
	change_nexthops(fi) {
		int attrlen;

		if (!rtnh_ok(rtnh, remaining))
			return -EINVAL;

		/* per-hop flags live in the low byte; upper bits are
		 * inherited from the route-level flags */
		nh->nh_flags = (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
		nh->nh_oif = rtnh->rtnh_ifindex;
		nh->nh_weight = rtnh->rtnh_hops + 1;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			nh->nh_gw = nla ? nla_get_be32(nla) : 0;
#ifdef CONFIG_NET_CLS_ROUTE
			nla = nla_find(attrs, attrlen, RTA_FLOW);
			nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
#endif
		}

		rtnh = rtnh_next(rtnh, &remaining);
	} endfor_nexthops(fi);

	return 0;
}
419 
420 #endif
421 
422 int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
423 {
424 #ifdef CONFIG_IP_ROUTE_MULTIPATH
425 	struct rtnexthop *rtnh;
426 	int remaining;
427 #endif
428 
429 	if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority)
430 		return 1;
431 
432 	if (cfg->fc_oif || cfg->fc_gw) {
433 		if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) &&
434 		    (!cfg->fc_gw  || cfg->fc_gw == fi->fib_nh->nh_gw))
435 			return 0;
436 		return 1;
437 	}
438 
439 #ifdef CONFIG_IP_ROUTE_MULTIPATH
440 	if (cfg->fc_mp == NULL)
441 		return 0;
442 
443 	rtnh = cfg->fc_mp;
444 	remaining = cfg->fc_mp_len;
445 
446 	for_nexthops(fi) {
447 		int attrlen;
448 
449 		if (!rtnh_ok(rtnh, remaining))
450 			return -EINVAL;
451 
452 		if (rtnh->rtnh_ifindex && rtnh->rtnh_ifindex != nh->nh_oif)
453 			return 1;
454 
455 		attrlen = rtnh_attrlen(rtnh);
456 		if (attrlen < 0) {
457 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
458 
459 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
460 			if (nla && nla_get_be32(nla) != nh->nh_gw)
461 				return 1;
462 #ifdef CONFIG_NET_CLS_ROUTE
463 			nla = nla_find(attrs, attrlen, RTA_FLOW);
464 			if (nla && nla_get_u32(nla) != nh->nh_tclassid)
465 				return 1;
466 #endif
467 		}
468 
469 		rtnh = rtnh_next(rtnh, &remaining);
470 	} endfor_nexthops(fi);
471 #endif
472 	return 0;
473 }
474 
475 
476 /*
477    Picture
478    -------
479 
480    Semantics of nexthop is very messy by historical reasons.
481    We have to take into account, that:
482    a) gateway can be actually local interface address,
483       so that gatewayed route is direct.
484    b) gateway must be on-link address, possibly
485       described not by an ifaddr, but also by a direct route.
486    c) If both gateway and interface are specified, they should not
487       contradict.
488    d) If we use tunnel routes, gateway could be not on-link.
489 
490    Attempt to reconcile all of these (alas, self-contradictory) conditions
491    results in pretty ugly and hairy code with obscure logic.
492 
493    I chose to generalize it instead, so that the size
494    of code does not increase practically, but it becomes
495    much more general.
496    Every prefix is assigned a "scope" value: "host" is local address,
497    "link" is direct route,
498    [ ... "site" ... "interior" ... ]
499    and "universe" is true gateway route with global meaning.
500 
501    Every prefix refers to a set of "nexthop"s (gw, oif),
502    where gw must have narrower scope. This recursion stops
503    when gw has LOCAL scope or if "nexthop" is declared ONLINK,
504    which means that gw is forced to be on link.
505 
506    Code is still hairy, but now it is apparently logically
507    consistent and very flexible. F.e. as by-product it allows
508    to co-exists in peace independent exterior and interior
509    routing processes.
510 
511    Normally it looks as following.
512 
513    {universe prefix}  -> (gw, oif) [scope link]
514 			  |
515 			  |-> {link prefix} -> (gw, oif) [scope local]
516 						|
517 						|-> {local prefix} (terminal node)
518  */
519 
/* Resolve and validate one nexthop of a route being added.
 *
 * With a gateway: either RTNH_F_ONLINK forces the gateway to be treated
 * as directly reachable on nh_oif, or a fib_lookup() at a narrower
 * scope finds the route covering the gateway and supplies the device
 * and scope (see the "Picture" comment above for the scope recursion).
 * Without a gateway: nh_oif must name an interface that is up, and the
 * nexthop gets host scope.
 *
 * On success the nexthop holds a device reference in nh_dev.  Returns
 * 0 or a negative errno. */
static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
			struct fib_nh *nh)
{
	int err;
	struct net *net;

	net = cfg->fc_nlinfo.nl_net;
	if (nh->nh_gw) {
		struct fib_result res;

#ifdef CONFIG_IP_ROUTE_PERVASIVE
		/* pervasive gateways skip validation entirely */
		if (nh->nh_flags&RTNH_F_PERVASIVE)
			return 0;
#endif
		if (nh->nh_flags&RTNH_F_ONLINK) {
			struct net_device *dev;

			/* ONLINK only makes sense for gatewayed scopes */
			if (cfg->fc_scope >= RT_SCOPE_LINK)
				return -EINVAL;
			if (inet_addr_type(net, nh->nh_gw) != RTN_UNICAST)
				return -EINVAL;
			if ((dev = __dev_get_by_index(net, nh->nh_oif)) == NULL)
				return -ENODEV;
			if (!(dev->flags&IFF_UP))
				return -ENETDOWN;
			nh->nh_dev = dev;
			dev_hold(dev);
			nh->nh_scope = RT_SCOPE_LINK;
			return 0;
		}
		{
			/* Look up the route that covers the gateway at a
			 * strictly narrower scope than the new route. */
			struct flowi fl = {
				.nl_u = {
					.ip4_u = {
						.daddr = nh->nh_gw,
						.scope = cfg->fc_scope + 1,
					},
				},
				.oif = nh->nh_oif,
			};

			/* It is not necessary, but requires a bit of thinking */
			if (fl.fl4_scope < RT_SCOPE_LINK)
				fl.fl4_scope = RT_SCOPE_LINK;
			if ((err = fib_lookup(net, &fl, &res)) != 0)
				return err;
		}
		err = -EINVAL;
		if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
			goto out;
		nh->nh_scope = res.scope;
		nh->nh_oif = FIB_RES_OIF(res);
		if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL)
			goto out;
		dev_hold(nh->nh_dev);
		err = -ENETDOWN;
		if (!(nh->nh_dev->flags & IFF_UP))
			goto out;
		err = 0;
out:
		fib_res_put(&res);
		return err;
	} else {
		struct in_device *in_dev;

		/* these flags require a gateway */
		if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK))
			return -EINVAL;

		in_dev = inetdev_by_index(net, nh->nh_oif);
		if (in_dev == NULL)
			return -ENODEV;
		if (!(in_dev->dev->flags&IFF_UP)) {
			in_dev_put(in_dev);
			return -ENETDOWN;
		}
		nh->nh_dev = in_dev->dev;
		dev_hold(nh->nh_dev);
		nh->nh_scope = RT_SCOPE_HOST;
		in_dev_put(in_dev);
	}
	return 0;
}
602 
603 static inline unsigned int fib_laddr_hashfn(__be32 val)
604 {
605 	unsigned int mask = (fib_hash_size - 1);
606 
607 	return ((__force u32)val ^ ((__force u32)val >> 7) ^ ((__force u32)val >> 14)) & mask;
608 }
609 
610 static struct hlist_head *fib_hash_alloc(int bytes)
611 {
612 	if (bytes <= PAGE_SIZE)
613 		return kzalloc(bytes, GFP_KERNEL);
614 	else
615 		return (struct hlist_head *)
616 			__get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(bytes));
617 }
618 
619 static void fib_hash_free(struct hlist_head *hash, int bytes)
620 {
621 	if (!hash)
622 		return;
623 
624 	if (bytes <= PAGE_SIZE)
625 		kfree(hash);
626 	else
627 		free_pages((unsigned long) hash, get_order(bytes));
628 }
629 
/* Rehash every fib_info from the current tables into the freshly
 * allocated @new_info_hash / @new_laddrhash of @new_size buckets, then
 * publish the new tables and free the old ones.  All list surgery runs
 * under fib_info_lock.
 *
 * NOTE(review): bytes uses sizeof(struct hlist_head *), matching the
 * allocation site in fib_create_info(); it happens to equal
 * sizeof(struct hlist_head) on common ABIs — confirm before changing
 * either side independently. */
static void fib_hash_move(struct hlist_head *new_info_hash,
			  struct hlist_head *new_laddrhash,
			  unsigned int new_size)
{
	struct hlist_head *old_info_hash, *old_laddrhash;
	unsigned int old_size = fib_hash_size;
	unsigned int i, bytes;

	spin_lock_bh(&fib_info_lock);
	old_info_hash = fib_info_hash;
	old_laddrhash = fib_info_laddrhash;
	/* publish the new size first: the hash functions below must use
	 * it when computing buckets in the new tables */
	fib_hash_size = new_size;

	for (i = 0; i < old_size; i++) {
		struct hlist_head *head = &fib_info_hash[i];
		struct hlist_node *node, *n;
		struct fib_info *fi;

		hlist_for_each_entry_safe(fi, node, n, head, fib_hash) {
			struct hlist_head *dest;
			unsigned int new_hash;

			hlist_del(&fi->fib_hash);

			new_hash = fib_info_hashfn(fi);
			dest = &new_info_hash[new_hash];
			hlist_add_head(&fi->fib_hash, dest);
		}
	}
	fib_info_hash = new_info_hash;

	/* same dance for the local-address (prefsrc) table */
	for (i = 0; i < old_size; i++) {
		struct hlist_head *lhead = &fib_info_laddrhash[i];
		struct hlist_node *node, *n;
		struct fib_info *fi;

		hlist_for_each_entry_safe(fi, node, n, lhead, fib_lhash) {
			struct hlist_head *ldest;
			unsigned int new_hash;

			hlist_del(&fi->fib_lhash);

			new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
			ldest = &new_laddrhash[new_hash];
			hlist_add_head(&fi->fib_lhash, ldest);
		}
	}
	fib_info_laddrhash = new_laddrhash;

	spin_unlock_bh(&fib_info_lock);

	bytes = old_size * sizeof(struct hlist_head *);
	fib_hash_free(old_info_hash, bytes);
	fib_hash_free(old_laddrhash, bytes);
}
685 
/* Build (or reuse) a fib_info from a parsed route request @cfg.
 *
 * Steps: sanity-check type vs scope; grow the hash tables when the
 * entry count would exceed the current size; allocate fi with its
 * trailing nexthop array; fill metrics and nexthops from @cfg;
 * validate the nexthops (fib_check_nh) and the preferred source; then
 * either reuse an equivalent cached fib_info or link the new one into
 * the info/laddr/dev hash tables.
 *
 * Returns the fib_info with a tree reference taken, or ERR_PTR(-errno). */
struct fib_info *fib_create_info(struct fib_config *cfg)
{
	int err;
	struct fib_info *fi = NULL;
	struct fib_info *ofi;
	int nhs = 1;
	struct net *net = cfg->fc_nlinfo.nl_net;

	/* Fast check to catch the most weird cases */
	if (fib_props[cfg->fc_type].scope > cfg->fc_scope)
		goto err_inval;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (cfg->fc_mp) {
		nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len);
		if (nhs == 0)
			goto err_inval;
	}
#endif

	err = -ENOBUFS;
	/* Double the hash tables when full; starts at size 1. */
	if (fib_info_cnt >= fib_hash_size) {
		unsigned int new_size = fib_hash_size << 1;
		struct hlist_head *new_info_hash;
		struct hlist_head *new_laddrhash;
		unsigned int bytes;

		if (!new_size)
			new_size = 1;
		bytes = new_size * sizeof(struct hlist_head *);
		new_info_hash = fib_hash_alloc(bytes);
		new_laddrhash = fib_hash_alloc(bytes);
		if (!new_info_hash || !new_laddrhash) {
			fib_hash_free(new_info_hash, bytes);
			fib_hash_free(new_laddrhash, bytes);
		} else
			fib_hash_move(new_info_hash, new_laddrhash, new_size);

		/* still zero => the very first allocation failed */
		if (!fib_hash_size)
			goto failure;
	}

	/* fib_nh array is allocated inline after the fib_info */
	fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
	if (fi == NULL)
		goto failure;
	fib_info_cnt++;

	fi->fib_net = hold_net(net);
	fi->fib_protocol = cfg->fc_protocol;
	fi->fib_flags = cfg->fc_flags;
	fi->fib_priority = cfg->fc_priority;
	fi->fib_prefsrc = cfg->fc_prefsrc;

	fi->fib_nhs = nhs;
	change_nexthops(fi) {
		nh->nh_parent = fi;
	} endfor_nexthops(fi)

	/* copy RTA_METRICS (indexed by RTAX_* minus one) */
	if (cfg->fc_mx) {
		struct nlattr *nla;
		int remaining;

		nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
			int type = nla_type(nla);

			if (type) {
				if (type > RTAX_MAX)
					goto err_inval;
				fi->fib_metrics[type - 1] = nla_get_u32(nla);
			}
		}
	}

	if (cfg->fc_mp) {
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg);
		if (err != 0)
			goto failure;
		/* top-level oif/gw/flow, if given, must match hop 0 */
		if (cfg->fc_oif && fi->fib_nh->nh_oif != cfg->fc_oif)
			goto err_inval;
		if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw)
			goto err_inval;
#ifdef CONFIG_NET_CLS_ROUTE
		if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow)
			goto err_inval;
#endif
#else
		goto err_inval;
#endif
	} else {
		struct fib_nh *nh = fi->fib_nh;

		nh->nh_oif = cfg->fc_oif;
		nh->nh_gw = cfg->fc_gw;
		nh->nh_flags = cfg->fc_flags;
#ifdef CONFIG_NET_CLS_ROUTE
		nh->nh_tclassid = cfg->fc_flow;
#endif
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		nh->nh_weight = 1;
#endif
	}

	/* error routes (blackhole/unreachable/...) take no nexthops */
	if (fib_props[cfg->fc_type].error) {
		if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp)
			goto err_inval;
		goto link_it;
	}

	if (cfg->fc_scope > RT_SCOPE_HOST)
		goto err_inval;

	if (cfg->fc_scope == RT_SCOPE_HOST) {
		struct fib_nh *nh = fi->fib_nh;

		/* Local address is added. */
		if (nhs != 1 || nh->nh_gw)
			goto err_inval;
		nh->nh_scope = RT_SCOPE_NOWHERE;
		nh->nh_dev = dev_get_by_index(net, fi->fib_nh->nh_oif);
		err = -ENODEV;
		if (nh->nh_dev == NULL)
			goto failure;
	} else {
		change_nexthops(fi) {
			if ((err = fib_check_nh(cfg, fi, nh)) != 0)
				goto failure;
		} endfor_nexthops(fi)
	}

	/* prefsrc must be a local address unless it is the route's own
	 * RTN_LOCAL destination */
	if (fi->fib_prefsrc) {
		if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst ||
		    fi->fib_prefsrc != cfg->fc_dst)
			if (inet_addr_type(net, fi->fib_prefsrc) != RTN_LOCAL)
				goto err_inval;
	}

link_it:
	/* reuse an equivalent cached entry when one exists */
	if ((ofi = fib_find_info(fi)) != NULL) {
		fi->fib_dead = 1;
		free_fib_info(fi);
		ofi->fib_treeref++;
		return ofi;
	}

	fi->fib_treeref++;
	atomic_inc(&fi->fib_clntref);
	spin_lock_bh(&fib_info_lock);
	hlist_add_head(&fi->fib_hash,
		       &fib_info_hash[fib_info_hashfn(fi)]);
	if (fi->fib_prefsrc) {
		struct hlist_head *head;

		head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
		hlist_add_head(&fi->fib_lhash, head);
	}
	change_nexthops(fi) {
		struct hlist_head *head;
		unsigned int hash;

		if (!nh->nh_dev)
			continue;
		hash = fib_devindex_hashfn(nh->nh_dev->ifindex);
		head = &fib_info_devhash[hash];
		hlist_add_head(&nh->nh_hash, head);
	} endfor_nexthops(fi)
	spin_unlock_bh(&fib_info_lock);
	return fi;

err_inval:
	err = -EINVAL;

failure:
	if (fi) {
		fi->fib_dead = 1;
		free_fib_info(fi);
	}

	return ERR_PTR(err);
}
866 
867 /* Note! fib_semantic_match intentionally uses  RCU list functions. */
/* Walk the alias list @head for an entry matching @flp (tos and scope),
 * skipping dead routes, and fill *res from the first usable alias.
 * Returns 0 on success (taking a fib_clntref on res->fi), 1 when
 * nothing matches, or the alias type's error code (e.g. -EACCES for
 * RTN_PROHIBIT) from fib_props. */
int fib_semantic_match(struct list_head *head, const struct flowi *flp,
		       struct fib_result *res, __be32 zone, __be32 mask,
			int prefixlen)
{
	struct fib_alias *fa;
	int nh_sel = 0;

	list_for_each_entry_rcu(fa, head, fa_list) {
		int err;

		/* fa_tos == 0 is a wildcard */
		if (fa->fa_tos &&
		    fa->fa_tos != flp->fl4_tos)
			continue;

		if (fa->fa_scope < flp->fl4_scope)
			continue;

		fa->fa_state |= FA_S_ACCESSED;

		err = fib_props[fa->fa_type].error;
		if (err == 0) {
			struct fib_info *fi = fa->fa_info;

			if (fi->fib_flags & RTNH_F_DEAD)
				continue;

			switch (fa->fa_type) {
			case RTN_UNICAST:
			case RTN_LOCAL:
			case RTN_BROADCAST:
			case RTN_ANYCAST:
			case RTN_MULTICAST:
				/* pick the first live nexthop honouring
				 * the requested oif; note nhsel stays in
				 * scope until endfor_nexthops below */
				for_nexthops(fi) {
					if (nh->nh_flags&RTNH_F_DEAD)
						continue;
					if (!flp->oif || flp->oif == nh->nh_oif)
						break;
				}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
				if (nhsel < fi->fib_nhs) {
					nh_sel = nhsel;
					goto out_fill_res;
				}
#else
				if (nhsel < 1) {
					goto out_fill_res;
				}
#endif
				endfor_nexthops(fi);
				continue;

			default:
				printk(KERN_WARNING "fib_semantic_match bad type %#x\n",
					fa->fa_type);
				return -EINVAL;
			}
		}
		return err;
	}
	return 1;

out_fill_res:
	res->prefixlen = prefixlen;
	res->nh_sel = nh_sel;
	res->type = fa->fa_type;
	res->scope = fa->fa_scope;
	res->fi = fa->fa_info;
	atomic_inc(&res->fi->fib_clntref);
	return 0;
}
938 
939 /* Find appropriate source address to this destination */
940 
941 __be32 __fib_res_prefsrc(struct fib_result *res)
942 {
943 	return inet_select_addr(FIB_RES_DEV(*res), FIB_RES_GW(*res), res->scope);
944 }
945 
/* Fill @skb with one RTM_* route message.  Returns the message length
 * from nlmsg_end() on success or -EMSGSIZE when the skb runs out of
 * tailroom (the NLA_PUT macros jump to nla_put_failure).  A single
 * nexthop is emitted as flat RTA_GATEWAY/RTA_OIF attributes; multiple
 * nexthops are nested as rtnexthop records inside RTA_MULTIPATH. */
int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
		  u32 tb_id, u8 type, u8 scope, __be32 dst, int dst_len, u8 tos,
		  struct fib_info *fi, unsigned int flags)
{
	struct nlmsghdr *nlh;
	struct rtmsg *rtm;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*rtm), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET;
	rtm->rtm_dst_len = dst_len;
	rtm->rtm_src_len = 0;
	rtm->rtm_tos = tos;
	/* table ids above 255 do not fit the u8 rtm_table field; the
	 * full id always goes out in RTA_TABLE */
	if (tb_id < 256)
		rtm->rtm_table = tb_id;
	else
		rtm->rtm_table = RT_TABLE_COMPAT;
	NLA_PUT_U32(skb, RTA_TABLE, tb_id);
	rtm->rtm_type = type;
	rtm->rtm_flags = fi->fib_flags;
	rtm->rtm_scope = scope;
	rtm->rtm_protocol = fi->fib_protocol;

	if (rtm->rtm_dst_len)
		NLA_PUT_BE32(skb, RTA_DST, dst);

	if (fi->fib_priority)
		NLA_PUT_U32(skb, RTA_PRIORITY, fi->fib_priority);

	if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
		goto nla_put_failure;

	if (fi->fib_prefsrc)
		NLA_PUT_BE32(skb, RTA_PREFSRC, fi->fib_prefsrc);

	if (fi->fib_nhs == 1) {
		if (fi->fib_nh->nh_gw)
			NLA_PUT_BE32(skb, RTA_GATEWAY, fi->fib_nh->nh_gw);

		if (fi->fib_nh->nh_oif)
			NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif);
#ifdef CONFIG_NET_CLS_ROUTE
		if (fi->fib_nh[0].nh_tclassid)
			NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid);
#endif
	}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (fi->fib_nhs > 1) {
		struct rtnexthop *rtnh;
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (mp == NULL)
			goto nla_put_failure;

		for_nexthops(fi) {
			rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
			if (rtnh == NULL)
				goto nla_put_failure;

			rtnh->rtnh_flags = nh->nh_flags & 0xFF;
			rtnh->rtnh_hops = nh->nh_weight - 1;
			rtnh->rtnh_ifindex = nh->nh_oif;

			if (nh->nh_gw)
				NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw);
#ifdef CONFIG_NET_CLS_ROUTE
			if (nh->nh_tclassid)
				NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid);
#endif
			/* length of rtnetlink header + attributes */
			rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
		} endfor_nexthops(fi);

		nla_nest_end(skb, mp);
	}
#endif
	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
1032 
1033 /*
1034    Update FIB if:
1035    - local address disappeared -> we must delete all the entries
1036      referring to it.
1037    - device went down -> we must shutdown all nexthops going via it.
1038  */
1039 int fib_sync_down_addr(struct net *net, __be32 local)
1040 {
1041 	int ret = 0;
1042 	unsigned int hash = fib_laddr_hashfn(local);
1043 	struct hlist_head *head = &fib_info_laddrhash[hash];
1044 	struct hlist_node *node;
1045 	struct fib_info *fi;
1046 
1047 	if (fib_info_laddrhash == NULL || local == 0)
1048 		return 0;
1049 
1050 	hlist_for_each_entry(fi, node, head, fib_lhash) {
1051 		if (fi->fib_net != net)
1052 			continue;
1053 		if (fi->fib_prefsrc == local) {
1054 			fi->fib_flags |= RTNH_F_DEAD;
1055 			ret++;
1056 		}
1057 	}
1058 	return ret;
1059 }
1060 
/* Device @dev went down: mark dead every nexthop routed through it,
 * except those whose nh_scope equals @scope (RT_SCOPE_NOWHERE normally;
 * -1, i.e. "match nothing", when @force).  With force > 1 the whole
 * fib_info is killed as soon as any of its nexthops uses @dev.
 * Returns the number of fib_infos fully disabled.
 *
 * Note: the outer @nh cursor over the devhash bucket is shadowed by
 * the nh declared inside change_nexthops(). */
int fib_sync_down_dev(struct net_device *dev, int force)
{
	int ret = 0;
	int scope = RT_SCOPE_NOWHERE;
	struct fib_info *prev_fi = NULL;
	unsigned int hash = fib_devindex_hashfn(dev->ifindex);
	struct hlist_head *head = &fib_info_devhash[hash];
	struct hlist_node *node;
	struct fib_nh *nh;

	if (force)
		scope = -1;

	hlist_for_each_entry(nh, node, head, nh_hash) {
		struct fib_info *fi = nh->nh_parent;
		int dead;

		BUG_ON(!fi->fib_nhs);
		/* a fib_info may appear once per nexthop in this
		 * bucket; prev_fi skips immediate repeats */
		if (nh->nh_dev != dev || fi == prev_fi)
			continue;
		prev_fi = fi;
		dead = 0;
		change_nexthops(fi) {
			if (nh->nh_flags&RTNH_F_DEAD)
				dead++;
			else if (nh->nh_dev == dev &&
					nh->nh_scope != scope) {
				nh->nh_flags |= RTNH_F_DEAD;
#ifdef CONFIG_IP_ROUTE_MULTIPATH
				/* withdraw this hop's share of the
				 * multipath credit pool */
				spin_lock_bh(&fib_multipath_lock);
				fi->fib_power -= nh->nh_power;
				nh->nh_power = 0;
				spin_unlock_bh(&fib_multipath_lock);
#endif
				dead++;
			}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
			if (force > 1 && nh->nh_dev == dev) {
				dead = fi->fib_nhs;
				break;
			}
#endif
		} endfor_nexthops(fi)
		/* all nexthops dead => the whole route is dead */
		if (dead == fi->fib_nhs) {
			fi->fib_flags |= RTNH_F_DEAD;
			ret++;
		}
	}

	return ret;
}
1112 
1113 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1114 
1115 /*
1116    Dead device goes up. We wake up dead nexthops.
1117    It makes sense only on multipath routes.
1118  */
1119 
/* Device @dev came back up: revive RTNH_F_DEAD nexthops pointing at it
 * (meaningful only for multipath routes).  Any fib_info left with at
 * least one live nexthop gets its own DEAD flag cleared.  Returns the
 * number of fib_infos touched.
 *
 * Note: the outer @nh cursor over the devhash bucket is shadowed by
 * the nh declared inside change_nexthops(). */
int fib_sync_up(struct net_device *dev)
{
	struct fib_info *prev_fi;
	unsigned int hash;
	struct hlist_head *head;
	struct hlist_node *node;
	struct fib_nh *nh;
	int ret;

	if (!(dev->flags&IFF_UP))
		return 0;

	prev_fi = NULL;
	hash = fib_devindex_hashfn(dev->ifindex);
	head = &fib_info_devhash[hash];
	ret = 0;

	hlist_for_each_entry(nh, node, head, nh_hash) {
		struct fib_info *fi = nh->nh_parent;
		int alive;

		BUG_ON(!fi->fib_nhs);
		if (nh->nh_dev != dev || fi == prev_fi)
			continue;

		prev_fi = fi;
		alive = 0;
		change_nexthops(fi) {
			if (!(nh->nh_flags&RTNH_F_DEAD)) {
				alive++;
				continue;
			}
			if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP))
				continue;
			/* only revive hops on @dev, and only if it
			 * still has an IPv4 config */
			if (nh->nh_dev != dev || !__in_dev_get_rtnl(dev))
				continue;
			alive++;
			spin_lock_bh(&fib_multipath_lock);
			nh->nh_power = 0;
			nh->nh_flags &= ~RTNH_F_DEAD;
			spin_unlock_bh(&fib_multipath_lock);
		} endfor_nexthops(fi)

		if (alive > 0) {
			fi->fib_flags &= ~RTNH_F_DEAD;
			ret++;
		}
	}

	return ret;
}
1171 
1172 /*
1173    The algorithm is suboptimal, but it provides really
1174    fair weighted route distribution.
1175  */
1176 
/* Weighted nexthop selection for a multipath route, filling
 * res->nh_sel.  Each live nexthop holds nh_power credits; when the
 * route's pool (fib_power) is exhausted it is refilled from the
 * configured weights.  A draw w in [0, fib_power) is taken from
 * jiffies (a crude stand-in for a random number, as noted below) and
 * the nexthop whose credit range contains w is charged one credit and
 * selected.  Falls back to nexthop 0 if the route just died. */
void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
{
	struct fib_info *fi = res->fi;
	int w;

	spin_lock_bh(&fib_multipath_lock);
	if (fi->fib_power <= 0) {
		/* refill every live nexthop's credits */
		int power = 0;
		change_nexthops(fi) {
			if (!(nh->nh_flags&RTNH_F_DEAD)) {
				power += nh->nh_weight;
				nh->nh_power = nh->nh_weight;
			}
		} endfor_nexthops(fi);
		fi->fib_power = power;
		if (power <= 0) {
			spin_unlock_bh(&fib_multipath_lock);
			/* Race condition: route has just become dead. */
			res->nh_sel = 0;
			return;
		}
	}


	/* w should be random number [0..fi->fib_power-1],
	   it is pretty bad approximation.
	 */

	w = jiffies % fi->fib_power;

	change_nexthops(fi) {
		if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) {
			if ((w -= nh->nh_power) <= 0) {
				nh->nh_power--;
				fi->fib_power--;
				res->nh_sel = nhsel;
				spin_unlock_bh(&fib_multipath_lock);
				return;
			}
		}
	} endfor_nexthops(fi);

	/* Race condition: route has just become dead. */
	res->nh_sel = 0;
	spin_unlock_bh(&fib_multipath_lock);
}
1223 #endif
1224