xref: /openbmc/linux/net/ipv4/fib_semantics.c (revision 1fa6ac37)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		IPv4 Forwarding Information Base: semantics.
7  *
8  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
9  *
10  *		This program is free software; you can redistribute it and/or
11  *		modify it under the terms of the GNU General Public License
12  *		as published by the Free Software Foundation; either version
13  *		2 of the License, or (at your option) any later version.
14  */
15 
16 #include <asm/uaccess.h>
17 #include <asm/system.h>
18 #include <linux/bitops.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/jiffies.h>
22 #include <linux/mm.h>
23 #include <linux/string.h>
24 #include <linux/socket.h>
25 #include <linux/sockios.h>
26 #include <linux/errno.h>
27 #include <linux/in.h>
28 #include <linux/inet.h>
29 #include <linux/inetdevice.h>
30 #include <linux/netdevice.h>
31 #include <linux/if_arp.h>
32 #include <linux/proc_fs.h>
33 #include <linux/skbuff.h>
34 #include <linux/init.h>
35 #include <linux/slab.h>
36 
37 #include <net/arp.h>
38 #include <net/ip.h>
39 #include <net/protocol.h>
40 #include <net/route.h>
41 #include <net/tcp.h>
42 #include <net/sock.h>
43 #include <net/ip_fib.h>
44 #include <net/netlink.h>
45 #include <net/nexthop.h>
46 
47 #include "fib_lookup.h"
48 
/* Protects all fib_info hash tables below and the hlist membership
 * of individual fib_info entries. */
static DEFINE_SPINLOCK(fib_info_lock);
static struct hlist_head *fib_info_hash;	/* keyed by fib_info_hashfn() */
static struct hlist_head *fib_info_laddrhash;	/* keyed by fib_prefsrc (local addr) */
static unsigned int fib_hash_size;		/* size of the two tables above */
static unsigned int fib_info_cnt;		/* number of live fib_info entries */

#define DEVINDEX_HASHBITS 8
#define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
/* Nexthops hashed by their device ifindex; fixed-size table. */
static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
58 
#ifdef CONFIG_IP_ROUTE_MULTIPATH

/* Serializes nh_power/fib_power accounting used by multipath selection. */
static DEFINE_SPINLOCK(fib_multipath_lock);

/* Iterate over every nexthop of (fi) as read-only "nh" with index
 * "nhsel".  Opens a brace scope that endfor_nexthops() closes; nhsel
 * is still visible to code between the loop body and endfor_nexthops. */
#define for_nexthops(fi) { int nhsel; const struct fib_nh * nh; \
for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++)

/* Like for_nexthops(), but "nexthop_nh" is writable. */
#define change_nexthops(fi) { int nhsel; struct fib_nh *nexthop_nh; \
for (nhsel=0, nexthop_nh = (struct fib_nh *)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nexthop_nh++, nhsel++)

#else /* CONFIG_IP_ROUTE_MULTIPATH */

/* Hope, that gcc will optimize it to get rid of dummy loop */

/* Single-nexthop variants: exactly one iteration over fib_nh[0]. */
#define for_nexthops(fi) { int nhsel = 0; const struct fib_nh * nh = (fi)->fib_nh; \
for (nhsel=0; nhsel < 1; nhsel++)

#define change_nexthops(fi) { int nhsel = 0; struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \
for (nhsel=0; nhsel < 1; nhsel++)

#endif /* CONFIG_IP_ROUTE_MULTIPATH */

/* Closes the scope opened by for_nexthops()/change_nexthops(). */
#define endfor_nexthops(fi) }
82 
83 
/* Per route-type (RTN_*) properties: the error code that a lookup
 * hitting such a route yields (0 = deliver/forward normally), and the
 * widest scope the route itself may legitimately carry. */
static const struct
{
	int	error;
	u8	scope;
} fib_props[RTN_MAX + 1] = {
	{
		.error	= 0,
		.scope	= RT_SCOPE_NOWHERE,
	},	/* RTN_UNSPEC */
	{
		.error	= 0,
		.scope	= RT_SCOPE_UNIVERSE,
	},	/* RTN_UNICAST */
	{
		.error	= 0,
		.scope	= RT_SCOPE_HOST,
	},	/* RTN_LOCAL */
	{
		.error	= 0,
		.scope	= RT_SCOPE_LINK,
	},	/* RTN_BROADCAST */
	{
		.error	= 0,
		.scope	= RT_SCOPE_LINK,
	},	/* RTN_ANYCAST */
	{
		.error	= 0,
		.scope	= RT_SCOPE_UNIVERSE,
	},	/* RTN_MULTICAST */
	{
		.error	= -EINVAL,
		.scope	= RT_SCOPE_UNIVERSE,
	},	/* RTN_BLACKHOLE */
	{
		.error	= -EHOSTUNREACH,
		.scope	= RT_SCOPE_UNIVERSE,
	},	/* RTN_UNREACHABLE */
	{
		.error	= -EACCES,
		.scope	= RT_SCOPE_UNIVERSE,
	},	/* RTN_PROHIBIT */
	{
		.error	= -EAGAIN,
		.scope	= RT_SCOPE_UNIVERSE,
	},	/* RTN_THROW */
	{
		.error	= -EINVAL,
		.scope	= RT_SCOPE_NOWHERE,
	},	/* RTN_NAT */
	{
		.error	= -EINVAL,
		.scope	= RT_SCOPE_NOWHERE,
	},	/* RTN_XRESOLVE */
};
138 
139 
/* Release a nexthop info record.  The caller must already have marked
 * it dead (fib_dead) and removed it from all hash lists; freeing a
 * still-alive entry is refused with a warning. */

void free_fib_info(struct fib_info *fi)
{
	if (fi->fib_dead == 0) {
		printk(KERN_WARNING "Freeing alive fib_info %p\n", fi);
		return;
	}
	/* Drop the device references taken when the nexthops were bound. */
	change_nexthops(fi) {
		if (nexthop_nh->nh_dev)
			dev_put(nexthop_nh->nh_dev);
		nexthop_nh->nh_dev = NULL;
	} endfor_nexthops(fi);
	fib_info_cnt--;
	release_net(fi->fib_net);
	kfree(fi);
}
157 
/* Drop one tree reference on fi.  When the last one goes, unlink the
 * entry from all hash tables, mark it dead and drop the client
 * reference so the final fib_info_put() can free it. */
void fib_release_info(struct fib_info *fi)
{
	spin_lock_bh(&fib_info_lock);
	if (fi && --fi->fib_treeref == 0) {
		hlist_del(&fi->fib_hash);
		if (fi->fib_prefsrc)
			hlist_del(&fi->fib_lhash);
		change_nexthops(fi) {
			/* Only nexthops with a bound device were hashed. */
			if (!nexthop_nh->nh_dev)
				continue;
			hlist_del(&nexthop_nh->nh_hash);
		} endfor_nexthops(fi)
		fi->fib_dead = 1;
		fib_info_put(fi);
	}
	spin_unlock_bh(&fib_info_lock);
}
175 
/* Compare the nexthop arrays of fi and ofi element by element.  The
 * caller guarantees both have the same fib_nhs.  RTNH_F_DEAD is
 * masked out of the flags comparison.  Returns 0 if equal, -1 if not. */
static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
{
	const struct fib_nh *onh = ofi->fib_nh;

	for_nexthops(fi) {
		if (nh->nh_oif != onh->nh_oif ||
		    nh->nh_gw  != onh->nh_gw ||
		    nh->nh_scope != onh->nh_scope ||
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		    nh->nh_weight != onh->nh_weight ||
#endif
#ifdef CONFIG_NET_CLS_ROUTE
		    nh->nh_tclassid != onh->nh_tclassid ||
#endif
		    ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD))
			return -1;
		onh++;
	} endfor_nexthops(fi);
	return 0;
}
196 
197 static inline unsigned int fib_devindex_hashfn(unsigned int val)
198 {
199 	unsigned int mask = DEVINDEX_HASHSIZE - 1;
200 
201 	return (val ^
202 		(val >> DEVINDEX_HASHBITS) ^
203 		(val >> (DEVINDEX_HASHBITS * 2))) & mask;
204 }
205 
/* Hash a fib_info over the same fields that fib_find_info() compares,
 * so equivalent entries always land in the same fib_info_hash bucket. */
static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
{
	unsigned int mask = (fib_hash_size - 1);
	unsigned int val = fi->fib_nhs;

	val ^= fi->fib_protocol;
	val ^= (__force u32)fi->fib_prefsrc;
	val ^= fi->fib_priority;
	for_nexthops(fi) {
		val ^= fib_devindex_hashfn(nh->nh_oif);
	} endfor_nexthops(fi)

	return (val ^ (val >> 7) ^ (val >> 12)) & mask;
}
220 
/* Look up an existing fib_info equivalent to nfi: same namespace,
 * protocol, prefsrc, priority, metrics, flags (modulo RTNH_F_DEAD)
 * and nexthop array.  Returns it without taking a reference, or NULL.
 * Caller holds the appropriate locking for fib_info_hash. */
static struct fib_info *fib_find_info(const struct fib_info *nfi)
{
	struct hlist_head *head;
	struct hlist_node *node;
	struct fib_info *fi;
	unsigned int hash;

	hash = fib_info_hashfn(nfi);
	head = &fib_info_hash[hash];

	hlist_for_each_entry(fi, node, head, fib_hash) {
		if (!net_eq(fi->fib_net, nfi->fib_net))
			continue;
		if (fi->fib_nhs != nfi->fib_nhs)
			continue;
		if (nfi->fib_protocol == fi->fib_protocol &&
		    nfi->fib_prefsrc == fi->fib_prefsrc &&
		    nfi->fib_priority == fi->fib_priority &&
		    memcmp(nfi->fib_metrics, fi->fib_metrics,
			   sizeof(fi->fib_metrics)) == 0 &&
		    ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 &&
		    (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
			return fi;
	}

	return NULL;
}
248 
/* Check, that the gateway is already configured.
   Used only by redirect accept routine.
   Returns 0 when (dev, gw) is a known live nexthop, -1 otherwise.
 */

int ip_fib_check_default(__be32 gw, struct net_device *dev)
{
	struct hlist_head *head;
	struct hlist_node *node;
	struct fib_nh *nh;
	unsigned int hash;

	spin_lock(&fib_info_lock);

	hash = fib_devindex_hashfn(dev->ifindex);
	head = &fib_info_devhash[hash];
	hlist_for_each_entry(nh, node, head, nh_hash) {
		/* Bucket is shared between devices; match dev explicitly. */
		if (nh->nh_dev == dev &&
		    nh->nh_gw == gw &&
		    !(nh->nh_flags&RTNH_F_DEAD)) {
			spin_unlock(&fib_info_lock);
			return 0;
		}
	}

	spin_unlock(&fib_info_lock);

	return -1;
}
277 
/* Upper-bound estimate of the netlink message size needed by
 * fib_dump_info() for this fib_info, used to size the skb. */
static inline size_t fib_nlmsg_size(struct fib_info *fi)
{
	size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg))
			 + nla_total_size(4) /* RTA_TABLE */
			 + nla_total_size(4) /* RTA_DST */
			 + nla_total_size(4) /* RTA_PRIORITY */
			 + nla_total_size(4); /* RTA_PREFSRC */

	/* space for nested metrics */
	payload += nla_total_size((RTAX_MAX * nla_total_size(4)));

	if (fi->fib_nhs) {
		/* Also handles the special case fib_nhs == 1 */

		/* each nexthop is packed in an attribute */
		size_t nhsize = nla_total_size(sizeof(struct rtnexthop));

		/* may contain flow and gateway attribute */
		nhsize += 2 * nla_total_size(4);

		/* all nexthops are packed in a nested attribute */
		payload += nla_total_size(fi->fib_nhs * nhsize);
	}

	return payload;
}
304 
/* Notify rtnetlink listeners (RTNLGRP_IPV4_ROUTE) of a route event
 * (RTM_NEWROUTE/RTM_DELROUTE) for the given alias.  On any failure,
 * records the error on the netlink socket via rtnl_set_sk_err(). */
void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
	       int dst_len, u32 tb_id, struct nl_info *info,
	       unsigned int nlm_flags)
{
	struct sk_buff *skb;
	/* Echo the requester's sequence number when this is a reply. */
	u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
	int err = -ENOBUFS;

	skb = nlmsg_new(fib_nlmsg_size(fa->fa_info), GFP_KERNEL);
	if (skb == NULL)
		goto errout;

	err = fib_dump_info(skb, info->pid, seq, event, tb_id,
			    fa->fa_type, fa->fa_scope, key, dst_len,
			    fa->fa_tos, fa->fa_info, nlm_flags);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in fib_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	rtnl_notify(skb, info->nl_net, info->pid, RTNLGRP_IPV4_ROUTE,
		    info->nlh, GFP_KERNEL);
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err);
}
333 
/* Return the first fib alias matching TOS with
 * priority less than or equal to PRIO.
 * The list is kept sorted by descending fa_tos, so once fa_tos drops
 * below the requested tos no exact-tos match can follow.
 */
struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)
{
	if (fah) {
		struct fib_alias *fa;
		list_for_each_entry(fa, fah, fa_list) {
			if (fa->fa_tos > tos)
				continue;
			/* Either the tos matches and priority is reached,
			 * or we passed all candidates with this tos. */
			if (fa->fa_info->fib_priority >= prio ||
			    fa->fa_tos < tos)
				return fa;
		}
	}
	return NULL;
}
351 
/* Default-route failover heuristic: decide whether fi's first gateway
 * looks dead judging by its ARP neighbour state.
 *
 * Returns 0 when the gateway is considered alive (NUD_REACHABLE, or
 * any valid state on a non-default entry); returns 1 otherwise, in
 * which case fi may be remembered in *last_resort/*last_idx as the
 * best fallback seen so far. */
int fib_detect_death(struct fib_info *fi, int order,
		     struct fib_info **last_resort, int *last_idx, int dflt)
{
	struct neighbour *n;
	int state = NUD_NONE;

	n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev);
	if (n) {
		state = n->nud_state;
		neigh_release(n);
	}
	if (state == NUD_REACHABLE)
		return 0;
	if ((state&NUD_VALID) && order != dflt)
		return 0;
	if ((state&NUD_VALID) ||
	    (*last_idx<0 && order > dflt)) {
		*last_resort = fi;
		*last_idx = order;
	}
	return 1;
}
374 
375 #ifdef CONFIG_IP_ROUTE_MULTIPATH
376 
/* Count the entries in an RTA_MULTIPATH attribute.  Leftover bytes
 * after the last well-formed entry mean the configuration is
 * malformed, in which case 0 is returned. */
static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining)
{
	int count;

	for (count = 0; rtnh_ok(rtnh, remaining); count++)
		rtnh = rtnh_next(rtnh, &remaining);

	return remaining > 0 ? 0 : count;
}
389 
/* Populate fi's nexthop array from an RTA_MULTIPATH attribute whose
 * entry count was already validated by fib_count_nexthops().
 * Returns 0 on success, -EINVAL on a truncated entry. */
static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
		       int remaining, struct fib_config *cfg)
{
	change_nexthops(fi) {
		int attrlen;

		if (!rtnh_ok(rtnh, remaining))
			return -EINVAL;

		/* Per-nexthop flags occupy the low byte; the remaining
		 * bits are inherited from the route-level flags. */
		nexthop_nh->nh_flags =
			(cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
		nexthop_nh->nh_oif = rtnh->rtnh_ifindex;
		nexthop_nh->nh_weight = rtnh->rtnh_hops + 1;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			nexthop_nh->nh_gw = nla ? nla_get_be32(nla) : 0;
#ifdef CONFIG_NET_CLS_ROUTE
			nla = nla_find(attrs, attrlen, RTA_FLOW);
			nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
#endif
		}

		rtnh = rtnh_next(rtnh, &remaining);
	} endfor_nexthops(fi);

	return 0;
}
421 
422 #endif
423 
424 int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
425 {
426 #ifdef CONFIG_IP_ROUTE_MULTIPATH
427 	struct rtnexthop *rtnh;
428 	int remaining;
429 #endif
430 
431 	if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority)
432 		return 1;
433 
434 	if (cfg->fc_oif || cfg->fc_gw) {
435 		if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) &&
436 		    (!cfg->fc_gw  || cfg->fc_gw == fi->fib_nh->nh_gw))
437 			return 0;
438 		return 1;
439 	}
440 
441 #ifdef CONFIG_IP_ROUTE_MULTIPATH
442 	if (cfg->fc_mp == NULL)
443 		return 0;
444 
445 	rtnh = cfg->fc_mp;
446 	remaining = cfg->fc_mp_len;
447 
448 	for_nexthops(fi) {
449 		int attrlen;
450 
451 		if (!rtnh_ok(rtnh, remaining))
452 			return -EINVAL;
453 
454 		if (rtnh->rtnh_ifindex && rtnh->rtnh_ifindex != nh->nh_oif)
455 			return 1;
456 
457 		attrlen = rtnh_attrlen(rtnh);
458 		if (attrlen < 0) {
459 			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
460 
461 			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
462 			if (nla && nla_get_be32(nla) != nh->nh_gw)
463 				return 1;
464 #ifdef CONFIG_NET_CLS_ROUTE
465 			nla = nla_find(attrs, attrlen, RTA_FLOW);
466 			if (nla && nla_get_u32(nla) != nh->nh_tclassid)
467 				return 1;
468 #endif
469 		}
470 
471 		rtnh = rtnh_next(rtnh, &remaining);
472 	} endfor_nexthops(fi);
473 #endif
474 	return 0;
475 }
476 
477 
478 /*
479    Picture
480    -------
481 
482    Semantics of nexthop is very messy by historical reasons.
483    We have to take into account, that:
484    a) gateway can be actually local interface address,
485       so that gatewayed route is direct.
486    b) gateway must be on-link address, possibly
487       described not by an ifaddr, but also by a direct route.
488    c) If both gateway and interface are specified, they should not
489       contradict.
490    d) If we use tunnel routes, gateway could be not on-link.
491 
492    Attempt to reconcile all of these (alas, self-contradictory) conditions
493    results in pretty ugly and hairy code with obscure logic.
494 
495    I chose to generalize it instead, so that the size
496    of code does not increase practically, but it becomes
497    much more general.
498    Every prefix is assigned a "scope" value: "host" is local address,
499    "link" is direct route,
500    [ ... "site" ... "interior" ... ]
501    and "universe" is true gateway route with global meaning.
502 
503    Every prefix refers to a set of "nexthop"s (gw, oif),
504    where gw must have narrower scope. This recursion stops
505    when gw has LOCAL scope or if "nexthop" is declared ONLINK,
506    which means that gw is forced to be on link.
507 
508    Code is still hairy, but now it is apparently logically
509    consistent and very flexible. F.e. as by-product it allows
510    to co-exist in peace with independent exterior and interior
511    routing processes.
512 
513    Normally it looks as following.
514 
515    {universe prefix}  -> (gw, oif) [scope link]
516 			  |
517 			  |-> {link prefix} -> (gw, oif) [scope local]
518 						|
519 						|-> {local prefix} (terminal node)
520  */
521 
/* Resolve and validate one nexthop of a route being created.
 *
 * For gatewayed nexthops: with RTNH_F_ONLINK the gateway is forced to
 * be directly reachable on the given interface; otherwise a recursive
 * fib_lookup() with a strictly narrower scope resolves the gateway to
 * (scope, oif, dev).  Gatewayless nexthops simply bind to nh->nh_oif.
 *
 * On success a reference on nh->nh_dev is taken and 0 is returned;
 * otherwise a negative errno. */
static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
			struct fib_nh *nh)
{
	int err;
	struct net *net;

	net = cfg->fc_nlinfo.nl_net;
	if (nh->nh_gw) {
		struct fib_result res;

		if (nh->nh_flags&RTNH_F_ONLINK) {
			struct net_device *dev;

			/* ONLINK only makes sense for gatewayed scopes. */
			if (cfg->fc_scope >= RT_SCOPE_LINK)
				return -EINVAL;
			if (inet_addr_type(net, nh->nh_gw) != RTN_UNICAST)
				return -EINVAL;
			if ((dev = __dev_get_by_index(net, nh->nh_oif)) == NULL)
				return -ENODEV;
			if (!(dev->flags&IFF_UP))
				return -ENETDOWN;
			nh->nh_dev = dev;
			dev_hold(dev);
			nh->nh_scope = RT_SCOPE_LINK;
			return 0;
		}
		{
			/* Resolve the gateway with a narrower scope so the
			 * recursion terminates (see the comment above). */
			struct flowi fl = {
				.nl_u = {
					.ip4_u = {
						.daddr = nh->nh_gw,
						.scope = cfg->fc_scope + 1,
					},
				},
				.oif = nh->nh_oif,
			};

			/* It is not necessary, but requires a bit of thinking */
			if (fl.fl4_scope < RT_SCOPE_LINK)
				fl.fl4_scope = RT_SCOPE_LINK;
			if ((err = fib_lookup(net, &fl, &res)) != 0)
				return err;
		}
		err = -EINVAL;
		if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
			goto out;
		nh->nh_scope = res.scope;
		nh->nh_oif = FIB_RES_OIF(res);
		if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL)
			goto out;
		dev_hold(nh->nh_dev);
		err = -ENETDOWN;
		if (!(nh->nh_dev->flags & IFF_UP))
			goto out;
		err = 0;
out:
		fib_res_put(&res);
		return err;
	} else {
		struct in_device *in_dev;

		/* These flags require a gateway. */
		if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK))
			return -EINVAL;

		in_dev = inetdev_by_index(net, nh->nh_oif);
		if (in_dev == NULL)
			return -ENODEV;
		if (!(in_dev->dev->flags&IFF_UP)) {
			in_dev_put(in_dev);
			return -ENETDOWN;
		}
		nh->nh_dev = in_dev->dev;
		dev_hold(nh->nh_dev);
		nh->nh_scope = RT_SCOPE_HOST;
		in_dev_put(in_dev);
	}
	return 0;
}
600 
601 static inline unsigned int fib_laddr_hashfn(__be32 val)
602 {
603 	unsigned int mask = (fib_hash_size - 1);
604 
605 	return ((__force u32)val ^ ((__force u32)val >> 7) ^ ((__force u32)val >> 14)) & mask;
606 }
607 
608 static struct hlist_head *fib_hash_alloc(int bytes)
609 {
610 	if (bytes <= PAGE_SIZE)
611 		return kzalloc(bytes, GFP_KERNEL);
612 	else
613 		return (struct hlist_head *)
614 			__get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(bytes));
615 }
616 
617 static void fib_hash_free(struct hlist_head *hash, int bytes)
618 {
619 	if (!hash)
620 		return;
621 
622 	if (bytes <= PAGE_SIZE)
623 		kfree(hash);
624 	else
625 		free_pages((unsigned long) hash, get_order(bytes));
626 }
627 
/* Rehash every fib_info from the current tables into the freshly
 * allocated (and larger) new_info_hash/new_laddrhash, then publish
 * the new tables and free the old ones. */
static void fib_hash_move(struct hlist_head *new_info_hash,
			  struct hlist_head *new_laddrhash,
			  unsigned int new_size)
{
	struct hlist_head *old_info_hash, *old_laddrhash;
	unsigned int old_size = fib_hash_size;
	unsigned int i, bytes;

	spin_lock_bh(&fib_info_lock);
	old_info_hash = fib_info_hash;
	old_laddrhash = fib_info_laddrhash;
	/* Set the new size first: fib_info_hashfn()/fib_laddr_hashfn()
	 * below must compute buckets for the new tables. */
	fib_hash_size = new_size;

	for (i = 0; i < old_size; i++) {
		struct hlist_head *head = &fib_info_hash[i];
		struct hlist_node *node, *n;
		struct fib_info *fi;

		hlist_for_each_entry_safe(fi, node, n, head, fib_hash) {
			struct hlist_head *dest;
			unsigned int new_hash;

			hlist_del(&fi->fib_hash);

			new_hash = fib_info_hashfn(fi);
			dest = &new_info_hash[new_hash];
			hlist_add_head(&fi->fib_hash, dest);
		}
	}
	fib_info_hash = new_info_hash;

	for (i = 0; i < old_size; i++) {
		struct hlist_head *lhead = &fib_info_laddrhash[i];
		struct hlist_node *node, *n;
		struct fib_info *fi;

		hlist_for_each_entry_safe(fi, node, n, lhead, fib_lhash) {
			struct hlist_head *ldest;
			unsigned int new_hash;

			hlist_del(&fi->fib_lhash);

			new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
			ldest = &new_laddrhash[new_hash];
			hlist_add_head(&fi->fib_lhash, ldest);
		}
	}
	fib_info_laddrhash = new_laddrhash;

	spin_unlock_bh(&fib_info_lock);

	/* NOTE(review): sizeof(struct hlist_head *) is used here and in
	 * fib_create_info() where sizeof(struct hlist_head) is meant;
	 * both are one pointer wide so the sizes coincide — confirm
	 * before changing either side independently. */
	bytes = old_size * sizeof(struct hlist_head *);
	fib_hash_free(old_info_hash, bytes);
	fib_hash_free(old_laddrhash, bytes);
}
683 
/* Build a fib_info from a route configuration, or reuse an existing
 * equivalent one.
 *
 * Validates type/scope, grows the hash tables if needed, parses the
 * nexthop(s) and metrics, resolves devices, and links the entry into
 * fib_info_hash / fib_info_laddrhash / fib_info_devhash.
 *
 * Returns the fib_info with fib_treeref incremented, or ERR_PTR(). */
struct fib_info *fib_create_info(struct fib_config *cfg)
{
	int err;
	struct fib_info *fi = NULL;
	struct fib_info *ofi;
	int nhs = 1;
	struct net *net = cfg->fc_nlinfo.nl_net;

	/* Fast check to catch the most weird cases */
	if (fib_props[cfg->fc_type].scope > cfg->fc_scope)
		goto err_inval;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (cfg->fc_mp) {
		nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len);
		if (nhs == 0)
			goto err_inval;
	}
#endif

	err = -ENOBUFS;
	/* Grow (double) the hash tables when the entry count reaches
	 * the table size; failure to grow is fatal only when no table
	 * exists at all yet. */
	if (fib_info_cnt >= fib_hash_size) {
		unsigned int new_size = fib_hash_size << 1;
		struct hlist_head *new_info_hash;
		struct hlist_head *new_laddrhash;
		unsigned int bytes;

		if (!new_size)
			new_size = 1;
		bytes = new_size * sizeof(struct hlist_head *);
		new_info_hash = fib_hash_alloc(bytes);
		new_laddrhash = fib_hash_alloc(bytes);
		if (!new_info_hash || !new_laddrhash) {
			fib_hash_free(new_info_hash, bytes);
			fib_hash_free(new_laddrhash, bytes);
		} else
			fib_hash_move(new_info_hash, new_laddrhash, new_size);

		if (!fib_hash_size)
			goto failure;
	}

	/* fib_nh array is allocated inline after the fib_info. */
	fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
	if (fi == NULL)
		goto failure;
	fib_info_cnt++;

	fi->fib_net = hold_net(net);
	fi->fib_protocol = cfg->fc_protocol;
	fi->fib_flags = cfg->fc_flags;
	fi->fib_priority = cfg->fc_priority;
	fi->fib_prefsrc = cfg->fc_prefsrc;

	fi->fib_nhs = nhs;
	change_nexthops(fi) {
		nexthop_nh->nh_parent = fi;
	} endfor_nexthops(fi)

	/* Copy route metrics (RTAX_*) out of the RTA_METRICS nest. */
	if (cfg->fc_mx) {
		struct nlattr *nla;
		int remaining;

		nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
			int type = nla_type(nla);

			if (type) {
				if (type > RTAX_MAX)
					goto err_inval;
				fi->fib_metrics[type - 1] = nla_get_u32(nla);
			}
		}
	}

	if (cfg->fc_mp) {
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg);
		if (err != 0)
			goto failure;
		/* Top-level oif/gw/flow, if given, must agree with the
		 * first multipath nexthop. */
		if (cfg->fc_oif && fi->fib_nh->nh_oif != cfg->fc_oif)
			goto err_inval;
		if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw)
			goto err_inval;
#ifdef CONFIG_NET_CLS_ROUTE
		if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow)
			goto err_inval;
#endif
#else
		/* Multipath attribute without multipath support. */
		goto err_inval;
#endif
	} else {
		struct fib_nh *nh = fi->fib_nh;

		nh->nh_oif = cfg->fc_oif;
		nh->nh_gw = cfg->fc_gw;
		nh->nh_flags = cfg->fc_flags;
#ifdef CONFIG_NET_CLS_ROUTE
		nh->nh_tclassid = cfg->fc_flow;
#endif
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		nh->nh_weight = 1;
#endif
	}

	/* Reject/blackhole-style types carry no nexthop information. */
	if (fib_props[cfg->fc_type].error) {
		if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp)
			goto err_inval;
		goto link_it;
	}

	if (cfg->fc_scope > RT_SCOPE_HOST)
		goto err_inval;

	if (cfg->fc_scope == RT_SCOPE_HOST) {
		struct fib_nh *nh = fi->fib_nh;

		/* Local address is added. */
		if (nhs != 1 || nh->nh_gw)
			goto err_inval;
		nh->nh_scope = RT_SCOPE_NOWHERE;
		nh->nh_dev = dev_get_by_index(net, fi->fib_nh->nh_oif);
		err = -ENODEV;
		if (nh->nh_dev == NULL)
			goto failure;
	} else {
		change_nexthops(fi) {
			if ((err = fib_check_nh(cfg, fi, nexthop_nh)) != 0)
				goto failure;
		} endfor_nexthops(fi)
	}

	/* A preferred source must be one of our local addresses
	 * (or the destination of the local route being added). */
	if (fi->fib_prefsrc) {
		if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst ||
		    fi->fib_prefsrc != cfg->fc_dst)
			if (inet_addr_type(net, fi->fib_prefsrc) != RTN_LOCAL)
				goto err_inval;
	}

link_it:
	/* Reuse an equivalent existing fib_info when possible. */
	if ((ofi = fib_find_info(fi)) != NULL) {
		fi->fib_dead = 1;
		free_fib_info(fi);
		ofi->fib_treeref++;
		return ofi;
	}

	fi->fib_treeref++;
	atomic_inc(&fi->fib_clntref);
	spin_lock_bh(&fib_info_lock);
	hlist_add_head(&fi->fib_hash,
		       &fib_info_hash[fib_info_hashfn(fi)]);
	if (fi->fib_prefsrc) {
		struct hlist_head *head;

		head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
		hlist_add_head(&fi->fib_lhash, head);
	}
	change_nexthops(fi) {
		struct hlist_head *head;
		unsigned int hash;

		if (!nexthop_nh->nh_dev)
			continue;
		hash = fib_devindex_hashfn(nexthop_nh->nh_dev->ifindex);
		head = &fib_info_devhash[hash];
		hlist_add_head(&nexthop_nh->nh_hash, head);
	} endfor_nexthops(fi)
	spin_unlock_bh(&fib_info_lock);
	return fi;

err_inval:
	err = -EINVAL;

failure:
	if (fi) {
		fi->fib_dead = 1;
		free_fib_info(fi);
	}

	return ERR_PTR(err);
}
864 
/* Note! fib_semantic_match intentionally uses  RCU list functions. */
/* Match the flow against a list of fib_aliases: first alias whose
 * tos/scope fit and that has a usable (non-dead, oif-compatible)
 * nexthop wins.  Fills *res (taking a fib_clntref) and returns 0;
 * returns 1 for no match, or a negative fib_props error for
 * reject-type routes. */
int fib_semantic_match(struct list_head *head, const struct flowi *flp,
		       struct fib_result *res, int prefixlen)
{
	struct fib_alias *fa;
	int nh_sel = 0;

	list_for_each_entry_rcu(fa, head, fa_list) {
		int err;

		if (fa->fa_tos &&
		    fa->fa_tos != flp->fl4_tos)
			continue;

		if (fa->fa_scope < flp->fl4_scope)
			continue;

		fa->fa_state |= FA_S_ACCESSED;

		err = fib_props[fa->fa_type].error;
		if (err == 0) {
			struct fib_info *fi = fa->fa_info;

			if (fi->fib_flags & RTNH_F_DEAD)
				continue;

			switch (fa->fa_type) {
			case RTN_UNICAST:
			case RTN_LOCAL:
			case RTN_BROADCAST:
			case RTN_ANYCAST:
			case RTN_MULTICAST:
				/* Find the first live nexthop that matches
				 * the requested output interface (if any). */
				for_nexthops(fi) {
					if (nh->nh_flags&RTNH_F_DEAD)
						continue;
					if (!flp->oif || flp->oif == nh->nh_oif)
						break;
				}
				/* nhsel is still in scope here: for_nexthops
				 * opens a brace scope that endfor_nexthops
				 * closes below.  nhsel < count means the loop
				 * broke on a usable nexthop. */
#ifdef CONFIG_IP_ROUTE_MULTIPATH
				if (nhsel < fi->fib_nhs) {
					nh_sel = nhsel;
					goto out_fill_res;
				}
#else
				if (nhsel < 1) {
					goto out_fill_res;
				}
#endif
				endfor_nexthops(fi);
				continue;

			default:
				printk(KERN_WARNING "fib_semantic_match bad type %#x\n",
					fa->fa_type);
				return -EINVAL;
			}
		}
		return err;
	}
	return 1;

out_fill_res:
	res->prefixlen = prefixlen;
	res->nh_sel = nh_sel;
	res->type = fa->fa_type;
	res->scope = fa->fa_scope;
	res->fi = fa->fa_info;
	atomic_inc(&res->fi->fib_clntref);
	return 0;
}
935 
936 /* Find appropriate source address to this destination */
937 
938 __be32 __fib_res_prefsrc(struct fib_result *res)
939 {
940 	return inet_select_addr(FIB_RES_DEV(*res), FIB_RES_GW(*res), res->scope);
941 }
942 
/* Serialize one route into an rtnetlink RTM_* message on skb.
 * Single-nexthop routes emit flat RTA_GATEWAY/RTA_OIF/RTA_FLOW
 * attributes; multipath routes nest them inside RTA_MULTIPATH.
 * Returns the message length, or -EMSGSIZE if skb ran out of room
 * (the NLA_PUT* macros jump to nla_put_failure on overflow). */
int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
		  u32 tb_id, u8 type, u8 scope, __be32 dst, int dst_len, u8 tos,
		  struct fib_info *fi, unsigned int flags)
{
	struct nlmsghdr *nlh;
	struct rtmsg *rtm;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*rtm), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET;
	rtm->rtm_dst_len = dst_len;
	rtm->rtm_src_len = 0;
	rtm->rtm_tos = tos;
	/* rtm_table is only 8 bits wide; the full id always goes into
	 * the RTA_TABLE attribute. */
	if (tb_id < 256)
		rtm->rtm_table = tb_id;
	else
		rtm->rtm_table = RT_TABLE_COMPAT;
	NLA_PUT_U32(skb, RTA_TABLE, tb_id);
	rtm->rtm_type = type;
	rtm->rtm_flags = fi->fib_flags;
	rtm->rtm_scope = scope;
	rtm->rtm_protocol = fi->fib_protocol;

	if (rtm->rtm_dst_len)
		NLA_PUT_BE32(skb, RTA_DST, dst);

	if (fi->fib_priority)
		NLA_PUT_U32(skb, RTA_PRIORITY, fi->fib_priority);

	if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
		goto nla_put_failure;

	if (fi->fib_prefsrc)
		NLA_PUT_BE32(skb, RTA_PREFSRC, fi->fib_prefsrc);

	if (fi->fib_nhs == 1) {
		if (fi->fib_nh->nh_gw)
			NLA_PUT_BE32(skb, RTA_GATEWAY, fi->fib_nh->nh_gw);

		if (fi->fib_nh->nh_oif)
			NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif);
#ifdef CONFIG_NET_CLS_ROUTE
		if (fi->fib_nh[0].nh_tclassid)
			NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid);
#endif
	}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (fi->fib_nhs > 1) {
		struct rtnexthop *rtnh;
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (mp == NULL)
			goto nla_put_failure;

		for_nexthops(fi) {
			rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
			if (rtnh == NULL)
				goto nla_put_failure;

			rtnh->rtnh_flags = nh->nh_flags & 0xFF;
			rtnh->rtnh_hops = nh->nh_weight - 1;
			rtnh->rtnh_ifindex = nh->nh_oif;

			if (nh->nh_gw)
				NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw);
#ifdef CONFIG_NET_CLS_ROUTE
			if (nh->nh_tclassid)
				NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid);
#endif
			/* length of rtnetlink header + attributes */
			rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
		} endfor_nexthops(fi);

		nla_nest_end(skb, mp);
	}
#endif
	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
1029 
1030 /*
1031    Update FIB if:
1032    - local address disappeared -> we must delete all the entries
1033      referring to it.
1034    - device went down -> we must shutdown all nexthops going via it.
1035  */
1036 int fib_sync_down_addr(struct net *net, __be32 local)
1037 {
1038 	int ret = 0;
1039 	unsigned int hash = fib_laddr_hashfn(local);
1040 	struct hlist_head *head = &fib_info_laddrhash[hash];
1041 	struct hlist_node *node;
1042 	struct fib_info *fi;
1043 
1044 	if (fib_info_laddrhash == NULL || local == 0)
1045 		return 0;
1046 
1047 	hlist_for_each_entry(fi, node, head, fib_lhash) {
1048 		if (!net_eq(fi->fib_net, net))
1049 			continue;
1050 		if (fi->fib_prefsrc == local) {
1051 			fi->fib_flags |= RTNH_F_DEAD;
1052 			ret++;
1053 		}
1054 	}
1055 	return ret;
1056 }
1057 
/* A device went down: mark dead every nexthop going through it, and
 * every fib_info whose nexthops are now all dead.
 *
 * force == 0 spares nexthops with nh_scope == RT_SCOPE_NOWHERE (local
 * routes); force > 0 kills those too; force > 1 additionally kills
 * the whole multipath route as soon as one of its nexthops uses dev.
 * Returns the number of fib_infos marked dead. */
int fib_sync_down_dev(struct net_device *dev, int force)
{
	int ret = 0;
	int scope = RT_SCOPE_NOWHERE;
	struct fib_info *prev_fi = NULL;
	unsigned int hash = fib_devindex_hashfn(dev->ifindex);
	struct hlist_head *head = &fib_info_devhash[hash];
	struct hlist_node *node;
	struct fib_nh *nh;

	if (force)
		scope = -1;

	hlist_for_each_entry(nh, node, head, nh_hash) {
		struct fib_info *fi = nh->nh_parent;
		int dead;

		BUG_ON(!fi->fib_nhs);
		/* Consecutive nexthops of one fib_info hash to the same
		 * bucket; prev_fi avoids reprocessing it. */
		if (nh->nh_dev != dev || fi == prev_fi)
			continue;
		prev_fi = fi;
		dead = 0;
		change_nexthops(fi) {
			if (nexthop_nh->nh_flags&RTNH_F_DEAD)
				dead++;
			else if (nexthop_nh->nh_dev == dev &&
				 nexthop_nh->nh_scope != scope) {
				nexthop_nh->nh_flags |= RTNH_F_DEAD;
#ifdef CONFIG_IP_ROUTE_MULTIPATH
				/* Remove its share from the multipath
				 * weight accounting. */
				spin_lock_bh(&fib_multipath_lock);
				fi->fib_power -= nexthop_nh->nh_power;
				nexthop_nh->nh_power = 0;
				spin_unlock_bh(&fib_multipath_lock);
#endif
				dead++;
			}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
			if (force > 1 && nexthop_nh->nh_dev == dev) {
				dead = fi->fib_nhs;
				break;
			}
#endif
		} endfor_nexthops(fi)
		if (dead == fi->fib_nhs) {
			fi->fib_flags |= RTNH_F_DEAD;
			ret++;
		}
	}

	return ret;
}
1109 
1110 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1111 
1112 /*
1113    Dead device goes up. We wake up dead nexthops.
1114    It takes sense only on multipath routes.
1115  */
1116 
/* A formerly dead device came back up: revive its dead nexthops and
 * any fib_info that regains at least one live nexthop.  Only
 * meaningful for multipath routes.  Returns the number of fib_infos
 * revived. */
int fib_sync_up(struct net_device *dev)
{
	struct fib_info *prev_fi;
	unsigned int hash;
	struct hlist_head *head;
	struct hlist_node *node;
	struct fib_nh *nh;
	int ret;

	if (!(dev->flags&IFF_UP))
		return 0;

	prev_fi = NULL;
	hash = fib_devindex_hashfn(dev->ifindex);
	head = &fib_info_devhash[hash];
	ret = 0;

	hlist_for_each_entry(nh, node, head, nh_hash) {
		struct fib_info *fi = nh->nh_parent;
		int alive;

		BUG_ON(!fi->fib_nhs);
		/* Skip other devices sharing the bucket and fib_infos
		 * already handled via an earlier nexthop. */
		if (nh->nh_dev != dev || fi == prev_fi)
			continue;

		prev_fi = fi;
		alive = 0;
		change_nexthops(fi) {
			if (!(nexthop_nh->nh_flags&RTNH_F_DEAD)) {
				alive++;
				continue;
			}
			if (nexthop_nh->nh_dev == NULL ||
			    !(nexthop_nh->nh_dev->flags&IFF_UP))
				continue;
			/* Only revive nexthops on this device, and only
			 * while it still has an IPv4 configuration. */
			if (nexthop_nh->nh_dev != dev ||
			    !__in_dev_get_rtnl(dev))
				continue;
			alive++;
			spin_lock_bh(&fib_multipath_lock);
			nexthop_nh->nh_power = 0;
			nexthop_nh->nh_flags &= ~RTNH_F_DEAD;
			spin_unlock_bh(&fib_multipath_lock);
		} endfor_nexthops(fi)

		if (alive > 0) {
			fi->fib_flags &= ~RTNH_F_DEAD;
			ret++;
		}
	}

	return ret;
}
1170 
/*
   The algorithm is suboptimal, but it provides really
   fair weighted route distribution.
 */

/* Pick a nexthop for this lookup (stored in res->nh_sel), spreading
 * traffic across live nexthops in proportion to nh_weight.  Each
 * nexthop's nh_power is a remaining "token" count, refilled from the
 * weights whenever the route-wide fib_power pool is exhausted. */
void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
{
	struct fib_info *fi = res->fi;
	int w;

	spin_lock_bh(&fib_multipath_lock);
	if (fi->fib_power <= 0) {
		/* Pool exhausted: refill every live nexthop's tokens
		 * from its weight. */
		int power = 0;
		change_nexthops(fi) {
			if (!(nexthop_nh->nh_flags&RTNH_F_DEAD)) {
				power += nexthop_nh->nh_weight;
				nexthop_nh->nh_power = nexthop_nh->nh_weight;
			}
		} endfor_nexthops(fi);
		fi->fib_power = power;
		if (power <= 0) {
			spin_unlock_bh(&fib_multipath_lock);
			/* Race condition: route has just become dead. */
			res->nh_sel = 0;
			return;
		}
	}


	/* w should be random number [0..fi->fib_power-1],
	   it is pretty bad approximation.
	 */

	w = jiffies % fi->fib_power;

	/* Walk the nexthops subtracting their token counts from w; the
	 * nexthop that crosses zero is chosen and pays one token. */
	change_nexthops(fi) {
		if (!(nexthop_nh->nh_flags&RTNH_F_DEAD) &&
		    nexthop_nh->nh_power) {
			if ((w -= nexthop_nh->nh_power) <= 0) {
				nexthop_nh->nh_power--;
				fi->fib_power--;
				res->nh_sel = nhsel;
				spin_unlock_bh(&fib_multipath_lock);
				return;
			}
		}
	} endfor_nexthops(fi);

	/* Race condition: route has just become dead. */
	res->nh_sel = 0;
	spin_unlock_bh(&fib_multipath_lock);
}
1223 #endif
1224