xref: /openbmc/linux/net/ipv4/fib_frontend.c (revision 179dd8c0348af75b02c7d72eaaf1cb179f1721ef)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		IPv4 Forwarding Information Base: FIB frontend.
7  *
8  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
9  *
10  *		This program is free software; you can redistribute it and/or
11  *		modify it under the terms of the GNU General Public License
12  *		as published by the Free Software Foundation; either version
13  *		2 of the License, or (at your option) any later version.
14  */
15 
16 #include <linux/module.h>
17 #include <asm/uaccess.h>
18 #include <linux/bitops.h>
19 #include <linux/capability.h>
20 #include <linux/types.h>
21 #include <linux/kernel.h>
22 #include <linux/mm.h>
23 #include <linux/string.h>
24 #include <linux/socket.h>
25 #include <linux/sockios.h>
26 #include <linux/errno.h>
27 #include <linux/in.h>
28 #include <linux/inet.h>
29 #include <linux/inetdevice.h>
30 #include <linux/netdevice.h>
31 #include <linux/if_addr.h>
32 #include <linux/if_arp.h>
33 #include <linux/skbuff.h>
34 #include <linux/cache.h>
35 #include <linux/init.h>
36 #include <linux/list.h>
37 #include <linux/slab.h>
38 
39 #include <net/ip.h>
40 #include <net/protocol.h>
41 #include <net/route.h>
42 #include <net/tcp.h>
43 #include <net/sock.h>
44 #include <net/arp.h>
45 #include <net/ip_fib.h>
46 #include <net/rtnetlink.h>
47 #include <net/xfrm.h>
48 
49 #ifndef CONFIG_IP_MULTIPLE_TABLES
50 
51 static int __net_init fib4_rules_init(struct net *net)
52 {
53 	struct fib_table *local_table, *main_table;
54 
55 	main_table  = fib_trie_table(RT_TABLE_MAIN, NULL);
56 	if (!main_table)
57 		return -ENOMEM;
58 
59 	local_table = fib_trie_table(RT_TABLE_LOCAL, main_table);
60 	if (!local_table)
61 		goto fail;
62 
63 	hlist_add_head_rcu(&local_table->tb_hlist,
64 				&net->ipv4.fib_table_hash[TABLE_LOCAL_INDEX]);
65 	hlist_add_head_rcu(&main_table->tb_hlist,
66 				&net->ipv4.fib_table_hash[TABLE_MAIN_INDEX]);
67 	return 0;
68 
69 fail:
70 	fib_free_table(main_table);
71 	return -ENOMEM;
72 }
73 #else
74 
75 struct fib_table *fib_new_table(struct net *net, u32 id)
76 {
77 	struct fib_table *tb, *alias = NULL;
78 	unsigned int h;
79 
80 	if (id == 0)
81 		id = RT_TABLE_MAIN;
82 	tb = fib_get_table(net, id);
83 	if (tb)
84 		return tb;
85 
86 	if (id == RT_TABLE_LOCAL)
87 		alias = fib_new_table(net, RT_TABLE_MAIN);
88 
89 	tb = fib_trie_table(id, alias);
90 	if (!tb)
91 		return NULL;
92 
93 	switch (id) {
94 	case RT_TABLE_LOCAL:
95 		rcu_assign_pointer(net->ipv4.fib_local, tb);
96 		break;
97 	case RT_TABLE_MAIN:
98 		rcu_assign_pointer(net->ipv4.fib_main, tb);
99 		break;
100 	case RT_TABLE_DEFAULT:
101 		rcu_assign_pointer(net->ipv4.fib_default, tb);
102 		break;
103 	default:
104 		break;
105 	}
106 
107 	h = id & (FIB_TABLE_HASHSZ - 1);
108 	hlist_add_head_rcu(&tb->tb_hlist, &net->ipv4.fib_table_hash[h]);
109 	return tb;
110 }
111 
112 /* caller must hold either rtnl or rcu read lock */
113 struct fib_table *fib_get_table(struct net *net, u32 id)
114 {
115 	struct fib_table *tb;
116 	struct hlist_head *head;
117 	unsigned int h;
118 
119 	if (id == 0)
120 		id = RT_TABLE_MAIN;
121 	h = id & (FIB_TABLE_HASHSZ - 1);
122 
123 	head = &net->ipv4.fib_table_hash[h];
124 	hlist_for_each_entry_rcu(tb, head, tb_hlist) {
125 		if (tb->tb_id == id)
126 			return tb;
127 	}
128 	return NULL;
129 }
130 #endif /* CONFIG_IP_MULTIPLE_TABLES */
131 
132 static void fib_replace_table(struct net *net, struct fib_table *old,
133 			      struct fib_table *new)
134 {
135 #ifdef CONFIG_IP_MULTIPLE_TABLES
136 	switch (new->tb_id) {
137 	case RT_TABLE_LOCAL:
138 		rcu_assign_pointer(net->ipv4.fib_local, new);
139 		break;
140 	case RT_TABLE_MAIN:
141 		rcu_assign_pointer(net->ipv4.fib_main, new);
142 		break;
143 	case RT_TABLE_DEFAULT:
144 		rcu_assign_pointer(net->ipv4.fib_default, new);
145 		break;
146 	default:
147 		break;
148 	}
149 
150 #endif
151 	/* replace the old table in the hlist */
152 	hlist_replace_rcu(&old->tb_hlist, &new->tb_hlist);
153 }
154 
155 int fib_unmerge(struct net *net)
156 {
157 	struct fib_table *old, *new;
158 
159 	/* attempt to fetch local table if it has been allocated */
160 	old = fib_get_table(net, RT_TABLE_LOCAL);
161 	if (!old)
162 		return 0;
163 
164 	new = fib_trie_unmerge(old);
165 	if (!new)
166 		return -ENOMEM;
167 
168 	/* replace merged table with clean table */
169 	if (new != old) {
170 		fib_replace_table(net, old, new);
171 		fib_free_table(old);
172 	}
173 
174 	return 0;
175 }
176 
177 static void fib_flush(struct net *net)
178 {
179 	int flushed = 0;
180 	unsigned int h;
181 
182 	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
183 		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
184 		struct hlist_node *tmp;
185 		struct fib_table *tb;
186 
187 		hlist_for_each_entry_safe(tb, tmp, head, tb_hlist)
188 			flushed += fib_table_flush(tb);
189 	}
190 
191 	if (flushed)
192 		rt_cache_flush(net);
193 }
194 
195 void fib_flush_external(struct net *net)
196 {
197 	struct fib_table *tb;
198 	struct hlist_head *head;
199 	unsigned int h;
200 
201 	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
202 		head = &net->ipv4.fib_table_hash[h];
203 		hlist_for_each_entry(tb, head, tb_hlist)
204 			fib_table_flush_external(tb);
205 	}
206 }
207 
208 /*
209  * Find address type as if only "dev" was present in the system. If
210  * on_dev is NULL then all interfaces are taken into consideration.
211  */
212 static inline unsigned int __inet_dev_addr_type(struct net *net,
213 						const struct net_device *dev,
214 						__be32 addr)
215 {
216 	struct flowi4		fl4 = { .daddr = addr };
217 	struct fib_result	res;
218 	unsigned int ret = RTN_BROADCAST;
219 	struct fib_table *local_table;
220 
221 	if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr))
222 		return RTN_BROADCAST;
223 	if (ipv4_is_multicast(addr))
224 		return RTN_MULTICAST;
225 
226 	rcu_read_lock();
227 
228 	local_table = fib_get_table(net, RT_TABLE_LOCAL);
229 	if (local_table) {
230 		ret = RTN_UNICAST;
231 		if (!fib_table_lookup(local_table, &fl4, &res, FIB_LOOKUP_NOREF)) {
232 			if (!dev || dev == res.fi->fib_dev)
233 				ret = res.type;
234 		}
235 	}
236 
237 	rcu_read_unlock();
238 	return ret;
239 }
240 
241 unsigned int inet_addr_type(struct net *net, __be32 addr)
242 {
243 	return __inet_dev_addr_type(net, NULL, addr);
244 }
245 EXPORT_SYMBOL(inet_addr_type);
246 
247 unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
248 				__be32 addr)
249 {
250 	return __inet_dev_addr_type(net, dev, addr);
251 }
252 EXPORT_SYMBOL(inet_dev_addr_type);
253 
254 __be32 fib_compute_spec_dst(struct sk_buff *skb)
255 {
256 	struct net_device *dev = skb->dev;
257 	struct in_device *in_dev;
258 	struct fib_result res;
259 	struct rtable *rt;
260 	struct flowi4 fl4;
261 	struct net *net;
262 	int scope;
263 
264 	rt = skb_rtable(skb);
265 	if ((rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL)) ==
266 	    RTCF_LOCAL)
267 		return ip_hdr(skb)->daddr;
268 
269 	in_dev = __in_dev_get_rcu(dev);
270 	BUG_ON(!in_dev);
271 
272 	net = dev_net(dev);
273 
274 	scope = RT_SCOPE_UNIVERSE;
275 	if (!ipv4_is_zeronet(ip_hdr(skb)->saddr)) {
276 		fl4.flowi4_oif = 0;
277 		fl4.flowi4_iif = LOOPBACK_IFINDEX;
278 		fl4.daddr = ip_hdr(skb)->saddr;
279 		fl4.saddr = 0;
280 		fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
281 		fl4.flowi4_scope = scope;
282 		fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0;
283 		if (!fib_lookup(net, &fl4, &res, 0))
284 			return FIB_RES_PREFSRC(net, res);
285 	} else {
286 		scope = RT_SCOPE_LINK;
287 	}
288 
289 	return inet_select_addr(dev, ip_hdr(skb)->saddr, scope);
290 }
291 
292 /* Given (packet source, input interface) and optional (dst, oif, tos):
293  * - (main) check, that source is valid i.e. not broadcast or our local
294  *   address.
295  * - figure out what "logical" interface this packet arrived
296  *   and calculate "specific destination" address.
297  * - check, that packet arrived from expected physical interface.
298  * called with rcu_read_lock()
299  */
300 static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
301 				 u8 tos, int oif, struct net_device *dev,
302 				 int rpf, struct in_device *idev, u32 *itag)
303 {
304 	int ret, no_addr;
305 	struct fib_result res;
306 	struct flowi4 fl4;
307 	struct net *net;
308 	bool dev_match;
309 
310 	fl4.flowi4_oif = 0;
311 	fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX;
312 	fl4.daddr = src;
313 	fl4.saddr = dst;
314 	fl4.flowi4_tos = tos;
315 	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
316 
317 	no_addr = idev->ifa_list == NULL;
318 
319 	fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? skb->mark : 0;
320 
321 	net = dev_net(dev);
322 	if (fib_lookup(net, &fl4, &res, 0))
323 		goto last_resort;
324 	if (res.type != RTN_UNICAST &&
325 	    (res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev)))
326 		goto e_inval;
327 	if (!rpf && !fib_num_tclassid_users(dev_net(dev)) &&
328 	    (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev)))
329 		goto last_resort;
330 	fib_combine_itag(itag, &res);
331 	dev_match = false;
332 
333 #ifdef CONFIG_IP_ROUTE_MULTIPATH
334 	for (ret = 0; ret < res.fi->fib_nhs; ret++) {
335 		struct fib_nh *nh = &res.fi->fib_nh[ret];
336 
337 		if (nh->nh_dev == dev) {
338 			dev_match = true;
339 			break;
340 		}
341 	}
342 #else
343 	if (FIB_RES_DEV(res) == dev)
344 		dev_match = true;
345 #endif
346 	if (dev_match) {
347 		ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
348 		return ret;
349 	}
350 	if (no_addr)
351 		goto last_resort;
352 	if (rpf == 1)
353 		goto e_rpf;
354 	fl4.flowi4_oif = dev->ifindex;
355 
356 	ret = 0;
357 	if (fib_lookup(net, &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE) == 0) {
358 		if (res.type == RTN_UNICAST)
359 			ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
360 	}
361 	return ret;
362 
363 last_resort:
364 	if (rpf)
365 		goto e_rpf;
366 	*itag = 0;
367 	return 0;
368 
369 e_inval:
370 	return -EINVAL;
371 e_rpf:
372 	return -EXDEV;
373 }
374 
375 /* Ignore rp_filter for packets protected by IPsec. */
376 int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
377 			u8 tos, int oif, struct net_device *dev,
378 			struct in_device *idev, u32 *itag)
379 {
380 	int r = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev);
381 
382 	if (!r && !fib_num_tclassid_users(dev_net(dev)) &&
383 	    IN_DEV_ACCEPT_LOCAL(idev) &&
384 	    (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) {
385 		*itag = 0;
386 		return 0;
387 	}
388 	return __fib_validate_source(skb, src, dst, tos, oif, dev, r, idev, itag);
389 }
390 
391 static inline __be32 sk_extract_addr(struct sockaddr *addr)
392 {
393 	return ((struct sockaddr_in *) addr)->sin_addr.s_addr;
394 }
395 
396 static int put_rtax(struct nlattr *mx, int len, int type, u32 value)
397 {
398 	struct nlattr *nla;
399 
400 	nla = (struct nlattr *) ((char *) mx + len);
401 	nla->nla_type = type;
402 	nla->nla_len = nla_attr_size(4);
403 	*(u32 *) nla_data(nla) = value;
404 
405 	return len + nla_total_size(4);
406 }
407 
408 static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt,
409 				 struct fib_config *cfg)
410 {
411 	__be32 addr;
412 	int plen;
413 
414 	memset(cfg, 0, sizeof(*cfg));
415 	cfg->fc_nlinfo.nl_net = net;
416 
417 	if (rt->rt_dst.sa_family != AF_INET)
418 		return -EAFNOSUPPORT;
419 
420 	/*
421 	 * Check mask for validity:
422 	 * a) it must be contiguous.
423 	 * b) destination must have all host bits clear.
424 	 * c) if application forgot to set correct family (AF_INET),
425 	 *    reject request unless it is absolutely clear i.e.
426 	 *    both family and mask are zero.
427 	 */
428 	plen = 32;
429 	addr = sk_extract_addr(&rt->rt_dst);
430 	if (!(rt->rt_flags & RTF_HOST)) {
431 		__be32 mask = sk_extract_addr(&rt->rt_genmask);
432 
433 		if (rt->rt_genmask.sa_family != AF_INET) {
434 			if (mask || rt->rt_genmask.sa_family)
435 				return -EAFNOSUPPORT;
436 		}
437 
438 		if (bad_mask(mask, addr))
439 			return -EINVAL;
440 
441 		plen = inet_mask_len(mask);
442 	}
443 
444 	cfg->fc_dst_len = plen;
445 	cfg->fc_dst = addr;
446 
447 	if (cmd != SIOCDELRT) {
448 		cfg->fc_nlflags = NLM_F_CREATE;
449 		cfg->fc_protocol = RTPROT_BOOT;
450 	}
451 
452 	if (rt->rt_metric)
453 		cfg->fc_priority = rt->rt_metric - 1;
454 
455 	if (rt->rt_flags & RTF_REJECT) {
456 		cfg->fc_scope = RT_SCOPE_HOST;
457 		cfg->fc_type = RTN_UNREACHABLE;
458 		return 0;
459 	}
460 
461 	cfg->fc_scope = RT_SCOPE_NOWHERE;
462 	cfg->fc_type = RTN_UNICAST;
463 
464 	if (rt->rt_dev) {
465 		char *colon;
466 		struct net_device *dev;
467 		char devname[IFNAMSIZ];
468 
469 		if (copy_from_user(devname, rt->rt_dev, IFNAMSIZ-1))
470 			return -EFAULT;
471 
472 		devname[IFNAMSIZ-1] = 0;
473 		colon = strchr(devname, ':');
474 		if (colon)
475 			*colon = 0;
476 		dev = __dev_get_by_name(net, devname);
477 		if (!dev)
478 			return -ENODEV;
479 		cfg->fc_oif = dev->ifindex;
480 		if (colon) {
481 			struct in_ifaddr *ifa;
482 			struct in_device *in_dev = __in_dev_get_rtnl(dev);
483 			if (!in_dev)
484 				return -ENODEV;
485 			*colon = ':';
486 			for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next)
487 				if (strcmp(ifa->ifa_label, devname) == 0)
488 					break;
489 			if (!ifa)
490 				return -ENODEV;
491 			cfg->fc_prefsrc = ifa->ifa_local;
492 		}
493 	}
494 
495 	addr = sk_extract_addr(&rt->rt_gateway);
496 	if (rt->rt_gateway.sa_family == AF_INET && addr) {
497 		cfg->fc_gw = addr;
498 		if (rt->rt_flags & RTF_GATEWAY &&
499 		    inet_addr_type(net, addr) == RTN_UNICAST)
500 			cfg->fc_scope = RT_SCOPE_UNIVERSE;
501 	}
502 
503 	if (cmd == SIOCDELRT)
504 		return 0;
505 
506 	if (rt->rt_flags & RTF_GATEWAY && !cfg->fc_gw)
507 		return -EINVAL;
508 
509 	if (cfg->fc_scope == RT_SCOPE_NOWHERE)
510 		cfg->fc_scope = RT_SCOPE_LINK;
511 
512 	if (rt->rt_flags & (RTF_MTU | RTF_WINDOW | RTF_IRTT)) {
513 		struct nlattr *mx;
514 		int len = 0;
515 
516 		mx = kzalloc(3 * nla_total_size(4), GFP_KERNEL);
517 		if (!mx)
518 			return -ENOMEM;
519 
520 		if (rt->rt_flags & RTF_MTU)
521 			len = put_rtax(mx, len, RTAX_ADVMSS, rt->rt_mtu - 40);
522 
523 		if (rt->rt_flags & RTF_WINDOW)
524 			len = put_rtax(mx, len, RTAX_WINDOW, rt->rt_window);
525 
526 		if (rt->rt_flags & RTF_IRTT)
527 			len = put_rtax(mx, len, RTAX_RTT, rt->rt_irtt << 3);
528 
529 		cfg->fc_mx = mx;
530 		cfg->fc_mx_len = len;
531 	}
532 
533 	return 0;
534 }
535 
536 /*
537  * Handle IP routing ioctl calls.
538  * These are used to manipulate the routing tables
539  */
540 int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg)
541 {
542 	struct fib_config cfg;
543 	struct rtentry rt;
544 	int err;
545 
546 	switch (cmd) {
547 	case SIOCADDRT:		/* Add a route */
548 	case SIOCDELRT:		/* Delete a route */
549 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
550 			return -EPERM;
551 
552 		if (copy_from_user(&rt, arg, sizeof(rt)))
553 			return -EFAULT;
554 
555 		rtnl_lock();
556 		err = rtentry_to_fib_config(net, cmd, &rt, &cfg);
557 		if (err == 0) {
558 			struct fib_table *tb;
559 
560 			if (cmd == SIOCDELRT) {
561 				tb = fib_get_table(net, cfg.fc_table);
562 				if (tb)
563 					err = fib_table_delete(tb, &cfg);
564 				else
565 					err = -ESRCH;
566 			} else {
567 				tb = fib_new_table(net, cfg.fc_table);
568 				if (tb)
569 					err = fib_table_insert(tb, &cfg);
570 				else
571 					err = -ENOBUFS;
572 			}
573 
574 			/* allocated by rtentry_to_fib_config() */
575 			kfree(cfg.fc_mx);
576 		}
577 		rtnl_unlock();
578 		return err;
579 	}
580 	return -EINVAL;
581 }
582 
583 const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
584 	[RTA_DST]		= { .type = NLA_U32 },
585 	[RTA_SRC]		= { .type = NLA_U32 },
586 	[RTA_IIF]		= { .type = NLA_U32 },
587 	[RTA_OIF]		= { .type = NLA_U32 },
588 	[RTA_GATEWAY]		= { .type = NLA_U32 },
589 	[RTA_PRIORITY]		= { .type = NLA_U32 },
590 	[RTA_PREFSRC]		= { .type = NLA_U32 },
591 	[RTA_METRICS]		= { .type = NLA_NESTED },
592 	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
593 	[RTA_FLOW]		= { .type = NLA_U32 },
594 };
595 
596 static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
597 			     struct nlmsghdr *nlh, struct fib_config *cfg)
598 {
599 	struct nlattr *attr;
600 	int err, remaining;
601 	struct rtmsg *rtm;
602 
603 	err = nlmsg_validate(nlh, sizeof(*rtm), RTA_MAX, rtm_ipv4_policy);
604 	if (err < 0)
605 		goto errout;
606 
607 	memset(cfg, 0, sizeof(*cfg));
608 
609 	rtm = nlmsg_data(nlh);
610 	cfg->fc_dst_len = rtm->rtm_dst_len;
611 	cfg->fc_tos = rtm->rtm_tos;
612 	cfg->fc_table = rtm->rtm_table;
613 	cfg->fc_protocol = rtm->rtm_protocol;
614 	cfg->fc_scope = rtm->rtm_scope;
615 	cfg->fc_type = rtm->rtm_type;
616 	cfg->fc_flags = rtm->rtm_flags;
617 	cfg->fc_nlflags = nlh->nlmsg_flags;
618 
619 	cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
620 	cfg->fc_nlinfo.nlh = nlh;
621 	cfg->fc_nlinfo.nl_net = net;
622 
623 	if (cfg->fc_type > RTN_MAX) {
624 		err = -EINVAL;
625 		goto errout;
626 	}
627 
628 	nlmsg_for_each_attr(attr, nlh, sizeof(struct rtmsg), remaining) {
629 		switch (nla_type(attr)) {
630 		case RTA_DST:
631 			cfg->fc_dst = nla_get_be32(attr);
632 			break;
633 		case RTA_OIF:
634 			cfg->fc_oif = nla_get_u32(attr);
635 			break;
636 		case RTA_GATEWAY:
637 			cfg->fc_gw = nla_get_be32(attr);
638 			break;
639 		case RTA_PRIORITY:
640 			cfg->fc_priority = nla_get_u32(attr);
641 			break;
642 		case RTA_PREFSRC:
643 			cfg->fc_prefsrc = nla_get_be32(attr);
644 			break;
645 		case RTA_METRICS:
646 			cfg->fc_mx = nla_data(attr);
647 			cfg->fc_mx_len = nla_len(attr);
648 			break;
649 		case RTA_MULTIPATH:
650 			cfg->fc_mp = nla_data(attr);
651 			cfg->fc_mp_len = nla_len(attr);
652 			break;
653 		case RTA_FLOW:
654 			cfg->fc_flow = nla_get_u32(attr);
655 			break;
656 		case RTA_TABLE:
657 			cfg->fc_table = nla_get_u32(attr);
658 			break;
659 		}
660 	}
661 
662 	return 0;
663 errout:
664 	return err;
665 }
666 
667 static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh)
668 {
669 	struct net *net = sock_net(skb->sk);
670 	struct fib_config cfg;
671 	struct fib_table *tb;
672 	int err;
673 
674 	err = rtm_to_fib_config(net, skb, nlh, &cfg);
675 	if (err < 0)
676 		goto errout;
677 
678 	tb = fib_get_table(net, cfg.fc_table);
679 	if (!tb) {
680 		err = -ESRCH;
681 		goto errout;
682 	}
683 
684 	err = fib_table_delete(tb, &cfg);
685 errout:
686 	return err;
687 }
688 
689 static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
690 {
691 	struct net *net = sock_net(skb->sk);
692 	struct fib_config cfg;
693 	struct fib_table *tb;
694 	int err;
695 
696 	err = rtm_to_fib_config(net, skb, nlh, &cfg);
697 	if (err < 0)
698 		goto errout;
699 
700 	tb = fib_new_table(net, cfg.fc_table);
701 	if (!tb) {
702 		err = -ENOBUFS;
703 		goto errout;
704 	}
705 
706 	err = fib_table_insert(tb, &cfg);
707 errout:
708 	return err;
709 }
710 
711 static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
712 {
713 	struct net *net = sock_net(skb->sk);
714 	unsigned int h, s_h;
715 	unsigned int e = 0, s_e;
716 	struct fib_table *tb;
717 	struct hlist_head *head;
718 	int dumped = 0;
719 
720 	if (nlmsg_len(cb->nlh) >= sizeof(struct rtmsg) &&
721 	    ((struct rtmsg *) nlmsg_data(cb->nlh))->rtm_flags & RTM_F_CLONED)
722 		return skb->len;
723 
724 	s_h = cb->args[0];
725 	s_e = cb->args[1];
726 
727 	rcu_read_lock();
728 
729 	for (h = s_h; h < FIB_TABLE_HASHSZ; h++, s_e = 0) {
730 		e = 0;
731 		head = &net->ipv4.fib_table_hash[h];
732 		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
733 			if (e < s_e)
734 				goto next;
735 			if (dumped)
736 				memset(&cb->args[2], 0, sizeof(cb->args) -
737 						 2 * sizeof(cb->args[0]));
738 			if (fib_table_dump(tb, skb, cb) < 0)
739 				goto out;
740 			dumped = 1;
741 next:
742 			e++;
743 		}
744 	}
745 out:
746 	rcu_read_unlock();
747 
748 	cb->args[1] = e;
749 	cb->args[0] = h;
750 
751 	return skb->len;
752 }
753 
754 /* Prepare and feed intra-kernel routing request.
755  * Really, it should be netlink message, but :-( netlink
756  * can be not configured, so that we feed it directly
757  * to fib engine. It is legal, because all events occur
758  * only when netlink is already locked.
759  */
760 static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa)
761 {
762 	struct net *net = dev_net(ifa->ifa_dev->dev);
763 	struct fib_table *tb;
764 	struct fib_config cfg = {
765 		.fc_protocol = RTPROT_KERNEL,
766 		.fc_type = type,
767 		.fc_dst = dst,
768 		.fc_dst_len = dst_len,
769 		.fc_prefsrc = ifa->ifa_local,
770 		.fc_oif = ifa->ifa_dev->dev->ifindex,
771 		.fc_nlflags = NLM_F_CREATE | NLM_F_APPEND,
772 		.fc_nlinfo = {
773 			.nl_net = net,
774 		},
775 	};
776 
777 	if (type == RTN_UNICAST)
778 		tb = fib_new_table(net, RT_TABLE_MAIN);
779 	else
780 		tb = fib_new_table(net, RT_TABLE_LOCAL);
781 
782 	if (!tb)
783 		return;
784 
785 	cfg.fc_table = tb->tb_id;
786 
787 	if (type != RTN_LOCAL)
788 		cfg.fc_scope = RT_SCOPE_LINK;
789 	else
790 		cfg.fc_scope = RT_SCOPE_HOST;
791 
792 	if (cmd == RTM_NEWROUTE)
793 		fib_table_insert(tb, &cfg);
794 	else
795 		fib_table_delete(tb, &cfg);
796 }
797 
798 void fib_add_ifaddr(struct in_ifaddr *ifa)
799 {
800 	struct in_device *in_dev = ifa->ifa_dev;
801 	struct net_device *dev = in_dev->dev;
802 	struct in_ifaddr *prim = ifa;
803 	__be32 mask = ifa->ifa_mask;
804 	__be32 addr = ifa->ifa_local;
805 	__be32 prefix = ifa->ifa_address & mask;
806 
807 	if (ifa->ifa_flags & IFA_F_SECONDARY) {
808 		prim = inet_ifa_byprefix(in_dev, prefix, mask);
809 		if (!prim) {
810 			pr_warn("%s: bug: prim == NULL\n", __func__);
811 			return;
812 		}
813 	}
814 
815 	fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim);
816 
817 	if (!(dev->flags & IFF_UP))
818 		return;
819 
820 	/* Add broadcast address, if it is explicitly assigned. */
821 	if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF))
822 		fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
823 
824 	if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) &&
825 	    (prefix != addr || ifa->ifa_prefixlen < 32)) {
826 		fib_magic(RTM_NEWROUTE,
827 			  dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
828 			  prefix, ifa->ifa_prefixlen, prim);
829 
830 		/* Add network specific broadcasts, when it takes a sense */
831 		if (ifa->ifa_prefixlen < 31) {
832 			fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32, prim);
833 			fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix | ~mask,
834 				  32, prim);
835 		}
836 	}
837 }
838 
839 /* Delete primary or secondary address.
840  * Optionally, on secondary address promotion consider the addresses
841  * from subnet iprim as deleted, even if they are in device list.
842  * In this case the secondary ifa can be in device list.
843  */
844 void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
845 {
846 	struct in_device *in_dev = ifa->ifa_dev;
847 	struct net_device *dev = in_dev->dev;
848 	struct in_ifaddr *ifa1;
849 	struct in_ifaddr *prim = ifa, *prim1 = NULL;
850 	__be32 brd = ifa->ifa_address | ~ifa->ifa_mask;
851 	__be32 any = ifa->ifa_address & ifa->ifa_mask;
852 #define LOCAL_OK	1
853 #define BRD_OK		2
854 #define BRD0_OK		4
855 #define BRD1_OK		8
856 	unsigned int ok = 0;
857 	int subnet = 0;		/* Primary network */
858 	int gone = 1;		/* Address is missing */
859 	int same_prefsrc = 0;	/* Another primary with same IP */
860 
861 	if (ifa->ifa_flags & IFA_F_SECONDARY) {
862 		prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask);
863 		if (!prim) {
864 			pr_warn("%s: bug: prim == NULL\n", __func__);
865 			return;
866 		}
867 		if (iprim && iprim != prim) {
868 			pr_warn("%s: bug: iprim != prim\n", __func__);
869 			return;
870 		}
871 	} else if (!ipv4_is_zeronet(any) &&
872 		   (any != ifa->ifa_local || ifa->ifa_prefixlen < 32)) {
873 		fib_magic(RTM_DELROUTE,
874 			  dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
875 			  any, ifa->ifa_prefixlen, prim);
876 		subnet = 1;
877 	}
878 
879 	/* Deletion is more complicated than add.
880 	 * We should take care of not to delete too much :-)
881 	 *
882 	 * Scan address list to be sure that addresses are really gone.
883 	 */
884 
885 	for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) {
886 		if (ifa1 == ifa) {
887 			/* promotion, keep the IP */
888 			gone = 0;
889 			continue;
890 		}
891 		/* Ignore IFAs from our subnet */
892 		if (iprim && ifa1->ifa_mask == iprim->ifa_mask &&
893 		    inet_ifa_match(ifa1->ifa_address, iprim))
894 			continue;
895 
896 		/* Ignore ifa1 if it uses different primary IP (prefsrc) */
897 		if (ifa1->ifa_flags & IFA_F_SECONDARY) {
898 			/* Another address from our subnet? */
899 			if (ifa1->ifa_mask == prim->ifa_mask &&
900 			    inet_ifa_match(ifa1->ifa_address, prim))
901 				prim1 = prim;
902 			else {
903 				/* We reached the secondaries, so
904 				 * same_prefsrc should be determined.
905 				 */
906 				if (!same_prefsrc)
907 					continue;
908 				/* Search new prim1 if ifa1 is not
909 				 * using the current prim1
910 				 */
911 				if (!prim1 ||
912 				    ifa1->ifa_mask != prim1->ifa_mask ||
913 				    !inet_ifa_match(ifa1->ifa_address, prim1))
914 					prim1 = inet_ifa_byprefix(in_dev,
915 							ifa1->ifa_address,
916 							ifa1->ifa_mask);
917 				if (!prim1)
918 					continue;
919 				if (prim1->ifa_local != prim->ifa_local)
920 					continue;
921 			}
922 		} else {
923 			if (prim->ifa_local != ifa1->ifa_local)
924 				continue;
925 			prim1 = ifa1;
926 			if (prim != prim1)
927 				same_prefsrc = 1;
928 		}
929 		if (ifa->ifa_local == ifa1->ifa_local)
930 			ok |= LOCAL_OK;
931 		if (ifa->ifa_broadcast == ifa1->ifa_broadcast)
932 			ok |= BRD_OK;
933 		if (brd == ifa1->ifa_broadcast)
934 			ok |= BRD1_OK;
935 		if (any == ifa1->ifa_broadcast)
936 			ok |= BRD0_OK;
937 		/* primary has network specific broadcasts */
938 		if (prim1 == ifa1 && ifa1->ifa_prefixlen < 31) {
939 			__be32 brd1 = ifa1->ifa_address | ~ifa1->ifa_mask;
940 			__be32 any1 = ifa1->ifa_address & ifa1->ifa_mask;
941 
942 			if (!ipv4_is_zeronet(any1)) {
943 				if (ifa->ifa_broadcast == brd1 ||
944 				    ifa->ifa_broadcast == any1)
945 					ok |= BRD_OK;
946 				if (brd == brd1 || brd == any1)
947 					ok |= BRD1_OK;
948 				if (any == brd1 || any == any1)
949 					ok |= BRD0_OK;
950 			}
951 		}
952 	}
953 
954 	if (!(ok & BRD_OK))
955 		fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
956 	if (subnet && ifa->ifa_prefixlen < 31) {
957 		if (!(ok & BRD1_OK))
958 			fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim);
959 		if (!(ok & BRD0_OK))
960 			fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim);
961 	}
962 	if (!(ok & LOCAL_OK)) {
963 		fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim);
964 
965 		/* Check, that this local address finally disappeared. */
966 		if (gone &&
967 		    inet_addr_type(dev_net(dev), ifa->ifa_local) != RTN_LOCAL) {
968 			/* And the last, but not the least thing.
969 			 * We must flush stray FIB entries.
970 			 *
971 			 * First of all, we scan fib_info list searching
972 			 * for stray nexthop entries, then ignite fib_flush.
973 			 */
974 			if (fib_sync_down_addr(dev_net(dev), ifa->ifa_local))
975 				fib_flush(dev_net(dev));
976 		}
977 	}
978 #undef LOCAL_OK
979 #undef BRD_OK
980 #undef BRD0_OK
981 #undef BRD1_OK
982 }
983 
984 static void nl_fib_lookup(struct net *net, struct fib_result_nl *frn)
985 {
986 
987 	struct fib_result       res;
988 	struct flowi4           fl4 = {
989 		.flowi4_mark = frn->fl_mark,
990 		.daddr = frn->fl_addr,
991 		.flowi4_tos = frn->fl_tos,
992 		.flowi4_scope = frn->fl_scope,
993 	};
994 	struct fib_table *tb;
995 
996 	rcu_read_lock();
997 
998 	tb = fib_get_table(net, frn->tb_id_in);
999 
1000 	frn->err = -ENOENT;
1001 	if (tb) {
1002 		local_bh_disable();
1003 
1004 		frn->tb_id = tb->tb_id;
1005 		frn->err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
1006 
1007 		if (!frn->err) {
1008 			frn->prefixlen = res.prefixlen;
1009 			frn->nh_sel = res.nh_sel;
1010 			frn->type = res.type;
1011 			frn->scope = res.scope;
1012 		}
1013 		local_bh_enable();
1014 	}
1015 
1016 	rcu_read_unlock();
1017 }
1018 
1019 static void nl_fib_input(struct sk_buff *skb)
1020 {
1021 	struct net *net;
1022 	struct fib_result_nl *frn;
1023 	struct nlmsghdr *nlh;
1024 	u32 portid;
1025 
1026 	net = sock_net(skb->sk);
1027 	nlh = nlmsg_hdr(skb);
1028 	if (skb->len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len ||
1029 	    nlmsg_len(nlh) < sizeof(*frn))
1030 		return;
1031 
1032 	skb = netlink_skb_clone(skb, GFP_KERNEL);
1033 	if (!skb)
1034 		return;
1035 	nlh = nlmsg_hdr(skb);
1036 
1037 	frn = (struct fib_result_nl *) nlmsg_data(nlh);
1038 	nl_fib_lookup(net, frn);
1039 
1040 	portid = NETLINK_CB(skb).portid;      /* netlink portid */
1041 	NETLINK_CB(skb).portid = 0;        /* from kernel */
1042 	NETLINK_CB(skb).dst_group = 0;  /* unicast */
1043 	netlink_unicast(net->ipv4.fibnl, skb, portid, MSG_DONTWAIT);
1044 }
1045 
1046 static int __net_init nl_fib_lookup_init(struct net *net)
1047 {
1048 	struct sock *sk;
1049 	struct netlink_kernel_cfg cfg = {
1050 		.input	= nl_fib_input,
1051 	};
1052 
1053 	sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, &cfg);
1054 	if (!sk)
1055 		return -EAFNOSUPPORT;
1056 	net->ipv4.fibnl = sk;
1057 	return 0;
1058 }
1059 
1060 static void nl_fib_lookup_exit(struct net *net)
1061 {
1062 	netlink_kernel_release(net->ipv4.fibnl);
1063 	net->ipv4.fibnl = NULL;
1064 }
1065 
/*
 * Take IPv4 routing down on @dev: sync nexthops for the event, flush the
 * FIB if fib_sync_down_dev() asks for it, then unconditionally flush the
 * route cache and call arp_ifdown() for the device.
 */
static void fib_disable_ip(struct net_device *dev, unsigned long event)
{
	struct net *net = dev_net(dev);

	if (fib_sync_down_dev(dev, event))
		fib_flush(net);

	rt_cache_flush(net);
	arp_ifdown(dev);
}
1073 
1074 static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr)
1075 {
1076 	struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
1077 	struct net_device *dev = ifa->ifa_dev->dev;
1078 	struct net *net = dev_net(dev);
1079 
1080 	switch (event) {
1081 	case NETDEV_UP:
1082 		fib_add_ifaddr(ifa);
1083 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1084 		fib_sync_up(dev, RTNH_F_DEAD);
1085 #endif
1086 		atomic_inc(&net->ipv4.dev_addr_genid);
1087 		rt_cache_flush(dev_net(dev));
1088 		break;
1089 	case NETDEV_DOWN:
1090 		fib_del_ifaddr(ifa, NULL);
1091 		atomic_inc(&net->ipv4.dev_addr_genid);
1092 		if (!ifa->ifa_dev->ifa_list) {
1093 			/* Last address was deleted from this interface.
1094 			 * Disable IP.
1095 			 */
1096 			fib_disable_ip(dev, event);
1097 		} else {
1098 			rt_cache_flush(dev_net(dev));
1099 		}
1100 		break;
1101 	}
1102 	return NOTIFY_DONE;
1103 }
1104 
1105 static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
1106 {
1107 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
1108 	struct in_device *in_dev;
1109 	struct net *net = dev_net(dev);
1110 	unsigned int flags;
1111 
1112 	if (event == NETDEV_UNREGISTER) {
1113 		fib_disable_ip(dev, event);
1114 		rt_flush_dev(dev);
1115 		return NOTIFY_DONE;
1116 	}
1117 
1118 	in_dev = __in_dev_get_rtnl(dev);
1119 	if (!in_dev)
1120 		return NOTIFY_DONE;
1121 
1122 	switch (event) {
1123 	case NETDEV_UP:
1124 		for_ifa(in_dev) {
1125 			fib_add_ifaddr(ifa);
1126 		} endfor_ifa(in_dev);
1127 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1128 		fib_sync_up(dev, RTNH_F_DEAD);
1129 #endif
1130 		atomic_inc(&net->ipv4.dev_addr_genid);
1131 		rt_cache_flush(net);
1132 		break;
1133 	case NETDEV_DOWN:
1134 		fib_disable_ip(dev, event);
1135 		break;
1136 	case NETDEV_CHANGE:
1137 		flags = dev_get_flags(dev);
1138 		if (flags & (IFF_RUNNING | IFF_LOWER_UP))
1139 			fib_sync_up(dev, RTNH_F_LINKDOWN);
1140 		else
1141 			fib_sync_down_dev(dev, event);
1142 		/* fall through */
1143 	case NETDEV_CHANGEMTU:
1144 		rt_cache_flush(net);
1145 		break;
1146 	}
1147 	return NOTIFY_DONE;
1148 }
1149 
1150 static struct notifier_block fib_inetaddr_notifier = {
1151 	.notifier_call = fib_inetaddr_event,
1152 };
1153 
1154 static struct notifier_block fib_netdev_notifier = {
1155 	.notifier_call = fib_netdev_event,
1156 };
1157 
1158 static int __net_init ip_fib_net_init(struct net *net)
1159 {
1160 	int err;
1161 	size_t size = sizeof(struct hlist_head) * FIB_TABLE_HASHSZ;
1162 
1163 	/* Avoid false sharing : Use at least a full cache line */
1164 	size = max_t(size_t, size, L1_CACHE_BYTES);
1165 
1166 	net->ipv4.fib_table_hash = kzalloc(size, GFP_KERNEL);
1167 	if (!net->ipv4.fib_table_hash)
1168 		return -ENOMEM;
1169 
1170 	err = fib4_rules_init(net);
1171 	if (err < 0)
1172 		goto fail;
1173 	return 0;
1174 
1175 fail:
1176 	kfree(net->ipv4.fib_table_hash);
1177 	return err;
1178 }
1179 
1180 static void ip_fib_net_exit(struct net *net)
1181 {
1182 	unsigned int i;
1183 
1184 	rtnl_lock();
1185 #ifdef CONFIG_IP_MULTIPLE_TABLES
1186 	RCU_INIT_POINTER(net->ipv4.fib_local, NULL);
1187 	RCU_INIT_POINTER(net->ipv4.fib_main, NULL);
1188 	RCU_INIT_POINTER(net->ipv4.fib_default, NULL);
1189 #endif
1190 	for (i = 0; i < FIB_TABLE_HASHSZ; i++) {
1191 		struct hlist_head *head = &net->ipv4.fib_table_hash[i];
1192 		struct hlist_node *tmp;
1193 		struct fib_table *tb;
1194 
1195 		hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) {
1196 			hlist_del(&tb->tb_hlist);
1197 			fib_table_flush(tb);
1198 			fib_free_table(tb);
1199 		}
1200 	}
1201 
1202 #ifdef CONFIG_IP_MULTIPLE_TABLES
1203 	fib4_rules_exit(net);
1204 #endif
1205 	rtnl_unlock();
1206 	kfree(net->ipv4.fib_table_hash);
1207 }
1208 
1209 static int __net_init fib_net_init(struct net *net)
1210 {
1211 	int error;
1212 
1213 #ifdef CONFIG_IP_ROUTE_CLASSID
1214 	net->ipv4.fib_num_tclassid_users = 0;
1215 #endif
1216 	error = ip_fib_net_init(net);
1217 	if (error < 0)
1218 		goto out;
1219 	error = nl_fib_lookup_init(net);
1220 	if (error < 0)
1221 		goto out_nlfl;
1222 	error = fib_proc_init(net);
1223 	if (error < 0)
1224 		goto out_proc;
1225 out:
1226 	return error;
1227 
1228 out_proc:
1229 	nl_fib_lookup_exit(net);
1230 out_nlfl:
1231 	ip_fib_net_exit(net);
1232 	goto out;
1233 }
1234 
1235 static void __net_exit fib_net_exit(struct net *net)
1236 {
1237 	fib_proc_exit(net);
1238 	nl_fib_lookup_exit(net);
1239 	ip_fib_net_exit(net);
1240 }
1241 
1242 static struct pernet_operations fib_net_ops = {
1243 	.init = fib_net_init,
1244 	.exit = fib_net_exit,
1245 };
1246 
1247 void __init ip_fib_init(void)
1248 {
1249 	rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL, NULL);
1250 	rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, NULL);
1251 	rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, NULL);
1252 
1253 	register_pernet_subsys(&fib_net_ops);
1254 	register_netdevice_notifier(&fib_netdev_notifier);
1255 	register_inetaddr_notifier(&fib_inetaddr_notifier);
1256 
1257 	fib_trie_init();
1258 }
1259