xref: /openbmc/linux/net/ipv4/fib_frontend.c (revision e8e0929d)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		IPv4 Forwarding Information Base: FIB frontend.
7  *
8  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
9  *
10  *		This program is free software; you can redistribute it and/or
11  *		modify it under the terms of the GNU General Public License
12  *		as published by the Free Software Foundation; either version
13  *		2 of the License, or (at your option) any later version.
14  */
15 
16 #include <linux/module.h>
17 #include <asm/uaccess.h>
18 #include <asm/system.h>
19 #include <linux/bitops.h>
20 #include <linux/capability.h>
21 #include <linux/types.h>
22 #include <linux/kernel.h>
23 #include <linux/mm.h>
24 #include <linux/string.h>
25 #include <linux/socket.h>
26 #include <linux/sockios.h>
27 #include <linux/errno.h>
28 #include <linux/in.h>
29 #include <linux/inet.h>
30 #include <linux/inetdevice.h>
31 #include <linux/netdevice.h>
32 #include <linux/if_addr.h>
33 #include <linux/if_arp.h>
34 #include <linux/skbuff.h>
35 #include <linux/init.h>
36 #include <linux/list.h>
37 
38 #include <net/ip.h>
39 #include <net/protocol.h>
40 #include <net/route.h>
41 #include <net/tcp.h>
42 #include <net/sock.h>
43 #include <net/arp.h>
44 #include <net/ip_fib.h>
45 #include <net/rtnetlink.h>
46 
47 #ifndef CONFIG_IP_MULTIPLE_TABLES
48 
49 static int __net_init fib4_rules_init(struct net *net)
50 {
51 	struct fib_table *local_table, *main_table;
52 
53 	local_table = fib_hash_table(RT_TABLE_LOCAL);
54 	if (local_table == NULL)
55 		return -ENOMEM;
56 
57 	main_table  = fib_hash_table(RT_TABLE_MAIN);
58 	if (main_table == NULL)
59 		goto fail;
60 
61 	hlist_add_head_rcu(&local_table->tb_hlist,
62 				&net->ipv4.fib_table_hash[TABLE_LOCAL_INDEX]);
63 	hlist_add_head_rcu(&main_table->tb_hlist,
64 				&net->ipv4.fib_table_hash[TABLE_MAIN_INDEX]);
65 	return 0;
66 
67 fail:
68 	kfree(local_table);
69 	return -ENOMEM;
70 }
71 #else
72 
73 struct fib_table *fib_new_table(struct net *net, u32 id)
74 {
75 	struct fib_table *tb;
76 	unsigned int h;
77 
78 	if (id == 0)
79 		id = RT_TABLE_MAIN;
80 	tb = fib_get_table(net, id);
81 	if (tb)
82 		return tb;
83 
84 	tb = fib_hash_table(id);
85 	if (!tb)
86 		return NULL;
87 	h = id & (FIB_TABLE_HASHSZ - 1);
88 	hlist_add_head_rcu(&tb->tb_hlist, &net->ipv4.fib_table_hash[h]);
89 	return tb;
90 }
91 
92 struct fib_table *fib_get_table(struct net *net, u32 id)
93 {
94 	struct fib_table *tb;
95 	struct hlist_node *node;
96 	struct hlist_head *head;
97 	unsigned int h;
98 
99 	if (id == 0)
100 		id = RT_TABLE_MAIN;
101 	h = id & (FIB_TABLE_HASHSZ - 1);
102 
103 	rcu_read_lock();
104 	head = &net->ipv4.fib_table_hash[h];
105 	hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
106 		if (tb->tb_id == id) {
107 			rcu_read_unlock();
108 			return tb;
109 		}
110 	}
111 	rcu_read_unlock();
112 	return NULL;
113 }
114 #endif /* CONFIG_IP_MULTIPLE_TABLES */
115 
116 void fib_select_default(struct net *net,
117 			const struct flowi *flp, struct fib_result *res)
118 {
119 	struct fib_table *tb;
120 	int table = RT_TABLE_MAIN;
121 #ifdef CONFIG_IP_MULTIPLE_TABLES
122 	if (res->r == NULL || res->r->action != FR_ACT_TO_TBL)
123 		return;
124 	table = res->r->table;
125 #endif
126 	tb = fib_get_table(net, table);
127 	if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
128 		tb->tb_select_default(tb, flp, res);
129 }
130 
131 static void fib_flush(struct net *net)
132 {
133 	int flushed = 0;
134 	struct fib_table *tb;
135 	struct hlist_node *node;
136 	struct hlist_head *head;
137 	unsigned int h;
138 
139 	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
140 		head = &net->ipv4.fib_table_hash[h];
141 		hlist_for_each_entry(tb, node, head, tb_hlist)
142 			flushed += tb->tb_flush(tb);
143 	}
144 
145 	if (flushed)
146 		rt_cache_flush(net, -1);
147 }
148 
149 /*
150  *	Find the first device with a given source address.
151  */
152 
153 struct net_device * ip_dev_find(struct net *net, __be32 addr)
154 {
155 	struct flowi fl = { .nl_u = { .ip4_u = { .daddr = addr } } };
156 	struct fib_result res;
157 	struct net_device *dev = NULL;
158 	struct fib_table *local_table;
159 
160 #ifdef CONFIG_IP_MULTIPLE_TABLES
161 	res.r = NULL;
162 #endif
163 
164 	local_table = fib_get_table(net, RT_TABLE_LOCAL);
165 	if (!local_table || local_table->tb_lookup(local_table, &fl, &res))
166 		return NULL;
167 	if (res.type != RTN_LOCAL)
168 		goto out;
169 	dev = FIB_RES_DEV(res);
170 
171 	if (dev)
172 		dev_hold(dev);
173 out:
174 	fib_res_put(&res);
175 	return dev;
176 }
177 
178 /*
179  * Find address type as if only "dev" was present in the system. If
180  * on_dev is NULL then all interfaces are taken into consideration.
181  */
182 static inline unsigned __inet_dev_addr_type(struct net *net,
183 					    const struct net_device *dev,
184 					    __be32 addr)
185 {
186 	struct flowi		fl = { .nl_u = { .ip4_u = { .daddr = addr } } };
187 	struct fib_result	res;
188 	unsigned ret = RTN_BROADCAST;
189 	struct fib_table *local_table;
190 
191 	if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr))
192 		return RTN_BROADCAST;
193 	if (ipv4_is_multicast(addr))
194 		return RTN_MULTICAST;
195 
196 #ifdef CONFIG_IP_MULTIPLE_TABLES
197 	res.r = NULL;
198 #endif
199 
200 	local_table = fib_get_table(net, RT_TABLE_LOCAL);
201 	if (local_table) {
202 		ret = RTN_UNICAST;
203 		if (!local_table->tb_lookup(local_table, &fl, &res)) {
204 			if (!dev || dev == res.fi->fib_dev)
205 				ret = res.type;
206 			fib_res_put(&res);
207 		}
208 	}
209 	return ret;
210 }
211 
212 unsigned int inet_addr_type(struct net *net, __be32 addr)
213 {
214 	return __inet_dev_addr_type(net, NULL, addr);
215 }
216 
217 unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
218 				__be32 addr)
219 {
220        return __inet_dev_addr_type(net, dev, addr);
221 }
222 
223 /* Given (packet source, input interface) and optional (dst, oif, tos):
224    - (main) check, that source is valid i.e. not broadcast or our local
225      address.
226    - figure out what "logical" interface this packet arrived
227      and calculate "specific destination" address.
228    - check, that packet arrived from expected physical interface.
229  */
230 
231 int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
232 			struct net_device *dev, __be32 *spec_dst, u32 *itag)
233 {
234 	struct in_device *in_dev;
235 	struct flowi fl = { .nl_u = { .ip4_u =
236 				      { .daddr = src,
237 					.saddr = dst,
238 					.tos = tos } },
239 			    .iif = oif };
240 	struct fib_result res;
241 	int no_addr, rpf;
242 	int ret;
243 	struct net *net;
244 
245 	no_addr = rpf = 0;
246 	rcu_read_lock();
247 	in_dev = __in_dev_get_rcu(dev);
248 	if (in_dev) {
249 		no_addr = in_dev->ifa_list == NULL;
250 		rpf = IN_DEV_RPFILTER(in_dev);
251 	}
252 	rcu_read_unlock();
253 
254 	if (in_dev == NULL)
255 		goto e_inval;
256 
257 	net = dev_net(dev);
258 	if (fib_lookup(net, &fl, &res))
259 		goto last_resort;
260 	if (res.type != RTN_UNICAST)
261 		goto e_inval_res;
262 	*spec_dst = FIB_RES_PREFSRC(res);
263 	fib_combine_itag(itag, &res);
264 #ifdef CONFIG_IP_ROUTE_MULTIPATH
265 	if (FIB_RES_DEV(res) == dev || res.fi->fib_nhs > 1)
266 #else
267 	if (FIB_RES_DEV(res) == dev)
268 #endif
269 	{
270 		ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
271 		fib_res_put(&res);
272 		return ret;
273 	}
274 	fib_res_put(&res);
275 	if (no_addr)
276 		goto last_resort;
277 	if (rpf == 1)
278 		goto e_inval;
279 	fl.oif = dev->ifindex;
280 
281 	ret = 0;
282 	if (fib_lookup(net, &fl, &res) == 0) {
283 		if (res.type == RTN_UNICAST) {
284 			*spec_dst = FIB_RES_PREFSRC(res);
285 			ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
286 		}
287 		fib_res_put(&res);
288 	}
289 	return ret;
290 
291 last_resort:
292 	if (rpf)
293 		goto e_inval;
294 	*spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
295 	*itag = 0;
296 	return 0;
297 
298 e_inval_res:
299 	fib_res_put(&res);
300 e_inval:
301 	return -EINVAL;
302 }
303 
304 static inline __be32 sk_extract_addr(struct sockaddr *addr)
305 {
306 	return ((struct sockaddr_in *) addr)->sin_addr.s_addr;
307 }
308 
309 static int put_rtax(struct nlattr *mx, int len, int type, u32 value)
310 {
311 	struct nlattr *nla;
312 
313 	nla = (struct nlattr *) ((char *) mx + len);
314 	nla->nla_type = type;
315 	nla->nla_len = nla_attr_size(4);
316 	*(u32 *) nla_data(nla) = value;
317 
318 	return len + nla_total_size(4);
319 }
320 
321 static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt,
322 				 struct fib_config *cfg)
323 {
324 	__be32 addr;
325 	int plen;
326 
327 	memset(cfg, 0, sizeof(*cfg));
328 	cfg->fc_nlinfo.nl_net = net;
329 
330 	if (rt->rt_dst.sa_family != AF_INET)
331 		return -EAFNOSUPPORT;
332 
333 	/*
334 	 * Check mask for validity:
335 	 * a) it must be contiguous.
336 	 * b) destination must have all host bits clear.
337 	 * c) if application forgot to set correct family (AF_INET),
338 	 *    reject request unless it is absolutely clear i.e.
339 	 *    both family and mask are zero.
340 	 */
341 	plen = 32;
342 	addr = sk_extract_addr(&rt->rt_dst);
343 	if (!(rt->rt_flags & RTF_HOST)) {
344 		__be32 mask = sk_extract_addr(&rt->rt_genmask);
345 
346 		if (rt->rt_genmask.sa_family != AF_INET) {
347 			if (mask || rt->rt_genmask.sa_family)
348 				return -EAFNOSUPPORT;
349 		}
350 
351 		if (bad_mask(mask, addr))
352 			return -EINVAL;
353 
354 		plen = inet_mask_len(mask);
355 	}
356 
357 	cfg->fc_dst_len = plen;
358 	cfg->fc_dst = addr;
359 
360 	if (cmd != SIOCDELRT) {
361 		cfg->fc_nlflags = NLM_F_CREATE;
362 		cfg->fc_protocol = RTPROT_BOOT;
363 	}
364 
365 	if (rt->rt_metric)
366 		cfg->fc_priority = rt->rt_metric - 1;
367 
368 	if (rt->rt_flags & RTF_REJECT) {
369 		cfg->fc_scope = RT_SCOPE_HOST;
370 		cfg->fc_type = RTN_UNREACHABLE;
371 		return 0;
372 	}
373 
374 	cfg->fc_scope = RT_SCOPE_NOWHERE;
375 	cfg->fc_type = RTN_UNICAST;
376 
377 	if (rt->rt_dev) {
378 		char *colon;
379 		struct net_device *dev;
380 		char devname[IFNAMSIZ];
381 
382 		if (copy_from_user(devname, rt->rt_dev, IFNAMSIZ-1))
383 			return -EFAULT;
384 
385 		devname[IFNAMSIZ-1] = 0;
386 		colon = strchr(devname, ':');
387 		if (colon)
388 			*colon = 0;
389 		dev = __dev_get_by_name(net, devname);
390 		if (!dev)
391 			return -ENODEV;
392 		cfg->fc_oif = dev->ifindex;
393 		if (colon) {
394 			struct in_ifaddr *ifa;
395 			struct in_device *in_dev = __in_dev_get_rtnl(dev);
396 			if (!in_dev)
397 				return -ENODEV;
398 			*colon = ':';
399 			for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next)
400 				if (strcmp(ifa->ifa_label, devname) == 0)
401 					break;
402 			if (ifa == NULL)
403 				return -ENODEV;
404 			cfg->fc_prefsrc = ifa->ifa_local;
405 		}
406 	}
407 
408 	addr = sk_extract_addr(&rt->rt_gateway);
409 	if (rt->rt_gateway.sa_family == AF_INET && addr) {
410 		cfg->fc_gw = addr;
411 		if (rt->rt_flags & RTF_GATEWAY &&
412 		    inet_addr_type(net, addr) == RTN_UNICAST)
413 			cfg->fc_scope = RT_SCOPE_UNIVERSE;
414 	}
415 
416 	if (cmd == SIOCDELRT)
417 		return 0;
418 
419 	if (rt->rt_flags & RTF_GATEWAY && !cfg->fc_gw)
420 		return -EINVAL;
421 
422 	if (cfg->fc_scope == RT_SCOPE_NOWHERE)
423 		cfg->fc_scope = RT_SCOPE_LINK;
424 
425 	if (rt->rt_flags & (RTF_MTU | RTF_WINDOW | RTF_IRTT)) {
426 		struct nlattr *mx;
427 		int len = 0;
428 
429 		mx = kzalloc(3 * nla_total_size(4), GFP_KERNEL);
430 		if (mx == NULL)
431 			return -ENOMEM;
432 
433 		if (rt->rt_flags & RTF_MTU)
434 			len = put_rtax(mx, len, RTAX_ADVMSS, rt->rt_mtu - 40);
435 
436 		if (rt->rt_flags & RTF_WINDOW)
437 			len = put_rtax(mx, len, RTAX_WINDOW, rt->rt_window);
438 
439 		if (rt->rt_flags & RTF_IRTT)
440 			len = put_rtax(mx, len, RTAX_RTT, rt->rt_irtt << 3);
441 
442 		cfg->fc_mx = mx;
443 		cfg->fc_mx_len = len;
444 	}
445 
446 	return 0;
447 }
448 
449 /*
450  *	Handle IP routing ioctl calls. These are used to manipulate the routing tables
451  */
452 
453 int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg)
454 {
455 	struct fib_config cfg;
456 	struct rtentry rt;
457 	int err;
458 
459 	switch (cmd) {
460 	case SIOCADDRT:		/* Add a route */
461 	case SIOCDELRT:		/* Delete a route */
462 		if (!capable(CAP_NET_ADMIN))
463 			return -EPERM;
464 
465 		if (copy_from_user(&rt, arg, sizeof(rt)))
466 			return -EFAULT;
467 
468 		rtnl_lock();
469 		err = rtentry_to_fib_config(net, cmd, &rt, &cfg);
470 		if (err == 0) {
471 			struct fib_table *tb;
472 
473 			if (cmd == SIOCDELRT) {
474 				tb = fib_get_table(net, cfg.fc_table);
475 				if (tb)
476 					err = tb->tb_delete(tb, &cfg);
477 				else
478 					err = -ESRCH;
479 			} else {
480 				tb = fib_new_table(net, cfg.fc_table);
481 				if (tb)
482 					err = tb->tb_insert(tb, &cfg);
483 				else
484 					err = -ENOBUFS;
485 			}
486 
487 			/* allocated by rtentry_to_fib_config() */
488 			kfree(cfg.fc_mx);
489 		}
490 		rtnl_unlock();
491 		return err;
492 	}
493 	return -EINVAL;
494 }
495 
496 const struct nla_policy rtm_ipv4_policy[RTA_MAX+1] = {
497 	[RTA_DST]		= { .type = NLA_U32 },
498 	[RTA_SRC]		= { .type = NLA_U32 },
499 	[RTA_IIF]		= { .type = NLA_U32 },
500 	[RTA_OIF]		= { .type = NLA_U32 },
501 	[RTA_GATEWAY]		= { .type = NLA_U32 },
502 	[RTA_PRIORITY]		= { .type = NLA_U32 },
503 	[RTA_PREFSRC]		= { .type = NLA_U32 },
504 	[RTA_METRICS]		= { .type = NLA_NESTED },
505 	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
506 	[RTA_FLOW]		= { .type = NLA_U32 },
507 };
508 
509 static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
510 			    struct nlmsghdr *nlh, struct fib_config *cfg)
511 {
512 	struct nlattr *attr;
513 	int err, remaining;
514 	struct rtmsg *rtm;
515 
516 	err = nlmsg_validate(nlh, sizeof(*rtm), RTA_MAX, rtm_ipv4_policy);
517 	if (err < 0)
518 		goto errout;
519 
520 	memset(cfg, 0, sizeof(*cfg));
521 
522 	rtm = nlmsg_data(nlh);
523 	cfg->fc_dst_len = rtm->rtm_dst_len;
524 	cfg->fc_tos = rtm->rtm_tos;
525 	cfg->fc_table = rtm->rtm_table;
526 	cfg->fc_protocol = rtm->rtm_protocol;
527 	cfg->fc_scope = rtm->rtm_scope;
528 	cfg->fc_type = rtm->rtm_type;
529 	cfg->fc_flags = rtm->rtm_flags;
530 	cfg->fc_nlflags = nlh->nlmsg_flags;
531 
532 	cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
533 	cfg->fc_nlinfo.nlh = nlh;
534 	cfg->fc_nlinfo.nl_net = net;
535 
536 	if (cfg->fc_type > RTN_MAX) {
537 		err = -EINVAL;
538 		goto errout;
539 	}
540 
541 	nlmsg_for_each_attr(attr, nlh, sizeof(struct rtmsg), remaining) {
542 		switch (nla_type(attr)) {
543 		case RTA_DST:
544 			cfg->fc_dst = nla_get_be32(attr);
545 			break;
546 		case RTA_OIF:
547 			cfg->fc_oif = nla_get_u32(attr);
548 			break;
549 		case RTA_GATEWAY:
550 			cfg->fc_gw = nla_get_be32(attr);
551 			break;
552 		case RTA_PRIORITY:
553 			cfg->fc_priority = nla_get_u32(attr);
554 			break;
555 		case RTA_PREFSRC:
556 			cfg->fc_prefsrc = nla_get_be32(attr);
557 			break;
558 		case RTA_METRICS:
559 			cfg->fc_mx = nla_data(attr);
560 			cfg->fc_mx_len = nla_len(attr);
561 			break;
562 		case RTA_MULTIPATH:
563 			cfg->fc_mp = nla_data(attr);
564 			cfg->fc_mp_len = nla_len(attr);
565 			break;
566 		case RTA_FLOW:
567 			cfg->fc_flow = nla_get_u32(attr);
568 			break;
569 		case RTA_TABLE:
570 			cfg->fc_table = nla_get_u32(attr);
571 			break;
572 		}
573 	}
574 
575 	return 0;
576 errout:
577 	return err;
578 }
579 
580 static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
581 {
582 	struct net *net = sock_net(skb->sk);
583 	struct fib_config cfg;
584 	struct fib_table *tb;
585 	int err;
586 
587 	err = rtm_to_fib_config(net, skb, nlh, &cfg);
588 	if (err < 0)
589 		goto errout;
590 
591 	tb = fib_get_table(net, cfg.fc_table);
592 	if (tb == NULL) {
593 		err = -ESRCH;
594 		goto errout;
595 	}
596 
597 	err = tb->tb_delete(tb, &cfg);
598 errout:
599 	return err;
600 }
601 
602 static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
603 {
604 	struct net *net = sock_net(skb->sk);
605 	struct fib_config cfg;
606 	struct fib_table *tb;
607 	int err;
608 
609 	err = rtm_to_fib_config(net, skb, nlh, &cfg);
610 	if (err < 0)
611 		goto errout;
612 
613 	tb = fib_new_table(net, cfg.fc_table);
614 	if (tb == NULL) {
615 		err = -ENOBUFS;
616 		goto errout;
617 	}
618 
619 	err = tb->tb_insert(tb, &cfg);
620 errout:
621 	return err;
622 }
623 
624 static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
625 {
626 	struct net *net = sock_net(skb->sk);
627 	unsigned int h, s_h;
628 	unsigned int e = 0, s_e;
629 	struct fib_table *tb;
630 	struct hlist_node *node;
631 	struct hlist_head *head;
632 	int dumped = 0;
633 
634 	if (nlmsg_len(cb->nlh) >= sizeof(struct rtmsg) &&
635 	    ((struct rtmsg *) nlmsg_data(cb->nlh))->rtm_flags & RTM_F_CLONED)
636 		return ip_rt_dump(skb, cb);
637 
638 	s_h = cb->args[0];
639 	s_e = cb->args[1];
640 
641 	for (h = s_h; h < FIB_TABLE_HASHSZ; h++, s_e = 0) {
642 		e = 0;
643 		head = &net->ipv4.fib_table_hash[h];
644 		hlist_for_each_entry(tb, node, head, tb_hlist) {
645 			if (e < s_e)
646 				goto next;
647 			if (dumped)
648 				memset(&cb->args[2], 0, sizeof(cb->args) -
649 						 2 * sizeof(cb->args[0]));
650 			if (tb->tb_dump(tb, skb, cb) < 0)
651 				goto out;
652 			dumped = 1;
653 next:
654 			e++;
655 		}
656 	}
657 out:
658 	cb->args[1] = e;
659 	cb->args[0] = h;
660 
661 	return skb->len;
662 }
663 
664 /* Prepare and feed intra-kernel routing request.
665    Really, it should be netlink message, but :-( netlink
666    can be not configured, so that we feed it directly
667    to fib engine. It is legal, because all events occur
668    only when netlink is already locked.
669  */
670 
671 static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa)
672 {
673 	struct net *net = dev_net(ifa->ifa_dev->dev);
674 	struct fib_table *tb;
675 	struct fib_config cfg = {
676 		.fc_protocol = RTPROT_KERNEL,
677 		.fc_type = type,
678 		.fc_dst = dst,
679 		.fc_dst_len = dst_len,
680 		.fc_prefsrc = ifa->ifa_local,
681 		.fc_oif = ifa->ifa_dev->dev->ifindex,
682 		.fc_nlflags = NLM_F_CREATE | NLM_F_APPEND,
683 		.fc_nlinfo = {
684 			.nl_net = net,
685 		},
686 	};
687 
688 	if (type == RTN_UNICAST)
689 		tb = fib_new_table(net, RT_TABLE_MAIN);
690 	else
691 		tb = fib_new_table(net, RT_TABLE_LOCAL);
692 
693 	if (tb == NULL)
694 		return;
695 
696 	cfg.fc_table = tb->tb_id;
697 
698 	if (type != RTN_LOCAL)
699 		cfg.fc_scope = RT_SCOPE_LINK;
700 	else
701 		cfg.fc_scope = RT_SCOPE_HOST;
702 
703 	if (cmd == RTM_NEWROUTE)
704 		tb->tb_insert(tb, &cfg);
705 	else
706 		tb->tb_delete(tb, &cfg);
707 }
708 
709 void fib_add_ifaddr(struct in_ifaddr *ifa)
710 {
711 	struct in_device *in_dev = ifa->ifa_dev;
712 	struct net_device *dev = in_dev->dev;
713 	struct in_ifaddr *prim = ifa;
714 	__be32 mask = ifa->ifa_mask;
715 	__be32 addr = ifa->ifa_local;
716 	__be32 prefix = ifa->ifa_address&mask;
717 
718 	if (ifa->ifa_flags&IFA_F_SECONDARY) {
719 		prim = inet_ifa_byprefix(in_dev, prefix, mask);
720 		if (prim == NULL) {
721 			printk(KERN_WARNING "fib_add_ifaddr: bug: prim == NULL\n");
722 			return;
723 		}
724 	}
725 
726 	fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim);
727 
728 	if (!(dev->flags&IFF_UP))
729 		return;
730 
731 	/* Add broadcast address, if it is explicitly assigned. */
732 	if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF))
733 		fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
734 
735 	if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags&IFA_F_SECONDARY) &&
736 	    (prefix != addr || ifa->ifa_prefixlen < 32)) {
737 		fib_magic(RTM_NEWROUTE, dev->flags&IFF_LOOPBACK ? RTN_LOCAL :
738 			  RTN_UNICAST, prefix, ifa->ifa_prefixlen, prim);
739 
740 		/* Add network specific broadcasts, when it takes a sense */
741 		if (ifa->ifa_prefixlen < 31) {
742 			fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32, prim);
743 			fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix|~mask, 32, prim);
744 		}
745 	}
746 }
747 
748 static void fib_del_ifaddr(struct in_ifaddr *ifa)
749 {
750 	struct in_device *in_dev = ifa->ifa_dev;
751 	struct net_device *dev = in_dev->dev;
752 	struct in_ifaddr *ifa1;
753 	struct in_ifaddr *prim = ifa;
754 	__be32 brd = ifa->ifa_address|~ifa->ifa_mask;
755 	__be32 any = ifa->ifa_address&ifa->ifa_mask;
756 #define LOCAL_OK	1
757 #define BRD_OK		2
758 #define BRD0_OK		4
759 #define BRD1_OK		8
760 	unsigned ok = 0;
761 
762 	if (!(ifa->ifa_flags&IFA_F_SECONDARY))
763 		fib_magic(RTM_DELROUTE, dev->flags&IFF_LOOPBACK ? RTN_LOCAL :
764 			  RTN_UNICAST, any, ifa->ifa_prefixlen, prim);
765 	else {
766 		prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask);
767 		if (prim == NULL) {
768 			printk(KERN_WARNING "fib_del_ifaddr: bug: prim == NULL\n");
769 			return;
770 		}
771 	}
772 
773 	/* Deletion is more complicated than add.
774 	   We should take care of not to delete too much :-)
775 
776 	   Scan address list to be sure that addresses are really gone.
777 	 */
778 
779 	for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) {
780 		if (ifa->ifa_local == ifa1->ifa_local)
781 			ok |= LOCAL_OK;
782 		if (ifa->ifa_broadcast == ifa1->ifa_broadcast)
783 			ok |= BRD_OK;
784 		if (brd == ifa1->ifa_broadcast)
785 			ok |= BRD1_OK;
786 		if (any == ifa1->ifa_broadcast)
787 			ok |= BRD0_OK;
788 	}
789 
790 	if (!(ok&BRD_OK))
791 		fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
792 	if (!(ok&BRD1_OK))
793 		fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim);
794 	if (!(ok&BRD0_OK))
795 		fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim);
796 	if (!(ok&LOCAL_OK)) {
797 		fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim);
798 
799 		/* Check, that this local address finally disappeared. */
800 		if (inet_addr_type(dev_net(dev), ifa->ifa_local) != RTN_LOCAL) {
801 			/* And the last, but not the least thing.
802 			   We must flush stray FIB entries.
803 
804 			   First of all, we scan fib_info list searching
805 			   for stray nexthop entries, then ignite fib_flush.
806 			*/
807 			if (fib_sync_down_addr(dev_net(dev), ifa->ifa_local))
808 				fib_flush(dev_net(dev));
809 		}
810 	}
811 #undef LOCAL_OK
812 #undef BRD_OK
813 #undef BRD0_OK
814 #undef BRD1_OK
815 }
816 
817 static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb )
818 {
819 
820 	struct fib_result       res;
821 	struct flowi            fl = { .mark = frn->fl_mark,
822 				       .nl_u = { .ip4_u = { .daddr = frn->fl_addr,
823 							    .tos = frn->fl_tos,
824 							    .scope = frn->fl_scope } } };
825 
826 #ifdef CONFIG_IP_MULTIPLE_TABLES
827 	res.r = NULL;
828 #endif
829 
830 	frn->err = -ENOENT;
831 	if (tb) {
832 		local_bh_disable();
833 
834 		frn->tb_id = tb->tb_id;
835 		frn->err = tb->tb_lookup(tb, &fl, &res);
836 
837 		if (!frn->err) {
838 			frn->prefixlen = res.prefixlen;
839 			frn->nh_sel = res.nh_sel;
840 			frn->type = res.type;
841 			frn->scope = res.scope;
842 			fib_res_put(&res);
843 		}
844 		local_bh_enable();
845 	}
846 }
847 
848 static void nl_fib_input(struct sk_buff *skb)
849 {
850 	struct net *net;
851 	struct fib_result_nl *frn;
852 	struct nlmsghdr *nlh;
853 	struct fib_table *tb;
854 	u32 pid;
855 
856 	net = sock_net(skb->sk);
857 	nlh = nlmsg_hdr(skb);
858 	if (skb->len < NLMSG_SPACE(0) || skb->len < nlh->nlmsg_len ||
859 	    nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*frn)))
860 		return;
861 
862 	skb = skb_clone(skb, GFP_KERNEL);
863 	if (skb == NULL)
864 		return;
865 	nlh = nlmsg_hdr(skb);
866 
867 	frn = (struct fib_result_nl *) NLMSG_DATA(nlh);
868 	tb = fib_get_table(net, frn->tb_id_in);
869 
870 	nl_fib_lookup(frn, tb);
871 
872 	pid = NETLINK_CB(skb).pid;       /* pid of sending process */
873 	NETLINK_CB(skb).pid = 0;         /* from kernel */
874 	NETLINK_CB(skb).dst_group = 0;  /* unicast */
875 	netlink_unicast(net->ipv4.fibnl, skb, pid, MSG_DONTWAIT);
876 }
877 
878 static int nl_fib_lookup_init(struct net *net)
879 {
880 	struct sock *sk;
881 	sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, 0,
882 				   nl_fib_input, NULL, THIS_MODULE);
883 	if (sk == NULL)
884 		return -EAFNOSUPPORT;
885 	net->ipv4.fibnl = sk;
886 	return 0;
887 }
888 
889 static void nl_fib_lookup_exit(struct net *net)
890 {
891 	netlink_kernel_release(net->ipv4.fibnl);
892 	net->ipv4.fibnl = NULL;
893 }
894 
/*
 * Take the routes through @dev down: sync nexthops using the device
 * (flushing the tables if any were affected), flush the routing cache and
 * notify ARP that the interface went down.  @force is passed through to
 * fib_sync_down_dev().
 */
static void fib_disable_ip(struct net_device *dev, int force)
{
	int affected = fib_sync_down_dev(dev, force);

	if (affected)
		fib_flush(dev_net(dev));

	rt_cache_flush(dev_net(dev), 0);
	arp_ifdown(dev);
}
902 
903 static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr)
904 {
905 	struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
906 	struct net_device *dev = ifa->ifa_dev->dev;
907 
908 	switch (event) {
909 	case NETDEV_UP:
910 		fib_add_ifaddr(ifa);
911 #ifdef CONFIG_IP_ROUTE_MULTIPATH
912 		fib_sync_up(dev);
913 #endif
914 		rt_cache_flush(dev_net(dev), -1);
915 		break;
916 	case NETDEV_DOWN:
917 		fib_del_ifaddr(ifa);
918 		if (ifa->ifa_dev->ifa_list == NULL) {
919 			/* Last address was deleted from this interface.
920 			   Disable IP.
921 			 */
922 			fib_disable_ip(dev, 1);
923 		} else {
924 			rt_cache_flush(dev_net(dev), -1);
925 		}
926 		break;
927 	}
928 	return NOTIFY_DONE;
929 }
930 
931 static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
932 {
933 	struct net_device *dev = ptr;
934 	struct in_device *in_dev = __in_dev_get_rtnl(dev);
935 
936 	if (event == NETDEV_UNREGISTER) {
937 		fib_disable_ip(dev, 2);
938 		return NOTIFY_DONE;
939 	}
940 
941 	if (!in_dev)
942 		return NOTIFY_DONE;
943 
944 	switch (event) {
945 	case NETDEV_UP:
946 		for_ifa(in_dev) {
947 			fib_add_ifaddr(ifa);
948 		} endfor_ifa(in_dev);
949 #ifdef CONFIG_IP_ROUTE_MULTIPATH
950 		fib_sync_up(dev);
951 #endif
952 		rt_cache_flush(dev_net(dev), -1);
953 		break;
954 	case NETDEV_DOWN:
955 		fib_disable_ip(dev, 0);
956 		break;
957 	case NETDEV_CHANGEMTU:
958 	case NETDEV_CHANGE:
959 		rt_cache_flush(dev_net(dev), 0);
960 		break;
961 	}
962 	return NOTIFY_DONE;
963 }
964 
965 static struct notifier_block fib_inetaddr_notifier = {
966 	.notifier_call = fib_inetaddr_event,
967 };
968 
969 static struct notifier_block fib_netdev_notifier = {
970 	.notifier_call = fib_netdev_event,
971 };
972 
973 static int __net_init ip_fib_net_init(struct net *net)
974 {
975 	int err;
976 	unsigned int i;
977 
978 	net->ipv4.fib_table_hash = kzalloc(
979 			sizeof(struct hlist_head)*FIB_TABLE_HASHSZ, GFP_KERNEL);
980 	if (net->ipv4.fib_table_hash == NULL)
981 		return -ENOMEM;
982 
983 	for (i = 0; i < FIB_TABLE_HASHSZ; i++)
984 		INIT_HLIST_HEAD(&net->ipv4.fib_table_hash[i]);
985 
986 	err = fib4_rules_init(net);
987 	if (err < 0)
988 		goto fail;
989 	return 0;
990 
991 fail:
992 	kfree(net->ipv4.fib_table_hash);
993 	return err;
994 }
995 
996 static void __net_exit ip_fib_net_exit(struct net *net)
997 {
998 	unsigned int i;
999 
1000 #ifdef CONFIG_IP_MULTIPLE_TABLES
1001 	fib4_rules_exit(net);
1002 #endif
1003 
1004 	for (i = 0; i < FIB_TABLE_HASHSZ; i++) {
1005 		struct fib_table *tb;
1006 		struct hlist_head *head;
1007 		struct hlist_node *node, *tmp;
1008 
1009 		head = &net->ipv4.fib_table_hash[i];
1010 		hlist_for_each_entry_safe(tb, node, tmp, head, tb_hlist) {
1011 			hlist_del(node);
1012 			tb->tb_flush(tb);
1013 			kfree(tb);
1014 		}
1015 	}
1016 	kfree(net->ipv4.fib_table_hash);
1017 }
1018 
1019 static int __net_init fib_net_init(struct net *net)
1020 {
1021 	int error;
1022 
1023 	error = ip_fib_net_init(net);
1024 	if (error < 0)
1025 		goto out;
1026 	error = nl_fib_lookup_init(net);
1027 	if (error < 0)
1028 		goto out_nlfl;
1029 	error = fib_proc_init(net);
1030 	if (error < 0)
1031 		goto out_proc;
1032 out:
1033 	return error;
1034 
1035 out_proc:
1036 	nl_fib_lookup_exit(net);
1037 out_nlfl:
1038 	ip_fib_net_exit(net);
1039 	goto out;
1040 }
1041 
1042 static void __net_exit fib_net_exit(struct net *net)
1043 {
1044 	fib_proc_exit(net);
1045 	nl_fib_lookup_exit(net);
1046 	ip_fib_net_exit(net);
1047 }
1048 
1049 static struct pernet_operations fib_net_ops = {
1050 	.init = fib_net_init,
1051 	.exit = fib_net_exit,
1052 };
1053 
1054 void __init ip_fib_init(void)
1055 {
1056 	rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL);
1057 	rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL);
1058 	rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib);
1059 
1060 	register_pernet_subsys(&fib_net_ops);
1061 	register_netdevice_notifier(&fib_netdev_notifier);
1062 	register_inetaddr_notifier(&fib_inetaddr_notifier);
1063 
1064 	fib_hash_init();
1065 }
1066 
1067 EXPORT_SYMBOL(inet_addr_type);
1068 EXPORT_SYMBOL(inet_dev_addr_type);
1069 EXPORT_SYMBOL(ip_dev_find);
1070