xref: /openbmc/linux/net/ipv4/fib_frontend.c (revision 93dc544c)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		IPv4 Forwarding Information Base: FIB frontend.
7  *
8  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
9  *
10  *		This program is free software; you can redistribute it and/or
11  *		modify it under the terms of the GNU General Public License
12  *		as published by the Free Software Foundation; either version
13  *		2 of the License, or (at your option) any later version.
14  */
15 
16 #include <linux/module.h>
17 #include <asm/uaccess.h>
18 #include <asm/system.h>
19 #include <linux/bitops.h>
20 #include <linux/capability.h>
21 #include <linux/types.h>
22 #include <linux/kernel.h>
23 #include <linux/mm.h>
24 #include <linux/string.h>
25 #include <linux/socket.h>
26 #include <linux/sockios.h>
27 #include <linux/errno.h>
28 #include <linux/in.h>
29 #include <linux/inet.h>
30 #include <linux/inetdevice.h>
31 #include <linux/netdevice.h>
32 #include <linux/if_addr.h>
33 #include <linux/if_arp.h>
34 #include <linux/skbuff.h>
35 #include <linux/init.h>
36 #include <linux/list.h>
37 
38 #include <net/ip.h>
39 #include <net/protocol.h>
40 #include <net/route.h>
41 #include <net/tcp.h>
42 #include <net/sock.h>
43 #include <net/icmp.h>
44 #include <net/arp.h>
45 #include <net/ip_fib.h>
46 #include <net/rtnetlink.h>
47 
48 #ifndef CONFIG_IP_MULTIPLE_TABLES
49 
50 static int __net_init fib4_rules_init(struct net *net)
51 {
52 	struct fib_table *local_table, *main_table;
53 
54 	local_table = fib_hash_table(RT_TABLE_LOCAL);
55 	if (local_table == NULL)
56 		return -ENOMEM;
57 
58 	main_table  = fib_hash_table(RT_TABLE_MAIN);
59 	if (main_table == NULL)
60 		goto fail;
61 
62 	hlist_add_head_rcu(&local_table->tb_hlist,
63 				&net->ipv4.fib_table_hash[TABLE_LOCAL_INDEX]);
64 	hlist_add_head_rcu(&main_table->tb_hlist,
65 				&net->ipv4.fib_table_hash[TABLE_MAIN_INDEX]);
66 	return 0;
67 
68 fail:
69 	kfree(local_table);
70 	return -ENOMEM;
71 }
72 #else
73 
74 struct fib_table *fib_new_table(struct net *net, u32 id)
75 {
76 	struct fib_table *tb;
77 	unsigned int h;
78 
79 	if (id == 0)
80 		id = RT_TABLE_MAIN;
81 	tb = fib_get_table(net, id);
82 	if (tb)
83 		return tb;
84 
85 	tb = fib_hash_table(id);
86 	if (!tb)
87 		return NULL;
88 	h = id & (FIB_TABLE_HASHSZ - 1);
89 	hlist_add_head_rcu(&tb->tb_hlist, &net->ipv4.fib_table_hash[h]);
90 	return tb;
91 }
92 
93 struct fib_table *fib_get_table(struct net *net, u32 id)
94 {
95 	struct fib_table *tb;
96 	struct hlist_node *node;
97 	struct hlist_head *head;
98 	unsigned int h;
99 
100 	if (id == 0)
101 		id = RT_TABLE_MAIN;
102 	h = id & (FIB_TABLE_HASHSZ - 1);
103 
104 	rcu_read_lock();
105 	head = &net->ipv4.fib_table_hash[h];
106 	hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
107 		if (tb->tb_id == id) {
108 			rcu_read_unlock();
109 			return tb;
110 		}
111 	}
112 	rcu_read_unlock();
113 	return NULL;
114 }
115 #endif /* CONFIG_IP_MULTIPLE_TABLES */
116 
117 void fib_select_default(struct net *net,
118 			const struct flowi *flp, struct fib_result *res)
119 {
120 	struct fib_table *tb;
121 	int table = RT_TABLE_MAIN;
122 #ifdef CONFIG_IP_MULTIPLE_TABLES
123 	if (res->r == NULL || res->r->action != FR_ACT_TO_TBL)
124 		return;
125 	table = res->r->table;
126 #endif
127 	tb = fib_get_table(net, table);
128 	if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
129 		tb->tb_select_default(tb, flp, res);
130 }
131 
132 static void fib_flush(struct net *net)
133 {
134 	int flushed = 0;
135 	struct fib_table *tb;
136 	struct hlist_node *node;
137 	struct hlist_head *head;
138 	unsigned int h;
139 
140 	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
141 		head = &net->ipv4.fib_table_hash[h];
142 		hlist_for_each_entry(tb, node, head, tb_hlist)
143 			flushed += tb->tb_flush(tb);
144 	}
145 
146 	if (flushed)
147 		rt_cache_flush(net, -1);
148 }
149 
150 /*
151  *	Find the first device with a given source address.
152  */
153 
154 struct net_device * ip_dev_find(struct net *net, __be32 addr)
155 {
156 	struct flowi fl = { .nl_u = { .ip4_u = { .daddr = addr } } };
157 	struct fib_result res;
158 	struct net_device *dev = NULL;
159 	struct fib_table *local_table;
160 
161 #ifdef CONFIG_IP_MULTIPLE_TABLES
162 	res.r = NULL;
163 #endif
164 
165 	local_table = fib_get_table(net, RT_TABLE_LOCAL);
166 	if (!local_table || local_table->tb_lookup(local_table, &fl, &res))
167 		return NULL;
168 	if (res.type != RTN_LOCAL)
169 		goto out;
170 	dev = FIB_RES_DEV(res);
171 
172 	if (dev)
173 		dev_hold(dev);
174 out:
175 	fib_res_put(&res);
176 	return dev;
177 }
178 
/*
 * Find address type as if only "dev" was present in the system. If
 * dev is NULL then all interfaces are taken into consideration.
 */
183 static inline unsigned __inet_dev_addr_type(struct net *net,
184 					    const struct net_device *dev,
185 					    __be32 addr)
186 {
187 	struct flowi		fl = { .nl_u = { .ip4_u = { .daddr = addr } } };
188 	struct fib_result	res;
189 	unsigned ret = RTN_BROADCAST;
190 	struct fib_table *local_table;
191 
192 	if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr))
193 		return RTN_BROADCAST;
194 	if (ipv4_is_multicast(addr))
195 		return RTN_MULTICAST;
196 
197 #ifdef CONFIG_IP_MULTIPLE_TABLES
198 	res.r = NULL;
199 #endif
200 
201 	local_table = fib_get_table(net, RT_TABLE_LOCAL);
202 	if (local_table) {
203 		ret = RTN_UNICAST;
204 		if (!local_table->tb_lookup(local_table, &fl, &res)) {
205 			if (!dev || dev == res.fi->fib_dev)
206 				ret = res.type;
207 			fib_res_put(&res);
208 		}
209 	}
210 	return ret;
211 }
212 
213 unsigned int inet_addr_type(struct net *net, __be32 addr)
214 {
215 	return __inet_dev_addr_type(net, NULL, addr);
216 }
217 
218 unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
219 				__be32 addr)
220 {
221        return __inet_dev_addr_type(net, dev, addr);
222 }
223 
224 /* Given (packet source, input interface) and optional (dst, oif, tos):
225    - (main) check, that source is valid i.e. not broadcast or our local
226      address.
227    - figure out what "logical" interface this packet arrived
228      and calculate "specific destination" address.
229    - check, that packet arrived from expected physical interface.
230  */
231 
/*
 * The lookup key is built "in reverse": the packet source @src becomes the
 * key's destination, @dst its source, and @oif its input interface.
 *
 * On success *spec_dst is filled in; *itag is set via fib_combine_itag()
 * after the first successful unicast lookup, or to 0 on the last_resort
 * path.
 *
 * Returns -EINVAL when @dev has no in_device, when the first lookup finds
 * a non-unicast route, or when rp_filter is enabled and the source does
 * not check out.  Otherwise returns 1 if the matching nexthop's scope is
 * >= RT_SCOPE_HOST and 0 if not.
 */
int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
			struct net_device *dev, __be32 *spec_dst, u32 *itag)
{
	struct in_device *in_dev;
	struct flowi fl = { .nl_u = { .ip4_u =
				      { .daddr = src,
					.saddr = dst,
					.tos = tos } },
			    .iif = oif };
	struct fib_result res;
	int no_addr, rpf;
	int ret;
	struct net *net;

	/* Snapshot what we need from the in_device under RCU. */
	no_addr = rpf = 0;
	rcu_read_lock();
	in_dev = __in_dev_get_rcu(dev);
	if (in_dev) {
		no_addr = in_dev->ifa_list == NULL;
		rpf = IN_DEV_RPFILTER(in_dev);
	}
	rcu_read_unlock();

	/* in_dev is only compared against NULL from here on. */
	if (in_dev == NULL)
		goto e_inval;

	net = dev_net(dev);
	if (fib_lookup(net, &fl, &res))
		goto last_resort;
	if (res.type != RTN_UNICAST)
		goto e_inval_res;
	*spec_dst = FIB_RES_PREFSRC(res);
	fib_combine_itag(itag, &res);
	/* Accept if the route back to the source leaves through @dev
	 * (or, with multipath, if the route has more than one nexthop). */
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (FIB_RES_DEV(res) == dev || res.fi->fib_nhs > 1)
#else
	if (FIB_RES_DEV(res) == dev)
#endif
	{
		ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
		fib_res_put(&res);
		return ret;
	}
	fib_res_put(&res);
	if (no_addr)
		goto last_resort;
	if (rpf)
		goto e_inval;
	/* Retry the lookup restricted to routes going out through @dev. */
	fl.oif = dev->ifindex;

	ret = 0;
	if (fib_lookup(net, &fl, &res) == 0) {
		if (res.type == RTN_UNICAST) {
			*spec_dst = FIB_RES_PREFSRC(res);
			ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
		}
		fib_res_put(&res);
	}
	return ret;

last_resort:
	/* No usable route: reject under rp_filter, otherwise pick an
	 * address of @dev as the specific destination. */
	if (rpf)
		goto e_inval;
	*spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	*itag = 0;
	return 0;

e_inval_res:
	fib_res_put(&res);
e_inval:
	return -EINVAL;
}
304 
305 static inline __be32 sk_extract_addr(struct sockaddr *addr)
306 {
307 	return ((struct sockaddr_in *) addr)->sin_addr.s_addr;
308 }
309 
310 static int put_rtax(struct nlattr *mx, int len, int type, u32 value)
311 {
312 	struct nlattr *nla;
313 
314 	nla = (struct nlattr *) ((char *) mx + len);
315 	nla->nla_type = type;
316 	nla->nla_len = nla_attr_size(4);
317 	*(u32 *) nla_data(nla) = value;
318 
319 	return len + nla_total_size(4);
320 }
321 
/*
 * Translate an ioctl-style struct rtentry into a struct fib_config.
 *
 * @cmd is the ioctl command; SIOCDELRT skips the create flags, the
 * gateway sanity check and the metrics.  @cfg is zeroed first.
 *
 * On success cfg->fc_mx may point to a kzalloc()ed attribute buffer; the
 * caller owns it (ip_rt_ioctl() kfree()s it).  It is only allocated as
 * the last step, so error returns never leave an allocation behind.
 *
 * Returns 0 on success or a negative errno (-EAFNOSUPPORT, -EINVAL,
 * -EFAULT, -ENODEV, -ENOMEM).
 */
static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt,
				 struct fib_config *cfg)
{
	__be32 addr;
	int plen;

	memset(cfg, 0, sizeof(*cfg));
	cfg->fc_nlinfo.nl_net = net;

	if (rt->rt_dst.sa_family != AF_INET)
		return -EAFNOSUPPORT;

	/*
	 * Check mask for validity:
	 * a) it must be contiguous.
	 * b) destination must have all host bits clear.
	 * c) if application forgot to set correct family (AF_INET),
	 *    reject request unless it is absolutely clear i.e.
	 *    both family and mask are zero.
	 */
	plen = 32;
	addr = sk_extract_addr(&rt->rt_dst);
	if (!(rt->rt_flags & RTF_HOST)) {
		__be32 mask = sk_extract_addr(&rt->rt_genmask);

		if (rt->rt_genmask.sa_family != AF_INET) {
			if (mask || rt->rt_genmask.sa_family)
				return -EAFNOSUPPORT;
		}

		if (bad_mask(mask, addr))
			return -EINVAL;

		plen = inet_mask_len(mask);
	}

	cfg->fc_dst_len = plen;
	cfg->fc_dst = addr;

	if (cmd != SIOCDELRT) {
		cfg->fc_nlflags = NLM_F_CREATE;
		cfg->fc_protocol = RTPROT_BOOT;
	}

	/* A non-zero rt_metric is stored as priority (metric - 1). */
	if (rt->rt_metric)
		cfg->fc_priority = rt->rt_metric - 1;

	/* Reject routes need nothing beyond scope and type. */
	if (rt->rt_flags & RTF_REJECT) {
		cfg->fc_scope = RT_SCOPE_HOST;
		cfg->fc_type = RTN_UNREACHABLE;
		return 0;
	}

	/* RT_SCOPE_NOWHERE acts as "not decided yet"; see below. */
	cfg->fc_scope = RT_SCOPE_NOWHERE;
	cfg->fc_type = RTN_UNICAST;

	if (rt->rt_dev) {
		char *colon;
		struct net_device *dev;
		char devname[IFNAMSIZ];

		/* rt_dev is a user pointer; copy at most IFNAMSIZ-1 bytes
		 * and terminate the string ourselves. */
		if (copy_from_user(devname, rt->rt_dev, IFNAMSIZ-1))
			return -EFAULT;

		devname[IFNAMSIZ-1] = 0;
		/* "dev:label" form: look the device up by the part before
		 * the colon, then match the full string against labels. */
		colon = strchr(devname, ':');
		if (colon)
			*colon = 0;
		dev = __dev_get_by_name(net, devname);
		if (!dev)
			return -ENODEV;
		cfg->fc_oif = dev->ifindex;
		if (colon) {
			struct in_ifaddr *ifa;
			struct in_device *in_dev = __in_dev_get_rtnl(dev);
			if (!in_dev)
				return -ENODEV;
			*colon = ':';
			for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next)
				if (strcmp(ifa->ifa_label, devname) == 0)
					break;
			if (ifa == NULL)
				return -ENODEV;
			cfg->fc_prefsrc = ifa->ifa_local;
		}
	}

	addr = sk_extract_addr(&rt->rt_gateway);
	if (rt->rt_gateway.sa_family == AF_INET && addr) {
		cfg->fc_gw = addr;
		if (rt->rt_flags & RTF_GATEWAY &&
		    inet_addr_type(net, addr) == RTN_UNICAST)
			cfg->fc_scope = RT_SCOPE_UNIVERSE;
	}

	if (cmd == SIOCDELRT)
		return 0;

	if (rt->rt_flags & RTF_GATEWAY && !cfg->fc_gw)
		return -EINVAL;

	/* Scope still undecided: default to link scope. */
	if (cfg->fc_scope == RT_SCOPE_NOWHERE)
		cfg->fc_scope = RT_SCOPE_LINK;

	if (rt->rt_flags & (RTF_MTU | RTF_WINDOW | RTF_IRTT)) {
		struct nlattr *mx;
		int len = 0;

		/* Room for the three possible 4-byte attributes below. */
		mx = kzalloc(3 * nla_total_size(4), GFP_KERNEL);
		if (mx == NULL)
			return -ENOMEM;

		/* NOTE(review): rt_mtu has no lower bound checked before
		 * the subtraction of 40 - confirm that is intended. */
		if (rt->rt_flags & RTF_MTU)
			len = put_rtax(mx, len, RTAX_ADVMSS, rt->rt_mtu - 40);

		if (rt->rt_flags & RTF_WINDOW)
			len = put_rtax(mx, len, RTAX_WINDOW, rt->rt_window);

		if (rt->rt_flags & RTF_IRTT)
			len = put_rtax(mx, len, RTAX_RTT, rt->rt_irtt << 3);

		cfg->fc_mx = mx;
		cfg->fc_mx_len = len;
	}

	return 0;
}
449 
450 /*
451  *	Handle IP routing ioctl calls. These are used to manipulate the routing tables
452  */
453 
454 int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg)
455 {
456 	struct fib_config cfg;
457 	struct rtentry rt;
458 	int err;
459 
460 	switch (cmd) {
461 	case SIOCADDRT:		/* Add a route */
462 	case SIOCDELRT:		/* Delete a route */
463 		if (!capable(CAP_NET_ADMIN))
464 			return -EPERM;
465 
466 		if (copy_from_user(&rt, arg, sizeof(rt)))
467 			return -EFAULT;
468 
469 		rtnl_lock();
470 		err = rtentry_to_fib_config(net, cmd, &rt, &cfg);
471 		if (err == 0) {
472 			struct fib_table *tb;
473 
474 			if (cmd == SIOCDELRT) {
475 				tb = fib_get_table(net, cfg.fc_table);
476 				if (tb)
477 					err = tb->tb_delete(tb, &cfg);
478 				else
479 					err = -ESRCH;
480 			} else {
481 				tb = fib_new_table(net, cfg.fc_table);
482 				if (tb)
483 					err = tb->tb_insert(tb, &cfg);
484 				else
485 					err = -ENOBUFS;
486 			}
487 
488 			/* allocated by rtentry_to_fib_config() */
489 			kfree(cfg.fc_mx);
490 		}
491 		rtnl_unlock();
492 		return err;
493 	}
494 	return -EINVAL;
495 }
496 
/*
 * Attribute policy for IPv4 route netlink messages, indexed by RTA_*
 * attribute type.  rtm_to_fib_config() passes it to nlmsg_validate().
 * Attribute types without an entry here (e.g. RTA_TABLE) are left
 * zero-initialized.
 */
const struct nla_policy rtm_ipv4_policy[RTA_MAX+1] = {
	[RTA_DST]		= { .type = NLA_U32 },
	[RTA_SRC]		= { .type = NLA_U32 },
	[RTA_IIF]		= { .type = NLA_U32 },
	[RTA_OIF]		= { .type = NLA_U32 },
	[RTA_GATEWAY]		= { .type = NLA_U32 },
	[RTA_PRIORITY]		= { .type = NLA_U32 },
	[RTA_PREFSRC]		= { .type = NLA_U32 },
	[RTA_METRICS]		= { .type = NLA_NESTED },
	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
	[RTA_FLOW]		= { .type = NLA_U32 },
};
509 
510 static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
511 			    struct nlmsghdr *nlh, struct fib_config *cfg)
512 {
513 	struct nlattr *attr;
514 	int err, remaining;
515 	struct rtmsg *rtm;
516 
517 	err = nlmsg_validate(nlh, sizeof(*rtm), RTA_MAX, rtm_ipv4_policy);
518 	if (err < 0)
519 		goto errout;
520 
521 	memset(cfg, 0, sizeof(*cfg));
522 
523 	rtm = nlmsg_data(nlh);
524 	cfg->fc_dst_len = rtm->rtm_dst_len;
525 	cfg->fc_tos = rtm->rtm_tos;
526 	cfg->fc_table = rtm->rtm_table;
527 	cfg->fc_protocol = rtm->rtm_protocol;
528 	cfg->fc_scope = rtm->rtm_scope;
529 	cfg->fc_type = rtm->rtm_type;
530 	cfg->fc_flags = rtm->rtm_flags;
531 	cfg->fc_nlflags = nlh->nlmsg_flags;
532 
533 	cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
534 	cfg->fc_nlinfo.nlh = nlh;
535 	cfg->fc_nlinfo.nl_net = net;
536 
537 	if (cfg->fc_type > RTN_MAX) {
538 		err = -EINVAL;
539 		goto errout;
540 	}
541 
542 	nlmsg_for_each_attr(attr, nlh, sizeof(struct rtmsg), remaining) {
543 		switch (nla_type(attr)) {
544 		case RTA_DST:
545 			cfg->fc_dst = nla_get_be32(attr);
546 			break;
547 		case RTA_OIF:
548 			cfg->fc_oif = nla_get_u32(attr);
549 			break;
550 		case RTA_GATEWAY:
551 			cfg->fc_gw = nla_get_be32(attr);
552 			break;
553 		case RTA_PRIORITY:
554 			cfg->fc_priority = nla_get_u32(attr);
555 			break;
556 		case RTA_PREFSRC:
557 			cfg->fc_prefsrc = nla_get_be32(attr);
558 			break;
559 		case RTA_METRICS:
560 			cfg->fc_mx = nla_data(attr);
561 			cfg->fc_mx_len = nla_len(attr);
562 			break;
563 		case RTA_MULTIPATH:
564 			cfg->fc_mp = nla_data(attr);
565 			cfg->fc_mp_len = nla_len(attr);
566 			break;
567 		case RTA_FLOW:
568 			cfg->fc_flow = nla_get_u32(attr);
569 			break;
570 		case RTA_TABLE:
571 			cfg->fc_table = nla_get_u32(attr);
572 			break;
573 		}
574 	}
575 
576 	return 0;
577 errout:
578 	return err;
579 }
580 
581 static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
582 {
583 	struct net *net = sock_net(skb->sk);
584 	struct fib_config cfg;
585 	struct fib_table *tb;
586 	int err;
587 
588 	err = rtm_to_fib_config(net, skb, nlh, &cfg);
589 	if (err < 0)
590 		goto errout;
591 
592 	tb = fib_get_table(net, cfg.fc_table);
593 	if (tb == NULL) {
594 		err = -ESRCH;
595 		goto errout;
596 	}
597 
598 	err = tb->tb_delete(tb, &cfg);
599 errout:
600 	return err;
601 }
602 
603 static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
604 {
605 	struct net *net = sock_net(skb->sk);
606 	struct fib_config cfg;
607 	struct fib_table *tb;
608 	int err;
609 
610 	err = rtm_to_fib_config(net, skb, nlh, &cfg);
611 	if (err < 0)
612 		goto errout;
613 
614 	tb = fib_new_table(net, cfg.fc_table);
615 	if (tb == NULL) {
616 		err = -ENOBUFS;
617 		goto errout;
618 	}
619 
620 	err = tb->tb_insert(tb, &cfg);
621 errout:
622 	return err;
623 }
624 
/*
 * Netlink dump callback for RTM_GETROUTE.
 *
 * Requests carrying RTM_F_CLONED in the rtmsg flags are handed to
 * ip_rt_dump().  Otherwise every table in the namespace's hash is dumped
 * through its tb_dump() method.  The position is kept in cb->args so the
 * dump can continue on the next call:
 *   cb->args[0]  hash bucket to resume from
 *   cb->args[1]  index of the table within that bucket
 *   cb->args[2+] state used by the table's tb_dump(); cleared before
 *                moving on to another table within the same call
 *
 * Returns skb->len.
 */
static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);
	unsigned int h, s_h;
	unsigned int e = 0, s_e;
	struct fib_table *tb;
	struct hlist_node *node;
	struct hlist_head *head;
	int dumped = 0;

	if (nlmsg_len(cb->nlh) >= sizeof(struct rtmsg) &&
	    ((struct rtmsg *) nlmsg_data(cb->nlh))->rtm_flags & RTM_F_CLONED)
		return ip_rt_dump(skb, cb);

	s_h = cb->args[0];
	s_e = cb->args[1];

	/* s_e only applies to the first bucket visited; reset it after. */
	for (h = s_h; h < FIB_TABLE_HASHSZ; h++, s_e = 0) {
		e = 0;
		head = &net->ipv4.fib_table_hash[h];
		hlist_for_each_entry(tb, node, head, tb_hlist) {
			/* Skip tables already completed by earlier calls. */
			if (e < s_e)
				goto next;
			if (dumped)
				memset(&cb->args[2], 0, sizeof(cb->args) -
						 2 * sizeof(cb->args[0]));
			/* Negative return: stop here and record position. */
			if (tb->tb_dump(tb, skb, cb) < 0)
				goto out;
			dumped = 1;
next:
			e++;
		}
	}
out:
	cb->args[1] = e;
	cb->args[0] = h;

	return skb->len;
}
664 
665 /* Prepare and feed intra-kernel routing request.
666    Really, it should be netlink message, but :-( netlink
667    can be not configured, so that we feed it directly
668    to fib engine. It is legal, because all events occur
669    only when netlink is already locked.
670  */
671 
672 static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa)
673 {
674 	struct net *net = dev_net(ifa->ifa_dev->dev);
675 	struct fib_table *tb;
676 	struct fib_config cfg = {
677 		.fc_protocol = RTPROT_KERNEL,
678 		.fc_type = type,
679 		.fc_dst = dst,
680 		.fc_dst_len = dst_len,
681 		.fc_prefsrc = ifa->ifa_local,
682 		.fc_oif = ifa->ifa_dev->dev->ifindex,
683 		.fc_nlflags = NLM_F_CREATE | NLM_F_APPEND,
684 		.fc_nlinfo = {
685 			.nl_net = net,
686 		},
687 	};
688 
689 	if (type == RTN_UNICAST)
690 		tb = fib_new_table(net, RT_TABLE_MAIN);
691 	else
692 		tb = fib_new_table(net, RT_TABLE_LOCAL);
693 
694 	if (tb == NULL)
695 		return;
696 
697 	cfg.fc_table = tb->tb_id;
698 
699 	if (type != RTN_LOCAL)
700 		cfg.fc_scope = RT_SCOPE_LINK;
701 	else
702 		cfg.fc_scope = RT_SCOPE_HOST;
703 
704 	if (cmd == RTM_NEWROUTE)
705 		tb->tb_insert(tb, &cfg);
706 	else
707 		tb->tb_delete(tb, &cfg);
708 }
709 
710 void fib_add_ifaddr(struct in_ifaddr *ifa)
711 {
712 	struct in_device *in_dev = ifa->ifa_dev;
713 	struct net_device *dev = in_dev->dev;
714 	struct in_ifaddr *prim = ifa;
715 	__be32 mask = ifa->ifa_mask;
716 	__be32 addr = ifa->ifa_local;
717 	__be32 prefix = ifa->ifa_address&mask;
718 
719 	if (ifa->ifa_flags&IFA_F_SECONDARY) {
720 		prim = inet_ifa_byprefix(in_dev, prefix, mask);
721 		if (prim == NULL) {
722 			printk(KERN_WARNING "fib_add_ifaddr: bug: prim == NULL\n");
723 			return;
724 		}
725 	}
726 
727 	fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim);
728 
729 	if (!(dev->flags&IFF_UP))
730 		return;
731 
732 	/* Add broadcast address, if it is explicitly assigned. */
733 	if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF))
734 		fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
735 
736 	if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags&IFA_F_SECONDARY) &&
737 	    (prefix != addr || ifa->ifa_prefixlen < 32)) {
738 		fib_magic(RTM_NEWROUTE, dev->flags&IFF_LOOPBACK ? RTN_LOCAL :
739 			  RTN_UNICAST, prefix, ifa->ifa_prefixlen, prim);
740 
741 		/* Add network specific broadcasts, when it takes a sense */
742 		if (ifa->ifa_prefixlen < 31) {
743 			fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32, prim);
744 			fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix|~mask, 32, prim);
745 		}
746 	}
747 }
748 
/*
 * Remove the kernel routes that were installed for interface address
 * @ifa, keeping any route that an address still on the device's list
 * continues to need.
 *
 * The *_OK bits in "ok" record which of @ifa's derived addresses are
 * still matched by some entry of in_dev->ifa_list; a set bit means the
 * corresponding route must stay.
 */
static void fib_del_ifaddr(struct in_ifaddr *ifa)
{
	struct in_device *in_dev = ifa->ifa_dev;
	struct net_device *dev = in_dev->dev;
	struct in_ifaddr *ifa1;
	struct in_ifaddr *prim = ifa;
	/* Directed broadcast and network address of ifa's prefix. */
	__be32 brd = ifa->ifa_address|~ifa->ifa_mask;
	__be32 any = ifa->ifa_address&ifa->ifa_mask;
#define LOCAL_OK	1
#define BRD_OK		2
#define BRD0_OK		4
#define BRD1_OK		8
	unsigned ok = 0;

	/* A primary address takes its prefix route with it; a secondary
	 * one needs its primary for the deletions below. */
	if (!(ifa->ifa_flags&IFA_F_SECONDARY))
		fib_magic(RTM_DELROUTE, dev->flags&IFF_LOOPBACK ? RTN_LOCAL :
			  RTN_UNICAST, any, ifa->ifa_prefixlen, prim);
	else {
		prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask);
		if (prim == NULL) {
			printk(KERN_WARNING "fib_del_ifaddr: bug: prim == NULL\n");
			return;
		}
	}

	/* Deletion is more complicated than add.
	   We should take care of not to delete too much :-)

	   Scan address list to be sure that addresses are really gone.
	 */

	for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) {
		if (ifa->ifa_local == ifa1->ifa_local)
			ok |= LOCAL_OK;
		if (ifa->ifa_broadcast == ifa1->ifa_broadcast)
			ok |= BRD_OK;
		if (brd == ifa1->ifa_broadcast)
			ok |= BRD1_OK;
		if (any == ifa1->ifa_broadcast)
			ok |= BRD0_OK;
	}

	/* Delete only the routes nobody on the list accounts for. */
	if (!(ok&BRD_OK))
		fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
	if (!(ok&BRD1_OK))
		fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim);
	if (!(ok&BRD0_OK))
		fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim);
	if (!(ok&LOCAL_OK)) {
		fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim);

		/* Check, that this local address finally disappeared. */
		if (inet_addr_type(dev_net(dev), ifa->ifa_local) != RTN_LOCAL) {
			/* And the last, but not the least thing.
			   We must flush stray FIB entries.

			   First of all, we scan fib_info list searching
			   for stray nexthop entries, then ignite fib_flush.
			*/
			if (fib_sync_down_addr(dev_net(dev), ifa->ifa_local))
				fib_flush(dev_net(dev));
		}
	}
#undef LOCAL_OK
#undef BRD_OK
#undef BRD0_OK
#undef BRD1_OK
}
817 
818 static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb )
819 {
820 
821 	struct fib_result       res;
822 	struct flowi            fl = { .mark = frn->fl_mark,
823 				       .nl_u = { .ip4_u = { .daddr = frn->fl_addr,
824 							    .tos = frn->fl_tos,
825 							    .scope = frn->fl_scope } } };
826 
827 #ifdef CONFIG_IP_MULTIPLE_TABLES
828 	res.r = NULL;
829 #endif
830 
831 	frn->err = -ENOENT;
832 	if (tb) {
833 		local_bh_disable();
834 
835 		frn->tb_id = tb->tb_id;
836 		frn->err = tb->tb_lookup(tb, &fl, &res);
837 
838 		if (!frn->err) {
839 			frn->prefixlen = res.prefixlen;
840 			frn->nh_sel = res.nh_sel;
841 			frn->type = res.type;
842 			frn->scope = res.scope;
843 			fib_res_put(&res);
844 		}
845 		local_bh_enable();
846 	}
847 }
848 
849 static void nl_fib_input(struct sk_buff *skb)
850 {
851 	struct net *net;
852 	struct fib_result_nl *frn;
853 	struct nlmsghdr *nlh;
854 	struct fib_table *tb;
855 	u32 pid;
856 
857 	net = sock_net(skb->sk);
858 	nlh = nlmsg_hdr(skb);
859 	if (skb->len < NLMSG_SPACE(0) || skb->len < nlh->nlmsg_len ||
860 	    nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*frn)))
861 		return;
862 
863 	skb = skb_clone(skb, GFP_KERNEL);
864 	if (skb == NULL)
865 		return;
866 	nlh = nlmsg_hdr(skb);
867 
868 	frn = (struct fib_result_nl *) NLMSG_DATA(nlh);
869 	tb = fib_get_table(net, frn->tb_id_in);
870 
871 	nl_fib_lookup(frn, tb);
872 
873 	pid = NETLINK_CB(skb).pid;       /* pid of sending process */
874 	NETLINK_CB(skb).pid = 0;         /* from kernel */
875 	NETLINK_CB(skb).dst_group = 0;  /* unicast */
876 	netlink_unicast(net->ipv4.fibnl, skb, pid, MSG_DONTWAIT);
877 }
878 
879 static int nl_fib_lookup_init(struct net *net)
880 {
881 	struct sock *sk;
882 	sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, 0,
883 				   nl_fib_input, NULL, THIS_MODULE);
884 	if (sk == NULL)
885 		return -EAFNOSUPPORT;
886 	net->ipv4.fibnl = sk;
887 	return 0;
888 }
889 
890 static void nl_fib_lookup_exit(struct net *net)
891 {
892 	netlink_kernel_release(net->ipv4.fibnl);
893 	net->ipv4.fibnl = NULL;
894 }
895 
/*
 * Take the routes through @dev down; @force is passed on to
 * fib_sync_down_dev().  The tables are flushed if that call reports
 * work to do; the routing cache flush and arp_ifdown() always run.
 */
static void fib_disable_ip(struct net_device *dev, int force)
{
	struct net *net = dev_net(dev);
	int need_flush;

	need_flush = fib_sync_down_dev(dev, force);
	if (need_flush)
		fib_flush(net);

	rt_cache_flush(net, 0);
	arp_ifdown(dev);
}
903 
904 static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr)
905 {
906 	struct in_ifaddr *ifa = (struct in_ifaddr*)ptr;
907 	struct net_device *dev = ifa->ifa_dev->dev;
908 
909 	switch (event) {
910 	case NETDEV_UP:
911 		fib_add_ifaddr(ifa);
912 #ifdef CONFIG_IP_ROUTE_MULTIPATH
913 		fib_sync_up(dev);
914 #endif
915 		rt_cache_flush(dev_net(dev), -1);
916 		break;
917 	case NETDEV_DOWN:
918 		fib_del_ifaddr(ifa);
919 		if (ifa->ifa_dev->ifa_list == NULL) {
920 			/* Last address was deleted from this interface.
921 			   Disable IP.
922 			 */
923 			fib_disable_ip(dev, 1);
924 		} else {
925 			rt_cache_flush(dev_net(dev), -1);
926 		}
927 		break;
928 	}
929 	return NOTIFY_DONE;
930 }
931 
932 static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
933 {
934 	struct net_device *dev = ptr;
935 	struct in_device *in_dev = __in_dev_get_rtnl(dev);
936 
937 	if (event == NETDEV_UNREGISTER) {
938 		fib_disable_ip(dev, 2);
939 		return NOTIFY_DONE;
940 	}
941 
942 	if (!in_dev)
943 		return NOTIFY_DONE;
944 
945 	switch (event) {
946 	case NETDEV_UP:
947 		for_ifa(in_dev) {
948 			fib_add_ifaddr(ifa);
949 		} endfor_ifa(in_dev);
950 #ifdef CONFIG_IP_ROUTE_MULTIPATH
951 		fib_sync_up(dev);
952 #endif
953 		rt_cache_flush(dev_net(dev), -1);
954 		break;
955 	case NETDEV_DOWN:
956 		fib_disable_ip(dev, 0);
957 		break;
958 	case NETDEV_CHANGEMTU:
959 	case NETDEV_CHANGE:
960 		rt_cache_flush(dev_net(dev), 0);
961 		break;
962 	}
963 	return NOTIFY_DONE;
964 }
965 
966 static struct notifier_block fib_inetaddr_notifier = {
967 	.notifier_call =fib_inetaddr_event,
968 };
969 
970 static struct notifier_block fib_netdev_notifier = {
971 	.notifier_call =fib_netdev_event,
972 };
973 
974 static int __net_init ip_fib_net_init(struct net *net)
975 {
976 	int err;
977 	unsigned int i;
978 
979 	net->ipv4.fib_table_hash = kzalloc(
980 			sizeof(struct hlist_head)*FIB_TABLE_HASHSZ, GFP_KERNEL);
981 	if (net->ipv4.fib_table_hash == NULL)
982 		return -ENOMEM;
983 
984 	for (i = 0; i < FIB_TABLE_HASHSZ; i++)
985 		INIT_HLIST_HEAD(&net->ipv4.fib_table_hash[i]);
986 
987 	err = fib4_rules_init(net);
988 	if (err < 0)
989 		goto fail;
990 	return 0;
991 
992 fail:
993 	kfree(net->ipv4.fib_table_hash);
994 	return err;
995 }
996 
997 static void __net_exit ip_fib_net_exit(struct net *net)
998 {
999 	unsigned int i;
1000 
1001 #ifdef CONFIG_IP_MULTIPLE_TABLES
1002 	fib4_rules_exit(net);
1003 #endif
1004 
1005 	for (i = 0; i < FIB_TABLE_HASHSZ; i++) {
1006 		struct fib_table *tb;
1007 		struct hlist_head *head;
1008 		struct hlist_node *node, *tmp;
1009 
1010 		head = &net->ipv4.fib_table_hash[i];
1011 		hlist_for_each_entry_safe(tb, node, tmp, head, tb_hlist) {
1012 			hlist_del(node);
1013 			tb->tb_flush(tb);
1014 			kfree(tb);
1015 		}
1016 	}
1017 	kfree(net->ipv4.fib_table_hash);
1018 }
1019 
1020 static int __net_init fib_net_init(struct net *net)
1021 {
1022 	int error;
1023 
1024 	error = ip_fib_net_init(net);
1025 	if (error < 0)
1026 		goto out;
1027 	error = nl_fib_lookup_init(net);
1028 	if (error < 0)
1029 		goto out_nlfl;
1030 	error = fib_proc_init(net);
1031 	if (error < 0)
1032 		goto out_proc;
1033 out:
1034 	return error;
1035 
1036 out_proc:
1037 	nl_fib_lookup_exit(net);
1038 out_nlfl:
1039 	ip_fib_net_exit(net);
1040 	goto out;
1041 }
1042 
/*
 * Per-namespace teardown; undoes fib_net_init() in reverse order:
 * proc entries, lookup netlink socket, then the tables.
 */
static void __net_exit fib_net_exit(struct net *net)
{
	fib_proc_exit(net);
	nl_fib_lookup_exit(net);
	ip_fib_net_exit(net);
}
1049 
/* Per-namespace init/exit hooks; registered from ip_fib_init(). */
static struct pernet_operations fib_net_ops = {
	.init = fib_net_init,
	.exit = fib_net_exit,
};
1054 
/*
 * Boot-time setup of the FIB frontend: register the PF_INET route
 * rtnetlink handlers (new/del as doit, get as dump), the per-namespace
 * operations and the netdevice/inetaddr notifiers, then call
 * fib_hash_init().
 *
 * NOTE(review): the return values of the register_*() calls are not
 * checked here - confirm that failure at this point is acceptable.
 */
void __init ip_fib_init(void)
{
	rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL);
	rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL);
	rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib);

	register_pernet_subsys(&fib_net_ops);
	register_netdevice_notifier(&fib_netdev_notifier);
	register_inetaddr_notifier(&fib_inetaddr_notifier);

	fib_hash_init();
}
1067 
/* Address classification and device lookup helpers exported from this file. */
EXPORT_SYMBOL(inet_addr_type);
EXPORT_SYMBOL(inet_dev_addr_type);
EXPORT_SYMBOL(ip_dev_find);
1071