xref: /openbmc/linux/net/ipv4/fib_frontend.c (revision b6dcefde)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		IPv4 Forwarding Information Base: FIB frontend.
7  *
8  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
9  *
10  *		This program is free software; you can redistribute it and/or
11  *		modify it under the terms of the GNU General Public License
12  *		as published by the Free Software Foundation; either version
13  *		2 of the License, or (at your option) any later version.
14  */
15 
16 #include <linux/module.h>
17 #include <asm/uaccess.h>
18 #include <asm/system.h>
19 #include <linux/bitops.h>
20 #include <linux/capability.h>
21 #include <linux/types.h>
22 #include <linux/kernel.h>
23 #include <linux/mm.h>
24 #include <linux/string.h>
25 #include <linux/socket.h>
26 #include <linux/sockios.h>
27 #include <linux/errno.h>
28 #include <linux/in.h>
29 #include <linux/inet.h>
30 #include <linux/inetdevice.h>
31 #include <linux/netdevice.h>
32 #include <linux/if_addr.h>
33 #include <linux/if_arp.h>
34 #include <linux/skbuff.h>
35 #include <linux/init.h>
36 #include <linux/list.h>
37 
38 #include <net/ip.h>
39 #include <net/protocol.h>
40 #include <net/route.h>
41 #include <net/tcp.h>
42 #include <net/sock.h>
43 #include <net/arp.h>
44 #include <net/ip_fib.h>
45 #include <net/rtnetlink.h>
46 
47 #ifndef CONFIG_IP_MULTIPLE_TABLES
48 
49 static int __net_init fib4_rules_init(struct net *net)
50 {
51 	struct fib_table *local_table, *main_table;
52 
53 	local_table = fib_hash_table(RT_TABLE_LOCAL);
54 	if (local_table == NULL)
55 		return -ENOMEM;
56 
57 	main_table  = fib_hash_table(RT_TABLE_MAIN);
58 	if (main_table == NULL)
59 		goto fail;
60 
61 	hlist_add_head_rcu(&local_table->tb_hlist,
62 				&net->ipv4.fib_table_hash[TABLE_LOCAL_INDEX]);
63 	hlist_add_head_rcu(&main_table->tb_hlist,
64 				&net->ipv4.fib_table_hash[TABLE_MAIN_INDEX]);
65 	return 0;
66 
67 fail:
68 	kfree(local_table);
69 	return -ENOMEM;
70 }
71 #else
72 
73 struct fib_table *fib_new_table(struct net *net, u32 id)
74 {
75 	struct fib_table *tb;
76 	unsigned int h;
77 
78 	if (id == 0)
79 		id = RT_TABLE_MAIN;
80 	tb = fib_get_table(net, id);
81 	if (tb)
82 		return tb;
83 
84 	tb = fib_hash_table(id);
85 	if (!tb)
86 		return NULL;
87 	h = id & (FIB_TABLE_HASHSZ - 1);
88 	hlist_add_head_rcu(&tb->tb_hlist, &net->ipv4.fib_table_hash[h]);
89 	return tb;
90 }
91 
92 struct fib_table *fib_get_table(struct net *net, u32 id)
93 {
94 	struct fib_table *tb;
95 	struct hlist_node *node;
96 	struct hlist_head *head;
97 	unsigned int h;
98 
99 	if (id == 0)
100 		id = RT_TABLE_MAIN;
101 	h = id & (FIB_TABLE_HASHSZ - 1);
102 
103 	rcu_read_lock();
104 	head = &net->ipv4.fib_table_hash[h];
105 	hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
106 		if (tb->tb_id == id) {
107 			rcu_read_unlock();
108 			return tb;
109 		}
110 	}
111 	rcu_read_unlock();
112 	return NULL;
113 }
114 #endif /* CONFIG_IP_MULTIPLE_TABLES */
115 
116 void fib_select_default(struct net *net,
117 			const struct flowi *flp, struct fib_result *res)
118 {
119 	struct fib_table *tb;
120 	int table = RT_TABLE_MAIN;
121 #ifdef CONFIG_IP_MULTIPLE_TABLES
122 	if (res->r == NULL || res->r->action != FR_ACT_TO_TBL)
123 		return;
124 	table = res->r->table;
125 #endif
126 	tb = fib_get_table(net, table);
127 	if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
128 		fib_table_select_default(tb, flp, res);
129 }
130 
131 static void fib_flush(struct net *net)
132 {
133 	int flushed = 0;
134 	struct fib_table *tb;
135 	struct hlist_node *node;
136 	struct hlist_head *head;
137 	unsigned int h;
138 
139 	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
140 		head = &net->ipv4.fib_table_hash[h];
141 		hlist_for_each_entry(tb, node, head, tb_hlist)
142 			flushed += fib_table_flush(tb);
143 	}
144 
145 	if (flushed)
146 		rt_cache_flush(net, -1);
147 }
148 
149 /*
150  *	Find the first device with a given source address.
151  */
152 
153 struct net_device * ip_dev_find(struct net *net, __be32 addr)
154 {
155 	struct flowi fl = { .nl_u = { .ip4_u = { .daddr = addr } } };
156 	struct fib_result res;
157 	struct net_device *dev = NULL;
158 	struct fib_table *local_table;
159 
160 #ifdef CONFIG_IP_MULTIPLE_TABLES
161 	res.r = NULL;
162 #endif
163 
164 	local_table = fib_get_table(net, RT_TABLE_LOCAL);
165 	if (!local_table || fib_table_lookup(local_table, &fl, &res))
166 		return NULL;
167 	if (res.type != RTN_LOCAL)
168 		goto out;
169 	dev = FIB_RES_DEV(res);
170 
171 	if (dev)
172 		dev_hold(dev);
173 out:
174 	fib_res_put(&res);
175 	return dev;
176 }
177 
178 /*
179  * Find address type as if only "dev" was present in the system. If
180  * on_dev is NULL then all interfaces are taken into consideration.
181  */
182 static inline unsigned __inet_dev_addr_type(struct net *net,
183 					    const struct net_device *dev,
184 					    __be32 addr)
185 {
186 	struct flowi		fl = { .nl_u = { .ip4_u = { .daddr = addr } } };
187 	struct fib_result	res;
188 	unsigned ret = RTN_BROADCAST;
189 	struct fib_table *local_table;
190 
191 	if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr))
192 		return RTN_BROADCAST;
193 	if (ipv4_is_multicast(addr))
194 		return RTN_MULTICAST;
195 
196 #ifdef CONFIG_IP_MULTIPLE_TABLES
197 	res.r = NULL;
198 #endif
199 
200 	local_table = fib_get_table(net, RT_TABLE_LOCAL);
201 	if (local_table) {
202 		ret = RTN_UNICAST;
203 		if (!fib_table_lookup(local_table, &fl, &res)) {
204 			if (!dev || dev == res.fi->fib_dev)
205 				ret = res.type;
206 			fib_res_put(&res);
207 		}
208 	}
209 	return ret;
210 }
211 
212 unsigned int inet_addr_type(struct net *net, __be32 addr)
213 {
214 	return __inet_dev_addr_type(net, NULL, addr);
215 }
216 
217 unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
218 				__be32 addr)
219 {
220        return __inet_dev_addr_type(net, dev, addr);
221 }
222 
223 /* Given (packet source, input interface) and optional (dst, oif, tos):
224    - (main) check, that source is valid i.e. not broadcast or our local
225      address.
226    - figure out what "logical" interface this packet arrived
227      and calculate "specific destination" address.
228    - check, that packet arrived from expected physical interface.
229  */
230 
231 int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
232 			struct net_device *dev, __be32 *spec_dst,
233 			u32 *itag, u32 mark)
234 {
235 	struct in_device *in_dev;
236 	struct flowi fl = { .nl_u = { .ip4_u =
237 				      { .daddr = src,
238 					.saddr = dst,
239 					.tos = tos } },
240 			    .mark = mark,
241 			    .iif = oif };
242 
243 	struct fib_result res;
244 	int no_addr, rpf, accept_local;
245 	int ret;
246 	struct net *net;
247 
248 	no_addr = rpf = accept_local = 0;
249 	rcu_read_lock();
250 	in_dev = __in_dev_get_rcu(dev);
251 	if (in_dev) {
252 		no_addr = in_dev->ifa_list == NULL;
253 		rpf = IN_DEV_RPFILTER(in_dev);
254 		accept_local = IN_DEV_ACCEPT_LOCAL(in_dev);
255 		if (mark && !IN_DEV_SRC_VMARK(in_dev))
256 			fl.mark = 0;
257 	}
258 	rcu_read_unlock();
259 
260 	if (in_dev == NULL)
261 		goto e_inval;
262 
263 	net = dev_net(dev);
264 	if (fib_lookup(net, &fl, &res))
265 		goto last_resort;
266 	if (res.type != RTN_UNICAST) {
267 		if (res.type != RTN_LOCAL || !accept_local)
268 			goto e_inval_res;
269 	}
270 	*spec_dst = FIB_RES_PREFSRC(res);
271 	fib_combine_itag(itag, &res);
272 #ifdef CONFIG_IP_ROUTE_MULTIPATH
273 	if (FIB_RES_DEV(res) == dev || res.fi->fib_nhs > 1)
274 #else
275 	if (FIB_RES_DEV(res) == dev)
276 #endif
277 	{
278 		ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
279 		fib_res_put(&res);
280 		return ret;
281 	}
282 	fib_res_put(&res);
283 	if (no_addr)
284 		goto last_resort;
285 	if (rpf == 1)
286 		goto e_inval;
287 	fl.oif = dev->ifindex;
288 
289 	ret = 0;
290 	if (fib_lookup(net, &fl, &res) == 0) {
291 		if (res.type == RTN_UNICAST) {
292 			*spec_dst = FIB_RES_PREFSRC(res);
293 			ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
294 		}
295 		fib_res_put(&res);
296 	}
297 	return ret;
298 
299 last_resort:
300 	if (rpf)
301 		goto e_inval;
302 	*spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
303 	*itag = 0;
304 	return 0;
305 
306 e_inval_res:
307 	fib_res_put(&res);
308 e_inval:
309 	return -EINVAL;
310 }
311 
312 static inline __be32 sk_extract_addr(struct sockaddr *addr)
313 {
314 	return ((struct sockaddr_in *) addr)->sin_addr.s_addr;
315 }
316 
317 static int put_rtax(struct nlattr *mx, int len, int type, u32 value)
318 {
319 	struct nlattr *nla;
320 
321 	nla = (struct nlattr *) ((char *) mx + len);
322 	nla->nla_type = type;
323 	nla->nla_len = nla_attr_size(4);
324 	*(u32 *) nla_data(nla) = value;
325 
326 	return len + nla_total_size(4);
327 }
328 
329 static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt,
330 				 struct fib_config *cfg)
331 {
332 	__be32 addr;
333 	int plen;
334 
335 	memset(cfg, 0, sizeof(*cfg));
336 	cfg->fc_nlinfo.nl_net = net;
337 
338 	if (rt->rt_dst.sa_family != AF_INET)
339 		return -EAFNOSUPPORT;
340 
341 	/*
342 	 * Check mask for validity:
343 	 * a) it must be contiguous.
344 	 * b) destination must have all host bits clear.
345 	 * c) if application forgot to set correct family (AF_INET),
346 	 *    reject request unless it is absolutely clear i.e.
347 	 *    both family and mask are zero.
348 	 */
349 	plen = 32;
350 	addr = sk_extract_addr(&rt->rt_dst);
351 	if (!(rt->rt_flags & RTF_HOST)) {
352 		__be32 mask = sk_extract_addr(&rt->rt_genmask);
353 
354 		if (rt->rt_genmask.sa_family != AF_INET) {
355 			if (mask || rt->rt_genmask.sa_family)
356 				return -EAFNOSUPPORT;
357 		}
358 
359 		if (bad_mask(mask, addr))
360 			return -EINVAL;
361 
362 		plen = inet_mask_len(mask);
363 	}
364 
365 	cfg->fc_dst_len = plen;
366 	cfg->fc_dst = addr;
367 
368 	if (cmd != SIOCDELRT) {
369 		cfg->fc_nlflags = NLM_F_CREATE;
370 		cfg->fc_protocol = RTPROT_BOOT;
371 	}
372 
373 	if (rt->rt_metric)
374 		cfg->fc_priority = rt->rt_metric - 1;
375 
376 	if (rt->rt_flags & RTF_REJECT) {
377 		cfg->fc_scope = RT_SCOPE_HOST;
378 		cfg->fc_type = RTN_UNREACHABLE;
379 		return 0;
380 	}
381 
382 	cfg->fc_scope = RT_SCOPE_NOWHERE;
383 	cfg->fc_type = RTN_UNICAST;
384 
385 	if (rt->rt_dev) {
386 		char *colon;
387 		struct net_device *dev;
388 		char devname[IFNAMSIZ];
389 
390 		if (copy_from_user(devname, rt->rt_dev, IFNAMSIZ-1))
391 			return -EFAULT;
392 
393 		devname[IFNAMSIZ-1] = 0;
394 		colon = strchr(devname, ':');
395 		if (colon)
396 			*colon = 0;
397 		dev = __dev_get_by_name(net, devname);
398 		if (!dev)
399 			return -ENODEV;
400 		cfg->fc_oif = dev->ifindex;
401 		if (colon) {
402 			struct in_ifaddr *ifa;
403 			struct in_device *in_dev = __in_dev_get_rtnl(dev);
404 			if (!in_dev)
405 				return -ENODEV;
406 			*colon = ':';
407 			for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next)
408 				if (strcmp(ifa->ifa_label, devname) == 0)
409 					break;
410 			if (ifa == NULL)
411 				return -ENODEV;
412 			cfg->fc_prefsrc = ifa->ifa_local;
413 		}
414 	}
415 
416 	addr = sk_extract_addr(&rt->rt_gateway);
417 	if (rt->rt_gateway.sa_family == AF_INET && addr) {
418 		cfg->fc_gw = addr;
419 		if (rt->rt_flags & RTF_GATEWAY &&
420 		    inet_addr_type(net, addr) == RTN_UNICAST)
421 			cfg->fc_scope = RT_SCOPE_UNIVERSE;
422 	}
423 
424 	if (cmd == SIOCDELRT)
425 		return 0;
426 
427 	if (rt->rt_flags & RTF_GATEWAY && !cfg->fc_gw)
428 		return -EINVAL;
429 
430 	if (cfg->fc_scope == RT_SCOPE_NOWHERE)
431 		cfg->fc_scope = RT_SCOPE_LINK;
432 
433 	if (rt->rt_flags & (RTF_MTU | RTF_WINDOW | RTF_IRTT)) {
434 		struct nlattr *mx;
435 		int len = 0;
436 
437 		mx = kzalloc(3 * nla_total_size(4), GFP_KERNEL);
438 		if (mx == NULL)
439 			return -ENOMEM;
440 
441 		if (rt->rt_flags & RTF_MTU)
442 			len = put_rtax(mx, len, RTAX_ADVMSS, rt->rt_mtu - 40);
443 
444 		if (rt->rt_flags & RTF_WINDOW)
445 			len = put_rtax(mx, len, RTAX_WINDOW, rt->rt_window);
446 
447 		if (rt->rt_flags & RTF_IRTT)
448 			len = put_rtax(mx, len, RTAX_RTT, rt->rt_irtt << 3);
449 
450 		cfg->fc_mx = mx;
451 		cfg->fc_mx_len = len;
452 	}
453 
454 	return 0;
455 }
456 
457 /*
458  *	Handle IP routing ioctl calls. These are used to manipulate the routing tables
459  */
460 
461 int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg)
462 {
463 	struct fib_config cfg;
464 	struct rtentry rt;
465 	int err;
466 
467 	switch (cmd) {
468 	case SIOCADDRT:		/* Add a route */
469 	case SIOCDELRT:		/* Delete a route */
470 		if (!capable(CAP_NET_ADMIN))
471 			return -EPERM;
472 
473 		if (copy_from_user(&rt, arg, sizeof(rt)))
474 			return -EFAULT;
475 
476 		rtnl_lock();
477 		err = rtentry_to_fib_config(net, cmd, &rt, &cfg);
478 		if (err == 0) {
479 			struct fib_table *tb;
480 
481 			if (cmd == SIOCDELRT) {
482 				tb = fib_get_table(net, cfg.fc_table);
483 				if (tb)
484 					err = fib_table_delete(tb, &cfg);
485 				else
486 					err = -ESRCH;
487 			} else {
488 				tb = fib_new_table(net, cfg.fc_table);
489 				if (tb)
490 					err = fib_table_insert(tb, &cfg);
491 				else
492 					err = -ENOBUFS;
493 			}
494 
495 			/* allocated by rtentry_to_fib_config() */
496 			kfree(cfg.fc_mx);
497 		}
498 		rtnl_unlock();
499 		return err;
500 	}
501 	return -EINVAL;
502 }
503 
504 const struct nla_policy rtm_ipv4_policy[RTA_MAX+1] = {
505 	[RTA_DST]		= { .type = NLA_U32 },
506 	[RTA_SRC]		= { .type = NLA_U32 },
507 	[RTA_IIF]		= { .type = NLA_U32 },
508 	[RTA_OIF]		= { .type = NLA_U32 },
509 	[RTA_GATEWAY]		= { .type = NLA_U32 },
510 	[RTA_PRIORITY]		= { .type = NLA_U32 },
511 	[RTA_PREFSRC]		= { .type = NLA_U32 },
512 	[RTA_METRICS]		= { .type = NLA_NESTED },
513 	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
514 	[RTA_FLOW]		= { .type = NLA_U32 },
515 };
516 
517 static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
518 			    struct nlmsghdr *nlh, struct fib_config *cfg)
519 {
520 	struct nlattr *attr;
521 	int err, remaining;
522 	struct rtmsg *rtm;
523 
524 	err = nlmsg_validate(nlh, sizeof(*rtm), RTA_MAX, rtm_ipv4_policy);
525 	if (err < 0)
526 		goto errout;
527 
528 	memset(cfg, 0, sizeof(*cfg));
529 
530 	rtm = nlmsg_data(nlh);
531 	cfg->fc_dst_len = rtm->rtm_dst_len;
532 	cfg->fc_tos = rtm->rtm_tos;
533 	cfg->fc_table = rtm->rtm_table;
534 	cfg->fc_protocol = rtm->rtm_protocol;
535 	cfg->fc_scope = rtm->rtm_scope;
536 	cfg->fc_type = rtm->rtm_type;
537 	cfg->fc_flags = rtm->rtm_flags;
538 	cfg->fc_nlflags = nlh->nlmsg_flags;
539 
540 	cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
541 	cfg->fc_nlinfo.nlh = nlh;
542 	cfg->fc_nlinfo.nl_net = net;
543 
544 	if (cfg->fc_type > RTN_MAX) {
545 		err = -EINVAL;
546 		goto errout;
547 	}
548 
549 	nlmsg_for_each_attr(attr, nlh, sizeof(struct rtmsg), remaining) {
550 		switch (nla_type(attr)) {
551 		case RTA_DST:
552 			cfg->fc_dst = nla_get_be32(attr);
553 			break;
554 		case RTA_OIF:
555 			cfg->fc_oif = nla_get_u32(attr);
556 			break;
557 		case RTA_GATEWAY:
558 			cfg->fc_gw = nla_get_be32(attr);
559 			break;
560 		case RTA_PRIORITY:
561 			cfg->fc_priority = nla_get_u32(attr);
562 			break;
563 		case RTA_PREFSRC:
564 			cfg->fc_prefsrc = nla_get_be32(attr);
565 			break;
566 		case RTA_METRICS:
567 			cfg->fc_mx = nla_data(attr);
568 			cfg->fc_mx_len = nla_len(attr);
569 			break;
570 		case RTA_MULTIPATH:
571 			cfg->fc_mp = nla_data(attr);
572 			cfg->fc_mp_len = nla_len(attr);
573 			break;
574 		case RTA_FLOW:
575 			cfg->fc_flow = nla_get_u32(attr);
576 			break;
577 		case RTA_TABLE:
578 			cfg->fc_table = nla_get_u32(attr);
579 			break;
580 		}
581 	}
582 
583 	return 0;
584 errout:
585 	return err;
586 }
587 
588 static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
589 {
590 	struct net *net = sock_net(skb->sk);
591 	struct fib_config cfg;
592 	struct fib_table *tb;
593 	int err;
594 
595 	err = rtm_to_fib_config(net, skb, nlh, &cfg);
596 	if (err < 0)
597 		goto errout;
598 
599 	tb = fib_get_table(net, cfg.fc_table);
600 	if (tb == NULL) {
601 		err = -ESRCH;
602 		goto errout;
603 	}
604 
605 	err = fib_table_delete(tb, &cfg);
606 errout:
607 	return err;
608 }
609 
610 static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
611 {
612 	struct net *net = sock_net(skb->sk);
613 	struct fib_config cfg;
614 	struct fib_table *tb;
615 	int err;
616 
617 	err = rtm_to_fib_config(net, skb, nlh, &cfg);
618 	if (err < 0)
619 		goto errout;
620 
621 	tb = fib_new_table(net, cfg.fc_table);
622 	if (tb == NULL) {
623 		err = -ENOBUFS;
624 		goto errout;
625 	}
626 
627 	err = fib_table_insert(tb, &cfg);
628 errout:
629 	return err;
630 }
631 
632 static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
633 {
634 	struct net *net = sock_net(skb->sk);
635 	unsigned int h, s_h;
636 	unsigned int e = 0, s_e;
637 	struct fib_table *tb;
638 	struct hlist_node *node;
639 	struct hlist_head *head;
640 	int dumped = 0;
641 
642 	if (nlmsg_len(cb->nlh) >= sizeof(struct rtmsg) &&
643 	    ((struct rtmsg *) nlmsg_data(cb->nlh))->rtm_flags & RTM_F_CLONED)
644 		return ip_rt_dump(skb, cb);
645 
646 	s_h = cb->args[0];
647 	s_e = cb->args[1];
648 
649 	for (h = s_h; h < FIB_TABLE_HASHSZ; h++, s_e = 0) {
650 		e = 0;
651 		head = &net->ipv4.fib_table_hash[h];
652 		hlist_for_each_entry(tb, node, head, tb_hlist) {
653 			if (e < s_e)
654 				goto next;
655 			if (dumped)
656 				memset(&cb->args[2], 0, sizeof(cb->args) -
657 						 2 * sizeof(cb->args[0]));
658 			if (fib_table_dump(tb, skb, cb) < 0)
659 				goto out;
660 			dumped = 1;
661 next:
662 			e++;
663 		}
664 	}
665 out:
666 	cb->args[1] = e;
667 	cb->args[0] = h;
668 
669 	return skb->len;
670 }
671 
672 /* Prepare and feed intra-kernel routing request.
673    Really, it should be netlink message, but :-( netlink
674    can be not configured, so that we feed it directly
675    to fib engine. It is legal, because all events occur
676    only when netlink is already locked.
677  */
678 
679 static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa)
680 {
681 	struct net *net = dev_net(ifa->ifa_dev->dev);
682 	struct fib_table *tb;
683 	struct fib_config cfg = {
684 		.fc_protocol = RTPROT_KERNEL,
685 		.fc_type = type,
686 		.fc_dst = dst,
687 		.fc_dst_len = dst_len,
688 		.fc_prefsrc = ifa->ifa_local,
689 		.fc_oif = ifa->ifa_dev->dev->ifindex,
690 		.fc_nlflags = NLM_F_CREATE | NLM_F_APPEND,
691 		.fc_nlinfo = {
692 			.nl_net = net,
693 		},
694 	};
695 
696 	if (type == RTN_UNICAST)
697 		tb = fib_new_table(net, RT_TABLE_MAIN);
698 	else
699 		tb = fib_new_table(net, RT_TABLE_LOCAL);
700 
701 	if (tb == NULL)
702 		return;
703 
704 	cfg.fc_table = tb->tb_id;
705 
706 	if (type != RTN_LOCAL)
707 		cfg.fc_scope = RT_SCOPE_LINK;
708 	else
709 		cfg.fc_scope = RT_SCOPE_HOST;
710 
711 	if (cmd == RTM_NEWROUTE)
712 		fib_table_insert(tb, &cfg);
713 	else
714 		fib_table_delete(tb, &cfg);
715 }
716 
717 void fib_add_ifaddr(struct in_ifaddr *ifa)
718 {
719 	struct in_device *in_dev = ifa->ifa_dev;
720 	struct net_device *dev = in_dev->dev;
721 	struct in_ifaddr *prim = ifa;
722 	__be32 mask = ifa->ifa_mask;
723 	__be32 addr = ifa->ifa_local;
724 	__be32 prefix = ifa->ifa_address&mask;
725 
726 	if (ifa->ifa_flags&IFA_F_SECONDARY) {
727 		prim = inet_ifa_byprefix(in_dev, prefix, mask);
728 		if (prim == NULL) {
729 			printk(KERN_WARNING "fib_add_ifaddr: bug: prim == NULL\n");
730 			return;
731 		}
732 	}
733 
734 	fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim);
735 
736 	if (!(dev->flags&IFF_UP))
737 		return;
738 
739 	/* Add broadcast address, if it is explicitly assigned. */
740 	if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF))
741 		fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
742 
743 	if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags&IFA_F_SECONDARY) &&
744 	    (prefix != addr || ifa->ifa_prefixlen < 32)) {
745 		fib_magic(RTM_NEWROUTE, dev->flags&IFF_LOOPBACK ? RTN_LOCAL :
746 			  RTN_UNICAST, prefix, ifa->ifa_prefixlen, prim);
747 
748 		/* Add network specific broadcasts, when it takes a sense */
749 		if (ifa->ifa_prefixlen < 31) {
750 			fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32, prim);
751 			fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix|~mask, 32, prim);
752 		}
753 	}
754 }
755 
756 static void fib_del_ifaddr(struct in_ifaddr *ifa)
757 {
758 	struct in_device *in_dev = ifa->ifa_dev;
759 	struct net_device *dev = in_dev->dev;
760 	struct in_ifaddr *ifa1;
761 	struct in_ifaddr *prim = ifa;
762 	__be32 brd = ifa->ifa_address|~ifa->ifa_mask;
763 	__be32 any = ifa->ifa_address&ifa->ifa_mask;
764 #define LOCAL_OK	1
765 #define BRD_OK		2
766 #define BRD0_OK		4
767 #define BRD1_OK		8
768 	unsigned ok = 0;
769 
770 	if (!(ifa->ifa_flags&IFA_F_SECONDARY))
771 		fib_magic(RTM_DELROUTE, dev->flags&IFF_LOOPBACK ? RTN_LOCAL :
772 			  RTN_UNICAST, any, ifa->ifa_prefixlen, prim);
773 	else {
774 		prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask);
775 		if (prim == NULL) {
776 			printk(KERN_WARNING "fib_del_ifaddr: bug: prim == NULL\n");
777 			return;
778 		}
779 	}
780 
781 	/* Deletion is more complicated than add.
782 	   We should take care of not to delete too much :-)
783 
784 	   Scan address list to be sure that addresses are really gone.
785 	 */
786 
787 	for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) {
788 		if (ifa->ifa_local == ifa1->ifa_local)
789 			ok |= LOCAL_OK;
790 		if (ifa->ifa_broadcast == ifa1->ifa_broadcast)
791 			ok |= BRD_OK;
792 		if (brd == ifa1->ifa_broadcast)
793 			ok |= BRD1_OK;
794 		if (any == ifa1->ifa_broadcast)
795 			ok |= BRD0_OK;
796 	}
797 
798 	if (!(ok&BRD_OK))
799 		fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
800 	if (!(ok&BRD1_OK))
801 		fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim);
802 	if (!(ok&BRD0_OK))
803 		fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim);
804 	if (!(ok&LOCAL_OK)) {
805 		fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim);
806 
807 		/* Check, that this local address finally disappeared. */
808 		if (inet_addr_type(dev_net(dev), ifa->ifa_local) != RTN_LOCAL) {
809 			/* And the last, but not the least thing.
810 			   We must flush stray FIB entries.
811 
812 			   First of all, we scan fib_info list searching
813 			   for stray nexthop entries, then ignite fib_flush.
814 			*/
815 			if (fib_sync_down_addr(dev_net(dev), ifa->ifa_local))
816 				fib_flush(dev_net(dev));
817 		}
818 	}
819 #undef LOCAL_OK
820 #undef BRD_OK
821 #undef BRD0_OK
822 #undef BRD1_OK
823 }
824 
825 static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb )
826 {
827 
828 	struct fib_result       res;
829 	struct flowi            fl = { .mark = frn->fl_mark,
830 				       .nl_u = { .ip4_u = { .daddr = frn->fl_addr,
831 							    .tos = frn->fl_tos,
832 							    .scope = frn->fl_scope } } };
833 
834 #ifdef CONFIG_IP_MULTIPLE_TABLES
835 	res.r = NULL;
836 #endif
837 
838 	frn->err = -ENOENT;
839 	if (tb) {
840 		local_bh_disable();
841 
842 		frn->tb_id = tb->tb_id;
843 		frn->err = fib_table_lookup(tb, &fl, &res);
844 
845 		if (!frn->err) {
846 			frn->prefixlen = res.prefixlen;
847 			frn->nh_sel = res.nh_sel;
848 			frn->type = res.type;
849 			frn->scope = res.scope;
850 			fib_res_put(&res);
851 		}
852 		local_bh_enable();
853 	}
854 }
855 
856 static void nl_fib_input(struct sk_buff *skb)
857 {
858 	struct net *net;
859 	struct fib_result_nl *frn;
860 	struct nlmsghdr *nlh;
861 	struct fib_table *tb;
862 	u32 pid;
863 
864 	net = sock_net(skb->sk);
865 	nlh = nlmsg_hdr(skb);
866 	if (skb->len < NLMSG_SPACE(0) || skb->len < nlh->nlmsg_len ||
867 	    nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*frn)))
868 		return;
869 
870 	skb = skb_clone(skb, GFP_KERNEL);
871 	if (skb == NULL)
872 		return;
873 	nlh = nlmsg_hdr(skb);
874 
875 	frn = (struct fib_result_nl *) NLMSG_DATA(nlh);
876 	tb = fib_get_table(net, frn->tb_id_in);
877 
878 	nl_fib_lookup(frn, tb);
879 
880 	pid = NETLINK_CB(skb).pid;       /* pid of sending process */
881 	NETLINK_CB(skb).pid = 0;         /* from kernel */
882 	NETLINK_CB(skb).dst_group = 0;  /* unicast */
883 	netlink_unicast(net->ipv4.fibnl, skb, pid, MSG_DONTWAIT);
884 }
885 
886 static int nl_fib_lookup_init(struct net *net)
887 {
888 	struct sock *sk;
889 	sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, 0,
890 				   nl_fib_input, NULL, THIS_MODULE);
891 	if (sk == NULL)
892 		return -EAFNOSUPPORT;
893 	net->ipv4.fibnl = sk;
894 	return 0;
895 }
896 
897 static void nl_fib_lookup_exit(struct net *net)
898 {
899 	netlink_kernel_release(net->ipv4.fibnl);
900 	net->ipv4.fibnl = NULL;
901 }
902 
/*
 * Take IP down on @dev: sync the FIB for the device (flushing the tables
 * if that reports a change), flush the routing cache with the given
 * @delay, and notify ARP that the interface went down.  @force is passed
 * through to fib_sync_down_dev().
 */
static void fib_disable_ip(struct net_device *dev, int force, int delay)
{
	struct net *net = dev_net(dev);

	if (fib_sync_down_dev(dev, force))
		fib_flush(net);

	rt_cache_flush(net, delay);
	arp_ifdown(dev);
}
910 
911 static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr)
912 {
913 	struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
914 	struct net_device *dev = ifa->ifa_dev->dev;
915 
916 	switch (event) {
917 	case NETDEV_UP:
918 		fib_add_ifaddr(ifa);
919 #ifdef CONFIG_IP_ROUTE_MULTIPATH
920 		fib_sync_up(dev);
921 #endif
922 		rt_cache_flush(dev_net(dev), -1);
923 		break;
924 	case NETDEV_DOWN:
925 		fib_del_ifaddr(ifa);
926 		if (ifa->ifa_dev->ifa_list == NULL) {
927 			/* Last address was deleted from this interface.
928 			   Disable IP.
929 			 */
930 			fib_disable_ip(dev, 1, 0);
931 		} else {
932 			rt_cache_flush(dev_net(dev), -1);
933 		}
934 		break;
935 	}
936 	return NOTIFY_DONE;
937 }
938 
939 static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
940 {
941 	struct net_device *dev = ptr;
942 	struct in_device *in_dev = __in_dev_get_rtnl(dev);
943 
944 	if (event == NETDEV_UNREGISTER) {
945 		fib_disable_ip(dev, 2, -1);
946 		return NOTIFY_DONE;
947 	}
948 
949 	if (!in_dev)
950 		return NOTIFY_DONE;
951 
952 	switch (event) {
953 	case NETDEV_UP:
954 		for_ifa(in_dev) {
955 			fib_add_ifaddr(ifa);
956 		} endfor_ifa(in_dev);
957 #ifdef CONFIG_IP_ROUTE_MULTIPATH
958 		fib_sync_up(dev);
959 #endif
960 		rt_cache_flush(dev_net(dev), -1);
961 		break;
962 	case NETDEV_DOWN:
963 		fib_disable_ip(dev, 0, 0);
964 		break;
965 	case NETDEV_CHANGEMTU:
966 	case NETDEV_CHANGE:
967 		rt_cache_flush(dev_net(dev), 0);
968 		break;
969 	case NETDEV_UNREGISTER_BATCH:
970 		rt_cache_flush_batch();
971 		break;
972 	}
973 	return NOTIFY_DONE;
974 }
975 
976 static struct notifier_block fib_inetaddr_notifier = {
977 	.notifier_call = fib_inetaddr_event,
978 };
979 
980 static struct notifier_block fib_netdev_notifier = {
981 	.notifier_call = fib_netdev_event,
982 };
983 
984 static int __net_init ip_fib_net_init(struct net *net)
985 {
986 	int err;
987 	unsigned int i;
988 
989 	net->ipv4.fib_table_hash = kzalloc(
990 			sizeof(struct hlist_head)*FIB_TABLE_HASHSZ, GFP_KERNEL);
991 	if (net->ipv4.fib_table_hash == NULL)
992 		return -ENOMEM;
993 
994 	for (i = 0; i < FIB_TABLE_HASHSZ; i++)
995 		INIT_HLIST_HEAD(&net->ipv4.fib_table_hash[i]);
996 
997 	err = fib4_rules_init(net);
998 	if (err < 0)
999 		goto fail;
1000 	return 0;
1001 
1002 fail:
1003 	kfree(net->ipv4.fib_table_hash);
1004 	return err;
1005 }
1006 
1007 static void __net_exit ip_fib_net_exit(struct net *net)
1008 {
1009 	unsigned int i;
1010 
1011 #ifdef CONFIG_IP_MULTIPLE_TABLES
1012 	fib4_rules_exit(net);
1013 #endif
1014 
1015 	for (i = 0; i < FIB_TABLE_HASHSZ; i++) {
1016 		struct fib_table *tb;
1017 		struct hlist_head *head;
1018 		struct hlist_node *node, *tmp;
1019 
1020 		head = &net->ipv4.fib_table_hash[i];
1021 		hlist_for_each_entry_safe(tb, node, tmp, head, tb_hlist) {
1022 			hlist_del(node);
1023 			fib_table_flush(tb);
1024 			kfree(tb);
1025 		}
1026 	}
1027 	kfree(net->ipv4.fib_table_hash);
1028 }
1029 
1030 static int __net_init fib_net_init(struct net *net)
1031 {
1032 	int error;
1033 
1034 	error = ip_fib_net_init(net);
1035 	if (error < 0)
1036 		goto out;
1037 	error = nl_fib_lookup_init(net);
1038 	if (error < 0)
1039 		goto out_nlfl;
1040 	error = fib_proc_init(net);
1041 	if (error < 0)
1042 		goto out_proc;
1043 out:
1044 	return error;
1045 
1046 out_proc:
1047 	nl_fib_lookup_exit(net);
1048 out_nlfl:
1049 	ip_fib_net_exit(net);
1050 	goto out;
1051 }
1052 
1053 static void __net_exit fib_net_exit(struct net *net)
1054 {
1055 	fib_proc_exit(net);
1056 	nl_fib_lookup_exit(net);
1057 	ip_fib_net_exit(net);
1058 }
1059 
1060 static struct pernet_operations fib_net_ops = {
1061 	.init = fib_net_init,
1062 	.exit = fib_net_exit,
1063 };
1064 
1065 void __init ip_fib_init(void)
1066 {
1067 	rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL);
1068 	rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL);
1069 	rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib);
1070 
1071 	register_pernet_subsys(&fib_net_ops);
1072 	register_netdevice_notifier(&fib_netdev_notifier);
1073 	register_inetaddr_notifier(&fib_inetaddr_notifier);
1074 
1075 	fib_hash_init();
1076 }
1077 
/* Symbols exported for use outside this file. */
EXPORT_SYMBOL(inet_addr_type);
EXPORT_SYMBOL(inet_dev_addr_type);
EXPORT_SYMBOL(ip_dev_find);
1081