1  // SPDX-License-Identifier: GPL-2.0-or-later
2  /*
3   * INET		An implementation of the TCP/IP protocol suite for the LINUX
4   *		operating system.  INET is implemented using the  BSD Socket
5   *		interface as the means of communication with the user level.
6   *
7   *		IPv4 Forwarding Information Base: FIB frontend.
8   *
9   * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10   */
11  
12  #include <linux/module.h>
13  #include <linux/uaccess.h>
14  #include <linux/bitops.h>
15  #include <linux/capability.h>
16  #include <linux/types.h>
17  #include <linux/kernel.h>
18  #include <linux/mm.h>
19  #include <linux/string.h>
20  #include <linux/socket.h>
21  #include <linux/sockios.h>
22  #include <linux/errno.h>
23  #include <linux/in.h>
24  #include <linux/inet.h>
25  #include <linux/inetdevice.h>
26  #include <linux/netdevice.h>
27  #include <linux/if_addr.h>
28  #include <linux/if_arp.h>
29  #include <linux/skbuff.h>
30  #include <linux/cache.h>
31  #include <linux/init.h>
32  #include <linux/list.h>
33  #include <linux/slab.h>
34  
35  #include <net/inet_dscp.h>
36  #include <net/ip.h>
37  #include <net/protocol.h>
38  #include <net/route.h>
39  #include <net/tcp.h>
40  #include <net/sock.h>
41  #include <net/arp.h>
42  #include <net/ip_fib.h>
43  #include <net/nexthop.h>
44  #include <net/rtnetlink.h>
45  #include <net/xfrm.h>
46  #include <net/l3mdev.h>
47  #include <net/lwtunnel.h>
48  #include <trace/events/fib.h>
49  
50  #ifndef CONFIG_IP_MULTIPLE_TABLES
51  
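/* Without policy routing (CONFIG_IP_MULTIPLE_TABLES=n) only the LOCAL and
 * MAIN tables exist.  The LOCAL table is created with MAIN as its alias,
 * so the two share a single trie and are looked up together.
 */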
52  static int __net_init fib4_rules_init(struct net *net)
53  {
54  	struct fib_table *local_table, *main_table;
55  
56  	main_table  = fib_trie_table(RT_TABLE_MAIN, NULL);
57  	if (!main_table)
58  		return -ENOMEM;
59  
60  	local_table = fib_trie_table(RT_TABLE_LOCAL, main_table);
61  	if (!local_table)
62  		goto fail;
63  
64  	hlist_add_head_rcu(&local_table->tb_hlist,
65  				&net->ipv4.fib_table_hash[TABLE_LOCAL_INDEX]);
66  	hlist_add_head_rcu(&main_table->tb_hlist,
67  				&net->ipv4.fib_table_hash[TABLE_MAIN_INDEX]);
68  	return 0;
69  
70  fail:
71  	fib_free_table(main_table);
72  	return -ENOMEM;
73  }
74  #else
75  
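/* Look up a table by id, creating it on demand.  As long as no custom FIB
 * rules are installed, a newly created LOCAL table is aliased to the MAIN
 * table (see the alias argument to fib_trie_table()) so the two stay
 * merged.
 */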
76  struct fib_table *fib_new_table(struct net *net, u32 id)
77  {
78  	struct fib_table *tb, *alias = NULL;
79  	unsigned int h;
80  
81  	if (id == 0)
82  		id = RT_TABLE_MAIN;
83  	tb = fib_get_table(net, id);
84  	if (tb)
85  		return tb;
86  
87  	if (id == RT_TABLE_LOCAL && !net->ipv4.fib_has_custom_rules)
88  		alias = fib_new_table(net, RT_TABLE_MAIN);
89  
90  	tb = fib_trie_table(id, alias);
91  	if (!tb)
92  		return NULL;
93  
94  	switch (id) {
95  	case RT_TABLE_MAIN:
96  		rcu_assign_pointer(net->ipv4.fib_main, tb);
97  		break;
98  	case RT_TABLE_DEFAULT:
99  		rcu_assign_pointer(net->ipv4.fib_default, tb);
100  		break;
101  	default:
102  		break;
103  	}
104  
105  	h = id & (FIB_TABLE_HASHSZ - 1);
106  	hlist_add_head_rcu(&tb->tb_hlist, &net->ipv4.fib_table_hash[h]);
107  	return tb;
108  }
109  EXPORT_SYMBOL_GPL(fib_new_table);
110  
111  /* caller must hold either rtnl or rcu read lock */
112  struct fib_table *fib_get_table(struct net *net, u32 id)
113  {
114  	struct fib_table *tb;
115  	struct hlist_head *head;
116  	unsigned int h;
117  
118  	if (id == 0)
119  		id = RT_TABLE_MAIN;
120  	h = id & (FIB_TABLE_HASHSZ - 1);
121  
122  	head = &net->ipv4.fib_table_hash[h];
123  	hlist_for_each_entry_rcu(tb, head, tb_hlist,
124  				 lockdep_rtnl_is_held()) {
125  		if (tb->tb_id == id)
126  			return tb;
127  	}
128  	return NULL;
129  }
130  #endif /* CONFIG_IP_MULTIPLE_TABLES */
131  
132  static void fib_replace_table(struct net *net, struct fib_table *old,
133  			      struct fib_table *new)
134  {
135  #ifdef CONFIG_IP_MULTIPLE_TABLES
136  	switch (new->tb_id) {
137  	case RT_TABLE_MAIN:
138  		rcu_assign_pointer(net->ipv4.fib_main, new);
139  		break;
140  	case RT_TABLE_DEFAULT:
141  		rcu_assign_pointer(net->ipv4.fib_default, new);
142  		break;
143  	default:
144  		break;
145  	}
146  
147  #endif
148  	/* replace the old table in the hlist */
149  	hlist_replace_rcu(&old->tb_hlist, &new->tb_hlist);
150  }
151  
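/* Undo the LOCAL/MAIN table merge: give the LOCAL table its own trie and
 * flush the local entries that were shared with the MAIN table.  Typically
 * invoked once custom FIB rules make the merged layout unusable.
 */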
152  int fib_unmerge(struct net *net)
153  {
154  	struct fib_table *old, *new, *main_table;
155  
156  	/* attempt to fetch local table if it has been allocated */
157  	old = fib_get_table(net, RT_TABLE_LOCAL);
158  	if (!old)
159  		return 0;
160  
161  	new = fib_trie_unmerge(old);
162  	if (!new)
163  		return -ENOMEM;
164  
165  	/* table is already unmerged */
166  	if (new == old)
167  		return 0;
168  
169  	/* replace merged table with clean table */
170  	fib_replace_table(net, old, new);
171  	fib_free_table(old);
172  
173  	/* attempt to fetch main table if it has been allocated */
174  	main_table = fib_get_table(net, RT_TABLE_MAIN);
175  	if (!main_table)
176  		return 0;
177  
178  	/* flush local entries from main table */
179  	fib_table_flush_external(main_table);
180  
181  	return 0;
182  }
183  
184  void fib_flush(struct net *net)
185  {
186  	int flushed = 0;
187  	unsigned int h;
188  
189  	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
190  		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
191  		struct hlist_node *tmp;
192  		struct fib_table *tb;
193  
194  		hlist_for_each_entry_safe(tb, tmp, head, tb_hlist)
195  			flushed += fib_table_flush(net, tb, false);
196  	}
197  
198  	if (flushed)
199  		rt_cache_flush(net);
200  }
201  
202  /*
203   * Find the address type as if only "dev" were present in the system. If
204   * dev is NULL then all interfaces are taken into consideration.
205   */
206  static inline unsigned int __inet_dev_addr_type(struct net *net,
207  						const struct net_device *dev,
208  						__be32 addr, u32 tb_id)
209  {
210  	struct flowi4		fl4 = { .daddr = addr };
211  	struct fib_result	res;
212  	unsigned int ret = RTN_BROADCAST;
213  	struct fib_table *table;
214  
215  	if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr))
216  		return RTN_BROADCAST;
217  	if (ipv4_is_multicast(addr))
218  		return RTN_MULTICAST;
219  
220  	rcu_read_lock();
221  
222  	table = fib_get_table(net, tb_id);
223  	if (table) {
224  		ret = RTN_UNICAST;
225  		if (!fib_table_lookup(table, &fl4, &res, FIB_LOOKUP_NOREF)) {
226  			struct fib_nh_common *nhc = fib_info_nhc(res.fi, 0);
227  
228  			if (!dev || dev == nhc->nhc_dev)
229  				ret = res.type;
230  		}
231  	}
232  
233  	rcu_read_unlock();
234  	return ret;
235  }
236  
237  unsigned int inet_addr_type_table(struct net *net, __be32 addr, u32 tb_id)
238  {
239  	return __inet_dev_addr_type(net, NULL, addr, tb_id);
240  }
241  EXPORT_SYMBOL(inet_addr_type_table);
242  
243  unsigned int inet_addr_type(struct net *net, __be32 addr)
244  {
245  	return __inet_dev_addr_type(net, NULL, addr, RT_TABLE_LOCAL);
246  }
247  EXPORT_SYMBOL(inet_addr_type);
248  
249  unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
250  				__be32 addr)
251  {
252  	u32 rt_table = l3mdev_fib_table(dev) ? : RT_TABLE_LOCAL;
253  
254  	return __inet_dev_addr_type(net, dev, addr, rt_table);
255  }
256  EXPORT_SYMBOL(inet_dev_addr_type);
257  
258  /* inet_addr_type with dev == NULL but using the table from a dev
259   * if one is associated
260   */
261  unsigned int inet_addr_type_dev_table(struct net *net,
262  				      const struct net_device *dev,
263  				      __be32 addr)
264  {
265  	u32 rt_table = l3mdev_fib_table(dev) ? : RT_TABLE_LOCAL;
266  
267  	return __inet_dev_addr_type(net, NULL, addr, rt_table);
268  }
269  EXPORT_SYMBOL(inet_addr_type_dev_table);
270  
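/* Compute the RFC 1122 "specific destination" for a received packet, i.e.
 * the local address a reply should be sourced from.  Packets addressed to
 * us simply use the destination address; otherwise the packet's source is
 * looked up in the FIB (honouring prefsrc), with inet_select_addr() as the
 * fallback.
 */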
271  __be32 fib_compute_spec_dst(struct sk_buff *skb)
272  {
273  	struct net_device *dev = skb->dev;
274  	struct in_device *in_dev;
275  	struct fib_result res;
276  	struct rtable *rt;
277  	struct net *net;
278  	int scope;
279  
280  	rt = skb_rtable(skb);
281  	if ((rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL)) ==
282  	    RTCF_LOCAL)
283  		return ip_hdr(skb)->daddr;
284  
285  	in_dev = __in_dev_get_rcu(dev);
286  
287  	net = dev_net(dev);
288  
289  	scope = RT_SCOPE_UNIVERSE;
290  	if (!ipv4_is_zeronet(ip_hdr(skb)->saddr)) {
291  		bool vmark = in_dev && IN_DEV_SRC_VMARK(in_dev);
292  		struct flowi4 fl4 = {
293  			.flowi4_iif = LOOPBACK_IFINDEX,
294  			.flowi4_l3mdev = l3mdev_master_ifindex_rcu(dev),
295  			.daddr = ip_hdr(skb)->saddr,
296  			.flowi4_tos = ip_hdr(skb)->tos & IPTOS_RT_MASK,
297  			.flowi4_scope = scope,
298  			.flowi4_mark = vmark ? skb->mark : 0,
299  		};
300  		if (!fib_lookup(net, &fl4, &res, 0))
301  			return fib_result_prefsrc(net, &res);
302  	} else {
303  		scope = RT_SCOPE_LINK;
304  	}
305  
306  	return inet_select_addr(dev, ip_hdr(skb)->saddr, scope);
307  }
308  
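/* Return true if any nexthop of @fi uses @dev.  With multipath support this
 * also matches when @dev is the L3 master of the nexthop device.
 */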
309  bool fib_info_nh_uses_dev(struct fib_info *fi, const struct net_device *dev)
310  {
311  	bool dev_match = false;
312  #ifdef CONFIG_IP_ROUTE_MULTIPATH
313  	if (unlikely(fi->nh)) {
314  		dev_match = nexthop_uses_dev(fi->nh, dev);
315  	} else {
316  		int ret;
317  
318  		for (ret = 0; ret < fib_info_num_path(fi); ret++) {
319  			const struct fib_nh_common *nhc = fib_info_nhc(fi, ret);
320  
321  			if (nhc_l3mdev_matches_dev(nhc, dev)) {
322  				dev_match = true;
323  				break;
324  			}
325  		}
326  	}
327  #else
328  	if (fib_info_nhc(fi, 0)->nhc_dev == dev)
329  		dev_match = true;
330  #endif
331  
332  	return dev_match;
333  }
334  EXPORT_SYMBOL_GPL(fib_info_nh_uses_dev);
335  
336  /* Given (packet source, input interface) and optional (dst, oif, tos):
337   * - (main) check that the source is valid, i.e. not broadcast or one of
338   *   our local addresses.
339   * - figure out what "logical" interface this packet arrived on
340   *   and calculate the "specific destination" address.
341   * - check that the packet arrived from the expected physical interface.
342   * Called with rcu_read_lock() held.
343   */
344  static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
345  				 u8 tos, int oif, struct net_device *dev,
346  				 int rpf, struct in_device *idev, u32 *itag)
347  {
348  	struct net *net = dev_net(dev);
349  	struct flow_keys flkeys;
350  	int ret, no_addr;
351  	struct fib_result res;
352  	struct flowi4 fl4;
353  	bool dev_match;
354  
355  	fl4.flowi4_oif = 0;
356  	fl4.flowi4_l3mdev = l3mdev_master_ifindex_rcu(dev);
357  	fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX;
358  	fl4.daddr = src;
359  	fl4.saddr = dst;
360  	fl4.flowi4_tos = tos;
361  	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
362  	fl4.flowi4_tun_key.tun_id = 0;
363  	fl4.flowi4_flags = 0;
364  	fl4.flowi4_uid = sock_net_uid(net, NULL);
365  	fl4.flowi4_multipath_hash = 0;
366  
367  	no_addr = idev->ifa_list == NULL;
368  
369  	fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? skb->mark : 0;
370  	if (!fib4_rules_early_flow_dissect(net, skb, &fl4, &flkeys)) {
371  		fl4.flowi4_proto = 0;
372  		fl4.fl4_sport = 0;
373  		fl4.fl4_dport = 0;
374  	} else {
375  		swap(fl4.fl4_sport, fl4.fl4_dport);
376  	}
377  
378  	if (fib_lookup(net, &fl4, &res, 0))
379  		goto last_resort;
380  	if (res.type != RTN_UNICAST &&
381  	    (res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev)))
382  		goto e_inval;
383  	fib_combine_itag(itag, &res);
384  
385  	dev_match = fib_info_nh_uses_dev(res.fi, dev);
386  	/* This is not common, loopback packets retain skb_dst so normally they
387  	 * would not even hit this slow path.
388  	 */
389  	dev_match = dev_match || (res.type == RTN_LOCAL &&
390  				  dev == net->loopback_dev);
391  	if (dev_match) {
392  		ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_HOST;
393  		return ret;
394  	}
395  	if (no_addr)
396  		goto last_resort;
397  	if (rpf == 1)
398  		goto e_rpf;
399  	fl4.flowi4_oif = dev->ifindex;
400  
401  	ret = 0;
402  	if (fib_lookup(net, &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE) == 0) {
403  		if (res.type == RTN_UNICAST)
404  			ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_HOST;
405  	}
406  	return ret;
407  
408  last_resort:
409  	if (rpf)
410  		goto e_rpf;
411  	*itag = 0;
412  	return 0;
413  
414  e_inval:
415  	return -EINVAL;
416  e_rpf:
417  	return -EXDEV;
418  }
419  
420  /* Ignore rp_filter for packets protected by IPsec. */
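/* Fast path: when reverse path filtering is off, there are no tclassid
 * users and redirects would not be sent back out the ingress interface,
 * checking the source against our local addresses is sufficient; every
 * other case falls through to the full __fib_validate_source() lookup.
 */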
421  int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
422  			u8 tos, int oif, struct net_device *dev,
423  			struct in_device *idev, u32 *itag)
424  {
425  	int r = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev);
426  	struct net *net = dev_net(dev);
427  
428  	if (!r && !fib_num_tclassid_users(net) &&
429  	    (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) {
430  		if (IN_DEV_ACCEPT_LOCAL(idev))
431  			goto ok;
432  		/* With custom local routes in place, checking local addresses
433  		 * only would be too optimistic; with custom rules, checking
434  		 * local addresses only can be too strict, e.g. due to VRF.
435  		 */
436  		if (net->ipv4.fib_has_custom_local_routes ||
437  		    fib4_has_custom_rules(net))
438  			goto full_check;
439  		/* Within the same netns this is regarded as a martian source;
440  		 * the same address in a different netns on the same host is not.
441  		 */
442  		if (inet_lookup_ifaddr_rcu(net, src))
443  			return -EINVAL;
444  
445  ok:
446  		*itag = 0;
447  		return 0;
448  	}
449  
450  full_check:
451  	return __fib_validate_source(skb, src, dst, tos, oif, dev, r, idev, itag);
452  }
453  
454  static inline __be32 sk_extract_addr(struct sockaddr *addr)
455  {
456  	return ((struct sockaddr_in *) addr)->sin_addr.s_addr;
457  }
458  
459  static int put_rtax(struct nlattr *mx, int len, int type, u32 value)
460  {
461  	struct nlattr *nla;
462  
463  	nla = (struct nlattr *) ((char *) mx + len);
464  	nla->nla_type = type;
465  	nla->nla_len = nla_attr_size(4);
466  	*(u32 *) nla_data(nla) = value;
467  
468  	return len + nla_total_size(4);
469  }
470  
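/* Translate a legacy SIOCADDRT/SIOCDELRT rtentry into a struct fib_config.
 * Any metrics are returned in cfg->fc_mx, which the caller (ip_rt_ioctl())
 * is responsible for freeing.
 */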
471  static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt,
472  				 struct fib_config *cfg)
473  {
474  	__be32 addr;
475  	int plen;
476  
477  	memset(cfg, 0, sizeof(*cfg));
478  	cfg->fc_nlinfo.nl_net = net;
479  
480  	if (rt->rt_dst.sa_family != AF_INET)
481  		return -EAFNOSUPPORT;
482  
483  	/*
484  	 * Check mask for validity:
485  	 * a) it must be contiguous.
486  	 * b) destination must have all host bits clear.
487  	 * c) if application forgot to set correct family (AF_INET),
488  	 *    reject request unless it is absolutely clear i.e.
489  	 *    both family and mask are zero.
490  	 */
491  	plen = 32;
492  	addr = sk_extract_addr(&rt->rt_dst);
493  	if (!(rt->rt_flags & RTF_HOST)) {
494  		__be32 mask = sk_extract_addr(&rt->rt_genmask);
495  
496  		if (rt->rt_genmask.sa_family != AF_INET) {
497  			if (mask || rt->rt_genmask.sa_family)
498  				return -EAFNOSUPPORT;
499  		}
500  
501  		if (bad_mask(mask, addr))
502  			return -EINVAL;
503  
504  		plen = inet_mask_len(mask);
505  	}
506  
507  	cfg->fc_dst_len = plen;
508  	cfg->fc_dst = addr;
509  
510  	if (cmd != SIOCDELRT) {
511  		cfg->fc_nlflags = NLM_F_CREATE;
512  		cfg->fc_protocol = RTPROT_BOOT;
513  	}
514  
515  	if (rt->rt_metric)
516  		cfg->fc_priority = rt->rt_metric - 1;
517  
518  	if (rt->rt_flags & RTF_REJECT) {
519  		cfg->fc_scope = RT_SCOPE_HOST;
520  		cfg->fc_type = RTN_UNREACHABLE;
521  		return 0;
522  	}
523  
524  	cfg->fc_scope = RT_SCOPE_NOWHERE;
525  	cfg->fc_type = RTN_UNICAST;
526  
527  	if (rt->rt_dev) {
528  		char *colon;
529  		struct net_device *dev;
530  		char devname[IFNAMSIZ];
531  
532  		if (copy_from_user(devname, rt->rt_dev, IFNAMSIZ-1))
533  			return -EFAULT;
534  
535  		devname[IFNAMSIZ-1] = 0;
536  		colon = strchr(devname, ':');
537  		if (colon)
538  			*colon = 0;
539  		dev = __dev_get_by_name(net, devname);
540  		if (!dev)
541  			return -ENODEV;
542  		cfg->fc_oif = dev->ifindex;
543  		cfg->fc_table = l3mdev_fib_table(dev);
544  		if (colon) {
545  			const struct in_ifaddr *ifa;
546  			struct in_device *in_dev;
547  
548  			in_dev = __in_dev_get_rtnl(dev);
549  			if (!in_dev)
550  				return -ENODEV;
551  
552  			*colon = ':';
553  
554  			rcu_read_lock();
555  			in_dev_for_each_ifa_rcu(ifa, in_dev) {
556  				if (strcmp(ifa->ifa_label, devname) == 0)
557  					break;
558  			}
559  			rcu_read_unlock();
560  
561  			if (!ifa)
562  				return -ENODEV;
563  			cfg->fc_prefsrc = ifa->ifa_local;
564  		}
565  	}
566  
567  	addr = sk_extract_addr(&rt->rt_gateway);
568  	if (rt->rt_gateway.sa_family == AF_INET && addr) {
569  		unsigned int addr_type;
570  
571  		cfg->fc_gw4 = addr;
572  		cfg->fc_gw_family = AF_INET;
573  		addr_type = inet_addr_type_table(net, addr, cfg->fc_table);
574  		if (rt->rt_flags & RTF_GATEWAY &&
575  		    addr_type == RTN_UNICAST)
576  			cfg->fc_scope = RT_SCOPE_UNIVERSE;
577  	}
578  
579  	if (!cfg->fc_table)
580  		cfg->fc_table = RT_TABLE_MAIN;
581  
582  	if (cmd == SIOCDELRT)
583  		return 0;
584  
585  	if (rt->rt_flags & RTF_GATEWAY && !cfg->fc_gw_family)
586  		return -EINVAL;
587  
588  	if (cfg->fc_scope == RT_SCOPE_NOWHERE)
589  		cfg->fc_scope = RT_SCOPE_LINK;
590  
591  	if (rt->rt_flags & (RTF_MTU | RTF_WINDOW | RTF_IRTT)) {
592  		struct nlattr *mx;
593  		int len = 0;
594  
595  		mx = kcalloc(3, nla_total_size(4), GFP_KERNEL);
596  		if (!mx)
597  			return -ENOMEM;
598  
599  		if (rt->rt_flags & RTF_MTU)
600  			len = put_rtax(mx, len, RTAX_ADVMSS, rt->rt_mtu - 40);
601  
602  		if (rt->rt_flags & RTF_WINDOW)
603  			len = put_rtax(mx, len, RTAX_WINDOW, rt->rt_window);
604  
605  		if (rt->rt_flags & RTF_IRTT)
606  			len = put_rtax(mx, len, RTAX_RTT, rt->rt_irtt << 3);
607  
608  		cfg->fc_mx = mx;
609  		cfg->fc_mx_len = len;
610  	}
611  
612  	return 0;
613  }
614  
615  /*
616   * Handle IP routing ioctl calls.
617   * These are used to manipulate the routing tables
618   */
619  int ip_rt_ioctl(struct net *net, unsigned int cmd, struct rtentry *rt)
620  {
621  	struct fib_config cfg;
622  	int err;
623  
624  	switch (cmd) {
625  	case SIOCADDRT:		/* Add a route */
626  	case SIOCDELRT:		/* Delete a route */
627  		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
628  			return -EPERM;
629  
630  		rtnl_lock();
631  		err = rtentry_to_fib_config(net, cmd, rt, &cfg);
632  		if (err == 0) {
633  			struct fib_table *tb;
634  
635  			if (cmd == SIOCDELRT) {
636  				tb = fib_get_table(net, cfg.fc_table);
637  				if (tb)
638  					err = fib_table_delete(net, tb, &cfg,
639  							       NULL);
640  				else
641  					err = -ESRCH;
642  			} else {
643  				tb = fib_new_table(net, cfg.fc_table);
644  				if (tb)
645  					err = fib_table_insert(net, tb,
646  							       &cfg, NULL);
647  				else
648  					err = -ENOBUFS;
649  			}
650  
651  			/* allocated by rtentry_to_fib_config() */
652  			kfree(cfg.fc_mx);
653  		}
654  		rtnl_unlock();
655  		return err;
656  	}
657  	return -EINVAL;
658  }
659  
660  const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
661  	[RTA_UNSPEC]		= { .strict_start_type = RTA_DPORT + 1 },
662  	[RTA_DST]		= { .type = NLA_U32 },
663  	[RTA_SRC]		= { .type = NLA_U32 },
664  	[RTA_IIF]		= { .type = NLA_U32 },
665  	[RTA_OIF]		= { .type = NLA_U32 },
666  	[RTA_GATEWAY]		= { .type = NLA_U32 },
667  	[RTA_PRIORITY]		= { .type = NLA_U32 },
668  	[RTA_PREFSRC]		= { .type = NLA_U32 },
669  	[RTA_METRICS]		= { .type = NLA_NESTED },
670  	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
671  	[RTA_FLOW]		= { .type = NLA_U32 },
672  	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
673  	[RTA_ENCAP]		= { .type = NLA_NESTED },
674  	[RTA_UID]		= { .type = NLA_U32 },
675  	[RTA_MARK]		= { .type = NLA_U32 },
676  	[RTA_TABLE]		= { .type = NLA_U32 },
677  	[RTA_IP_PROTO]		= { .type = NLA_U8 },
678  	[RTA_SPORT]		= { .type = NLA_U16 },
679  	[RTA_DPORT]		= { .type = NLA_U16 },
680  	[RTA_NH_ID]		= { .type = NLA_U32 },
681  };
682  
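/* Parse an RTA_VIA attribute (address family plus gateway address) into the
 * gateway fields of @cfg.  Only AF_INET and, when IPv6 is enabled, AF_INET6
 * gateways are accepted.
 */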
683  int fib_gw_from_via(struct fib_config *cfg, struct nlattr *nla,
684  		    struct netlink_ext_ack *extack)
685  {
686  	struct rtvia *via;
687  	int alen;
688  
689  	if (nla_len(nla) < offsetof(struct rtvia, rtvia_addr)) {
690  		NL_SET_ERR_MSG(extack, "Invalid attribute length for RTA_VIA");
691  		return -EINVAL;
692  	}
693  
694  	via = nla_data(nla);
695  	alen = nla_len(nla) - offsetof(struct rtvia, rtvia_addr);
696  
697  	switch (via->rtvia_family) {
698  	case AF_INET:
699  		if (alen != sizeof(__be32)) {
700  			NL_SET_ERR_MSG(extack, "Invalid IPv4 address in RTA_VIA");
701  			return -EINVAL;
702  		}
703  		cfg->fc_gw_family = AF_INET;
704  		cfg->fc_gw4 = *((__be32 *)via->rtvia_addr);
705  		break;
706  	case AF_INET6:
707  #if IS_ENABLED(CONFIG_IPV6)
708  		if (alen != sizeof(struct in6_addr)) {
709  			NL_SET_ERR_MSG(extack, "Invalid IPv6 address in RTA_VIA");
710  			return -EINVAL;
711  		}
712  		cfg->fc_gw_family = AF_INET6;
713  		cfg->fc_gw6 = *((struct in6_addr *)via->rtvia_addr);
714  #else
715  		NL_SET_ERR_MSG(extack, "IPv6 support not enabled in kernel");
716  		return -EINVAL;
717  #endif
718  		break;
719  	default:
720  		NL_SET_ERR_MSG(extack, "Unsupported address family in RTA_VIA");
721  		return -EINVAL;
722  	}
723  
724  	return 0;
725  }
726  
727  static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
728  			     struct nlmsghdr *nlh, struct fib_config *cfg,
729  			     struct netlink_ext_ack *extack)
730  {
731  	bool has_gw = false, has_via = false;
732  	struct nlattr *attr;
733  	int err, remaining;
734  	struct rtmsg *rtm;
735  
736  	err = nlmsg_validate_deprecated(nlh, sizeof(*rtm), RTA_MAX,
737  					rtm_ipv4_policy, extack);
738  	if (err < 0)
739  		goto errout;
740  
741  	memset(cfg, 0, sizeof(*cfg));
742  
743  	rtm = nlmsg_data(nlh);
744  
745  	if (!inet_validate_dscp(rtm->rtm_tos)) {
746  		NL_SET_ERR_MSG(extack,
747  			       "Invalid dsfield (tos): ECN bits must be 0");
748  		err = -EINVAL;
749  		goto errout;
750  	}
751  	cfg->fc_dscp = inet_dsfield_to_dscp(rtm->rtm_tos);
752  
753  	cfg->fc_dst_len = rtm->rtm_dst_len;
754  	cfg->fc_table = rtm->rtm_table;
755  	cfg->fc_protocol = rtm->rtm_protocol;
756  	cfg->fc_scope = rtm->rtm_scope;
757  	cfg->fc_type = rtm->rtm_type;
758  	cfg->fc_flags = rtm->rtm_flags;
759  	cfg->fc_nlflags = nlh->nlmsg_flags;
760  
761  	cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
762  	cfg->fc_nlinfo.nlh = nlh;
763  	cfg->fc_nlinfo.nl_net = net;
764  
765  	if (cfg->fc_type > RTN_MAX) {
766  		NL_SET_ERR_MSG(extack, "Invalid route type");
767  		err = -EINVAL;
768  		goto errout;
769  	}
770  
771  	nlmsg_for_each_attr(attr, nlh, sizeof(struct rtmsg), remaining) {
772  		switch (nla_type(attr)) {
773  		case RTA_DST:
774  			cfg->fc_dst = nla_get_be32(attr);
775  			break;
776  		case RTA_OIF:
777  			cfg->fc_oif = nla_get_u32(attr);
778  			break;
779  		case RTA_GATEWAY:
780  			has_gw = true;
781  			cfg->fc_gw4 = nla_get_be32(attr);
782  			if (cfg->fc_gw4)
783  				cfg->fc_gw_family = AF_INET;
784  			break;
785  		case RTA_VIA:
786  			has_via = true;
787  			err = fib_gw_from_via(cfg, attr, extack);
788  			if (err)
789  				goto errout;
790  			break;
791  		case RTA_PRIORITY:
792  			cfg->fc_priority = nla_get_u32(attr);
793  			break;
794  		case RTA_PREFSRC:
795  			cfg->fc_prefsrc = nla_get_be32(attr);
796  			break;
797  		case RTA_METRICS:
798  			cfg->fc_mx = nla_data(attr);
799  			cfg->fc_mx_len = nla_len(attr);
800  			break;
801  		case RTA_MULTIPATH:
802  			err = lwtunnel_valid_encap_type_attr(nla_data(attr),
803  							     nla_len(attr),
804  							     extack);
805  			if (err < 0)
806  				goto errout;
807  			cfg->fc_mp = nla_data(attr);
808  			cfg->fc_mp_len = nla_len(attr);
809  			break;
810  		case RTA_FLOW:
811  			cfg->fc_flow = nla_get_u32(attr);
812  			break;
813  		case RTA_TABLE:
814  			cfg->fc_table = nla_get_u32(attr);
815  			break;
816  		case RTA_ENCAP:
817  			cfg->fc_encap = attr;
818  			break;
819  		case RTA_ENCAP_TYPE:
820  			cfg->fc_encap_type = nla_get_u16(attr);
821  			err = lwtunnel_valid_encap_type(cfg->fc_encap_type,
822  							extack);
823  			if (err < 0)
824  				goto errout;
825  			break;
826  		case RTA_NH_ID:
827  			cfg->fc_nh_id = nla_get_u32(attr);
828  			break;
829  		}
830  	}
831  
832  	if (cfg->fc_nh_id) {
833  		if (cfg->fc_oif || cfg->fc_gw_family ||
834  		    cfg->fc_encap || cfg->fc_mp) {
835  			NL_SET_ERR_MSG(extack,
836  				       "Nexthop specification and nexthop id are mutually exclusive");
837  			return -EINVAL;
838  		}
839  	}
840  
841  	if (has_gw && has_via) {
842  		NL_SET_ERR_MSG(extack,
843  			       "Nexthop configuration can not contain both GATEWAY and VIA");
844  		return -EINVAL;
845  	}
846  
847  	if (!cfg->fc_table)
848  		cfg->fc_table = RT_TABLE_MAIN;
849  
850  	return 0;
851  errout:
852  	return err;
853  }
854  
855  static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
856  			     struct netlink_ext_ack *extack)
857  {
858  	struct net *net = sock_net(skb->sk);
859  	struct fib_config cfg;
860  	struct fib_table *tb;
861  	int err;
862  
863  	err = rtm_to_fib_config(net, skb, nlh, &cfg, extack);
864  	if (err < 0)
865  		goto errout;
866  
867  	if (cfg.fc_nh_id && !nexthop_find_by_id(net, cfg.fc_nh_id)) {
868  		NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
869  		err = -EINVAL;
870  		goto errout;
871  	}
872  
873  	tb = fib_get_table(net, cfg.fc_table);
874  	if (!tb) {
875  		NL_SET_ERR_MSG(extack, "FIB table does not exist");
876  		err = -ESRCH;
877  		goto errout;
878  	}
879  
880  	err = fib_table_delete(net, tb, &cfg, extack);
881  errout:
882  	return err;
883  }
884  
885  static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
886  			     struct netlink_ext_ack *extack)
887  {
888  	struct net *net = sock_net(skb->sk);
889  	struct fib_config cfg;
890  	struct fib_table *tb;
891  	int err;
892  
893  	err = rtm_to_fib_config(net, skb, nlh, &cfg, extack);
894  	if (err < 0)
895  		goto errout;
896  
897  	tb = fib_new_table(net, cfg.fc_table);
898  	if (!tb) {
899  		err = -ENOBUFS;
900  		goto errout;
901  	}
902  
903  	err = fib_table_insert(net, tb, &cfg, extack);
904  	if (!err && cfg.fc_type == RTN_LOCAL)
905  		net->ipv4.fib_has_custom_local_routes = true;
906  errout:
907  	return err;
908  }
909  
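/* Strictly validate an RTM_GETROUTE dump request and fill @filter from it.
 * Only RTA_TABLE and RTA_OIF are accepted as filter attributes; unexpected
 * header fields or attributes are rejected with an extack message.
 */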
910  int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
911  			  struct fib_dump_filter *filter,
912  			  struct netlink_callback *cb)
913  {
914  	struct netlink_ext_ack *extack = cb->extack;
915  	struct nlattr *tb[RTA_MAX + 1];
916  	struct rtmsg *rtm;
917  	int err, i;
918  
919  	ASSERT_RTNL();
920  
921  	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
922  		NL_SET_ERR_MSG(extack, "Invalid header for FIB dump request");
923  		return -EINVAL;
924  	}
925  
926  	rtm = nlmsg_data(nlh);
927  	if (rtm->rtm_dst_len || rtm->rtm_src_len  || rtm->rtm_tos   ||
928  	    rtm->rtm_scope) {
929  		NL_SET_ERR_MSG(extack, "Invalid values in header for FIB dump request");
930  		return -EINVAL;
931  	}
932  
933  	if (rtm->rtm_flags & ~(RTM_F_CLONED | RTM_F_PREFIX)) {
934  		NL_SET_ERR_MSG(extack, "Invalid flags for FIB dump request");
935  		return -EINVAL;
936  	}
937  	if (rtm->rtm_flags & RTM_F_CLONED)
938  		filter->dump_routes = false;
939  	else
940  		filter->dump_exceptions = false;
941  
942  	filter->flags    = rtm->rtm_flags;
943  	filter->protocol = rtm->rtm_protocol;
944  	filter->rt_type  = rtm->rtm_type;
945  	filter->table_id = rtm->rtm_table;
946  
947  	err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
948  					    rtm_ipv4_policy, extack);
949  	if (err < 0)
950  		return err;
951  
952  	for (i = 0; i <= RTA_MAX; ++i) {
953  		int ifindex;
954  
955  		if (!tb[i])
956  			continue;
957  
958  		switch (i) {
959  		case RTA_TABLE:
960  			filter->table_id = nla_get_u32(tb[i]);
961  			break;
962  		case RTA_OIF:
963  			ifindex = nla_get_u32(tb[i]);
964  			filter->dev = __dev_get_by_index(net, ifindex);
965  			if (!filter->dev)
966  				return -ENODEV;
967  			break;
968  		default:
969  			NL_SET_ERR_MSG(extack, "Unsupported attribute in dump request");
970  			return -EINVAL;
971  		}
972  	}
973  
974  	if (filter->flags || filter->protocol || filter->rt_type ||
975  	    filter->table_id || filter->dev) {
976  		filter->filter_set = 1;
977  		cb->answer_flags = NLM_F_DUMP_FILTERED;
978  	}
979  
980  	return 0;
981  }
982  EXPORT_SYMBOL_GPL(ip_valid_fib_dump_req);
983  
984  static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
985  {
986  	struct fib_dump_filter filter = { .dump_routes = true,
987  					  .dump_exceptions = true };
988  	const struct nlmsghdr *nlh = cb->nlh;
989  	struct net *net = sock_net(skb->sk);
990  	unsigned int h, s_h;
991  	unsigned int e = 0, s_e;
992  	struct fib_table *tb;
993  	struct hlist_head *head;
994  	int dumped = 0, err;
995  
996  	if (cb->strict_check) {
997  		err = ip_valid_fib_dump_req(net, nlh, &filter, cb);
998  		if (err < 0)
999  			return err;
1000  	} else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) {
1001  		struct rtmsg *rtm = nlmsg_data(nlh);
1002  
1003  		filter.flags = rtm->rtm_flags & (RTM_F_PREFIX | RTM_F_CLONED);
1004  	}
1005  
1006  	/* ipv4 does not use prefix flag */
1007  	if (filter.flags & RTM_F_PREFIX)
1008  		return skb->len;
1009  
1010  	if (filter.table_id) {
1011  		tb = fib_get_table(net, filter.table_id);
1012  		if (!tb) {
1013  			if (rtnl_msg_family(cb->nlh) != PF_INET)
1014  				return skb->len;
1015  
1016  			NL_SET_ERR_MSG(cb->extack, "ipv4: FIB table does not exist");
1017  			return -ENOENT;
1018  		}
1019  
1020  		rcu_read_lock();
1021  		err = fib_table_dump(tb, skb, cb, &filter);
1022  		rcu_read_unlock();
1023  		return skb->len ? : err;
1024  	}
1025  
1026  	s_h = cb->args[0];
1027  	s_e = cb->args[1];
1028  
1029  	rcu_read_lock();
1030  
1031  	for (h = s_h; h < FIB_TABLE_HASHSZ; h++, s_e = 0) {
1032  		e = 0;
1033  		head = &net->ipv4.fib_table_hash[h];
1034  		hlist_for_each_entry_rcu(tb, head, tb_hlist) {
1035  			if (e < s_e)
1036  				goto next;
1037  			if (dumped)
1038  				memset(&cb->args[2], 0, sizeof(cb->args) -
1039  						 2 * sizeof(cb->args[0]));
1040  			err = fib_table_dump(tb, skb, cb, &filter);
1041  			if (err < 0) {
1042  				if (likely(skb->len))
1043  					goto out;
1044  
1045  				goto out_err;
1046  			}
1047  			dumped = 1;
1048  next:
1049  			e++;
1050  		}
1051  	}
1052  out:
1053  	err = skb->len;
1054  out_err:
1055  	rcu_read_unlock();
1056  
1057  	cb->args[1] = e;
1058  	cb->args[0] = h;
1059  
1060  	return err;
1061  }
1062  
1063  /* Prepare and feed an intra-kernel routing request.
1064   * Ideally this would be a netlink message, but netlink
1065   * may not be configured, so we feed the request directly
1066   * to the fib engine. This is legal because all such events
1067   * occur only while netlink (rtnl) is already locked.
1068   */
1069  static void fib_magic(int cmd, int type, __be32 dst, int dst_len,
1070  		      struct in_ifaddr *ifa, u32 rt_priority)
1071  {
1072  	struct net *net = dev_net(ifa->ifa_dev->dev);
1073  	u32 tb_id = l3mdev_fib_table(ifa->ifa_dev->dev);
1074  	struct fib_table *tb;
1075  	struct fib_config cfg = {
1076  		.fc_protocol = RTPROT_KERNEL,
1077  		.fc_type = type,
1078  		.fc_dst = dst,
1079  		.fc_dst_len = dst_len,
1080  		.fc_priority = rt_priority,
1081  		.fc_prefsrc = ifa->ifa_local,
1082  		.fc_oif = ifa->ifa_dev->dev->ifindex,
1083  		.fc_nlflags = NLM_F_CREATE | NLM_F_APPEND,
1084  		.fc_nlinfo = {
1085  			.nl_net = net,
1086  		},
1087  	};
1088  
1089  	if (!tb_id)
1090  		tb_id = (type == RTN_UNICAST) ? RT_TABLE_MAIN : RT_TABLE_LOCAL;
1091  
1092  	tb = fib_new_table(net, tb_id);
1093  	if (!tb)
1094  		return;
1095  
1096  	cfg.fc_table = tb->tb_id;
1097  
1098  	if (type != RTN_LOCAL)
1099  		cfg.fc_scope = RT_SCOPE_LINK;
1100  	else
1101  		cfg.fc_scope = RT_SCOPE_HOST;
1102  
1103  	if (cmd == RTM_NEWROUTE)
1104  		fib_table_insert(net, tb, &cfg, NULL);
1105  	else
1106  		fib_table_delete(net, tb, &cfg, NULL);
1107  }
1108  
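/* Install the routes implied by a newly added address: the local /32 route,
 * broadcast routes where they make sense, and (unless IFA_F_NOPREFIXROUTE
 * is set) the prefix route covering the subnet.
 */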
1109  void fib_add_ifaddr(struct in_ifaddr *ifa)
1110  {
1111  	struct in_device *in_dev = ifa->ifa_dev;
1112  	struct net_device *dev = in_dev->dev;
1113  	struct in_ifaddr *prim = ifa;
1114  	__be32 mask = ifa->ifa_mask;
1115  	__be32 addr = ifa->ifa_local;
1116  	__be32 prefix = ifa->ifa_address & mask;
1117  
1118  	if (ifa->ifa_flags & IFA_F_SECONDARY) {
1119  		prim = inet_ifa_byprefix(in_dev, prefix, mask);
1120  		if (!prim) {
1121  			pr_warn("%s: bug: prim == NULL\n", __func__);
1122  			return;
1123  		}
1124  	}
1125  
1126  	fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim, 0);
1127  
1128  	if (!(dev->flags & IFF_UP))
1129  		return;
1130  
1131  	/* Add broadcast address, if it is explicitly assigned. */
1132  	if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF)) {
1133  		fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32,
1134  			  prim, 0);
1135  		arp_invalidate(dev, ifa->ifa_broadcast, false);
1136  	}
1137  
1138  	if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) &&
1139  	    (prefix != addr || ifa->ifa_prefixlen < 32)) {
1140  		if (!(ifa->ifa_flags & IFA_F_NOPREFIXROUTE))
1141  			fib_magic(RTM_NEWROUTE,
1142  				  dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
1143  				  prefix, ifa->ifa_prefixlen, prim,
1144  				  ifa->ifa_rt_priority);
1145  
1146  		/* Add the network broadcast address, when it makes sense */
1147  		if (ifa->ifa_prefixlen < 31) {
1148  			fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix | ~mask,
1149  				  32, prim, 0);
1150  			arp_invalidate(dev, prefix | ~mask, false);
1151  		}
1152  	}
1153  }
1154  
1155  void fib_modify_prefix_metric(struct in_ifaddr *ifa, u32 new_metric)
1156  {
1157  	__be32 prefix = ifa->ifa_address & ifa->ifa_mask;
1158  	struct in_device *in_dev = ifa->ifa_dev;
1159  	struct net_device *dev = in_dev->dev;
1160  
1161  	if (!(dev->flags & IFF_UP) ||
1162  	    ifa->ifa_flags & (IFA_F_SECONDARY | IFA_F_NOPREFIXROUTE) ||
1163  	    ipv4_is_zeronet(prefix) ||
1164  	    (prefix == ifa->ifa_local && ifa->ifa_prefixlen == 32))
1165  		return;
1166  
1167  	/* add the new */
1168  	fib_magic(RTM_NEWROUTE,
1169  		  dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
1170  		  prefix, ifa->ifa_prefixlen, ifa, new_metric);
1171  
1172  	/* delete the old */
1173  	fib_magic(RTM_DELROUTE,
1174  		  dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
1175  		  prefix, ifa->ifa_prefixlen, ifa, ifa->ifa_rt_priority);
1176  }
1177  
1178  /* Delete primary or secondary address.
1179   * Optionally, on secondary address promotion, consider the addresses
1180   * from the subnet of iprim as deleted, even if they are still in the
1181   * device list. In this case the secondary ifa can be in the device list.
1182   */
1183  void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
1184  {
1185  	struct in_device *in_dev = ifa->ifa_dev;
1186  	struct net_device *dev = in_dev->dev;
1187  	struct in_ifaddr *ifa1;
1188  	struct in_ifaddr *prim = ifa, *prim1 = NULL;
1189  	__be32 brd = ifa->ifa_address | ~ifa->ifa_mask;
1190  	__be32 any = ifa->ifa_address & ifa->ifa_mask;
1191  #define LOCAL_OK	1
1192  #define BRD_OK		2
1193  #define BRD0_OK		4
1194  #define BRD1_OK		8
1195  	unsigned int ok = 0;
1196  	int subnet = 0;		/* Primary network */
1197  	int gone = 1;		/* Address is missing */
1198  	int same_prefsrc = 0;	/* Another primary with same IP */
1199  
1200  	if (ifa->ifa_flags & IFA_F_SECONDARY) {
1201  		prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask);
1202  		if (!prim) {
1203  			/* if the device has been deleted, we don't perform
1204  			 * address promotion
1205  			 */
1206  			if (!in_dev->dead)
1207  				pr_warn("%s: bug: prim == NULL\n", __func__);
1208  			return;
1209  		}
1210  		if (iprim && iprim != prim) {
1211  			pr_warn("%s: bug: iprim != prim\n", __func__);
1212  			return;
1213  		}
1214  	} else if (!ipv4_is_zeronet(any) &&
1215  		   (any != ifa->ifa_local || ifa->ifa_prefixlen < 32)) {
1216  		if (!(ifa->ifa_flags & IFA_F_NOPREFIXROUTE))
1217  			fib_magic(RTM_DELROUTE,
1218  				  dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
1219  				  any, ifa->ifa_prefixlen, prim, 0);
1220  		subnet = 1;
1221  	}
1222  
1223  	if (in_dev->dead)
1224  		goto no_promotions;
1225  
1226  	/* Deletion is more complicated than add.
1227  	 * We must take care not to delete too much. :-)
1228  	 *
1229  	 * Scan address list to be sure that addresses are really gone.
1230  	 */
1231  	rcu_read_lock();
1232  	in_dev_for_each_ifa_rcu(ifa1, in_dev) {
1233  		if (ifa1 == ifa) {
1234  			/* promotion, keep the IP */
1235  			gone = 0;
1236  			continue;
1237  		}
1238  		/* Ignore IFAs from our subnet */
1239  		if (iprim && ifa1->ifa_mask == iprim->ifa_mask &&
1240  		    inet_ifa_match(ifa1->ifa_address, iprim))
1241  			continue;
1242  
1243  		/* Ignore ifa1 if it uses different primary IP (prefsrc) */
1244  		if (ifa1->ifa_flags & IFA_F_SECONDARY) {
1245  			/* Another address from our subnet? */
1246  			if (ifa1->ifa_mask == prim->ifa_mask &&
1247  			    inet_ifa_match(ifa1->ifa_address, prim))
1248  				prim1 = prim;
1249  			else {
1250  				/* We reached the secondaries, so
1251  				 * same_prefsrc should be determined.
1252  				 */
1253  				if (!same_prefsrc)
1254  					continue;
1255  				/* Search new prim1 if ifa1 is not
1256  				 * using the current prim1
1257  				 */
1258  				if (!prim1 ||
1259  				    ifa1->ifa_mask != prim1->ifa_mask ||
1260  				    !inet_ifa_match(ifa1->ifa_address, prim1))
1261  					prim1 = inet_ifa_byprefix(in_dev,
1262  							ifa1->ifa_address,
1263  							ifa1->ifa_mask);
1264  				if (!prim1)
1265  					continue;
1266  				if (prim1->ifa_local != prim->ifa_local)
1267  					continue;
1268  			}
1269  		} else {
1270  			if (prim->ifa_local != ifa1->ifa_local)
1271  				continue;
1272  			prim1 = ifa1;
1273  			if (prim != prim1)
1274  				same_prefsrc = 1;
1275  		}
1276  		if (ifa->ifa_local == ifa1->ifa_local)
1277  			ok |= LOCAL_OK;
1278  		if (ifa->ifa_broadcast == ifa1->ifa_broadcast)
1279  			ok |= BRD_OK;
1280  		if (brd == ifa1->ifa_broadcast)
1281  			ok |= BRD1_OK;
1282  		if (any == ifa1->ifa_broadcast)
1283  			ok |= BRD0_OK;
1284  		/* primary has network specific broadcasts */
1285  		if (prim1 == ifa1 && ifa1->ifa_prefixlen < 31) {
1286  			__be32 brd1 = ifa1->ifa_address | ~ifa1->ifa_mask;
1287  			__be32 any1 = ifa1->ifa_address & ifa1->ifa_mask;
1288  
1289  			if (!ipv4_is_zeronet(any1)) {
1290  				if (ifa->ifa_broadcast == brd1 ||
1291  				    ifa->ifa_broadcast == any1)
1292  					ok |= BRD_OK;
1293  				if (brd == brd1 || brd == any1)
1294  					ok |= BRD1_OK;
1295  				if (any == brd1 || any == any1)
1296  					ok |= BRD0_OK;
1297  			}
1298  		}
1299  	}
1300  	rcu_read_unlock();
1301  
1302  no_promotions:
1303  	if (!(ok & BRD_OK))
1304  		fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32,
1305  			  prim, 0);
1306  	if (subnet && ifa->ifa_prefixlen < 31) {
1307  		if (!(ok & BRD1_OK))
1308  			fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32,
1309  				  prim, 0);
1310  		if (!(ok & BRD0_OK))
1311  			fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32,
1312  				  prim, 0);
1313  	}
1314  	if (!(ok & LOCAL_OK)) {
1315  		unsigned int addr_type;
1316  
1317  		fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim, 0);
1318  
1319  		/* Check that this local address finally disappeared. */
1320  		addr_type = inet_addr_type_dev_table(dev_net(dev), dev,
1321  						     ifa->ifa_local);
1322  		if (gone && addr_type != RTN_LOCAL) {
1323  			/* And last, but not least,
1324  			 * we must flush stray FIB entries.
1325  			 *
1326  			 * First of all, we scan the fib_info list searching
1327  			 * for stray nexthop entries, then trigger fib_flush().
1328  			 */
1329  			if (fib_sync_down_addr(dev, ifa->ifa_local))
1330  				fib_flush(dev_net(dev));
1331  		}
1332  	}
1333  #undef LOCAL_OK
1334  #undef BRD_OK
1335  #undef BRD0_OK
1336  #undef BRD1_OK
1337  }
1338  
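/* Service a NETLINK_FIB_LOOKUP request: look the destination up in the
 * requested table and report the result (or an error) back in the same
 * struct fib_result_nl the caller sent.
 */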
1339  static void nl_fib_lookup(struct net *net, struct fib_result_nl *frn)
1340  {
1341  
1342  	struct fib_result       res;
1343  	struct flowi4           fl4 = {
1344  		.flowi4_mark = frn->fl_mark,
1345  		.daddr = frn->fl_addr,
1346  		.flowi4_tos = frn->fl_tos & IPTOS_RT_MASK,
1347  		.flowi4_scope = frn->fl_scope,
1348  	};
1349  	struct fib_table *tb;
1350  
1351  	rcu_read_lock();
1352  
1353  	tb = fib_get_table(net, frn->tb_id_in);
1354  
1355  	frn->err = -ENOENT;
1356  	if (tb) {
1357  		local_bh_disable();
1358  
1359  		frn->tb_id = tb->tb_id;
1360  		frn->err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
1361  
1362  		if (!frn->err) {
1363  			frn->prefixlen = res.prefixlen;
1364  			frn->nh_sel = res.nh_sel;
1365  			frn->type = res.type;
1366  			frn->scope = res.scope;
1367  		}
1368  		local_bh_enable();
1369  	}
1370  
1371  	rcu_read_unlock();
1372  }
1373  
1374  static void nl_fib_input(struct sk_buff *skb)
1375  {
1376  	struct net *net;
1377  	struct fib_result_nl *frn;
1378  	struct nlmsghdr *nlh;
1379  	u32 portid;
1380  
1381  	net = sock_net(skb->sk);
1382  	nlh = nlmsg_hdr(skb);
1383  	if (skb->len < nlmsg_total_size(sizeof(*frn)) ||
1384  	    skb->len < nlh->nlmsg_len ||
1385  	    nlmsg_len(nlh) < sizeof(*frn))
1386  		return;
1387  
1388  	skb = netlink_skb_clone(skb, GFP_KERNEL);
1389  	if (!skb)
1390  		return;
1391  	nlh = nlmsg_hdr(skb);
1392  
1393  	frn = nlmsg_data(nlh);
1394  	nl_fib_lookup(net, frn);
1395  
1396  	portid = NETLINK_CB(skb).portid;      /* netlink portid */
1397  	NETLINK_CB(skb).portid = 0;        /* from kernel */
1398  	NETLINK_CB(skb).dst_group = 0;  /* unicast */
1399  	nlmsg_unicast(net->ipv4.fibnl, skb, portid);
1400  }
1401  
1402  static int __net_init nl_fib_lookup_init(struct net *net)
1403  {
1404  	struct sock *sk;
1405  	struct netlink_kernel_cfg cfg = {
1406  		.input	= nl_fib_input,
1407  	};
1408  
1409  	sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, &cfg);
1410  	if (!sk)
1411  		return -EAFNOSUPPORT;
1412  	net->ipv4.fibnl = sk;
1413  	return 0;
1414  }
1415  
1416  static void nl_fib_lookup_exit(struct net *net)
1417  {
1418  	netlink_kernel_release(net->ipv4.fibnl);
1419  	net->ipv4.fibnl = NULL;
1420  }
1421  
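/* Stop using @dev for IPv4 routing: mark or remove nexthops through it,
 * flush affected routes or the route cache, and tear down its ARP state.
 */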
1422  static void fib_disable_ip(struct net_device *dev, unsigned long event,
1423  			   bool force)
1424  {
1425  	if (fib_sync_down_dev(dev, event, force))
1426  		fib_flush(dev_net(dev));
1427  	else
1428  		rt_cache_flush(dev_net(dev));
1429  	arp_ifdown(dev);
1430  }
1431  
1432  static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr)
1433  {
1434  	struct in_ifaddr *ifa = ptr;
1435  	struct net_device *dev = ifa->ifa_dev->dev;
1436  	struct net *net = dev_net(dev);
1437  
1438  	switch (event) {
1439  	case NETDEV_UP:
1440  		fib_add_ifaddr(ifa);
1441  #ifdef CONFIG_IP_ROUTE_MULTIPATH
1442  		fib_sync_up(dev, RTNH_F_DEAD);
1443  #endif
1444  		atomic_inc(&net->ipv4.dev_addr_genid);
1445  		rt_cache_flush(dev_net(dev));
1446  		break;
1447  	case NETDEV_DOWN:
1448  		fib_del_ifaddr(ifa, NULL);
1449  		atomic_inc(&net->ipv4.dev_addr_genid);
1450  		if (!ifa->ifa_dev->ifa_list) {
1451  			/* Last address was deleted from this interface.
1452  			 * Disable IP.
1453  			 */
1454  			fib_disable_ip(dev, event, true);
1455  		} else {
1456  			rt_cache_flush(dev_net(dev));
1457  		}
1458  		break;
1459  	}
1460  	return NOTIFY_DONE;
1461  }
1462  
1463  static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
1464  {
1465  	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
1466  	struct netdev_notifier_changeupper_info *upper_info = ptr;
1467  	struct netdev_notifier_info_ext *info_ext = ptr;
1468  	struct in_device *in_dev;
1469  	struct net *net = dev_net(dev);
1470  	struct in_ifaddr *ifa;
1471  	unsigned int flags;
1472  
1473  	if (event == NETDEV_UNREGISTER) {
1474  		fib_disable_ip(dev, event, true);
1475  		rt_flush_dev(dev);
1476  		return NOTIFY_DONE;
1477  	}
1478  
1479  	in_dev = __in_dev_get_rtnl(dev);
1480  	if (!in_dev)
1481  		return NOTIFY_DONE;
1482  
1483  	switch (event) {
1484  	case NETDEV_UP:
1485  		in_dev_for_each_ifa_rtnl(ifa, in_dev) {
1486  			fib_add_ifaddr(ifa);
1487  		}
1488  #ifdef CONFIG_IP_ROUTE_MULTIPATH
1489  		fib_sync_up(dev, RTNH_F_DEAD);
1490  #endif
1491  		atomic_inc(&net->ipv4.dev_addr_genid);
1492  		rt_cache_flush(net);
1493  		break;
1494  	case NETDEV_DOWN:
1495  		fib_disable_ip(dev, event, false);
1496  		break;
1497  	case NETDEV_CHANGE:
1498  		flags = dev_get_flags(dev);
1499  		if (flags & (IFF_RUNNING | IFF_LOWER_UP))
1500  			fib_sync_up(dev, RTNH_F_LINKDOWN);
1501  		else
1502  			fib_sync_down_dev(dev, event, false);
1503  		rt_cache_flush(net);
1504  		break;
1505  	case NETDEV_CHANGEMTU:
1506  		fib_sync_mtu(dev, info_ext->ext.mtu);
1507  		rt_cache_flush(net);
1508  		break;
1509  	case NETDEV_CHANGEUPPER:
1510  		upper_info = ptr;
1511  		/* flush all routes if dev is linked to or unlinked from
1512  		 * an L3 master device (e.g., VRF)
1513  		 */
1514  		if (upper_info->upper_dev &&
1515  		    netif_is_l3_master(upper_info->upper_dev))
1516  			fib_disable_ip(dev, NETDEV_DOWN, true);
1517  		break;
1518  	}
1519  	return NOTIFY_DONE;
1520  }
1521  
1522  static struct notifier_block fib_inetaddr_notifier = {
1523  	.notifier_call = fib_inetaddr_event,
1524  };
1525  
1526  static struct notifier_block fib_netdev_notifier = {
1527  	.notifier_call = fib_netdev_event,
1528  };
1529  
1530  static int __net_init ip_fib_net_init(struct net *net)
1531  {
1532  	int err;
1533  	size_t size = sizeof(struct hlist_head) * FIB_TABLE_HASHSZ;
1534  
1535  	err = fib4_notifier_init(net);
1536  	if (err)
1537  		return err;
1538  
1539  #ifdef CONFIG_IP_ROUTE_MULTIPATH
1540  	/* Default to 3-tuple */
1541  	net->ipv4.sysctl_fib_multipath_hash_fields =
1542  		FIB_MULTIPATH_HASH_FIELD_DEFAULT_MASK;
1543  #endif
1544  
1545  	/* Avoid false sharing: use at least a full cache line */
1546  	size = max_t(size_t, size, L1_CACHE_BYTES);
1547  
1548  	net->ipv4.fib_table_hash = kzalloc(size, GFP_KERNEL);
1549  	if (!net->ipv4.fib_table_hash) {
1550  		err = -ENOMEM;
1551  		goto err_table_hash_alloc;
1552  	}
1553  
1554  	err = fib4_rules_init(net);
1555  	if (err < 0)
1556  		goto err_rules_init;
1557  	return 0;
1558  
1559  err_rules_init:
1560  	kfree(net->ipv4.fib_table_hash);
1561  err_table_hash_alloc:
1562  	fib4_notifier_exit(net);
1563  	return err;
1564  }
1565  
1566  static void ip_fib_net_exit(struct net *net)
1567  {
1568  	int i;
1569  
1570  	ASSERT_RTNL();
1571  #ifdef CONFIG_IP_MULTIPLE_TABLES
1572  	RCU_INIT_POINTER(net->ipv4.fib_main, NULL);
1573  	RCU_INIT_POINTER(net->ipv4.fib_default, NULL);
1574  #endif
1575  	/* Destroy the tables in reverse order to guarantee that the
1576  	 * local table, ID 255, is destroyed before the main table, ID
1577  	 * 254. This is necessary as the local table may contain
1578  	 * references to data contained in the main table.
1579  	 */
1580  	for (i = FIB_TABLE_HASHSZ - 1; i >= 0; i--) {
1581  		struct hlist_head *head = &net->ipv4.fib_table_hash[i];
1582  		struct hlist_node *tmp;
1583  		struct fib_table *tb;
1584  
1585  		hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) {
1586  			hlist_del(&tb->tb_hlist);
1587  			fib_table_flush(net, tb, true);
1588  			fib_free_table(tb);
1589  		}
1590  	}
1591  
1592  #ifdef CONFIG_IP_MULTIPLE_TABLES
1593  	fib4_rules_exit(net);
1594  #endif
1595  
1596  	kfree(net->ipv4.fib_table_hash);
1597  	fib4_notifier_exit(net);
1598  }
1599  
1600  static int __net_init fib_net_init(struct net *net)
1601  {
1602  	int error;
1603  
1604  #ifdef CONFIG_IP_ROUTE_CLASSID
1605  	atomic_set(&net->ipv4.fib_num_tclassid_users, 0);
1606  #endif
1607  	error = ip_fib_net_init(net);
1608  	if (error < 0)
1609  		goto out;
1610  	error = nl_fib_lookup_init(net);
1611  	if (error < 0)
1612  		goto out_nlfl;
1613  	error = fib_proc_init(net);
1614  	if (error < 0)
1615  		goto out_proc;
1616  out:
1617  	return error;
1618  
1619  out_proc:
1620  	nl_fib_lookup_exit(net);
1621  out_nlfl:
1622  	rtnl_lock();
1623  	ip_fib_net_exit(net);
1624  	rtnl_unlock();
1625  	goto out;
1626  }
1627  
1628  static void __net_exit fib_net_exit(struct net *net)
1629  {
1630  	fib_proc_exit(net);
1631  	nl_fib_lookup_exit(net);
1632  }
1633  
1634  static void __net_exit fib_net_exit_batch(struct list_head *net_list)
1635  {
1636  	struct net *net;
1637  
1638  	rtnl_lock();
1639  	list_for_each_entry(net, net_list, exit_list)
1640  		ip_fib_net_exit(net);
1641  
1642  	rtnl_unlock();
1643  }
1644  
1645  static struct pernet_operations fib_net_ops = {
1646  	.init = fib_net_init,
1647  	.exit = fib_net_exit,
1648  	.exit_batch = fib_net_exit_batch,
1649  };
1650  
1651  void __init ip_fib_init(void)
1652  {
1653  	fib_trie_init();
1654  
1655  	register_pernet_subsys(&fib_net_ops);
1656  
1657  	register_netdevice_notifier(&fib_netdev_notifier);
1658  	register_inetaddr_notifier(&fib_inetaddr_notifier);
1659  
1660  	rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL, 0);
1661  	rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, 0);
1662  	rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, 0);
1663  }
1664