xref: /openbmc/linux/net/core/dev.c (revision 36db6e8484ed455bbb320d89a119378897ae991c)
1  // SPDX-License-Identifier: GPL-2.0-or-later
2  /*
3   *      NET3    Protocol independent device support routines.
4   *
5   *	Derived from the non IP parts of dev.c 1.0.19
6   *              Authors:	Ross Biro
7   *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
8   *				Mark Evans, <evansmp@uhura.aston.ac.uk>
9   *
10   *	Additional Authors:
11   *		Florian la Roche <rzsfl@rz.uni-sb.de>
12   *		Alan Cox <gw4pts@gw4pts.ampr.org>
13   *		David Hinds <dahinds@users.sourceforge.net>
14   *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
15   *		Adam Sulmicki <adam@cfar.umd.edu>
16   *              Pekka Riikonen <priikone@poesidon.pspt.fi>
17   *
18   *	Changes:
19   *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
20   *                                      to 2 if register_netdev gets called
21   *                                      before net_dev_init & also removed a
22   *                                      few lines of code in the process.
23   *		Alan Cox	:	device private ioctl copies fields back.
24   *		Alan Cox	:	Transmit queue code does relevant
25   *					stunts to keep the queue safe.
26   *		Alan Cox	:	Fixed double lock.
27   *		Alan Cox	:	Fixed promisc NULL pointer trap
28   *		????????	:	Support the full private ioctl range
29   *		Alan Cox	:	Moved ioctl permission check into
30   *					drivers
31   *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
32   *		Alan Cox	:	100 backlog just doesn't cut it when
33   *					you start doing multicast video 8)
34   *		Alan Cox	:	Rewrote net_bh and list manager.
35   *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
36   *		Alan Cox	:	Took out transmit every packet pass
37   *					Saved a few bytes in the ioctl handler
38   *		Alan Cox	:	Network driver sets packet type before
39   *					calling netif_rx. Saves a function
40   *					call a packet.
41   *		Alan Cox	:	Hashed net_bh()
42   *		Richard Kooijman:	Timestamp fixes.
43   *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
44   *		Alan Cox	:	Device lock protection.
45   *              Alan Cox        :       Fixed nasty side effect of device close
46   *					changes.
47   *		Rudi Cilibrasi	:	Pass the right thing to
48   *					set_mac_address()
49   *		Dave Miller	:	32bit quantity for the device lock to
50   *					make it work out on a Sparc.
51   *		Bjorn Ekwall	:	Added KERNELD hack.
52   *		Alan Cox	:	Cleaned up the backlog initialise.
53   *		Craig Metz	:	SIOCGIFCONF fix if space for under
54   *					1 device.
55   *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
56   *					is no device open function.
57   *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
58   *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
59   *		Cyrus Durgin	:	Cleaned for KMOD
60   *		Adam Sulmicki   :	Bug Fix : Network Device Unload
61   *					A network device unload needs to purge
62   *					the backlog queue.
63   *	Paul Rusty Russell	:	SIOCSIFNAME
64   *              Pekka Riikonen  :	Netdev boot-time settings code
65   *              Andrew Morton   :       Make unregister_netdevice wait
66   *                                      indefinitely on dev->refcnt
67   *              J Hadi Salim    :       - Backlog queue sampling
68   *				        - netif_rx() feedback
69   */
70  
71  #include <linux/uaccess.h>
72  #include <linux/bitmap.h>
73  #include <linux/capability.h>
74  #include <linux/cpu.h>
75  #include <linux/types.h>
76  #include <linux/kernel.h>
77  #include <linux/hash.h>
78  #include <linux/slab.h>
79  #include <linux/sched.h>
80  #include <linux/sched/mm.h>
81  #include <linux/mutex.h>
82  #include <linux/rwsem.h>
83  #include <linux/string.h>
84  #include <linux/mm.h>
85  #include <linux/socket.h>
86  #include <linux/sockios.h>
87  #include <linux/errno.h>
88  #include <linux/interrupt.h>
89  #include <linux/if_ether.h>
90  #include <linux/netdevice.h>
91  #include <linux/etherdevice.h>
92  #include <linux/ethtool.h>
93  #include <linux/skbuff.h>
94  #include <linux/kthread.h>
95  #include <linux/bpf.h>
96  #include <linux/bpf_trace.h>
97  #include <net/net_namespace.h>
98  #include <net/sock.h>
99  #include <net/busy_poll.h>
100  #include <linux/rtnetlink.h>
101  #include <linux/stat.h>
102  #include <net/dsa.h>
103  #include <net/dst.h>
104  #include <net/dst_metadata.h>
105  #include <net/gro.h>
106  #include <net/pkt_sched.h>
107  #include <net/pkt_cls.h>
108  #include <net/checksum.h>
109  #include <net/xfrm.h>
110  #include <net/tcx.h>
111  #include <linux/highmem.h>
112  #include <linux/init.h>
113  #include <linux/module.h>
114  #include <linux/netpoll.h>
115  #include <linux/rcupdate.h>
116  #include <linux/delay.h>
117  #include <net/iw_handler.h>
118  #include <asm/current.h>
119  #include <linux/audit.h>
120  #include <linux/dmaengine.h>
121  #include <linux/err.h>
122  #include <linux/ctype.h>
123  #include <linux/if_arp.h>
124  #include <linux/if_vlan.h>
125  #include <linux/ip.h>
126  #include <net/ip.h>
127  #include <net/mpls.h>
128  #include <linux/ipv6.h>
129  #include <linux/in.h>
130  #include <linux/jhash.h>
131  #include <linux/random.h>
132  #include <trace/events/napi.h>
133  #include <trace/events/net.h>
134  #include <trace/events/skb.h>
135  #include <trace/events/qdisc.h>
136  #include <trace/events/xdp.h>
137  #include <linux/inetdevice.h>
138  #include <linux/cpu_rmap.h>
139  #include <linux/static_key.h>
140  #include <linux/hashtable.h>
141  #include <linux/vmalloc.h>
142  #include <linux/if_macvlan.h>
143  #include <linux/errqueue.h>
144  #include <linux/hrtimer.h>
145  #include <linux/netfilter_netdev.h>
146  #include <linux/crash_dump.h>
147  #include <linux/sctp.h>
148  #include <net/udp_tunnel.h>
149  #include <linux/net_namespace.h>
150  #include <linux/indirect_call_wrapper.h>
151  #include <net/devlink.h>
152  #include <linux/pm_runtime.h>
153  #include <linux/prandom.h>
154  #include <linux/once_lite.h>
155  #include <net/netdev_rx_queue.h>
156  
157  #include "dev.h"
158  #include "net-sysfs.h"
159  
160  static DEFINE_SPINLOCK(ptype_lock);
161  struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
162  struct list_head ptype_all __read_mostly;	/* Taps */
163  
164  static int netif_rx_internal(struct sk_buff *skb);
165  static int call_netdevice_notifiers_extack(unsigned long val,
166  					   struct net_device *dev,
167  					   struct netlink_ext_ack *extack);
168  static struct napi_struct *napi_by_id(unsigned int napi_id);
169  
170  /*
171   * The @dev_base_head list is protected by @dev_base_lock and the rtnl
172   * semaphore.
173   *
174   * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
175   *
176   * Writers must hold the rtnl semaphore while they loop through the
177   * dev_base_head list, and hold dev_base_lock for writing when they do the
178   * actual updates.  This allows pure readers to access the list even
179   * while a writer is preparing to update it.
180   *
181   * To put it another way, dev_base_lock is held for writing only to
182   * protect against pure readers; the rtnl semaphore provides the
183   * protection against other writers.
184   *
185   * See, for example usages, register_netdevice() and
186   * unregister_netdevice(), which must be called with the rtnl
187   * semaphore held.
188   */
189  DEFINE_RWLOCK(dev_base_lock);
190  EXPORT_SYMBOL(dev_base_lock);
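
/* Illustrative sketch (not in the original file): per the locking rules
 * documented above, a pure reader can walk the device list under RCU alone.
 * Here @net stands for any valid struct net pointer (e.g. &init_net).
 *
 *	struct net_device *dev;
 *
 *	rcu_read_lock();
 *	for_each_netdev_rcu(net, dev)
 *		pr_info("%s\n", dev->name);
 *	rcu_read_unlock();
 *
 * Writers such as register_netdevice() additionally hold the RTNL semaphore
 * and take dev_base_lock for writing around the actual list updates.
 */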
191  
192  static DEFINE_MUTEX(ifalias_mutex);
193  
194  /* protects napi_hash addition/deletion and napi_gen_id */
195  static DEFINE_SPINLOCK(napi_hash_lock);
196  
197  static unsigned int napi_gen_id = NR_CPUS;
198  static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
199  
200  static DECLARE_RWSEM(devnet_rename_sem);
201  
202  static inline void dev_base_seq_inc(struct net *net)
203  {
204  	while (++net->dev_base_seq == 0)
205  		;
206  }
207  
208  static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
209  {
210  	unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));
211  
212  	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
213  }
214  
215  static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
216  {
217  	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
218  }
219  
220  static inline void rps_lock_irqsave(struct softnet_data *sd,
221  				    unsigned long *flags)
222  {
223  	if (IS_ENABLED(CONFIG_RPS))
224  		spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags);
225  	else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
226  		local_irq_save(*flags);
227  }
228  
229  static inline void rps_lock_irq_disable(struct softnet_data *sd)
230  {
231  	if (IS_ENABLED(CONFIG_RPS))
232  		spin_lock_irq(&sd->input_pkt_queue.lock);
233  	else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
234  		local_irq_disable();
235  }
236  
237  static inline void rps_unlock_irq_restore(struct softnet_data *sd,
238  					  unsigned long *flags)
239  {
240  	if (IS_ENABLED(CONFIG_RPS))
241  		spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags);
242  	else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
243  		local_irq_restore(*flags);
244  }
245  
246  static inline void rps_unlock_irq_enable(struct softnet_data *sd)
247  {
248  	if (IS_ENABLED(CONFIG_RPS))
249  		spin_unlock_irq(&sd->input_pkt_queue.lock);
250  	else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
251  		local_irq_enable();
252  }
253  
254  static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev,
255  						       const char *name)
256  {
257  	struct netdev_name_node *name_node;
258  
259  	name_node = kmalloc(sizeof(*name_node), GFP_KERNEL);
260  	if (!name_node)
261  		return NULL;
262  	INIT_HLIST_NODE(&name_node->hlist);
263  	name_node->dev = dev;
264  	name_node->name = name;
265  	return name_node;
266  }
267  
268  static struct netdev_name_node *
269  netdev_name_node_head_alloc(struct net_device *dev)
270  {
271  	struct netdev_name_node *name_node;
272  
273  	name_node = netdev_name_node_alloc(dev, dev->name);
274  	if (!name_node)
275  		return NULL;
276  	INIT_LIST_HEAD(&name_node->list);
277  	return name_node;
278  }
279  
280  static void netdev_name_node_free(struct netdev_name_node *name_node)
281  {
282  	kfree(name_node);
283  }
284  
285  static void netdev_name_node_add(struct net *net,
286  				 struct netdev_name_node *name_node)
287  {
288  	hlist_add_head_rcu(&name_node->hlist,
289  			   dev_name_hash(net, name_node->name));
290  }
291  
292  static void netdev_name_node_del(struct netdev_name_node *name_node)
293  {
294  	hlist_del_rcu(&name_node->hlist);
295  }
296  
297  static struct netdev_name_node *netdev_name_node_lookup(struct net *net,
298  							const char *name)
299  {
300  	struct hlist_head *head = dev_name_hash(net, name);
301  	struct netdev_name_node *name_node;
302  
303  	hlist_for_each_entry(name_node, head, hlist)
304  		if (!strcmp(name_node->name, name))
305  			return name_node;
306  	return NULL;
307  }
308  
309  static struct netdev_name_node *netdev_name_node_lookup_rcu(struct net *net,
310  							    const char *name)
311  {
312  	struct hlist_head *head = dev_name_hash(net, name);
313  	struct netdev_name_node *name_node;
314  
315  	hlist_for_each_entry_rcu(name_node, head, hlist)
316  		if (!strcmp(name_node->name, name))
317  			return name_node;
318  	return NULL;
319  }
320  
321  bool netdev_name_in_use(struct net *net, const char *name)
322  {
323  	return netdev_name_node_lookup(net, name);
324  }
325  EXPORT_SYMBOL(netdev_name_in_use);
326  
327  int netdev_name_node_alt_create(struct net_device *dev, const char *name)
328  {
329  	struct netdev_name_node *name_node;
330  	struct net *net = dev_net(dev);
331  
332  	name_node = netdev_name_node_lookup(net, name);
333  	if (name_node)
334  		return -EEXIST;
335  	name_node = netdev_name_node_alloc(dev, name);
336  	if (!name_node)
337  		return -ENOMEM;
338  	netdev_name_node_add(net, name_node);
339  	/* The node that holds dev->name acts as a head of per-device list. */
340  	list_add_tail(&name_node->list, &dev->name_node->list);
341  
342  	return 0;
343  }
344  
345  static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node)
346  {
347  	list_del(&name_node->list);
348  	kfree(name_node->name);
349  	netdev_name_node_free(name_node);
350  }
351  
352  int netdev_name_node_alt_destroy(struct net_device *dev, const char *name)
353  {
354  	struct netdev_name_node *name_node;
355  	struct net *net = dev_net(dev);
356  
357  	name_node = netdev_name_node_lookup(net, name);
358  	if (!name_node)
359  		return -ENOENT;
360  	/* lookup might have found our primary name or a name belonging
361  	 * to another device.
362  	 */
363  	if (name_node == dev->name_node || name_node->dev != dev)
364  		return -EINVAL;
365  
366  	netdev_name_node_del(name_node);
367  	synchronize_rcu();
368  	__netdev_name_node_alt_destroy(name_node);
369  
370  	return 0;
371  }
372  
373  static void netdev_name_node_alt_flush(struct net_device *dev)
374  {
375  	struct netdev_name_node *name_node, *tmp;
376  
377  	list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list)
378  		__netdev_name_node_alt_destroy(name_node);
379  }
380  
381  /* Device list insertion */
382  static void list_netdevice(struct net_device *dev)
383  {
384  	struct netdev_name_node *name_node;
385  	struct net *net = dev_net(dev);
386  
387  	ASSERT_RTNL();
388  
389  	write_lock(&dev_base_lock);
390  	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
391  	netdev_name_node_add(net, dev->name_node);
392  	hlist_add_head_rcu(&dev->index_hlist,
393  			   dev_index_hash(net, dev->ifindex));
394  	write_unlock(&dev_base_lock);
395  
396  	netdev_for_each_altname(dev, name_node)
397  		netdev_name_node_add(net, name_node);
398  
399  	/* We reserved the ifindex, this can't fail */
400  	WARN_ON(xa_store(&net->dev_by_index, dev->ifindex, dev, GFP_KERNEL));
401  
402  	dev_base_seq_inc(net);
403  }
404  
405  /* Device list removal
406   * caller must respect a RCU grace period before freeing/reusing dev
407   */
408  static void unlist_netdevice(struct net_device *dev, bool lock)
409  {
410  	struct netdev_name_node *name_node;
411  	struct net *net = dev_net(dev);
412  
413  	ASSERT_RTNL();
414  
415  	xa_erase(&net->dev_by_index, dev->ifindex);
416  
417  	netdev_for_each_altname(dev, name_node)
418  		netdev_name_node_del(name_node);
419  
420  	/* Unlink dev from the device chain */
421  	if (lock)
422  		write_lock(&dev_base_lock);
423  	list_del_rcu(&dev->dev_list);
424  	netdev_name_node_del(dev->name_node);
425  	hlist_del_rcu(&dev->index_hlist);
426  	if (lock)
427  		write_unlock(&dev_base_lock);
428  
429  	dev_base_seq_inc(dev_net(dev));
430  }
431  
432  /*
433   *	Our notifier list
434   */
435  
436  static RAW_NOTIFIER_HEAD(netdev_chain);
437  
438  /*
439   *	Device drivers call our routines to queue packets here. We empty the
440   *	queue in the local softnet handler.
441   */
442  
443  DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
444  EXPORT_PER_CPU_SYMBOL(softnet_data);
445  
446  #ifdef CONFIG_LOCKDEP
447  /*
448   * register_netdevice() inits txq->_xmit_lock and sets lockdep class
449   * according to dev->type
450   */
451  static const unsigned short netdev_lock_type[] = {
452  	 ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
453  	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
454  	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
455  	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
456  	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
457  	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
458  	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
459  	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
460  	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
461  	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
462  	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
463  	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
464  	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
465  	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
466  	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
467  
468  static const char *const netdev_lock_name[] = {
469  	"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
470  	"_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
471  	"_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
472  	"_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
473  	"_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
474  	"_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
475  	"_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
476  	"_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
477  	"_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
478  	"_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
479  	"_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
480  	"_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
481  	"_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
482  	"_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
483  	"_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
484  
485  static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
486  static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
487  
488  static inline unsigned short netdev_lock_pos(unsigned short dev_type)
489  {
490  	int i;
491  
492  	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
493  		if (netdev_lock_type[i] == dev_type)
494  			return i;
495  	/* the last key is used by default */
496  	return ARRAY_SIZE(netdev_lock_type) - 1;
497  }
498  
499  static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
500  						 unsigned short dev_type)
501  {
502  	int i;
503  
504  	i = netdev_lock_pos(dev_type);
505  	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
506  				   netdev_lock_name[i]);
507  }
508  
509  static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
510  {
511  	int i;
512  
513  	i = netdev_lock_pos(dev->type);
514  	lockdep_set_class_and_name(&dev->addr_list_lock,
515  				   &netdev_addr_lock_key[i],
516  				   netdev_lock_name[i]);
517  }
518  #else
519  static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
520  						 unsigned short dev_type)
521  {
522  }
523  
524  static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
525  {
526  }
527  #endif
528  
529  /*******************************************************************************
530   *
531   *		Protocol management and registration routines
532   *
533   *******************************************************************************/
534  
535  
536  /*
537   *	Add a protocol ID to the list. Now that the input handler is
538   *	smarter we can dispense with all the messy stuff that used to be
539   *	here.
540   *
541   *	BEWARE!!! Protocol handlers, mangling input packets,
542   *	MUST BE last in hash buckets and checking protocol handlers
543   *	MUST start from promiscuous ptype_all chain in net_bh.
544   *	It is true now, do not change it.
545   *	Explanation follows: if protocol handler, mangling packet, will
546   *	be the first on list, it is not able to sense, that packet
547   *	is cloned and should be copied-on-write, so that it will
548   *	change it and subsequent readers will get broken packet.
549   *							--ANK (980803)
550   */
551  
552  static inline struct list_head *ptype_head(const struct packet_type *pt)
553  {
554  	if (pt->type == htons(ETH_P_ALL))
555  		return pt->dev ? &pt->dev->ptype_all : &ptype_all;
556  	else
557  		return pt->dev ? &pt->dev->ptype_specific :
558  				 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
559  }
560  
561  /**
562   *	dev_add_pack - add packet handler
563   *	@pt: packet type declaration
564   *
565   *	Add a protocol handler to the networking stack. The passed &packet_type
566   *	is linked into kernel lists and may not be freed until it has been
567   *	removed from the kernel lists.
568   *
569   *	This call does not sleep, therefore it cannot guarantee that
570   *	all CPUs that are in the middle of receiving packets will see
571   *	the new packet type (until the next received packet).
572   */
573  
574  void dev_add_pack(struct packet_type *pt)
575  {
576  	struct list_head *head = ptype_head(pt);
577  
578  	spin_lock(&ptype_lock);
579  	list_add_rcu(&pt->list, head);
580  	spin_unlock(&ptype_lock);
581  }
582  EXPORT_SYMBOL(dev_add_pack);
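
/* Illustrative sketch (not in the original file): a hypothetical module could
 * tap all received packets roughly like this; my_pt and my_tap_rcv are
 * made-up names, and the handler follows the struct packet_type callback
 * signature used by the taps on ptype_all.
 *
 *	static int my_tap_rcv(struct sk_buff *skb, struct net_device *dev,
 *			      struct packet_type *pt, struct net_device *orig_dev)
 *	{
 *		kfree_skb(skb);		// consume our reference to the skb
 *		return NET_RX_SUCCESS;
 *	}
 *
 *	static struct packet_type my_pt __read_mostly = {
 *		.type	= cpu_to_be16(ETH_P_ALL),
 *		.func	= my_tap_rcv,
 *	};
 *
 *	dev_add_pack(&my_pt);		// e.g. at module init
 *	...
 *	dev_remove_pack(&my_pt);	// matching removal, e.g. at module exit
 */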
583  
584  /**
585   *	__dev_remove_pack	 - remove packet handler
586   *	@pt: packet type declaration
587   *
588   *	Remove a protocol handler that was previously added to the kernel
589   *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
590   *	from the kernel lists and can be freed or reused once this function
591   *	returns.
592   *
593   *	The packet type might still be in use by receivers
594   *	and must not be freed until after all the CPUs have gone
595   *	through a quiescent state.
596   */
597  void __dev_remove_pack(struct packet_type *pt)
598  {
599  	struct list_head *head = ptype_head(pt);
600  	struct packet_type *pt1;
601  
602  	spin_lock(&ptype_lock);
603  
604  	list_for_each_entry(pt1, head, list) {
605  		if (pt == pt1) {
606  			list_del_rcu(&pt->list);
607  			goto out;
608  		}
609  	}
610  
611  	pr_warn("dev_remove_pack: %p not found\n", pt);
612  out:
613  	spin_unlock(&ptype_lock);
614  }
615  EXPORT_SYMBOL(__dev_remove_pack);
616  
617  /**
618   *	dev_remove_pack	 - remove packet handler
619   *	@pt: packet type declaration
620   *
621   *	Remove a protocol handler that was previously added to the kernel
622   *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
623   *	from the kernel lists and can be freed or reused once this function
624   *	returns.
625   *
626   *	This call sleeps to guarantee that no CPU is looking at the packet
627   *	type after return.
628   */
629  void dev_remove_pack(struct packet_type *pt)
630  {
631  	__dev_remove_pack(pt);
632  
633  	synchronize_net();
634  }
635  EXPORT_SYMBOL(dev_remove_pack);
636  
637  
638  /*******************************************************************************
639   *
640   *			    Device Interface Subroutines
641   *
642   *******************************************************************************/
643  
644  /**
645   *	dev_get_iflink	- get 'iflink' value of an interface
646   *	@dev: targeted interface
647   *
648   *	Indicates the ifindex the interface is linked to.
649   *	Physical interfaces have the same 'ifindex' and 'iflink' values.
650   */
651  
652  int dev_get_iflink(const struct net_device *dev)
653  {
654  	if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
655  		return dev->netdev_ops->ndo_get_iflink(dev);
656  
657  	return dev->ifindex;
658  }
659  EXPORT_SYMBOL(dev_get_iflink);
660  
661  /**
662   *	dev_fill_metadata_dst - Retrieve tunnel egress information.
663   *	@dev: targeted interface
664   *	@skb: The packet.
665   *
666   *	For better visibility of tunnel traffic, OVS needs to retrieve
667   *	egress tunnel information for a packet. The following API allows
668   *	the user to get this info.
669   */
670  int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
671  {
672  	struct ip_tunnel_info *info;
673  
674  	if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst)
675  		return -EINVAL;
676  
677  	info = skb_tunnel_info_unclone(skb);
678  	if (!info)
679  		return -ENOMEM;
680  	if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
681  		return -EINVAL;
682  
683  	return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
684  }
685  EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
686  
687  static struct net_device_path *dev_fwd_path(struct net_device_path_stack *stack)
688  {
689  	int k = stack->num_paths++;
690  
691  	if (WARN_ON_ONCE(k >= NET_DEVICE_PATH_STACK_MAX))
692  		return NULL;
693  
694  	return &stack->path[k];
695  }
696  
697  int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr,
698  			  struct net_device_path_stack *stack)
699  {
700  	const struct net_device *last_dev;
701  	struct net_device_path_ctx ctx = {
702  		.dev	= dev,
703  	};
704  	struct net_device_path *path;
705  	int ret = 0;
706  
707  	memcpy(ctx.daddr, daddr, sizeof(ctx.daddr));
708  	stack->num_paths = 0;
709  	while (ctx.dev && ctx.dev->netdev_ops->ndo_fill_forward_path) {
710  		last_dev = ctx.dev;
711  		path = dev_fwd_path(stack);
712  		if (!path)
713  			return -1;
714  
715  		memset(path, 0, sizeof(struct net_device_path));
716  		ret = ctx.dev->netdev_ops->ndo_fill_forward_path(&ctx, path);
717  		if (ret < 0)
718  			return -1;
719  
720  		if (WARN_ON_ONCE(last_dev == ctx.dev))
721  			return -1;
722  	}
723  
724  	if (!ctx.dev)
725  		return ret;
726  
727  	path = dev_fwd_path(stack);
728  	if (!path)
729  		return -1;
730  	path->type = DEV_PATH_ETHERNET;
731  	path->dev = ctx.dev;
732  
733  	return ret;
734  }
735  EXPORT_SYMBOL_GPL(dev_fill_forward_path);
736  
737  /**
738   *	__dev_get_by_name	- find a device by its name
739   *	@net: the applicable net namespace
740   *	@name: name to find
741   *
742   *	Find an interface by name. Must be called under RTNL semaphore
743   *	or @dev_base_lock. If the name is found a pointer to the device
744   *	is returned. If the name is not found then %NULL is returned. The
745   *	reference counters are not incremented so the caller must be
746   *	careful with locks.
747   */
748  
749  struct net_device *__dev_get_by_name(struct net *net, const char *name)
750  {
751  	struct netdev_name_node *node_name;
752  
753  	node_name = netdev_name_node_lookup(net, name);
754  	return node_name ? node_name->dev : NULL;
755  }
756  EXPORT_SYMBOL(__dev_get_by_name);
757  
758  /**
759   * dev_get_by_name_rcu	- find a device by its name
760   * @net: the applicable net namespace
761   * @name: name to find
762   *
763   * Find an interface by name.
764   * If the name is found a pointer to the device is returned.
765   * If the name is not found then %NULL is returned.
766   * The reference counters are not incremented so the caller must be
767   * careful with locks. The caller must hold RCU lock.
768   */
769  
770  struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
771  {
772  	struct netdev_name_node *node_name;
773  
774  	node_name = netdev_name_node_lookup_rcu(net, name);
775  	return node_name ? node_name->dev : NULL;
776  }
777  EXPORT_SYMBOL(dev_get_by_name_rcu);
778  
779  /* Deprecated for new users, call netdev_get_by_name() instead */
780  struct net_device *dev_get_by_name(struct net *net, const char *name)
781  {
782  	struct net_device *dev;
783  
784  	rcu_read_lock();
785  	dev = dev_get_by_name_rcu(net, name);
786  	dev_hold(dev);
787  	rcu_read_unlock();
788  	return dev;
789  }
790  EXPORT_SYMBOL(dev_get_by_name);
791  
792  /**
793   *	netdev_get_by_name() - find a device by its name
794   *	@net: the applicable net namespace
795   *	@name: name to find
796   *	@tracker: tracking object for the acquired reference
797   *	@gfp: allocation flags for the tracker
798   *
799   *	Find an interface by name. This can be called from any
800   *	context and does its own locking. The returned handle has
801   *	the usage count incremented and the caller must use netdev_put() to
802   *	release it when it is no longer needed. %NULL is returned if no
803   *	matching device is found.
804   */
805  struct net_device *netdev_get_by_name(struct net *net, const char *name,
806  				      netdevice_tracker *tracker, gfp_t gfp)
807  {
808  	struct net_device *dev;
809  
810  	dev = dev_get_by_name(net, name);
811  	if (dev)
812  		netdev_tracker_alloc(dev, tracker, gfp);
813  	return dev;
814  }
815  EXPORT_SYMBOL(netdev_get_by_name);
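
/* Illustrative sketch (not in the original file): a tracked lookup paired with
 * netdev_put(); "eth0" and the local tracker are just example values.
 *
 *	netdevice_tracker tracker;
 *	struct net_device *dev;
 *
 *	dev = netdev_get_by_name(&init_net, "eth0", &tracker, GFP_KERNEL);
 *	if (dev) {
 *		// ... use dev ...
 *		netdev_put(dev, &tracker);
 *	}
 */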
816  
817  /**
818   *	__dev_get_by_index - find a device by its ifindex
819   *	@net: the applicable net namespace
820   *	@ifindex: index of device
821   *
822   *	Search for an interface by index. Returns a pointer to the device,
823   *	or %NULL if the device is not found. The device has not
824   *	had its reference counter increased so the caller must be careful
825   *	about locking. The caller must hold either the RTNL semaphore
826   *	or @dev_base_lock.
827   */
828  
829  struct net_device *__dev_get_by_index(struct net *net, int ifindex)
830  {
831  	struct net_device *dev;
832  	struct hlist_head *head = dev_index_hash(net, ifindex);
833  
834  	hlist_for_each_entry(dev, head, index_hlist)
835  		if (dev->ifindex == ifindex)
836  			return dev;
837  
838  	return NULL;
839  }
840  EXPORT_SYMBOL(__dev_get_by_index);
841  
842  /**
843   *	dev_get_by_index_rcu - find a device by its ifindex
844   *	@net: the applicable net namespace
845   *	@ifindex: index of device
846   *
847   *	Search for an interface by index. Returns a pointer to the device,
848   *	or %NULL if the device is not found. The device has not
849   *	had its reference counter increased so the caller must be careful
850   *	about locking. The caller must hold RCU lock.
851   */
852  
853  struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
854  {
855  	struct net_device *dev;
856  	struct hlist_head *head = dev_index_hash(net, ifindex);
857  
858  	hlist_for_each_entry_rcu(dev, head, index_hlist)
859  		if (dev->ifindex == ifindex)
860  			return dev;
861  
862  	return NULL;
863  }
864  EXPORT_SYMBOL(dev_get_by_index_rcu);
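
/* Illustrative sketch (not in the original file): a reference-free lookup is
 * only valid while rcu_read_lock() is held.
 *
 *	rcu_read_lock();
 *	dev = dev_get_by_index_rcu(net, ifindex);
 *	if (dev)
 *		pr_info("ifindex %d is %s\n", ifindex, dev->name);
 *	rcu_read_unlock();
 *
 * The pointer must not be used after rcu_read_unlock() unless a reference was
 * taken first (see netdev_get_by_index() below).
 */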
865  
866  /* Deprecated for new users, call netdev_get_by_index() instead */
867  struct net_device *dev_get_by_index(struct net *net, int ifindex)
868  {
869  	struct net_device *dev;
870  
871  	rcu_read_lock();
872  	dev = dev_get_by_index_rcu(net, ifindex);
873  	dev_hold(dev);
874  	rcu_read_unlock();
875  	return dev;
876  }
877  EXPORT_SYMBOL(dev_get_by_index);
878  
879  /**
880   *	netdev_get_by_index() - find a device by its ifindex
881   *	@net: the applicable net namespace
882   *	@ifindex: index of device
883   *	@tracker: tracking object for the acquired reference
884   *	@gfp: allocation flags for the tracker
885   *
886   *	Search for an interface by index. Returns a pointer to the device,
887   *	or NULL if the device is not found. The device returned has
888   *	had a reference added and the pointer is safe until the user calls
889   *	netdev_put() to indicate they have finished with it.
890   */
891  struct net_device *netdev_get_by_index(struct net *net, int ifindex,
892  				       netdevice_tracker *tracker, gfp_t gfp)
893  {
894  	struct net_device *dev;
895  
896  	dev = dev_get_by_index(net, ifindex);
897  	if (dev)
898  		netdev_tracker_alloc(dev, tracker, gfp);
899  	return dev;
900  }
901  EXPORT_SYMBOL(netdev_get_by_index);
902  
903  /**
904   *	dev_get_by_napi_id - find a device by napi_id
905   *	@napi_id: ID of the NAPI struct
906   *
907   *	Search for an interface by NAPI ID. Returns a pointer to the device,
908   *	or %NULL if the device is not found. The device has not had
909   *	its reference counter increased so the caller must be careful
910   *	about locking. The caller must hold RCU lock.
911   */
912  
913  struct net_device *dev_get_by_napi_id(unsigned int napi_id)
914  {
915  	struct napi_struct *napi;
916  
917  	WARN_ON_ONCE(!rcu_read_lock_held());
918  
919  	if (napi_id < MIN_NAPI_ID)
920  		return NULL;
921  
922  	napi = napi_by_id(napi_id);
923  
924  	return napi ? napi->dev : NULL;
925  }
926  EXPORT_SYMBOL(dev_get_by_napi_id);
927  
928  /**
929   *	netdev_get_name - get a netdevice name, knowing its ifindex.
930   *	@net: network namespace
931   *	@name: a pointer to the buffer where the name will be stored.
932   *	@ifindex: the ifindex of the interface to get the name from.
933   */
934  int netdev_get_name(struct net *net, char *name, int ifindex)
935  {
936  	struct net_device *dev;
937  	int ret;
938  
939  	down_read(&devnet_rename_sem);
940  	rcu_read_lock();
941  
942  	dev = dev_get_by_index_rcu(net, ifindex);
943  	if (!dev) {
944  		ret = -ENODEV;
945  		goto out;
946  	}
947  
948  	strcpy(name, dev->name);
949  
950  	ret = 0;
951  out:
952  	rcu_read_unlock();
953  	up_read(&devnet_rename_sem);
954  	return ret;
955  }
956  
957  static bool dev_addr_cmp(struct net_device *dev, unsigned short type,
958  			 const char *ha)
959  {
960  	return dev->type == type && !memcmp(dev->dev_addr, ha, dev->addr_len);
961  }
962  
963  /**
964   *	dev_getbyhwaddr_rcu - find a device by its hardware address
965   *	@net: the applicable net namespace
966   *	@type: media type of device
967   *	@ha: hardware address
968   *
969   *	Search for an interface by MAC address. Returns a pointer to the
970   *	device, or NULL if the device is not found.
971   *	The caller must hold RCU.
972   *	The returned device has not had its ref count increased
973   *	and the caller must therefore be careful about locking
974   *
975   */
976  
977  struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
978  				       const char *ha)
979  {
980  	struct net_device *dev;
981  
982  	for_each_netdev_rcu(net, dev)
983  		if (dev_addr_cmp(dev, type, ha))
984  			return dev;
985  
986  	return NULL;
987  }
988  EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
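
/* Illustrative sketch (not in the original file): looking up an Ethernet
 * device by MAC address under RCU; mac is assumed to point at ETH_ALEN bytes.
 *
 *	rcu_read_lock();
 *	dev = dev_getbyhwaddr_rcu(net, ARPHRD_ETHER, mac);
 *	if (dev)
 *		pr_info("%pM belongs to %s\n", mac, dev->name);
 *	rcu_read_unlock();
 */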
989  
990  /**
991   * dev_getbyhwaddr() - find a device by its hardware address
992   * @net: the applicable net namespace
993   * @type: media type of device
994   * @ha: hardware address
995   *
996   * Similar to dev_getbyhwaddr_rcu(), but the owner needs to hold
997   * rtnl_lock.
998   *
999   * Context: rtnl_lock() must be held.
1000   * Return: pointer to the net_device, or NULL if not found
1001   */
1002  struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type,
1003  				   const char *ha)
1004  {
1005  	struct net_device *dev;
1006  
1007  	ASSERT_RTNL();
1008  	for_each_netdev(net, dev)
1009  		if (dev_addr_cmp(dev, type, ha))
1010  			return dev;
1011  
1012  	return NULL;
1013  }
1014  EXPORT_SYMBOL(dev_getbyhwaddr);
1015  
1016  struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
1017  {
1018  	struct net_device *dev, *ret = NULL;
1019  
1020  	rcu_read_lock();
1021  	for_each_netdev_rcu(net, dev)
1022  		if (dev->type == type) {
1023  			dev_hold(dev);
1024  			ret = dev;
1025  			break;
1026  		}
1027  	rcu_read_unlock();
1028  	return ret;
1029  }
1030  EXPORT_SYMBOL(dev_getfirstbyhwtype);
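
/* Illustrative sketch (not in the original file): the device returned by
 * dev_getfirstbyhwtype() holds a reference, so release it with dev_put().
 *
 *	dev = dev_getfirstbyhwtype(net, ARPHRD_ETHER);
 *	if (dev) {
 *		// ... use dev ...
 *		dev_put(dev);
 *	}
 */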
1031  
1032  /**
1033   *	__dev_get_by_flags - find any device with given flags
1034   *	@net: the applicable net namespace
1035   *	@if_flags: IFF_* values
1036   *	@mask: bitmask of bits in if_flags to check
1037   *
1038   *	Search for any interface with the given flags. Returns a pointer to
1039   *	the first matching device, or NULL if none is found. Must be called
1040   *	inside rtnl_lock(), and the result's refcount is unchanged.
1041   */
1042  
1043  struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
1044  				      unsigned short mask)
1045  {
1046  	struct net_device *dev, *ret;
1047  
1048  	ASSERT_RTNL();
1049  
1050  	ret = NULL;
1051  	for_each_netdev(net, dev) {
1052  		if (((dev->flags ^ if_flags) & mask) == 0) {
1053  			ret = dev;
1054  			break;
1055  		}
1056  	}
1057  	return ret;
1058  }
1059  EXPORT_SYMBOL(__dev_get_by_flags);
1060  
1061  /**
1062   *	dev_valid_name - check if name is okay for network device
1063   *	@name: name string
1064   *
1065   *	Network device names need to be valid file names to
1066   *	allow sysfs to work.  We also disallow any kind of
1067   *	whitespace.
1068   */
1069  bool dev_valid_name(const char *name)
1070  {
1071  	if (*name == '\0')
1072  		return false;
1073  	if (strnlen(name, IFNAMSIZ) == IFNAMSIZ)
1074  		return false;
1075  	if (!strcmp(name, ".") || !strcmp(name, ".."))
1076  		return false;
1077  
1078  	while (*name) {
1079  		if (*name == '/' || *name == ':' || isspace(*name))
1080  			return false;
1081  		name++;
1082  	}
1083  	return true;
1084  }
1085  EXPORT_SYMBOL(dev_valid_name);
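
/* Illustrative sketch (not in the original file): typical validation of a
 * user-supplied name before using it.
 *
 *	if (!dev_valid_name(newname))
 *		return -EINVAL;
 *
 * Per the checks above, names like "eth0", "br-lan" or "wg%d" are accepted
 * (the '%' is resolved later by name allocation), while "", ".", "..",
 * "a/b", "a:b", names containing whitespace, and names of IFNAMSIZ or more
 * characters are rejected.
 */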
1086  
1087  /**
1088   *	__dev_alloc_name - allocate a name for a device
1089   *	@net: network namespace to allocate the device name in
1090   *	@name: name format string
1091   *	@buf:  scratch buffer and result name string
1092   *
1093   *	Passed a format string - e.g. "lt%d" - it will try to find a suitable
1094   *	id. It scans the list of devices to build up a free map, then chooses
1095   *	the first empty slot. The caller must hold the dev_base or rtnl lock
1096   *	while allocating the name and adding the device in order to avoid
1097   *	duplicates.
1098   *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1099   *	Returns the number of the unit assigned or a negative errno code.
1100   */
1101  
1102  static int __dev_alloc_name(struct net *net, const char *name, char *buf)
1103  {
1104  	int i = 0;
1105  	const char *p;
1106  	const int max_netdevices = 8*PAGE_SIZE;
1107  	unsigned long *inuse;
1108  	struct net_device *d;
1109  
1110  	if (!dev_valid_name(name))
1111  		return -EINVAL;
1112  
1113  	p = strchr(name, '%');
1114  	if (p) {
1115  		/*
1116  		 * Verify the string as this thing may have come from
1117  		 * the user.  There must be exactly one "%d" and no other "%"
1118  		 * characters.
1119  		 */
1120  		if (p[1] != 'd' || strchr(p + 2, '%'))
1121  			return -EINVAL;
1122  
1123  		/* Use one page as a bit array of possible slots */
1124  		inuse = bitmap_zalloc(max_netdevices, GFP_ATOMIC);
1125  		if (!inuse)
1126  			return -ENOMEM;
1127  
1128  		for_each_netdev(net, d) {
1129  			struct netdev_name_node *name_node;
1130  
1131  			netdev_for_each_altname(d, name_node) {
1132  				if (!sscanf(name_node->name, name, &i))
1133  					continue;
1134  				if (i < 0 || i >= max_netdevices)
1135  					continue;
1136  
1137  				/*  avoid cases where sscanf is not exact inverse of printf */
1138  				snprintf(buf, IFNAMSIZ, name, i);
1139  				if (!strncmp(buf, name_node->name, IFNAMSIZ))
1140  					__set_bit(i, inuse);
1141  			}
1142  			if (!sscanf(d->name, name, &i))
1143  				continue;
1144  			if (i < 0 || i >= max_netdevices)
1145  				continue;
1146  
1147  			/*  avoid cases where sscanf is not exact inverse of printf */
1148  			snprintf(buf, IFNAMSIZ, name, i);
1149  			if (!strncmp(buf, d->name, IFNAMSIZ))
1150  				__set_bit(i, inuse);
1151  		}
1152  
1153  		i = find_first_zero_bit(inuse, max_netdevices);
1154  		bitmap_free(inuse);
1155  	}
1156  
1157  	snprintf(buf, IFNAMSIZ, name, i);
1158  	if (!netdev_name_in_use(net, buf))
1159  		return i;
1160  
1161  	/* It is possible to run out of possible slots
1162  	 * when the name is long and there isn't enough space left
1163  	 * for the digits, or if all bits are used.
1164  	 */
1165  	return -ENFILE;
1166  }
1167  
1168  static int dev_prep_valid_name(struct net *net, struct net_device *dev,
1169  			       const char *want_name, char *out_name)
1170  {
1171  	int ret;
1172  
1173  	if (!dev_valid_name(want_name))
1174  		return -EINVAL;
1175  
1176  	if (strchr(want_name, '%')) {
1177  		ret = __dev_alloc_name(net, want_name, out_name);
1178  		return ret < 0 ? ret : 0;
1179  	} else if (netdev_name_in_use(net, want_name)) {
1180  		return -EEXIST;
1181  	} else if (out_name != want_name) {
1182  		strscpy(out_name, want_name, IFNAMSIZ);
1183  	}
1184  
1185  	return 0;
1186  }
1187  
1188  static int dev_alloc_name_ns(struct net *net,
1189  			     struct net_device *dev,
1190  			     const char *name)
1191  {
1192  	char buf[IFNAMSIZ];
1193  	int ret;
1194  
1195  	BUG_ON(!net);
1196  	ret = __dev_alloc_name(net, name, buf);
1197  	if (ret >= 0)
1198  		strscpy(dev->name, buf, IFNAMSIZ);
1199  	return ret;
1200  }
1201  
1202  /**
1203   *	dev_alloc_name - allocate a name for a device
1204   *	@dev: device
1205   *	@name: name format string
1206   *
1207   *	Passed a format string - e.g. "lt%d" - it will try to find a suitable
1208   *	id. It scans the list of devices to build up a free map, then chooses
1209   *	the first empty slot. The caller must hold the dev_base or rtnl lock
1210   *	while allocating the name and adding the device in order to avoid
1211   *	duplicates.
1212   *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1213   *	Returns the number of the unit assigned or a negative errno code.
1214   */
1215  
1216  int dev_alloc_name(struct net_device *dev, const char *name)
1217  {
1218  	return dev_alloc_name_ns(dev_net(dev), dev, name);
1219  }
1220  EXPORT_SYMBOL(dev_alloc_name);
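
/* Illustrative sketch (not in the original file): a driver wanting an
 * automatically numbered name can pass a "%d" pattern while holding the rtnl
 * lock; "mydev%d" is a made-up format string.
 *
 *	err = dev_alloc_name(dev, "mydev%d");
 *	if (err < 0)
 *		return err;
 *	// dev->name now holds e.g. "mydev0"; err is the unit number chosen
 */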
1221  
1222  static int dev_get_valid_name(struct net *net, struct net_device *dev,
1223  			      const char *name)
1224  {
1225  	char buf[IFNAMSIZ];
1226  	int ret;
1227  
1228  	ret = dev_prep_valid_name(net, dev, name, buf);
1229  	if (ret >= 0)
1230  		strscpy(dev->name, buf, IFNAMSIZ);
1231  	return ret;
1232  }
1233  
1234  /**
1235   *	dev_change_name - change name of a device
1236   *	@dev: device
1237   *	@newname: name (or format string) must be at least IFNAMSIZ
1238   *
1239   *	Change the name of a device. A format string such as "eth%d"
1240   *	can be passed for wildcarding.
1241   */
1242  int dev_change_name(struct net_device *dev, const char *newname)
1243  {
1244  	unsigned char old_assign_type;
1245  	char oldname[IFNAMSIZ];
1246  	int err = 0;
1247  	int ret;
1248  	struct net *net;
1249  
1250  	ASSERT_RTNL();
1251  	BUG_ON(!dev_net(dev));
1252  
1253  	net = dev_net(dev);
1254  
1255  	down_write(&devnet_rename_sem);
1256  
1257  	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1258  		up_write(&devnet_rename_sem);
1259  		return 0;
1260  	}
1261  
1262  	memcpy(oldname, dev->name, IFNAMSIZ);
1263  
1264  	err = dev_get_valid_name(net, dev, newname);
1265  	if (err < 0) {
1266  		up_write(&devnet_rename_sem);
1267  		return err;
1268  	}
1269  
1270  	if (oldname[0] && !strchr(oldname, '%'))
1271  		netdev_info(dev, "renamed from %s%s\n", oldname,
1272  			    dev->flags & IFF_UP ? " (while UP)" : "");
1273  
1274  	old_assign_type = dev->name_assign_type;
1275  	dev->name_assign_type = NET_NAME_RENAMED;
1276  
1277  rollback:
1278  	ret = device_rename(&dev->dev, dev->name);
1279  	if (ret) {
1280  		memcpy(dev->name, oldname, IFNAMSIZ);
1281  		dev->name_assign_type = old_assign_type;
1282  		up_write(&devnet_rename_sem);
1283  		return ret;
1284  	}
1285  
1286  	up_write(&devnet_rename_sem);
1287  
1288  	netdev_adjacent_rename_links(dev, oldname);
1289  
1290  	write_lock(&dev_base_lock);
1291  	netdev_name_node_del(dev->name_node);
1292  	write_unlock(&dev_base_lock);
1293  
1294  	synchronize_rcu();
1295  
1296  	write_lock(&dev_base_lock);
1297  	netdev_name_node_add(net, dev->name_node);
1298  	write_unlock(&dev_base_lock);
1299  
1300  	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1301  	ret = notifier_to_errno(ret);
1302  
1303  	if (ret) {
1304  		/* err >= 0 after dev_alloc_name() or stores the first errno */
1305  		if (err >= 0) {
1306  			err = ret;
1307  			down_write(&devnet_rename_sem);
1308  			memcpy(dev->name, oldname, IFNAMSIZ);
1309  			memcpy(oldname, newname, IFNAMSIZ);
1310  			dev->name_assign_type = old_assign_type;
1311  			old_assign_type = NET_NAME_RENAMED;
1312  			goto rollback;
1313  		} else {
1314  			netdev_err(dev, "name change rollback failed: %d\n",
1315  				   ret);
1316  		}
1317  	}
1318  
1319  	return err;
1320  }
1321  
1322  /**
1323   *	dev_set_alias - change ifalias of a device
1324   *	@dev: device
1325   *	@alias: name up to IFALIASZ
1326   *	@len: limit of bytes to copy from info
1327   *
1328   *	Set ifalias for a device.
1329   */
1330  int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1331  {
1332  	struct dev_ifalias *new_alias = NULL;
1333  
1334  	if (len >= IFALIASZ)
1335  		return -EINVAL;
1336  
1337  	if (len) {
1338  		new_alias = kmalloc(sizeof(*new_alias) + len + 1, GFP_KERNEL);
1339  		if (!new_alias)
1340  			return -ENOMEM;
1341  
1342  		memcpy(new_alias->ifalias, alias, len);
1343  		new_alias->ifalias[len] = 0;
1344  	}
1345  
1346  	mutex_lock(&ifalias_mutex);
1347  	new_alias = rcu_replace_pointer(dev->ifalias, new_alias,
1348  					mutex_is_locked(&ifalias_mutex));
1349  	mutex_unlock(&ifalias_mutex);
1350  
1351  	if (new_alias)
1352  		kfree_rcu(new_alias, rcuhead);
1353  
1354  	return len;
1355  }
1356  EXPORT_SYMBOL(dev_set_alias);
1357  
1358  /**
1359   *	dev_get_alias - get ifalias of a device
1360   *	@dev: device
1361   *	@name: buffer to store name of ifalias
1362   *	@len: size of buffer
1363   *
1364   *	Get ifalias for a device. The caller must make sure dev cannot go
1365   *	away, e.g. by holding the RCU read lock or a reference to the device.
1366   */
1367  int dev_get_alias(const struct net_device *dev, char *name, size_t len)
1368  {
1369  	const struct dev_ifalias *alias;
1370  	int ret = 0;
1371  
1372  	rcu_read_lock();
1373  	alias = rcu_dereference(dev->ifalias);
1374  	if (alias)
1375  		ret = snprintf(name, len, "%s", alias->ifalias);
1376  	rcu_read_unlock();
1377  
1378  	return ret;
1379  }
1380  
1381  /**
1382   *	netdev_features_change - device changes features
1383   *	@dev: device to cause notification
1384   *
1385   *	Called to indicate a device has changed features.
1386   */
1387  void netdev_features_change(struct net_device *dev)
1388  {
1389  	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1390  }
1391  EXPORT_SYMBOL(netdev_features_change);
1392  
1393  /**
1394   *	netdev_state_change - device changes state
1395   *	@dev: device to cause notification
1396   *
1397   *	Called to indicate a device has changed state. This function calls
1398   *	the notifier chains for netdev_chain and sends a NEWLINK message
1399   *	to the routing socket.
1400   */
1401  void netdev_state_change(struct net_device *dev)
1402  {
1403  	if (dev->flags & IFF_UP) {
1404  		struct netdev_notifier_change_info change_info = {
1405  			.info.dev = dev,
1406  		};
1407  
1408  		call_netdevice_notifiers_info(NETDEV_CHANGE,
1409  					      &change_info.info);
1410  		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL, 0, NULL);
1411  	}
1412  }
1413  EXPORT_SYMBOL(netdev_state_change);
1414  
1415  /**
1416   * __netdev_notify_peers - notify network peers about existence of @dev,
1417   * to be called when rtnl lock is already held.
1418   * @dev: network device
1419   *
1420   * Generate traffic such that interested network peers are aware of
1421   * @dev, such as by generating a gratuitous ARP. This may be used when
1422   * a device wants to inform the rest of the network about some sort of
1423   * reconfiguration such as a failover event or virtual machine
1424   * migration.
1425   */
1426  void __netdev_notify_peers(struct net_device *dev)
1427  {
1428  	ASSERT_RTNL();
1429  	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1430  	call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev);
1431  }
1432  EXPORT_SYMBOL(__netdev_notify_peers);
1433  
1434  /**
1435   * netdev_notify_peers - notify network peers about existence of @dev
1436   * @dev: network device
1437   *
1438   * Generate traffic such that interested network peers are aware of
1439   * @dev, such as by generating a gratuitous ARP. This may be used when
1440   * a device wants to inform the rest of the network about some sort of
1441   * reconfiguration such as a failover event or virtual machine
1442   * migration.
1443   */
1444  void netdev_notify_peers(struct net_device *dev)
1445  {
1446  	rtnl_lock();
1447  	__netdev_notify_peers(dev);
1448  	rtnl_unlock();
1449  }
1450  EXPORT_SYMBOL(netdev_notify_peers);
1451  
1452  static int napi_threaded_poll(void *data);
1453  
1454  static int napi_kthread_create(struct napi_struct *n)
1455  {
1456  	int err = 0;
1457  
1458  	/* Create and wake up the kthread once to put it in
1459  	 * TASK_INTERRUPTIBLE mode to avoid the blocked task
1460  	 * warning and work with loadavg.
1461  	 */
1462  	n->thread = kthread_run(napi_threaded_poll, n, "napi/%s-%d",
1463  				n->dev->name, n->napi_id);
1464  	if (IS_ERR(n->thread)) {
1465  		err = PTR_ERR(n->thread);
1466  		pr_err("kthread_run failed with err %d\n", err);
1467  		n->thread = NULL;
1468  	}
1469  
1470  	return err;
1471  }
1472  
1473  static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
1474  {
1475  	const struct net_device_ops *ops = dev->netdev_ops;
1476  	int ret;
1477  
1478  	ASSERT_RTNL();
1479  	dev_addr_check(dev);
1480  
1481  	if (!netif_device_present(dev)) {
1482  		/* may be detached because parent is runtime-suspended */
1483  		if (dev->dev.parent)
1484  			pm_runtime_resume(dev->dev.parent);
1485  		if (!netif_device_present(dev))
1486  			return -ENODEV;
1487  	}
1488  
1489  	/* Block netpoll from trying to do any rx path servicing.
1490  	 * If we don't do this there is a chance ndo_poll_controller
1491  	 * or ndo_poll may be running while we open the device
1492  	 */
1493  	netpoll_poll_disable(dev);
1494  
1495  	ret = call_netdevice_notifiers_extack(NETDEV_PRE_UP, dev, extack);
1496  	ret = notifier_to_errno(ret);
1497  	if (ret)
1498  		return ret;
1499  
1500  	set_bit(__LINK_STATE_START, &dev->state);
1501  
1502  	if (ops->ndo_validate_addr)
1503  		ret = ops->ndo_validate_addr(dev);
1504  
1505  	if (!ret && ops->ndo_open)
1506  		ret = ops->ndo_open(dev);
1507  
1508  	netpoll_poll_enable(dev);
1509  
1510  	if (ret)
1511  		clear_bit(__LINK_STATE_START, &dev->state);
1512  	else {
1513  		dev->flags |= IFF_UP;
1514  		dev_set_rx_mode(dev);
1515  		dev_activate(dev);
1516  		add_device_randomness(dev->dev_addr, dev->addr_len);
1517  	}
1518  
1519  	return ret;
1520  }
1521  
1522  /**
1523   *	dev_open	- prepare an interface for use.
1524   *	@dev: device to open
1525   *	@extack: netlink extended ack
1526   *
1527   *	Takes a device from down to up state. The device's private open
1528   *	function is invoked and then the multicast lists are loaded. Finally
1529   *	the device is moved into the up state and a %NETDEV_UP message is
1530   *	sent to the netdev notifier chain.
1531   *
1532   *	Calling this function on an active interface is a nop. On a failure
1533   *	a negative errno code is returned.
1534   */
1535  int dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
1536  {
1537  	int ret;
1538  
1539  	if (dev->flags & IFF_UP)
1540  		return 0;
1541  
1542  	ret = __dev_open(dev, extack);
1543  	if (ret < 0)
1544  		return ret;
1545  
1546  	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP | IFF_RUNNING, GFP_KERNEL, 0, NULL);
1547  	call_netdevice_notifiers(NETDEV_UP, dev);
1548  
1549  	return ret;
1550  }
1551  EXPORT_SYMBOL(dev_open);
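
/* Illustrative sketch (not in the original file): bringing an interface up
 * from kernel code; "eth0" and init_net are only example values, and a NULL
 * extack is acceptable when no netlink request is involved.
 *
 *	rtnl_lock();
 *	dev = __dev_get_by_name(&init_net, "eth0");
 *	if (dev)
 *		err = dev_open(dev, NULL);
 *	rtnl_unlock();
 *
 * dev_close() (further below) is used the same way, also under the rtnl lock.
 */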
1552  
1553  static void __dev_close_many(struct list_head *head)
1554  {
1555  	struct net_device *dev;
1556  
1557  	ASSERT_RTNL();
1558  	might_sleep();
1559  
1560  	list_for_each_entry(dev, head, close_list) {
1561  		/* Temporarily disable netpoll until the interface is down */
1562  		netpoll_poll_disable(dev);
1563  
1564  		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1565  
1566  		clear_bit(__LINK_STATE_START, &dev->state);
1567  
1568  		/* Synchronize to scheduled poll. We cannot touch the poll list, it
1569  		 * may even be on a different CPU. So just clear netif_running().
1570  		 *
1571  		 * dev->stop() will invoke napi_disable() on all of its
1572  		 * napi_struct instances on this device.
1573  		 */
1574  		smp_mb__after_atomic(); /* Commit netif_running(). */
1575  	}
1576  
1577  	dev_deactivate_many(head);
1578  
1579  	list_for_each_entry(dev, head, close_list) {
1580  		const struct net_device_ops *ops = dev->netdev_ops;
1581  
1582  		/*
1583  		 *	Call the device specific close. This cannot fail.
1584  		 *	Only if device is UP
1585  		 *
1586  		 *	We allow it to be called even after a DETACH hot-plug
1587  		 *	event.
1588  		 */
1589  		if (ops->ndo_stop)
1590  			ops->ndo_stop(dev);
1591  
1592  		dev->flags &= ~IFF_UP;
1593  		netpoll_poll_enable(dev);
1594  	}
1595  }
1596  
1597  static void __dev_close(struct net_device *dev)
1598  {
1599  	LIST_HEAD(single);
1600  
1601  	list_add(&dev->close_list, &single);
1602  	__dev_close_many(&single);
1603  	list_del(&single);
1604  }
1605  
1606  void dev_close_many(struct list_head *head, bool unlink)
1607  {
1608  	struct net_device *dev, *tmp;
1609  
1610  	/* Remove the devices that don't need to be closed */
1611  	list_for_each_entry_safe(dev, tmp, head, close_list)
1612  		if (!(dev->flags & IFF_UP))
1613  			list_del_init(&dev->close_list);
1614  
1615  	__dev_close_many(head);
1616  
1617  	list_for_each_entry_safe(dev, tmp, head, close_list) {
1618  		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP | IFF_RUNNING, GFP_KERNEL, 0, NULL);
1619  		call_netdevice_notifiers(NETDEV_DOWN, dev);
1620  		if (unlink)
1621  			list_del_init(&dev->close_list);
1622  	}
1623  }
1624  EXPORT_SYMBOL(dev_close_many);
1625  
1626  /**
1627   *	dev_close - shutdown an interface.
1628   *	@dev: device to shutdown
1629   *
1630   *	This function moves an active device into down state. A
1631   *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1632   *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1633   *	chain.
1634   */
1635  void dev_close(struct net_device *dev)
1636  {
1637  	if (dev->flags & IFF_UP) {
1638  		LIST_HEAD(single);
1639  
1640  		list_add(&dev->close_list, &single);
1641  		dev_close_many(&single, true);
1642  		list_del(&single);
1643  	}
1644  }
1645  EXPORT_SYMBOL(dev_close);
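
/*
 * Editorial example (not part of dev.c): the matching shutdown path.
 * dev_close() also requires RTNL and is a no-op on a device that is
 * already down.  example_bring_down() is a hypothetical helper.
 */
static void example_bring_down(struct net_device *dev)
{
	rtnl_lock();
	dev_close(dev);
	rtnl_unlock();
}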
1646  
1647  
1648  /**
1649   *	dev_disable_lro - disable Large Receive Offload on a device
1650   *	@dev: device
1651   *
1652   *	Disable Large Receive Offload (LRO) on a net device.  Must be
1653   *	called under RTNL.  This is needed if received packets may be
1654   *	forwarded to another interface.
1655   */
1656  void dev_disable_lro(struct net_device *dev)
1657  {
1658  	struct net_device *lower_dev;
1659  	struct list_head *iter;
1660  
1661  	dev->wanted_features &= ~NETIF_F_LRO;
1662  	netdev_update_features(dev);
1663  
1664  	if (unlikely(dev->features & NETIF_F_LRO))
1665  		netdev_WARN(dev, "failed to disable LRO!\n");
1666  
1667  	netdev_for_each_lower_dev(dev, lower_dev, iter)
1668  		dev_disable_lro(lower_dev);
1669  }
1670  EXPORT_SYMBOL(dev_disable_lro);
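
/*
 * Editorial example (not part of dev.c): an upper device that will forward
 * frames received on @lower would typically disable LRO on it, under RTNL,
 * before it starts forwarding; the call recurses into lower devices.
 * example_prepare_forwarding() is hypothetical.
 */
static void example_prepare_forwarding(struct net_device *lower)
{
	ASSERT_RTNL();
	dev_disable_lro(lower);
}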
1671  
1672  /**
1673   *	dev_disable_gro_hw - disable HW Generic Receive Offload on a device
1674   *	@dev: device
1675   *
1676   *	Disable HW Generic Receive Offload (GRO_HW) on a net device.  Must be
1677   *	called under RTNL.  This is needed if Generic XDP is installed on
1678   *	the device.
1679   */
1680  static void dev_disable_gro_hw(struct net_device *dev)
1681  {
1682  	dev->wanted_features &= ~NETIF_F_GRO_HW;
1683  	netdev_update_features(dev);
1684  
1685  	if (unlikely(dev->features & NETIF_F_GRO_HW))
1686  		netdev_WARN(dev, "failed to disable GRO_HW!\n");
1687  }
1688  
1689  const char *netdev_cmd_to_name(enum netdev_cmd cmd)
1690  {
1691  #define N(val) 						\
1692  	case NETDEV_##val:				\
1693  		return "NETDEV_" __stringify(val);
1694  	switch (cmd) {
1695  	N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER)
1696  	N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE)
1697  	N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE)
1698  	N(POST_INIT) N(PRE_UNINIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN)
1699  	N(CHANGEUPPER) N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA)
1700  	N(BONDING_INFO) N(PRECHANGEUPPER) N(CHANGELOWERSTATE)
1701  	N(UDP_TUNNEL_PUSH_INFO) N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
1702  	N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO)
1703  	N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
1704  	N(PRE_CHANGEADDR) N(OFFLOAD_XSTATS_ENABLE) N(OFFLOAD_XSTATS_DISABLE)
1705  	N(OFFLOAD_XSTATS_REPORT_USED) N(OFFLOAD_XSTATS_REPORT_DELTA)
1706  	N(XDP_FEAT_CHANGE)
1707  	}
1708  #undef N
1709  	return "UNKNOWN_NETDEV_EVENT";
1710  }
1711  EXPORT_SYMBOL_GPL(netdev_cmd_to_name);
1712  
1713  static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1714  				   struct net_device *dev)
1715  {
1716  	struct netdev_notifier_info info = {
1717  		.dev = dev,
1718  	};
1719  
1720  	return nb->notifier_call(nb, val, &info);
1721  }
1722  
1723  static int call_netdevice_register_notifiers(struct notifier_block *nb,
1724  					     struct net_device *dev)
1725  {
1726  	int err;
1727  
1728  	err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1729  	err = notifier_to_errno(err);
1730  	if (err)
1731  		return err;
1732  
1733  	if (!(dev->flags & IFF_UP))
1734  		return 0;
1735  
1736  	call_netdevice_notifier(nb, NETDEV_UP, dev);
1737  	return 0;
1738  }
1739  
1740  static void call_netdevice_unregister_notifiers(struct notifier_block *nb,
1741  						struct net_device *dev)
1742  {
1743  	if (dev->flags & IFF_UP) {
1744  		call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1745  					dev);
1746  		call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1747  	}
1748  	call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1749  }
1750  
1751  static int call_netdevice_register_net_notifiers(struct notifier_block *nb,
1752  						 struct net *net)
1753  {
1754  	struct net_device *dev;
1755  	int err;
1756  
1757  	for_each_netdev(net, dev) {
1758  		err = call_netdevice_register_notifiers(nb, dev);
1759  		if (err)
1760  			goto rollback;
1761  	}
1762  	return 0;
1763  
1764  rollback:
1765  	for_each_netdev_continue_reverse(net, dev)
1766  		call_netdevice_unregister_notifiers(nb, dev);
1767  	return err;
1768  }
1769  
1770  static void call_netdevice_unregister_net_notifiers(struct notifier_block *nb,
1771  						    struct net *net)
1772  {
1773  	struct net_device *dev;
1774  
1775  	for_each_netdev(net, dev)
1776  		call_netdevice_unregister_notifiers(nb, dev);
1777  }
1778  
1779  static int dev_boot_phase = 1;
1780  
1781  /**
1782   * register_netdevice_notifier - register a network notifier block
1783   * @nb: notifier
1784   *
1785   * Register a notifier to be called when network device events occur.
1786   * The notifier passed is linked into the kernel structures and must
1787   * not be reused until it has been unregistered. A negative errno code
1788   * is returned on a failure.
1789   *
1790   * When registered all registration and up events are replayed
1791   * to the new notifier to allow device to have a race free
1792   * view of the network device list.
1793   */
1794  
1795  int register_netdevice_notifier(struct notifier_block *nb)
1796  {
1797  	struct net *net;
1798  	int err;
1799  
1800  	/* Close race with setup_net() and cleanup_net() */
1801  	down_write(&pernet_ops_rwsem);
1802  	rtnl_lock();
1803  	err = raw_notifier_chain_register(&netdev_chain, nb);
1804  	if (err)
1805  		goto unlock;
1806  	if (dev_boot_phase)
1807  		goto unlock;
1808  	for_each_net(net) {
1809  		err = call_netdevice_register_net_notifiers(nb, net);
1810  		if (err)
1811  			goto rollback;
1812  	}
1813  
1814  unlock:
1815  	rtnl_unlock();
1816  	up_write(&pernet_ops_rwsem);
1817  	return err;
1818  
1819  rollback:
1820  	for_each_net_continue_reverse(net)
1821  		call_netdevice_unregister_net_notifiers(nb, net);
1822  
1823  	raw_notifier_chain_unregister(&netdev_chain, nb);
1824  	goto unlock;
1825  }
1826  EXPORT_SYMBOL(register_netdevice_notifier);
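
/*
 * Editorial example (not part of dev.c): a minimal notifier that logs
 * events by name via netdev_cmd_to_name().  Because registration replays
 * NETDEV_REGISTER and NETDEV_UP for existing devices, the callback sees a
 * consistent view of the device list.  example_netdev_event() and
 * example_nb are hypothetical.
 */
static int example_netdev_event(struct notifier_block *nb,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);

	pr_info("%s: %s\n", netdev_name(dev), netdev_cmd_to_name(event));
	return NOTIFY_DONE;
}

static struct notifier_block example_nb = {
	.notifier_call = example_netdev_event,
};

/* Registered with register_netdevice_notifier(&example_nb) from module
 * init and removed with unregister_netdevice_notifier(&example_nb).
 */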
1827  
1828  /**
1829   * unregister_netdevice_notifier - unregister a network notifier block
1830   * @nb: notifier
1831   *
1832   * Unregister a notifier previously registered by
1833   * register_netdevice_notifier(). The notifier is unlinked from the
1834   * kernel structures and may then be reused. A negative errno code
1835   * is returned on a failure.
1836   *
1837   * After unregistering, unregister and down device events are synthesized
1838   * for all devices on the device list to the removed notifier to remove
1839   * the need for special case cleanup code.
1840   */
1841  
1842  int unregister_netdevice_notifier(struct notifier_block *nb)
1843  {
1844  	struct net *net;
1845  	int err;
1846  
1847  	/* Close race with setup_net() and cleanup_net() */
1848  	down_write(&pernet_ops_rwsem);
1849  	rtnl_lock();
1850  	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1851  	if (err)
1852  		goto unlock;
1853  
1854  	for_each_net(net)
1855  		call_netdevice_unregister_net_notifiers(nb, net);
1856  
1857  unlock:
1858  	rtnl_unlock();
1859  	up_write(&pernet_ops_rwsem);
1860  	return err;
1861  }
1862  EXPORT_SYMBOL(unregister_netdevice_notifier);
1863  
1864  static int __register_netdevice_notifier_net(struct net *net,
1865  					     struct notifier_block *nb,
1866  					     bool ignore_call_fail)
1867  {
1868  	int err;
1869  
1870  	err = raw_notifier_chain_register(&net->netdev_chain, nb);
1871  	if (err)
1872  		return err;
1873  	if (dev_boot_phase)
1874  		return 0;
1875  
1876  	err = call_netdevice_register_net_notifiers(nb, net);
1877  	if (err && !ignore_call_fail)
1878  		goto chain_unregister;
1879  
1880  	return 0;
1881  
1882  chain_unregister:
1883  	raw_notifier_chain_unregister(&net->netdev_chain, nb);
1884  	return err;
1885  }
1886  
1887  static int __unregister_netdevice_notifier_net(struct net *net,
1888  					       struct notifier_block *nb)
1889  {
1890  	int err;
1891  
1892  	err = raw_notifier_chain_unregister(&net->netdev_chain, nb);
1893  	if (err)
1894  		return err;
1895  
1896  	call_netdevice_unregister_net_notifiers(nb, net);
1897  	return 0;
1898  }
1899  
1900  /**
1901   * register_netdevice_notifier_net - register a per-netns network notifier block
1902   * @net: network namespace
1903   * @nb: notifier
1904   *
1905   * Register a notifier to be called when network device events occur.
1906   * The notifier passed is linked into the kernel structures and must
1907   * not be reused until it has been unregistered. A negative errno code
1908   * is returned on a failure.
1909   *
1910   * When registered all registration and up events are replayed
1911   * to the new notifier to allow device to have a race free
1912   * view of the network device list.
1913   */
1914  
1915  int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb)
1916  {
1917  	int err;
1918  
1919  	rtnl_lock();
1920  	err = __register_netdevice_notifier_net(net, nb, false);
1921  	rtnl_unlock();
1922  	return err;
1923  }
1924  EXPORT_SYMBOL(register_netdevice_notifier_net);
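
/*
 * Editorial example (not part of dev.c): the per-netns variant only fires
 * for devices in the given namespace, e.g. restricting the notifier block
 * from the previous sketch (example_nb, hypothetical) to init_net.
 */
static int example_register_for_init_net(void)
{
	return register_netdevice_notifier_net(&init_net, &example_nb);
}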
1925  
1926  /**
1927   * unregister_netdevice_notifier_net - unregister a per-netns
1928   *                                     network notifier block
1929   * @net: network namespace
1930   * @nb: notifier
1931   *
1932   * Unregister a notifier previously registered by
1933   * register_netdevice_notifier_net(). The notifier is unlinked from the
1934   * kernel structures and may then be reused. A negative errno code
1935   * is returned on a failure.
1936   *
1937   * After unregistering, unregister and down device events are synthesized
1938   * for all devices on the device list to the removed notifier to remove
1939   * the need for special case cleanup code.
1940   */
1941  
1942  int unregister_netdevice_notifier_net(struct net *net,
1943  				      struct notifier_block *nb)
1944  {
1945  	int err;
1946  
1947  	rtnl_lock();
1948  	err = __unregister_netdevice_notifier_net(net, nb);
1949  	rtnl_unlock();
1950  	return err;
1951  }
1952  EXPORT_SYMBOL(unregister_netdevice_notifier_net);
1953  
1954  static void __move_netdevice_notifier_net(struct net *src_net,
1955  					  struct net *dst_net,
1956  					  struct notifier_block *nb)
1957  {
1958  	__unregister_netdevice_notifier_net(src_net, nb);
1959  	__register_netdevice_notifier_net(dst_net, nb, true);
1960  }
1961  
1962  int register_netdevice_notifier_dev_net(struct net_device *dev,
1963  					struct notifier_block *nb,
1964  					struct netdev_net_notifier *nn)
1965  {
1966  	int err;
1967  
1968  	rtnl_lock();
1969  	err = __register_netdevice_notifier_net(dev_net(dev), nb, false);
1970  	if (!err) {
1971  		nn->nb = nb;
1972  		list_add(&nn->list, &dev->net_notifier_list);
1973  	}
1974  	rtnl_unlock();
1975  	return err;
1976  }
1977  EXPORT_SYMBOL(register_netdevice_notifier_dev_net);
1978  
1979  int unregister_netdevice_notifier_dev_net(struct net_device *dev,
1980  					  struct notifier_block *nb,
1981  					  struct netdev_net_notifier *nn)
1982  {
1983  	int err;
1984  
1985  	rtnl_lock();
1986  	list_del(&nn->list);
1987  	err = __unregister_netdevice_notifier_net(dev_net(dev), nb);
1988  	rtnl_unlock();
1989  	return err;
1990  }
1991  EXPORT_SYMBOL(unregister_netdevice_notifier_dev_net);
1992  
1993  static void move_netdevice_notifiers_dev_net(struct net_device *dev,
1994  					     struct net *net)
1995  {
1996  	struct netdev_net_notifier *nn;
1997  
1998  	list_for_each_entry(nn, &dev->net_notifier_list, list)
1999  		__move_netdevice_notifier_net(dev_net(dev), net, nn->nb);
2000  }
2001  
2002  /**
2003   *	call_netdevice_notifiers_info - call all network notifier blocks
2004   *	@val: value passed unmodified to notifier function
2005   *	@info: notifier information data
2006   *
2007   *	Call all network notifier blocks.  Parameters and return value
2008   *	are as for raw_notifier_call_chain().
2009   */
2010  
2011  int call_netdevice_notifiers_info(unsigned long val,
2012  				  struct netdev_notifier_info *info)
2013  {
2014  	struct net *net = dev_net(info->dev);
2015  	int ret;
2016  
2017  	ASSERT_RTNL();
2018  
2019  	/* Run per-netns notifier block chain first, then run the global one.
2020  	 * Hopefully, one day, the global one is going to be removed after
2021  	 * all notifier block registrators get converted to be per-netns.
2022  	 */
2023  	ret = raw_notifier_call_chain(&net->netdev_chain, val, info);
2024  	if (ret & NOTIFY_STOP_MASK)
2025  		return ret;
2026  	return raw_notifier_call_chain(&netdev_chain, val, info);
2027  }
2028  
2029  /**
2030   *	call_netdevice_notifiers_info_robust - call per-netns notifier blocks
2031   *	                                       and roll back on error
2032   *	@val_up: value passed unmodified to notifier function
2033   *	@val_down: value passed unmodified to the notifier function when
2034   *	           recovering from an error on @val_up
2035   *	@info: notifier information data
2036   *
2037   *	Call all per-netns network notifier blocks, but not notifier blocks on
2038   *	the global notifier chain. Parameters and return value are as for
2039   *	raw_notifier_call_chain_robust().
2040   */
2041  
2042  static int
2043  call_netdevice_notifiers_info_robust(unsigned long val_up,
2044  				     unsigned long val_down,
2045  				     struct netdev_notifier_info *info)
2046  {
2047  	struct net *net = dev_net(info->dev);
2048  
2049  	ASSERT_RTNL();
2050  
2051  	return raw_notifier_call_chain_robust(&net->netdev_chain,
2052  					      val_up, val_down, info);
2053  }
2054  
2055  static int call_netdevice_notifiers_extack(unsigned long val,
2056  					   struct net_device *dev,
2057  					   struct netlink_ext_ack *extack)
2058  {
2059  	struct netdev_notifier_info info = {
2060  		.dev = dev,
2061  		.extack = extack,
2062  	};
2063  
2064  	return call_netdevice_notifiers_info(val, &info);
2065  }
2066  
2067  /**
2068   *	call_netdevice_notifiers - call all network notifier blocks
2069   *      @val: value passed unmodified to notifier function
2070   *      @dev: net_device pointer passed unmodified to notifier function
2071   *
2072   *	Call all network notifier blocks.  Parameters and return value
2073   *	are as for raw_notifier_call_chain().
2074   */
2075  
2076  int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
2077  {
2078  	return call_netdevice_notifiers_extack(val, dev, NULL);
2079  }
2080  EXPORT_SYMBOL(call_netdevice_notifiers);
2081  
2082  /**
2083   *	call_netdevice_notifiers_mtu - call all network notifier blocks
2084   *	@val: value passed unmodified to notifier function
2085   *	@dev: net_device pointer passed unmodified to notifier function
2086   *	@arg: additional u32 argument passed to the notifier function
2087   *
2088   *	Call all network notifier blocks.  Parameters and return value
2089   *	are as for raw_notifier_call_chain().
2090   */
2091  static int call_netdevice_notifiers_mtu(unsigned long val,
2092  					struct net_device *dev, u32 arg)
2093  {
2094  	struct netdev_notifier_info_ext info = {
2095  		.info.dev = dev,
2096  		.ext.mtu = arg,
2097  	};
2098  
2099  	BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0);
2100  
2101  	return call_netdevice_notifiers_info(val, &info.info);
2102  }
2103  
2104  #ifdef CONFIG_NET_INGRESS
2105  static DEFINE_STATIC_KEY_FALSE(ingress_needed_key);
2106  
2107  void net_inc_ingress_queue(void)
2108  {
2109  	static_branch_inc(&ingress_needed_key);
2110  }
2111  EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
2112  
2113  void net_dec_ingress_queue(void)
2114  {
2115  	static_branch_dec(&ingress_needed_key);
2116  }
2117  EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
2118  #endif
2119  
2120  #ifdef CONFIG_NET_EGRESS
2121  static DEFINE_STATIC_KEY_FALSE(egress_needed_key);
2122  
2123  void net_inc_egress_queue(void)
2124  {
2125  	static_branch_inc(&egress_needed_key);
2126  }
2127  EXPORT_SYMBOL_GPL(net_inc_egress_queue);
2128  
2129  void net_dec_egress_queue(void)
2130  {
2131  	static_branch_dec(&egress_needed_key);
2132  }
2133  EXPORT_SYMBOL_GPL(net_dec_egress_queue);
2134  #endif
2135  
2136  DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
2137  EXPORT_SYMBOL(netstamp_needed_key);
2138  #ifdef CONFIG_JUMP_LABEL
2139  static atomic_t netstamp_needed_deferred;
2140  static atomic_t netstamp_wanted;
2141  static void netstamp_clear(struct work_struct *work)
2142  {
2143  	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
2144  	int wanted;
2145  
2146  	wanted = atomic_add_return(deferred, &netstamp_wanted);
2147  	if (wanted > 0)
2148  		static_branch_enable(&netstamp_needed_key);
2149  	else
2150  		static_branch_disable(&netstamp_needed_key);
2151  }
2152  static DECLARE_WORK(netstamp_work, netstamp_clear);
2153  #endif
2154  
2155  void net_enable_timestamp(void)
2156  {
2157  #ifdef CONFIG_JUMP_LABEL
2158  	int wanted = atomic_read(&netstamp_wanted);
2159  
2160  	while (wanted > 0) {
2161  		if (atomic_try_cmpxchg(&netstamp_wanted, &wanted, wanted + 1))
2162  			return;
2163  	}
2164  	atomic_inc(&netstamp_needed_deferred);
2165  	schedule_work(&netstamp_work);
2166  #else
2167  	static_branch_inc(&netstamp_needed_key);
2168  #endif
2169  }
2170  EXPORT_SYMBOL(net_enable_timestamp);
2171  
2172  void net_disable_timestamp(void)
2173  {
2174  #ifdef CONFIG_JUMP_LABEL
2175  	int wanted = atomic_read(&netstamp_wanted);
2176  
2177  	while (wanted > 1) {
2178  		if (atomic_try_cmpxchg(&netstamp_wanted, &wanted, wanted - 1))
2179  			return;
2180  	}
2181  	atomic_dec(&netstamp_needed_deferred);
2182  	schedule_work(&netstamp_work);
2183  #else
2184  	static_branch_dec(&netstamp_needed_key);
2185  #endif
2186  }
2187  EXPORT_SYMBOL(net_disable_timestamp);
2188  
2189  static inline void net_timestamp_set(struct sk_buff *skb)
2190  {
2191  	skb->tstamp = 0;
2192  	skb->mono_delivery_time = 0;
2193  	if (static_branch_unlikely(&netstamp_needed_key))
2194  		skb->tstamp = ktime_get_real();
2195  }
2196  
2197  #define net_timestamp_check(COND, SKB)				\
2198  	if (static_branch_unlikely(&netstamp_needed_key)) {	\
2199  		if ((COND) && !(SKB)->tstamp)			\
2200  			(SKB)->tstamp = ktime_get_real();	\
2201  	}							\
2202  
2203  bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
2204  {
2205  	return __is_skb_forwardable(dev, skb, true);
2206  }
2207  EXPORT_SYMBOL_GPL(is_skb_forwardable);
2208  
2209  static int __dev_forward_skb2(struct net_device *dev, struct sk_buff *skb,
2210  			      bool check_mtu)
2211  {
2212  	int ret = ____dev_forward_skb(dev, skb, check_mtu);
2213  
2214  	if (likely(!ret)) {
2215  		skb->protocol = eth_type_trans(skb, dev);
2216  		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
2217  	}
2218  
2219  	return ret;
2220  }
2221  
2222  int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
2223  {
2224  	return __dev_forward_skb2(dev, skb, true);
2225  }
2226  EXPORT_SYMBOL_GPL(__dev_forward_skb);
2227  
2228  /**
2229   * dev_forward_skb - loopback an skb to another netif
2230   *
2231   * @dev: destination network device
2232   * @skb: buffer to forward
2233   *
2234   * return values:
2235   *	NET_RX_SUCCESS	(no congestion)
2236   *	NET_RX_DROP     (packet was dropped, but freed)
2237   *
2238   * dev_forward_skb can be used for injecting an skb from the
2239   * start_xmit function of one device into the receive queue
2240   * of another device.
2241   *
2242   * The receiving device may be in another namespace, so
2243   * we have to clear all information in the skb that could
2244   * impact namespace isolation.
2245   */
2246  int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
2247  {
2248  	return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
2249  }
2250  EXPORT_SYMBOL_GPL(dev_forward_skb);
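
/*
 * Editorial example (not part of dev.c): a veth-style driver handing frames
 * from its ndo_start_xmit() to the peer's receive path.  struct example_priv
 * and the peer pointer are hypothetical; dev_forward_skb() consumes the skb
 * on both success and drop, so no explicit free is needed here.
 */
struct example_priv {
	struct net_device *peer;	/* assumed to be set at pair creation */
};

static netdev_tx_t example_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct example_priv *priv = netdev_priv(dev);

	if (dev_forward_skb(priv->peer, skb) != NET_RX_SUCCESS)
		dev->stats.tx_dropped++;

	return NETDEV_TX_OK;
}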
2251  
2252  int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb)
2253  {
2254  	return __dev_forward_skb2(dev, skb, false) ?: netif_rx_internal(skb);
2255  }
2256  
2257  static inline int deliver_skb(struct sk_buff *skb,
2258  			      struct packet_type *pt_prev,
2259  			      struct net_device *orig_dev)
2260  {
2261  	if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
2262  		return -ENOMEM;
2263  	refcount_inc(&skb->users);
2264  	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2265  }
2266  
2267  static inline void deliver_ptype_list_skb(struct sk_buff *skb,
2268  					  struct packet_type **pt,
2269  					  struct net_device *orig_dev,
2270  					  __be16 type,
2271  					  struct list_head *ptype_list)
2272  {
2273  	struct packet_type *ptype, *pt_prev = *pt;
2274  
2275  	list_for_each_entry_rcu(ptype, ptype_list, list) {
2276  		if (ptype->type != type)
2277  			continue;
2278  		if (pt_prev)
2279  			deliver_skb(skb, pt_prev, orig_dev);
2280  		pt_prev = ptype;
2281  	}
2282  	*pt = pt_prev;
2283  }
2284  
2285  static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
2286  {
2287  	if (!ptype->af_packet_priv || !skb->sk)
2288  		return false;
2289  
2290  	if (ptype->id_match)
2291  		return ptype->id_match(ptype, skb->sk);
2292  	else if ((struct sock *)ptype->af_packet_priv == skb->sk)
2293  		return true;
2294  
2295  	return false;
2296  }
2297  
2298  /**
2299   * dev_nit_active - return true if any network interface taps are in use
2300   *
2301   * @dev: network device to check for the presence of taps
2302   */
2303  bool dev_nit_active(struct net_device *dev)
2304  {
2305  	return !list_empty(&ptype_all) || !list_empty(&dev->ptype_all);
2306  }
2307  EXPORT_SYMBOL_GPL(dev_nit_active);
2308  
2309  /*
2310   *	Support routine. Sends outgoing frames to any network
2311   *	taps currently in use.
2312   */
2313  
2314  void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
2315  {
2316  	struct packet_type *ptype;
2317  	struct sk_buff *skb2 = NULL;
2318  	struct packet_type *pt_prev = NULL;
2319  	struct list_head *ptype_list = &ptype_all;
2320  
2321  	rcu_read_lock();
2322  again:
2323  	list_for_each_entry_rcu(ptype, ptype_list, list) {
2324  		if (READ_ONCE(ptype->ignore_outgoing))
2325  			continue;
2326  
2327  		/* Never send packets back to the socket
2328  		 * they originated from - MvS (miquels@drinkel.ow.org)
2329  		 */
2330  		if (skb_loop_sk(ptype, skb))
2331  			continue;
2332  
2333  		if (pt_prev) {
2334  			deliver_skb(skb2, pt_prev, skb->dev);
2335  			pt_prev = ptype;
2336  			continue;
2337  		}
2338  
2339  		/* need to clone skb, done only once */
2340  		skb2 = skb_clone(skb, GFP_ATOMIC);
2341  		if (!skb2)
2342  			goto out_unlock;
2343  
2344  		net_timestamp_set(skb2);
2345  
2346  		/* skb->nh should be correctly
2347  		 * set by sender, so that the second statement is
2348  		 * just protection against buggy protocols.
2349  		 */
2350  		skb_reset_mac_header(skb2);
2351  
2352  		if (skb_network_header(skb2) < skb2->data ||
2353  		    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
2354  			net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
2355  					     ntohs(skb2->protocol),
2356  					     dev->name);
2357  			skb_reset_network_header(skb2);
2358  		}
2359  
2360  		skb2->transport_header = skb2->network_header;
2361  		skb2->pkt_type = PACKET_OUTGOING;
2362  		pt_prev = ptype;
2363  	}
2364  
2365  	if (ptype_list == &ptype_all) {
2366  		ptype_list = &dev->ptype_all;
2367  		goto again;
2368  	}
2369  out_unlock:
2370  	if (pt_prev) {
2371  		if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC))
2372  			pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
2373  		else
2374  			kfree_skb(skb2);
2375  	}
2376  	rcu_read_unlock();
2377  }
2378  EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
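
/*
 * Editorial example (not part of dev.c): a transmit path that bypasses the
 * normal xmit code still feeds packet taps (e.g. tcpdump) by hand, exactly
 * as the core does before handing an skb to ndo_start_xmit().
 * example_direct_xmit() is hypothetical.
 */
static void example_direct_xmit(struct sk_buff *skb, struct net_device *dev)
{
	if (dev_nit_active(dev))
		dev_queue_xmit_nit(skb, dev);

	/* ...hand the skb to the hardware here... */
}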
2379  
2380  /**
2381   * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
2382   * @dev: Network device
2383   * @txq: number of queues available
2384   *
2385   * If real_num_tx_queues is changed the tc mappings may no longer be
2386   * valid. To resolve this, verify the tc mapping remains valid and, if
2387   * not, NULL the mapping. With no priorities mapping to this
2388   * offset/count pair it will no longer be used. In the worst case, if
2389   * TC0 is invalid, nothing can be done, so disable priority mappings. It
2390   * is expected that drivers will fix this mapping if they can before
2391   * calling netif_set_real_num_tx_queues.
2392   */
2393  static void netif_setup_tc(struct net_device *dev, unsigned int txq)
2394  {
2395  	int i;
2396  	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
2397  
2398  	/* If TC0 is invalidated disable TC mapping */
2399  	if (tc->offset + tc->count > txq) {
2400  		netdev_warn(dev, "Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
2401  		dev->num_tc = 0;
2402  		return;
2403  	}
2404  
2405  	/* Invalidated prio to tc mappings set to TC0 */
2406  	for (i = 1; i < TC_BITMASK + 1; i++) {
2407  		int q = netdev_get_prio_tc_map(dev, i);
2408  
2409  		tc = &dev->tc_to_txq[q];
2410  		if (tc->offset + tc->count > txq) {
2411  			netdev_warn(dev, "Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
2412  				    i, q);
2413  			netdev_set_prio_tc_map(dev, i, 0);
2414  		}
2415  	}
2416  }
2417  
2418  int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
2419  {
2420  	if (dev->num_tc) {
2421  		struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
2422  		int i;
2423  
2424  		/* walk through the TCs and see if it falls into any of them */
2425  		for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
2426  			if ((txq - tc->offset) < tc->count)
2427  				return i;
2428  		}
2429  
2430  		/* didn't find it, just return -1 to indicate no match */
2431  		return -1;
2432  	}
2433  
2434  	return 0;
2435  }
2436  EXPORT_SYMBOL(netdev_txq_to_tc);
2437  
2438  #ifdef CONFIG_XPS
2439  static struct static_key xps_needed __read_mostly;
2440  static struct static_key xps_rxqs_needed __read_mostly;
2441  static DEFINE_MUTEX(xps_map_mutex);
2442  #define xmap_dereference(P)		\
2443  	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
2444  
2445  static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
2446  			     struct xps_dev_maps *old_maps, int tci, u16 index)
2447  {
2448  	struct xps_map *map = NULL;
2449  	int pos;
2450  
2451  	map = xmap_dereference(dev_maps->attr_map[tci]);
2452  	if (!map)
2453  		return false;
2454  
2455  	for (pos = map->len; pos--;) {
2456  		if (map->queues[pos] != index)
2457  			continue;
2458  
2459  		if (map->len > 1) {
2460  			map->queues[pos] = map->queues[--map->len];
2461  			break;
2462  		}
2463  
2464  		if (old_maps)
2465  			RCU_INIT_POINTER(old_maps->attr_map[tci], NULL);
2466  		RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
2467  		kfree_rcu(map, rcu);
2468  		return false;
2469  	}
2470  
2471  	return true;
2472  }
2473  
2474  static bool remove_xps_queue_cpu(struct net_device *dev,
2475  				 struct xps_dev_maps *dev_maps,
2476  				 int cpu, u16 offset, u16 count)
2477  {
2478  	int num_tc = dev_maps->num_tc;
2479  	bool active = false;
2480  	int tci;
2481  
2482  	for (tci = cpu * num_tc; num_tc--; tci++) {
2483  		int i, j;
2484  
2485  		for (i = count, j = offset; i--; j++) {
2486  			if (!remove_xps_queue(dev_maps, NULL, tci, j))
2487  				break;
2488  		}
2489  
2490  		active |= i < 0;
2491  	}
2492  
2493  	return active;
2494  }
2495  
2496  static void reset_xps_maps(struct net_device *dev,
2497  			   struct xps_dev_maps *dev_maps,
2498  			   enum xps_map_type type)
2499  {
2500  	static_key_slow_dec_cpuslocked(&xps_needed);
2501  	if (type == XPS_RXQS)
2502  		static_key_slow_dec_cpuslocked(&xps_rxqs_needed);
2503  
2504  	RCU_INIT_POINTER(dev->xps_maps[type], NULL);
2505  
2506  	kfree_rcu(dev_maps, rcu);
2507  }
2508  
2509  static void clean_xps_maps(struct net_device *dev, enum xps_map_type type,
2510  			   u16 offset, u16 count)
2511  {
2512  	struct xps_dev_maps *dev_maps;
2513  	bool active = false;
2514  	int i, j;
2515  
2516  	dev_maps = xmap_dereference(dev->xps_maps[type]);
2517  	if (!dev_maps)
2518  		return;
2519  
2520  	for (j = 0; j < dev_maps->nr_ids; j++)
2521  		active |= remove_xps_queue_cpu(dev, dev_maps, j, offset, count);
2522  	if (!active)
2523  		reset_xps_maps(dev, dev_maps, type);
2524  
2525  	if (type == XPS_CPUS) {
2526  		for (i = offset + (count - 1); count--; i--)
2527  			netdev_queue_numa_node_write(
2528  				netdev_get_tx_queue(dev, i), NUMA_NO_NODE);
2529  	}
2530  }
2531  
2532  static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
2533  				   u16 count)
2534  {
2535  	if (!static_key_false(&xps_needed))
2536  		return;
2537  
2538  	cpus_read_lock();
2539  	mutex_lock(&xps_map_mutex);
2540  
2541  	if (static_key_false(&xps_rxqs_needed))
2542  		clean_xps_maps(dev, XPS_RXQS, offset, count);
2543  
2544  	clean_xps_maps(dev, XPS_CPUS, offset, count);
2545  
2546  	mutex_unlock(&xps_map_mutex);
2547  	cpus_read_unlock();
2548  }
2549  
2550  static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
2551  {
2552  	netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
2553  }
2554  
2555  static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index,
2556  				      u16 index, bool is_rxqs_map)
2557  {
2558  	struct xps_map *new_map;
2559  	int alloc_len = XPS_MIN_MAP_ALLOC;
2560  	int i, pos;
2561  
2562  	for (pos = 0; map && pos < map->len; pos++) {
2563  		if (map->queues[pos] != index)
2564  			continue;
2565  		return map;
2566  	}
2567  
2568  	/* Need to add tx-queue to this CPU's/rx-queue's existing map */
2569  	if (map) {
2570  		if (pos < map->alloc_len)
2571  			return map;
2572  
2573  		alloc_len = map->alloc_len * 2;
2574  	}
2575  
2576  	/* Need to allocate new map to store tx-queue on this CPU's/rx-queue's
2577  	 *  map
2578  	 */
2579  	if (is_rxqs_map)
2580  		new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL);
2581  	else
2582  		new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2583  				       cpu_to_node(attr_index));
2584  	if (!new_map)
2585  		return NULL;
2586  
2587  	for (i = 0; i < pos; i++)
2588  		new_map->queues[i] = map->queues[i];
2589  	new_map->alloc_len = alloc_len;
2590  	new_map->len = pos;
2591  
2592  	return new_map;
2593  }
2594  
2595  /* Copy xps maps at a given index */
2596  static void xps_copy_dev_maps(struct xps_dev_maps *dev_maps,
2597  			      struct xps_dev_maps *new_dev_maps, int index,
2598  			      int tc, bool skip_tc)
2599  {
2600  	int i, tci = index * dev_maps->num_tc;
2601  	struct xps_map *map;
2602  
2603  	/* copy maps belonging to foreign traffic classes */
2604  	for (i = 0; i < dev_maps->num_tc; i++, tci++) {
2605  		if (i == tc && skip_tc)
2606  			continue;
2607  
2608  		/* fill in the new device map from the old device map */
2609  		map = xmap_dereference(dev_maps->attr_map[tci]);
2610  		RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2611  	}
2612  }
2613  
2614  /* Must be called under cpus_read_lock */
2615  int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
2616  			  u16 index, enum xps_map_type type)
2617  {
2618  	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL, *old_dev_maps = NULL;
2619  	const unsigned long *online_mask = NULL;
2620  	bool active = false, copy = false;
2621  	int i, j, tci, numa_node_id = -2;
2622  	int maps_sz, num_tc = 1, tc = 0;
2623  	struct xps_map *map, *new_map;
2624  	unsigned int nr_ids;
2625  
2626  	WARN_ON_ONCE(index >= dev->num_tx_queues);
2627  
2628  	if (dev->num_tc) {
2629  		/* Do not allow XPS on subordinate device directly */
2630  		num_tc = dev->num_tc;
2631  		if (num_tc < 0)
2632  			return -EINVAL;
2633  
2634  		/* If queue belongs to subordinate dev use its map */
2635  		dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
2636  
2637  		tc = netdev_txq_to_tc(dev, index);
2638  		if (tc < 0)
2639  			return -EINVAL;
2640  	}
2641  
2642  	mutex_lock(&xps_map_mutex);
2643  
2644  	dev_maps = xmap_dereference(dev->xps_maps[type]);
2645  	if (type == XPS_RXQS) {
2646  		maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues);
2647  		nr_ids = dev->num_rx_queues;
2648  	} else {
2649  		maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc);
2650  		if (num_possible_cpus() > 1)
2651  			online_mask = cpumask_bits(cpu_online_mask);
2652  		nr_ids = nr_cpu_ids;
2653  	}
2654  
2655  	if (maps_sz < L1_CACHE_BYTES)
2656  		maps_sz = L1_CACHE_BYTES;
2657  
2658  	/* The old dev_maps could be larger or smaller than the one we're
2659  	 * setting up now, as dev->num_tc or nr_ids could have been updated in
2660  	 * between. We could try to be smart, but let's be safe instead and only
2661  	 * copy foreign traffic classes if the two map sizes match.
2662  	 */
2663  	if (dev_maps &&
2664  	    dev_maps->num_tc == num_tc && dev_maps->nr_ids == nr_ids)
2665  		copy = true;
2666  
2667  	/* allocate memory for queue storage */
2668  	for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids),
2669  	     j < nr_ids;) {
2670  		if (!new_dev_maps) {
2671  			new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2672  			if (!new_dev_maps) {
2673  				mutex_unlock(&xps_map_mutex);
2674  				return -ENOMEM;
2675  			}
2676  
2677  			new_dev_maps->nr_ids = nr_ids;
2678  			new_dev_maps->num_tc = num_tc;
2679  		}
2680  
2681  		tci = j * num_tc + tc;
2682  		map = copy ? xmap_dereference(dev_maps->attr_map[tci]) : NULL;
2683  
2684  		map = expand_xps_map(map, j, index, type == XPS_RXQS);
2685  		if (!map)
2686  			goto error;
2687  
2688  		RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2689  	}
2690  
2691  	if (!new_dev_maps)
2692  		goto out_no_new_maps;
2693  
2694  	if (!dev_maps) {
2695  		/* Increment static keys at most once per type */
2696  		static_key_slow_inc_cpuslocked(&xps_needed);
2697  		if (type == XPS_RXQS)
2698  			static_key_slow_inc_cpuslocked(&xps_rxqs_needed);
2699  	}
2700  
2701  	for (j = 0; j < nr_ids; j++) {
2702  		bool skip_tc = false;
2703  
2704  		tci = j * num_tc + tc;
2705  		if (netif_attr_test_mask(j, mask, nr_ids) &&
2706  		    netif_attr_test_online(j, online_mask, nr_ids)) {
2707  			/* add tx-queue to CPU/rx-queue maps */
2708  			int pos = 0;
2709  
2710  			skip_tc = true;
2711  
2712  			map = xmap_dereference(new_dev_maps->attr_map[tci]);
2713  			while ((pos < map->len) && (map->queues[pos] != index))
2714  				pos++;
2715  
2716  			if (pos == map->len)
2717  				map->queues[map->len++] = index;
2718  #ifdef CONFIG_NUMA
2719  			if (type == XPS_CPUS) {
2720  				if (numa_node_id == -2)
2721  					numa_node_id = cpu_to_node(j);
2722  				else if (numa_node_id != cpu_to_node(j))
2723  					numa_node_id = -1;
2724  			}
2725  #endif
2726  		}
2727  
2728  		if (copy)
2729  			xps_copy_dev_maps(dev_maps, new_dev_maps, j, tc,
2730  					  skip_tc);
2731  	}
2732  
2733  	rcu_assign_pointer(dev->xps_maps[type], new_dev_maps);
2734  
2735  	/* Cleanup old maps */
2736  	if (!dev_maps)
2737  		goto out_no_old_maps;
2738  
2739  	for (j = 0; j < dev_maps->nr_ids; j++) {
2740  		for (i = num_tc, tci = j * dev_maps->num_tc; i--; tci++) {
2741  			map = xmap_dereference(dev_maps->attr_map[tci]);
2742  			if (!map)
2743  				continue;
2744  
2745  			if (copy) {
2746  				new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
2747  				if (map == new_map)
2748  					continue;
2749  			}
2750  
2751  			RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
2752  			kfree_rcu(map, rcu);
2753  		}
2754  	}
2755  
2756  	old_dev_maps = dev_maps;
2757  
2758  out_no_old_maps:
2759  	dev_maps = new_dev_maps;
2760  	active = true;
2761  
2762  out_no_new_maps:
2763  	if (type == XPS_CPUS)
2764  		/* update Tx queue numa node */
2765  		netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2766  					     (numa_node_id >= 0) ?
2767  					     numa_node_id : NUMA_NO_NODE);
2768  
2769  	if (!dev_maps)
2770  		goto out_no_maps;
2771  
2772  	/* removes tx-queue from unused CPUs/rx-queues */
2773  	for (j = 0; j < dev_maps->nr_ids; j++) {
2774  		tci = j * dev_maps->num_tc;
2775  
2776  		for (i = 0; i < dev_maps->num_tc; i++, tci++) {
2777  			if (i == tc &&
2778  			    netif_attr_test_mask(j, mask, dev_maps->nr_ids) &&
2779  			    netif_attr_test_online(j, online_mask, dev_maps->nr_ids))
2780  				continue;
2781  
2782  			active |= remove_xps_queue(dev_maps,
2783  						   copy ? old_dev_maps : NULL,
2784  						   tci, index);
2785  		}
2786  	}
2787  
2788  	if (old_dev_maps)
2789  		kfree_rcu(old_dev_maps, rcu);
2790  
2791  	/* free map if not active */
2792  	if (!active)
2793  		reset_xps_maps(dev, dev_maps, type);
2794  
2795  out_no_maps:
2796  	mutex_unlock(&xps_map_mutex);
2797  
2798  	return 0;
2799  error:
2800  	/* remove any maps that we added */
2801  	for (j = 0; j < nr_ids; j++) {
2802  		for (i = num_tc, tci = j * num_tc; i--; tci++) {
2803  			new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
2804  			map = copy ?
2805  			      xmap_dereference(dev_maps->attr_map[tci]) :
2806  			      NULL;
2807  			if (new_map && new_map != map)
2808  				kfree(new_map);
2809  		}
2810  	}
2811  
2812  	mutex_unlock(&xps_map_mutex);
2813  
2814  	kfree(new_dev_maps);
2815  	return -ENOMEM;
2816  }
2817  EXPORT_SYMBOL_GPL(__netif_set_xps_queue);
2818  
2819  int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2820  			u16 index)
2821  {
2822  	int ret;
2823  
2824  	cpus_read_lock();
2825  	ret =  __netif_set_xps_queue(dev, cpumask_bits(mask), index, XPS_CPUS);
2826  	cpus_read_unlock();
2827  
2828  	return ret;
2829  }
2830  EXPORT_SYMBOL(netif_set_xps_queue);
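
/*
 * Editorial example (not part of dev.c): steering TX queue @qid to a single
 * CPU with XPS, equivalent to writing a one-bit mask to
 * /sys/class/net/<dev>/queues/tx-<qid>/xps_cpus.
 * example_pin_queue_to_cpu() is hypothetical.
 */
static int example_pin_queue_to_cpu(struct net_device *dev, u16 qid, int cpu)
{
	cpumask_var_t mask;
	int err;

	if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_set_cpu(cpu, mask);
	err = netif_set_xps_queue(dev, mask, qid);
	free_cpumask_var(mask);

	return err;
}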
2831  
2832  #endif
2833  static void netdev_unbind_all_sb_channels(struct net_device *dev)
2834  {
2835  	struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
2836  
2837  	/* Unbind any subordinate channels */
2838  	while (txq-- != &dev->_tx[0]) {
2839  		if (txq->sb_dev)
2840  			netdev_unbind_sb_channel(dev, txq->sb_dev);
2841  	}
2842  }
2843  
2844  void netdev_reset_tc(struct net_device *dev)
2845  {
2846  #ifdef CONFIG_XPS
2847  	netif_reset_xps_queues_gt(dev, 0);
2848  #endif
2849  	netdev_unbind_all_sb_channels(dev);
2850  
2851  	/* Reset TC configuration of device */
2852  	dev->num_tc = 0;
2853  	memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
2854  	memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
2855  }
2856  EXPORT_SYMBOL(netdev_reset_tc);
2857  
2858  int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
2859  {
2860  	if (tc >= dev->num_tc)
2861  		return -EINVAL;
2862  
2863  #ifdef CONFIG_XPS
2864  	netif_reset_xps_queues(dev, offset, count);
2865  #endif
2866  	dev->tc_to_txq[tc].count = count;
2867  	dev->tc_to_txq[tc].offset = offset;
2868  	return 0;
2869  }
2870  EXPORT_SYMBOL(netdev_set_tc_queue);
2871  
2872  int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
2873  {
2874  	if (num_tc > TC_MAX_QUEUE)
2875  		return -EINVAL;
2876  
2877  #ifdef CONFIG_XPS
2878  	netif_reset_xps_queues_gt(dev, 0);
2879  #endif
2880  	netdev_unbind_all_sb_channels(dev);
2881  
2882  	dev->num_tc = num_tc;
2883  	return 0;
2884  }
2885  EXPORT_SYMBOL(netdev_set_num_tc);
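
/*
 * Editorial example (not part of dev.c): a driver exposing two traffic
 * classes over eight TX queues, TC0 on queues 0-3 and TC1 on queues 4-7.
 * It assumes the device really has at least eight real TX queues;
 * example_setup_tc() is hypothetical.
 */
static int example_setup_tc(struct net_device *dev)
{
	int err;

	err = netdev_set_num_tc(dev, 2);
	if (err)
		return err;

	err = netdev_set_tc_queue(dev, 0, 4, 0);	/* tc 0: 4 queues at offset 0 */
	if (err)
		return err;

	return netdev_set_tc_queue(dev, 1, 4, 4);	/* tc 1: 4 queues at offset 4 */
}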
2886  
2887  void netdev_unbind_sb_channel(struct net_device *dev,
2888  			      struct net_device *sb_dev)
2889  {
2890  	struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
2891  
2892  #ifdef CONFIG_XPS
2893  	netif_reset_xps_queues_gt(sb_dev, 0);
2894  #endif
2895  	memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq));
2896  	memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map));
2897  
2898  	while (txq-- != &dev->_tx[0]) {
2899  		if (txq->sb_dev == sb_dev)
2900  			txq->sb_dev = NULL;
2901  	}
2902  }
2903  EXPORT_SYMBOL(netdev_unbind_sb_channel);
2904  
2905  int netdev_bind_sb_channel_queue(struct net_device *dev,
2906  				 struct net_device *sb_dev,
2907  				 u8 tc, u16 count, u16 offset)
2908  {
2909  	/* Make certain the sb_dev and dev are already configured */
2910  	if (sb_dev->num_tc >= 0 || tc >= dev->num_tc)
2911  		return -EINVAL;
2912  
2913  	/* We cannot hand out queues we don't have */
2914  	if ((offset + count) > dev->real_num_tx_queues)
2915  		return -EINVAL;
2916  
2917  	/* Record the mapping */
2918  	sb_dev->tc_to_txq[tc].count = count;
2919  	sb_dev->tc_to_txq[tc].offset = offset;
2920  
2921  	/* Provide a way for Tx queue to find the tc_to_txq map or
2922  	 * XPS map for itself.
2923  	 */
2924  	while (count--)
2925  		netdev_get_tx_queue(dev, count + offset)->sb_dev = sb_dev;
2926  
2927  	return 0;
2928  }
2929  EXPORT_SYMBOL(netdev_bind_sb_channel_queue);
2930  
2931  int netdev_set_sb_channel(struct net_device *dev, u16 channel)
2932  {
2933  	/* Do not use a multiqueue device to represent a subordinate channel */
2934  	if (netif_is_multiqueue(dev))
2935  		return -ENODEV;
2936  
2937  	/* We allow channels 1 - 32767 to be used for subordinate channels.
2938  	 * Channel 0 is meant to be "native" mode and used only to represent
2939  	 * the main root device. We allow writing 0 to reset the device back
2940  	 * to normal mode after being used as a subordinate channel.
2941  	 */
2942  	if (channel > S16_MAX)
2943  		return -EINVAL;
2944  
2945  	dev->num_tc = -channel;
2946  
2947  	return 0;
2948  }
2949  EXPORT_SYMBOL(netdev_set_sb_channel);
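
/*
 * Editorial example (not part of dev.c): dedicating four real TX queues of
 * @lower (offset 8) to the subordinate device @upper, macvlan-offload
 * style.  It assumes @lower already has its traffic classes configured and
 * at least twelve real TX queues; example_bind_subordinate() is
 * hypothetical.
 */
static int example_bind_subordinate(struct net_device *lower,
				    struct net_device *upper)
{
	int err;

	err = netdev_set_sb_channel(upper, 1);	/* mark @upper as channel 1 */
	if (err)
		return err;

	return netdev_bind_sb_channel_queue(lower, upper, 0, 4, 8);
}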
2950  
2951  /*
2952   * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2953   * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
2954   */
2955  int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2956  {
2957  	bool disabling;
2958  	int rc;
2959  
2960  	disabling = txq < dev->real_num_tx_queues;
2961  
2962  	if (txq < 1 || txq > dev->num_tx_queues)
2963  		return -EINVAL;
2964  
2965  	if (dev->reg_state == NETREG_REGISTERED ||
2966  	    dev->reg_state == NETREG_UNREGISTERING) {
2967  		ASSERT_RTNL();
2968  
2969  		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2970  						  txq);
2971  		if (rc)
2972  			return rc;
2973  
2974  		if (dev->num_tc)
2975  			netif_setup_tc(dev, txq);
2976  
2977  		dev_qdisc_change_real_num_tx(dev, txq);
2978  
2979  		dev->real_num_tx_queues = txq;
2980  
2981  		if (disabling) {
2982  			synchronize_net();
2983  			qdisc_reset_all_tx_gt(dev, txq);
2984  #ifdef CONFIG_XPS
2985  			netif_reset_xps_queues_gt(dev, txq);
2986  #endif
2987  		}
2988  	} else {
2989  		dev->real_num_tx_queues = txq;
2990  	}
2991  
2992  	return 0;
2993  }
2994  EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2995  
2996  #ifdef CONFIG_SYSFS
2997  /**
2998   *	netif_set_real_num_rx_queues - set actual number of RX queues used
2999   *	@dev: Network device
3000   *	@rxq: Actual number of RX queues
3001   *
3002   *	This must be called either with the rtnl_lock held or before
3003   *	registration of the net device.  Returns 0 on success, or a
3004   *	negative error code.  If called before registration, it always
3005   *	succeeds.
3006   */
3007  int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
3008  {
3009  	int rc;
3010  
3011  	if (rxq < 1 || rxq > dev->num_rx_queues)
3012  		return -EINVAL;
3013  
3014  	if (dev->reg_state == NETREG_REGISTERED) {
3015  		ASSERT_RTNL();
3016  
3017  		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
3018  						  rxq);
3019  		if (rc)
3020  			return rc;
3021  	}
3022  
3023  	dev->real_num_rx_queues = rxq;
3024  	return 0;
3025  }
3026  EXPORT_SYMBOL(netif_set_real_num_rx_queues);
3027  #endif
3028  
3029  /**
3030   *	netif_set_real_num_queues - set actual number of RX and TX queues used
3031   *	@dev: Network device
3032   *	@txq: Actual number of TX queues
3033   *	@rxq: Actual number of RX queues
3034   *
3035   *	Set the real number of both TX and RX queues.
3036   *	Does nothing if the number of queues is already correct.
3037   */
3038  int netif_set_real_num_queues(struct net_device *dev,
3039  			      unsigned int txq, unsigned int rxq)
3040  {
3041  	unsigned int old_rxq = dev->real_num_rx_queues;
3042  	int err;
3043  
3044  	if (txq < 1 || txq > dev->num_tx_queues ||
3045  	    rxq < 1 || rxq > dev->num_rx_queues)
3046  		return -EINVAL;
3047  
3048  	/* Start from increases, so the error path only does decreases -
3049  	 * decreases can't fail.
3050  	 */
3051  	if (rxq > dev->real_num_rx_queues) {
3052  		err = netif_set_real_num_rx_queues(dev, rxq);
3053  		if (err)
3054  			return err;
3055  	}
3056  	if (txq > dev->real_num_tx_queues) {
3057  		err = netif_set_real_num_tx_queues(dev, txq);
3058  		if (err)
3059  			goto undo_rx;
3060  	}
3061  	if (rxq < dev->real_num_rx_queues)
3062  		WARN_ON(netif_set_real_num_rx_queues(dev, rxq));
3063  	if (txq < dev->real_num_tx_queues)
3064  		WARN_ON(netif_set_real_num_tx_queues(dev, txq));
3065  
3066  	return 0;
3067  undo_rx:
3068  	WARN_ON(netif_set_real_num_rx_queues(dev, old_rxq));
3069  	return err;
3070  }
3071  EXPORT_SYMBOL(netif_set_real_num_queues);
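
/*
 * Editorial example (not part of dev.c): a driver probed with
 * num_tx/rx_queues sized for the maximum telling the stack how many queues
 * the current channel configuration actually uses.
 * example_apply_channels() is hypothetical.
 */
static int example_apply_channels(struct net_device *dev,
				  unsigned int txq, unsigned int rxq)
{
	ASSERT_RTNL();
	return netif_set_real_num_queues(dev, txq, rxq);
}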
3072  
3073  /**
3074   * netif_set_tso_max_size() - set the max size of TSO frames supported
3075   * @dev:	netdev to update
3076   * @size:	max skb->len of a TSO frame
3077   *
3078   * Set the limit on the size of TSO super-frames the device can handle.
3079   * Unless explicitly set the stack will assume the value of
3080   * %GSO_LEGACY_MAX_SIZE.
3081   */
3082  void netif_set_tso_max_size(struct net_device *dev, unsigned int size)
3083  {
3084  	dev->tso_max_size = min(GSO_MAX_SIZE, size);
3085  	if (size < READ_ONCE(dev->gso_max_size))
3086  		netif_set_gso_max_size(dev, size);
3087  	if (size < READ_ONCE(dev->gso_ipv4_max_size))
3088  		netif_set_gso_ipv4_max_size(dev, size);
3089  }
3090  EXPORT_SYMBOL(netif_set_tso_max_size);
3091  
3092  /**
3093   * netif_set_tso_max_segs() - set the max number of segs supported for TSO
3094   * @dev:	netdev to update
3095   * @segs:	max number of TCP segments
3096   *
3097   * Set the limit on the number of TCP segments the device can generate from
3098   * a single TSO super-frame.
3099   * Unless explicitly set the stack will assume the value of %GSO_MAX_SEGS.
3100   */
3101  void netif_set_tso_max_segs(struct net_device *dev, unsigned int segs)
3102  {
3103  	dev->tso_max_segs = segs;
3104  	if (segs < READ_ONCE(dev->gso_max_segs))
3105  		netif_set_gso_max_segs(dev, segs);
3106  }
3107  EXPORT_SYMBOL(netif_set_tso_max_segs);
3108  
3109  /**
3110   * netif_inherit_tso_max() - copy all TSO limits from a lower device to an upper
3111   * @to:		netdev to update
3112   * @from:	netdev from which to copy the limits
3113   */
3114  void netif_inherit_tso_max(struct net_device *to, const struct net_device *from)
3115  {
3116  	netif_set_tso_max_size(to, from->tso_max_size);
3117  	netif_set_tso_max_segs(to, from->tso_max_segs);
3118  }
3119  EXPORT_SYMBOL(netif_inherit_tso_max);
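
/*
 * Editorial example (not part of dev.c): a driver whose DMA engine might
 * handle at most 64 kB per TSO frame in no more than 32 descriptors could
 * set its limits at probe time.  example_set_tso_limits() is hypothetical.
 */
static void example_set_tso_limits(struct net_device *dev)
{
	netif_set_tso_max_size(dev, 65536);
	netif_set_tso_max_segs(dev, 32);
}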
3120  
3121  /**
3122   * netif_get_num_default_rss_queues - default number of RSS queues
3123   *
3124   * Default value is the number of physical cores if there are only 1 or 2, or
3125   * divided by 2 if there are more.
3126   */
3127  int netif_get_num_default_rss_queues(void)
3128  {
3129  	cpumask_var_t cpus;
3130  	int cpu, count = 0;
3131  
3132  	if (unlikely(is_kdump_kernel() || !zalloc_cpumask_var(&cpus, GFP_KERNEL)))
3133  		return 1;
3134  
3135  	cpumask_copy(cpus, cpu_online_mask);
3136  	for_each_cpu(cpu, cpus) {
3137  		++count;
3138  		cpumask_andnot(cpus, cpus, topology_sibling_cpumask(cpu));
3139  	}
3140  	free_cpumask_var(cpus);
3141  
3142  	return count > 2 ? DIV_ROUND_UP(count, 2) : count;
3143  }
3144  EXPORT_SYMBOL(netif_get_num_default_rss_queues);
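
/*
 * Editorial example (not part of dev.c): sizing RSS/RX queues at probe
 * time, capped by what the hardware offers.  example_pick_rx_queues() and
 * hw_max_queues are hypothetical.
 */
static unsigned int example_pick_rx_queues(unsigned int hw_max_queues)
{
	return min_t(unsigned int, hw_max_queues,
		     netif_get_num_default_rss_queues());
}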
3145  
3146  static void __netif_reschedule(struct Qdisc *q)
3147  {
3148  	struct softnet_data *sd;
3149  	unsigned long flags;
3150  
3151  	local_irq_save(flags);
3152  	sd = this_cpu_ptr(&softnet_data);
3153  	q->next_sched = NULL;
3154  	*sd->output_queue_tailp = q;
3155  	sd->output_queue_tailp = &q->next_sched;
3156  	raise_softirq_irqoff(NET_TX_SOFTIRQ);
3157  	local_irq_restore(flags);
3158  }
3159  
3160  void __netif_schedule(struct Qdisc *q)
3161  {
3162  	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
3163  		__netif_reschedule(q);
3164  }
3165  EXPORT_SYMBOL(__netif_schedule);
3166  
3167  struct dev_kfree_skb_cb {
3168  	enum skb_drop_reason reason;
3169  };
3170  
3171  static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
3172  {
3173  	return (struct dev_kfree_skb_cb *)skb->cb;
3174  }
3175  
3176  void netif_schedule_queue(struct netdev_queue *txq)
3177  {
3178  	rcu_read_lock();
3179  	if (!netif_xmit_stopped(txq)) {
3180  		struct Qdisc *q = rcu_dereference(txq->qdisc);
3181  
3182  		__netif_schedule(q);
3183  	}
3184  	rcu_read_unlock();
3185  }
3186  EXPORT_SYMBOL(netif_schedule_queue);
3187  
3188  void netif_tx_wake_queue(struct netdev_queue *dev_queue)
3189  {
3190  	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
3191  		struct Qdisc *q;
3192  
3193  		rcu_read_lock();
3194  		q = rcu_dereference(dev_queue->qdisc);
3195  		__netif_schedule(q);
3196  		rcu_read_unlock();
3197  	}
3198  }
3199  EXPORT_SYMBOL(netif_tx_wake_queue);
3200  
3201  void dev_kfree_skb_irq_reason(struct sk_buff *skb, enum skb_drop_reason reason)
3202  {
3203  	unsigned long flags;
3204  
3205  	if (unlikely(!skb))
3206  		return;
3207  
3208  	if (likely(refcount_read(&skb->users) == 1)) {
3209  		smp_rmb();
3210  		refcount_set(&skb->users, 0);
3211  	} else if (likely(!refcount_dec_and_test(&skb->users))) {
3212  		return;
3213  	}
3214  	get_kfree_skb_cb(skb)->reason = reason;
3215  	local_irq_save(flags);
3216  	skb->next = __this_cpu_read(softnet_data.completion_queue);
3217  	__this_cpu_write(softnet_data.completion_queue, skb);
3218  	raise_softirq_irqoff(NET_TX_SOFTIRQ);
3219  	local_irq_restore(flags);
3220  }
3221  EXPORT_SYMBOL(dev_kfree_skb_irq_reason);
3222  
3223  void dev_kfree_skb_any_reason(struct sk_buff *skb, enum skb_drop_reason reason)
3224  {
3225  	if (in_hardirq() || irqs_disabled())
3226  		dev_kfree_skb_irq_reason(skb, reason);
3227  	else
3228  		kfree_skb_reason(skb, reason);
3229  }
3230  EXPORT_SYMBOL(dev_kfree_skb_any_reason);
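
/*
 * Editorial example (not part of dev.c): dropping a TX buffer from a
 * completion handler that may run in hard-IRQ context, where kfree_skb()
 * would not be safe.  example_tx_error_clean() is hypothetical.
 */
static void example_tx_error_clean(struct sk_buff *skb)
{
	dev_kfree_skb_any_reason(skb, SKB_DROP_REASON_NOT_SPECIFIED);
}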
3231  
3232  
3233  /**
3234   * netif_device_detach - mark device as removed
3235   * @dev: network device
3236   *
3237   * Mark device as removed from the system and therefore no longer available.
3238   */
3239  void netif_device_detach(struct net_device *dev)
3240  {
3241  	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
3242  	    netif_running(dev)) {
3243  		netif_tx_stop_all_queues(dev);
3244  	}
3245  }
3246  EXPORT_SYMBOL(netif_device_detach);
3247  
3248  /**
3249   * netif_device_attach - mark device as attached
3250   * @dev: network device
3251   *
3252   * Mark device as attached to the system and restart if needed.
3253   */
3254  void netif_device_attach(struct net_device *dev)
3255  {
3256  	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
3257  	    netif_running(dev)) {
3258  		netif_tx_wake_all_queues(dev);
3259  		__netdev_watchdog_up(dev);
3260  	}
3261  }
3262  EXPORT_SYMBOL(netif_device_attach);
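/* A minimal illustrative sketch, assuming a hypothetical driver:
 * netif_device_detach()/netif_device_attach() usually bracket a driver's
 * suspend/resume (or reset) path, so the stack stops scheduling TX on
 * hardware that is going away.  The callbacks below are hypothetical:
 *
 *	static int example_suspend(struct device *d)
 *	{
 *		struct net_device *ndev = dev_get_drvdata(d);
 *
 *		netif_device_detach(ndev);
 *		// ... quiesce DMA and the MAC here ...
 *		return 0;
 *	}
 *
 *	static int example_resume(struct device *d)
 *	{
 *		struct net_device *ndev = dev_get_drvdata(d);
 *
 *		// ... bring the hardware back up here ...
 *		netif_device_attach(ndev);
 *		return 0;
 *	}
 */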
3263  
3264  /*
3265   * Returns a Tx hash based on the given packet descriptor and the number of
3266   * Tx queues to be used as a distribution range.
3267   */
3268  static u16 skb_tx_hash(const struct net_device *dev,
3269  		       const struct net_device *sb_dev,
3270  		       struct sk_buff *skb)
3271  {
3272  	u32 hash;
3273  	u16 qoffset = 0;
3274  	u16 qcount = dev->real_num_tx_queues;
3275  
3276  	if (dev->num_tc) {
3277  		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
3278  
3279  		qoffset = sb_dev->tc_to_txq[tc].offset;
3280  		qcount = sb_dev->tc_to_txq[tc].count;
3281  		if (unlikely(!qcount)) {
3282  			net_warn_ratelimited("%s: invalid qcount, qoffset %u for tc %u\n",
3283  					     sb_dev->name, qoffset, tc);
3284  			qoffset = 0;
3285  			qcount = dev->real_num_tx_queues;
3286  		}
3287  	}
3288  
3289  	if (skb_rx_queue_recorded(skb)) {
3290  		DEBUG_NET_WARN_ON_ONCE(qcount == 0);
3291  		hash = skb_get_rx_queue(skb);
3292  		if (hash >= qoffset)
3293  			hash -= qoffset;
3294  		while (unlikely(hash >= qcount))
3295  			hash -= qcount;
3296  		return hash + qoffset;
3297  	}
3298  
3299  	return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
3300  }
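/* Illustrative arithmetic (not from the original source): reciprocal_scale(hash,
 * qcount) maps a 32-bit hash onto [0, qcount) as ((u64)hash * qcount) >> 32,
 * avoiding a division.  For example, with qcount = 4 and skb_get_hash()
 * returning 0x9e3779b9, the result is ((u64)0x9e3779b9 * 4) >> 32 = 2, so the
 * skb is steered to queue qoffset + 2.  The recorded-rx-queue branch above
 * instead folds the RX queue number into the same [qoffset, qoffset + qcount)
 * window.
 */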
3301  
3302  void skb_warn_bad_offload(const struct sk_buff *skb)
3303  {
3304  	static const netdev_features_t null_features;
3305  	struct net_device *dev = skb->dev;
3306  	const char *name = "";
3307  
3308  	if (!net_ratelimit())
3309  		return;
3310  
3311  	if (dev) {
3312  		if (dev->dev.parent)
3313  			name = dev_driver_string(dev->dev.parent);
3314  		else
3315  			name = netdev_name(dev);
3316  	}
3317  	skb_dump(KERN_WARNING, skb, false);
3318  	WARN(1, "%s: caps=(%pNF, %pNF)\n",
3319  	     name, dev ? &dev->features : &null_features,
3320  	     skb->sk ? &skb->sk->sk_route_caps : &null_features);
3321  }
3322  
3323  /*
3324   * Invalidate hardware checksum when packet is to be mangled, and
3325   * complete checksum manually on outgoing path.
3326   */
3327  int skb_checksum_help(struct sk_buff *skb)
3328  {
3329  	__wsum csum;
3330  	int ret = 0, offset;
3331  
3332  	if (skb->ip_summed == CHECKSUM_COMPLETE)
3333  		goto out_set_summed;
3334  
3335  	if (unlikely(skb_is_gso(skb))) {
3336  		skb_warn_bad_offload(skb);
3337  		return -EINVAL;
3338  	}
3339  
3340  	/* Before computing a checksum, we should make sure no frag could
3341  	 * be modified by an external entity: the checksum could be wrong.
3342  	 */
3343  	if (skb_has_shared_frag(skb)) {
3344  		ret = __skb_linearize(skb);
3345  		if (ret)
3346  			goto out;
3347  	}
3348  
3349  	offset = skb_checksum_start_offset(skb);
3350  	ret = -EINVAL;
3351  	if (unlikely(offset >= skb_headlen(skb))) {
3352  		DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false);
3353  		WARN_ONCE(true, "offset (%d) >= skb_headlen() (%u)\n",
3354  			  offset, skb_headlen(skb));
3355  		goto out;
3356  	}
3357  	csum = skb_checksum(skb, offset, skb->len - offset, 0);
3358  
3359  	offset += skb->csum_offset;
3360  	if (unlikely(offset + sizeof(__sum16) > skb_headlen(skb))) {
3361  		DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false);
3362  		WARN_ONCE(true, "offset+2 (%zu) > skb_headlen() (%u)\n",
3363  			  offset + sizeof(__sum16), skb_headlen(skb));
3364  		goto out;
3365  	}
3366  	ret = skb_ensure_writable(skb, offset + sizeof(__sum16));
3367  	if (ret)
3368  		goto out;
3369  
3370  	*(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
3371  out_set_summed:
3372  	skb->ip_summed = CHECKSUM_NONE;
3373  out:
3374  	return ret;
3375  }
3376  EXPORT_SYMBOL(skb_checksum_help);
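/* A minimal illustrative sketch, assuming a hypothetical driver: a common
 * fallback in ndo_start_xmit() when the hardware cannot offload the checksum
 * of a CHECKSUM_PARTIAL packet is to resolve it in software via
 * skb_checksum_help() and drop only on failure.  example_hw_can_csum() is a
 * hypothetical capability check:
 *
 *	if (skb->ip_summed == CHECKSUM_PARTIAL &&
 *	    !example_hw_can_csum(priv, skb) &&
 *	    skb_checksum_help(skb))
 *		goto drop_skb;
 */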
3377  
3378  int skb_crc32c_csum_help(struct sk_buff *skb)
3379  {
3380  	__le32 crc32c_csum;
3381  	int ret = 0, offset, start;
3382  
3383  	if (skb->ip_summed != CHECKSUM_PARTIAL)
3384  		goto out;
3385  
3386  	if (unlikely(skb_is_gso(skb)))
3387  		goto out;
3388  
3389  	/* Before computing a checksum, we should make sure no frag could
3390  	 * be modified by an external entity: the checksum could be wrong.
3391  	 */
3392  	if (unlikely(skb_has_shared_frag(skb))) {
3393  		ret = __skb_linearize(skb);
3394  		if (ret)
3395  			goto out;
3396  	}
3397  	start = skb_checksum_start_offset(skb);
3398  	offset = start + offsetof(struct sctphdr, checksum);
3399  	if (WARN_ON_ONCE(offset >= skb_headlen(skb))) {
3400  		ret = -EINVAL;
3401  		goto out;
3402  	}
3403  
3404  	ret = skb_ensure_writable(skb, offset + sizeof(__le32));
3405  	if (ret)
3406  		goto out;
3407  
3408  	crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start,
3409  						  skb->len - start, ~(__u32)0,
3410  						  crc32c_csum_stub));
3411  	*(__le32 *)(skb->data + offset) = crc32c_csum;
3412  	skb_reset_csum_not_inet(skb);
3413  out:
3414  	return ret;
3415  }
3416  
3417  __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
3418  {
3419  	__be16 type = skb->protocol;
3420  
3421  	/* Tunnel gso handlers can set protocol to ethernet. */
3422  	if (type == htons(ETH_P_TEB)) {
3423  		struct ethhdr *eth;
3424  
3425  		if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
3426  			return 0;
3427  
3428  		eth = (struct ethhdr *)skb->data;
3429  		type = eth->h_proto;
3430  	}
3431  
3432  	return vlan_get_protocol_and_depth(skb, type, depth);
3433  }
3434  
3435  
3436  /* Take action when hardware reception checksum errors are detected. */
3437  #ifdef CONFIG_BUG
3438  static void do_netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
3439  {
3440  	netdev_err(dev, "hw csum failure\n");
3441  	skb_dump(KERN_ERR, skb, true);
3442  	dump_stack();
3443  }
3444  
3445  void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
3446  {
3447  	DO_ONCE_LITE(do_netdev_rx_csum_fault, dev, skb);
3448  }
3449  EXPORT_SYMBOL(netdev_rx_csum_fault);
3450  #endif
3451  
3452  /* XXX: check that highmem exists at all on the given machine. */
3453  static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
3454  {
3455  #ifdef CONFIG_HIGHMEM
3456  	int i;
3457  
3458  	if (!(dev->features & NETIF_F_HIGHDMA)) {
3459  		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
3460  			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
3461  
3462  			if (PageHighMem(skb_frag_page(frag)))
3463  				return 1;
3464  		}
3465  	}
3466  #endif
3467  	return 0;
3468  }
3469  
3470  /* If this is an MPLS offload request, verify we are testing hardware MPLS
3471   * features instead of the standard features for the netdev.
3472   */
3473  #if IS_ENABLED(CONFIG_NET_MPLS_GSO)
3474  static netdev_features_t net_mpls_features(struct sk_buff *skb,
3475  					   netdev_features_t features,
3476  					   __be16 type)
3477  {
3478  	if (eth_p_mpls(type))
3479  		features &= skb->dev->mpls_features;
3480  
3481  	return features;
3482  }
3483  #else
3484  static netdev_features_t net_mpls_features(struct sk_buff *skb,
3485  					   netdev_features_t features,
3486  					   __be16 type)
3487  {
3488  	return features;
3489  }
3490  #endif
3491  
3492  static netdev_features_t harmonize_features(struct sk_buff *skb,
3493  	netdev_features_t features)
3494  {
3495  	__be16 type;
3496  
3497  	type = skb_network_protocol(skb, NULL);
3498  	features = net_mpls_features(skb, features, type);
3499  
3500  	if (skb->ip_summed != CHECKSUM_NONE &&
3501  	    !can_checksum_protocol(features, type)) {
3502  		features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
3503  	}
3504  	if (illegal_highdma(skb->dev, skb))
3505  		features &= ~NETIF_F_SG;
3506  
3507  	return features;
3508  }
3509  
3510  netdev_features_t passthru_features_check(struct sk_buff *skb,
3511  					  struct net_device *dev,
3512  					  netdev_features_t features)
3513  {
3514  	return features;
3515  }
3516  EXPORT_SYMBOL(passthru_features_check);
3517  
3518  static netdev_features_t dflt_features_check(struct sk_buff *skb,
3519  					     struct net_device *dev,
3520  					     netdev_features_t features)
3521  {
3522  	return vlan_features_check(skb, features);
3523  }
3524  
3525  static netdev_features_t gso_features_check(const struct sk_buff *skb,
3526  					    struct net_device *dev,
3527  					    netdev_features_t features)
3528  {
3529  	u16 gso_segs = skb_shinfo(skb)->gso_segs;
3530  
3531  	if (gso_segs > READ_ONCE(dev->gso_max_segs))
3532  		return features & ~NETIF_F_GSO_MASK;
3533  
3534  	if (unlikely(skb->len >= netif_get_gso_max_size(dev, skb)))
3535  		return features & ~NETIF_F_GSO_MASK;
3536  
3537  	if (!skb_shinfo(skb)->gso_type) {
3538  		skb_warn_bad_offload(skb);
3539  		return features & ~NETIF_F_GSO_MASK;
3540  	}
3541  
3542  	/* Support for GSO partial features requires software
3543  	 * intervention before we can actually process the packets,
3544  	 * so we need to strip support for any partial features now
3545  	 * and pull them back in after we have partially
3546  	 * segmented the frame.
3547  	 */
3548  	if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
3549  		features &= ~dev->gso_partial_features;
3550  
3551  	/* Make sure to clear the IPv4 ID mangling feature if the
3552  	 * IPv4 header has the potential to be fragmented.
3553  	 */
3554  	if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
3555  		struct iphdr *iph = skb->encapsulation ?
3556  				    inner_ip_hdr(skb) : ip_hdr(skb);
3557  
3558  		if (!(iph->frag_off & htons(IP_DF)))
3559  			features &= ~NETIF_F_TSO_MANGLEID;
3560  	}
3561  
3562  	return features;
3563  }
3564  
3565  netdev_features_t netif_skb_features(struct sk_buff *skb)
3566  {
3567  	struct net_device *dev = skb->dev;
3568  	netdev_features_t features = dev->features;
3569  
3570  	if (skb_is_gso(skb))
3571  		features = gso_features_check(skb, dev, features);
3572  
3573  	/* If this is an encapsulation offload request, verify we are testing
3574  	 * hardware encapsulation features instead of the standard
3575  	 * features for the netdev.
3576  	 */
3577  	if (skb->encapsulation)
3578  		features &= dev->hw_enc_features;
3579  
3580  	if (skb_vlan_tagged(skb))
3581  		features = netdev_intersect_features(features,
3582  						     dev->vlan_features |
3583  						     NETIF_F_HW_VLAN_CTAG_TX |
3584  						     NETIF_F_HW_VLAN_STAG_TX);
3585  
3586  	if (dev->netdev_ops->ndo_features_check)
3587  		features &= dev->netdev_ops->ndo_features_check(skb, dev,
3588  								features);
3589  	else
3590  		features &= dflt_features_check(skb, dev, features);
3591  
3592  	return harmonize_features(skb, features);
3593  }
3594  EXPORT_SYMBOL(netif_skb_features);
3595  
3596  static int xmit_one(struct sk_buff *skb, struct net_device *dev,
3597  		    struct netdev_queue *txq, bool more)
3598  {
3599  	unsigned int len;
3600  	int rc;
3601  
3602  	if (dev_nit_active(dev))
3603  		dev_queue_xmit_nit(skb, dev);
3604  
3605  	len = skb->len;
3606  	trace_net_dev_start_xmit(skb, dev);
3607  	rc = netdev_start_xmit(skb, dev, txq, more);
3608  	trace_net_dev_xmit(skb, rc, dev, len);
3609  
3610  	return rc;
3611  }
3612  
3613  struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
3614  				    struct netdev_queue *txq, int *ret)
3615  {
3616  	struct sk_buff *skb = first;
3617  	int rc = NETDEV_TX_OK;
3618  
3619  	while (skb) {
3620  		struct sk_buff *next = skb->next;
3621  
3622  		skb_mark_not_on_list(skb);
3623  		rc = xmit_one(skb, dev, txq, next != NULL);
3624  		if (unlikely(!dev_xmit_complete(rc))) {
3625  			skb->next = next;
3626  			goto out;
3627  		}
3628  
3629  		skb = next;
3630  		if (netif_tx_queue_stopped(txq) && skb) {
3631  			rc = NETDEV_TX_BUSY;
3632  			break;
3633  		}
3634  	}
3635  
3636  out:
3637  	*ret = rc;
3638  	return skb;
3639  }
3640  
3641  static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
3642  					  netdev_features_t features)
3643  {
3644  	if (skb_vlan_tag_present(skb) &&
3645  	    !vlan_hw_offload_capable(features, skb->vlan_proto))
3646  		skb = __vlan_hwaccel_push_inside(skb);
3647  	return skb;
3648  }
3649  
3650  int skb_csum_hwoffload_help(struct sk_buff *skb,
3651  			    const netdev_features_t features)
3652  {
3653  	if (unlikely(skb_csum_is_sctp(skb)))
3654  		return !!(features & NETIF_F_SCTP_CRC) ? 0 :
3655  			skb_crc32c_csum_help(skb);
3656  
3657  	if (features & NETIF_F_HW_CSUM)
3658  		return 0;
3659  
3660  	if (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) {
3661  		if (vlan_get_protocol(skb) == htons(ETH_P_IPV6) &&
3662  		    skb_network_header_len(skb) != sizeof(struct ipv6hdr) &&
3663  		    !ipv6_has_hopopt_jumbo(skb))
3664  			goto sw_checksum;
3665  
3666  		switch (skb->csum_offset) {
3667  		case offsetof(struct tcphdr, check):
3668  		case offsetof(struct udphdr, check):
3669  			return 0;
3670  		}
3671  	}
3672  
3673  sw_checksum:
3674  	return skb_checksum_help(skb);
3675  }
3676  EXPORT_SYMBOL(skb_csum_hwoffload_help);
3677  
3678  static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again)
3679  {
3680  	netdev_features_t features;
3681  
3682  	features = netif_skb_features(skb);
3683  	skb = validate_xmit_vlan(skb, features);
3684  	if (unlikely(!skb))
3685  		goto out_null;
3686  
3687  	skb = sk_validate_xmit_skb(skb, dev);
3688  	if (unlikely(!skb))
3689  		goto out_null;
3690  
3691  	if (netif_needs_gso(skb, features)) {
3692  		struct sk_buff *segs;
3693  
3694  		segs = skb_gso_segment(skb, features);
3695  		if (IS_ERR(segs)) {
3696  			goto out_kfree_skb;
3697  		} else if (segs) {
3698  			consume_skb(skb);
3699  			skb = segs;
3700  		}
3701  	} else {
3702  		if (skb_needs_linearize(skb, features) &&
3703  		    __skb_linearize(skb))
3704  			goto out_kfree_skb;
3705  
3706  		/* If packet is not checksummed and device does not
3707  		 * support checksumming for this protocol, complete
3708  		 * checksumming here.
3709  		 */
3710  		if (skb->ip_summed == CHECKSUM_PARTIAL) {
3711  			if (skb->encapsulation)
3712  				skb_set_inner_transport_header(skb,
3713  							       skb_checksum_start_offset(skb));
3714  			else
3715  				skb_set_transport_header(skb,
3716  							 skb_checksum_start_offset(skb));
3717  			if (skb_csum_hwoffload_help(skb, features))
3718  				goto out_kfree_skb;
3719  		}
3720  	}
3721  
3722  	skb = validate_xmit_xfrm(skb, features, again);
3723  
3724  	return skb;
3725  
3726  out_kfree_skb:
3727  	kfree_skb(skb);
3728  out_null:
3729  	dev_core_stats_tx_dropped_inc(dev);
3730  	return NULL;
3731  }
3732  
3733  struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again)
3734  {
3735  	struct sk_buff *next, *head = NULL, *tail;
3736  
3737  	for (; skb != NULL; skb = next) {
3738  		next = skb->next;
3739  		skb_mark_not_on_list(skb);
3740  
3741  		/* In case the skb won't be segmented, point it to itself */
3742  		skb->prev = skb;
3743  
3744  		skb = validate_xmit_skb(skb, dev, again);
3745  		if (!skb)
3746  			continue;
3747  
3748  		if (!head)
3749  			head = skb;
3750  		else
3751  			tail->next = skb;
3752  		/* If skb was segmented, skb->prev points to
3753  		 * the last segment. If not, it still contains skb.
3754  		 */
3755  		tail = skb->prev;
3756  	}
3757  	return head;
3758  }
3759  EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
3760  
3761  static void qdisc_pkt_len_init(struct sk_buff *skb)
3762  {
3763  	const struct skb_shared_info *shinfo = skb_shinfo(skb);
3764  
3765  	qdisc_skb_cb(skb)->pkt_len = skb->len;
3766  
3767  	/* To get a more precise estimate of the bytes sent on the wire,
3768  	 * we add the header size of all segments to pkt_len.
3769  	 */
3770  	if (shinfo->gso_size && skb_transport_header_was_set(skb)) {
3771  		u16 gso_segs = shinfo->gso_segs;
3772  		unsigned int hdr_len;
3773  
3774  		/* mac layer + network layer */
3775  		hdr_len = skb_transport_offset(skb);
3776  
3777  		/* + transport layer */
3778  		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
3779  			const struct tcphdr *th;
3780  			struct tcphdr _tcphdr;
3781  
3782  			th = skb_header_pointer(skb, hdr_len,
3783  						sizeof(_tcphdr), &_tcphdr);
3784  			if (likely(th))
3785  				hdr_len += __tcp_hdrlen(th);
3786  		} else if (shinfo->gso_type & SKB_GSO_UDP_L4) {
3787  			struct udphdr _udphdr;
3788  
3789  			if (skb_header_pointer(skb, hdr_len,
3790  					       sizeof(_udphdr), &_udphdr))
3791  				hdr_len += sizeof(struct udphdr);
3792  		}
3793  
3794  		if (unlikely(shinfo->gso_type & SKB_GSO_DODGY)) {
3795  			int payload = skb->len - hdr_len;
3796  
3797  			/* Malicious packet. */
3798  			if (payload <= 0)
3799  				return;
3800  			gso_segs = DIV_ROUND_UP(payload, shinfo->gso_size);
3801  		}
3802  		qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
3803  	}
3804  }
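/* Illustrative arithmetic (not from the original source): for a TCP GSO skb
 * with gso_size = 1448, gso_segs = 5 and 66 bytes of Ethernet + IPv4 + TCP
 * headers, skb->len = 5 * 1448 + 66 = 7306, and the code above accounts for
 * the headers repeated in the remaining segments:
 * pkt_len = 7306 + (5 - 1) * 66 = 7570, which is closer to what actually goes
 * on the wire than skb->len alone.
 */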
3805  
3806  static int dev_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *q,
3807  			     struct sk_buff **to_free,
3808  			     struct netdev_queue *txq)
3809  {
3810  	int rc;
3811  
3812  	rc = q->enqueue(skb, q, to_free) & NET_XMIT_MASK;
3813  	if (rc == NET_XMIT_SUCCESS)
3814  		trace_qdisc_enqueue(q, txq, skb);
3815  	return rc;
3816  }
3817  
3818  static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
3819  				 struct net_device *dev,
3820  				 struct netdev_queue *txq)
3821  {
3822  	spinlock_t *root_lock = qdisc_lock(q);
3823  	struct sk_buff *to_free = NULL;
3824  	bool contended;
3825  	int rc;
3826  
3827  	qdisc_calculate_pkt_len(skb, q);
3828  
3829  	if (q->flags & TCQ_F_NOLOCK) {
3830  		if (q->flags & TCQ_F_CAN_BYPASS && nolock_qdisc_is_empty(q) &&
3831  		    qdisc_run_begin(q)) {
3832  			/* Retest nolock_qdisc_is_empty() within the protection
3833  			 * of q->seqlock to protect from racing with requeuing.
3834  			 */
3835  			if (unlikely(!nolock_qdisc_is_empty(q))) {
3836  				rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
3837  				__qdisc_run(q);
3838  				qdisc_run_end(q);
3839  
3840  				goto no_lock_out;
3841  			}
3842  
3843  			qdisc_bstats_cpu_update(q, skb);
3844  			if (sch_direct_xmit(skb, q, dev, txq, NULL, true) &&
3845  			    !nolock_qdisc_is_empty(q))
3846  				__qdisc_run(q);
3847  
3848  			qdisc_run_end(q);
3849  			return NET_XMIT_SUCCESS;
3850  		}
3851  
3852  		rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
3853  		qdisc_run(q);
3854  
3855  no_lock_out:
3856  		if (unlikely(to_free))
3857  			kfree_skb_list_reason(to_free,
3858  					      SKB_DROP_REASON_QDISC_DROP);
3859  		return rc;
3860  	}
3861  
3862  	/*
3863  	 * Heuristic to force contended enqueues to serialize on a
3864  	 * separate lock before trying to get the qdisc main lock.
3865  	 * This permits the qdisc->running owner to get the lock more
3866  	 * often and dequeue packets faster.
3867  	 * On PREEMPT_RT it is possible to preempt the qdisc owner during xmit
3868  	 * and then other tasks will only enqueue packets. The packets will be
3869  	 * sent after the qdisc owner is scheduled again. To prevent this
3870  	 * scenario the task always serializes on the lock.
3871  	 */
3872  	contended = qdisc_is_running(q) || IS_ENABLED(CONFIG_PREEMPT_RT);
3873  	if (unlikely(contended))
3874  		spin_lock(&q->busylock);
3875  
3876  	spin_lock(root_lock);
3877  	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
3878  		__qdisc_drop(skb, &to_free);
3879  		rc = NET_XMIT_DROP;
3880  	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
3881  		   qdisc_run_begin(q)) {
3882  		/*
3883  		 * This is a work-conserving queue; there are no old skbs
3884  		 * waiting to be sent out; and the qdisc is not running -
3885  		 * xmit the skb directly.
3886  		 */
3887  
3888  		qdisc_bstats_update(q, skb);
3889  
3890  		if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
3891  			if (unlikely(contended)) {
3892  				spin_unlock(&q->busylock);
3893  				contended = false;
3894  			}
3895  			__qdisc_run(q);
3896  		}
3897  
3898  		qdisc_run_end(q);
3899  		rc = NET_XMIT_SUCCESS;
3900  	} else {
3901  		rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
3902  		if (qdisc_run_begin(q)) {
3903  			if (unlikely(contended)) {
3904  				spin_unlock(&q->busylock);
3905  				contended = false;
3906  			}
3907  			__qdisc_run(q);
3908  			qdisc_run_end(q);
3909  		}
3910  	}
3911  	spin_unlock(root_lock);
3912  	if (unlikely(to_free))
3913  		kfree_skb_list_reason(to_free, SKB_DROP_REASON_QDISC_DROP);
3914  	if (unlikely(contended))
3915  		spin_unlock(&q->busylock);
3916  	return rc;
3917  }
3918  
3919  #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
3920  static void skb_update_prio(struct sk_buff *skb)
3921  {
3922  	const struct netprio_map *map;
3923  	const struct sock *sk;
3924  	unsigned int prioidx;
3925  
3926  	if (skb->priority)
3927  		return;
3928  	map = rcu_dereference_bh(skb->dev->priomap);
3929  	if (!map)
3930  		return;
3931  	sk = skb_to_full_sk(skb);
3932  	if (!sk)
3933  		return;
3934  
3935  	prioidx = sock_cgroup_prioidx(&sk->sk_cgrp_data);
3936  
3937  	if (prioidx < map->priomap_len)
3938  		skb->priority = map->priomap[prioidx];
3939  }
3940  #else
3941  #define skb_update_prio(skb)
3942  #endif
3943  
3944  /**
3945   *	dev_loopback_xmit - loop back @skb
3946   *	@net: network namespace this loopback is happening in
3947   *	@sk:  sk needed to be a netfilter okfn
3948   *	@skb: buffer to transmit
3949   */
3950  int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
3951  {
3952  	skb_reset_mac_header(skb);
3953  	__skb_pull(skb, skb_network_offset(skb));
3954  	skb->pkt_type = PACKET_LOOPBACK;
3955  	if (skb->ip_summed == CHECKSUM_NONE)
3956  		skb->ip_summed = CHECKSUM_UNNECESSARY;
3957  	DEBUG_NET_WARN_ON_ONCE(!skb_dst(skb));
3958  	skb_dst_force(skb);
3959  	netif_rx(skb);
3960  	return 0;
3961  }
3962  EXPORT_SYMBOL(dev_loopback_xmit);
3963  
3964  #ifdef CONFIG_NET_EGRESS
3965  static struct netdev_queue *
3966  netdev_tx_queue_mapping(struct net_device *dev, struct sk_buff *skb)
3967  {
3968  	int qm = skb_get_queue_mapping(skb);
3969  
3970  	return netdev_get_tx_queue(dev, netdev_cap_txqueue(dev, qm));
3971  }
3972  
3973  static bool netdev_xmit_txqueue_skipped(void)
3974  {
3975  	return __this_cpu_read(softnet_data.xmit.skip_txqueue);
3976  }
3977  
3978  void netdev_xmit_skip_txqueue(bool skip)
3979  {
3980  	__this_cpu_write(softnet_data.xmit.skip_txqueue, skip);
3981  }
3982  EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue);
3983  #endif /* CONFIG_NET_EGRESS */
3984  
3985  #ifdef CONFIG_NET_XGRESS
3986  static int tc_run(struct tcx_entry *entry, struct sk_buff *skb)
3987  {
3988  	int ret = TC_ACT_UNSPEC;
3989  #ifdef CONFIG_NET_CLS_ACT
3990  	struct mini_Qdisc *miniq = rcu_dereference_bh(entry->miniq);
3991  	struct tcf_result res;
3992  
3993  	if (!miniq)
3994  		return ret;
3995  
3996  	tc_skb_cb(skb)->mru = 0;
3997  	tc_skb_cb(skb)->post_ct = false;
3998  
3999  	mini_qdisc_bstats_cpu_update(miniq, skb);
4000  	ret = tcf_classify(skb, miniq->block, miniq->filter_list, &res, false);
4001  	/* Only tcf related quirks below. */
4002  	switch (ret) {
4003  	case TC_ACT_SHOT:
4004  		mini_qdisc_qstats_cpu_drop(miniq);
4005  		break;
4006  	case TC_ACT_OK:
4007  	case TC_ACT_RECLASSIFY:
4008  		skb->tc_index = TC_H_MIN(res.classid);
4009  		break;
4010  	}
4011  #endif /* CONFIG_NET_CLS_ACT */
4012  	return ret;
4013  }
4014  
4015  static DEFINE_STATIC_KEY_FALSE(tcx_needed_key);
4016  
4017  void tcx_inc(void)
4018  {
4019  	static_branch_inc(&tcx_needed_key);
4020  }
4021  
4022  void tcx_dec(void)
4023  {
4024  	static_branch_dec(&tcx_needed_key);
4025  }
4026  
4027  static __always_inline enum tcx_action_base
4028  tcx_run(const struct bpf_mprog_entry *entry, struct sk_buff *skb,
4029  	const bool needs_mac)
4030  {
4031  	const struct bpf_mprog_fp *fp;
4032  	const struct bpf_prog *prog;
4033  	int ret = TCX_NEXT;
4034  
4035  	if (needs_mac)
4036  		__skb_push(skb, skb->mac_len);
4037  	bpf_mprog_foreach_prog(entry, fp, prog) {
4038  		bpf_compute_data_pointers(skb);
4039  		ret = bpf_prog_run(prog, skb);
4040  		if (ret != TCX_NEXT)
4041  			break;
4042  	}
4043  	if (needs_mac)
4044  		__skb_pull(skb, skb->mac_len);
4045  	return tcx_action_code(skb, ret);
4046  }
4047  
4048  static __always_inline struct sk_buff *
4049  sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
4050  		   struct net_device *orig_dev, bool *another)
4051  {
4052  	struct bpf_mprog_entry *entry = rcu_dereference_bh(skb->dev->tcx_ingress);
4053  	int sch_ret;
4054  
4055  	if (!entry)
4056  		return skb;
4057  	if (*pt_prev) {
4058  		*ret = deliver_skb(skb, *pt_prev, orig_dev);
4059  		*pt_prev = NULL;
4060  	}
4061  
4062  	qdisc_skb_cb(skb)->pkt_len = skb->len;
4063  	tcx_set_ingress(skb, true);
4064  
4065  	if (static_branch_unlikely(&tcx_needed_key)) {
4066  		sch_ret = tcx_run(entry, skb, true);
4067  		if (sch_ret != TC_ACT_UNSPEC)
4068  			goto ingress_verdict;
4069  	}
4070  	sch_ret = tc_run(tcx_entry(entry), skb);
4071  ingress_verdict:
4072  	switch (sch_ret) {
4073  	case TC_ACT_REDIRECT:
4074  		/* skb_mac_header check was done by BPF, so we can safely
4075  		 * push the L2 header back before redirecting to another
4076  		 * netdev.
4077  		 */
4078  		__skb_push(skb, skb->mac_len);
4079  		if (skb_do_redirect(skb) == -EAGAIN) {
4080  			__skb_pull(skb, skb->mac_len);
4081  			*another = true;
4082  			break;
4083  		}
4084  		*ret = NET_RX_SUCCESS;
4085  		return NULL;
4086  	case TC_ACT_SHOT:
4087  		kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS);
4088  		*ret = NET_RX_DROP;
4089  		return NULL;
4090  	/* used by tc_run */
4091  	case TC_ACT_STOLEN:
4092  	case TC_ACT_QUEUED:
4093  	case TC_ACT_TRAP:
4094  		consume_skb(skb);
4095  		fallthrough;
4096  	case TC_ACT_CONSUMED:
4097  		*ret = NET_RX_SUCCESS;
4098  		return NULL;
4099  	}
4100  
4101  	return skb;
4102  }
4103  
4104  static __always_inline struct sk_buff *
4105  sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
4106  {
4107  	struct bpf_mprog_entry *entry = rcu_dereference_bh(dev->tcx_egress);
4108  	int sch_ret;
4109  
4110  	if (!entry)
4111  		return skb;
4112  
4113  	/* qdisc_skb_cb(skb)->pkt_len was already set and tcx_set_ingress()
4114  	 * already called by the caller.
4115  	 */
4116  	if (static_branch_unlikely(&tcx_needed_key)) {
4117  		sch_ret = tcx_run(entry, skb, false);
4118  		if (sch_ret != TC_ACT_UNSPEC)
4119  			goto egress_verdict;
4120  	}
4121  	sch_ret = tc_run(tcx_entry(entry), skb);
4122  egress_verdict:
4123  	switch (sch_ret) {
4124  	case TC_ACT_REDIRECT:
4125  		/* No need to push/pop skb's mac_header here on egress! */
4126  		skb_do_redirect(skb);
4127  		*ret = NET_XMIT_SUCCESS;
4128  		return NULL;
4129  	case TC_ACT_SHOT:
4130  		kfree_skb_reason(skb, SKB_DROP_REASON_TC_EGRESS);
4131  		*ret = NET_XMIT_DROP;
4132  		return NULL;
4133  	/* used by tc_run */
4134  	case TC_ACT_STOLEN:
4135  	case TC_ACT_QUEUED:
4136  	case TC_ACT_TRAP:
4137  		consume_skb(skb);
4138  		fallthrough;
4139  	case TC_ACT_CONSUMED:
4140  		*ret = NET_XMIT_SUCCESS;
4141  		return NULL;
4142  	}
4143  
4144  	return skb;
4145  }
4146  #else
4147  static __always_inline struct sk_buff *
4148  sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
4149  		   struct net_device *orig_dev, bool *another)
4150  {
4151  	return skb;
4152  }
4153  
4154  static __always_inline struct sk_buff *
4155  sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
4156  {
4157  	return skb;
4158  }
4159  #endif /* CONFIG_NET_XGRESS */
4160  
4161  #ifdef CONFIG_XPS
4162  static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
4163  			       struct xps_dev_maps *dev_maps, unsigned int tci)
4164  {
4165  	int tc = netdev_get_prio_tc_map(dev, skb->priority);
4166  	struct xps_map *map;
4167  	int queue_index = -1;
4168  
4169  	if (tc >= dev_maps->num_tc || tci >= dev_maps->nr_ids)
4170  		return queue_index;
4171  
4172  	tci *= dev_maps->num_tc;
4173  	tci += tc;
4174  
4175  	map = rcu_dereference(dev_maps->attr_map[tci]);
4176  	if (map) {
4177  		if (map->len == 1)
4178  			queue_index = map->queues[0];
4179  		else
4180  			queue_index = map->queues[reciprocal_scale(
4181  						skb_get_hash(skb), map->len)];
4182  		if (unlikely(queue_index >= dev->real_num_tx_queues))
4183  			queue_index = -1;
4184  	}
4185  	return queue_index;
4186  }
4187  #endif
4188  
4189  static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev,
4190  			 struct sk_buff *skb)
4191  {
4192  #ifdef CONFIG_XPS
4193  	struct xps_dev_maps *dev_maps;
4194  	struct sock *sk = skb->sk;
4195  	int queue_index = -1;
4196  
4197  	if (!static_key_false(&xps_needed))
4198  		return -1;
4199  
4200  	rcu_read_lock();
4201  	if (!static_key_false(&xps_rxqs_needed))
4202  		goto get_cpus_map;
4203  
4204  	dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_RXQS]);
4205  	if (dev_maps) {
4206  		int tci = sk_rx_queue_get(sk);
4207  
4208  		if (tci >= 0)
4209  			queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
4210  							  tci);
4211  	}
4212  
4213  get_cpus_map:
4214  	if (queue_index < 0) {
4215  		dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_CPUS]);
4216  		if (dev_maps) {
4217  			unsigned int tci = skb->sender_cpu - 1;
4218  
4219  			queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
4220  							  tci);
4221  		}
4222  	}
4223  	rcu_read_unlock();
4224  
4225  	return queue_index;
4226  #else
4227  	return -1;
4228  #endif
4229  }
4230  
4231  u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
4232  		     struct net_device *sb_dev)
4233  {
4234  	return 0;
4235  }
4236  EXPORT_SYMBOL(dev_pick_tx_zero);
4237  
4238  u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb,
4239  		       struct net_device *sb_dev)
4240  {
4241  	return (u16)raw_smp_processor_id() % dev->real_num_tx_queues;
4242  }
4243  EXPORT_SYMBOL(dev_pick_tx_cpu_id);
4244  
4245  u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
4246  		     struct net_device *sb_dev)
4247  {
4248  	struct sock *sk = skb->sk;
4249  	int queue_index = sk_tx_queue_get(sk);
4250  
4251  	sb_dev = sb_dev ? : dev;
4252  
4253  	if (queue_index < 0 || skb->ooo_okay ||
4254  	    queue_index >= dev->real_num_tx_queues) {
4255  		int new_index = get_xps_queue(dev, sb_dev, skb);
4256  
4257  		if (new_index < 0)
4258  			new_index = skb_tx_hash(dev, sb_dev, skb);
4259  
4260  		if (queue_index != new_index && sk &&
4261  		    sk_fullsock(sk) &&
4262  		    rcu_access_pointer(sk->sk_dst_cache))
4263  			sk_tx_queue_set(sk, new_index);
4264  
4265  		queue_index = new_index;
4266  	}
4267  
4268  	return queue_index;
4269  }
4270  EXPORT_SYMBOL(netdev_pick_tx);
4271  
4272  struct netdev_queue *netdev_core_pick_tx(struct net_device *dev,
4273  					 struct sk_buff *skb,
4274  					 struct net_device *sb_dev)
4275  {
4276  	int queue_index = 0;
4277  
4278  #ifdef CONFIG_XPS
4279  	u32 sender_cpu = skb->sender_cpu - 1;
4280  
4281  	if (sender_cpu >= (u32)NR_CPUS)
4282  		skb->sender_cpu = raw_smp_processor_id() + 1;
4283  #endif
4284  
4285  	if (dev->real_num_tx_queues != 1) {
4286  		const struct net_device_ops *ops = dev->netdev_ops;
4287  
4288  		if (ops->ndo_select_queue)
4289  			queue_index = ops->ndo_select_queue(dev, skb, sb_dev);
4290  		else
4291  			queue_index = netdev_pick_tx(dev, skb, sb_dev);
4292  
4293  		queue_index = netdev_cap_txqueue(dev, queue_index);
4294  	}
4295  
4296  	skb_set_queue_mapping(skb, queue_index);
4297  	return netdev_get_tx_queue(dev, queue_index);
4298  }
4299  
4300  /**
4301   * __dev_queue_xmit() - transmit a buffer
4302   * @skb:	buffer to transmit
4303   * @sb_dev:	subordinate device used for L2 forwarding offload
4304   *
4305   * Queue a buffer for transmission to a network device. The caller must
4306   * have set the device and priority and built the buffer before calling
4307   * this function. The function can be called from an interrupt.
4308   *
4309   * When calling this method, interrupts MUST be enabled. This is because
4310   * the BH enable code must have IRQs enabled so that it will not deadlock.
4311   *
4312   * Regardless of the return value, the skb is consumed, so it is currently
4313   * difficult to retry a send to this method. (You can bump the ref count
4314   * before sending to hold a reference for retry if you are careful.)
4315   *
4316   * Return:
4317   * * 0				- buffer successfully transmitted
4318   * * positive qdisc return code	- NET_XMIT_DROP etc.
4319   * * negative errno		- other errors
4320   */
4321  int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
4322  {
4323  	struct net_device *dev = skb->dev;
4324  	struct netdev_queue *txq = NULL;
4325  	struct Qdisc *q;
4326  	int rc = -ENOMEM;
4327  	bool again = false;
4328  
4329  	skb_reset_mac_header(skb);
4330  	skb_assert_len(skb);
4331  
4332  	if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
4333  		__skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED);
4334  
4335  	/* Disable soft irqs for various locks below. Also
4336  	 * stops preemption for RCU.
4337  	 */
4338  	rcu_read_lock_bh();
4339  
4340  	skb_update_prio(skb);
4341  
4342  	qdisc_pkt_len_init(skb);
4343  	tcx_set_ingress(skb, false);
4344  #ifdef CONFIG_NET_EGRESS
4345  	if (static_branch_unlikely(&egress_needed_key)) {
4346  		if (nf_hook_egress_active()) {
4347  			skb = nf_hook_egress(skb, &rc, dev);
4348  			if (!skb)
4349  				goto out;
4350  		}
4351  
4352  		netdev_xmit_skip_txqueue(false);
4353  
4354  		nf_skip_egress(skb, true);
4355  		skb = sch_handle_egress(skb, &rc, dev);
4356  		if (!skb)
4357  			goto out;
4358  		nf_skip_egress(skb, false);
4359  
4360  		if (netdev_xmit_txqueue_skipped())
4361  			txq = netdev_tx_queue_mapping(dev, skb);
4362  	}
4363  #endif
4364  	/* If device/qdisc don't need skb->dst, release it right now while
4365  	 * it's hot in this CPU's cache.
4366  	 */
4367  	if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
4368  		skb_dst_drop(skb);
4369  	else
4370  		skb_dst_force(skb);
4371  
4372  	if (!txq)
4373  		txq = netdev_core_pick_tx(dev, skb, sb_dev);
4374  
4375  	q = rcu_dereference_bh(txq->qdisc);
4376  
4377  	trace_net_dev_queue(skb);
4378  	if (q->enqueue) {
4379  		rc = __dev_xmit_skb(skb, q, dev, txq);
4380  		goto out;
4381  	}
4382  
4383  	/* The device has no queue. Common case for software devices:
4384  	 * loopback, all sorts of tunnels...
4385  	 *
4386  	 * Really, it is unlikely that netif_tx_lock protection is necessary
4387  	 * here. (e.g. loopback and IP tunnels are clean, ignoring statistics
4388  	 * counters.)
4389  	 * However, it is possible that they rely on the protection
4390  	 * made by us here.
4391  	 *
4392  	 * Check this and shoot the lock. It is not prone to deadlocks.
4393  	 * Either shoot the noqueue qdisc, it is even simpler 8)
4394  	 */
4395  	if (dev->flags & IFF_UP) {
4396  		int cpu = smp_processor_id(); /* ok because BHs are off */
4397  
4398  		/* Other cpus might concurrently change txq->xmit_lock_owner
4399  		 * to -1 or to their cpu id, but not to our id.
4400  		 */
4401  		if (READ_ONCE(txq->xmit_lock_owner) != cpu) {
4402  			if (dev_xmit_recursion())
4403  				goto recursion_alert;
4404  
4405  			skb = validate_xmit_skb(skb, dev, &again);
4406  			if (!skb)
4407  				goto out;
4408  
4409  			HARD_TX_LOCK(dev, txq, cpu);
4410  
4411  			if (!netif_xmit_stopped(txq)) {
4412  				dev_xmit_recursion_inc();
4413  				skb = dev_hard_start_xmit(skb, dev, txq, &rc);
4414  				dev_xmit_recursion_dec();
4415  				if (dev_xmit_complete(rc)) {
4416  					HARD_TX_UNLOCK(dev, txq);
4417  					goto out;
4418  				}
4419  			}
4420  			HARD_TX_UNLOCK(dev, txq);
4421  			net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
4422  					     dev->name);
4423  		} else {
4424  			/* Recursion is detected! It is possible,
4425  			 * unfortunately
4426  			 */
4427  recursion_alert:
4428  			net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
4429  					     dev->name);
4430  		}
4431  	}
4432  
4433  	rc = -ENETDOWN;
4434  	rcu_read_unlock_bh();
4435  
4436  	dev_core_stats_tx_dropped_inc(dev);
4437  	kfree_skb_list(skb);
4438  	return rc;
4439  out:
4440  	rcu_read_unlock_bh();
4441  	return rc;
4442  }
4443  EXPORT_SYMBOL(__dev_queue_xmit);
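/* A minimal illustrative sketch (not from the original source): callers
 * normally go through the dev_queue_xmit() wrapper.  Because the skb is
 * consumed whatever the outcome, the return value is only useful for
 * accounting or diagnostics, e.g.:
 *
 *	int err = dev_queue_xmit(skb);
 *
 *	if (err != NET_XMIT_SUCCESS)
 *		pr_debug("xmit returned %d\n", err);
 *	// do not touch skb here: it has been consumed either way
 */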
4444  
4445  int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
4446  {
4447  	struct net_device *dev = skb->dev;
4448  	struct sk_buff *orig_skb = skb;
4449  	struct netdev_queue *txq;
4450  	int ret = NETDEV_TX_BUSY;
4451  	bool again = false;
4452  
4453  	if (unlikely(!netif_running(dev) ||
4454  		     !netif_carrier_ok(dev)))
4455  		goto drop;
4456  
4457  	skb = validate_xmit_skb_list(skb, dev, &again);
4458  	if (skb != orig_skb)
4459  		goto drop;
4460  
4461  	skb_set_queue_mapping(skb, queue_id);
4462  	txq = skb_get_tx_queue(dev, skb);
4463  
4464  	local_bh_disable();
4465  
4466  	dev_xmit_recursion_inc();
4467  	HARD_TX_LOCK(dev, txq, smp_processor_id());
4468  	if (!netif_xmit_frozen_or_drv_stopped(txq))
4469  		ret = netdev_start_xmit(skb, dev, txq, false);
4470  	HARD_TX_UNLOCK(dev, txq);
4471  	dev_xmit_recursion_dec();
4472  
4473  	local_bh_enable();
4474  	return ret;
4475  drop:
4476  	dev_core_stats_tx_dropped_inc(dev);
4477  	kfree_skb_list(skb);
4478  	return NET_XMIT_DROP;
4479  }
4480  EXPORT_SYMBOL(__dev_direct_xmit);
4481  
4482  /*************************************************************************
4483   *			Receiver routines
4484   *************************************************************************/
4485  
4486  int netdev_max_backlog __read_mostly = 1000;
4487  EXPORT_SYMBOL(netdev_max_backlog);
4488  
4489  int netdev_tstamp_prequeue __read_mostly = 1;
4490  unsigned int sysctl_skb_defer_max __read_mostly = 64;
4491  int netdev_budget __read_mostly = 300;
4492  /* Must be at least 2 jiffies to guarantee 1 jiffy timeout */
4493  unsigned int __read_mostly netdev_budget_usecs = 2 * USEC_PER_SEC / HZ;
4494  int weight_p __read_mostly = 64;           /* old backlog weight */
4495  int dev_weight_rx_bias __read_mostly = 1;  /* bias for backlog weight */
4496  int dev_weight_tx_bias __read_mostly = 1;  /* bias for output_queue quota */
4497  int dev_rx_weight __read_mostly = 64;
4498  int dev_tx_weight __read_mostly = 64;
4499  
4500  /* Called with irq disabled */
4501  static inline void ____napi_schedule(struct softnet_data *sd,
4502  				     struct napi_struct *napi)
4503  {
4504  	struct task_struct *thread;
4505  
4506  	lockdep_assert_irqs_disabled();
4507  
4508  	if (test_bit(NAPI_STATE_THREADED, &napi->state)) {
4509  		/* Paired with smp_mb__before_atomic() in
4510  		 * napi_enable()/dev_set_threaded().
4511  		 * Use READ_ONCE() to guarantee a complete
4512  		 * read on napi->thread. Only call
4513  		 * wake_up_process() when it's not NULL.
4514  		 */
4515  		thread = READ_ONCE(napi->thread);
4516  		if (thread) {
4517  			/* Avoid doing set_bit() if the thread is in
4518  			 * INTERRUPTIBLE state, because napi_thread_wait()
4519  			 * makes sure to proceed with napi polling
4520  			 * if the thread is explicitly woken from here.
4521  			 */
4522  			if (READ_ONCE(thread->__state) != TASK_INTERRUPTIBLE)
4523  				set_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
4524  			wake_up_process(thread);
4525  			return;
4526  		}
4527  	}
4528  
4529  	list_add_tail(&napi->poll_list, &sd->poll_list);
4530  	WRITE_ONCE(napi->list_owner, smp_processor_id());
4531  	/* If not called from net_rx_action()
4532  	 * we have to raise NET_RX_SOFTIRQ.
4533  	 */
4534  	if (!sd->in_net_rx_action)
4535  		raise_softirq_irqoff(NET_RX_SOFTIRQ);
4536  }
4537  
4538  #ifdef CONFIG_RPS
4539  
4540  /* One global table that all flow-based protocols share. */
4541  struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
4542  EXPORT_SYMBOL(rps_sock_flow_table);
4543  u32 rps_cpu_mask __read_mostly;
4544  EXPORT_SYMBOL(rps_cpu_mask);
4545  
4546  struct static_key_false rps_needed __read_mostly;
4547  EXPORT_SYMBOL(rps_needed);
4548  struct static_key_false rfs_needed __read_mostly;
4549  EXPORT_SYMBOL(rfs_needed);
4550  
4551  static struct rps_dev_flow *
4552  set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
4553  	    struct rps_dev_flow *rflow, u16 next_cpu)
4554  {
4555  	if (next_cpu < nr_cpu_ids) {
4556  #ifdef CONFIG_RFS_ACCEL
4557  		struct netdev_rx_queue *rxqueue;
4558  		struct rps_dev_flow_table *flow_table;
4559  		struct rps_dev_flow *old_rflow;
4560  		u32 flow_id;
4561  		u16 rxq_index;
4562  		int rc;
4563  
4564  		/* Should we steer this flow to a different hardware queue? */
4565  		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
4566  		    !(dev->features & NETIF_F_NTUPLE))
4567  			goto out;
4568  		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
4569  		if (rxq_index == skb_get_rx_queue(skb))
4570  			goto out;
4571  
4572  		rxqueue = dev->_rx + rxq_index;
4573  		flow_table = rcu_dereference(rxqueue->rps_flow_table);
4574  		if (!flow_table)
4575  			goto out;
4576  		flow_id = skb_get_hash(skb) & flow_table->mask;
4577  		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
4578  							rxq_index, flow_id);
4579  		if (rc < 0)
4580  			goto out;
4581  		old_rflow = rflow;
4582  		rflow = &flow_table->flows[flow_id];
4583  		rflow->filter = rc;
4584  		if (old_rflow->filter == rflow->filter)
4585  			old_rflow->filter = RPS_NO_FILTER;
4586  	out:
4587  #endif
4588  		rflow->last_qtail =
4589  			per_cpu(softnet_data, next_cpu).input_queue_head;
4590  	}
4591  
4592  	rflow->cpu = next_cpu;
4593  	return rflow;
4594  }
4595  
4596  /*
4597   * get_rps_cpu is called from netif_receive_skb and returns the target
4598   * CPU from the RPS map of the receiving queue for a given skb.
4599   * rcu_read_lock must be held on entry.
4600   */
4601  static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
4602  		       struct rps_dev_flow **rflowp)
4603  {
4604  	const struct rps_sock_flow_table *sock_flow_table;
4605  	struct netdev_rx_queue *rxqueue = dev->_rx;
4606  	struct rps_dev_flow_table *flow_table;
4607  	struct rps_map *map;
4608  	int cpu = -1;
4609  	u32 tcpu;
4610  	u32 hash;
4611  
4612  	if (skb_rx_queue_recorded(skb)) {
4613  		u16 index = skb_get_rx_queue(skb);
4614  
4615  		if (unlikely(index >= dev->real_num_rx_queues)) {
4616  			WARN_ONCE(dev->real_num_rx_queues > 1,
4617  				  "%s received packet on queue %u, but number "
4618  				  "of RX queues is %u\n",
4619  				  dev->name, index, dev->real_num_rx_queues);
4620  			goto done;
4621  		}
4622  		rxqueue += index;
4623  	}
4624  
4625  	/* Avoid computing hash if RFS/RPS is not active for this rxqueue */
4626  
4627  	flow_table = rcu_dereference(rxqueue->rps_flow_table);
4628  	map = rcu_dereference(rxqueue->rps_map);
4629  	if (!flow_table && !map)
4630  		goto done;
4631  
4632  	skb_reset_network_header(skb);
4633  	hash = skb_get_hash(skb);
4634  	if (!hash)
4635  		goto done;
4636  
4637  	sock_flow_table = rcu_dereference(rps_sock_flow_table);
4638  	if (flow_table && sock_flow_table) {
4639  		struct rps_dev_flow *rflow;
4640  		u32 next_cpu;
4641  		u32 ident;
4642  
4643  		/* First check the global flow table to see if there is a match.
4644  		 * This READ_ONCE() pairs with WRITE_ONCE() from rps_record_sock_flow().
4645  		 */
4646  		ident = READ_ONCE(sock_flow_table->ents[hash & sock_flow_table->mask]);
4647  		if ((ident ^ hash) & ~rps_cpu_mask)
4648  			goto try_rps;
4649  
4650  		next_cpu = ident & rps_cpu_mask;
4651  
4652  		/* OK, now we know there is a match,
4653  		 * we can look at the local (per receive queue) flow table
4654  		 */
4655  		rflow = &flow_table->flows[hash & flow_table->mask];
4656  		tcpu = rflow->cpu;
4657  
4658  		/*
4659  		 * If the desired CPU (where last recvmsg was done) is
4660  		 * different from current CPU (one in the rx-queue flow
4661  		 * table entry), switch if one of the following holds:
4662  		 *   - Current CPU is unset (>= nr_cpu_ids).
4663  		 *   - Current CPU is offline.
4664  		 *   - The current CPU's queue tail has advanced beyond the
4665  		 *     last packet that was enqueued using this table entry.
4666  		 *     This guarantees that all previous packets for the flow
4667  		 *     have been dequeued, thus preserving in-order delivery.
4668  		 */
4669  		if (unlikely(tcpu != next_cpu) &&
4670  		    (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
4671  		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
4672  		      rflow->last_qtail)) >= 0)) {
4673  			tcpu = next_cpu;
4674  			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
4675  		}
4676  
4677  		if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
4678  			*rflowp = rflow;
4679  			cpu = tcpu;
4680  			goto done;
4681  		}
4682  	}
4683  
4684  try_rps:
4685  
4686  	if (map) {
4687  		tcpu = map->cpus[reciprocal_scale(hash, map->len)];
4688  		if (cpu_online(tcpu)) {
4689  			cpu = tcpu;
4690  			goto done;
4691  		}
4692  	}
4693  
4694  done:
4695  	return cpu;
4696  }
4697  
4698  #ifdef CONFIG_RFS_ACCEL
4699  
4700  /**
4701   * rps_may_expire_flow - check whether an RFS hardware filter may be removed
4702   * @dev: Device on which the filter was set
4703   * @rxq_index: RX queue index
4704   * @flow_id: Flow ID passed to ndo_rx_flow_steer()
4705   * @filter_id: Filter ID returned by ndo_rx_flow_steer()
4706   *
4707   * Drivers that implement ndo_rx_flow_steer() should periodically call
4708   * this function for each installed filter and remove the filters for
4709   * which it returns %true.
4710   */
4711  bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
4712  			 u32 flow_id, u16 filter_id)
4713  {
4714  	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
4715  	struct rps_dev_flow_table *flow_table;
4716  	struct rps_dev_flow *rflow;
4717  	bool expire = true;
4718  	unsigned int cpu;
4719  
4720  	rcu_read_lock();
4721  	flow_table = rcu_dereference(rxqueue->rps_flow_table);
4722  	if (flow_table && flow_id <= flow_table->mask) {
4723  		rflow = &flow_table->flows[flow_id];
4724  		cpu = READ_ONCE(rflow->cpu);
4725  		if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
4726  		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
4727  			   rflow->last_qtail) <
4728  		     (int)(10 * flow_table->mask)))
4729  			expire = false;
4730  	}
4731  	rcu_read_unlock();
4732  	return expire;
4733  }
4734  EXPORT_SYMBOL(rps_may_expire_flow);
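/* A minimal illustrative sketch, assuming a hypothetical driver: a driver
 * implementing ndo_rx_flow_steer() would periodically scan its installed
 * filters and tear down the ones this helper reports as expirable.  The
 * filter table and removal helper below are hypothetical:
 *
 *	for (i = 0; i < table->n_filters; i++) {
 *		struct example_filter *f = &table->filters[i];
 *
 *		if (f->installed &&
 *		    rps_may_expire_flow(ndev, f->rxq_index, f->flow_id, i)) {
 *			example_hw_remove_filter(table, i);
 *			f->installed = false;
 *		}
 *	}
 */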
4735  
4736  #endif /* CONFIG_RFS_ACCEL */
4737  
4738  /* Called from hardirq (IPI) context */
4739  static void rps_trigger_softirq(void *data)
4740  {
4741  	struct softnet_data *sd = data;
4742  
4743  	____napi_schedule(sd, &sd->backlog);
4744  	sd->received_rps++;
4745  }
4746  
4747  #endif /* CONFIG_RPS */
4748  
4749  /* Called from hardirq (IPI) context */
4750  static void trigger_rx_softirq(void *data)
4751  {
4752  	struct softnet_data *sd = data;
4753  
4754  	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
4755  	smp_store_release(&sd->defer_ipi_scheduled, 0);
4756  }
4757  
4758  /*
4759   * After we have queued a packet into sd->input_pkt_queue,
4760   * we need to make sure this queue is serviced soon.
4761   *
4762   * - If this is another cpu queue, link it to our rps_ipi_list,
4763   *   and make sure we will process rps_ipi_list from net_rx_action().
4764   *
4765   * - If this is our own queue, NAPI schedule our backlog.
4766   *   Note that this also raises NET_RX_SOFTIRQ.
4767   */
4768  static void napi_schedule_rps(struct softnet_data *sd)
4769  {
4770  	struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
4771  
4772  #ifdef CONFIG_RPS
4773  	if (sd != mysd) {
4774  		sd->rps_ipi_next = mysd->rps_ipi_list;
4775  		mysd->rps_ipi_list = sd;
4776  
4777  		/* If not called from net_rx_action() or napi_threaded_poll()
4778  		 * we have to raise NET_RX_SOFTIRQ.
4779  		 */
4780  		if (!mysd->in_net_rx_action && !mysd->in_napi_threaded_poll)
4781  			__raise_softirq_irqoff(NET_RX_SOFTIRQ);
4782  		return;
4783  	}
4784  #endif /* CONFIG_RPS */
4785  	__napi_schedule_irqoff(&mysd->backlog);
4786  }
4787  
4788  #ifdef CONFIG_NET_FLOW_LIMIT
4789  int netdev_flow_limit_table_len __read_mostly = (1 << 12);
4790  #endif
4791  
4792  static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
4793  {
4794  #ifdef CONFIG_NET_FLOW_LIMIT
4795  	struct sd_flow_limit *fl;
4796  	struct softnet_data *sd;
4797  	unsigned int old_flow, new_flow;
4798  
4799  	if (qlen < (READ_ONCE(netdev_max_backlog) >> 1))
4800  		return false;
4801  
4802  	sd = this_cpu_ptr(&softnet_data);
4803  
4804  	rcu_read_lock();
4805  	fl = rcu_dereference(sd->flow_limit);
4806  	if (fl) {
4807  		new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
4808  		old_flow = fl->history[fl->history_head];
4809  		fl->history[fl->history_head] = new_flow;
4810  
4811  		fl->history_head++;
4812  		fl->history_head &= FLOW_LIMIT_HISTORY - 1;
4813  
4814  		if (likely(fl->buckets[old_flow]))
4815  			fl->buckets[old_flow]--;
4816  
4817  		if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
4818  			fl->count++;
4819  			rcu_read_unlock();
4820  			return true;
4821  		}
4822  	}
4823  	rcu_read_unlock();
4824  #endif
4825  	return false;
4826  }
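/* Illustrative numbers (not from the original source): with the default
 * netdev_max_backlog of 1000 the limiter above only engages once the backlog
 * already holds at least 500 packets, and with a history of
 * FLOW_LIMIT_HISTORY (128) slots a single flow starts being dropped once it
 * owns more than half of them, i.e. more than 64 of the last 128 enqueued
 * packets.
 */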
4827  
4828  /*
4829   * enqueue_to_backlog is called to queue an skb to a per-CPU backlog
4830   * queue (may be a remote CPU queue).
4831   */
4832  static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
4833  			      unsigned int *qtail)
4834  {
4835  	enum skb_drop_reason reason;
4836  	struct softnet_data *sd;
4837  	unsigned long flags;
4838  	unsigned int qlen;
4839  
4840  	reason = SKB_DROP_REASON_NOT_SPECIFIED;
4841  	sd = &per_cpu(softnet_data, cpu);
4842  
4843  	rps_lock_irqsave(sd, &flags);
4844  	if (!netif_running(skb->dev))
4845  		goto drop;
4846  	qlen = skb_queue_len(&sd->input_pkt_queue);
4847  	if (qlen <= READ_ONCE(netdev_max_backlog) && !skb_flow_limit(skb, qlen)) {
4848  		if (qlen) {
4849  enqueue:
4850  			__skb_queue_tail(&sd->input_pkt_queue, skb);
4851  			input_queue_tail_incr_save(sd, qtail);
4852  			rps_unlock_irq_restore(sd, &flags);
4853  			return NET_RX_SUCCESS;
4854  		}
4855  
4856  		/* Schedule NAPI for the backlog device.
4857  		 * We can use a non-atomic operation since we own the queue lock.
4858  		 */
4859  		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state))
4860  			napi_schedule_rps(sd);
4861  		goto enqueue;
4862  	}
4863  	reason = SKB_DROP_REASON_CPU_BACKLOG;
4864  
4865  drop:
4866  	sd->dropped++;
4867  	rps_unlock_irq_restore(sd, &flags);
4868  
4869  	dev_core_stats_rx_dropped_inc(skb->dev);
4870  	kfree_skb_reason(skb, reason);
4871  	return NET_RX_DROP;
4872  }
4873  
4874  static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb)
4875  {
4876  	struct net_device *dev = skb->dev;
4877  	struct netdev_rx_queue *rxqueue;
4878  
4879  	rxqueue = dev->_rx;
4880  
4881  	if (skb_rx_queue_recorded(skb)) {
4882  		u16 index = skb_get_rx_queue(skb);
4883  
4884  		if (unlikely(index >= dev->real_num_rx_queues)) {
4885  			WARN_ONCE(dev->real_num_rx_queues > 1,
4886  				  "%s received packet on queue %u, but number "
4887  				  "of RX queues is %u\n",
4888  				  dev->name, index, dev->real_num_rx_queues);
4889  
4890  			return rxqueue; /* Return first rxqueue */
4891  		}
4892  		rxqueue += index;
4893  	}
4894  	return rxqueue;
4895  }
4896  
4897  u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
4898  			     struct bpf_prog *xdp_prog)
4899  {
4900  	void *orig_data, *orig_data_end, *hard_start;
4901  	struct netdev_rx_queue *rxqueue;
4902  	bool orig_bcast, orig_host;
4903  	u32 mac_len, frame_sz;
4904  	__be16 orig_eth_type;
4905  	struct ethhdr *eth;
4906  	u32 metalen, act;
4907  	int off;
4908  
4909  	/* The XDP program wants to see the packet starting at the MAC
4910  	 * header.
4911  	 */
4912  	mac_len = skb->data - skb_mac_header(skb);
4913  	hard_start = skb->data - skb_headroom(skb);
4914  
4915  	/* SKB "head" area always has tailroom for skb_shared_info */
4916  	frame_sz = (void *)skb_end_pointer(skb) - hard_start;
4917  	frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
4918  
4919  	rxqueue = netif_get_rxqueue(skb);
4920  	xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq);
4921  	xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len,
4922  			 skb_headlen(skb) + mac_len, true);
4923  
4924  	orig_data_end = xdp->data_end;
4925  	orig_data = xdp->data;
4926  	eth = (struct ethhdr *)xdp->data;
4927  	orig_host = ether_addr_equal_64bits(eth->h_dest, skb->dev->dev_addr);
4928  	orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest);
4929  	orig_eth_type = eth->h_proto;
4930  
4931  	act = bpf_prog_run_xdp(xdp_prog, xdp);
4932  
4933  	/* check if bpf_xdp_adjust_head was used */
4934  	off = xdp->data - orig_data;
4935  	if (off) {
4936  		if (off > 0)
4937  			__skb_pull(skb, off);
4938  		else if (off < 0)
4939  			__skb_push(skb, -off);
4940  
4941  		skb->mac_header += off;
4942  		skb_reset_network_header(skb);
4943  	}
4944  
4945  	/* check if bpf_xdp_adjust_tail was used */
4946  	off = xdp->data_end - orig_data_end;
4947  	if (off != 0) {
4948  		skb_set_tail_pointer(skb, xdp->data_end - xdp->data);
4949  		skb->len += off; /* positive on grow, negative on shrink */
4950  	}
4951  
4952  	/* check if XDP changed the eth hdr such that the SKB needs an update */
4953  	eth = (struct ethhdr *)xdp->data;
4954  	if ((orig_eth_type != eth->h_proto) ||
4955  	    (orig_host != ether_addr_equal_64bits(eth->h_dest,
4956  						  skb->dev->dev_addr)) ||
4957  	    (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) {
4958  		__skb_push(skb, ETH_HLEN);
4959  		skb->pkt_type = PACKET_HOST;
4960  		skb->protocol = eth_type_trans(skb, skb->dev);
4961  	}
4962  
4963  	/* Redirect/Tx gives an L2 packet; code that will reuse the skb must
4964  	 * __skb_pull it before calling us again on the redirect path. We do not
4965  	 * call do_redirect as we leave that up to the caller.
4966  	 *
4967  	 * Caller is responsible for managing lifetime of skb (i.e. calling
4968  	 * kfree_skb in response to actions it cannot handle/XDP_DROP).
4969  	 */
4970  	switch (act) {
4971  	case XDP_REDIRECT:
4972  	case XDP_TX:
4973  		__skb_push(skb, mac_len);
4974  		break;
4975  	case XDP_PASS:
4976  		metalen = xdp->data - xdp->data_meta;
4977  		if (metalen)
4978  			skb_metadata_set(skb, metalen);
4979  		break;
4980  	}
4981  
4982  	return act;
4983  }
4984  
4985  static u32 netif_receive_generic_xdp(struct sk_buff *skb,
4986  				     struct xdp_buff *xdp,
4987  				     struct bpf_prog *xdp_prog)
4988  {
4989  	u32 act = XDP_DROP;
4990  
4991  	/* Reinjected packets coming from act_mirred or similar should
4992  	 * not get XDP generic processing.
4993  	 */
4994  	if (skb_is_redirected(skb))
4995  		return XDP_PASS;
4996  
4997  	/* XDP packets must be linear and must have sufficient headroom
4998  	 * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also
4999  	 * native XDP provides, thus we need to do it here as well.
5000  	 */
5001  	if (skb_cloned(skb) || skb_is_nonlinear(skb) ||
5002  	    skb_headroom(skb) < XDP_PACKET_HEADROOM) {
5003  		int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
5004  		int troom = skb->tail + skb->data_len - skb->end;
5005  
5006  		/* In case we have to go down that path and also linearize,
5007  		 * let's do the pskb_expand_head() work just once here.
5008  		 */
5009  		if (pskb_expand_head(skb,
5010  				     hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
5011  				     troom > 0 ? troom + 128 : 0, GFP_ATOMIC))
5012  			goto do_drop;
5013  		if (skb_linearize(skb))
5014  			goto do_drop;
5015  	}
5016  
5017  	act = bpf_prog_run_generic_xdp(skb, xdp, xdp_prog);
5018  	switch (act) {
5019  	case XDP_REDIRECT:
5020  	case XDP_TX:
5021  	case XDP_PASS:
5022  		break;
5023  	default:
5024  		bpf_warn_invalid_xdp_action(skb->dev, xdp_prog, act);
5025  		fallthrough;
5026  	case XDP_ABORTED:
5027  		trace_xdp_exception(skb->dev, xdp_prog, act);
5028  		fallthrough;
5029  	case XDP_DROP:
5030  	do_drop:
5031  		kfree_skb(skb);
5032  		break;
5033  	}
5034  
5035  	return act;
5036  }
5037  
5038  /* When doing generic XDP we have to bypass the qdisc layer and the
5039   * network taps in order to match in-driver-XDP behavior. This also means
5040   * that XDP packets are able to starve other packets going through a qdisc,
5041   * and DDOS attacks will be more effective. In-driver-XDP use dedicated TX
5042   * queues, so they do not have this starvation issue.
5043   */
5044  void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
5045  {
5046  	struct net_device *dev = skb->dev;
5047  	struct netdev_queue *txq;
5048  	bool free_skb = true;
5049  	int cpu, rc;
5050  
5051  	txq = netdev_core_pick_tx(dev, skb, NULL);
5052  	cpu = smp_processor_id();
5053  	HARD_TX_LOCK(dev, txq, cpu);
5054  	if (!netif_xmit_frozen_or_drv_stopped(txq)) {
5055  		rc = netdev_start_xmit(skb, dev, txq, 0);
5056  		if (dev_xmit_complete(rc))
5057  			free_skb = false;
5058  	}
5059  	HARD_TX_UNLOCK(dev, txq);
5060  	if (free_skb) {
5061  		trace_xdp_exception(dev, xdp_prog, XDP_TX);
5062  		dev_core_stats_tx_dropped_inc(dev);
5063  		kfree_skb(skb);
5064  	}
5065  }
5066  
5067  static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key);
5068  
5069  int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb)
5070  {
5071  	if (xdp_prog) {
5072  		struct xdp_buff xdp;
5073  		u32 act;
5074  		int err;
5075  
5076  		act = netif_receive_generic_xdp(skb, &xdp, xdp_prog);
5077  		if (act != XDP_PASS) {
5078  			switch (act) {
5079  			case XDP_REDIRECT:
5080  				err = xdp_do_generic_redirect(skb->dev, skb,
5081  							      &xdp, xdp_prog);
5082  				if (err)
5083  					goto out_redir;
5084  				break;
5085  			case XDP_TX:
5086  				generic_xdp_tx(skb, xdp_prog);
5087  				break;
5088  			}
5089  			return XDP_DROP;
5090  		}
5091  	}
5092  	return XDP_PASS;
5093  out_redir:
5094  	kfree_skb_reason(skb, SKB_DROP_REASON_XDP);
5095  	return XDP_DROP;
5096  }
5097  EXPORT_SYMBOL_GPL(do_xdp_generic);
5098  
5099  static int netif_rx_internal(struct sk_buff *skb)
5100  {
5101  	int ret;
5102  
5103  	net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb);
5104  
5105  	trace_netif_rx(skb);
5106  
5107  #ifdef CONFIG_RPS
5108  	if (static_branch_unlikely(&rps_needed)) {
5109  		struct rps_dev_flow voidflow, *rflow = &voidflow;
5110  		int cpu;
5111  
5112  		rcu_read_lock();
5113  
5114  		cpu = get_rps_cpu(skb->dev, skb, &rflow);
5115  		if (cpu < 0)
5116  			cpu = smp_processor_id();
5117  
5118  		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
5119  
5120  		rcu_read_unlock();
5121  	} else
5122  #endif
5123  	{
5124  		unsigned int qtail;
5125  
5126  		ret = enqueue_to_backlog(skb, smp_processor_id(), &qtail);
5127  	}
5128  	return ret;
5129  }
5130  
5131  /**
5132   *	__netif_rx	-	Slightly optimized version of netif_rx
5133   *	@skb: buffer to post
5134   *
5135   *	This behaves as netif_rx except that it does not disable bottom halves.
5136   *	As a result this function may only be invoked from the interrupt context
5137   *	(either hard or soft interrupt).
5138   */
5139  int __netif_rx(struct sk_buff *skb)
5140  {
5141  	int ret;
5142  
5143  	lockdep_assert_once(hardirq_count() | softirq_count());
5144  
5145  	trace_netif_rx_entry(skb);
5146  	ret = netif_rx_internal(skb);
5147  	trace_netif_rx_exit(ret);
5148  	return ret;
5149  }
5150  EXPORT_SYMBOL(__netif_rx);
5151  
5152  /**
5153   *	netif_rx	-	post buffer to the network code
5154   *	@skb: buffer to post
5155   *
5156   *	This function receives a packet from a device driver and queues it for
5157   *	the upper (protocol) levels to process via the backlog NAPI device. It
5158   *	always succeeds. The buffer may be dropped during processing for
5159   *	congestion control or by the protocol layers.
5160   *	The network buffer is passed via the backlog NAPI device. Modern NIC
5161   *	drivers should use NAPI and GRO.
5162   *	This function can be used from interrupt and from process context. The
5163   *	caller from process context must not disable interrupts before invoking
5164   *	this function.
5165   *
5166   *	return values:
5167   *	NET_RX_SUCCESS	(no congestion)
5168   *	NET_RX_DROP     (packet was dropped)
5169   *
5170   */
5171  int netif_rx(struct sk_buff *skb)
5172  {
5173  	bool need_bh_off = !(hardirq_count() | softirq_count());
5174  	int ret;
5175  
5176  	if (need_bh_off)
5177  		local_bh_disable();
5178  	trace_netif_rx_entry(skb);
5179  	ret = netif_rx_internal(skb);
5180  	trace_netif_rx_exit(ret);
5181  	if (need_bh_off)
5182  		local_bh_enable();
5183  	return ret;
5184  }
5185  EXPORT_SYMBOL(netif_rx);
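
/* Example: a minimal sketch of how a legacy, non-NAPI driver might hand a
 * received frame to the stack with netif_rx() from its hard interrupt
 * handler.  The "foo" device and its foo_read_frame() helper are
 * hypothetical; modern drivers should use NAPI and napi_gro_receive()
 * instead.
 *
 *	static irqreturn_t foo_interrupt(int irq, void *dev_id)
 *	{
 *		struct net_device *dev = dev_id;
 *		struct sk_buff *skb = foo_read_frame(dev);
 *
 *		if (!skb)
 *			return IRQ_NONE;
 *		skb->protocol = eth_type_trans(skb, dev);
 *		netif_rx(skb);
 *		return IRQ_HANDLED;
 *	}
 */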
5186  
5187  static __latent_entropy void net_tx_action(struct softirq_action *h)
5188  {
5189  	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
5190  
5191  	if (sd->completion_queue) {
5192  		struct sk_buff *clist;
5193  
5194  		local_irq_disable();
5195  		clist = sd->completion_queue;
5196  		sd->completion_queue = NULL;
5197  		local_irq_enable();
5198  
5199  		while (clist) {
5200  			struct sk_buff *skb = clist;
5201  
5202  			clist = clist->next;
5203  
5204  			WARN_ON(refcount_read(&skb->users));
5205  			if (likely(get_kfree_skb_cb(skb)->reason == SKB_CONSUMED))
5206  				trace_consume_skb(skb, net_tx_action);
5207  			else
5208  				trace_kfree_skb(skb, net_tx_action,
5209  						get_kfree_skb_cb(skb)->reason);
5210  
5211  			if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
5212  				__kfree_skb(skb);
5213  			else
5214  				__napi_kfree_skb(skb,
5215  						 get_kfree_skb_cb(skb)->reason);
5216  		}
5217  	}
5218  
5219  	if (sd->output_queue) {
5220  		struct Qdisc *head;
5221  
5222  		local_irq_disable();
5223  		head = sd->output_queue;
5224  		sd->output_queue = NULL;
5225  		sd->output_queue_tailp = &sd->output_queue;
5226  		local_irq_enable();
5227  
5228  		rcu_read_lock();
5229  
5230  		while (head) {
5231  			struct Qdisc *q = head;
5232  			spinlock_t *root_lock = NULL;
5233  
5234  			head = head->next_sched;
5235  
5236  			/* We need to make sure head->next_sched is read
5237  			 * before clearing __QDISC_STATE_SCHED
5238  			 */
5239  			smp_mb__before_atomic();
5240  
5241  			if (!(q->flags & TCQ_F_NOLOCK)) {
5242  				root_lock = qdisc_lock(q);
5243  				spin_lock(root_lock);
5244  			} else if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED,
5245  						     &q->state))) {
5246  				/* There is a synchronize_net() between
5247  				 * STATE_DEACTIVATED flag being set and
5248  				 * qdisc_reset()/some_qdisc_is_busy() in
5249  				 * dev_deactivate(), so we can safely bail out
5250  				 * early here to avoid data race between
5251  				 * qdisc_deactivate() and some_qdisc_is_busy()
5252  				 * for lockless qdisc.
5253  				 */
5254  				clear_bit(__QDISC_STATE_SCHED, &q->state);
5255  				continue;
5256  			}
5257  
5258  			clear_bit(__QDISC_STATE_SCHED, &q->state);
5259  			qdisc_run(q);
5260  			if (root_lock)
5261  				spin_unlock(root_lock);
5262  		}
5263  
5264  		rcu_read_unlock();
5265  	}
5266  
5267  	xfrm_dev_backlog(sd);
5268  }
5269  
5270  #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
5271  /* This hook is defined here for ATM LANE */
5272  int (*br_fdb_test_addr_hook)(struct net_device *dev,
5273  			     unsigned char *addr) __read_mostly;
5274  EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
5275  #endif
5276  
5277  /**
5278   *	netdev_is_rx_handler_busy - check if receive handler is registered
5279   *	@dev: device to check
5280   *
5281   *	Check if a receive handler is already registered for a given device.
5282   *	Return true if there one.
5283   *
5284   *	The caller must hold the rtnl_mutex.
5285   */
5286  bool netdev_is_rx_handler_busy(struct net_device *dev)
5287  {
5288  	ASSERT_RTNL();
5289  	return dev && rtnl_dereference(dev->rx_handler);
5290  }
5291  EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
5292  
5293  /**
5294   *	netdev_rx_handler_register - register receive handler
5295   *	@dev: device to register a handler for
5296   *	@rx_handler: receive handler to register
5297   *	@rx_handler_data: data pointer that is used by rx handler
5298   *
5299   *	Register a receive handler for a device. This handler will then be
5300   *	called from __netif_receive_skb. A negative errno code is returned
5301   *	on a failure.
5302   *
5303   *	The caller must hold the rtnl_mutex.
5304   *
5305   *	For a general description of rx_handler, see enum rx_handler_result.
5306   */
5307  int netdev_rx_handler_register(struct net_device *dev,
5308  			       rx_handler_func_t *rx_handler,
5309  			       void *rx_handler_data)
5310  {
5311  	if (netdev_is_rx_handler_busy(dev))
5312  		return -EBUSY;
5313  
5314  	if (dev->priv_flags & IFF_NO_RX_HANDLER)
5315  		return -EINVAL;
5316  
5317  	/* Note: rx_handler_data must be set before rx_handler */
5318  	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
5319  	rcu_assign_pointer(dev->rx_handler, rx_handler);
5320  
5321  	return 0;
5322  }
5323  EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
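
/* Example: a minimal sketch of the registration pattern for a hypothetical
 * "foo" upper device claiming one of its ports.  foo_handle_frame() and
 * struct foo_port are placeholders; real users such as bridge, bonding and
 * team follow the same shape under rtnl_lock().
 *
 *	static int foo_port_attach(struct net_device *port_dev,
 *				   struct foo_port *port)
 *	{
 *		int err;
 *
 *		ASSERT_RTNL();
 *		err = netdev_rx_handler_register(port_dev, foo_handle_frame,
 *						 port);
 *		if (err)
 *			return err;
 *		port->dev = port_dev;
 *		return 0;
 *	}
 */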
5324  
5325  /**
5326   *	netdev_rx_handler_unregister - unregister receive handler
5327   *	@dev: device to unregister a handler from
5328   *
5329   *	Unregister a receive handler from a device.
5330   *
5331   *	The caller must hold the rtnl_mutex.
5332   */
5333  void netdev_rx_handler_unregister(struct net_device *dev)
5334  {
5335  
5336  	ASSERT_RTNL();
5337  	RCU_INIT_POINTER(dev->rx_handler, NULL);
5338  	/* A reader seeing a non-NULL rx_handler in an rcu_read_lock()
5339  	 * section is guaranteed to also see a non-NULL rx_handler_data
5340  	 * as well.
5341  	 */
5342  	synchronize_net();
5343  	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
5344  }
5345  EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
5346  
5347  /*
5348   * Limit the use of PFMEMALLOC reserves to those protocols that implement
5349   * the special handling of PFMEMALLOC skbs.
5350   */
5351  static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
5352  {
5353  	switch (skb->protocol) {
5354  	case htons(ETH_P_ARP):
5355  	case htons(ETH_P_IP):
5356  	case htons(ETH_P_IPV6):
5357  	case htons(ETH_P_8021Q):
5358  	case htons(ETH_P_8021AD):
5359  		return true;
5360  	default:
5361  		return false;
5362  	}
5363  }
5364  
5365  static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
5366  			     int *ret, struct net_device *orig_dev)
5367  {
5368  	if (nf_hook_ingress_active(skb)) {
5369  		int ingress_retval;
5370  
5371  		if (*pt_prev) {
5372  			*ret = deliver_skb(skb, *pt_prev, orig_dev);
5373  			*pt_prev = NULL;
5374  		}
5375  
5376  		rcu_read_lock();
5377  		ingress_retval = nf_hook_ingress(skb);
5378  		rcu_read_unlock();
5379  		return ingress_retval;
5380  	}
5381  	return 0;
5382  }
5383  
5384  static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
5385  				    struct packet_type **ppt_prev)
5386  {
5387  	struct packet_type *ptype, *pt_prev;
5388  	rx_handler_func_t *rx_handler;
5389  	struct sk_buff *skb = *pskb;
5390  	struct net_device *orig_dev;
5391  	bool deliver_exact = false;
5392  	int ret = NET_RX_DROP;
5393  	__be16 type;
5394  
5395  	net_timestamp_check(!READ_ONCE(netdev_tstamp_prequeue), skb);
5396  
5397  	trace_netif_receive_skb(skb);
5398  
5399  	orig_dev = skb->dev;
5400  
5401  	skb_reset_network_header(skb);
5402  	if (!skb_transport_header_was_set(skb))
5403  		skb_reset_transport_header(skb);
5404  	skb_reset_mac_len(skb);
5405  
5406  	pt_prev = NULL;
5407  
5408  another_round:
5409  	skb->skb_iif = skb->dev->ifindex;
5410  
5411  	__this_cpu_inc(softnet_data.processed);
5412  
5413  	if (static_branch_unlikely(&generic_xdp_needed_key)) {
5414  		int ret2;
5415  
5416  		migrate_disable();
5417  		ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb);
5418  		migrate_enable();
5419  
5420  		if (ret2 != XDP_PASS) {
5421  			ret = NET_RX_DROP;
5422  			goto out;
5423  		}
5424  	}
5425  
5426  	if (eth_type_vlan(skb->protocol)) {
5427  		skb = skb_vlan_untag(skb);
5428  		if (unlikely(!skb))
5429  			goto out;
5430  	}
5431  
5432  	if (skb_skip_tc_classify(skb))
5433  		goto skip_classify;
5434  
5435  	if (pfmemalloc)
5436  		goto skip_taps;
5437  
5438  	list_for_each_entry_rcu(ptype, &ptype_all, list) {
5439  		if (pt_prev)
5440  			ret = deliver_skb(skb, pt_prev, orig_dev);
5441  		pt_prev = ptype;
5442  	}
5443  
5444  	list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
5445  		if (pt_prev)
5446  			ret = deliver_skb(skb, pt_prev, orig_dev);
5447  		pt_prev = ptype;
5448  	}
5449  
5450  skip_taps:
5451  #ifdef CONFIG_NET_INGRESS
5452  	if (static_branch_unlikely(&ingress_needed_key)) {
5453  		bool another = false;
5454  
5455  		nf_skip_egress(skb, true);
5456  		skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev,
5457  					 &another);
5458  		if (another)
5459  			goto another_round;
5460  		if (!skb)
5461  			goto out;
5462  
5463  		nf_skip_egress(skb, false);
5464  		if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
5465  			goto out;
5466  	}
5467  #endif
5468  	skb_reset_redirect(skb);
5469  skip_classify:
5470  	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
5471  		goto drop;
5472  
5473  	if (skb_vlan_tag_present(skb)) {
5474  		if (pt_prev) {
5475  			ret = deliver_skb(skb, pt_prev, orig_dev);
5476  			pt_prev = NULL;
5477  		}
5478  		if (vlan_do_receive(&skb))
5479  			goto another_round;
5480  		else if (unlikely(!skb))
5481  			goto out;
5482  	}
5483  
5484  	rx_handler = rcu_dereference(skb->dev->rx_handler);
5485  	if (rx_handler) {
5486  		if (pt_prev) {
5487  			ret = deliver_skb(skb, pt_prev, orig_dev);
5488  			pt_prev = NULL;
5489  		}
5490  		switch (rx_handler(&skb)) {
5491  		case RX_HANDLER_CONSUMED:
5492  			ret = NET_RX_SUCCESS;
5493  			goto out;
5494  		case RX_HANDLER_ANOTHER:
5495  			goto another_round;
5496  		case RX_HANDLER_EXACT:
5497  			deliver_exact = true;
5498  			break;
5499  		case RX_HANDLER_PASS:
5500  			break;
5501  		default:
5502  			BUG();
5503  		}
5504  	}
5505  
5506  	if (unlikely(skb_vlan_tag_present(skb)) && !netdev_uses_dsa(skb->dev)) {
5507  check_vlan_id:
5508  		if (skb_vlan_tag_get_id(skb)) {
5509  			/* Vlan id is non 0 and vlan_do_receive() above couldn't
5510  			 * find vlan device.
5511  			 */
5512  			skb->pkt_type = PACKET_OTHERHOST;
5513  		} else if (eth_type_vlan(skb->protocol)) {
5514  			/* Outer header is 802.1P with vlan 0, inner header is
5515  			 * 802.1Q or 802.1AD and vlan_do_receive() above could
5516  			 * not find vlan dev for vlan id 0.
5517  			 */
5518  			__vlan_hwaccel_clear_tag(skb);
5519  			skb = skb_vlan_untag(skb);
5520  			if (unlikely(!skb))
5521  				goto out;
5522  			if (vlan_do_receive(&skb))
5523  				/* After stripping off 802.1P header with vlan 0
5524  				 * vlan dev is found for inner header.
5525  				 */
5526  				goto another_round;
5527  			else if (unlikely(!skb))
5528  				goto out;
5529  			else
5530  				/* We have stripped outer 802.1P vlan 0 header.
5531  				 * But could not find vlan dev.
5532  				 * check again for vlan id to set OTHERHOST.
5533  				 */
5534  				goto check_vlan_id;
5535  		}
5536  		/* Note: we might in the future use prio bits
5537  		 * and set skb->priority like in vlan_do_receive().
5538  		 * For the time being, just ignore the Priority Code Point.
5539  		 */
5540  		__vlan_hwaccel_clear_tag(skb);
5541  	}
5542  
5543  	type = skb->protocol;
5544  
5545  	/* deliver only exact match when indicated */
5546  	if (likely(!deliver_exact)) {
5547  		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
5548  				       &ptype_base[ntohs(type) &
5549  						   PTYPE_HASH_MASK]);
5550  	}
5551  
5552  	deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
5553  			       &orig_dev->ptype_specific);
5554  
5555  	if (unlikely(skb->dev != orig_dev)) {
5556  		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
5557  				       &skb->dev->ptype_specific);
5558  	}
5559  
5560  	if (pt_prev) {
5561  		if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
5562  			goto drop;
5563  		*ppt_prev = pt_prev;
5564  	} else {
5565  drop:
5566  		if (!deliver_exact)
5567  			dev_core_stats_rx_dropped_inc(skb->dev);
5568  		else
5569  			dev_core_stats_rx_nohandler_inc(skb->dev);
5570  		kfree_skb_reason(skb, SKB_DROP_REASON_UNHANDLED_PROTO);
5571  		/* Jamal, now you will not be able to escape explaining
5572  		 * to me how you were going to use this. :-)
5573  		 */
5574  		ret = NET_RX_DROP;
5575  	}
5576  
5577  out:
5578  	/* The invariant here is that if *ppt_prev is not NULL
5579  	 * then skb should also be non-NULL.
5580  	 *
5581  	 * The *ppt_prev assignment above appears to uphold this invariant due
5582  	 * to the skb dereference near it.
5583  	 */
5584  	*pskb = skb;
5585  	return ret;
5586  }
5587  
5588  static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc)
5589  {
5590  	struct net_device *orig_dev = skb->dev;
5591  	struct packet_type *pt_prev = NULL;
5592  	int ret;
5593  
5594  	ret = __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
5595  	if (pt_prev)
5596  		ret = INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb,
5597  					 skb->dev, pt_prev, orig_dev);
5598  	return ret;
5599  }
5600  
5601  /**
5602   *	netif_receive_skb_core - special purpose version of netif_receive_skb
5603   *	@skb: buffer to process
5604   *
5605   *	More direct receive version of netif_receive_skb().  It should
5606   *	only be used by callers that have a need to skip RPS and Generic XDP.
5607   *	Caller must also take care of handling if ``(page_is_)pfmemalloc``.
5608   *
5609   *	This function may only be called from softirq context and interrupts
5610   *	should be enabled.
5611   *
5612   *	Return values (usually ignored):
5613   *	NET_RX_SUCCESS: no congestion
5614   *	NET_RX_DROP: packet was dropped
5615   */
5616  int netif_receive_skb_core(struct sk_buff *skb)
5617  {
5618  	int ret;
5619  
5620  	rcu_read_lock();
5621  	ret = __netif_receive_skb_one_core(skb, false);
5622  	rcu_read_unlock();
5623  
5624  	return ret;
5625  }
5626  EXPORT_SYMBOL(netif_receive_skb_core);
5627  
5628  static inline void __netif_receive_skb_list_ptype(struct list_head *head,
5629  						  struct packet_type *pt_prev,
5630  						  struct net_device *orig_dev)
5631  {
5632  	struct sk_buff *skb, *next;
5633  
5634  	if (!pt_prev)
5635  		return;
5636  	if (list_empty(head))
5637  		return;
5638  	if (pt_prev->list_func != NULL)
5639  		INDIRECT_CALL_INET(pt_prev->list_func, ipv6_list_rcv,
5640  				   ip_list_rcv, head, pt_prev, orig_dev);
5641  	else
5642  		list_for_each_entry_safe(skb, next, head, list) {
5643  			skb_list_del_init(skb);
5644  			pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
5645  		}
5646  }
5647  
5648  static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc)
5649  {
5650  	/* Fast-path assumptions:
5651  	 * - There is no RX handler.
5652  	 * - Only one packet_type matches.
5653  	 * If either of these fails, we will end up doing some per-packet
5654  	 * processing in-line, then handling the 'last ptype' for the whole
5655  	 * sublist.  This can't cause out-of-order delivery to any single ptype,
5656  	 * because the 'last ptype' must be constant across the sublist, and all
5657  	 * other ptypes are handled per-packet.
5658  	 */
5659  	/* Current (common) ptype of sublist */
5660  	struct packet_type *pt_curr = NULL;
5661  	/* Current (common) orig_dev of sublist */
5662  	struct net_device *od_curr = NULL;
5663  	struct list_head sublist;
5664  	struct sk_buff *skb, *next;
5665  
5666  	INIT_LIST_HEAD(&sublist);
5667  	list_for_each_entry_safe(skb, next, head, list) {
5668  		struct net_device *orig_dev = skb->dev;
5669  		struct packet_type *pt_prev = NULL;
5670  
5671  		skb_list_del_init(skb);
5672  		__netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
5673  		if (!pt_prev)
5674  			continue;
5675  		if (pt_curr != pt_prev || od_curr != orig_dev) {
5676  			/* dispatch old sublist */
5677  			__netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
5678  			/* start new sublist */
5679  			INIT_LIST_HEAD(&sublist);
5680  			pt_curr = pt_prev;
5681  			od_curr = orig_dev;
5682  		}
5683  		list_add_tail(&skb->list, &sublist);
5684  	}
5685  
5686  	/* dispatch final sublist */
5687  	__netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
5688  }
5689  
5690  static int __netif_receive_skb(struct sk_buff *skb)
5691  {
5692  	int ret;
5693  
5694  	if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
5695  		unsigned int noreclaim_flag;
5696  
5697  		/*
5698  		 * PFMEMALLOC skbs are special, they should
5699  		 * - be delivered to SOCK_MEMALLOC sockets only
5700  		 * - stay away from userspace
5701  		 * - have bounded memory usage
5702  		 *
5703  		 * Use PF_MEMALLOC as this saves us from propagating the allocation
5704  		 * context down to all allocation sites.
5705  		 */
5706  		noreclaim_flag = memalloc_noreclaim_save();
5707  		ret = __netif_receive_skb_one_core(skb, true);
5708  		memalloc_noreclaim_restore(noreclaim_flag);
5709  	} else
5710  		ret = __netif_receive_skb_one_core(skb, false);
5711  
5712  	return ret;
5713  }
5714  
5715  static void __netif_receive_skb_list(struct list_head *head)
5716  {
5717  	unsigned long noreclaim_flag = 0;
5718  	struct sk_buff *skb, *next;
5719  	bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? */
5720  
5721  	list_for_each_entry_safe(skb, next, head, list) {
5722  		if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) {
5723  			struct list_head sublist;
5724  
5725  			/* Handle the previous sublist */
5726  			list_cut_before(&sublist, head, &skb->list);
5727  			if (!list_empty(&sublist))
5728  				__netif_receive_skb_list_core(&sublist, pfmemalloc);
5729  			pfmemalloc = !pfmemalloc;
5730  			/* See comments in __netif_receive_skb */
5731  			if (pfmemalloc)
5732  				noreclaim_flag = memalloc_noreclaim_save();
5733  			else
5734  				memalloc_noreclaim_restore(noreclaim_flag);
5735  		}
5736  	}
5737  	/* Handle the remaining sublist */
5738  	if (!list_empty(head))
5739  		__netif_receive_skb_list_core(head, pfmemalloc);
5740  	/* Restore pflags */
5741  	if (pfmemalloc)
5742  		memalloc_noreclaim_restore(noreclaim_flag);
5743  }
5744  
5745  static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
5746  {
5747  	struct bpf_prog *old = rtnl_dereference(dev->xdp_prog);
5748  	struct bpf_prog *new = xdp->prog;
5749  	int ret = 0;
5750  
5751  	switch (xdp->command) {
5752  	case XDP_SETUP_PROG:
5753  		rcu_assign_pointer(dev->xdp_prog, new);
5754  		if (old)
5755  			bpf_prog_put(old);
5756  
5757  		if (old && !new) {
5758  			static_branch_dec(&generic_xdp_needed_key);
5759  		} else if (new && !old) {
5760  			static_branch_inc(&generic_xdp_needed_key);
5761  			dev_disable_lro(dev);
5762  			dev_disable_gro_hw(dev);
5763  		}
5764  		break;
5765  
5766  	default:
5767  		ret = -EINVAL;
5768  		break;
5769  	}
5770  
5771  	return ret;
5772  }
5773  
5774  static int netif_receive_skb_internal(struct sk_buff *skb)
5775  {
5776  	int ret;
5777  
5778  	net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb);
5779  
5780  	if (skb_defer_rx_timestamp(skb))
5781  		return NET_RX_SUCCESS;
5782  
5783  	rcu_read_lock();
5784  #ifdef CONFIG_RPS
5785  	if (static_branch_unlikely(&rps_needed)) {
5786  		struct rps_dev_flow voidflow, *rflow = &voidflow;
5787  		int cpu = get_rps_cpu(skb->dev, skb, &rflow);
5788  
5789  		if (cpu >= 0) {
5790  			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
5791  			rcu_read_unlock();
5792  			return ret;
5793  		}
5794  	}
5795  #endif
5796  	ret = __netif_receive_skb(skb);
5797  	rcu_read_unlock();
5798  	return ret;
5799  }
5800  
5801  void netif_receive_skb_list_internal(struct list_head *head)
5802  {
5803  	struct sk_buff *skb, *next;
5804  	struct list_head sublist;
5805  
5806  	INIT_LIST_HEAD(&sublist);
5807  	list_for_each_entry_safe(skb, next, head, list) {
5808  		net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb);
5809  		skb_list_del_init(skb);
5810  		if (!skb_defer_rx_timestamp(skb))
5811  			list_add_tail(&skb->list, &sublist);
5812  	}
5813  	list_splice_init(&sublist, head);
5814  
5815  	rcu_read_lock();
5816  #ifdef CONFIG_RPS
5817  	if (static_branch_unlikely(&rps_needed)) {
5818  		list_for_each_entry_safe(skb, next, head, list) {
5819  			struct rps_dev_flow voidflow, *rflow = &voidflow;
5820  			int cpu = get_rps_cpu(skb->dev, skb, &rflow);
5821  
5822  			if (cpu >= 0) {
5823  				/* Will be handled, remove from list */
5824  				skb_list_del_init(skb);
5825  				enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
5826  			}
5827  		}
5828  	}
5829  #endif
5830  	__netif_receive_skb_list(head);
5831  	rcu_read_unlock();
5832  }
5833  
5834  /**
5835   *	netif_receive_skb - process receive buffer from network
5836   *	@skb: buffer to process
5837   *
5838   *	netif_receive_skb() is the main receive data processing function.
5839   *	It always succeeds. The buffer may be dropped during processing
5840   *	for congestion control or by the protocol layers.
5841   *
5842   *	This function may only be called from softirq context and interrupts
5843   *	should be enabled.
5844   *
5845   *	Return values (usually ignored):
5846   *	NET_RX_SUCCESS: no congestion
5847   *	NET_RX_DROP: packet was dropped
5848   */
5849  int netif_receive_skb(struct sk_buff *skb)
5850  {
5851  	int ret;
5852  
5853  	trace_netif_receive_skb_entry(skb);
5854  
5855  	ret = netif_receive_skb_internal(skb);
5856  	trace_netif_receive_skb_exit(ret);
5857  
5858  	return ret;
5859  }
5860  EXPORT_SYMBOL(netif_receive_skb);
5861  
5862  /**
5863   *	netif_receive_skb_list - process many receive buffers from network
5864   *	@head: list of skbs to process.
5865   *
5866   *	Since return value of netif_receive_skb() is normally ignored, and
5867   *	wouldn't be meaningful for a list, this function returns void.
5868   *
5869   *	This function may only be called from softirq context and interrupts
5870   *	should be enabled.
5871   */
5872  void netif_receive_skb_list(struct list_head *head)
5873  {
5874  	struct sk_buff *skb;
5875  
5876  	if (list_empty(head))
5877  		return;
5878  	if (trace_netif_receive_skb_list_entry_enabled()) {
5879  		list_for_each_entry(skb, head, list)
5880  			trace_netif_receive_skb_list_entry(skb);
5881  	}
5882  	netif_receive_skb_list_internal(head);
5883  	trace_netif_receive_skb_list_exit(0);
5884  }
5885  EXPORT_SYMBOL(netif_receive_skb_list);
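
/* Example: a minimal sketch of batched delivery from a hypothetical driver
 * completion loop; skbs are collected on a local list and handed to the
 * stack in one call, which lets the listified receive path amortize
 * per-packet costs.  foo_next_completed_skb() and "ring" are placeholders.
 *
 *	LIST_HEAD(rx_list);
 *	struct sk_buff *skb;
 *
 *	while ((skb = foo_next_completed_skb(ring)) != NULL) {
 *		skb->protocol = eth_type_trans(skb, ring->netdev);
 *		list_add_tail(&skb->list, &rx_list);
 *	}
 *	netif_receive_skb_list(&rx_list);
 */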
5886  
5887  static DEFINE_PER_CPU(struct work_struct, flush_works);
5888  
5889  /* Network device is going away, flush any packets still pending */
5890  static void flush_backlog(struct work_struct *work)
5891  {
5892  	struct sk_buff *skb, *tmp;
5893  	struct softnet_data *sd;
5894  
5895  	local_bh_disable();
5896  	sd = this_cpu_ptr(&softnet_data);
5897  
5898  	rps_lock_irq_disable(sd);
5899  	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
5900  		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
5901  			__skb_unlink(skb, &sd->input_pkt_queue);
5902  			dev_kfree_skb_irq(skb);
5903  			input_queue_head_incr(sd);
5904  		}
5905  	}
5906  	rps_unlock_irq_enable(sd);
5907  
5908  	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
5909  		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
5910  			__skb_unlink(skb, &sd->process_queue);
5911  			kfree_skb(skb);
5912  			input_queue_head_incr(sd);
5913  		}
5914  	}
5915  	local_bh_enable();
5916  }
5917  
5918  static bool flush_required(int cpu)
5919  {
5920  #if IS_ENABLED(CONFIG_RPS)
5921  	struct softnet_data *sd = &per_cpu(softnet_data, cpu);
5922  	bool do_flush;
5923  
5924  	rps_lock_irq_disable(sd);
5925  
5926  	/* as insertion into process_queue happens with the rps lock held,
5927  	 * process_queue access may race only with dequeue
5928  	 */
5929  	do_flush = !skb_queue_empty(&sd->input_pkt_queue) ||
5930  		   !skb_queue_empty_lockless(&sd->process_queue);
5931  	rps_unlock_irq_enable(sd);
5932  
5933  	return do_flush;
5934  #endif
5935  	/* without RPS we can't safely check input_pkt_queue: during a
5936  	 * concurrent remote skb_queue_splice() we can detect as empty both
5937  	 * input_pkt_queue and process_queue even if the latter could end-up
5938  	 * containing a lot of packets.
5939  	 */
5940  	return true;
5941  }
5942  
5943  static void flush_all_backlogs(void)
5944  {
5945  	static cpumask_t flush_cpus;
5946  	unsigned int cpu;
5947  
5948  	/* since we are under rtnl lock protection we can use static data
5949  	 * for the cpumask and avoid allocating on stack the possibly
5950  	 * large mask
5951  	 */
5952  	ASSERT_RTNL();
5953  
5954  	cpus_read_lock();
5955  
5956  	cpumask_clear(&flush_cpus);
5957  	for_each_online_cpu(cpu) {
5958  		if (flush_required(cpu)) {
5959  			queue_work_on(cpu, system_highpri_wq,
5960  				      per_cpu_ptr(&flush_works, cpu));
5961  			cpumask_set_cpu(cpu, &flush_cpus);
5962  		}
5963  	}
5964  
5965  	/* We can have in-flight packets on the cpus we are not flushing;
5966  	 * synchronize_net() in unregister_netdevice_many() will take care of
5967  	 * them.
5968  	 */
5969  	for_each_cpu(cpu, &flush_cpus)
5970  		flush_work(per_cpu_ptr(&flush_works, cpu));
5971  
5972  	cpus_read_unlock();
5973  }
5974  
5975  static void net_rps_send_ipi(struct softnet_data *remsd)
5976  {
5977  #ifdef CONFIG_RPS
5978  	while (remsd) {
5979  		struct softnet_data *next = remsd->rps_ipi_next;
5980  
5981  		if (cpu_online(remsd->cpu))
5982  			smp_call_function_single_async(remsd->cpu, &remsd->csd);
5983  		remsd = next;
5984  	}
5985  #endif
5986  }
5987  
5988  /*
5989   * net_rps_action_and_irq_enable sends any pending IPI's for rps.
5990   * Note: called with local irq disabled, but exits with local irq enabled.
5991   */
5992  static void net_rps_action_and_irq_enable(struct softnet_data *sd)
5993  {
5994  #ifdef CONFIG_RPS
5995  	struct softnet_data *remsd = sd->rps_ipi_list;
5996  
5997  	if (remsd) {
5998  		sd->rps_ipi_list = NULL;
5999  
6000  		local_irq_enable();
6001  
6002  		/* Send pending IPI's to kick RPS processing on remote cpus. */
6003  		net_rps_send_ipi(remsd);
6004  	} else
6005  #endif
6006  		local_irq_enable();
6007  }
6008  
6009  static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
6010  {
6011  #ifdef CONFIG_RPS
6012  	return sd->rps_ipi_list != NULL;
6013  #else
6014  	return false;
6015  #endif
6016  }
6017  
6018  static int process_backlog(struct napi_struct *napi, int quota)
6019  {
6020  	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
6021  	bool again = true;
6022  	int work = 0;
6023  
6024  	/* Check if we have pending IPIs; it's better to send them now
6025  	 * rather than waiting for net_rx_action() to end.
6026  	 */
6027  	if (sd_has_rps_ipi_waiting(sd)) {
6028  		local_irq_disable();
6029  		net_rps_action_and_irq_enable(sd);
6030  	}
6031  
6032  	napi->weight = READ_ONCE(dev_rx_weight);
6033  	while (again) {
6034  		struct sk_buff *skb;
6035  
6036  		while ((skb = __skb_dequeue(&sd->process_queue))) {
6037  			rcu_read_lock();
6038  			__netif_receive_skb(skb);
6039  			rcu_read_unlock();
6040  			input_queue_head_incr(sd);
6041  			if (++work >= quota)
6042  				return work;
6043  
6044  		}
6045  
6046  		rps_lock_irq_disable(sd);
6047  		if (skb_queue_empty(&sd->input_pkt_queue)) {
6048  			/*
6049  			 * Inline a custom version of __napi_complete().
6050  			 * Only the current cpu owns and manipulates this napi,
6051  			 * and NAPI_STATE_SCHED is the only possible flag set
6052  			 * on backlog.
6053  			 * We can use a plain write instead of clear_bit(),
6054  			 * and we don't need an smp_mb() memory barrier.
6055  			 */
6056  			napi->state = 0;
6057  			again = false;
6058  		} else {
6059  			skb_queue_splice_tail_init(&sd->input_pkt_queue,
6060  						   &sd->process_queue);
6061  		}
6062  		rps_unlock_irq_enable(sd);
6063  	}
6064  
6065  	return work;
6066  }
6067  
6068  /**
6069   * __napi_schedule - schedule for receive
6070   * @n: entry to schedule
6071   *
6072   * The entry's receive function will be scheduled to run.
6073   * Consider using __napi_schedule_irqoff() if hard irqs are masked.
6074   */
6075  void __napi_schedule(struct napi_struct *n)
6076  {
6077  	unsigned long flags;
6078  
6079  	local_irq_save(flags);
6080  	____napi_schedule(this_cpu_ptr(&softnet_data), n);
6081  	local_irq_restore(flags);
6082  }
6083  EXPORT_SYMBOL(__napi_schedule);
6084  
6085  /**
6086   *	napi_schedule_prep - check if napi can be scheduled
6087   *	@n: napi context
6088   *
6089   * Test if NAPI routine is already running, and if not mark
6090   * it as running.  This is used as a condition variable to
6091   * ensure only one NAPI poll instance runs.  We also make
6092   * sure there is no pending NAPI disable.
6093   */
6094  bool napi_schedule_prep(struct napi_struct *n)
6095  {
6096  	unsigned long new, val = READ_ONCE(n->state);
6097  
6098  	do {
6099  		if (unlikely(val & NAPIF_STATE_DISABLE))
6100  			return false;
6101  		new = val | NAPIF_STATE_SCHED;
6102  
6103  		/* Sets STATE_MISSED bit if STATE_SCHED was already set
6104  		 * This was suggested by Alexander Duyck, as compiler
6105  		 * emits better code than :
6106  		 * if (val & NAPIF_STATE_SCHED)
6107  		 *     new |= NAPIF_STATE_MISSED;
6108  		 */
6109  		new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED *
6110  						   NAPIF_STATE_MISSED;
6111  	} while (!try_cmpxchg(&n->state, &val, new));
6112  
6113  	return !(val & NAPIF_STATE_SCHED);
6114  }
6115  EXPORT_SYMBOL(napi_schedule_prep);
6116  
6117  /**
6118   * __napi_schedule_irqoff - schedule for receive
6119   * @n: entry to schedule
6120   *
6121   * Variant of __napi_schedule() assuming hard irqs are masked.
6122   *
6123   * On PREEMPT_RT enabled kernels this maps to __napi_schedule()
6124   * because the interrupt disabled assumption might not be true
6125   * due to force-threaded interrupts and spinlock substitution.
6126   */
6127  void __napi_schedule_irqoff(struct napi_struct *n)
6128  {
6129  	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
6130  		____napi_schedule(this_cpu_ptr(&softnet_data), n);
6131  	else
6132  		__napi_schedule(n);
6133  }
6134  EXPORT_SYMBOL(__napi_schedule_irqoff);
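
/* Example: the usual scheduling pattern from a hypothetical driver's MSI-X
 * interrupt handler; interrupts for the queue are masked first, then NAPI
 * is scheduled.  The napi_schedule_irqoff() wrapper in netdevice.h performs
 * exactly this prep + __napi_schedule_irqoff() pair.  The "foo" names are
 * placeholders.
 *
 *	static irqreturn_t foo_msix_handler(int irq, void *data)
 *	{
 *		struct foo_rx_ring *ring = data;
 *
 *		foo_disable_queue_irq(ring);
 *		if (napi_schedule_prep(&ring->napi))
 *			__napi_schedule_irqoff(&ring->napi);
 *		return IRQ_HANDLED;
 *	}
 */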
6135  
6136  bool napi_complete_done(struct napi_struct *n, int work_done)
6137  {
6138  	unsigned long flags, val, new, timeout = 0;
6139  	bool ret = true;
6140  
6141  	/*
6142  	 * 1) Don't let napi dequeue from the cpu poll list
6143  	 *    just in case it's running on a different cpu.
6144  	 * 2) If we are busy polling, do nothing here, we have
6145  	 *    the guarantee we will be called later.
6146  	 */
6147  	if (unlikely(n->state & (NAPIF_STATE_NPSVC |
6148  				 NAPIF_STATE_IN_BUSY_POLL)))
6149  		return false;
6150  
6151  	if (work_done) {
6152  		if (n->gro_bitmask)
6153  			timeout = READ_ONCE(n->dev->gro_flush_timeout);
6154  		n->defer_hard_irqs_count = READ_ONCE(n->dev->napi_defer_hard_irqs);
6155  	}
6156  	if (n->defer_hard_irqs_count > 0) {
6157  		n->defer_hard_irqs_count--;
6158  		timeout = READ_ONCE(n->dev->gro_flush_timeout);
6159  		if (timeout)
6160  			ret = false;
6161  	}
6162  	if (n->gro_bitmask) {
6163  		/* When the NAPI instance uses a timeout and keeps postponing
6164  	 * it, we need to somehow bound the time packets are kept in
6165  	 * the GRO layer.
6166  		 */
6167  		napi_gro_flush(n, !!timeout);
6168  	}
6169  
6170  	gro_normal_list(n);
6171  
6172  	if (unlikely(!list_empty(&n->poll_list))) {
6173  		/* If n->poll_list is not empty, we need to mask irqs */
6174  		local_irq_save(flags);
6175  		list_del_init(&n->poll_list);
6176  		local_irq_restore(flags);
6177  	}
6178  	WRITE_ONCE(n->list_owner, -1);
6179  
6180  	val = READ_ONCE(n->state);
6181  	do {
6182  		WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
6183  
6184  		new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED |
6185  			      NAPIF_STATE_SCHED_THREADED |
6186  			      NAPIF_STATE_PREFER_BUSY_POLL);
6187  
6188  		/* If STATE_MISSED was set, leave STATE_SCHED set,
6189  		 * because we will call napi->poll() one more time.
6190  		 * This C code was suggested by Alexander Duyck to help gcc.
6191  		 */
6192  		new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
6193  						    NAPIF_STATE_SCHED;
6194  	} while (!try_cmpxchg(&n->state, &val, new));
6195  
6196  	if (unlikely(val & NAPIF_STATE_MISSED)) {
6197  		__napi_schedule(n);
6198  		return false;
6199  	}
6200  
6201  	if (timeout)
6202  		hrtimer_start(&n->timer, ns_to_ktime(timeout),
6203  			      HRTIMER_MODE_REL_PINNED);
6204  	return ret;
6205  }
6206  EXPORT_SYMBOL(napi_complete_done);
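
/* Example: a minimal sketch of a hypothetical driver poll callback built
 * around napi_complete_done().  Queue interrupts are only re-enabled when
 * the budget was not exhausted and the core agreed to complete the NAPI
 * context.  foo_clean_rx() and foo_enable_queue_irq() are placeholders.
 *
 *	static int foo_poll(struct napi_struct *napi, int budget)
 *	{
 *		struct foo_rx_ring *ring =
 *			container_of(napi, struct foo_rx_ring, napi);
 *		int work_done = foo_clean_rx(ring, budget);
 *
 *		if (work_done < budget &&
 *		    napi_complete_done(napi, work_done))
 *			foo_enable_queue_irq(ring);
 *		return work_done;
 *	}
 */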
6207  
6208  /* must be called under rcu_read_lock(), as we don't take a reference */
6209  static struct napi_struct *napi_by_id(unsigned int napi_id)
6210  {
6211  	unsigned int hash = napi_id % HASH_SIZE(napi_hash);
6212  	struct napi_struct *napi;
6213  
6214  	hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
6215  		if (napi->napi_id == napi_id)
6216  			return napi;
6217  
6218  	return NULL;
6219  }
6220  
6221  #if defined(CONFIG_NET_RX_BUSY_POLL)
6222  
6223  static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule)
6224  {
6225  	if (!skip_schedule) {
6226  		gro_normal_list(napi);
6227  		__napi_schedule(napi);
6228  		return;
6229  	}
6230  
6231  	if (napi->gro_bitmask) {
6232  		/* flush too old packets
6233  		 * If HZ < 1000, flush all packets.
6234  		 */
6235  		napi_gro_flush(napi, HZ >= 1000);
6236  	}
6237  
6238  	gro_normal_list(napi);
6239  	clear_bit(NAPI_STATE_SCHED, &napi->state);
6240  }
6241  
6242  static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool prefer_busy_poll,
6243  			   u16 budget)
6244  {
6245  	bool skip_schedule = false;
6246  	unsigned long timeout;
6247  	int rc;
6248  
6249  	/* Busy polling means there is a high chance device driver hard irq
6250  	 * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was
6251  	 * set in napi_schedule_prep().
6252  	 * Since we are about to call napi->poll() once more, we can safely
6253  	 * clear NAPI_STATE_MISSED.
6254  	 *
6255  	 * Note: x86 could use a single "lock and ..." instruction
6256  	 * to perform these two clear_bit()
6257  	 */
6258  	clear_bit(NAPI_STATE_MISSED, &napi->state);
6259  	clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
6260  
6261  	local_bh_disable();
6262  
6263  	if (prefer_busy_poll) {
6264  		napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs);
6265  		timeout = READ_ONCE(napi->dev->gro_flush_timeout);
6266  		if (napi->defer_hard_irqs_count && timeout) {
6267  			hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED);
6268  			skip_schedule = true;
6269  		}
6270  	}
6271  
6272  	/* All we really want here is to re-enable device interrupts.
6273  	 * Ideally, a new ndo_busy_poll_stop() could avoid another round.
6274  	 */
6275  	rc = napi->poll(napi, budget);
6276  	/* We can't gro_normal_list() here, because napi->poll() might have
6277  	 * rearmed the napi (napi_complete_done()) in which case it could
6278  	 * already be running on another CPU.
6279  	 */
6280  	trace_napi_poll(napi, rc, budget);
6281  	netpoll_poll_unlock(have_poll_lock);
6282  	if (rc == budget)
6283  		__busy_poll_stop(napi, skip_schedule);
6284  	local_bh_enable();
6285  }
6286  
6287  void napi_busy_loop(unsigned int napi_id,
6288  		    bool (*loop_end)(void *, unsigned long),
6289  		    void *loop_end_arg, bool prefer_busy_poll, u16 budget)
6290  {
6291  	unsigned long start_time = loop_end ? busy_loop_current_time() : 0;
6292  	int (*napi_poll)(struct napi_struct *napi, int budget);
6293  	void *have_poll_lock = NULL;
6294  	struct napi_struct *napi;
6295  
6296  restart:
6297  	napi_poll = NULL;
6298  
6299  	rcu_read_lock();
6300  
6301  	napi = napi_by_id(napi_id);
6302  	if (!napi)
6303  		goto out;
6304  
6305  	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
6306  		preempt_disable();
6307  	for (;;) {
6308  		int work = 0;
6309  
6310  		local_bh_disable();
6311  		if (!napi_poll) {
6312  			unsigned long val = READ_ONCE(napi->state);
6313  
6314  			/* If multiple threads are competing for this napi,
6315  			 * we avoid dirtying napi->state as much as we can.
6316  			 */
6317  			if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
6318  				   NAPIF_STATE_IN_BUSY_POLL)) {
6319  				if (prefer_busy_poll)
6320  					set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
6321  				goto count;
6322  			}
6323  			if (cmpxchg(&napi->state, val,
6324  				    val | NAPIF_STATE_IN_BUSY_POLL |
6325  					  NAPIF_STATE_SCHED) != val) {
6326  				if (prefer_busy_poll)
6327  					set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
6328  				goto count;
6329  			}
6330  			have_poll_lock = netpoll_poll_lock(napi);
6331  			napi_poll = napi->poll;
6332  		}
6333  		work = napi_poll(napi, budget);
6334  		trace_napi_poll(napi, work, budget);
6335  		gro_normal_list(napi);
6336  count:
6337  		if (work > 0)
6338  			__NET_ADD_STATS(dev_net(napi->dev),
6339  					LINUX_MIB_BUSYPOLLRXPACKETS, work);
6340  		local_bh_enable();
6341  
6342  		if (!loop_end || loop_end(loop_end_arg, start_time))
6343  			break;
6344  
6345  		if (unlikely(need_resched())) {
6346  			if (napi_poll)
6347  				busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget);
6348  			if (!IS_ENABLED(CONFIG_PREEMPT_RT))
6349  				preempt_enable();
6350  			rcu_read_unlock();
6351  			cond_resched();
6352  			if (loop_end(loop_end_arg, start_time))
6353  				return;
6354  			goto restart;
6355  		}
6356  		cpu_relax();
6357  	}
6358  	if (napi_poll)
6359  		busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget);
6360  	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
6361  		preempt_enable();
6362  out:
6363  	rcu_read_unlock();
6364  }
6365  EXPORT_SYMBOL(napi_busy_loop);
6366  
6367  #endif /* CONFIG_NET_RX_BUSY_POLL */
6368  
6369  static void napi_hash_add(struct napi_struct *napi)
6370  {
6371  	if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state))
6372  		return;
6373  
6374  	spin_lock(&napi_hash_lock);
6375  
6376  	/* 0..NR_CPUS range is reserved for sender_cpu use */
6377  	do {
6378  		if (unlikely(++napi_gen_id < MIN_NAPI_ID))
6379  			napi_gen_id = MIN_NAPI_ID;
6380  	} while (napi_by_id(napi_gen_id));
6381  	napi->napi_id = napi_gen_id;
6382  
6383  	hlist_add_head_rcu(&napi->napi_hash_node,
6384  			   &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
6385  
6386  	spin_unlock(&napi_hash_lock);
6387  }
6388  
6389  /* Warning: the caller is responsible for making sure an rcu grace period
6390   * is respected before freeing the memory containing @napi
6391   */
6392  static void napi_hash_del(struct napi_struct *napi)
6393  {
6394  	spin_lock(&napi_hash_lock);
6395  
6396  	hlist_del_init_rcu(&napi->napi_hash_node);
6397  
6398  	spin_unlock(&napi_hash_lock);
6399  }
6400  
6401  static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
6402  {
6403  	struct napi_struct *napi;
6404  
6405  	napi = container_of(timer, struct napi_struct, timer);
6406  
6407  	/* Note : we use a relaxed variant of napi_schedule_prep() not setting
6408  	 * NAPI_STATE_MISSED, since we do not react to a device IRQ.
6409  	 */
6410  	if (!napi_disable_pending(napi) &&
6411  	    !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) {
6412  		clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
6413  		__napi_schedule_irqoff(napi);
6414  	}
6415  
6416  	return HRTIMER_NORESTART;
6417  }
6418  
6419  static void init_gro_hash(struct napi_struct *napi)
6420  {
6421  	int i;
6422  
6423  	for (i = 0; i < GRO_HASH_BUCKETS; i++) {
6424  		INIT_LIST_HEAD(&napi->gro_hash[i].list);
6425  		napi->gro_hash[i].count = 0;
6426  	}
6427  	napi->gro_bitmask = 0;
6428  }
6429  
6430  int dev_set_threaded(struct net_device *dev, bool threaded)
6431  {
6432  	struct napi_struct *napi;
6433  	int err = 0;
6434  
6435  	if (dev->threaded == threaded)
6436  		return 0;
6437  
6438  	if (threaded) {
6439  		list_for_each_entry(napi, &dev->napi_list, dev_list) {
6440  			if (!napi->thread) {
6441  				err = napi_kthread_create(napi);
6442  				if (err) {
6443  					threaded = false;
6444  					break;
6445  				}
6446  			}
6447  		}
6448  	}
6449  
6450  	dev->threaded = threaded;
6451  
6452  	/* Make sure kthread is created before THREADED bit
6453  	 * is set.
6454  	 */
6455  	smp_mb__before_atomic();
6456  
6457  	/* Setting/unsetting threaded mode on a napi might not immediately
6458  	 * take effect, if the current napi instance is actively being
6459  	 * polled. In this case, the switch between threaded mode and
6460  	 * softirq mode will happen in the next round of napi_schedule().
6461  	 * This should not cause hiccups/stalls to the live traffic.
6462  	 */
6463  	list_for_each_entry(napi, &dev->napi_list, dev_list)
6464  		assign_bit(NAPI_STATE_THREADED, &napi->state, threaded);
6465  
6466  	return err;
6467  }
6468  EXPORT_SYMBOL(dev_set_threaded);
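
/* Example: a hypothetical driver opting all of its NAPI contexts into
 * threaded polling from a path that already holds rtnl (e.g. ndo_open());
 * userspace can toggle the same setting through
 * /sys/class/net/<iface>/threaded.
 *
 *	dev_set_threaded(dev, true);
 */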
6469  
6470  void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
6471  			   int (*poll)(struct napi_struct *, int), int weight)
6472  {
6473  	if (WARN_ON(test_and_set_bit(NAPI_STATE_LISTED, &napi->state)))
6474  		return;
6475  
6476  	INIT_LIST_HEAD(&napi->poll_list);
6477  	INIT_HLIST_NODE(&napi->napi_hash_node);
6478  	hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
6479  	napi->timer.function = napi_watchdog;
6480  	init_gro_hash(napi);
6481  	napi->skb = NULL;
6482  	INIT_LIST_HEAD(&napi->rx_list);
6483  	napi->rx_count = 0;
6484  	napi->poll = poll;
6485  	if (weight > NAPI_POLL_WEIGHT)
6486  		netdev_err_once(dev, "%s() called with weight %d\n", __func__,
6487  				weight);
6488  	napi->weight = weight;
6489  	napi->dev = dev;
6490  #ifdef CONFIG_NETPOLL
6491  	napi->poll_owner = -1;
6492  #endif
6493  	napi->list_owner = -1;
6494  	set_bit(NAPI_STATE_SCHED, &napi->state);
6495  	set_bit(NAPI_STATE_NPSVC, &napi->state);
6496  	list_add_rcu(&napi->dev_list, &dev->napi_list);
6497  	napi_hash_add(napi);
6498  	napi_get_frags_check(napi);
6499  	/* Create kthread for this napi if dev->threaded is set.
6500  	 * Clear dev->threaded if kthread creation failed so that
6501  	 * threaded mode will not be enabled in napi_enable().
6502  	 */
6503  	if (dev->threaded && napi_kthread_create(napi))
6504  		dev->threaded = 0;
6505  }
6506  EXPORT_SYMBOL(netif_napi_add_weight);
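
/* Example: hooking a hypothetical foo_poll() callback up at probe time.
 * Most drivers call the netif_napi_add() wrapper, which supplies
 * NAPI_POLL_WEIGHT as the weight; the context is then armed with
 * napi_enable() once the hardware is ready.  "ring" is a placeholder.
 *
 *	netif_napi_add_weight(dev, &ring->napi, foo_poll, NAPI_POLL_WEIGHT);
 *	napi_enable(&ring->napi);
 */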
6507  
6508  void napi_disable(struct napi_struct *n)
6509  {
6510  	unsigned long val, new;
6511  
6512  	might_sleep();
6513  	set_bit(NAPI_STATE_DISABLE, &n->state);
6514  
6515  	val = READ_ONCE(n->state);
6516  	do {
6517  		while (val & (NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC)) {
6518  			usleep_range(20, 200);
6519  			val = READ_ONCE(n->state);
6520  		}
6521  
6522  		new = val | NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC;
6523  		new &= ~(NAPIF_STATE_THREADED | NAPIF_STATE_PREFER_BUSY_POLL);
6524  	} while (!try_cmpxchg(&n->state, &val, new));
6525  
6526  	hrtimer_cancel(&n->timer);
6527  
6528  	clear_bit(NAPI_STATE_DISABLE, &n->state);
6529  }
6530  EXPORT_SYMBOL(napi_disable);
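
/* Example: the matching teardown order in a hypothetical driver close or
 * remove path: stop the hardware from raising new interrupts, disable the
 * NAPI context so no poll is in flight, then delete it.  The "foo" and
 * "ring" names are placeholders.
 *
 *	foo_disable_queue_irq(ring);
 *	napi_disable(&ring->napi);
 *	netif_napi_del(&ring->napi);
 */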
6531  
6532  /**
6533   *	napi_enable - enable NAPI scheduling
6534   *	@n: NAPI context
6535   *
6536   * Resume NAPI from being scheduled on this context.
6537   * Must be paired with napi_disable.
6538   */
6539  void napi_enable(struct napi_struct *n)
6540  {
6541  	unsigned long new, val = READ_ONCE(n->state);
6542  
6543  	do {
6544  		BUG_ON(!test_bit(NAPI_STATE_SCHED, &val));
6545  
6546  		new = val & ~(NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC);
6547  		if (n->dev->threaded && n->thread)
6548  			new |= NAPIF_STATE_THREADED;
6549  	} while (!try_cmpxchg(&n->state, &val, new));
6550  }
6551  EXPORT_SYMBOL(napi_enable);
6552  
6553  static void flush_gro_hash(struct napi_struct *napi)
6554  {
6555  	int i;
6556  
6557  	for (i = 0; i < GRO_HASH_BUCKETS; i++) {
6558  		struct sk_buff *skb, *n;
6559  
6560  		list_for_each_entry_safe(skb, n, &napi->gro_hash[i].list, list)
6561  			kfree_skb(skb);
6562  		napi->gro_hash[i].count = 0;
6563  	}
6564  }
6565  
6566  /* Must be called in process context */
6567  void __netif_napi_del(struct napi_struct *napi)
6568  {
6569  	if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state))
6570  		return;
6571  
6572  	napi_hash_del(napi);
6573  	list_del_rcu(&napi->dev_list);
6574  	napi_free_frags(napi);
6575  
6576  	flush_gro_hash(napi);
6577  	napi->gro_bitmask = 0;
6578  
6579  	if (napi->thread) {
6580  		kthread_stop(napi->thread);
6581  		napi->thread = NULL;
6582  	}
6583  }
6584  EXPORT_SYMBOL(__netif_napi_del);
6585  
6586  static int __napi_poll(struct napi_struct *n, bool *repoll)
6587  {
6588  	int work, weight;
6589  
6590  	weight = n->weight;
6591  
6592  	/* This NAPI_STATE_SCHED test is for avoiding a race
6593  	 * with netpoll's poll_napi().  Only the entity which
6594  	 * obtains the lock and sees NAPI_STATE_SCHED set will
6595  	 * actually make the ->poll() call.  Therefore we avoid
6596  	 * accidentally calling ->poll() when NAPI is not scheduled.
6597  	 */
6598  	work = 0;
6599  	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
6600  		work = n->poll(n, weight);
6601  		trace_napi_poll(n, work, weight);
6602  	}
6603  
6604  	if (unlikely(work > weight))
6605  		netdev_err_once(n->dev, "NAPI poll function %pS returned %d, exceeding its budget of %d.\n",
6606  				n->poll, work, weight);
6607  
6608  	if (likely(work < weight))
6609  		return work;
6610  
6611  	/* Drivers must not modify the NAPI state if they
6612  	 * consume the entire weight.  In such cases this code
6613  	 * still "owns" the NAPI instance and therefore can
6614  	 * move the instance around on the list at-will.
6615  	 */
6616  	if (unlikely(napi_disable_pending(n))) {
6617  		napi_complete(n);
6618  		return work;
6619  	}
6620  
6621  	/* The NAPI context has more processing work, but busy-polling
6622  	 * is preferred. Exit early.
6623  	 */
6624  	if (napi_prefer_busy_poll(n)) {
6625  		if (napi_complete_done(n, work)) {
6626  			/* If timeout is not set, we need to make sure
6627  			 * that the NAPI is re-scheduled.
6628  			 */
6629  			napi_schedule(n);
6630  		}
6631  		return work;
6632  	}
6633  
6634  	if (n->gro_bitmask) {
6635  		/* flush too old packets
6636  		 * If HZ < 1000, flush all packets.
6637  		 */
6638  		napi_gro_flush(n, HZ >= 1000);
6639  	}
6640  
6641  	gro_normal_list(n);
6642  
6643  	/* Some drivers may have called napi_schedule
6644  	 * prior to exhausting their budget.
6645  	 */
6646  	if (unlikely(!list_empty(&n->poll_list))) {
6647  		pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
6648  			     n->dev ? n->dev->name : "backlog");
6649  		return work;
6650  	}
6651  
6652  	*repoll = true;
6653  
6654  	return work;
6655  }
6656  
6657  static int napi_poll(struct napi_struct *n, struct list_head *repoll)
6658  {
6659  	bool do_repoll = false;
6660  	void *have;
6661  	int work;
6662  
6663  	list_del_init(&n->poll_list);
6664  
6665  	have = netpoll_poll_lock(n);
6666  
6667  	work = __napi_poll(n, &do_repoll);
6668  
6669  	if (do_repoll)
6670  		list_add_tail(&n->poll_list, repoll);
6671  
6672  	netpoll_poll_unlock(have);
6673  
6674  	return work;
6675  }
6676  
6677  static int napi_thread_wait(struct napi_struct *napi)
6678  {
6679  	bool woken = false;
6680  
6681  	set_current_state(TASK_INTERRUPTIBLE);
6682  
6683  	while (!kthread_should_stop()) {
6684  		/* Testing SCHED_THREADED bit here to make sure the current
6685  		 * kthread owns this napi and could poll on this napi.
6686  		 * Testing SCHED bit is not enough because SCHED bit might be
6687  		 * set by some other busy poll thread or by napi_disable().
6688  		 */
6689  		if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state) || woken) {
6690  			WARN_ON(!list_empty(&napi->poll_list));
6691  			__set_current_state(TASK_RUNNING);
6692  			return 0;
6693  		}
6694  
6695  		schedule();
6696  		/* woken being true indicates this thread owns this napi. */
6697  		woken = true;
6698  		set_current_state(TASK_INTERRUPTIBLE);
6699  	}
6700  	__set_current_state(TASK_RUNNING);
6701  
6702  	return -1;
6703  }
6704  
6705  static void skb_defer_free_flush(struct softnet_data *sd)
6706  {
6707  	struct sk_buff *skb, *next;
6708  
6709  	/* Paired with WRITE_ONCE() in skb_attempt_defer_free() */
6710  	if (!READ_ONCE(sd->defer_list))
6711  		return;
6712  
6713  	spin_lock(&sd->defer_lock);
6714  	skb = sd->defer_list;
6715  	sd->defer_list = NULL;
6716  	sd->defer_count = 0;
6717  	spin_unlock(&sd->defer_lock);
6718  
6719  	while (skb != NULL) {
6720  		next = skb->next;
6721  		napi_consume_skb(skb, 1);
6722  		skb = next;
6723  	}
6724  }
6725  
6726  static int napi_threaded_poll(void *data)
6727  {
6728  	struct napi_struct *napi = data;
6729  	struct softnet_data *sd;
6730  	void *have;
6731  
6732  	while (!napi_thread_wait(napi)) {
6733  		unsigned long last_qs = jiffies;
6734  
6735  		for (;;) {
6736  			bool repoll = false;
6737  
6738  			local_bh_disable();
6739  			sd = this_cpu_ptr(&softnet_data);
6740  			sd->in_napi_threaded_poll = true;
6741  
6742  			have = netpoll_poll_lock(napi);
6743  			__napi_poll(napi, &repoll);
6744  			netpoll_poll_unlock(have);
6745  
6746  			sd->in_napi_threaded_poll = false;
6747  			barrier();
6748  
6749  			if (sd_has_rps_ipi_waiting(sd)) {
6750  				local_irq_disable();
6751  				net_rps_action_and_irq_enable(sd);
6752  			}
6753  			skb_defer_free_flush(sd);
6754  			local_bh_enable();
6755  
6756  			if (!repoll)
6757  				break;
6758  
6759  			rcu_softirq_qs_periodic(last_qs);
6760  			cond_resched();
6761  		}
6762  	}
6763  	return 0;
6764  }
6765  
6766  static __latent_entropy void net_rx_action(struct softirq_action *h)
6767  {
6768  	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
6769  	unsigned long time_limit = jiffies +
6770  		usecs_to_jiffies(READ_ONCE(netdev_budget_usecs));
6771  	int budget = READ_ONCE(netdev_budget);
6772  	LIST_HEAD(list);
6773  	LIST_HEAD(repoll);
6774  
6775  start:
6776  	sd->in_net_rx_action = true;
6777  	local_irq_disable();
6778  	list_splice_init(&sd->poll_list, &list);
6779  	local_irq_enable();
6780  
6781  	for (;;) {
6782  		struct napi_struct *n;
6783  
6784  		skb_defer_free_flush(sd);
6785  
6786  		if (list_empty(&list)) {
6787  			if (list_empty(&repoll)) {
6788  				sd->in_net_rx_action = false;
6789  				barrier();
6790  				/* We need to check if ____napi_schedule()
6791  				 * had refilled poll_list while
6792  				 * sd->in_net_rx_action was true.
6793  				 */
6794  				if (!list_empty(&sd->poll_list))
6795  					goto start;
6796  				if (!sd_has_rps_ipi_waiting(sd))
6797  					goto end;
6798  			}
6799  			break;
6800  		}
6801  
6802  		n = list_first_entry(&list, struct napi_struct, poll_list);
6803  		budget -= napi_poll(n, &repoll);
6804  
6805  		/* If softirq window is exhausted then punt.
6806  		 * Allow this to run for up to 2 jiffies, which gives
6807  		 * an average latency of 1.5/HZ.
6808  		 */
6809  		if (unlikely(budget <= 0 ||
6810  			     time_after_eq(jiffies, time_limit))) {
6811  			sd->time_squeeze++;
6812  			break;
6813  		}
6814  	}
6815  
6816  	local_irq_disable();
6817  
6818  	list_splice_tail_init(&sd->poll_list, &list);
6819  	list_splice_tail(&repoll, &list);
6820  	list_splice(&list, &sd->poll_list);
6821  	if (!list_empty(&sd->poll_list))
6822  		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
6823  	else
6824  		sd->in_net_rx_action = false;
6825  
6826  	net_rps_action_and_irq_enable(sd);
6827  end:;
6828  }
6829  
6830  struct netdev_adjacent {
6831  	struct net_device *dev;
6832  	netdevice_tracker dev_tracker;
6833  
6834  	/* upper master flag, there can only be one master device per list */
6835  	bool master;
6836  
6837  	/* lookup ignore flag */
6838  	bool ignore;
6839  
6840  	/* counter for the number of times this device was added to us */
6841  	u16 ref_nr;
6842  
6843  	/* private field for the users */
6844  	void *private;
6845  
6846  	struct list_head list;
6847  	struct rcu_head rcu;
6848  };
6849  
6850  static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
6851  						 struct list_head *adj_list)
6852  {
6853  	struct netdev_adjacent *adj;
6854  
6855  	list_for_each_entry(adj, adj_list, list) {
6856  		if (adj->dev == adj_dev)
6857  			return adj;
6858  	}
6859  	return NULL;
6860  }
6861  
6862  static int ____netdev_has_upper_dev(struct net_device *upper_dev,
6863  				    struct netdev_nested_priv *priv)
6864  {
6865  	struct net_device *dev = (struct net_device *)priv->data;
6866  
6867  	return upper_dev == dev;
6868  }
6869  
6870  /**
6871   * netdev_has_upper_dev - Check if device is linked to an upper device
6872   * @dev: device
6873   * @upper_dev: upper device to check
6874   *
6875   * Find out if a device is linked to specified upper device and return true
6876   * in case it is. Note that this checks only immediate upper device,
6877   * not through a complete stack of devices. The caller must hold the RTNL lock.
6878   */
6879  bool netdev_has_upper_dev(struct net_device *dev,
6880  			  struct net_device *upper_dev)
6881  {
6882  	struct netdev_nested_priv priv = {
6883  		.data = (void *)upper_dev,
6884  	};
6885  
6886  	ASSERT_RTNL();
6887  
6888  	return netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
6889  					     &priv);
6890  }
6891  EXPORT_SYMBOL(netdev_has_upper_dev);
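
/* Editor's note: a minimal usage sketch, not part of the original file.
 * A hypothetical helper, called with RTNL already held (e.g. from a
 * netdevice notifier), that reports whether @candidate is a direct
 * (one-hop) upper of @dev, as documented above.
 */
static bool example_is_direct_upper(struct net_device *dev,
				    struct net_device *candidate)
{
	ASSERT_RTNL();
	return netdev_has_upper_dev(dev, candidate);
}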
6892  
6893  /**
6894   * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device
6895   * @dev: device
6896   * @upper_dev: upper device to check
6897   *
6898   * Find out if a device is linked to specified upper device and return true
6899   * in case it is. Note that this checks the entire upper device chain.
6900   * The caller must hold rcu lock.
6901   */
6902  
6903  bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
6904  				  struct net_device *upper_dev)
6905  {
6906  	struct netdev_nested_priv priv = {
6907  		.data = (void *)upper_dev,
6908  	};
6909  
6910  	return !!netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
6911  					       &priv);
6912  }
6913  EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
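
/* Editor's note: illustrative sketch only, not part of the original file.
 * Unlike the single-hop check above, this variant walks the whole upper
 * chain and can be used from an RCU read-side section rather than under
 * RTNL. The helper name is hypothetical.
 */
static bool example_is_any_upper(struct net_device *dev,
				 struct net_device *candidate)
{
	bool linked;

	rcu_read_lock();
	linked = netdev_has_upper_dev_all_rcu(dev, candidate);
	rcu_read_unlock();

	return linked;
}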
6914  
6915  /**
6916   * netdev_has_any_upper_dev - Check if device is linked to some device
6917   * @dev: device
6918   *
6919   * Find out if a device is linked to an upper device and return true in case
6920   * it is. The caller must hold the RTNL lock.
6921   */
6922  bool netdev_has_any_upper_dev(struct net_device *dev)
6923  {
6924  	ASSERT_RTNL();
6925  
6926  	return !list_empty(&dev->adj_list.upper);
6927  }
6928  EXPORT_SYMBOL(netdev_has_any_upper_dev);
6929  
6930  /**
6931   * netdev_master_upper_dev_get - Get master upper device
6932   * @dev: device
6933   *
6934   * Find a master upper device and return a pointer to it, or NULL if there
6935   * is none. The caller must hold the RTNL lock.
6936   */
6937  struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
6938  {
6939  	struct netdev_adjacent *upper;
6940  
6941  	ASSERT_RTNL();
6942  
6943  	if (list_empty(&dev->adj_list.upper))
6944  		return NULL;
6945  
6946  	upper = list_first_entry(&dev->adj_list.upper,
6947  				 struct netdev_adjacent, list);
6948  	if (likely(upper->master))
6949  		return upper->dev;
6950  	return NULL;
6951  }
6952  EXPORT_SYMBOL(netdev_master_upper_dev_get);
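
/* Editor's note: illustrative sketch only, not part of the original file.
 * Under RTNL, report the master (e.g. a bond or bridge) that @dev is
 * enslaved to, if any. The helper name is hypothetical.
 */
static void example_report_master(struct net_device *dev)
{
	struct net_device *master;

	ASSERT_RTNL();
	master = netdev_master_upper_dev_get(dev);
	if (master)
		netdev_info(dev, "enslaved to master %s\n", master->name);
}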
6953  
6954  static struct net_device *__netdev_master_upper_dev_get(struct net_device *dev)
6955  {
6956  	struct netdev_adjacent *upper;
6957  
6958  	ASSERT_RTNL();
6959  
6960  	if (list_empty(&dev->adj_list.upper))
6961  		return NULL;
6962  
6963  	upper = list_first_entry(&dev->adj_list.upper,
6964  				 struct netdev_adjacent, list);
6965  	if (likely(upper->master) && !upper->ignore)
6966  		return upper->dev;
6967  	return NULL;
6968  }
6969  
6970  /**
6971   * netdev_has_any_lower_dev - Check if device is linked to some device
6972   * @dev: device
6973   *
6974   * Find out if a device is linked to a lower device and return true if it
6975   * is. The caller must hold the RTNL lock.
6976   */
6977  static bool netdev_has_any_lower_dev(struct net_device *dev)
6978  {
6979  	ASSERT_RTNL();
6980  
6981  	return !list_empty(&dev->adj_list.lower);
6982  }
6983  
6984  void *netdev_adjacent_get_private(struct list_head *adj_list)
6985  {
6986  	struct netdev_adjacent *adj;
6987  
6988  	adj = list_entry(adj_list, struct netdev_adjacent, list);
6989  
6990  	return adj->private;
6991  }
6992  EXPORT_SYMBOL(netdev_adjacent_get_private);
6993  
6994  /**
6995   * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
6996   * @dev: device
6997   * @iter: list_head ** of the current position
6998   *
6999   * Gets the next device from the dev's upper list, starting from iter
7000   * position. The caller must hold RCU read lock.
7001   */
7002  struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
7003  						 struct list_head **iter)
7004  {
7005  	struct netdev_adjacent *upper;
7006  
7007  	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
7008  
7009  	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
7010  
7011  	if (&upper->list == &dev->adj_list.upper)
7012  		return NULL;
7013  
7014  	*iter = &upper->list;
7015  
7016  	return upper->dev;
7017  }
7018  EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
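
/* Editor's note: illustrative sketch only, not part of the original file.
 * It iterates the direct upper devices under the RCU read lock using the
 * accessor above; the iterator starts at &dev->adj_list.upper, following
 * the same convention as the netdev_for_each_upper_dev_rcu() helper. The
 * function name is hypothetical.
 */
static void example_list_direct_uppers(struct net_device *dev)
{
	struct list_head *iter = &dev->adj_list.upper;
	struct net_device *upper;

	rcu_read_lock();
	while ((upper = netdev_upper_get_next_dev_rcu(dev, &iter)))
		netdev_info(dev, "direct upper: %s\n", upper->name);
	rcu_read_unlock();
}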
7019  
7020  static struct net_device *__netdev_next_upper_dev(struct net_device *dev,
7021  						  struct list_head **iter,
7022  						  bool *ignore)
7023  {
7024  	struct netdev_adjacent *upper;
7025  
7026  	upper = list_entry((*iter)->next, struct netdev_adjacent, list);
7027  
7028  	if (&upper->list == &dev->adj_list.upper)
7029  		return NULL;
7030  
7031  	*iter = &upper->list;
7032  	*ignore = upper->ignore;
7033  
7034  	return upper->dev;
7035  }
7036  
7037  static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
7038  						    struct list_head **iter)
7039  {
7040  	struct netdev_adjacent *upper;
7041  
7042  	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
7043  
7044  	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
7045  
7046  	if (&upper->list == &dev->adj_list.upper)
7047  		return NULL;
7048  
7049  	*iter = &upper->list;
7050  
7051  	return upper->dev;
7052  }
7053  
7054  static int __netdev_walk_all_upper_dev(struct net_device *dev,
7055  				       int (*fn)(struct net_device *dev,
7056  					 struct netdev_nested_priv *priv),
7057  				       struct netdev_nested_priv *priv)
7058  {
7059  	struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
7060  	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
7061  	int ret, cur = 0;
7062  	bool ignore;
7063  
7064  	now = dev;
7065  	iter = &dev->adj_list.upper;
7066  
7067  	while (1) {
7068  		if (now != dev) {
7069  			ret = fn(now, priv);
7070  			if (ret)
7071  				return ret;
7072  		}
7073  
7074  		next = NULL;
7075  		while (1) {
7076  			udev = __netdev_next_upper_dev(now, &iter, &ignore);
7077  			if (!udev)
7078  				break;
7079  			if (ignore)
7080  				continue;
7081  
7082  			next = udev;
7083  			niter = &udev->adj_list.upper;
7084  			dev_stack[cur] = now;
7085  			iter_stack[cur++] = iter;
7086  			break;
7087  		}
7088  
7089  		if (!next) {
7090  			if (!cur)
7091  				return 0;
7092  			next = dev_stack[--cur];
7093  			niter = iter_stack[cur];
7094  		}
7095  
7096  		now = next;
7097  		iter = niter;
7098  	}
7099  
7100  	return 0;
7101  }
7102  
7103  int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
7104  				  int (*fn)(struct net_device *dev,
7105  					    struct netdev_nested_priv *priv),
7106  				  struct netdev_nested_priv *priv)
7107  {
7108  	struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
7109  	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
7110  	int ret, cur = 0;
7111  
7112  	now = dev;
7113  	iter = &dev->adj_list.upper;
7114  
7115  	while (1) {
7116  		if (now != dev) {
7117  			ret = fn(now, priv);
7118  			if (ret)
7119  				return ret;
7120  		}
7121  
7122  		next = NULL;
7123  		while (1) {
7124  			udev = netdev_next_upper_dev_rcu(now, &iter);
7125  			if (!udev)
7126  				break;
7127  
7128  			next = udev;
7129  			niter = &udev->adj_list.upper;
7130  			dev_stack[cur] = now;
7131  			iter_stack[cur++] = iter;
7132  			break;
7133  		}
7134  
7135  		if (!next) {
7136  			if (!cur)
7137  				return 0;
7138  			next = dev_stack[--cur];
7139  			niter = iter_stack[cur];
7140  		}
7141  
7142  		now = next;
7143  		iter = niter;
7144  	}
7145  
7146  	return 0;
7147  }
7148  EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
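
/* Editor's note: illustrative sketch only, not part of the original file.
 * It counts every device stacked above @dev by walking the full upper
 * graph with a callback; the callback and wrapper names, and the use of
 * priv->data as a counter, are all hypothetical.
 */
static int example_count_one_upper(struct net_device *upper,
				   struct netdev_nested_priv *priv)
{
	unsigned int *count = (unsigned int *)priv->data;

	(*count)++;
	return 0;	/* returning non-zero would stop the walk early */
}

static unsigned int example_count_all_uppers(struct net_device *dev)
{
	unsigned int count = 0;
	struct netdev_nested_priv priv = {
		.data = (void *)&count,
	};

	rcu_read_lock();
	netdev_walk_all_upper_dev_rcu(dev, example_count_one_upper, &priv);
	rcu_read_unlock();

	return count;
}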
7149  
7150  static bool __netdev_has_upper_dev(struct net_device *dev,
7151  				   struct net_device *upper_dev)
7152  {
7153  	struct netdev_nested_priv priv = {
7154  		.flags = 0,
7155  		.data = (void *)upper_dev,
7156  	};
7157  
7158  	ASSERT_RTNL();
7159  
7160  	return __netdev_walk_all_upper_dev(dev, ____netdev_has_upper_dev,
7161  					   &priv);
7162  }
7163  
7164  /**
7165   * netdev_lower_get_next_private - Get the next ->private from the
7166   *				   lower neighbour list
7167   * @dev: device
7168   * @iter: list_head ** of the current position
7169   *
7170   * Gets the next netdev_adjacent->private from the dev's lower neighbour
7171   * list, starting from iter position. The caller must hold either hold the
7172   * RTNL lock or its own locking that guarantees that the neighbour lower
7173   * list will remain unchanged.
7174   */
7175  void *netdev_lower_get_next_private(struct net_device *dev,
7176  				    struct list_head **iter)
7177  {
7178  	struct netdev_adjacent *lower;
7179  
7180  	lower = list_entry(*iter, struct netdev_adjacent, list);
7181  
7182  	if (&lower->list == &dev->adj_list.lower)
7183  		return NULL;
7184  
7185  	*iter = lower->list.next;
7186  
7187  	return lower->private;
7188  }
7189  EXPORT_SYMBOL(netdev_lower_get_next_private);
7190  
7191  /**
7192   * netdev_lower_get_next_private_rcu - Get the next ->private from the
7193   *				       lower neighbour list, RCU
7194   *				       variant
7195   * @dev: device
7196   * @iter: list_head ** of the current position
7197   *
7198   * Gets the next netdev_adjacent->private from the dev's lower neighbour
7199   * list, starting from iter position. The caller must hold RCU read lock.
7200   */
7201  void *netdev_lower_get_next_private_rcu(struct net_device *dev,
7202  					struct list_head **iter)
7203  {
7204  	struct netdev_adjacent *lower;
7205  
7206  	WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
7207  
7208  	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
7209  
7210  	if (&lower->list == &dev->adj_list.lower)
7211  		return NULL;
7212  
7213  	*iter = &lower->list;
7214  
7215  	return lower->private;
7216  }
7217  EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
7218  
7219  /**
7220   * netdev_lower_get_next - Get the next device from the lower neighbour
7221   *                         list
7222   * @dev: device
7223   * @iter: list_head ** of the current position
7224   *
7225   * Gets the next netdev_adjacent from the dev's lower neighbour
7226   * list, starting from iter position. The caller must hold RTNL lock or
7227   * its own locking that guarantees that the neighbour lower
7228   * list will remain unchanged.
7229   */
7230  void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
7231  {
7232  	struct netdev_adjacent *lower;
7233  
7234  	lower = list_entry(*iter, struct netdev_adjacent, list);
7235  
7236  	if (&lower->list == &dev->adj_list.lower)
7237  		return NULL;
7238  
7239  	*iter = lower->list.next;
7240  
7241  	return lower->dev;
7242  }
7243  EXPORT_SYMBOL(netdev_lower_get_next);
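
/* Editor's note: illustrative sketch only, not part of the original file.
 * It lists the direct lower devices under RTNL. Note that this accessor
 * expects *iter to point at the first element itself, so the iterator
 * starts from adj_list.lower.next rather than from the list head. The
 * helper name is hypothetical.
 */
static void example_list_direct_lowers(struct net_device *dev)
{
	struct list_head *iter = dev->adj_list.lower.next;
	struct net_device *lower;

	ASSERT_RTNL();
	while ((lower = netdev_lower_get_next(dev, &iter)))
		netdev_info(dev, "direct lower: %s\n", lower->name);
}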
7244  
7245  static struct net_device *netdev_next_lower_dev(struct net_device *dev,
7246  						struct list_head **iter)
7247  {
7248  	struct netdev_adjacent *lower;
7249  
7250  	lower = list_entry((*iter)->next, struct netdev_adjacent, list);
7251  
7252  	if (&lower->list == &dev->adj_list.lower)
7253  		return NULL;
7254  
7255  	*iter = &lower->list;
7256  
7257  	return lower->dev;
7258  }
7259  
7260  static struct net_device *__netdev_next_lower_dev(struct net_device *dev,
7261  						  struct list_head **iter,
7262  						  bool *ignore)
7263  {
7264  	struct netdev_adjacent *lower;
7265  
7266  	lower = list_entry((*iter)->next, struct netdev_adjacent, list);
7267  
7268  	if (&lower->list == &dev->adj_list.lower)
7269  		return NULL;
7270  
7271  	*iter = &lower->list;
7272  	*ignore = lower->ignore;
7273  
7274  	return lower->dev;
7275  }
7276  
7277  int netdev_walk_all_lower_dev(struct net_device *dev,
7278  			      int (*fn)(struct net_device *dev,
7279  					struct netdev_nested_priv *priv),
7280  			      struct netdev_nested_priv *priv)
7281  {
7282  	struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
7283  	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
7284  	int ret, cur = 0;
7285  
7286  	now = dev;
7287  	iter = &dev->adj_list.lower;
7288  
7289  	while (1) {
7290  		if (now != dev) {
7291  			ret = fn(now, priv);
7292  			if (ret)
7293  				return ret;
7294  		}
7295  
7296  		next = NULL;
7297  		while (1) {
7298  			ldev = netdev_next_lower_dev(now, &iter);
7299  			if (!ldev)
7300  				break;
7301  
7302  			next = ldev;
7303  			niter = &ldev->adj_list.lower;
7304  			dev_stack[cur] = now;
7305  			iter_stack[cur++] = iter;
7306  			break;
7307  		}
7308  
7309  		if (!next) {
7310  			if (!cur)
7311  				return 0;
7312  			next = dev_stack[--cur];
7313  			niter = iter_stack[cur];
7314  		}
7315  
7316  		now = next;
7317  		iter = niter;
7318  	}
7319  
7320  	return 0;
7321  }
7322  EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
7323  
7324  static int __netdev_walk_all_lower_dev(struct net_device *dev,
7325  				       int (*fn)(struct net_device *dev,
7326  					 struct netdev_nested_priv *priv),
7327  				       struct netdev_nested_priv *priv)
7328  {
7329  	struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
7330  	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
7331  	int ret, cur = 0;
7332  	bool ignore;
7333  
7334  	now = dev;
7335  	iter = &dev->adj_list.lower;
7336  
7337  	while (1) {
7338  		if (now != dev) {
7339  			ret = fn(now, priv);
7340  			if (ret)
7341  				return ret;
7342  		}
7343  
7344  		next = NULL;
7345  		while (1) {
7346  			ldev = __netdev_next_lower_dev(now, &iter, &ignore);
7347  			if (!ldev)
7348  				break;
7349  			if (ignore)
7350  				continue;
7351  
7352  			next = ldev;
7353  			niter = &ldev->adj_list.lower;
7354  			dev_stack[cur] = now;
7355  			iter_stack[cur++] = iter;
7356  			break;
7357  		}
7358  
7359  		if (!next) {
7360  			if (!cur)
7361  				return 0;
7362  			next = dev_stack[--cur];
7363  			niter = iter_stack[cur];
7364  		}
7365  
7366  		now = next;
7367  		iter = niter;
7368  	}
7369  
7370  	return 0;
7371  }
7372  
7373  struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
7374  					     struct list_head **iter)
7375  {
7376  	struct netdev_adjacent *lower;
7377  
7378  	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
7379  	if (&lower->list == &dev->adj_list.lower)
7380  		return NULL;
7381  
7382  	*iter = &lower->list;
7383  
7384  	return lower->dev;
7385  }
7386  EXPORT_SYMBOL(netdev_next_lower_dev_rcu);
7387  
7388  static u8 __netdev_upper_depth(struct net_device *dev)
7389  {
7390  	struct net_device *udev;
7391  	struct list_head *iter;
7392  	u8 max_depth = 0;
7393  	bool ignore;
7394  
7395  	for (iter = &dev->adj_list.upper,
7396  	     udev = __netdev_next_upper_dev(dev, &iter, &ignore);
7397  	     udev;
7398  	     udev = __netdev_next_upper_dev(dev, &iter, &ignore)) {
7399  		if (ignore)
7400  			continue;
7401  		if (max_depth < udev->upper_level)
7402  			max_depth = udev->upper_level;
7403  	}
7404  
7405  	return max_depth;
7406  }
7407  
7408  static u8 __netdev_lower_depth(struct net_device *dev)
7409  {
7410  	struct net_device *ldev;
7411  	struct list_head *iter;
7412  	u8 max_depth = 0;
7413  	bool ignore;
7414  
7415  	for (iter = &dev->adj_list.lower,
7416  	     ldev = __netdev_next_lower_dev(dev, &iter, &ignore);
7417  	     ldev;
7418  	     ldev = __netdev_next_lower_dev(dev, &iter, &ignore)) {
7419  		if (ignore)
7420  			continue;
7421  		if (max_depth < ldev->lower_level)
7422  			max_depth = ldev->lower_level;
7423  	}
7424  
7425  	return max_depth;
7426  }
7427  
7428  static int __netdev_update_upper_level(struct net_device *dev,
7429  				       struct netdev_nested_priv *__unused)
7430  {
7431  	dev->upper_level = __netdev_upper_depth(dev) + 1;
7432  	return 0;
7433  }
7434  
7435  #ifdef CONFIG_LOCKDEP
7436  static LIST_HEAD(net_unlink_list);
7437  
7438  static void net_unlink_todo(struct net_device *dev)
7439  {
7440  	if (list_empty(&dev->unlink_list))
7441  		list_add_tail(&dev->unlink_list, &net_unlink_list);
7442  }
7443  #endif
7444  
7445  static int __netdev_update_lower_level(struct net_device *dev,
7446  				       struct netdev_nested_priv *priv)
7447  {
7448  	dev->lower_level = __netdev_lower_depth(dev) + 1;
7449  
7450  #ifdef CONFIG_LOCKDEP
7451  	if (!priv)
7452  		return 0;
7453  
7454  	if (priv->flags & NESTED_SYNC_IMM)
7455  		dev->nested_level = dev->lower_level - 1;
7456  	if (priv->flags & NESTED_SYNC_TODO)
7457  		net_unlink_todo(dev);
7458  #endif
7459  	return 0;
7460  }
7461  
7462  int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
7463  				  int (*fn)(struct net_device *dev,
7464  					    struct netdev_nested_priv *priv),
7465  				  struct netdev_nested_priv *priv)
7466  {
7467  	struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
7468  	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
7469  	int ret, cur = 0;
7470  
7471  	now = dev;
7472  	iter = &dev->adj_list.lower;
7473  
7474  	while (1) {
7475  		if (now != dev) {
7476  			ret = fn(now, priv);
7477  			if (ret)
7478  				return ret;
7479  		}
7480  
7481  		next = NULL;
7482  		while (1) {
7483  			ldev = netdev_next_lower_dev_rcu(now, &iter);
7484  			if (!ldev)
7485  				break;
7486  
7487  			next = ldev;
7488  			niter = &ldev->adj_list.lower;
7489  			dev_stack[cur] = now;
7490  			iter_stack[cur++] = iter;
7491  			break;
7492  		}
7493  
7494  		if (!next) {
7495  			if (!cur)
7496  				return 0;
7497  			next = dev_stack[--cur];
7498  			niter = iter_stack[cur];
7499  		}
7500  
7501  		now = next;
7502  		iter = niter;
7503  	}
7504  
7505  	return 0;
7506  }
7507  EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);
7508  
7509  /**
7510   * netdev_lower_get_first_private_rcu - Get the first ->private from the
7511   *				       lower neighbour list, RCU
7512   *				       variant
7513   * @dev: device
7514   *
7515   * Gets the first netdev_adjacent->private from the dev's lower neighbour
7516   * list. The caller must hold RCU read lock.
7517   */
7518  void *netdev_lower_get_first_private_rcu(struct net_device *dev)
7519  {
7520  	struct netdev_adjacent *lower;
7521  
7522  	lower = list_first_or_null_rcu(&dev->adj_list.lower,
7523  			struct netdev_adjacent, list);
7524  	if (lower)
7525  		return lower->private;
7526  	return NULL;
7527  }
7528  EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
7529  
7530  /**
7531   * netdev_master_upper_dev_get_rcu - Get master upper device
7532   * @dev: device
7533   *
7534   * Find a master upper device and return a pointer to it, or NULL if there
7535   * is none. The caller must hold the RCU read lock.
7536   */
7537  struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
7538  {
7539  	struct netdev_adjacent *upper;
7540  
7541  	upper = list_first_or_null_rcu(&dev->adj_list.upper,
7542  				       struct netdev_adjacent, list);
7543  	if (upper && likely(upper->master))
7544  		return upper->dev;
7545  	return NULL;
7546  }
7547  EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
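
/* Editor's note: illustrative sketch only, not part of the original file.
 * The RCU variant is what a datapath helper would use to peek at the
 * master without taking RTNL; the returned pointer is only valid inside
 * the RCU read-side section. The helper name is hypothetical.
 */
static bool example_has_master_rcu(struct net_device *dev)
{
	bool has_master;

	rcu_read_lock();
	has_master = netdev_master_upper_dev_get_rcu(dev) != NULL;
	rcu_read_unlock();

	return has_master;
}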
7548  
7549  static int netdev_adjacent_sysfs_add(struct net_device *dev,
7550  			      struct net_device *adj_dev,
7551  			      struct list_head *dev_list)
7552  {
7553  	char linkname[IFNAMSIZ+7];
7554  
7555  	sprintf(linkname, dev_list == &dev->adj_list.upper ?
7556  		"upper_%s" : "lower_%s", adj_dev->name);
7557  	return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
7558  				 linkname);
7559  }
7560  static void netdev_adjacent_sysfs_del(struct net_device *dev,
7561  			       char *name,
7562  			       struct list_head *dev_list)
7563  {
7564  	char linkname[IFNAMSIZ+7];
7565  
7566  	sprintf(linkname, dev_list == &dev->adj_list.upper ?
7567  		"upper_%s" : "lower_%s", name);
7568  	sysfs_remove_link(&(dev->dev.kobj), linkname);
7569  }
7570  
7571  static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
7572  						 struct net_device *adj_dev,
7573  						 struct list_head *dev_list)
7574  {
7575  	return (dev_list == &dev->adj_list.upper ||
7576  		dev_list == &dev->adj_list.lower) &&
7577  		net_eq(dev_net(dev), dev_net(adj_dev));
7578  }
7579  
7580  static int __netdev_adjacent_dev_insert(struct net_device *dev,
7581  					struct net_device *adj_dev,
7582  					struct list_head *dev_list,
7583  					void *private, bool master)
7584  {
7585  	struct netdev_adjacent *adj;
7586  	int ret;
7587  
7588  	adj = __netdev_find_adj(adj_dev, dev_list);
7589  
7590  	if (adj) {
7591  		adj->ref_nr += 1;
7592  		pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n",
7593  			 dev->name, adj_dev->name, adj->ref_nr);
7594  
7595  		return 0;
7596  	}
7597  
7598  	adj = kmalloc(sizeof(*adj), GFP_KERNEL);
7599  	if (!adj)
7600  		return -ENOMEM;
7601  
7602  	adj->dev = adj_dev;
7603  	adj->master = master;
7604  	adj->ref_nr = 1;
7605  	adj->private = private;
7606  	adj->ignore = false;
7607  	netdev_hold(adj_dev, &adj->dev_tracker, GFP_KERNEL);
7608  
7609  	pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
7610  		 dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);
7611  
7612  	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
7613  		ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
7614  		if (ret)
7615  			goto free_adj;
7616  	}
7617  
7618  	/* Ensure that master link is always the first item in list. */
7619  	if (master) {
7620  		ret = sysfs_create_link(&(dev->dev.kobj),
7621  					&(adj_dev->dev.kobj), "master");
7622  		if (ret)
7623  			goto remove_symlinks;
7624  
7625  		list_add_rcu(&adj->list, dev_list);
7626  	} else {
7627  		list_add_tail_rcu(&adj->list, dev_list);
7628  	}
7629  
7630  	return 0;
7631  
7632  remove_symlinks:
7633  	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
7634  		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
7635  free_adj:
7636  	netdev_put(adj_dev, &adj->dev_tracker);
7637  	kfree(adj);
7638  
7639  	return ret;
7640  }
7641  
7642  static void __netdev_adjacent_dev_remove(struct net_device *dev,
7643  					 struct net_device *adj_dev,
7644  					 u16 ref_nr,
7645  					 struct list_head *dev_list)
7646  {
7647  	struct netdev_adjacent *adj;
7648  
7649  	pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n",
7650  		 dev->name, adj_dev->name, ref_nr);
7651  
7652  	adj = __netdev_find_adj(adj_dev, dev_list);
7653  
7654  	if (!adj) {
7655  		pr_err("Adjacency does not exist for device %s from %s\n",
7656  		       dev->name, adj_dev->name);
7657  		WARN_ON(1);
7658  		return;
7659  	}
7660  
7661  	if (adj->ref_nr > ref_nr) {
7662  		pr_debug("adjacency: %s to %s ref_nr - %d = %d\n",
7663  			 dev->name, adj_dev->name, ref_nr,
7664  			 adj->ref_nr - ref_nr);
7665  		adj->ref_nr -= ref_nr;
7666  		return;
7667  	}
7668  
7669  	if (adj->master)
7670  		sysfs_remove_link(&(dev->dev.kobj), "master");
7671  
7672  	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
7673  		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
7674  
7675  	list_del_rcu(&adj->list);
7676  	pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
7677  		 adj_dev->name, dev->name, adj_dev->name);
7678  	netdev_put(adj_dev, &adj->dev_tracker);
7679  	kfree_rcu(adj, rcu);
7680  }
7681  
7682  static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
7683  					    struct net_device *upper_dev,
7684  					    struct list_head *up_list,
7685  					    struct list_head *down_list,
7686  					    void *private, bool master)
7687  {
7688  	int ret;
7689  
7690  	ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list,
7691  					   private, master);
7692  	if (ret)
7693  		return ret;
7694  
7695  	ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list,
7696  					   private, false);
7697  	if (ret) {
7698  		__netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list);
7699  		return ret;
7700  	}
7701  
7702  	return 0;
7703  }
7704  
7705  static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
7706  					       struct net_device *upper_dev,
7707  					       u16 ref_nr,
7708  					       struct list_head *up_list,
7709  					       struct list_head *down_list)
7710  {
7711  	__netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
7712  	__netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
7713  }
7714  
7715  static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
7716  						struct net_device *upper_dev,
7717  						void *private, bool master)
7718  {
7719  	return __netdev_adjacent_dev_link_lists(dev, upper_dev,
7720  						&dev->adj_list.upper,
7721  						&upper_dev->adj_list.lower,
7722  						private, master);
7723  }
7724  
7725  static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
7726  						   struct net_device *upper_dev)
7727  {
7728  	__netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
7729  					   &dev->adj_list.upper,
7730  					   &upper_dev->adj_list.lower);
7731  }
7732  
7733  static int __netdev_upper_dev_link(struct net_device *dev,
7734  				   struct net_device *upper_dev, bool master,
7735  				   void *upper_priv, void *upper_info,
7736  				   struct netdev_nested_priv *priv,
7737  				   struct netlink_ext_ack *extack)
7738  {
7739  	struct netdev_notifier_changeupper_info changeupper_info = {
7740  		.info = {
7741  			.dev = dev,
7742  			.extack = extack,
7743  		},
7744  		.upper_dev = upper_dev,
7745  		.master = master,
7746  		.linking = true,
7747  		.upper_info = upper_info,
7748  	};
7749  	struct net_device *master_dev;
7750  	int ret = 0;
7751  
7752  	ASSERT_RTNL();
7753  
7754  	if (dev == upper_dev)
7755  		return -EBUSY;
7756  
7757  	/* To prevent loops, check if dev is not upper device to upper_dev. */
7758  	if (__netdev_has_upper_dev(upper_dev, dev))
7759  		return -EBUSY;
7760  
7761  	if ((dev->lower_level + upper_dev->upper_level) > MAX_NEST_DEV)
7762  		return -EMLINK;
7763  
7764  	if (!master) {
7765  		if (__netdev_has_upper_dev(dev, upper_dev))
7766  			return -EEXIST;
7767  	} else {
7768  		master_dev = __netdev_master_upper_dev_get(dev);
7769  		if (master_dev)
7770  			return master_dev == upper_dev ? -EEXIST : -EBUSY;
7771  	}
7772  
7773  	ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
7774  					    &changeupper_info.info);
7775  	ret = notifier_to_errno(ret);
7776  	if (ret)
7777  		return ret;
7778  
7779  	ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
7780  						   master);
7781  	if (ret)
7782  		return ret;
7783  
7784  	ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
7785  					    &changeupper_info.info);
7786  	ret = notifier_to_errno(ret);
7787  	if (ret)
7788  		goto rollback;
7789  
7790  	__netdev_update_upper_level(dev, NULL);
7791  	__netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
7792  
7793  	__netdev_update_lower_level(upper_dev, priv);
7794  	__netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
7795  				    priv);
7796  
7797  	return 0;
7798  
7799  rollback:
7800  	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
7801  
7802  	return ret;
7803  }
7804  
7805  /**
7806   * netdev_upper_dev_link - Add a link to the upper device
7807   * @dev: device
7808   * @upper_dev: new upper device
7809   * @extack: netlink extended ack
7810   *
7811   * Adds a link to device which is upper to this one. The caller must hold
7812   * the RTNL lock. On a failure a negative errno code is returned.
7813   * On success the reference counts are adjusted and the function
7814   * returns zero.
7815   */
7816  int netdev_upper_dev_link(struct net_device *dev,
7817  			  struct net_device *upper_dev,
7818  			  struct netlink_ext_ack *extack)
7819  {
7820  	struct netdev_nested_priv priv = {
7821  		.flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
7822  		.data = NULL,
7823  	};
7824  
7825  	return __netdev_upper_dev_link(dev, upper_dev, false,
7826  				       NULL, NULL, &priv, extack);
7827  }
7828  EXPORT_SYMBOL(netdev_upper_dev_link);
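
/* Editor's note: illustrative sketch only, not part of the original file.
 * A hypothetical virtual device enslaving @lower under @upper: the link is
 * created first, and on failure of the (elided) driver-private setup it
 * would be undone with netdev_upper_dev_unlink(). The helper name is
 * hypothetical.
 */
static int example_add_lower(struct net_device *upper,
			     struct net_device *lower,
			     struct netlink_ext_ack *extack)
{
	int err;

	ASSERT_RTNL();

	err = netdev_upper_dev_link(lower, upper, extack);
	if (err)
		return err;

	/* ... driver-private setup would go here; if it fails, undo the
	 * link with netdev_upper_dev_unlink(lower, upper) before returning.
	 */
	return 0;
}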
7829  
7830  /**
7831   * netdev_master_upper_dev_link - Add a master link to the upper device
7832   * @dev: device
7833   * @upper_dev: new upper device
7834   * @upper_priv: upper device private
7835   * @upper_info: upper info to be passed down via notifier
7836   * @extack: netlink extended ack
7837   *
7838   * Adds a link to device which is upper to this one. In this case, only
7839   * one master upper device can be linked, although other non-master devices
7840   * might be linked as well. The caller must hold the RTNL lock.
7841   * On a failure a negative errno code is returned. On success the reference
7842   * counts are adjusted and the function returns zero.
7843   */
7844  int netdev_master_upper_dev_link(struct net_device *dev,
7845  				 struct net_device *upper_dev,
7846  				 void *upper_priv, void *upper_info,
7847  				 struct netlink_ext_ack *extack)
7848  {
7849  	struct netdev_nested_priv priv = {
7850  		.flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
7851  		.data = NULL,
7852  	};
7853  
7854  	return __netdev_upper_dev_link(dev, upper_dev, true,
7855  				       upper_priv, upper_info, &priv, extack);
7856  }
7857  EXPORT_SYMBOL(netdev_master_upper_dev_link);
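
/* Editor's note: illustrative sketch only, not part of the original file.
 * The master variant enforces a single master per device: linking a second
 * master fails with -EBUSY and relinking the same one with -EEXIST, per
 * __netdev_upper_dev_link() above. No upper_priv/upper_info is passed
 * here. The helper name is hypothetical.
 */
static int example_set_master(struct net_device *slave,
			      struct net_device *master,
			      struct netlink_ext_ack *extack)
{
	ASSERT_RTNL();
	return netdev_master_upper_dev_link(slave, master, NULL, NULL, extack);
}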
7858  
7859  static void __netdev_upper_dev_unlink(struct net_device *dev,
7860  				      struct net_device *upper_dev,
7861  				      struct netdev_nested_priv *priv)
7862  {
7863  	struct netdev_notifier_changeupper_info changeupper_info = {
7864  		.info = {
7865  			.dev = dev,
7866  		},
7867  		.upper_dev = upper_dev,
7868  		.linking = false,
7869  	};
7870  
7871  	ASSERT_RTNL();
7872  
7873  	changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
7874  
7875  	call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
7876  				      &changeupper_info.info);
7877  
7878  	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
7879  
7880  	call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
7881  				      &changeupper_info.info);
7882  
7883  	__netdev_update_upper_level(dev, NULL);
7884  	__netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
7885  
7886  	__netdev_update_lower_level(upper_dev, priv);
7887  	__netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
7888  				    priv);
7889  }
7890  
7891  /**
7892   * netdev_upper_dev_unlink - Removes a link to upper device
7893   * @dev: device
7894   * @upper_dev: upper device to unlink
7895   *
7896   * Removes a link to device which is upper to this one. The caller must hold
7897   * the RTNL lock.
7898   */
7899  void netdev_upper_dev_unlink(struct net_device *dev,
7900  			     struct net_device *upper_dev)
7901  {
7902  	struct netdev_nested_priv priv = {
7903  		.flags = NESTED_SYNC_TODO,
7904  		.data = NULL,
7905  	};
7906  
7907  	__netdev_upper_dev_unlink(dev, upper_dev, &priv);
7908  }
7909  EXPORT_SYMBOL(netdev_upper_dev_unlink);
7910  
7911  static void __netdev_adjacent_dev_set(struct net_device *upper_dev,
7912  				      struct net_device *lower_dev,
7913  				      bool val)
7914  {
7915  	struct netdev_adjacent *adj;
7916  
7917  	adj = __netdev_find_adj(lower_dev, &upper_dev->adj_list.lower);
7918  	if (adj)
7919  		adj->ignore = val;
7920  
7921  	adj = __netdev_find_adj(upper_dev, &lower_dev->adj_list.upper);
7922  	if (adj)
7923  		adj->ignore = val;
7924  }
7925  
7926  static void netdev_adjacent_dev_disable(struct net_device *upper_dev,
7927  					struct net_device *lower_dev)
7928  {
7929  	__netdev_adjacent_dev_set(upper_dev, lower_dev, true);
7930  }
7931  
7932  static void netdev_adjacent_dev_enable(struct net_device *upper_dev,
7933  				       struct net_device *lower_dev)
7934  {
7935  	__netdev_adjacent_dev_set(upper_dev, lower_dev, false);
7936  }
7937  
7938  int netdev_adjacent_change_prepare(struct net_device *old_dev,
7939  				   struct net_device *new_dev,
7940  				   struct net_device *dev,
7941  				   struct netlink_ext_ack *extack)
7942  {
7943  	struct netdev_nested_priv priv = {
7944  		.flags = 0,
7945  		.data = NULL,
7946  	};
7947  	int err;
7948  
7949  	if (!new_dev)
7950  		return 0;
7951  
7952  	if (old_dev && new_dev != old_dev)
7953  		netdev_adjacent_dev_disable(dev, old_dev);
7954  	err = __netdev_upper_dev_link(new_dev, dev, false, NULL, NULL, &priv,
7955  				      extack);
7956  	if (err) {
7957  		if (old_dev && new_dev != old_dev)
7958  			netdev_adjacent_dev_enable(dev, old_dev);
7959  		return err;
7960  	}
7961  
7962  	return 0;
7963  }
7964  EXPORT_SYMBOL(netdev_adjacent_change_prepare);
7965  
7966  void netdev_adjacent_change_commit(struct net_device *old_dev,
7967  				   struct net_device *new_dev,
7968  				   struct net_device *dev)
7969  {
7970  	struct netdev_nested_priv priv = {
7971  		.flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
7972  		.data = NULL,
7973  	};
7974  
7975  	if (!new_dev || !old_dev)
7976  		return;
7977  
7978  	if (new_dev == old_dev)
7979  		return;
7980  
7981  	netdev_adjacent_dev_enable(dev, old_dev);
7982  	__netdev_upper_dev_unlink(old_dev, dev, &priv);
7983  }
7984  EXPORT_SYMBOL(netdev_adjacent_change_commit);
7985  
7986  void netdev_adjacent_change_abort(struct net_device *old_dev,
7987  				  struct net_device *new_dev,
7988  				  struct net_device *dev)
7989  {
7990  	struct netdev_nested_priv priv = {
7991  		.flags = 0,
7992  		.data = NULL,
7993  	};
7994  
7995  	if (!new_dev)
7996  		return;
7997  
7998  	if (old_dev && new_dev != old_dev)
7999  		netdev_adjacent_dev_enable(dev, old_dev);
8000  
8001  	__netdev_upper_dev_unlink(new_dev, dev, &priv);
8002  }
8003  EXPORT_SYMBOL(netdev_adjacent_change_abort);
8004  
8005  /**
8006   * netdev_bonding_info_change - Dispatch event about slave change
8007   * @dev: device
8008   * @bonding_info: info to dispatch
8009   *
8010   * Send NETDEV_BONDING_INFO to netdev notifiers with info.
8011   * The caller must hold the RTNL lock.
8012   */
8013  void netdev_bonding_info_change(struct net_device *dev,
8014  				struct netdev_bonding_info *bonding_info)
8015  {
8016  	struct netdev_notifier_bonding_info info = {
8017  		.info.dev = dev,
8018  	};
8019  
8020  	memcpy(&info.bonding_info, bonding_info,
8021  	       sizeof(struct netdev_bonding_info));
8022  	call_netdevice_notifiers_info(NETDEV_BONDING_INFO,
8023  				      &info.info);
8024  }
8025  EXPORT_SYMBOL(netdev_bonding_info_change);
8026  
8027  static int netdev_offload_xstats_enable_l3(struct net_device *dev,
8028  					   struct netlink_ext_ack *extack)
8029  {
8030  	struct netdev_notifier_offload_xstats_info info = {
8031  		.info.dev = dev,
8032  		.info.extack = extack,
8033  		.type = NETDEV_OFFLOAD_XSTATS_TYPE_L3,
8034  	};
8035  	int err;
8036  	int rc;
8037  
8038  	dev->offload_xstats_l3 = kzalloc(sizeof(*dev->offload_xstats_l3),
8039  					 GFP_KERNEL);
8040  	if (!dev->offload_xstats_l3)
8041  		return -ENOMEM;
8042  
8043  	rc = call_netdevice_notifiers_info_robust(NETDEV_OFFLOAD_XSTATS_ENABLE,
8044  						  NETDEV_OFFLOAD_XSTATS_DISABLE,
8045  						  &info.info);
8046  	err = notifier_to_errno(rc);
8047  	if (err)
8048  		goto free_stats;
8049  
8050  	return 0;
8051  
8052  free_stats:
8053  	kfree(dev->offload_xstats_l3);
8054  	dev->offload_xstats_l3 = NULL;
8055  	return err;
8056  }
8057  
8058  int netdev_offload_xstats_enable(struct net_device *dev,
8059  				 enum netdev_offload_xstats_type type,
8060  				 struct netlink_ext_ack *extack)
8061  {
8062  	ASSERT_RTNL();
8063  
8064  	if (netdev_offload_xstats_enabled(dev, type))
8065  		return -EALREADY;
8066  
8067  	switch (type) {
8068  	case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
8069  		return netdev_offload_xstats_enable_l3(dev, extack);
8070  	}
8071  
8072  	WARN_ON(1);
8073  	return -EINVAL;
8074  }
8075  EXPORT_SYMBOL(netdev_offload_xstats_enable);
8076  
8077  static void netdev_offload_xstats_disable_l3(struct net_device *dev)
8078  {
8079  	struct netdev_notifier_offload_xstats_info info = {
8080  		.info.dev = dev,
8081  		.type = NETDEV_OFFLOAD_XSTATS_TYPE_L3,
8082  	};
8083  
8084  	call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_DISABLE,
8085  				      &info.info);
8086  	kfree(dev->offload_xstats_l3);
8087  	dev->offload_xstats_l3 = NULL;
8088  }
8089  
8090  int netdev_offload_xstats_disable(struct net_device *dev,
8091  				  enum netdev_offload_xstats_type type)
8092  {
8093  	ASSERT_RTNL();
8094  
8095  	if (!netdev_offload_xstats_enabled(dev, type))
8096  		return -EALREADY;
8097  
8098  	switch (type) {
8099  	case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
8100  		netdev_offload_xstats_disable_l3(dev);
8101  		return 0;
8102  	}
8103  
8104  	WARN_ON(1);
8105  	return -EINVAL;
8106  }
8107  EXPORT_SYMBOL(netdev_offload_xstats_disable);
8108  
8109  static void netdev_offload_xstats_disable_all(struct net_device *dev)
8110  {
8111  	netdev_offload_xstats_disable(dev, NETDEV_OFFLOAD_XSTATS_TYPE_L3);
8112  }
8113  
8114  static struct rtnl_hw_stats64 *
8115  netdev_offload_xstats_get_ptr(const struct net_device *dev,
8116  			      enum netdev_offload_xstats_type type)
8117  {
8118  	switch (type) {
8119  	case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
8120  		return dev->offload_xstats_l3;
8121  	}
8122  
8123  	WARN_ON(1);
8124  	return NULL;
8125  }
8126  
8127  bool netdev_offload_xstats_enabled(const struct net_device *dev,
8128  				   enum netdev_offload_xstats_type type)
8129  {
8130  	ASSERT_RTNL();
8131  
8132  	return netdev_offload_xstats_get_ptr(dev, type);
8133  }
8134  EXPORT_SYMBOL(netdev_offload_xstats_enabled);
8135  
8136  struct netdev_notifier_offload_xstats_ru {
8137  	bool used;
8138  };
8139  
8140  struct netdev_notifier_offload_xstats_rd {
8141  	struct rtnl_hw_stats64 stats;
8142  	bool used;
8143  };
8144  
8145  static void netdev_hw_stats64_add(struct rtnl_hw_stats64 *dest,
8146  				  const struct rtnl_hw_stats64 *src)
8147  {
8148  	dest->rx_packets	  += src->rx_packets;
8149  	dest->tx_packets	  += src->tx_packets;
8150  	dest->rx_bytes		  += src->rx_bytes;
8151  	dest->tx_bytes		  += src->tx_bytes;
8152  	dest->rx_errors		  += src->rx_errors;
8153  	dest->tx_errors		  += src->tx_errors;
8154  	dest->rx_dropped	  += src->rx_dropped;
8155  	dest->tx_dropped	  += src->tx_dropped;
8156  	dest->multicast		  += src->multicast;
8157  }
8158  
8159  static int netdev_offload_xstats_get_used(struct net_device *dev,
8160  					  enum netdev_offload_xstats_type type,
8161  					  bool *p_used,
8162  					  struct netlink_ext_ack *extack)
8163  {
8164  	struct netdev_notifier_offload_xstats_ru report_used = {};
8165  	struct netdev_notifier_offload_xstats_info info = {
8166  		.info.dev = dev,
8167  		.info.extack = extack,
8168  		.type = type,
8169  		.report_used = &report_used,
8170  	};
8171  	int rc;
8172  
8173  	WARN_ON(!netdev_offload_xstats_enabled(dev, type));
8174  	rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_USED,
8175  					   &info.info);
8176  	*p_used = report_used.used;
8177  	return notifier_to_errno(rc);
8178  }
8179  
8180  static int netdev_offload_xstats_get_stats(struct net_device *dev,
8181  					   enum netdev_offload_xstats_type type,
8182  					   struct rtnl_hw_stats64 *p_stats,
8183  					   bool *p_used,
8184  					   struct netlink_ext_ack *extack)
8185  {
8186  	struct netdev_notifier_offload_xstats_rd report_delta = {};
8187  	struct netdev_notifier_offload_xstats_info info = {
8188  		.info.dev = dev,
8189  		.info.extack = extack,
8190  		.type = type,
8191  		.report_delta = &report_delta,
8192  	};
8193  	struct rtnl_hw_stats64 *stats;
8194  	int rc;
8195  
8196  	stats = netdev_offload_xstats_get_ptr(dev, type);
8197  	if (WARN_ON(!stats))
8198  		return -EINVAL;
8199  
8200  	rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_DELTA,
8201  					   &info.info);
8202  
8203  	/* Cache whatever we got, even if there was an error, otherwise the
8204  	 * successful stats retrievals would get lost.
8205  	 */
8206  	netdev_hw_stats64_add(stats, &report_delta.stats);
8207  
8208  	if (p_stats)
8209  		*p_stats = *stats;
8210  	*p_used = report_delta.used;
8211  
8212  	return notifier_to_errno(rc);
8213  }
8214  
8215  int netdev_offload_xstats_get(struct net_device *dev,
8216  			      enum netdev_offload_xstats_type type,
8217  			      struct rtnl_hw_stats64 *p_stats, bool *p_used,
8218  			      struct netlink_ext_ack *extack)
8219  {
8220  	ASSERT_RTNL();
8221  
8222  	if (p_stats)
8223  		return netdev_offload_xstats_get_stats(dev, type, p_stats,
8224  						       p_used, extack);
8225  	else
8226  		return netdev_offload_xstats_get_used(dev, type, p_used,
8227  						      extack);
8228  }
8229  EXPORT_SYMBOL(netdev_offload_xstats_get);
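
/* Editor's note: illustrative sketch only, not part of the original file.
 * Under RTNL, enable L3 HW stats collection if it is not already enabled
 * and then dump the accumulated counters; the "(unused)" tag reflects the
 * p_used flag reported by the drivers. The helper name is hypothetical.
 */
static void example_dump_l3_hw_stats(struct net_device *dev,
				     struct netlink_ext_ack *extack)
{
	struct rtnl_hw_stats64 stats = {};
	bool used;
	int err;

	ASSERT_RTNL();

	if (!netdev_offload_xstats_enabled(dev, NETDEV_OFFLOAD_XSTATS_TYPE_L3)) {
		err = netdev_offload_xstats_enable(dev,
						   NETDEV_OFFLOAD_XSTATS_TYPE_L3,
						   extack);
		if (err)
			return;
	}

	err = netdev_offload_xstats_get(dev, NETDEV_OFFLOAD_XSTATS_TYPE_L3,
					&stats, &used, extack);
	if (err)
		return;

	netdev_info(dev, "L3 HW stats%s: rx %llu pkts, tx %llu pkts\n",
		    used ? "" : " (unused)",
		    stats.rx_packets, stats.tx_packets);
}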
8230  
8231  void
8232  netdev_offload_xstats_report_delta(struct netdev_notifier_offload_xstats_rd *report_delta,
8233  				   const struct rtnl_hw_stats64 *stats)
8234  {
8235  	report_delta->used = true;
8236  	netdev_hw_stats64_add(&report_delta->stats, stats);
8237  }
8238  EXPORT_SYMBOL(netdev_offload_xstats_report_delta);
8239  
8240  void
8241  netdev_offload_xstats_report_used(struct netdev_notifier_offload_xstats_ru *report_used)
8242  {
8243  	report_used->used = true;
8244  }
8245  EXPORT_SYMBOL(netdev_offload_xstats_report_used);
8246  
8247  void netdev_offload_xstats_push_delta(struct net_device *dev,
8248  				      enum netdev_offload_xstats_type type,
8249  				      const struct rtnl_hw_stats64 *p_stats)
8250  {
8251  	struct rtnl_hw_stats64 *stats;
8252  
8253  	ASSERT_RTNL();
8254  
8255  	stats = netdev_offload_xstats_get_ptr(dev, type);
8256  	if (WARN_ON(!stats))
8257  		return;
8258  
8259  	netdev_hw_stats64_add(stats, p_stats);
8260  }
8261  EXPORT_SYMBOL(netdev_offload_xstats_push_delta);
8262  
8263  /**
8264   * netdev_get_xmit_slave - Get the xmit slave of master device
8265   * @dev: device
8266   * @skb: The packet
8267   * @all_slaves: assume all the slaves are active
8268   *
8269   * The reference counters are not incremented so the caller must be
8270   * careful with locks. The caller must hold RCU lock.
8271   * %NULL is returned if no slave is found.
8272   */
8273  
8274  struct net_device *netdev_get_xmit_slave(struct net_device *dev,
8275  					 struct sk_buff *skb,
8276  					 bool all_slaves)
8277  {
8278  	const struct net_device_ops *ops = dev->netdev_ops;
8279  
8280  	if (!ops->ndo_get_xmit_slave)
8281  		return NULL;
8282  	return ops->ndo_get_xmit_slave(dev, skb, all_slaves);
8283  }
8284  EXPORT_SYMBOL(netdev_get_xmit_slave);
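
/* Editor's note: illustrative sketch only, not part of the original file.
 * From an RCU read-side section, ask a master (e.g. a bond) which slave a
 * given skb would egress on; the returned pointer is only valid while the
 * RCU read lock is held. The helper name is hypothetical.
 */
static void example_report_xmit_slave(struct net_device *master,
				      struct sk_buff *skb)
{
	struct net_device *slave;

	rcu_read_lock();
	slave = netdev_get_xmit_slave(master, skb, false);
	if (slave)
		netdev_info(master, "skb would egress via %s\n", slave->name);
	rcu_read_unlock();
}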
8285  
8286  static struct net_device *netdev_sk_get_lower_dev(struct net_device *dev,
8287  						  struct sock *sk)
8288  {
8289  	const struct net_device_ops *ops = dev->netdev_ops;
8290  
8291  	if (!ops->ndo_sk_get_lower_dev)
8292  		return NULL;
8293  	return ops->ndo_sk_get_lower_dev(dev, sk);
8294  }
8295  
8296  /**
8297   * netdev_sk_get_lowest_dev - Get the lowest device in chain given device and socket
8298   * @dev: device
8299   * @sk: the socket
8300   *
8301   * %NULL is returned if no lower device is found.
8302   */
8303  
8304  struct net_device *netdev_sk_get_lowest_dev(struct net_device *dev,
8305  					    struct sock *sk)
8306  {
8307  	struct net_device *lower;
8308  
8309  	lower = netdev_sk_get_lower_dev(dev, sk);
8310  	while (lower) {
8311  		dev = lower;
8312  		lower = netdev_sk_get_lower_dev(dev, sk);
8313  	}
8314  
8315  	return dev;
8316  }
8317  EXPORT_SYMBOL(netdev_sk_get_lowest_dev);
8318  
8319  static void netdev_adjacent_add_links(struct net_device *dev)
8320  {
8321  	struct netdev_adjacent *iter;
8322  
8323  	struct net *net = dev_net(dev);
8324  
8325  	list_for_each_entry(iter, &dev->adj_list.upper, list) {
8326  		if (!net_eq(net, dev_net(iter->dev)))
8327  			continue;
8328  		netdev_adjacent_sysfs_add(iter->dev, dev,
8329  					  &iter->dev->adj_list.lower);
8330  		netdev_adjacent_sysfs_add(dev, iter->dev,
8331  					  &dev->adj_list.upper);
8332  	}
8333  
8334  	list_for_each_entry(iter, &dev->adj_list.lower, list) {
8335  		if (!net_eq(net, dev_net(iter->dev)))
8336  			continue;
8337  		netdev_adjacent_sysfs_add(iter->dev, dev,
8338  					  &iter->dev->adj_list.upper);
8339  		netdev_adjacent_sysfs_add(dev, iter->dev,
8340  					  &dev->adj_list.lower);
8341  	}
8342  }
8343  
8344  static void netdev_adjacent_del_links(struct net_device *dev)
8345  {
8346  	struct netdev_adjacent *iter;
8347  
8348  	struct net *net = dev_net(dev);
8349  
8350  	list_for_each_entry(iter, &dev->adj_list.upper, list) {
8351  		if (!net_eq(net, dev_net(iter->dev)))
8352  			continue;
8353  		netdev_adjacent_sysfs_del(iter->dev, dev->name,
8354  					  &iter->dev->adj_list.lower);
8355  		netdev_adjacent_sysfs_del(dev, iter->dev->name,
8356  					  &dev->adj_list.upper);
8357  	}
8358  
8359  	list_for_each_entry(iter, &dev->adj_list.lower, list) {
8360  		if (!net_eq(net, dev_net(iter->dev)))
8361  			continue;
8362  		netdev_adjacent_sysfs_del(iter->dev, dev->name,
8363  					  &iter->dev->adj_list.upper);
8364  		netdev_adjacent_sysfs_del(dev, iter->dev->name,
8365  					  &dev->adj_list.lower);
8366  	}
8367  }
8368  
8369  void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
8370  {
8371  	struct netdev_adjacent *iter;
8372  
8373  	struct net *net = dev_net(dev);
8374  
8375  	list_for_each_entry(iter, &dev->adj_list.upper, list) {
8376  		if (!net_eq(net, dev_net(iter->dev)))
8377  			continue;
8378  		netdev_adjacent_sysfs_del(iter->dev, oldname,
8379  					  &iter->dev->adj_list.lower);
8380  		netdev_adjacent_sysfs_add(iter->dev, dev,
8381  					  &iter->dev->adj_list.lower);
8382  	}
8383  
8384  	list_for_each_entry(iter, &dev->adj_list.lower, list) {
8385  		if (!net_eq(net, dev_net(iter->dev)))
8386  			continue;
8387  		netdev_adjacent_sysfs_del(iter->dev, oldname,
8388  					  &iter->dev->adj_list.upper);
8389  		netdev_adjacent_sysfs_add(iter->dev, dev,
8390  					  &iter->dev->adj_list.upper);
8391  	}
8392  }
8393  
8394  void *netdev_lower_dev_get_private(struct net_device *dev,
8395  				   struct net_device *lower_dev)
8396  {
8397  	struct netdev_adjacent *lower;
8398  
8399  	if (!lower_dev)
8400  		return NULL;
8401  	lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
8402  	if (!lower)
8403  		return NULL;
8404  
8405  	return lower->private;
8406  }
8407  EXPORT_SYMBOL(netdev_lower_dev_get_private);
8408  
8409  
8410  /**
8411   * netdev_lower_state_changed - Dispatch event about lower device state change
8412   * @lower_dev: device
8413   * @lower_state_info: state to dispatch
8414   *
8415   * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
8416   * The caller must hold the RTNL lock.
8417   */
8418  void netdev_lower_state_changed(struct net_device *lower_dev,
8419  				void *lower_state_info)
8420  {
8421  	struct netdev_notifier_changelowerstate_info changelowerstate_info = {
8422  		.info.dev = lower_dev,
8423  	};
8424  
8425  	ASSERT_RTNL();
8426  	changelowerstate_info.lower_state_info = lower_state_info;
8427  	call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE,
8428  				      &changelowerstate_info.info);
8429  }
8430  EXPORT_SYMBOL(netdev_lower_state_changed);
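
/*
 * Example (editorial sketch, not part of the upstream file): how a LAG
 * master driver might report that one of its lower devices changed state.
 * struct netdev_lag_lower_state_info comes from <linux/netdevice.h>; the
 * "foo_" names are assumptions.
 */
static void foo_lag_port_state_change(struct net_device *port_dev,
				      bool link_up, bool tx_enabled)
{
	struct netdev_lag_lower_state_info info = {
		.link_up	= link_up,
		.tx_enabled	= tx_enabled,
	};

	/* RTNL must already be held by the caller. */
	netdev_lower_state_changed(port_dev, &info);
}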
8431  
8432  static void dev_change_rx_flags(struct net_device *dev, int flags)
8433  {
8434  	const struct net_device_ops *ops = dev->netdev_ops;
8435  
8436  	if (ops->ndo_change_rx_flags)
8437  		ops->ndo_change_rx_flags(dev, flags);
8438  }
8439  
8440  static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
8441  {
8442  	unsigned int old_flags = dev->flags;
8443  	kuid_t uid;
8444  	kgid_t gid;
8445  
8446  	ASSERT_RTNL();
8447  
8448  	dev->flags |= IFF_PROMISC;
8449  	dev->promiscuity += inc;
8450  	if (dev->promiscuity == 0) {
8451  		/*
8452  		 * Avoid overflow.
8453  		 * If inc causes overflow, untouch promisc and return error.
8454  		 */
8455  		if (inc < 0)
8456  			dev->flags &= ~IFF_PROMISC;
8457  		else {
8458  			dev->promiscuity -= inc;
8459  			netdev_warn(dev, "promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n");
8460  			return -EOVERFLOW;
8461  		}
8462  	}
8463  	if (dev->flags != old_flags) {
8464  		netdev_info(dev, "%s promiscuous mode\n",
8465  			    dev->flags & IFF_PROMISC ? "entered" : "left");
8466  		if (audit_enabled) {
8467  			current_uid_gid(&uid, &gid);
8468  			audit_log(audit_context(), GFP_ATOMIC,
8469  				  AUDIT_ANOM_PROMISCUOUS,
8470  				  "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
8471  				  dev->name, (dev->flags & IFF_PROMISC),
8472  				  (old_flags & IFF_PROMISC),
8473  				  from_kuid(&init_user_ns, audit_get_loginuid(current)),
8474  				  from_kuid(&init_user_ns, uid),
8475  				  from_kgid(&init_user_ns, gid),
8476  				  audit_get_sessionid(current));
8477  		}
8478  
8479  		dev_change_rx_flags(dev, IFF_PROMISC);
8480  	}
8481  	if (notify)
8482  		__dev_notify_flags(dev, old_flags, IFF_PROMISC, 0, NULL);
8483  	return 0;
8484  }
8485  
8486  /**
8487   *	dev_set_promiscuity	- update promiscuity count on a device
8488   *	@dev: device
8489   *	@inc: modifier
8490   *
8491   *	Add or remove promiscuity from a device. While the count in the device
8492   *	remains above zero the interface remains promiscuous. Once it hits zero
8493   *	the device reverts to normal filtering operation. A negative @inc
8494   *	value is used to drop promiscuity on the device.
8495   *	Return 0 if successful or a negative errno code on error.
8496   */
8497  int dev_set_promiscuity(struct net_device *dev, int inc)
8498  {
8499  	unsigned int old_flags = dev->flags;
8500  	int err;
8501  
8502  	err = __dev_set_promiscuity(dev, inc, true);
8503  	if (err < 0)
8504  		return err;
8505  	if (dev->flags != old_flags)
8506  		dev_set_rx_mode(dev);
8507  	return err;
8508  }
8509  EXPORT_SYMBOL(dev_set_promiscuity);
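
/*
 * Example (editorial sketch, not part of the upstream file): a hypothetical
 * capture feature that keeps a device promiscuous while it is active. The
 * increment and decrement must balance, and RTNL must be held around each
 * call; the "foo_" names are assumptions.
 */
static int foo_capture_start(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_set_promiscuity(dev, 1);	/* take one promiscuity ref */
	rtnl_unlock();
	return err;
}

static void foo_capture_stop(struct net_device *dev)
{
	rtnl_lock();
	dev_set_promiscuity(dev, -1);		/* drop our reference */
	rtnl_unlock();
}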
8510  
8511  static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
8512  {
8513  	unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
8514  
8515  	ASSERT_RTNL();
8516  
8517  	dev->flags |= IFF_ALLMULTI;
8518  	dev->allmulti += inc;
8519  	if (dev->allmulti == 0) {
8520  		/*
8521  		 * Avoid overflow.
8522  		 * If inc causes overflow, untouch allmulti and return error.
8523  		 */
8524  		if (inc < 0)
8525  			dev->flags &= ~IFF_ALLMULTI;
8526  		else {
8527  			dev->allmulti -= inc;
8528  			netdev_warn(dev, "allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n");
8529  			return -EOVERFLOW;
8530  		}
8531  	}
8532  	if (dev->flags ^ old_flags) {
8533  		netdev_info(dev, "%s allmulticast mode\n",
8534  			    dev->flags & IFF_ALLMULTI ? "entered" : "left");
8535  		dev_change_rx_flags(dev, IFF_ALLMULTI);
8536  		dev_set_rx_mode(dev);
8537  		if (notify)
8538  			__dev_notify_flags(dev, old_flags,
8539  					   dev->gflags ^ old_gflags, 0, NULL);
8540  	}
8541  	return 0;
8542  }
8543  
8544  /**
8545   *	dev_set_allmulti	- update allmulti count on a device
8546   *	@dev: device
8547   *	@inc: modifier
8548   *
8549   *	Add or remove reception of all multicast frames on a device. While the
8550   *	count in the device remains above zero the interface keeps receiving
8551   *	all multicast frames. Once it hits zero the device reverts to normal
8552   *	filtering operation. A negative @inc value is used to drop the counter
8553   *	when releasing a resource needing all multicasts.
8554   *	Return 0 if successful or a negative errno code on error.
8555   */
8556  
8557  int dev_set_allmulti(struct net_device *dev, int inc)
8558  {
8559  	return __dev_set_allmulti(dev, inc, true);
8560  }
8561  EXPORT_SYMBOL(dev_set_allmulti);
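
/*
 * Example (editorial sketch, not part of the upstream file): mirroring the
 * balanced-count usage shown for dev_set_promiscuity() above, a hypothetical
 * multicast monitor takes and drops one allmulti reference. The caller is
 * assumed to hold RTNL; "foo_mcast_monitor_set" is an assumed name.
 */
static int foo_mcast_monitor_set(struct net_device *dev, bool enable)
{
	ASSERT_RTNL();
	return dev_set_allmulti(dev, enable ? 1 : -1);
}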
8562  
8563  /*
8564   *	Upload unicast and multicast address lists to device and
8565   *	configure RX filtering. When the device doesn't support unicast
8566   *	filtering it is put in promiscuous mode while unicast addresses
8567   *	are present.
8568   */
8569  void __dev_set_rx_mode(struct net_device *dev)
8570  {
8571  	const struct net_device_ops *ops = dev->netdev_ops;
8572  
8573  	/* dev_open will call this function so the list will stay sane. */
8574  	if (!(dev->flags&IFF_UP))
8575  		return;
8576  
8577  	if (!netif_device_present(dev))
8578  		return;
8579  
8580  	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
8581  		/* Unicast address changes may only happen under the rtnl,
8582  		 * therefore calling __dev_set_promiscuity here is safe.
8583  		 */
8584  		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
8585  			__dev_set_promiscuity(dev, 1, false);
8586  			dev->uc_promisc = true;
8587  		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
8588  			__dev_set_promiscuity(dev, -1, false);
8589  			dev->uc_promisc = false;
8590  		}
8591  	}
8592  
8593  	if (ops->ndo_set_rx_mode)
8594  		ops->ndo_set_rx_mode(dev);
8595  }
8596  
8597  void dev_set_rx_mode(struct net_device *dev)
8598  {
8599  	netif_addr_lock_bh(dev);
8600  	__dev_set_rx_mode(dev);
8601  	netif_addr_unlock_bh(dev);
8602  }
8603  
8604  /**
8605   *	dev_get_flags - get flags reported to userspace
8606   *	@dev: device
8607   *
8608   *	Get the combination of flag bits exported through APIs to userspace.
8609   */
8610  unsigned int dev_get_flags(const struct net_device *dev)
8611  {
8612  	unsigned int flags;
8613  
8614  	flags = (dev->flags & ~(IFF_PROMISC |
8615  				IFF_ALLMULTI |
8616  				IFF_RUNNING |
8617  				IFF_LOWER_UP |
8618  				IFF_DORMANT)) |
8619  		(dev->gflags & (IFF_PROMISC |
8620  				IFF_ALLMULTI));
8621  
8622  	if (netif_running(dev)) {
8623  		if (netif_oper_up(dev))
8624  			flags |= IFF_RUNNING;
8625  		if (netif_carrier_ok(dev))
8626  			flags |= IFF_LOWER_UP;
8627  		if (netif_dormant(dev))
8628  			flags |= IFF_DORMANT;
8629  	}
8630  
8631  	return flags;
8632  }
8633  EXPORT_SYMBOL(dev_get_flags);
8634  
8635  int __dev_change_flags(struct net_device *dev, unsigned int flags,
8636  		       struct netlink_ext_ack *extack)
8637  {
8638  	unsigned int old_flags = dev->flags;
8639  	int ret;
8640  
8641  	ASSERT_RTNL();
8642  
8643  	/*
8644  	 *	Set the flags on our device.
8645  	 */
8646  
8647  	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
8648  			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
8649  			       IFF_AUTOMEDIA)) |
8650  		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
8651  				    IFF_ALLMULTI));
8652  
8653  	/*
8654  	 *	Load in the correct multicast list now the flags have changed.
8655  	 */
8656  
8657  	if ((old_flags ^ flags) & IFF_MULTICAST)
8658  		dev_change_rx_flags(dev, IFF_MULTICAST);
8659  
8660  	dev_set_rx_mode(dev);
8661  
8662  	/*
8663  	 *	Have we downed the interface? We handle IFF_UP ourselves
8664  	 *	according to user attempts to set it, rather than blindly
8665  	 *	setting it.
8666  	 */
8667  
8668  	ret = 0;
8669  	if ((old_flags ^ flags) & IFF_UP) {
8670  		if (old_flags & IFF_UP)
8671  			__dev_close(dev);
8672  		else
8673  			ret = __dev_open(dev, extack);
8674  	}
8675  
8676  	if ((flags ^ dev->gflags) & IFF_PROMISC) {
8677  		int inc = (flags & IFF_PROMISC) ? 1 : -1;
8678  		unsigned int old_flags = dev->flags;
8679  
8680  		dev->gflags ^= IFF_PROMISC;
8681  
8682  		if (__dev_set_promiscuity(dev, inc, false) >= 0)
8683  			if (dev->flags != old_flags)
8684  				dev_set_rx_mode(dev);
8685  	}
8686  
8687  	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
8688  	 * is important. Some (broken) drivers set IFF_PROMISC when
8689  	 * IFF_ALLMULTI is requested, without asking us and without reporting it.
8690  	 */
8691  	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
8692  		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
8693  
8694  		dev->gflags ^= IFF_ALLMULTI;
8695  		__dev_set_allmulti(dev, inc, false);
8696  	}
8697  
8698  	return ret;
8699  }
8700  
8701  void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
8702  			unsigned int gchanges, u32 portid,
8703  			const struct nlmsghdr *nlh)
8704  {
8705  	unsigned int changes = dev->flags ^ old_flags;
8706  
8707  	if (gchanges)
8708  		rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC, portid, nlh);
8709  
8710  	if (changes & IFF_UP) {
8711  		if (dev->flags & IFF_UP)
8712  			call_netdevice_notifiers(NETDEV_UP, dev);
8713  		else
8714  			call_netdevice_notifiers(NETDEV_DOWN, dev);
8715  	}
8716  
8717  	if (dev->flags & IFF_UP &&
8718  	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
8719  		struct netdev_notifier_change_info change_info = {
8720  			.info = {
8721  				.dev = dev,
8722  			},
8723  			.flags_changed = changes,
8724  		};
8725  
8726  		call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info);
8727  	}
8728  }
8729  
8730  /**
8731   *	dev_change_flags - change device settings
8732   *	@dev: device
8733   *	@flags: device state flags
8734   *	@extack: netlink extended ack
8735   *
8736   *	Change settings on device based state flags. The flags are
8737   *	in the userspace exported format.
8738   */
8739  int dev_change_flags(struct net_device *dev, unsigned int flags,
8740  		     struct netlink_ext_ack *extack)
8741  {
8742  	int ret;
8743  	unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
8744  
8745  	ret = __dev_change_flags(dev, flags, extack);
8746  	if (ret < 0)
8747  		return ret;
8748  
8749  	changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
8750  	__dev_notify_flags(dev, old_flags, changes, 0, NULL);
8751  	return ret;
8752  }
8753  EXPORT_SYMBOL(dev_change_flags);
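
/*
 * Example (editorial sketch, not part of the upstream file): bringing an
 * interface administratively up the way an ioctl/netlink request would, by
 * feeding the userspace-format flags back with IFF_UP added. RTNL must be
 * held; "foo_bring_up" is an assumed name.
 */
static int foo_bring_up(struct net_device *dev,
			struct netlink_ext_ack *extack)
{
	unsigned int flags = dev_get_flags(dev) | IFF_UP;

	ASSERT_RTNL();
	return dev_change_flags(dev, flags, extack);
}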
8754  
8755  int __dev_set_mtu(struct net_device *dev, int new_mtu)
8756  {
8757  	const struct net_device_ops *ops = dev->netdev_ops;
8758  
8759  	if (ops->ndo_change_mtu)
8760  		return ops->ndo_change_mtu(dev, new_mtu);
8761  
8762  	/* Pairs with all the lockless reads of dev->mtu in the stack */
8763  	WRITE_ONCE(dev->mtu, new_mtu);
8764  	return 0;
8765  }
8766  EXPORT_SYMBOL(__dev_set_mtu);
8767  
8768  int dev_validate_mtu(struct net_device *dev, int new_mtu,
8769  		     struct netlink_ext_ack *extack)
8770  {
8771  	/* MTU must be positive, and in range */
8772  	if (new_mtu < 0 || new_mtu < dev->min_mtu) {
8773  		NL_SET_ERR_MSG(extack, "mtu less than device minimum");
8774  		return -EINVAL;
8775  	}
8776  
8777  	if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
8778  		NL_SET_ERR_MSG(extack, "mtu greater than device maximum");
8779  		return -EINVAL;
8780  	}
8781  	return 0;
8782  }
8783  
8784  /**
8785   *	dev_set_mtu_ext - Change maximum transfer unit
8786   *	@dev: device
8787   *	@new_mtu: new transfer unit
8788   *	@extack: netlink extended ack
8789   *
8790   *	Change the maximum transfer size of the network device.
8791   */
8792  int dev_set_mtu_ext(struct net_device *dev, int new_mtu,
8793  		    struct netlink_ext_ack *extack)
8794  {
8795  	int err, orig_mtu;
8796  
8797  	if (new_mtu == dev->mtu)
8798  		return 0;
8799  
8800  	err = dev_validate_mtu(dev, new_mtu, extack);
8801  	if (err)
8802  		return err;
8803  
8804  	if (!netif_device_present(dev))
8805  		return -ENODEV;
8806  
8807  	err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
8808  	err = notifier_to_errno(err);
8809  	if (err)
8810  		return err;
8811  
8812  	orig_mtu = dev->mtu;
8813  	err = __dev_set_mtu(dev, new_mtu);
8814  
8815  	if (!err) {
8816  		err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
8817  						   orig_mtu);
8818  		err = notifier_to_errno(err);
8819  		if (err) {
8820  			/* setting mtu back and notifying everyone again,
8821  			 * so that they have a chance to revert changes.
8822  			 */
8823  			__dev_set_mtu(dev, orig_mtu);
8824  			call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
8825  						     new_mtu);
8826  		}
8827  	}
8828  	return err;
8829  }
8830  
8831  int dev_set_mtu(struct net_device *dev, int new_mtu)
8832  {
8833  	struct netlink_ext_ack extack;
8834  	int err;
8835  
8836  	memset(&extack, 0, sizeof(extack));
8837  	err = dev_set_mtu_ext(dev, new_mtu, &extack);
8838  	if (err && extack._msg)
8839  		net_err_ratelimited("%s: %s\n", dev->name, extack._msg);
8840  	return err;
8841  }
8842  EXPORT_SYMBOL(dev_set_mtu);
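
/*
 * Example (editorial sketch, not part of the upstream file): a hypothetical
 * tunnel driver shrinking its lower device's MTU by its encapsulation
 * overhead. Range checking against dev->min_mtu/dev->max_mtu happens inside
 * dev_set_mtu(); RTNL must be held. The "foo_" names are assumptions.
 */
static int foo_set_lower_mtu(struct net_device *lower_dev, int overhead)
{
	int mtu = (int)READ_ONCE(lower_dev->mtu) - overhead;

	ASSERT_RTNL();
	return dev_set_mtu(lower_dev, mtu);	/* rejects a negative result */
}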
8843  
8844  /**
8845   *	dev_change_tx_queue_len - Change TX queue length of a netdevice
8846   *	@dev: device
8847   *	@new_len: new tx queue length
8848   */
8849  int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
8850  {
8851  	unsigned int orig_len = dev->tx_queue_len;
8852  	int res;
8853  
8854  	if (new_len != (unsigned int)new_len)
8855  		return -ERANGE;
8856  
8857  	if (new_len != orig_len) {
8858  		dev->tx_queue_len = new_len;
8859  		res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev);
8860  		res = notifier_to_errno(res);
8861  		if (res)
8862  			goto err_rollback;
8863  		res = dev_qdisc_change_tx_queue_len(dev);
8864  		if (res)
8865  			goto err_rollback;
8866  	}
8867  
8868  	return 0;
8869  
8870  err_rollback:
8871  	netdev_err(dev, "refused to change device tx_queue_len\n");
8872  	dev->tx_queue_len = orig_len;
8873  	return res;
8874  }
8875  
8876  /**
8877   *	dev_set_group - Change group this device belongs to
8878   *	@dev: device
8879   *	@new_group: group this device should belong to
8880   */
8881  void dev_set_group(struct net_device *dev, int new_group)
8882  {
8883  	dev->group = new_group;
8884  }
8885  
8886  /**
8887   *	dev_pre_changeaddr_notify - Call NETDEV_PRE_CHANGEADDR.
8888   *	@dev: device
8889   *	@addr: new address
8890   *	@extack: netlink extended ack
8891   */
8892  int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr,
8893  			      struct netlink_ext_ack *extack)
8894  {
8895  	struct netdev_notifier_pre_changeaddr_info info = {
8896  		.info.dev = dev,
8897  		.info.extack = extack,
8898  		.dev_addr = addr,
8899  	};
8900  	int rc;
8901  
8902  	rc = call_netdevice_notifiers_info(NETDEV_PRE_CHANGEADDR, &info.info);
8903  	return notifier_to_errno(rc);
8904  }
8905  EXPORT_SYMBOL(dev_pre_changeaddr_notify);
8906  
8907  /**
8908   *	dev_set_mac_address - Change Media Access Control Address
8909   *	@dev: device
8910   *	@sa: new address
8911   *	@extack: netlink extended ack
8912   *
8913   *	Change the hardware (MAC) address of the device
8914   */
8915  int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
8916  			struct netlink_ext_ack *extack)
8917  {
8918  	const struct net_device_ops *ops = dev->netdev_ops;
8919  	int err;
8920  
8921  	if (!ops->ndo_set_mac_address)
8922  		return -EOPNOTSUPP;
8923  	if (sa->sa_family != dev->type)
8924  		return -EINVAL;
8925  	if (!netif_device_present(dev))
8926  		return -ENODEV;
8927  	err = dev_pre_changeaddr_notify(dev, sa->sa_data, extack);
8928  	if (err)
8929  		return err;
8930  	if (memcmp(dev->dev_addr, sa->sa_data, dev->addr_len)) {
8931  		err = ops->ndo_set_mac_address(dev, sa);
8932  		if (err)
8933  			return err;
8934  	}
8935  	dev->addr_assign_type = NET_ADDR_SET;
8936  	call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
8937  	add_device_randomness(dev->dev_addr, dev->addr_len);
8938  	return 0;
8939  }
8940  EXPORT_SYMBOL(dev_set_mac_address);
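
/*
 * Example (editorial sketch, not part of the upstream file): setting an
 * Ethernet-style device's MAC from a raw address buffer. The address family
 * must match dev->type and RTNL must be held; userspace-triggered changes
 * normally go through the dev_set_mac_address_user() wrapper below, which
 * also takes dev_addr_sem. "foo_set_hw_addr" is an assumed name.
 */
static int foo_set_hw_addr(struct net_device *dev, const u8 *mac,
			   struct netlink_ext_ack *extack)
{
	struct sockaddr sa = { .sa_family = dev->type };

	if (dev->addr_len > sizeof(sa.sa_data_min))
		return -EINVAL;
	memcpy(sa.sa_data, mac, dev->addr_len);

	ASSERT_RTNL();
	return dev_set_mac_address(dev, &sa, extack);
}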
8941  
8942  static DECLARE_RWSEM(dev_addr_sem);
8943  
8944  int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa,
8945  			     struct netlink_ext_ack *extack)
8946  {
8947  	int ret;
8948  
8949  	down_write(&dev_addr_sem);
8950  	ret = dev_set_mac_address(dev, sa, extack);
8951  	up_write(&dev_addr_sem);
8952  	return ret;
8953  }
8954  EXPORT_SYMBOL(dev_set_mac_address_user);
8955  
8956  int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name)
8957  {
8958  	size_t size = sizeof(sa->sa_data_min);
8959  	struct net_device *dev;
8960  	int ret = 0;
8961  
8962  	down_read(&dev_addr_sem);
8963  	rcu_read_lock();
8964  
8965  	dev = dev_get_by_name_rcu(net, dev_name);
8966  	if (!dev) {
8967  		ret = -ENODEV;
8968  		goto unlock;
8969  	}
8970  	if (!dev->addr_len)
8971  		memset(sa->sa_data, 0, size);
8972  	else
8973  		memcpy(sa->sa_data, dev->dev_addr,
8974  		       min_t(size_t, size, dev->addr_len));
8975  	sa->sa_family = dev->type;
8976  
8977  unlock:
8978  	rcu_read_unlock();
8979  	up_read(&dev_addr_sem);
8980  	return ret;
8981  }
8982  EXPORT_SYMBOL(dev_get_mac_address);
8983  
8984  /**
8985   *	dev_change_carrier - Change device carrier
8986   *	@dev: device
8987   *	@new_carrier: new value
8988   *
8989   *	Change device carrier
8990   */
8991  int dev_change_carrier(struct net_device *dev, bool new_carrier)
8992  {
8993  	const struct net_device_ops *ops = dev->netdev_ops;
8994  
8995  	if (!ops->ndo_change_carrier)
8996  		return -EOPNOTSUPP;
8997  	if (!netif_device_present(dev))
8998  		return -ENODEV;
8999  	return ops->ndo_change_carrier(dev, new_carrier);
9000  }
9001  
9002  /**
9003   *	dev_get_phys_port_id - Get device physical port ID
9004   *	@dev: device
9005   *	@ppid: port ID
9006   *
9007   *	Get device physical port ID
9008   */
9009  int dev_get_phys_port_id(struct net_device *dev,
9010  			 struct netdev_phys_item_id *ppid)
9011  {
9012  	const struct net_device_ops *ops = dev->netdev_ops;
9013  
9014  	if (!ops->ndo_get_phys_port_id)
9015  		return -EOPNOTSUPP;
9016  	return ops->ndo_get_phys_port_id(dev, ppid);
9017  }
9018  
9019  /**
9020   *	dev_get_phys_port_name - Get device physical port name
9021   *	@dev: device
9022   *	@name: port name
9023   *	@len: limit of bytes to copy to name
9024   *
9025   *	Get device physical port name
9026   */
9027  int dev_get_phys_port_name(struct net_device *dev,
9028  			   char *name, size_t len)
9029  {
9030  	const struct net_device_ops *ops = dev->netdev_ops;
9031  	int err;
9032  
9033  	if (ops->ndo_get_phys_port_name) {
9034  		err = ops->ndo_get_phys_port_name(dev, name, len);
9035  		if (err != -EOPNOTSUPP)
9036  			return err;
9037  	}
9038  	return devlink_compat_phys_port_name_get(dev, name, len);
9039  }
9040  
9041  /**
9042   *	dev_get_port_parent_id - Get the device's port parent identifier
9043   *	@dev: network device
9044   *	@ppid: pointer to a storage for the port's parent identifier
9045   *	@recurse: allow/disallow recursion to lower devices
9046   *
9047   *	Get the device's port parent identifier
9048   */
9049  int dev_get_port_parent_id(struct net_device *dev,
9050  			   struct netdev_phys_item_id *ppid,
9051  			   bool recurse)
9052  {
9053  	const struct net_device_ops *ops = dev->netdev_ops;
9054  	struct netdev_phys_item_id first = { };
9055  	struct net_device *lower_dev;
9056  	struct list_head *iter;
9057  	int err;
9058  
9059  	if (ops->ndo_get_port_parent_id) {
9060  		err = ops->ndo_get_port_parent_id(dev, ppid);
9061  		if (err != -EOPNOTSUPP)
9062  			return err;
9063  	}
9064  
9065  	err = devlink_compat_switch_id_get(dev, ppid);
9066  	if (!recurse || err != -EOPNOTSUPP)
9067  		return err;
9068  
9069  	netdev_for_each_lower_dev(dev, lower_dev, iter) {
9070  		err = dev_get_port_parent_id(lower_dev, ppid, true);
9071  		if (err)
9072  			break;
9073  		if (!first.id_len)
9074  			first = *ppid;
9075  		else if (memcmp(&first, ppid, sizeof(*ppid)))
9076  			return -EOPNOTSUPP;
9077  	}
9078  
9079  	return err;
9080  }
9081  EXPORT_SYMBOL(dev_get_port_parent_id);
9082  
9083  /**
9084   *	netdev_port_same_parent_id - Indicate if two network devices have
9085   *	the same port parent identifier
9086   *	@a: first network device
9087   *	@b: second network device
9088   */
9089  bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b)
9090  {
9091  	struct netdev_phys_item_id a_id = { };
9092  	struct netdev_phys_item_id b_id = { };
9093  
9094  	if (dev_get_port_parent_id(a, &a_id, true) ||
9095  	    dev_get_port_parent_id(b, &b_id, true))
9096  		return false;
9097  
9098  	return netdev_phys_item_id_same(&a_id, &b_id);
9099  }
9100  EXPORT_SYMBOL(netdev_port_same_parent_id);
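
/*
 * Example (editorial sketch, not part of the upstream file): deciding
 * whether a forwarding offload may be attempted between the ingress and
 * egress ports, i.e. whether both hang off the same switch ASIC. The caller
 * is assumed to hold RTNL; "foo_can_offload_fwd" is an assumed name.
 */
static bool foo_can_offload_fwd(const struct sk_buff *skb,
				struct net_device *out_dev)
{
	return skb->dev && netdev_port_same_parent_id(skb->dev, out_dev);
}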
9101  
9102  /**
9103   *	dev_change_proto_down - set carrier according to proto_down.
9104   *
9105   *	@dev: device
9106   *	@proto_down: new value
9107   */
9108  int dev_change_proto_down(struct net_device *dev, bool proto_down)
9109  {
9110  	if (!(dev->priv_flags & IFF_CHANGE_PROTO_DOWN))
9111  		return -EOPNOTSUPP;
9112  	if (!netif_device_present(dev))
9113  		return -ENODEV;
9114  	if (proto_down)
9115  		netif_carrier_off(dev);
9116  	else
9117  		netif_carrier_on(dev);
9118  	dev->proto_down = proto_down;
9119  	return 0;
9120  }
9121  
9122  /**
9123   *	dev_change_proto_down_reason - update the proto_down reason bits
9124   *
9125   *	@dev: device
9126   *	@mask: proto down mask
9127   *	@value: proto down value
9128   */
9129  void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask,
9130  				  u32 value)
9131  {
9132  	int b;
9133  
9134  	if (!mask) {
9135  		dev->proto_down_reason = value;
9136  	} else {
9137  		for_each_set_bit(b, &mask, 32) {
9138  			if (value & (1 << b))
9139  				dev->proto_down_reason |= BIT(b);
9140  			else
9141  				dev->proto_down_reason &= ~BIT(b);
9142  		}
9143  	}
9144  }
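
/*
 * Example (editorial sketch, not part of the upstream file): flipping a
 * single proto_down reason bit. @mask selects the bits to touch and @value
 * supplies their new state, so other reason bits are left untouched. Bit 3
 * is an arbitrary, assumed reason number.
 */
static void foo_mark_proto_down_reason(struct net_device *dev, bool set)
{
	dev_change_proto_down_reason(dev, BIT(3), set ? BIT(3) : 0);
}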
9145  
9146  struct bpf_xdp_link {
9147  	struct bpf_link link;
9148  	struct net_device *dev; /* protected by rtnl_lock, no refcnt held */
9149  	int flags;
9150  };
9151  
9152  static enum bpf_xdp_mode dev_xdp_mode(struct net_device *dev, u32 flags)
9153  {
9154  	if (flags & XDP_FLAGS_HW_MODE)
9155  		return XDP_MODE_HW;
9156  	if (flags & XDP_FLAGS_DRV_MODE)
9157  		return XDP_MODE_DRV;
9158  	if (flags & XDP_FLAGS_SKB_MODE)
9159  		return XDP_MODE_SKB;
9160  	return dev->netdev_ops->ndo_bpf ? XDP_MODE_DRV : XDP_MODE_SKB;
9161  }
9162  
9163  static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode)
9164  {
9165  	switch (mode) {
9166  	case XDP_MODE_SKB:
9167  		return generic_xdp_install;
9168  	case XDP_MODE_DRV:
9169  	case XDP_MODE_HW:
9170  		return dev->netdev_ops->ndo_bpf;
9171  	default:
9172  		return NULL;
9173  	}
9174  }
9175  
9176  static struct bpf_xdp_link *dev_xdp_link(struct net_device *dev,
9177  					 enum bpf_xdp_mode mode)
9178  {
9179  	return dev->xdp_state[mode].link;
9180  }
9181  
9182  static struct bpf_prog *dev_xdp_prog(struct net_device *dev,
9183  				     enum bpf_xdp_mode mode)
9184  {
9185  	struct bpf_xdp_link *link = dev_xdp_link(dev, mode);
9186  
9187  	if (link)
9188  		return link->link.prog;
9189  	return dev->xdp_state[mode].prog;
9190  }
9191  
9192  u8 dev_xdp_prog_count(struct net_device *dev)
9193  {
9194  	u8 count = 0;
9195  	int i;
9196  
9197  	for (i = 0; i < __MAX_XDP_MODE; i++)
9198  		if (dev->xdp_state[i].prog || dev->xdp_state[i].link)
9199  			count++;
9200  	return count;
9201  }
9202  EXPORT_SYMBOL_GPL(dev_xdp_prog_count);
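
/*
 * Example (editorial sketch, not part of the upstream file): refusing a
 * hypothetical reconfiguration while any XDP program or link is attached in
 * any mode. dev->xdp_state is protected by RTNL; the "foo_" name is an
 * assumption.
 */
static int foo_reconfig_check_xdp(struct net_device *dev)
{
	ASSERT_RTNL();

	return dev_xdp_prog_count(dev) ? -EBUSY : 0;
}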
9203  
9204  u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode)
9205  {
9206  	struct bpf_prog *prog = dev_xdp_prog(dev, mode);
9207  
9208  	return prog ? prog->aux->id : 0;
9209  }
9210  
9211  static void dev_xdp_set_link(struct net_device *dev, enum bpf_xdp_mode mode,
9212  			     struct bpf_xdp_link *link)
9213  {
9214  	dev->xdp_state[mode].link = link;
9215  	dev->xdp_state[mode].prog = NULL;
9216  }
9217  
9218  static void dev_xdp_set_prog(struct net_device *dev, enum bpf_xdp_mode mode,
9219  			     struct bpf_prog *prog)
9220  {
9221  	dev->xdp_state[mode].link = NULL;
9222  	dev->xdp_state[mode].prog = prog;
9223  }
9224  
9225  static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode,
9226  			   bpf_op_t bpf_op, struct netlink_ext_ack *extack,
9227  			   u32 flags, struct bpf_prog *prog)
9228  {
9229  	struct netdev_bpf xdp;
9230  	int err;
9231  
9232  	memset(&xdp, 0, sizeof(xdp));
9233  	xdp.command = mode == XDP_MODE_HW ? XDP_SETUP_PROG_HW : XDP_SETUP_PROG;
9234  	xdp.extack = extack;
9235  	xdp.flags = flags;
9236  	xdp.prog = prog;
9237  
9238  	/* Drivers assume refcnt is already incremented (i.e., prog pointer is
9239  	 * "moved" into driver), so they don't increment it on their own, but
9240  	 * they do decrement refcnt when program is detached or replaced.
9241  	 * Given net_device also owns link/prog, we need to bump refcnt here
9242  	 * to prevent drivers from underflowing it.
9243  	 */
9244  	if (prog)
9245  		bpf_prog_inc(prog);
9246  	err = bpf_op(dev, &xdp);
9247  	if (err) {
9248  		if (prog)
9249  			bpf_prog_put(prog);
9250  		return err;
9251  	}
9252  
9253  	if (mode != XDP_MODE_HW)
9254  		bpf_prog_change_xdp(dev_xdp_prog(dev, mode), prog);
9255  
9256  	return 0;
9257  }
9258  
9259  static void dev_xdp_uninstall(struct net_device *dev)
9260  {
9261  	struct bpf_xdp_link *link;
9262  	struct bpf_prog *prog;
9263  	enum bpf_xdp_mode mode;
9264  	bpf_op_t bpf_op;
9265  
9266  	ASSERT_RTNL();
9267  
9268  	for (mode = XDP_MODE_SKB; mode < __MAX_XDP_MODE; mode++) {
9269  		prog = dev_xdp_prog(dev, mode);
9270  		if (!prog)
9271  			continue;
9272  
9273  		bpf_op = dev_xdp_bpf_op(dev, mode);
9274  		if (!bpf_op)
9275  			continue;
9276  
9277  		WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));
9278  
9279  		/* auto-detach link from net device */
9280  		link = dev_xdp_link(dev, mode);
9281  		if (link)
9282  			link->dev = NULL;
9283  		else
9284  			bpf_prog_put(prog);
9285  
9286  		dev_xdp_set_link(dev, mode, NULL);
9287  	}
9288  }
9289  
9290  static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack,
9291  			  struct bpf_xdp_link *link, struct bpf_prog *new_prog,
9292  			  struct bpf_prog *old_prog, u32 flags)
9293  {
9294  	unsigned int num_modes = hweight32(flags & XDP_FLAGS_MODES);
9295  	struct bpf_prog *cur_prog;
9296  	struct net_device *upper;
9297  	struct list_head *iter;
9298  	enum bpf_xdp_mode mode;
9299  	bpf_op_t bpf_op;
9300  	int err;
9301  
9302  	ASSERT_RTNL();
9303  
9304  	/* either link or prog attachment, never both */
9305  	if (link && (new_prog || old_prog))
9306  		return -EINVAL;
9307  	/* link supports only XDP mode flags */
9308  	if (link && (flags & ~XDP_FLAGS_MODES)) {
9309  		NL_SET_ERR_MSG(extack, "Invalid XDP flags for BPF link attachment");
9310  		return -EINVAL;
9311  	}
9312  	/* just one XDP mode bit should be set, zero defaults to drv/skb mode */
9313  	if (num_modes > 1) {
9314  		NL_SET_ERR_MSG(extack, "Only one XDP mode flag can be set");
9315  		return -EINVAL;
9316  	}
9317  	/* avoid ambiguity if offload + drv/skb mode progs are both loaded */
9318  	if (!num_modes && dev_xdp_prog_count(dev) > 1) {
9319  		NL_SET_ERR_MSG(extack,
9320  			       "More than one program loaded, unset mode is ambiguous");
9321  		return -EINVAL;
9322  	}
9323  	/* old_prog != NULL implies XDP_FLAGS_REPLACE is set */
9324  	if (old_prog && !(flags & XDP_FLAGS_REPLACE)) {
9325  		NL_SET_ERR_MSG(extack, "XDP_FLAGS_REPLACE is not specified");
9326  		return -EINVAL;
9327  	}
9328  
9329  	mode = dev_xdp_mode(dev, flags);
9330  	/* can't replace attached link */
9331  	if (dev_xdp_link(dev, mode)) {
9332  		NL_SET_ERR_MSG(extack, "Can't replace active BPF XDP link");
9333  		return -EBUSY;
9334  	}
9335  
9336  	/* don't allow if an upper device already has a program */
9337  	netdev_for_each_upper_dev_rcu(dev, upper, iter) {
9338  		if (dev_xdp_prog_count(upper) > 0) {
9339  			NL_SET_ERR_MSG(extack, "Cannot attach when an upper device already has a program");
9340  			return -EEXIST;
9341  		}
9342  	}
9343  
9344  	cur_prog = dev_xdp_prog(dev, mode);
9345  	/* can't replace attached prog with link */
9346  	if (link && cur_prog) {
9347  		NL_SET_ERR_MSG(extack, "Can't replace active XDP program with BPF link");
9348  		return -EBUSY;
9349  	}
9350  	if ((flags & XDP_FLAGS_REPLACE) && cur_prog != old_prog) {
9351  		NL_SET_ERR_MSG(extack, "Active program does not match expected");
9352  		return -EEXIST;
9353  	}
9354  
9355  	/* put effective new program into new_prog */
9356  	if (link)
9357  		new_prog = link->link.prog;
9358  
9359  	if (new_prog) {
9360  		bool offload = mode == XDP_MODE_HW;
9361  		enum bpf_xdp_mode other_mode = mode == XDP_MODE_SKB
9362  					       ? XDP_MODE_DRV : XDP_MODE_SKB;
9363  
9364  		if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && cur_prog) {
9365  			NL_SET_ERR_MSG(extack, "XDP program already attached");
9366  			return -EBUSY;
9367  		}
9368  		if (!offload && dev_xdp_prog(dev, other_mode)) {
9369  			NL_SET_ERR_MSG(extack, "Native and generic XDP can't be active at the same time");
9370  			return -EEXIST;
9371  		}
9372  		if (!offload && bpf_prog_is_offloaded(new_prog->aux)) {
9373  			NL_SET_ERR_MSG(extack, "Using offloaded program without HW_MODE flag is not supported");
9374  			return -EINVAL;
9375  		}
9376  		if (bpf_prog_is_dev_bound(new_prog->aux) && !bpf_offload_dev_match(new_prog, dev)) {
9377  			NL_SET_ERR_MSG(extack, "Program bound to different device");
9378  			return -EINVAL;
9379  		}
9380  		if (bpf_prog_is_dev_bound(new_prog->aux) && mode == XDP_MODE_SKB) {
9381  			NL_SET_ERR_MSG(extack, "Can't attach device-bound programs in generic mode");
9382  			return -EINVAL;
9383  		}
9384  		if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) {
9385  			NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device");
9386  			return -EINVAL;
9387  		}
9388  		if (new_prog->expected_attach_type == BPF_XDP_CPUMAP) {
9389  			NL_SET_ERR_MSG(extack, "BPF_XDP_CPUMAP programs can not be attached to a device");
9390  			return -EINVAL;
9391  		}
9392  	}
9393  
9394  	/* don't call drivers if the effective program didn't change */
9395  	if (new_prog != cur_prog) {
9396  		bpf_op = dev_xdp_bpf_op(dev, mode);
9397  		if (!bpf_op) {
9398  			NL_SET_ERR_MSG(extack, "Underlying driver does not support XDP in native mode");
9399  			return -EOPNOTSUPP;
9400  		}
9401  
9402  		err = dev_xdp_install(dev, mode, bpf_op, extack, flags, new_prog);
9403  		if (err)
9404  			return err;
9405  	}
9406  
9407  	if (link)
9408  		dev_xdp_set_link(dev, mode, link);
9409  	else
9410  		dev_xdp_set_prog(dev, mode, new_prog);
9411  	if (cur_prog)
9412  		bpf_prog_put(cur_prog);
9413  
9414  	return 0;
9415  }
9416  
9417  static int dev_xdp_attach_link(struct net_device *dev,
9418  			       struct netlink_ext_ack *extack,
9419  			       struct bpf_xdp_link *link)
9420  {
9421  	return dev_xdp_attach(dev, extack, link, NULL, NULL, link->flags);
9422  }
9423  
9424  static int dev_xdp_detach_link(struct net_device *dev,
9425  			       struct netlink_ext_ack *extack,
9426  			       struct bpf_xdp_link *link)
9427  {
9428  	enum bpf_xdp_mode mode;
9429  	bpf_op_t bpf_op;
9430  
9431  	ASSERT_RTNL();
9432  
9433  	mode = dev_xdp_mode(dev, link->flags);
9434  	if (dev_xdp_link(dev, mode) != link)
9435  		return -EINVAL;
9436  
9437  	bpf_op = dev_xdp_bpf_op(dev, mode);
9438  	WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));
9439  	dev_xdp_set_link(dev, mode, NULL);
9440  	return 0;
9441  }
9442  
9443  static void bpf_xdp_link_release(struct bpf_link *link)
9444  {
9445  	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9446  
9447  	rtnl_lock();
9448  
9449  	/* if racing with net_device's teardown, xdp_link->dev might already
9450  	 * be NULL, in which case the link was already auto-detached
9451  	 */
9452  	if (xdp_link->dev) {
9453  		WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link));
9454  		xdp_link->dev = NULL;
9455  	}
9456  
9457  	rtnl_unlock();
9458  }
9459  
9460  static int bpf_xdp_link_detach(struct bpf_link *link)
9461  {
9462  	bpf_xdp_link_release(link);
9463  	return 0;
9464  }
9465  
9466  static void bpf_xdp_link_dealloc(struct bpf_link *link)
9467  {
9468  	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9469  
9470  	kfree(xdp_link);
9471  }
9472  
9473  static void bpf_xdp_link_show_fdinfo(const struct bpf_link *link,
9474  				     struct seq_file *seq)
9475  {
9476  	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9477  	u32 ifindex = 0;
9478  
9479  	rtnl_lock();
9480  	if (xdp_link->dev)
9481  		ifindex = xdp_link->dev->ifindex;
9482  	rtnl_unlock();
9483  
9484  	seq_printf(seq, "ifindex:\t%u\n", ifindex);
9485  }
9486  
9487  static int bpf_xdp_link_fill_link_info(const struct bpf_link *link,
9488  				       struct bpf_link_info *info)
9489  {
9490  	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9491  	u32 ifindex = 0;
9492  
9493  	rtnl_lock();
9494  	if (xdp_link->dev)
9495  		ifindex = xdp_link->dev->ifindex;
9496  	rtnl_unlock();
9497  
9498  	info->xdp.ifindex = ifindex;
9499  	return 0;
9500  }
9501  
9502  static int bpf_xdp_link_update(struct bpf_link *link, struct bpf_prog *new_prog,
9503  			       struct bpf_prog *old_prog)
9504  {
9505  	struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9506  	enum bpf_xdp_mode mode;
9507  	bpf_op_t bpf_op;
9508  	int err = 0;
9509  
9510  	rtnl_lock();
9511  
9512  	/* link might have been auto-released already, so fail */
9513  	if (!xdp_link->dev) {
9514  		err = -ENOLINK;
9515  		goto out_unlock;
9516  	}
9517  
9518  	if (old_prog && link->prog != old_prog) {
9519  		err = -EPERM;
9520  		goto out_unlock;
9521  	}
9522  	old_prog = link->prog;
9523  	if (old_prog->type != new_prog->type ||
9524  	    old_prog->expected_attach_type != new_prog->expected_attach_type) {
9525  		err = -EINVAL;
9526  		goto out_unlock;
9527  	}
9528  
9529  	if (old_prog == new_prog) {
9530  		/* no-op, don't disturb drivers */
9531  		bpf_prog_put(new_prog);
9532  		goto out_unlock;
9533  	}
9534  
9535  	mode = dev_xdp_mode(xdp_link->dev, xdp_link->flags);
9536  	bpf_op = dev_xdp_bpf_op(xdp_link->dev, mode);
9537  	err = dev_xdp_install(xdp_link->dev, mode, bpf_op, NULL,
9538  			      xdp_link->flags, new_prog);
9539  	if (err)
9540  		goto out_unlock;
9541  
9542  	old_prog = xchg(&link->prog, new_prog);
9543  	bpf_prog_put(old_prog);
9544  
9545  out_unlock:
9546  	rtnl_unlock();
9547  	return err;
9548  }
9549  
9550  static const struct bpf_link_ops bpf_xdp_link_lops = {
9551  	.release = bpf_xdp_link_release,
9552  	.dealloc = bpf_xdp_link_dealloc,
9553  	.detach = bpf_xdp_link_detach,
9554  	.show_fdinfo = bpf_xdp_link_show_fdinfo,
9555  	.fill_link_info = bpf_xdp_link_fill_link_info,
9556  	.update_prog = bpf_xdp_link_update,
9557  };
9558  
9559  int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
9560  {
9561  	struct net *net = current->nsproxy->net_ns;
9562  	struct bpf_link_primer link_primer;
9563  	struct netlink_ext_ack extack = {};
9564  	struct bpf_xdp_link *link;
9565  	struct net_device *dev;
9566  	int err, fd;
9567  
9568  	rtnl_lock();
9569  	dev = dev_get_by_index(net, attr->link_create.target_ifindex);
9570  	if (!dev) {
9571  		rtnl_unlock();
9572  		return -EINVAL;
9573  	}
9574  
9575  	link = kzalloc(sizeof(*link), GFP_USER);
9576  	if (!link) {
9577  		err = -ENOMEM;
9578  		goto unlock;
9579  	}
9580  
9581  	bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog);
9582  	link->dev = dev;
9583  	link->flags = attr->link_create.flags;
9584  
9585  	err = bpf_link_prime(&link->link, &link_primer);
9586  	if (err) {
9587  		kfree(link);
9588  		goto unlock;
9589  	}
9590  
9591  	err = dev_xdp_attach_link(dev, &extack, link);
9592  	rtnl_unlock();
9593  
9594  	if (err) {
9595  		link->dev = NULL;
9596  		bpf_link_cleanup(&link_primer);
9597  		trace_bpf_xdp_link_attach_failed(extack._msg);
9598  		goto out_put_dev;
9599  	}
9600  
9601  	fd = bpf_link_settle(&link_primer);
9602  	/* the link itself doesn't hold dev's refcnt, to avoid complicating shutdown */
9603  	dev_put(dev);
9604  	return fd;
9605  
9606  unlock:
9607  	rtnl_unlock();
9608  
9609  out_put_dev:
9610  	dev_put(dev);
9611  	return err;
9612  }
9613  
9614  /**
9615   *	dev_change_xdp_fd - set or clear a bpf program for a device rx path
9616   *	@dev: device
9617   *	@extack: netlink extended ack
9618   *	@fd: new program fd or negative value to clear
9619   *	@expected_fd: old program fd that userspace expects to replace or clear
9620   *	@flags: xdp-related flags
9621   *
9622   *	Set or clear a bpf program for a device
9623   */
9624  int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
9625  		      int fd, int expected_fd, u32 flags)
9626  {
9627  	enum bpf_xdp_mode mode = dev_xdp_mode(dev, flags);
9628  	struct bpf_prog *new_prog = NULL, *old_prog = NULL;
9629  	int err;
9630  
9631  	ASSERT_RTNL();
9632  
9633  	if (fd >= 0) {
9634  		new_prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
9635  						 mode != XDP_MODE_SKB);
9636  		if (IS_ERR(new_prog))
9637  			return PTR_ERR(new_prog);
9638  	}
9639  
9640  	if (expected_fd >= 0) {
9641  		old_prog = bpf_prog_get_type_dev(expected_fd, BPF_PROG_TYPE_XDP,
9642  						 mode != XDP_MODE_SKB);
9643  		if (IS_ERR(old_prog)) {
9644  			err = PTR_ERR(old_prog);
9645  			old_prog = NULL;
9646  			goto err_out;
9647  		}
9648  	}
9649  
9650  	err = dev_xdp_attach(dev, extack, NULL, new_prog, old_prog, flags);
9651  
9652  err_out:
9653  	if (err && new_prog)
9654  		bpf_prog_put(new_prog);
9655  	if (old_prog)
9656  		bpf_prog_put(old_prog);
9657  	return err;
9658  }
9659  
9660  /**
9661   * dev_index_reserve() - allocate an ifindex in a namespace
9662   * @net: the applicable net namespace
9663   * @ifindex: requested ifindex, pass %0 to get one allocated
9664   *
9665   * Allocate an ifindex for a new device. Caller must either use the ifindex
9666   * to store the device (via list_netdevice()) or call dev_index_release()
9667   * to give the index up.
9668   *
9669   * Return: a suitable unique value for a new device interface number or -errno.
9670   */
9671  static int dev_index_reserve(struct net *net, u32 ifindex)
9672  {
9673  	int err;
9674  
9675  	if (ifindex > INT_MAX) {
9676  		DEBUG_NET_WARN_ON_ONCE(1);
9677  		return -EINVAL;
9678  	}
9679  
9680  	if (!ifindex)
9681  		err = xa_alloc_cyclic(&net->dev_by_index, &ifindex, NULL,
9682  				      xa_limit_31b, &net->ifindex, GFP_KERNEL);
9683  	else
9684  		err = xa_insert(&net->dev_by_index, ifindex, NULL, GFP_KERNEL);
9685  	if (err < 0)
9686  		return err;
9687  
9688  	return ifindex;
9689  }
9690  
9691  static void dev_index_release(struct net *net, int ifindex)
9692  {
9693  	/* Expect only unused indexes; unlist_netdevice() removes the used ones */
9694  	WARN_ON(xa_erase(&net->dev_by_index, ifindex));
9695  }
9696  
9697  /* Delayed registration/unregistration */
9698  LIST_HEAD(net_todo_list);
9699  DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
9700  
9701  static void net_set_todo(struct net_device *dev)
9702  {
9703  	list_add_tail(&dev->todo_list, &net_todo_list);
9704  	atomic_inc(&dev_net(dev)->dev_unreg_count);
9705  }
9706  
9707  static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
9708  	struct net_device *upper, netdev_features_t features)
9709  {
9710  	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
9711  	netdev_features_t feature;
9712  	int feature_bit;
9713  
9714  	for_each_netdev_feature(upper_disables, feature_bit) {
9715  		feature = __NETIF_F_BIT(feature_bit);
9716  		if (!(upper->wanted_features & feature)
9717  		    && (features & feature)) {
9718  			netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
9719  				   &feature, upper->name);
9720  			features &= ~feature;
9721  		}
9722  	}
9723  
9724  	return features;
9725  }
9726  
9727  static void netdev_sync_lower_features(struct net_device *upper,
9728  	struct net_device *lower, netdev_features_t features)
9729  {
9730  	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
9731  	netdev_features_t feature;
9732  	int feature_bit;
9733  
9734  	for_each_netdev_feature(upper_disables, feature_bit) {
9735  		feature = __NETIF_F_BIT(feature_bit);
9736  		if (!(features & feature) && (lower->features & feature)) {
9737  			netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
9738  				   &feature, lower->name);
9739  			lower->wanted_features &= ~feature;
9740  			__netdev_update_features(lower);
9741  
9742  			if (unlikely(lower->features & feature))
9743  				netdev_WARN(upper, "failed to disable %pNF on %s!\n",
9744  					    &feature, lower->name);
9745  			else
9746  				netdev_features_change(lower);
9747  		}
9748  	}
9749  }
9750  
9751  static netdev_features_t netdev_fix_features(struct net_device *dev,
9752  	netdev_features_t features)
9753  {
9754  	/* Fix illegal checksum combinations */
9755  	if ((features & NETIF_F_HW_CSUM) &&
9756  	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
9757  		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
9758  		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
9759  	}
9760  
9761  	/* TSO requires that SG is present as well. */
9762  	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
9763  		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
9764  		features &= ~NETIF_F_ALL_TSO;
9765  	}
9766  
9767  	if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
9768  					!(features & NETIF_F_IP_CSUM)) {
9769  		netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
9770  		features &= ~NETIF_F_TSO;
9771  		features &= ~NETIF_F_TSO_ECN;
9772  	}
9773  
9774  	if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
9775  					 !(features & NETIF_F_IPV6_CSUM)) {
9776  		netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
9777  		features &= ~NETIF_F_TSO6;
9778  	}
9779  
9780  	/* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
9781  	if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
9782  		features &= ~NETIF_F_TSO_MANGLEID;
9783  
9784  	/* TSO ECN requires that TSO is present as well. */
9785  	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
9786  		features &= ~NETIF_F_TSO_ECN;
9787  
9788  	/* Software GSO depends on SG. */
9789  	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
9790  		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
9791  		features &= ~NETIF_F_GSO;
9792  	}
9793  
9794  	/* GSO partial features require GSO partial be set */
9795  	if ((features & dev->gso_partial_features) &&
9796  	    !(features & NETIF_F_GSO_PARTIAL)) {
9797  		netdev_dbg(dev,
9798  			   "Dropping partially supported GSO features since no GSO partial.\n");
9799  		features &= ~dev->gso_partial_features;
9800  	}
9801  
9802  	if (!(features & NETIF_F_RXCSUM)) {
9803  		/* NETIF_F_GRO_HW implies doing RXCSUM since every packet
9804  		 * successfully merged by hardware must also have the
9805  		 * checksum verified by hardware.  If the user does not
9806  		 * want to enable RXCSUM, logically, we should disable GRO_HW.
9807  		 */
9808  		if (features & NETIF_F_GRO_HW) {
9809  			netdev_dbg(dev, "Dropping NETIF_F_GRO_HW since no RXCSUM feature.\n");
9810  			features &= ~NETIF_F_GRO_HW;
9811  		}
9812  	}
9813  
9814  	/* LRO/HW-GRO features cannot be combined with RX-FCS */
9815  	if (features & NETIF_F_RXFCS) {
9816  		if (features & NETIF_F_LRO) {
9817  			netdev_dbg(dev, "Dropping LRO feature since RX-FCS is requested.\n");
9818  			features &= ~NETIF_F_LRO;
9819  		}
9820  
9821  		if (features & NETIF_F_GRO_HW) {
9822  			netdev_dbg(dev, "Dropping HW-GRO feature since RX-FCS is requested.\n");
9823  			features &= ~NETIF_F_GRO_HW;
9824  		}
9825  	}
9826  
9827  	if ((features & NETIF_F_GRO_HW) && (features & NETIF_F_LRO)) {
9828  		netdev_dbg(dev, "Dropping LRO feature since HW-GRO is requested.\n");
9829  		features &= ~NETIF_F_LRO;
9830  	}
9831  
9832  	if (features & NETIF_F_HW_TLS_TX) {
9833  		bool ip_csum = (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) ==
9834  			(NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
9835  		bool hw_csum = features & NETIF_F_HW_CSUM;
9836  
9837  		if (!ip_csum && !hw_csum) {
9838  			netdev_dbg(dev, "Dropping TLS TX HW offload feature since no CSUM feature.\n");
9839  			features &= ~NETIF_F_HW_TLS_TX;
9840  		}
9841  	}
9842  
9843  	if ((features & NETIF_F_HW_TLS_RX) && !(features & NETIF_F_RXCSUM)) {
9844  		netdev_dbg(dev, "Dropping TLS RX HW offload feature since no RXCSUM feature.\n");
9845  		features &= ~NETIF_F_HW_TLS_RX;
9846  	}
9847  
9848  	return features;
9849  }
9850  
9851  int __netdev_update_features(struct net_device *dev)
9852  {
9853  	struct net_device *upper, *lower;
9854  	netdev_features_t features;
9855  	struct list_head *iter;
9856  	int err = -1;
9857  
9858  	ASSERT_RTNL();
9859  
9860  	features = netdev_get_wanted_features(dev);
9861  
9862  	if (dev->netdev_ops->ndo_fix_features)
9863  		features = dev->netdev_ops->ndo_fix_features(dev, features);
9864  
9865  	/* driver might be less strict about feature dependencies */
9866  	features = netdev_fix_features(dev, features);
9867  
9868  	/* some features can't be enabled if they're off on an upper device */
9869  	netdev_for_each_upper_dev_rcu(dev, upper, iter)
9870  		features = netdev_sync_upper_features(dev, upper, features);
9871  
9872  	if (dev->features == features)
9873  		goto sync_lower;
9874  
9875  	netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
9876  		&dev->features, &features);
9877  
9878  	if (dev->netdev_ops->ndo_set_features)
9879  		err = dev->netdev_ops->ndo_set_features(dev, features);
9880  	else
9881  		err = 0;
9882  
9883  	if (unlikely(err < 0)) {
9884  		netdev_err(dev,
9885  			"set_features() failed (%d); wanted %pNF, left %pNF\n",
9886  			err, &features, &dev->features);
9887  		/* return non-0 since some features might have changed and
9888  		 * it's better to fire a spurious notification than miss it
9889  		 */
9890  		return -1;
9891  	}
9892  
9893  sync_lower:
9894  	/* some features must be disabled on lower devices when disabled
9895  	 * on an upper device (think: bonding master or bridge)
9896  	 */
9897  	netdev_for_each_lower_dev(dev, lower, iter)
9898  		netdev_sync_lower_features(dev, lower, features);
9899  
9900  	if (!err) {
9901  		netdev_features_t diff = features ^ dev->features;
9902  
9903  		if (diff & NETIF_F_RX_UDP_TUNNEL_PORT) {
9904  			/* udp_tunnel_{get,drop}_rx_info both need
9905  			 * NETIF_F_RX_UDP_TUNNEL_PORT enabled on the
9906  			 * device, or they won't do anything.
9907  			 * Thus we need to update dev->features
9908  			 * *before* calling udp_tunnel_get_rx_info,
9909  			 * but *after* calling udp_tunnel_drop_rx_info.
9910  			 */
9911  			if (features & NETIF_F_RX_UDP_TUNNEL_PORT) {
9912  				dev->features = features;
9913  				udp_tunnel_get_rx_info(dev);
9914  			} else {
9915  				udp_tunnel_drop_rx_info(dev);
9916  			}
9917  		}
9918  
9919  		if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) {
9920  			if (features & NETIF_F_HW_VLAN_CTAG_FILTER) {
9921  				dev->features = features;
9922  				err |= vlan_get_rx_ctag_filter_info(dev);
9923  			} else {
9924  				vlan_drop_rx_ctag_filter_info(dev);
9925  			}
9926  		}
9927  
9928  		if (diff & NETIF_F_HW_VLAN_STAG_FILTER) {
9929  			if (features & NETIF_F_HW_VLAN_STAG_FILTER) {
9930  				dev->features = features;
9931  				err |= vlan_get_rx_stag_filter_info(dev);
9932  			} else {
9933  				vlan_drop_rx_stag_filter_info(dev);
9934  			}
9935  		}
9936  
9937  		dev->features = features;
9938  	}
9939  
9940  	return err < 0 ? 0 : 1;
9941  }
9942  
9943  /**
9944   *	netdev_update_features - recalculate device features
9945   *	@dev: the device to check
9946   *
9947   *	Recalculate dev->features set and send notifications if it
9948   *	has changed. Should be called after driver or hardware dependent
9949   *	conditions might have changed that influence the features.
9950   */
9951  void netdev_update_features(struct net_device *dev)
9952  {
9953  	if (__netdev_update_features(dev))
9954  		netdev_features_change(dev);
9955  }
9956  EXPORT_SYMBOL(netdev_update_features);
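
/*
 * Example (editorial sketch, not part of the upstream file): a hypothetical
 * driver re-evaluating its feature set after a hardware reconfiguration
 * changed what it can offload; ndo_fix_features()/ndo_set_features() are
 * then invoked from __netdev_update_features(). RTNL must be held and the
 * "foo_" name is an assumption.
 */
static void foo_after_reconfig(struct net_device *dev, bool tso_possible)
{
	ASSERT_RTNL();

	if (tso_possible)
		dev->hw_features |= NETIF_F_TSO;
	else
		dev->hw_features &= ~NETIF_F_TSO;

	netdev_update_features(dev);
}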
9957  
9958  /**
9959   *	netdev_change_features - recalculate device features
9960   *	@dev: the device to check
9961   *
9962   *	Recalculate dev->features set and send notifications even
9963   *	if they have not changed. Should be called instead of
9964   *	netdev_update_features() if also dev->vlan_features might
9965   *	have changed to allow the changes to be propagated to stacked
9966   *	VLAN devices.
9967   */
9968  void netdev_change_features(struct net_device *dev)
9969  {
9970  	__netdev_update_features(dev);
9971  	netdev_features_change(dev);
9972  }
9973  EXPORT_SYMBOL(netdev_change_features);
9974  
9975  /**
9976   *	netif_stacked_transfer_operstate -	transfer operstate
9977   *	@rootdev: the root or lower level device to transfer state from
9978   *	@dev: the device to transfer operstate to
9979   *
9980   *	Transfer operational state from root to device. This is normally
9981   *	called when a stacking relationship exists between the root
9982   *	device and the device (a leaf device).
9983   */
9984  void netif_stacked_transfer_operstate(const struct net_device *rootdev,
9985  					struct net_device *dev)
9986  {
9987  	if (rootdev->operstate == IF_OPER_DORMANT)
9988  		netif_dormant_on(dev);
9989  	else
9990  		netif_dormant_off(dev);
9991  
9992  	if (rootdev->operstate == IF_OPER_TESTING)
9993  		netif_testing_on(dev);
9994  	else
9995  		netif_testing_off(dev);
9996  
9997  	if (netif_carrier_ok(rootdev))
9998  		netif_carrier_on(dev);
9999  	else
10000  		netif_carrier_off(dev);
10001  }
10002  EXPORT_SYMBOL(netif_stacked_transfer_operstate);
10003  
10004  static int netif_alloc_rx_queues(struct net_device *dev)
10005  {
10006  	unsigned int i, count = dev->num_rx_queues;
10007  	struct netdev_rx_queue *rx;
10008  	size_t sz = count * sizeof(*rx);
10009  	int err = 0;
10010  
10011  	BUG_ON(count < 1);
10012  
10013  	rx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
10014  	if (!rx)
10015  		return -ENOMEM;
10016  
10017  	dev->_rx = rx;
10018  
10019  	for (i = 0; i < count; i++) {
10020  		rx[i].dev = dev;
10021  
10022  		/* XDP RX-queue setup */
10023  		err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i, 0);
10024  		if (err < 0)
10025  			goto err_rxq_info;
10026  	}
10027  	return 0;
10028  
10029  err_rxq_info:
10030  	/* Rollback successful reg's and free other resources */
10031  	while (i--)
10032  		xdp_rxq_info_unreg(&rx[i].xdp_rxq);
10033  	kvfree(dev->_rx);
10034  	dev->_rx = NULL;
10035  	return err;
10036  }
10037  
10038  static void netif_free_rx_queues(struct net_device *dev)
10039  {
10040  	unsigned int i, count = dev->num_rx_queues;
10041  
10042  	/* netif_alloc_rx_queues alloc failed, resources have been unreg'ed */
10043  	if (!dev->_rx)
10044  		return;
10045  
10046  	for (i = 0; i < count; i++)
10047  		xdp_rxq_info_unreg(&dev->_rx[i].xdp_rxq);
10048  
10049  	kvfree(dev->_rx);
10050  }
10051  
10052  static void netdev_init_one_queue(struct net_device *dev,
10053  				  struct netdev_queue *queue, void *_unused)
10054  {
10055  	/* Initialize queue lock */
10056  	spin_lock_init(&queue->_xmit_lock);
10057  	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
10058  	queue->xmit_lock_owner = -1;
10059  	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
10060  	queue->dev = dev;
10061  #ifdef CONFIG_BQL
10062  	dql_init(&queue->dql, HZ);
10063  #endif
10064  }
10065  
10066  static void netif_free_tx_queues(struct net_device *dev)
10067  {
10068  	kvfree(dev->_tx);
10069  }
10070  
10071  static int netif_alloc_netdev_queues(struct net_device *dev)
10072  {
10073  	unsigned int count = dev->num_tx_queues;
10074  	struct netdev_queue *tx;
10075  	size_t sz = count * sizeof(*tx);
10076  
10077  	if (count < 1 || count > 0xffff)
10078  		return -EINVAL;
10079  
10080  	tx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
10081  	if (!tx)
10082  		return -ENOMEM;
10083  
10084  	dev->_tx = tx;
10085  
10086  	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
10087  	spin_lock_init(&dev->tx_global_lock);
10088  
10089  	return 0;
10090  }
10091  
10092  void netif_tx_stop_all_queues(struct net_device *dev)
10093  {
10094  	unsigned int i;
10095  
10096  	for (i = 0; i < dev->num_tx_queues; i++) {
10097  		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
10098  
10099  		netif_tx_stop_queue(txq);
10100  	}
10101  }
10102  EXPORT_SYMBOL(netif_tx_stop_all_queues);
10103  
10104  static int netdev_do_alloc_pcpu_stats(struct net_device *dev)
10105  {
10106  	void __percpu *v;
10107  
10108  	/* Drivers implementing ndo_get_peer_dev must support tstat
10109  	 * accounting, so that skb_do_redirect() can bump the dev's
10110  	 * RX stats upon network namespace switch.
10111  	 */
10112  	if (dev->netdev_ops->ndo_get_peer_dev &&
10113  	    dev->pcpu_stat_type != NETDEV_PCPU_STAT_TSTATS)
10114  		return -EOPNOTSUPP;
10115  
10116  	switch (dev->pcpu_stat_type) {
10117  	case NETDEV_PCPU_STAT_NONE:
10118  		return 0;
10119  	case NETDEV_PCPU_STAT_LSTATS:
10120  		v = dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats);
10121  		break;
10122  	case NETDEV_PCPU_STAT_TSTATS:
10123  		v = dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
10124  		break;
10125  	case NETDEV_PCPU_STAT_DSTATS:
10126  		v = dev->dstats = netdev_alloc_pcpu_stats(struct pcpu_dstats);
10127  		break;
10128  	default:
10129  		return -EINVAL;
10130  	}
10131  
10132  	return v ? 0 : -ENOMEM;
10133  }
10134  
10135  static void netdev_do_free_pcpu_stats(struct net_device *dev)
10136  {
10137  	switch (dev->pcpu_stat_type) {
10138  	case NETDEV_PCPU_STAT_NONE:
10139  		return;
10140  	case NETDEV_PCPU_STAT_LSTATS:
10141  		free_percpu(dev->lstats);
10142  		break;
10143  	case NETDEV_PCPU_STAT_TSTATS:
10144  		free_percpu(dev->tstats);
10145  		break;
10146  	case NETDEV_PCPU_STAT_DSTATS:
10147  		free_percpu(dev->dstats);
10148  		break;
10149  	}
10150  }
10151  
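
/*
 * Example of the driver side of the pcpu stats handling above: a sketch of a
 * hypothetical virtual device ("foo") that opts in to core-managed tstats in
 * its setup callback, so register_netdevice() allocates dev->tstats and
 * netdev_run_todo() frees it; the driver never manages that lifetime itself.
 */
static void foo_setup(struct net_device *dev)
{
	/* must be set before register_netdevice() runs */
	dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS;
}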
10152  /**
10153   * register_netdevice() - register a network device
10154   * @dev: device to register
10155   *
10156   * Take a prepared network device structure and make it externally accessible.
10157   * A %NETDEV_REGISTER message is sent to the netdev notifier chain.
10158   * Callers must hold the rtnl lock - you may want register_netdev()
10159   * instead of this.
10160   */
10161  int register_netdevice(struct net_device *dev)
10162  {
10163  	int ret;
10164  	struct net *net = dev_net(dev);
10165  
10166  	BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE <
10167  		     NETDEV_FEATURE_COUNT);
10168  	BUG_ON(dev_boot_phase);
10169  	ASSERT_RTNL();
10170  
10171  	might_sleep();
10172  
10173  	/* When net_devices are persistent, this will be fatal. */
10174  	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
10175  	BUG_ON(!net);
10176  
10177  	ret = ethtool_check_ops(dev->ethtool_ops);
10178  	if (ret)
10179  		return ret;
10180  
10181  	spin_lock_init(&dev->addr_list_lock);
10182  	netdev_set_addr_lockdep_class(dev);
10183  
10184  	ret = dev_get_valid_name(net, dev, dev->name);
10185  	if (ret < 0)
10186  		goto out;
10187  
10188  	ret = -ENOMEM;
10189  	dev->name_node = netdev_name_node_head_alloc(dev);
10190  	if (!dev->name_node)
10191  		goto out;
10192  
10193  	/* Init, if this function is available */
10194  	if (dev->netdev_ops->ndo_init) {
10195  		ret = dev->netdev_ops->ndo_init(dev);
10196  		if (ret) {
10197  			if (ret > 0)
10198  				ret = -EIO;
10199  			goto err_free_name;
10200  		}
10201  	}
10202  
10203  	if (((dev->hw_features | dev->features) &
10204  	     NETIF_F_HW_VLAN_CTAG_FILTER) &&
10205  	    (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
10206  	     !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
10207  		netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
10208  		ret = -EINVAL;
10209  		goto err_uninit;
10210  	}
10211  
10212  	ret = netdev_do_alloc_pcpu_stats(dev);
10213  	if (ret)
10214  		goto err_uninit;
10215  
10216  	ret = dev_index_reserve(net, dev->ifindex);
10217  	if (ret < 0)
10218  		goto err_free_pcpu;
10219  	dev->ifindex = ret;
10220  
10221  	/* Transfer changeable features to wanted_features and enable
10222  	 * software offloads (GSO and GRO).
10223  	 */
10224  	dev->hw_features |= (NETIF_F_SOFT_FEATURES | NETIF_F_SOFT_FEATURES_OFF);
10225  	dev->features |= NETIF_F_SOFT_FEATURES;
10226  
10227  	if (dev->udp_tunnel_nic_info) {
10228  		dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT;
10229  		dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT;
10230  	}
10231  
10232  	dev->wanted_features = dev->features & dev->hw_features;
10233  
10234  	if (!(dev->flags & IFF_LOOPBACK))
10235  		dev->hw_features |= NETIF_F_NOCACHE_COPY;
10236  
10237  	/* If IPv4 TCP segmentation offload is supported we should also
10238  	 * allow the device to enable segmenting the frame with the option
10239  	 * of ignoring a static IP ID value.  This doesn't enable the
10240  	 * feature itself but allows the user to enable it later.
10241  	 */
10242  	if (dev->hw_features & NETIF_F_TSO)
10243  		dev->hw_features |= NETIF_F_TSO_MANGLEID;
10244  	if (dev->vlan_features & NETIF_F_TSO)
10245  		dev->vlan_features |= NETIF_F_TSO_MANGLEID;
10246  	if (dev->mpls_features & NETIF_F_TSO)
10247  		dev->mpls_features |= NETIF_F_TSO_MANGLEID;
10248  	if (dev->hw_enc_features & NETIF_F_TSO)
10249  		dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
10250  
10251  	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
10252  	 */
10253  	dev->vlan_features |= NETIF_F_HIGHDMA;
10254  
10255  	/* Make NETIF_F_SG inheritable to tunnel devices.
10256  	 */
10257  	dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
10258  
10259  	/* Make NETIF_F_SG inheritable to MPLS.
10260  	 */
10261  	dev->mpls_features |= NETIF_F_SG;
10262  
10263  	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
10264  	ret = notifier_to_errno(ret);
10265  	if (ret)
10266  		goto err_ifindex_release;
10267  
10268  	ret = netdev_register_kobject(dev);
10269  	write_lock(&dev_base_lock);
10270  	dev->reg_state = ret ? NETREG_UNREGISTERED : NETREG_REGISTERED;
10271  	write_unlock(&dev_base_lock);
10272  	if (ret)
10273  		goto err_uninit_notify;
10274  
10275  	__netdev_update_features(dev);
10276  
10277  	/*
10278  	 *	Default initial state at registration is that the
10279  	 *	device is present.
10280  	 */
10281  
10282  	set_bit(__LINK_STATE_PRESENT, &dev->state);
10283  
10284  	linkwatch_init_dev(dev);
10285  
10286  	dev_init_scheduler(dev);
10287  
10288  	netdev_hold(dev, &dev->dev_registered_tracker, GFP_KERNEL);
10289  	list_netdevice(dev);
10290  
10291  	add_device_randomness(dev->dev_addr, dev->addr_len);
10292  
10293  	/* If the device has a permanent device address, the driver should
10294  	 * set dev_addr and also set addr_assign_type to
10295  	 * NET_ADDR_PERM (the default value).
10296  	 */
10297  	if (dev->addr_assign_type == NET_ADDR_PERM)
10298  		memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
10299  
10300  	/* Notify protocols, that a new device appeared. */
10301  	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
10302  	ret = notifier_to_errno(ret);
10303  	if (ret) {
10304  		/* Expect explicit free_netdev() on failure */
10305  		dev->needs_free_netdev = false;
10306  		unregister_netdevice_queue(dev, NULL);
10307  		goto out;
10308  	}
10309  	/*
10310  	 *	Prevent userspace races by waiting until the network
10311  	 *	device is fully setup before sending notifications.
10312  	 */
10313  	if (!dev->rtnl_link_ops ||
10314  	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
10315  		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL);
10316  
10317  out:
10318  	return ret;
10319  
10320  err_uninit_notify:
10321  	call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev);
10322  err_ifindex_release:
10323  	dev_index_release(net, dev->ifindex);
10324  err_free_pcpu:
10325  	netdev_do_free_pcpu_stats(dev);
10326  err_uninit:
10327  	if (dev->netdev_ops->ndo_uninit)
10328  		dev->netdev_ops->ndo_uninit(dev);
10329  	if (dev->priv_destructor)
10330  		dev->priv_destructor(dev);
10331  err_free_name:
10332  	netdev_name_node_free(dev->name_node);
10333  	goto out;
10334  }
10335  EXPORT_SYMBOL(register_netdevice);
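
/*
 * Example of a caller that already holds RTNL and therefore uses
 * register_netdevice() directly: a sketch of a hypothetical
 * rtnl_link_ops->newlink() implementation for a driver ("foo").
 */
static int foo_newlink(struct net *src_net, struct net_device *dev,
		       struct nlattr *tb[], struct nlattr *data[],
		       struct netlink_ext_ack *extack)
{
	int err;

	/* RTNL is held by the rtnetlink core at this point */
	err = register_netdevice(dev);
	if (err)
		return err;	/* the error paths above already unwound */

	netif_carrier_off(dev);
	return 0;
}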
10336  
10337  /**
10338   *	init_dummy_netdev	- init a dummy network device for NAPI
10339   *	@dev: device to init
10340   *
10341   *	This takes a network device structure and initializes the minimum
10342   *	number of fields so it can be used to schedule NAPI polls without
10343   *	registering a full-blown interface. This is to be used by drivers
10344   *	that need to tie several hardware interfaces to a single NAPI
10345   *	poll scheduler due to HW limitations.
10346   */
10347  int init_dummy_netdev(struct net_device *dev)
10348  {
10349  	/* Clear everything. Note we don't initialize spinlocks
10350  	 * as they aren't supposed to be taken by any of the
10351  	 * NAPI code and this dummy netdev is supposed to be
10352  	 * only ever used for NAPI polls
10353  	 */
10354  	memset(dev, 0, sizeof(struct net_device));
10355  
10356  	/* make sure we BUG if trying to hit standard
10357  	 * register/unregister code path
10358  	 */
10359  	dev->reg_state = NETREG_DUMMY;
10360  
10361  	/* NAPI wants this */
10362  	INIT_LIST_HEAD(&dev->napi_list);
10363  
10364  	/* a dummy interface is started by default */
10365  	set_bit(__LINK_STATE_PRESENT, &dev->state);
10366  	set_bit(__LINK_STATE_START, &dev->state);
10367  
10368  	/* napi_busy_loop stats accounting wants this */
10369  	dev_net_set(dev, &init_net);
10370  
10371  	/* Note: We don't allocate pcpu_refcnt for dummy devices,
10372  	 * because users of this 'device' don't need to change
10373  	 * its refcount.
10374  	 */
10375  
10376  	return 0;
10377  }
10378  EXPORT_SYMBOL_GPL(init_dummy_netdev);
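
/*
 * Example of the intended use of init_dummy_netdev(): a sketch of a
 * hypothetical driver ("foo") that ties several hardware channels to one
 * NAPI context without registering a real interface.
 */
struct foo_napi_ctx {
	struct net_device dummy;	/* never registered */
	struct napi_struct napi;
};

static void foo_napi_ctx_init(struct foo_napi_ctx *ctx,
			      int (*poll)(struct napi_struct *, int))
{
	init_dummy_netdev(&ctx->dummy);
	netif_napi_add(&ctx->dummy, &ctx->napi, poll);
	napi_enable(&ctx->napi);
}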
10379  
10380  
10381  /**
10382   *	register_netdev	- register a network device
10383   *	@dev: device to register
10384   *
10385   *	Take a completed network device structure and add it to the kernel
10386   *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
10387   *	chain. 0 is returned on success. A negative errno code is returned
10388   *	on a failure to set up the device, or if the name is a duplicate.
10389   *
10390   *	This is a wrapper around register_netdevice that takes the rtnl semaphore
10391   *	and expands the device name if you passed a format string to
10392   *	alloc_netdev.
10393   */
10394  int register_netdev(struct net_device *dev)
10395  {
10396  	int err;
10397  
10398  	if (rtnl_lock_killable())
10399  		return -EINTR;
10400  	err = register_netdevice(dev);
10401  	rtnl_unlock();
10402  	return err;
10403  }
10404  EXPORT_SYMBOL(register_netdev);
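
/*
 * Example of the probe-time pattern this wrapper exists for: a sketch of a
 * hypothetical driver ("foo") that allocates an Ethernet-style netdev with a
 * "%d" format name, lets register_netdev() take RTNL and expand the name,
 * and unwinds with free_netdev() on failure.
 */
static int foo_create_netdev(struct device *parent)
{
	struct net_device *dev;
	int err;

	dev = alloc_netdev(0 /* no private data in this sketch */, "foo%d",
			   NET_NAME_ENUM, ether_setup);
	if (!dev)
		return -ENOMEM;

	SET_NETDEV_DEV(dev, parent);

	err = register_netdev(dev);
	if (err) {
		free_netdev(dev);
		return err;
	}
	return 0;
}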
10405  
10406  int netdev_refcnt_read(const struct net_device *dev)
10407  {
10408  #ifdef CONFIG_PCPU_DEV_REFCNT
10409  	int i, refcnt = 0;
10410  
10411  	for_each_possible_cpu(i)
10412  		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
10413  	return refcnt;
10414  #else
10415  	return refcount_read(&dev->dev_refcnt);
10416  #endif
10417  }
10418  EXPORT_SYMBOL(netdev_refcnt_read);
10419  
10420  int netdev_unregister_timeout_secs __read_mostly = 10;
10421  
10422  #define WAIT_REFS_MIN_MSECS 1
10423  #define WAIT_REFS_MAX_MSECS 250
10424  /**
10425   * netdev_wait_allrefs_any - wait until all references are gone.
10426   * @list: list of net_devices to wait on
10427   *
10428   * This is called when unregistering network devices.
10429   *
10430   * Any protocol or device that holds a reference should register
10431   * for netdevice notification, and cleanup and put back the
10432   * reference if they receive an UNREGISTER event.
10433   * We can get stuck here if buggy protocols don't correctly
10434   * call dev_put.
10435   */
10436  static struct net_device *netdev_wait_allrefs_any(struct list_head *list)
10437  {
10438  	unsigned long rebroadcast_time, warning_time;
10439  	struct net_device *dev;
10440  	int wait = 0;
10441  
10442  	rebroadcast_time = warning_time = jiffies;
10443  
10444  	list_for_each_entry(dev, list, todo_list)
10445  		if (netdev_refcnt_read(dev) == 1)
10446  			return dev;
10447  
10448  	while (true) {
10449  		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
10450  			rtnl_lock();
10451  
10452  			/* Rebroadcast unregister notification */
10453  			list_for_each_entry(dev, list, todo_list)
10454  				call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
10455  
10456  			__rtnl_unlock();
10457  			rcu_barrier();
10458  			rtnl_lock();
10459  
10460  			list_for_each_entry(dev, list, todo_list)
10461  				if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
10462  					     &dev->state)) {
10463  					/* We must not have linkwatch events
10464  					 * pending on unregister. If this
10465  					 * happens, we simply run the queue
10466  					 * unscheduled, resulting in a noop
10467  					 * for this device.
10468  					 */
10469  					linkwatch_run_queue();
10470  					break;
10471  				}
10472  
10473  			__rtnl_unlock();
10474  
10475  			rebroadcast_time = jiffies;
10476  		}
10477  
10478  		rcu_barrier();
10479  
10480  		if (!wait) {
10481  			wait = WAIT_REFS_MIN_MSECS;
10482  		} else {
10483  			msleep(wait);
10484  			wait = min(wait << 1, WAIT_REFS_MAX_MSECS);
10485  		}
10486  
10487  		list_for_each_entry(dev, list, todo_list)
10488  			if (netdev_refcnt_read(dev) == 1)
10489  				return dev;
10490  
10491  		if (time_after(jiffies, warning_time +
10492  			       READ_ONCE(netdev_unregister_timeout_secs) * HZ)) {
10493  			list_for_each_entry(dev, list, todo_list) {
10494  				pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
10495  					 dev->name, netdev_refcnt_read(dev));
10496  				ref_tracker_dir_print(&dev->refcnt_tracker, 10);
10497  			}
10498  
10499  			warning_time = jiffies;
10500  		}
10501  	}
10502  }
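
/*
 * Example of the contract described above: a sketch of a hypothetical
 * subsystem ("foo") that keeps a tracked reference on a device and releases
 * it on NETDEV_UNREGISTER, so the wait loop above can finish.
 */
static struct net_device *foo_dev;	/* taken with netdev_hold() elsewhere */
static netdevice_tracker foo_dev_tracker;

static int foo_netdev_event(struct notifier_block *nb, unsigned long event,
			    void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);

	if (event == NETDEV_UNREGISTER && dev == foo_dev) {
		netdev_put(dev, &foo_dev_tracker);
		foo_dev = NULL;
	}
	return NOTIFY_DONE;
}

static struct notifier_block foo_netdev_notifier = {
	.notifier_call = foo_netdev_event,	/* register_netdevice_notifier() */
};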
10503  
10504  /* The sequence is:
10505   *
10506   *	rtnl_lock();
10507   *	...
10508   *	register_netdevice(x1);
10509   *	register_netdevice(x2);
10510   *	...
10511   *	unregister_netdevice(y1);
10512   *	unregister_netdevice(y2);
10513   *      ...
10514   *	rtnl_unlock();
10515   *	free_netdev(y1);
10516   *	free_netdev(y2);
10517   *
10518   * We are invoked by rtnl_unlock().
10519   * This allows us to deal with problems:
10520   * 1) We can delete sysfs objects which invoke hotplug
10521   *    without deadlocking with linkwatch via keventd.
10522   * 2) Since we run with the RTNL semaphore not held, we can sleep
10523   *    safely in order to wait for the netdev refcnt to drop to zero.
10524   *
10525   * We must not return until all unregister events added during
10526   * the interval the lock was held have been completed.
10527   */
10528  void netdev_run_todo(void)
10529  {
10530  	struct net_device *dev, *tmp;
10531  	struct list_head list;
10532  #ifdef CONFIG_LOCKDEP
10533  	struct list_head unlink_list;
10534  
10535  	list_replace_init(&net_unlink_list, &unlink_list);
10536  
10537  	while (!list_empty(&unlink_list)) {
10538  		struct net_device *dev = list_first_entry(&unlink_list,
10539  							  struct net_device,
10540  							  unlink_list);
10541  		list_del_init(&dev->unlink_list);
10542  		dev->nested_level = dev->lower_level - 1;
10543  	}
10544  #endif
10545  
10546  	/* Snapshot list, allow later requests */
10547  	list_replace_init(&net_todo_list, &list);
10548  
10549  	__rtnl_unlock();
10550  
10551  	/* Wait for rcu callbacks to finish before next phase */
10552  	if (!list_empty(&list))
10553  		rcu_barrier();
10554  
10555  	list_for_each_entry_safe(dev, tmp, &list, todo_list) {
10556  		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
10557  			netdev_WARN(dev, "run_todo but not unregistering\n");
10558  			list_del(&dev->todo_list);
10559  			continue;
10560  		}
10561  
10562  		write_lock(&dev_base_lock);
10563  		dev->reg_state = NETREG_UNREGISTERED;
10564  		write_unlock(&dev_base_lock);
10565  		linkwatch_forget_dev(dev);
10566  	}
10567  
10568  	while (!list_empty(&list)) {
10569  		dev = netdev_wait_allrefs_any(&list);
10570  		list_del(&dev->todo_list);
10571  
10572  		/* paranoia */
10573  		BUG_ON(netdev_refcnt_read(dev) != 1);
10574  		BUG_ON(!list_empty(&dev->ptype_all));
10575  		BUG_ON(!list_empty(&dev->ptype_specific));
10576  		WARN_ON(rcu_access_pointer(dev->ip_ptr));
10577  		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
10578  
10579  		netdev_do_free_pcpu_stats(dev);
10580  		if (dev->priv_destructor)
10581  			dev->priv_destructor(dev);
10582  		if (dev->needs_free_netdev)
10583  			free_netdev(dev);
10584  
10585  		if (atomic_dec_and_test(&dev_net(dev)->dev_unreg_count))
10586  			wake_up(&netdev_unregistering_wq);
10587  
10588  		/* Free network device */
10589  		kobject_put(&dev->dev.kobj);
10590  	}
10591  }
10592  
10593  /* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
10594   * all the same fields in the same order as net_device_stats, with only
10595   * the type differing, but rtnl_link_stats64 may have additional fields
10596   * at the end for newer counters.
10597   */
10598  void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
10599  			     const struct net_device_stats *netdev_stats)
10600  {
10601  	size_t i, n = sizeof(*netdev_stats) / sizeof(atomic_long_t);
10602  	const atomic_long_t *src = (atomic_long_t *)netdev_stats;
10603  	u64 *dst = (u64 *)stats64;
10604  
10605  	BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
10606  	for (i = 0; i < n; i++)
10607  		dst[i] = (unsigned long)atomic_long_read(&src[i]);
10608  	/* zero out counters that only exist in rtnl_link_stats64 */
10609  	memset((char *)stats64 + n * sizeof(u64), 0,
10610  	       sizeof(*stats64) - n * sizeof(u64));
10611  }
10612  EXPORT_SYMBOL(netdev_stats_to_stats64);
10613  
10614  struct net_device_core_stats __percpu *netdev_core_stats_alloc(struct net_device *dev)
10615  {
10616  	struct net_device_core_stats __percpu *p;
10617  
10618  	p = alloc_percpu_gfp(struct net_device_core_stats,
10619  			     GFP_ATOMIC | __GFP_NOWARN);
10620  
10621  	if (p && cmpxchg(&dev->core_stats, NULL, p))
10622  		free_percpu(p);
10623  
10624  	/* This READ_ONCE() pairs with the cmpxchg() above */
10625  	return READ_ONCE(dev->core_stats);
10626  }
10627  EXPORT_SYMBOL(netdev_core_stats_alloc);
10628  
10629  /**
10630   *	dev_get_stats	- get network device statistics
10631   *	@dev: device to get statistics from
10632   *	@storage: place to store stats
10633   *
10634   *	Get network statistics from device. Return @storage.
10635   *	The device driver may provide its own method by setting
10636   *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
10637   *	otherwise the internal statistics structure is used.
10638   */
10639  struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
10640  					struct rtnl_link_stats64 *storage)
10641  {
10642  	const struct net_device_ops *ops = dev->netdev_ops;
10643  	const struct net_device_core_stats __percpu *p;
10644  
10645  	if (ops->ndo_get_stats64) {
10646  		memset(storage, 0, sizeof(*storage));
10647  		ops->ndo_get_stats64(dev, storage);
10648  	} else if (ops->ndo_get_stats) {
10649  		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
10650  	} else {
10651  		netdev_stats_to_stats64(storage, &dev->stats);
10652  	}
10653  
10654  	/* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */
10655  	p = READ_ONCE(dev->core_stats);
10656  	if (p) {
10657  		const struct net_device_core_stats *core_stats;
10658  		int i;
10659  
10660  		for_each_possible_cpu(i) {
10661  			core_stats = per_cpu_ptr(p, i);
10662  			storage->rx_dropped += READ_ONCE(core_stats->rx_dropped);
10663  			storage->tx_dropped += READ_ONCE(core_stats->tx_dropped);
10664  			storage->rx_nohandler += READ_ONCE(core_stats->rx_nohandler);
10665  			storage->rx_otherhost_dropped += READ_ONCE(core_stats->rx_otherhost_dropped);
10666  		}
10667  	}
10668  	return storage;
10669  }
10670  EXPORT_SYMBOL(dev_get_stats);
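
/*
 * Example reader of the helper above, as used from control-path code: a
 * minimal sketch; the caller is assumed to hold RTNL or RCU so @dev cannot
 * disappear while the snapshot is taken.
 */
static u64 foo_rx_packets(struct net_device *dev)
{
	struct rtnl_link_stats64 stats;

	dev_get_stats(dev, &stats);
	return stats.rx_packets;
}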
10671  
10672  /**
10673   *	dev_fetch_sw_netstats - get per-cpu network device statistics
10674   *	@s: place to store stats
10675   *	@netstats: per-cpu network stats to read from
10676   *
10677   *	Read per-cpu network statistics and populate the related fields in @s.
10678   */
10679  void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s,
10680  			   const struct pcpu_sw_netstats __percpu *netstats)
10681  {
10682  	int cpu;
10683  
10684  	for_each_possible_cpu(cpu) {
10685  		u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
10686  		const struct pcpu_sw_netstats *stats;
10687  		unsigned int start;
10688  
10689  		stats = per_cpu_ptr(netstats, cpu);
10690  		do {
10691  			start = u64_stats_fetch_begin(&stats->syncp);
10692  			rx_packets = u64_stats_read(&stats->rx_packets);
10693  			rx_bytes   = u64_stats_read(&stats->rx_bytes);
10694  			tx_packets = u64_stats_read(&stats->tx_packets);
10695  			tx_bytes   = u64_stats_read(&stats->tx_bytes);
10696  		} while (u64_stats_fetch_retry(&stats->syncp, start));
10697  
10698  		s->rx_packets += rx_packets;
10699  		s->rx_bytes   += rx_bytes;
10700  		s->tx_packets += tx_packets;
10701  		s->tx_bytes   += tx_bytes;
10702  	}
10703  }
10704  EXPORT_SYMBOL_GPL(dev_fetch_sw_netstats);
10705  
10706  /**
10707   *	dev_get_tstats64 - ndo_get_stats64 implementation
10708   *	@dev: device to get statistics from
10709   *	@s: place to store stats
10710   *
10711   *	Populate @s from dev->stats and dev->tstats. Can be used as
10712   *	ndo_get_stats64() callback.
10713   */
10714  void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s)
10715  {
10716  	netdev_stats_to_stats64(s, &dev->stats);
10717  	dev_fetch_sw_netstats(s, dev->tstats);
10718  }
10719  EXPORT_SYMBOL_GPL(dev_get_tstats64);
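
/*
 * Example wiring for the helper above: a sketch of a hypothetical driver
 * ("foo") that relies on core-allocated dev->tstats (pcpu_stat_type set to
 * NETDEV_PCPU_STAT_TSTATS) and simply plugs dev_get_tstats64() into its ops.
 */
static const struct net_device_ops foo_netdev_ops = {
	.ndo_get_stats64	= dev_get_tstats64,
	/* ... remaining callbacks elided ... */
};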
10720  
10721  struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
10722  {
10723  	struct netdev_queue *queue = dev_ingress_queue(dev);
10724  
10725  #ifdef CONFIG_NET_CLS_ACT
10726  	if (queue)
10727  		return queue;
10728  	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
10729  	if (!queue)
10730  		return NULL;
10731  	netdev_init_one_queue(dev, queue, NULL);
10732  	RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
10733  	RCU_INIT_POINTER(queue->qdisc_sleeping, &noop_qdisc);
10734  	rcu_assign_pointer(dev->ingress_queue, queue);
10735  #endif
10736  	return queue;
10737  }
10738  
10739  static const struct ethtool_ops default_ethtool_ops;
10740  
10741  void netdev_set_default_ethtool_ops(struct net_device *dev,
10742  				    const struct ethtool_ops *ops)
10743  {
10744  	if (dev->ethtool_ops == &default_ethtool_ops)
10745  		dev->ethtool_ops = ops;
10746  }
10747  EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
10748  
10749  /**
10750   * netdev_sw_irq_coalesce_default_on() - enable SW IRQ coalescing by default
10751   * @dev: netdev to enable the IRQ coalescing on
10752   *
10753   * Sets a conservative default for SW IRQ coalescing. Users can use
10754   * sysfs attributes to override the default values.
10755   */
10756  void netdev_sw_irq_coalesce_default_on(struct net_device *dev)
10757  {
10758  	WARN_ON(dev->reg_state == NETREG_REGISTERED);
10759  
10760  	if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
10761  		dev->gro_flush_timeout = 20000;
10762  		dev->napi_defer_hard_irqs = 1;
10763  	}
10764  }
10765  EXPORT_SYMBOL_GPL(netdev_sw_irq_coalesce_default_on);
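
/*
 * Example call site: a sketch of a hypothetical driver ("foo") enabling the
 * conservative defaults above; per the WARN_ON(), this must happen before
 * the device is registered.
 */
static int foo_register(struct net_device *dev)
{
	netdev_sw_irq_coalesce_default_on(dev);
	return register_netdev(dev);	/* users may still override via sysfs */
}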
10766  
10767  void netdev_freemem(struct net_device *dev)
10768  {
10769  	char *addr = (char *)dev - dev->padded;
10770  
10771  	kvfree(addr);
10772  }
10773  
10774  /**
10775   * alloc_netdev_mqs - allocate network device
10776   * @sizeof_priv: size of private data to allocate space for
10777   * @name: device name format string
10778   * @name_assign_type: origin of device name
10779   * @setup: callback to initialize device
10780   * @txqs: the number of TX subqueues to allocate
10781   * @rxqs: the number of RX subqueues to allocate
10782   *
10783   * Allocates a struct net_device with private data area for driver use
10784   * and performs basic initialization.  Also allocates subqueue structs
10785   * for each queue on the device.
10786   */
10787  struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
10788  		unsigned char name_assign_type,
10789  		void (*setup)(struct net_device *),
10790  		unsigned int txqs, unsigned int rxqs)
10791  {
10792  	struct net_device *dev;
10793  	unsigned int alloc_size;
10794  	struct net_device *p;
10795  
10796  	BUG_ON(strlen(name) >= sizeof(dev->name));
10797  
10798  	if (txqs < 1) {
10799  		pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
10800  		return NULL;
10801  	}
10802  
10803  	if (rxqs < 1) {
10804  		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
10805  		return NULL;
10806  	}
10807  
10808  	alloc_size = sizeof(struct net_device);
10809  	if (sizeof_priv) {
10810  		/* ensure 32-byte alignment of private area */
10811  		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
10812  		alloc_size += sizeof_priv;
10813  	}
10814  	/* ensure 32-byte alignment of whole construct */
10815  	alloc_size += NETDEV_ALIGN - 1;
10816  
10817  	p = kvzalloc(alloc_size, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
10818  	if (!p)
10819  		return NULL;
10820  
10821  	dev = PTR_ALIGN(p, NETDEV_ALIGN);
10822  	dev->padded = (char *)dev - (char *)p;
10823  
10824  	ref_tracker_dir_init(&dev->refcnt_tracker, 128, name);
10825  #ifdef CONFIG_PCPU_DEV_REFCNT
10826  	dev->pcpu_refcnt = alloc_percpu(int);
10827  	if (!dev->pcpu_refcnt)
10828  		goto free_dev;
10829  	__dev_hold(dev);
10830  #else
10831  	refcount_set(&dev->dev_refcnt, 1);
10832  #endif
10833  
10834  	if (dev_addr_init(dev))
10835  		goto free_pcpu;
10836  
10837  	dev_mc_init(dev);
10838  	dev_uc_init(dev);
10839  
10840  	dev_net_set(dev, &init_net);
10841  
10842  	dev->gso_max_size = GSO_LEGACY_MAX_SIZE;
10843  	dev->xdp_zc_max_segs = 1;
10844  	dev->gso_max_segs = GSO_MAX_SEGS;
10845  	dev->gro_max_size = GRO_LEGACY_MAX_SIZE;
10846  	dev->gso_ipv4_max_size = GSO_LEGACY_MAX_SIZE;
10847  	dev->gro_ipv4_max_size = GRO_LEGACY_MAX_SIZE;
10848  	dev->tso_max_size = TSO_LEGACY_MAX_SIZE;
10849  	dev->tso_max_segs = TSO_MAX_SEGS;
10850  	dev->upper_level = 1;
10851  	dev->lower_level = 1;
10852  #ifdef CONFIG_LOCKDEP
10853  	dev->nested_level = 0;
10854  	INIT_LIST_HEAD(&dev->unlink_list);
10855  #endif
10856  
10857  	INIT_LIST_HEAD(&dev->napi_list);
10858  	INIT_LIST_HEAD(&dev->unreg_list);
10859  	INIT_LIST_HEAD(&dev->close_list);
10860  	INIT_LIST_HEAD(&dev->link_watch_list);
10861  	INIT_LIST_HEAD(&dev->adj_list.upper);
10862  	INIT_LIST_HEAD(&dev->adj_list.lower);
10863  	INIT_LIST_HEAD(&dev->ptype_all);
10864  	INIT_LIST_HEAD(&dev->ptype_specific);
10865  	INIT_LIST_HEAD(&dev->net_notifier_list);
10866  #ifdef CONFIG_NET_SCHED
10867  	hash_init(dev->qdisc_hash);
10868  #endif
10869  	dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
10870  	setup(dev);
10871  
10872  	if (!dev->tx_queue_len) {
10873  		dev->priv_flags |= IFF_NO_QUEUE;
10874  		dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
10875  	}
10876  
10877  	dev->num_tx_queues = txqs;
10878  	dev->real_num_tx_queues = txqs;
10879  	if (netif_alloc_netdev_queues(dev))
10880  		goto free_all;
10881  
10882  	dev->num_rx_queues = rxqs;
10883  	dev->real_num_rx_queues = rxqs;
10884  	if (netif_alloc_rx_queues(dev))
10885  		goto free_all;
10886  
10887  	strcpy(dev->name, name);
10888  	dev->name_assign_type = name_assign_type;
10889  	dev->group = INIT_NETDEV_GROUP;
10890  	if (!dev->ethtool_ops)
10891  		dev->ethtool_ops = &default_ethtool_ops;
10892  
10893  	nf_hook_netdev_init(dev);
10894  
10895  	return dev;
10896  
10897  free_all:
10898  	free_netdev(dev);
10899  	return NULL;
10900  
10901  free_pcpu:
10902  #ifdef CONFIG_PCPU_DEV_REFCNT
10903  	free_percpu(dev->pcpu_refcnt);
10904  free_dev:
10905  #endif
10906  	netdev_freemem(dev);
10907  	return NULL;
10908  }
10909  EXPORT_SYMBOL(alloc_netdev_mqs);
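
/*
 * Example allocation: a sketch of a hypothetical multi-queue Ethernet-style
 * driver ("foo") creating a device with 8 TX and 8 RX queues; the private
 * struct and the queue counts are illustrative only.
 */
struct foo_priv {
	int placeholder;	/* stands in for real private state */
};

static struct net_device *foo_alloc(void)
{
	return alloc_netdev_mqs(sizeof(struct foo_priv), "foo%d",
				NET_NAME_ENUM, ether_setup, 8, 8);
}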
10910  
10911  /**
10912   * free_netdev - free network device
10913   * @dev: device
10914   *
10915   * This function does the last stage of destroying an allocated device
10916   * interface. The reference to the device object is released. If this
10917   *	is the last reference then it will be freed. Must be called in process
10918   * context.
10919   */
10920  void free_netdev(struct net_device *dev)
10921  {
10922  	struct napi_struct *p, *n;
10923  
10924  	might_sleep();
10925  
10926  	/* When called immediately after register_netdevice() failed the unwind
10927  	/* When called immediately after register_netdevice() fails, the unwind
10928  	 * deferring the free.
10929  	 */
10930  	if (dev->reg_state == NETREG_UNREGISTERING) {
10931  		ASSERT_RTNL();
10932  		dev->needs_free_netdev = true;
10933  		return;
10934  	}
10935  
10936  	netif_free_tx_queues(dev);
10937  	netif_free_rx_queues(dev);
10938  
10939  	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
10940  
10941  	/* Flush device addresses */
10942  	dev_addr_flush(dev);
10943  
10944  	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
10945  		netif_napi_del(p);
10946  
10947  	ref_tracker_dir_exit(&dev->refcnt_tracker);
10948  #ifdef CONFIG_PCPU_DEV_REFCNT
10949  	free_percpu(dev->pcpu_refcnt);
10950  	dev->pcpu_refcnt = NULL;
10951  #endif
10952  	free_percpu(dev->core_stats);
10953  	dev->core_stats = NULL;
10954  	free_percpu(dev->xdp_bulkq);
10955  	dev->xdp_bulkq = NULL;
10956  
10957  	/*  Compatibility with error handling in drivers */
10958  	if (dev->reg_state == NETREG_UNINITIALIZED) {
10959  		netdev_freemem(dev);
10960  		return;
10961  	}
10962  
10963  	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
10964  	dev->reg_state = NETREG_RELEASED;
10965  
10966  	/* will free via device release */
10967  	put_device(&dev->dev);
10968  }
10969  EXPORT_SYMBOL(free_netdev);
10970  
10971  /**
10972   *	synchronize_net -  Synchronize with packet receive processing
10973   *
10974   *	Wait for packets currently being received to be done.
10975   *	Does not block later packets from starting.
10976   */
10977  void synchronize_net(void)
10978  {
10979  	might_sleep();
10980  	if (rtnl_is_locked())
10981  		synchronize_rcu_expedited();
10982  	else
10983  		synchronize_rcu();
10984  }
10985  EXPORT_SYMBOL(synchronize_net);
10986  
10987  /**
10988   *	unregister_netdevice_queue - remove device from the kernel
10989   *	@dev: device
10990   *	@head: list
10991   *
10992   *	This function shuts down a device interface and removes it
10993   *	from the kernel tables.
10994   *	If @head is not NULL, the device is queued to be unregistered later.
10995   *
10996   *	Callers must hold the rtnl semaphore.  You may want
10997   *	unregister_netdev() instead of this.
10998   */
10999  
11000  void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
11001  {
11002  	ASSERT_RTNL();
11003  
11004  	if (head) {
11005  		list_move_tail(&dev->unreg_list, head);
11006  	} else {
11007  		LIST_HEAD(single);
11008  
11009  		list_add(&dev->unreg_list, &single);
11010  		unregister_netdevice_many(&single);
11011  	}
11012  }
11013  EXPORT_SYMBOL(unregister_netdevice_queue);
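
/*
 * Example of the batching pattern this function enables: a sketch of a
 * hypothetical driver ("foo") tearing down a set of its devices with a
 * single rollback/notification pass instead of one per device.
 */
static void foo_destroy_all(struct net_device *devs[], int n)
{
	LIST_HEAD(kill_list);
	int i;

	ASSERT_RTNL();

	for (i = 0; i < n; i++)
		unregister_netdevice_queue(devs[i], &kill_list);

	/* processes the whole list and does the final list_del() */
	unregister_netdevice_many(&kill_list);
}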
11014  
11015  void unregister_netdevice_many_notify(struct list_head *head,
11016  				      u32 portid, const struct nlmsghdr *nlh)
11017  {
11018  	struct net_device *dev, *tmp;
11019  	LIST_HEAD(close_head);
11020  
11021  	BUG_ON(dev_boot_phase);
11022  	ASSERT_RTNL();
11023  
11024  	if (list_empty(head))
11025  		return;
11026  
11027  	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
11028  		/* Some devices call this without having been registered,
11029  		 * as part of initialization unwind. Remove those
11030  		 * devices and proceed with the remaining ones.
11031  		 */
11032  		if (dev->reg_state == NETREG_UNINITIALIZED) {
11033  			pr_debug("unregister_netdevice: device %s/%p never was registered\n",
11034  				 dev->name, dev);
11035  
11036  			WARN_ON(1);
11037  			list_del(&dev->unreg_list);
11038  			continue;
11039  		}
11040  		dev->dismantle = true;
11041  		BUG_ON(dev->reg_state != NETREG_REGISTERED);
11042  	}
11043  
11044  	/* If device is running, close it first. */
11045  	list_for_each_entry(dev, head, unreg_list)
11046  		list_add_tail(&dev->close_list, &close_head);
11047  	dev_close_many(&close_head, true);
11048  
11049  	list_for_each_entry(dev, head, unreg_list) {
11050  		/* And unlink it from device chain. */
11051  		write_lock(&dev_base_lock);
11052  		unlist_netdevice(dev, false);
11053  		dev->reg_state = NETREG_UNREGISTERING;
11054  		write_unlock(&dev_base_lock);
11055  	}
11056  	flush_all_backlogs();
11057  
11058  	synchronize_net();
11059  
11060  	list_for_each_entry(dev, head, unreg_list) {
11061  		struct sk_buff *skb = NULL;
11062  
11063  		/* Shutdown queueing discipline. */
11064  		dev_shutdown(dev);
11065  		dev_tcx_uninstall(dev);
11066  		dev_xdp_uninstall(dev);
11067  		bpf_dev_bound_netdev_unregister(dev);
11068  
11069  		netdev_offload_xstats_disable_all(dev);
11070  
11071  		/* Notify protocols, that we are about to destroy
11072  		 * this device. They should clean all the things.
11073  		 */
11074  		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
11075  
11076  		if (!dev->rtnl_link_ops ||
11077  		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
11078  			skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
11079  						     GFP_KERNEL, NULL, 0,
11080  						     portid, nlh);
11081  
11082  		/*
11083  		 *	Flush the unicast and multicast chains
11084  		 */
11085  		dev_uc_flush(dev);
11086  		dev_mc_flush(dev);
11087  
11088  		netdev_name_node_alt_flush(dev);
11089  		netdev_name_node_free(dev->name_node);
11090  
11091  		call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev);
11092  
11093  		if (dev->netdev_ops->ndo_uninit)
11094  			dev->netdev_ops->ndo_uninit(dev);
11095  
11096  		if (skb)
11097  			rtmsg_ifinfo_send(skb, dev, GFP_KERNEL, portid, nlh);
11098  
11099  		/* Notifier chain MUST detach us all upper devices. */
11100  		WARN_ON(netdev_has_any_upper_dev(dev));
11101  		WARN_ON(netdev_has_any_lower_dev(dev));
11102  
11103  		/* Remove entries from kobject tree */
11104  		netdev_unregister_kobject(dev);
11105  #ifdef CONFIG_XPS
11106  		/* Remove XPS queueing entries */
11107  		netif_reset_xps_queues_gt(dev, 0);
11108  #endif
11109  	}
11110  
11111  	synchronize_net();
11112  
11113  	list_for_each_entry(dev, head, unreg_list) {
11114  		netdev_put(dev, &dev->dev_registered_tracker);
11115  		net_set_todo(dev);
11116  	}
11117  
11118  	list_del(head);
11119  }
11120  
11121  /**
11122   *	unregister_netdevice_many - unregister many devices
11123   *	@head: list of devices
11124   *
11125   *  Note: As most callers use a stack-allocated list_head,
11126   *  we force a list_del() to make sure the stack won't be corrupted later.
11127   */
11128  void unregister_netdevice_many(struct list_head *head)
11129  {
11130  	unregister_netdevice_many_notify(head, 0, NULL);
11131  }
11132  EXPORT_SYMBOL(unregister_netdevice_many);
11133  
11134  /**
11135   *	unregister_netdev - remove device from the kernel
11136   *	@dev: device
11137   *
11138   *	This function shuts down a device interface and removes it
11139   *	from the kernel tables.
11140   *
11141   *	This is just a wrapper for unregister_netdevice that takes
11142   *	the rtnl semaphore.  In general you want to use this and not
11143   *	unregister_netdevice.
11144   */
11145  void unregister_netdev(struct net_device *dev)
11146  {
11147  	rtnl_lock();
11148  	unregister_netdevice(dev);
11149  	rtnl_unlock();
11150  }
11151  EXPORT_SYMBOL(unregister_netdev);
11152  
11153  /**
11154   *	__dev_change_net_namespace - move device to a different network namespace
11155   *	@dev: device
11156   *	@net: network namespace
11157   *	@pat: If not NULL name pattern to try if the current device name
11158   *	      is already taken in the destination network namespace.
11159   *	@new_ifindex: If not zero, specifies device index in the target
11160   *	              namespace.
11161   *
11162   *	This function shuts down a device interface and moves it
11163   *	to a new network namespace. On success 0 is returned, on
11164   *	a failure a negative errno code is returned.
11165   *
11166   *	Callers must hold the rtnl semaphore.
11167   */
11168  
11169  int __dev_change_net_namespace(struct net_device *dev, struct net *net,
11170  			       const char *pat, int new_ifindex)
11171  {
11172  	struct netdev_name_node *name_node;
11173  	struct net *net_old = dev_net(dev);
11174  	char new_name[IFNAMSIZ] = {};
11175  	int err, new_nsid;
11176  
11177  	ASSERT_RTNL();
11178  
11179  	/* Don't allow namespace local devices to be moved. */
11180  	err = -EINVAL;
11181  	if (dev->features & NETIF_F_NETNS_LOCAL)
11182  		goto out;
11183  
11184  	/* Ensure the device has been registered */
11185  	if (dev->reg_state != NETREG_REGISTERED)
11186  		goto out;
11187  
11188  	/* Get out if there is nothing to do */
11189  	err = 0;
11190  	if (net_eq(net_old, net))
11191  		goto out;
11192  
11193  	/* Pick the destination device name, and ensure
11194  	 * we can use it in the destination network namespace.
11195  	 */
11196  	err = -EEXIST;
11197  	if (netdev_name_in_use(net, dev->name)) {
11198  		/* We get here if we can't use the current device name */
11199  		if (!pat)
11200  			goto out;
11201  		err = dev_prep_valid_name(net, dev, pat, new_name);
11202  		if (err < 0)
11203  			goto out;
11204  	}
11205  	/* Check that none of the altnames conflicts. */
11206  	err = -EEXIST;
11207  	netdev_for_each_altname(dev, name_node)
11208  		if (netdev_name_in_use(net, name_node->name))
11209  			goto out;
11210  
11211  	/* Check that new_ifindex isn't used yet. */
11212  	if (new_ifindex) {
11213  		err = dev_index_reserve(net, new_ifindex);
11214  		if (err < 0)
11215  			goto out;
11216  	} else {
11217  		/* If there is an ifindex conflict assign a new one */
11218  		err = dev_index_reserve(net, dev->ifindex);
11219  		if (err == -EBUSY)
11220  			err = dev_index_reserve(net, 0);
11221  		if (err < 0)
11222  			goto out;
11223  		new_ifindex = err;
11224  	}
11225  
11226  	/*
11227  	 * And now a mini version of register_netdevice and unregister_netdevice.
11228  	 */
11229  
11230  	/* If device is running close it first. */
11231  	dev_close(dev);
11232  
11233  	/* And unlink it from device chain */
11234  	unlist_netdevice(dev, true);
11235  
11236  	synchronize_net();
11237  
11238  	/* Shutdown queueing discipline. */
11239  	dev_shutdown(dev);
11240  
11241  	/* Notify protocols, that we are about to destroy
11242  	 * this device. They should clean all the things.
11243  	 *
11244  	 * Note that dev->reg_state stays at NETREG_REGISTERED.
11245  	 * This is wanted because this way 8021q and macvlan know
11246  	 * the device is just moving and can keep their slaves up.
11247  	 */
11248  	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
11249  	rcu_barrier();
11250  
11251  	new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL);
11252  
11253  	rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid,
11254  			    new_ifindex);
11255  
11256  	/*
11257  	 *	Flush the unicast and multicast chains
11258  	 */
11259  	dev_uc_flush(dev);
11260  	dev_mc_flush(dev);
11261  
11262  	/* Send a netdev-removed uevent to the old namespace */
11263  	kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
11264  	netdev_adjacent_del_links(dev);
11265  
11266  	/* Move per-net netdevice notifiers that are following the netdevice */
11267  	move_netdevice_notifiers_dev_net(dev, net);
11268  
11269  	/* Actually switch the network namespace */
11270  	dev_net_set(dev, net);
11271  	dev->ifindex = new_ifindex;
11272  
11273  	/* Send a netdev-add uevent to the new namespace */
11274  	kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
11275  	netdev_adjacent_add_links(dev);
11276  
11277  	if (new_name[0]) /* Rename the netdev to prepared name */
11278  		strscpy(dev->name, new_name, IFNAMSIZ);
11279  
11280  	/* Fixup kobjects */
11281  	err = device_rename(&dev->dev, dev->name);
11282  	WARN_ON(err);
11283  
11284  	/* Adapt owner in case owning user namespace of target network
11285  	 * namespace is different from the original one.
11286  	 */
11287  	err = netdev_change_owner(dev, net_old, net);
11288  	WARN_ON(err);
11289  
11290  	/* Add the device back in the hashes */
11291  	list_netdevice(dev);
11292  
11293  	/* Notify protocols, that a new device appeared. */
11294  	call_netdevice_notifiers(NETDEV_REGISTER, dev);
11295  
11296  	/*
11297  	 *	Prevent userspace races by waiting until the network
11298  	 *	device is fully setup before sending notifications.
11299  	 */
11300  	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL);
11301  
11302  	synchronize_net();
11303  	err = 0;
11304  out:
11305  	return err;
11306  }
11307  EXPORT_SYMBOL_GPL(__dev_change_net_namespace);
11308  
11309  static int dev_cpu_dead(unsigned int oldcpu)
11310  {
11311  	struct sk_buff **list_skb;
11312  	struct sk_buff *skb;
11313  	unsigned int cpu;
11314  	struct softnet_data *sd, *oldsd, *remsd = NULL;
11315  
11316  	local_irq_disable();
11317  	cpu = smp_processor_id();
11318  	sd = &per_cpu(softnet_data, cpu);
11319  	oldsd = &per_cpu(softnet_data, oldcpu);
11320  
11321  	/* Find end of our completion_queue. */
11322  	list_skb = &sd->completion_queue;
11323  	while (*list_skb)
11324  		list_skb = &(*list_skb)->next;
11325  	/* Append completion queue from offline CPU. */
11326  	*list_skb = oldsd->completion_queue;
11327  	oldsd->completion_queue = NULL;
11328  
11329  	/* Append output queue from offline CPU. */
11330  	if (oldsd->output_queue) {
11331  		*sd->output_queue_tailp = oldsd->output_queue;
11332  		sd->output_queue_tailp = oldsd->output_queue_tailp;
11333  		oldsd->output_queue = NULL;
11334  		oldsd->output_queue_tailp = &oldsd->output_queue;
11335  	}
11336  	/* Append NAPI poll list from offline CPU, with one exception:
11337  	 * process_backlog() must be called by cpu owning percpu backlog.
11338  	 * We properly handle process_queue & input_pkt_queue later.
11339  	 */
11340  	while (!list_empty(&oldsd->poll_list)) {
11341  		struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
11342  							    struct napi_struct,
11343  							    poll_list);
11344  
11345  		list_del_init(&napi->poll_list);
11346  		if (napi->poll == process_backlog)
11347  			napi->state = 0;
11348  		else
11349  			____napi_schedule(sd, napi);
11350  	}
11351  
11352  	raise_softirq_irqoff(NET_TX_SOFTIRQ);
11353  	local_irq_enable();
11354  
11355  #ifdef CONFIG_RPS
11356  	remsd = oldsd->rps_ipi_list;
11357  	oldsd->rps_ipi_list = NULL;
11358  #endif
11359  	/* send out pending IPIs on offline CPU */
11360  	net_rps_send_ipi(remsd);
11361  
11362  	/* Process offline CPU's input_pkt_queue */
11363  	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
11364  		netif_rx(skb);
11365  		input_queue_head_incr(oldsd);
11366  	}
11367  	while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
11368  		netif_rx(skb);
11369  		input_queue_head_incr(oldsd);
11370  	}
11371  
11372  	return 0;
11373  }
11374  
11375  /**
11376   *	netdev_increment_features - increment feature set by one
11377   *	@all: current feature set
11378   *	@one: new feature set
11379   *	@mask: mask feature set
11380   *
11381   *	Computes a new feature set after adding a device with feature set
11382   *	@one to the master device with current feature set @all.  Will not
11383   *	enable anything that is off in @mask. Returns the new feature set.
11384   */
11385  netdev_features_t netdev_increment_features(netdev_features_t all,
11386  	netdev_features_t one, netdev_features_t mask)
11387  {
11388  	if (mask & NETIF_F_HW_CSUM)
11389  		mask |= NETIF_F_CSUM_MASK;
11390  	mask |= NETIF_F_VLAN_CHALLENGED;
11391  
11392  	all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
11393  	all &= one | ~NETIF_F_ALL_FOR_ALL;
11394  
11395  	/* If one device supports hw checksumming, set for all. */
11396  	if (all & NETIF_F_HW_CSUM)
11397  		all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
11398  
11399  	return all;
11400  }
11401  EXPORT_SYMBOL(netdev_increment_features);
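
/*
 * Example of the intended use: a sketch of how a stacking driver (bonding or
 * team style, here a hypothetical "foo" master) could fold the feature sets
 * of its lower devices into one; the starting value is simplified to the
 * NETIF_F_ALL_FOR_ALL subset of @mask so AND-type features can only be
 * cleared by lowers that lack them.
 */
static netdev_features_t foo_master_features(struct net_device *lowers[],
					     int n, netdev_features_t mask)
{
	netdev_features_t all = mask & NETIF_F_ALL_FOR_ALL;
	int i;

	for (i = 0; i < n; i++)
		all = netdev_increment_features(all, lowers[i]->features, mask);

	return all;
}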
11402  
11403  static struct hlist_head * __net_init netdev_create_hash(void)
11404  {
11405  	int i;
11406  	struct hlist_head *hash;
11407  
11408  	hash = kmalloc_array(NETDEV_HASHENTRIES, sizeof(*hash), GFP_KERNEL);
11409  	if (hash != NULL)
11410  		for (i = 0; i < NETDEV_HASHENTRIES; i++)
11411  			INIT_HLIST_HEAD(&hash[i]);
11412  
11413  	return hash;
11414  }
11415  
11416  /* Initialize per network namespace state */
11417  static int __net_init netdev_init(struct net *net)
11418  {
11419  	BUILD_BUG_ON(GRO_HASH_BUCKETS >
11420  		     8 * sizeof_field(struct napi_struct, gro_bitmask));
11421  
11422  	INIT_LIST_HEAD(&net->dev_base_head);
11423  
11424  	net->dev_name_head = netdev_create_hash();
11425  	if (net->dev_name_head == NULL)
11426  		goto err_name;
11427  
11428  	net->dev_index_head = netdev_create_hash();
11429  	if (net->dev_index_head == NULL)
11430  		goto err_idx;
11431  
11432  	xa_init_flags(&net->dev_by_index, XA_FLAGS_ALLOC1);
11433  
11434  	RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain);
11435  
11436  	return 0;
11437  
11438  err_idx:
11439  	kfree(net->dev_name_head);
11440  err_name:
11441  	return -ENOMEM;
11442  }
11443  
11444  /**
11445   *	netdev_drivername - network driver for the device
11446   *	@dev: network device
11447   *
11448   *	Determine network driver for device.
11449   */
11450  const char *netdev_drivername(const struct net_device *dev)
11451  {
11452  	const struct device_driver *driver;
11453  	const struct device *parent;
11454  	const char *empty = "";
11455  
11456  	parent = dev->dev.parent;
11457  	if (!parent)
11458  		return empty;
11459  
11460  	driver = parent->driver;
11461  	if (driver && driver->name)
11462  		return driver->name;
11463  	return empty;
11464  }
11465  
11466  static void __netdev_printk(const char *level, const struct net_device *dev,
11467  			    struct va_format *vaf)
11468  {
11469  	if (dev && dev->dev.parent) {
11470  		dev_printk_emit(level[1] - '0',
11471  				dev->dev.parent,
11472  				"%s %s %s%s: %pV",
11473  				dev_driver_string(dev->dev.parent),
11474  				dev_name(dev->dev.parent),
11475  				netdev_name(dev), netdev_reg_state(dev),
11476  				vaf);
11477  	} else if (dev) {
11478  		printk("%s%s%s: %pV",
11479  		       level, netdev_name(dev), netdev_reg_state(dev), vaf);
11480  	} else {
11481  		printk("%s(NULL net_device): %pV", level, vaf);
11482  	}
11483  }
11484  
11485  void netdev_printk(const char *level, const struct net_device *dev,
11486  		   const char *format, ...)
11487  {
11488  	struct va_format vaf;
11489  	va_list args;
11490  
11491  	va_start(args, format);
11492  
11493  	vaf.fmt = format;
11494  	vaf.va = &args;
11495  
11496  	__netdev_printk(level, dev, &vaf);
11497  
11498  	va_end(args);
11499  }
11500  EXPORT_SYMBOL(netdev_printk);
11501  
11502  #define define_netdev_printk_level(func, level)			\
11503  void func(const struct net_device *dev, const char *fmt, ...)	\
11504  {								\
11505  	struct va_format vaf;					\
11506  	va_list args;						\
11507  								\
11508  	va_start(args, fmt);					\
11509  								\
11510  	vaf.fmt = fmt;						\
11511  	vaf.va = &args;						\
11512  								\
11513  	__netdev_printk(level, dev, &vaf);			\
11514  								\
11515  	va_end(args);						\
11516  }								\
11517  EXPORT_SYMBOL(func);
11518  
11519  define_netdev_printk_level(netdev_emerg, KERN_EMERG);
11520  define_netdev_printk_level(netdev_alert, KERN_ALERT);
11521  define_netdev_printk_level(netdev_crit, KERN_CRIT);
11522  define_netdev_printk_level(netdev_err, KERN_ERR);
11523  define_netdev_printk_level(netdev_warn, KERN_WARNING);
11524  define_netdev_printk_level(netdev_notice, KERN_NOTICE);
11525  define_netdev_printk_level(netdev_info, KERN_INFO);
11526  
11527  static void __net_exit netdev_exit(struct net *net)
11528  {
11529  	kfree(net->dev_name_head);
11530  	kfree(net->dev_index_head);
11531  	xa_destroy(&net->dev_by_index);
11532  	if (net != &init_net)
11533  		WARN_ON_ONCE(!list_empty(&net->dev_base_head));
11534  }
11535  
11536  static struct pernet_operations __net_initdata netdev_net_ops = {
11537  	.init = netdev_init,
11538  	.exit = netdev_exit,
11539  };
11540  
11541  static void __net_exit default_device_exit_net(struct net *net)
11542  {
11543  	struct netdev_name_node *name_node, *tmp;
11544  	struct net_device *dev, *aux;
11545  	/*
11546  	 * Push all migratable network devices back to the
11547  	 * initial network namespace
11548  	 */
11549  	ASSERT_RTNL();
11550  	for_each_netdev_safe(net, dev, aux) {
11551  		int err;
11552  		char fb_name[IFNAMSIZ];
11553  
11554  		/* Ignore unmoveable devices (i.e. loopback) */
11555  		if (dev->features & NETIF_F_NETNS_LOCAL)
11556  			continue;
11557  
11558  		/* Leave virtual devices for the generic cleanup */
11559  		if (dev->rtnl_link_ops && !dev->rtnl_link_ops->netns_refund)
11560  			continue;
11561  
11562  		/* Push remaining network devices to init_net */
11563  		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
11564  		if (netdev_name_in_use(&init_net, fb_name))
11565  			snprintf(fb_name, IFNAMSIZ, "dev%%d");
11566  
11567  		netdev_for_each_altname_safe(dev, name_node, tmp)
11568  			if (netdev_name_in_use(&init_net, name_node->name)) {
11569  				netdev_name_node_del(name_node);
11570  				synchronize_rcu();
11571  				__netdev_name_node_alt_destroy(name_node);
11572  			}
11573  
11574  		err = dev_change_net_namespace(dev, &init_net, fb_name);
11575  		if (err) {
11576  			pr_emerg("%s: failed to move %s to init_net: %d\n",
11577  				 __func__, dev->name, err);
11578  			BUG();
11579  		}
11580  	}
11581  }
11582  
11583  static void __net_exit default_device_exit_batch(struct list_head *net_list)
11584  {
11585  	/* At exit, all network devices must be removed from a network
11586  	 * namespace.  Do this in the reverse order of registration.
11587  	 * Do this across as many network namespaces as possible to
11588  	 * improve batching efficiency.
11589  	 */
11590  	struct net_device *dev;
11591  	struct net *net;
11592  	LIST_HEAD(dev_kill_list);
11593  
11594  	rtnl_lock();
11595  	list_for_each_entry(net, net_list, exit_list) {
11596  		default_device_exit_net(net);
11597  		cond_resched();
11598  	}
11599  
11600  	list_for_each_entry(net, net_list, exit_list) {
11601  		for_each_netdev_reverse(net, dev) {
11602  			if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
11603  				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
11604  			else
11605  				unregister_netdevice_queue(dev, &dev_kill_list);
11606  		}
11607  	}
11608  	unregister_netdevice_many(&dev_kill_list);
11609  	rtnl_unlock();
11610  }
11611  
11612  static struct pernet_operations __net_initdata default_device_ops = {
11613  	.exit_batch = default_device_exit_batch,
11614  };
11615  
11616  /*
11617   *	Initialize the DEV module. At boot time this walks the device list and
11618   *	unhooks any devices that fail to initialise (normally hardware not
11619   *	present) and leaves us with a valid list of present and active devices.
11620   *
11621   */
11622  
11623  /*
11624   *       This is called single threaded during boot, so no need
11625   *       to take the rtnl semaphore.
11626   */
11627  static int __init net_dev_init(void)
11628  {
11629  	int i, rc = -ENOMEM;
11630  
11631  	BUG_ON(!dev_boot_phase);
11632  
11633  	if (dev_proc_init())
11634  		goto out;
11635  
11636  	if (netdev_kobject_init())
11637  		goto out;
11638  
11639  	INIT_LIST_HEAD(&ptype_all);
11640  	for (i = 0; i < PTYPE_HASH_SIZE; i++)
11641  		INIT_LIST_HEAD(&ptype_base[i]);
11642  
11643  	if (register_pernet_subsys(&netdev_net_ops))
11644  		goto out;
11645  
11646  	/*
11647  	 *	Initialise the packet receive queues.
11648  	 */
11649  
11650  	for_each_possible_cpu(i) {
11651  		struct work_struct *flush = per_cpu_ptr(&flush_works, i);
11652  		struct softnet_data *sd = &per_cpu(softnet_data, i);
11653  
11654  		INIT_WORK(flush, flush_backlog);
11655  
11656  		skb_queue_head_init(&sd->input_pkt_queue);
11657  		skb_queue_head_init(&sd->process_queue);
11658  #ifdef CONFIG_XFRM_OFFLOAD
11659  		skb_queue_head_init(&sd->xfrm_backlog);
11660  #endif
11661  		INIT_LIST_HEAD(&sd->poll_list);
11662  		sd->output_queue_tailp = &sd->output_queue;
11663  #ifdef CONFIG_RPS
11664  		INIT_CSD(&sd->csd, rps_trigger_softirq, sd);
11665  		sd->cpu = i;
11666  #endif
11667  		INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd);
11668  		spin_lock_init(&sd->defer_lock);
11669  
11670  		init_gro_hash(&sd->backlog);
11671  		sd->backlog.poll = process_backlog;
11672  		sd->backlog.weight = weight_p;
11673  	}
11674  
11675  	dev_boot_phase = 0;
11676  
11677  	/* The loopback device is special: if any other network device
11678  	 * is present in a network namespace, the loopback device must
11679  	 * be present. Since we now dynamically allocate and free the
11680  	 * loopback device, ensure this invariant is maintained by
11681  	 * keeping the loopback device as the first device on the
11682  	 * list of network devices, ensuring the loopback device
11683  	 * is the first device that appears and the last network device
11684  	 * that disappears.
11685  	 */
11686  	if (register_pernet_device(&loopback_net_ops))
11687  		goto out;
11688  
11689  	if (register_pernet_device(&default_device_ops))
11690  		goto out;
11691  
11692  	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
11693  	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
11694  
11695  	rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
11696  				       NULL, dev_cpu_dead);
11697  	WARN_ON(rc < 0);
11698  	rc = 0;
11699  out:
11700  	return rc;
11701  }
11702  
11703  subsys_initcall(net_dev_init);
11704